src/extract_lambda.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209

import csv
import json
import logging
import re
from datetime import datetime
from io import StringIO

import boto3
from botocore.exceptions import ClientError
from pg8000.native import Connection, InterfaceError, identifier

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# DB Exception class


class DBConnectionException(Exception):
    """Wraps pg8000.native Error or DatabaseError."""

    def __init__(self, e):
        """Initialise with provided error message."""
        self.message = str(e)
        super().__init__(self.message)


def lambda_handler(event, context):
    """This lambda function connects to the Totesys database, lists the contents of the ingestion bucket,
    and converts all tables to CSV and if any of those tables do not exist in, or are different to the ones in s3, it uploads them
    it uses 3 helper functions to achieve these 3 functionalities
    """
    db = None
    try:
        db = connect_to_database()
        existing_files = list_existing_s3_files()
        any_changes = process_and_upload_tables(db, existing_files)

        if not any_changes["updated"]:
            logger.info("No changes detected in the database.")
            return {
                "statusCode": 200,
                "body": json.dumps("No changes detected, no CSV files were uploaded."),
            }
        return {
            "statusCode": 200,
            "body": json.dumps(
                f"""CSV files processed for {', '.join(any_changes['updated'])} and uploaded successfully.{
                'The following tables were not updated: '+', '.join(any_changes['no change']) if any_changes['no change'] else ''}"""
            ),
        }
    except Exception as e:
        logger.error(f"Error: {e}")
        return {"statusCode": 500, "body": json.dumps("Internal server error.")}
    finally:
        if db:
            db.close()


def retrieve_secrets():
    secret_name = "bentley-secrets"
    region_name = "eu-west-2"

    # Create a Secrets Manager client
    session = boto3.session.Session()
    client = session.client(service_name="secretsmanager", region_name=region_name)

    try:
        get_secret_value_response = client.get_secret_value(SecretId=secret_name)
    except ClientError as e:
        logger.error(f"Failed to retrieve secret {secret_name}: {str(e)}")
        raise e
    except KeyError:
        logger.error(f"Secret {secret_name} does not contain a SecretString")
        raise ValueError(f"Secret {secret_name} does not contain a SecretString")

    return get_secret_value_response["SecretString"]


def connect_to_database() -> Connection:
    try:
        secrets = json.loads(retrieve_secrets())
        host = secrets["host"]
        port = secrets["port"]
        user = secrets["user"]
        password = secrets["password"]
        database = secrets["database"]

        return Connection(
            database=database, user=user, password=password, host=host, port=port
        )
    except InterfaceError as i:
        logger.error(f"Interface error: {i}")
        raise DBConnectionException("Failed to connect to database")


def extract_bucket(client=boto3.client("s3")):
    response = client.list_buckets()
    extract_bucket_filter = [
        bucket["Name"] for bucket in response["Buckets"] if "extract" in bucket["Name"]
    ]

    return extract_bucket_filter[0]


def list_existing_s3_files(bucket_name=extract_bucket(), client=boto3.client("s3")):
    """Creates a dictionary and populates it with the
    results of listing the contents of the s3 bucket, then
    returns the populated dictionary
    """

    existing_files = {}

    try:
        response = client.list_objects_v2(Bucket=bucket_name)

        if "Contents" in response:
            for obj in response["Contents"]:
                s3_key = obj["Key"]
                try:
                    file_obj = client.get_object(Bucket=bucket_name, Key=s3_key)
                    file_content = file_obj["Body"].read().decode("utf-8")
                    existing_files[s3_key] = file_content
                except ClientError as e:
                    logger.error(f"Error retrieving S3 object {s3_key}: {e}")
        else:
            logger.error("The bucket is empty")

    except ClientError as e:
        logger.error(f"Error listing S3 objects: {e}")

    return existing_files


def get_latest_timestamp(existing_files):
    all_datetimes = []
    for file_name in existing_files.keys():
        match = re.search(r"\/(.+/).+_(.+)\.csv", file_name)
        if match:
            datetime_str = "".join(match.group(1, 2))
            all_datetimes.append(datetime.strptime(datetime_str, "%Y/%m/%d/%H:%M:%S"))
    return max(all_datetimes) if all_datetimes else datetime.min


def stream_to_s3(table_name, rows, column_names, s3_client, bucket_name, s3_key):
    csv_buffer = StringIO()
    csv_writer = csv.writer(csv_buffer)

    csv_writer.writerow(column_names)

    for row in rows:
        csv_writer.writerow(row)

    csv_buffer.seek(0)

    s3_client.upload_fileobj(csv_buffer, bucket_name, s3_key)


def process_and_upload_tables(db, existing_files, client=boto3.client("s3")):
    """Creates a list of the tables from a database query and
    then selects everything from each table in individual queries
    it then writes each table to CSV files and compares with the item
    in the existing_files dictionary with the same name. If it finds any changes
    to files, or new tables/files it uploads them to the s3 bucket
    """
    load_status = {"updated": [], "no change": []}
    latest_timestamp = get_latest_timestamp(existing_files)

    tables = db.run(
        """
        SELECT table_name
        FROM information_schema.tables
        WHERE table_schema='public'
        AND table_type='BASE TABLE';
        """
    )

    for table in tables:
        table_name = table[0]
        rows = db.run(
            f"SELECT * FROM {identifier(table_name)} WHERE last_updated >= :latest;",
            latest={datetime.strftime(latest_timestamp, "%Y-%m-%d %H:%M:%S")},
        )
        if rows:
            column_names = [
                col_name[0]
                for col_name in db.run(
                    """SELECT column_name FROM INFORMATION_SCHEMA.COLUMNS 
                       WHERE table_name = :table ;""",
                    table=table_name,
                )
            ]

            s3_key = (
                f"{table_name}/{datetime.now().strftime('%Y/%m/%d')}/"
                f"{table_name}_{datetime.now().strftime('%H:%M:%S')}.csv"
            )

            try:
                stream_to_s3(
                    table_name, rows, column_names, client, extract_bucket(), s3_key
                )
                load_status["updated"].append(table_name)
                logger.info(f"Uploaded {s3_key} to S3.")
            except ClientError as e:
                logger.error(f"Error uploading to S3: {e}")
        else:
            load_status["no change"].append(table_name)
            logger.info(f"No new data")
    return load_status