aboutsummaryrefslogtreecommitdiffstats
path: root/src/extract_lambda.py
diff options
context:
space:
mode:
authorAlex Schofield <git@ajschof.me>2024-08-19 23:13:39 +0100
committerAlex Schofield <git@ajschof.me>2024-08-19 23:13:39 +0100
commitcaed81dc699b9b4105da2b8924310f1a370217c7 (patch)
tree370b3705ed5915e065708937b0beffee99cf6f0d /src/extract_lambda.py
parent58f12b9a61ae6806fccaf64781824a26b480688e (diff)
downloadde-project-bentley-caed81dc699b9b4105da2b8924310f1a370217c7.tar.gz
de-project-bentley-caed81dc699b9b4105da2b8924310f1a370217c7.zip
refactor: add timestamp function in extract_lambda.py
Diffstat (limited to 'src/extract_lambda.py')
-rw-r--r--src/extract_lambda.py29
1 files changed, 17 insertions, 12 deletions
diff --git a/src/extract_lambda.py b/src/extract_lambda.py
index 8353481..ad3c970 100644
--- a/src/extract_lambda.py
+++ b/src/extract_lambda.py
@@ -129,6 +129,16 @@ def list_existing_s3_files(bucket_name=extract_bucket(), client=boto3.client("s3
return existing_files
+def get_latest_timestamp(existing_files):
+ all_datetimes = []
+ for file_name in existing_files.keys():
+ match = re.search(r"\/(.+/).+_(.+)\.csv", file_name)
+ if match:
+ datetime_str = "".join(match.group(1, 2))
+ all_datetimes.append(datetime.strptime(datetime_str, "%Y/%m/%d/%H:%M:%S"))
+ return max(all_datetimes) if all_datetimes else datetime.min
+
+
def process_and_upload_tables(db, existing_files, client=boto3.client("s3")):
"""Creates a list of the tables from a database query and
then selects everything from each table in individual queries
@@ -137,22 +147,17 @@ def process_and_upload_tables(db, existing_files, client=boto3.client("s3")):
to files, or new tables/files it uploads them to the s3 bucket
"""
load_status = {"updated": [], "no change": []}
- # Retrieving the latest file timestamp from S3 extract bucket
- all_datetimes = []
- for file_names in existing_files.keys():
- datetime_str_on_s3 = "".join(
- re.search(r"\/(.+/).+_(.+)\.csv", file_names).group(1, 2)
- )
- all_datetimes.append(datetime.strptime(datetime_str_on_s3, "%Y/%m/%d/%H:%M:%S"))
- latest_timestamp = max(all_datetimes)
+ latest_timestamp = get_latest_timestamp(existing_files)
- # Iterating through tables on the database and retrieving only latest changes vs previous file load
tables = db.run(
"""
- SELECT table_name
- FROM information_schema.tables
- WHERE table_schema='public' AND table_type='BASE TABLE';"""
+ SELECT table_name
+ FROM information_schema.tables
+ WHERE table_schema='public'
+ AND table_type='BASE TABLE';
+ """
)
+
for table in tables:
print(tables)
table_name = table[0]
git.ajschof.me — hosted by ajschofield — powered by cgit