about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- src/transform_lambda.py 26
-rw-r--r-- tests/dummy_2.csv 5
-rw-r--r-- tests/test_transform_lambda.py 21
3 files changed, 28 insertions, 24 deletions
diff --git a/src/transform_lambda.py b/src/transform_lambda.py
index 6f65728..ea4e16f 100644
--- a/src/transform_lambda.py
+++ b/src/transform_lambda.py
@@ -15,18 +15,6 @@ import pandas as pd
def lambda_handler(event, context):
s3_client = boto3.client('s3')
-
- tables = ['sales_order',
- 'transaction',
- 'payment',
- 'counterparty',
- 'address',
- 'staff',
- 'purchase_order',
- 'department',
- 'currency',
- 'design',
- 'payment_type']
try:
s3_bucket_name = event["Records"][0]["s3"]["bucket"]["name"]
s3_file_name = event["Records"][0]["s3"]["object"]["key"]
@@ -51,9 +39,8 @@ def lambda_handler(event, context):
'body': json.dumps('')
}
-## each csv file must be converted into a pandas df
-## done via read_csv, where stringIO creates an file-like-object from string - treats string like a file: as file is not physically stored in file
-## each file needs its own panda df (?) to be normalised
+## Started from fresh on Wed 21st Aug:
+
tables = ['sales_order',
'transaction',
'payment',
@@ -70,14 +57,9 @@ def read_from_s3_subfolder_to_df(tables, bucket, client=boto3.client('s3')):
table_dfs = {}
for table in tables:
response = client.list_objects_v2(Bucket=bucket, Prefix=table)
- list_of_keys = ['s3://'+object['Key'] for object in response['Contents']]
- print(list_of_keys)
+ list_of_keys = ['s3://'+bucket+'/'+object['Key'] for object in response['Contents']]
list_of_df = [pd.read_csv(key) for key in list_of_keys]
table_dfs[table] = pd.concat(list_of_df)
return table_dfs
- # exec("%s = %d" % (table,pd.concat(list_of_df)))
- # exec(f"{table} = {pd.concat(list_of_df)}")
- # table_dfs = [sales_order, transaction, payment, counterparty, address,
- # staff, purchase_order, department, currency, design, payment_type]
-
+
diff --git a/tests/dummy_2.csv b/tests/dummy_2.csv
new file mode 100644
index 0000000..8abc9bf
--- /dev/null
+++ b/tests/dummy_2.csv
@@ -0,0 +1,5 @@
+Car_type,Brand,Colour
+Truck,Chevrolet,Grey
+Convertible,Mercedes,Red
+Van,Volkswagen,Blue
+
diff --git a/tests/test_transform_lambda.py b/tests/test_transform_lambda.py
index a3ec4a8..7de1bf3 100644
--- a/tests/test_transform_lambda.py
+++ b/tests/test_transform_lambda.py
@@ -4,6 +4,7 @@ import pytest
import pandas as pd
import os
import boto3
+import numpy as np
@pytest.fixture(scope='class')
def aws_credentials():
@@ -27,7 +28,23 @@ class TestReadFromS3:
tables = ['Foods']
result = read_from_s3_subfolder_to_df(tables,bucket='dummy_buc',client=s3_client)
print(result)
+ expected_df = pd.DataFrame(np.array([['Vegetable', 'Sour', 'Green'], ['Berry', 'Sweet', 'Red']]),
+ columns=['Food_type', 'Flavour', 'Colour'])
assert isinstance(result,dict)
- assert list(result.keys()) == 'Foods'
+ assert list(result.keys())[0] == 'Foods'
assert isinstance(result['Foods'],pd.DataFrame)
- \ No newline at end of file
+ assert result['Foods'].eq(expected_df,axis='columns').all(axis=None)
+
+ def test_returns_dictionary_of_dataframes_for_multiple_tables(self,s3_client):
+ s3_client.upload_file('tests/dummy_2.csv', 'dummy_buc', 'Cars/2024/08/21/Cars_14:03:56.csv')
+ tables = ['Foods','Cars']
+ result = read_from_s3_subfolder_to_df(tables,bucket='dummy_buc',client=s3_client)
+ expected_foods_df = pd.DataFrame(np.array([['Vegetable', 'Sour', 'Green'], ['Berry', 'Sweet', 'Red']]),
+ columns=['Food_type', 'Flavour', 'Colour'])
+ expected_cars_df = pd.DataFrame(np.array([['Truck', 'Chevrolet', 'Grey'], ['Convertible', 'Mercedes','Red'],['Van','Volkswagen','Blue']]),
+ columns=['Car_type', 'Brand', 'Colour'])
+ assert list(result.keys()) == tables
+ assert result['Foods'].eq(expected_foods_df,axis='columns').all(axis=None)
+ assert result['Cars'].eq(expected_cars_df,axis='columns').all(axis=None)
+
+
git.ajschof.me — hosted by ajschofield — powered by cgit