diff options
| -rw-r--r-- | src/transform_lambda.py | 26 | ||||
| -rw-r--r-- | tests/dummy_2.csv | 5 | ||||
| -rw-r--r-- | tests/test_transform_lambda.py | 21 |
3 files changed, 28 insertions, 24 deletions
diff --git a/src/transform_lambda.py b/src/transform_lambda.py index 6f65728..ea4e16f 100644 --- a/src/transform_lambda.py +++ b/src/transform_lambda.py @@ -15,18 +15,6 @@ import pandas as pd def lambda_handler(event, context): s3_client = boto3.client('s3') - - tables = ['sales_order', - 'transaction', - 'payment', - 'counterparty', - 'address', - 'staff', - 'purchase_order', - 'department', - 'currency', - 'design', - 'payment_type'] try: s3_bucket_name = event["Records"][0]["s3"]["bucket"]["name"] s3_file_name = event["Records"][0]["s3"]["object"]["key"] @@ -51,9 +39,8 @@ def lambda_handler(event, context): 'body': json.dumps('') } -## each csv file must be converted into a pandas df -## done via read_csv, where stringIO creates an file-like-object from string - treats string like a file: as file is not physically stored in file -## each file needs its own panda df (?) to be normalised +## Started from fresh on Wed 21st Aug: + tables = ['sales_order', 'transaction', 'payment', @@ -70,14 +57,9 @@ def read_from_s3_subfolder_to_df(tables, bucket, client=boto3.client('s3')): table_dfs = {} for table in tables: response = client.list_objects_v2(Bucket=bucket, Prefix=table) - list_of_keys = ['s3://'+object['Key'] for object in response['Contents']] - print(list_of_keys) + list_of_keys = ['s3://'+bucket+'/'+object['Key'] for object in response['Contents']] list_of_df = [pd.read_csv(key) for key in list_of_keys] table_dfs[table] = pd.concat(list_of_df) return table_dfs - # exec("%s = %d" % (table,pd.concat(list_of_df))) - # exec(f"{table} = {pd.concat(list_of_df)}") - # table_dfs = [sales_order, transaction, payment, counterparty, address, - # staff, purchase_order, department, currency, design, payment_type] - + diff --git a/tests/dummy_2.csv b/tests/dummy_2.csv new file mode 100644 index 0000000..8abc9bf --- /dev/null +++ b/tests/dummy_2.csv @@ -0,0 +1,5 @@ +Car_type,Brand,Colour +Truck,Chevrolet,Grey +Convertible,Mercedes,Red +Van,Volkswagen,Blue + diff --git a/tests/test_transform_lambda.py b/tests/test_transform_lambda.py index a3ec4a8..7de1bf3 100644 --- a/tests/test_transform_lambda.py +++ b/tests/test_transform_lambda.py @@ -4,6 +4,7 @@ import pytest import pandas as pd import os import boto3 +import numpy as np @pytest.fixture(scope='class') def aws_credentials(): @@ -27,7 +28,23 @@ class TestReadFromS3: tables = ['Foods'] result = read_from_s3_subfolder_to_df(tables,bucket='dummy_buc',client=s3_client) print(result) + expected_df = pd.DataFrame(np.array([['Vegetable', 'Sour', 'Green'], ['Berry', 'Sweet', 'Red']]), + columns=['Food_type', 'Flavour', 'Colour']) assert isinstance(result,dict) - assert list(result.keys()) == 'Foods' + assert list(result.keys())[0] == 'Foods' assert isinstance(result['Foods'],pd.DataFrame) -
\ No newline at end of file + assert result['Foods'].eq(expected_df,axis='columns').all(axis=None) + + def test_returns_dictionary_of_dataframes_for_multiple_tables(self,s3_client): + s3_client.upload_file('tests/dummy_2.csv', 'dummy_buc', 'Cars/2024/08/21/Cars_14:03:56.csv') + tables = ['Foods','Cars'] + result = read_from_s3_subfolder_to_df(tables,bucket='dummy_buc',client=s3_client) + expected_foods_df = pd.DataFrame(np.array([['Vegetable', 'Sour', 'Green'], ['Berry', 'Sweet', 'Red']]), + columns=['Food_type', 'Flavour', 'Colour']) + expected_cars_df = pd.DataFrame(np.array([['Truck', 'Chevrolet', 'Grey'], ['Convertible', 'Mercedes','Red'],['Van','Volkswagen','Blue']]), + columns=['Car_type', 'Brand', 'Colour']) + assert list(result.keys()) == tables + assert result['Foods'].eq(expected_foods_df,axis='columns').all(axis=None) + assert result['Cars'].eq(expected_cars_df,axis='columns').all(axis=None) + + |
