diff options
| -rw-r--r-- | requirements.txt | 2 | ||||
| -rw-r--r-- | src/transform_lambda.py | 34 | ||||
| -rw-r--r-- | tests/test_transform_lambda.py | 34 |
3 files changed, 64 insertions, 6 deletions
diff --git a/requirements.txt b/requirements.txt index 6f383f9..087d1c2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ asn1crypto==1.5.1 boto3==1.34.159 -botocore==1.34.159 +botocore==1.34.7 certifi==2024.7.4 cffi==1.17.0 charset-normalizer==3.3.2 diff --git a/src/transform_lambda.py b/src/transform_lambda.py index 900bf4b..6f65728 100644 --- a/src/transform_lambda.py +++ b/src/transform_lambda.py @@ -1,3 +1,4 @@ +#from src.extract_lambda import extract_bucket import json import boto3 import re @@ -10,9 +11,7 @@ import pandas as pd ##In-order to use PANDAS module in lambda function, a Lambda Layer needs to be attached to the AWS Lambda Function. ##need a function that normalises the data - - -s3_resource = boto3.resource('s3') ##need this for a way of reuploading data after transformation +#s3_resource = boto3.resource('s3') ##need this for a way of reuploading data after transformation def lambda_handler(event, context): s3_client = boto3.client('s3') @@ -54,4 +53,31 @@ def lambda_handler(event, context): ## each csv file must be converted into a pandas df ## done via read_csv, where stringIO creates an file-like-object from string - treats string like a file: as file is not physically stored in file -## each file needs its own panda df (?) to be normalised
## each file needs its own pandas df to be normalised
## Names of the source tables; each one is expected to be a top-level
## "sub-folder" (key prefix) in the ingestion bucket.
tables = [
    'sales_order',
    'transaction',
    'payment',
    'counterparty',
    'address',
    'staff',
    'purchase_order',
    'department',
    'currency',
    'design',
    'payment_type',
]


def read_from_s3_subfolder_to_df(tables, bucket, client=None):
    """Load every CSV stored under each table's prefix into one DataFrame.

    For each name in ``tables`` all objects whose key starts with
    ``"<table>/"`` are listed in ``bucket``, downloaded through ``client``,
    parsed with ``pandas.read_csv`` and concatenated.

    Args:
        tables: Iterable of table names; each is used as an S3 key prefix.
        bucket: Name of the S3 bucket to read from.
        client: Object exposing the boto3 S3 client methods ``get_paginator``
            and ``get_object``. When omitted, ``boto3.client('s3')`` is
            created on first use.

    Returns:
        dict mapping each table name to a ``pandas.DataFrame``. A table with
        no objects under its prefix maps to an empty DataFrame.
    """
    # Function-scope import so the block does not depend on the module's
    # import list.
    import io

    if client is None:
        # Built lazily: a client created in the signature is constructed at
        # import time, before test mocks or Lambda configuration are active.
        client = boto3.client('s3')

    paginator = client.get_paginator('list_objects_v2')
    table_dfs = {}
    for table in tables:
        # The trailing slash stops 'payment' from also matching
        # 'payment_type/...'.
        prefix = table if table.endswith('/') else f'{table}/'
        frames = []
        # A single list_objects_v2 call returns at most 1000 keys, so walk
        # every page.
        for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
            # 'Contents' is absent from the response when nothing matches.
            for entry in page.get('Contents', []):
                key = entry['Key']
                if key.endswith('/'):
                    # Console-created "folder" placeholder, not a data file.
                    continue
                body = client.get_object(Bucket=bucket, Key=key)['Body']
                frames.append(pd.read_csv(io.BytesIO(body.read())))
        # pd.concat raises ValueError on an empty list.
        table_dfs[table] = pd.concat(frames) if frames else pd.DataFrame()
    return table_dfs
import os

import boto3
import pandas as pd
import pytest
from moto import mock_aws

from src.transform_lambda import read_from_s3_subfolder_to_df

# S3 bucket names may not contain underscores, so use a hyphen.
BUCKET = 'dummy-buc'


@pytest.fixture(scope='class')
def aws_credentials():
    """Set placeholder AWS credentials and region in the environment."""
    os.environ["AWS_ACCESS_KEY_ID"] = 'testing'
    os.environ["AWS_SECRET_ACCESS_KEY"] = 'testing'
    os.environ["AWS_SECURITY_TOKEN"] = 'testing'
    os.environ["AWS_SESSION_TOKEN"] = 'testing'
    os.environ["AWS_DEFAULT_REGION"] = 'eu-west-2'


@pytest.fixture(scope='class')
def s3_client(aws_credentials):
    """Yield an S3 client created while the moto mock is active."""
    with mock_aws():
        yield boto3.client('s3')


class TestReadFromS3:

    def test_returns_dictionary_with_correct_value_pair(self, s3_client):
        """One uploaded CSV under 'Foods/' yields {'Foods': DataFrame}."""
        s3_client.create_bucket(
            Bucket=BUCKET,
            CreateBucketConfiguration={'LocationConstraint': 'eu-west-2'},
        )
        s3_client.upload_file(
            'tests/dummy_identical.csv',
            BUCKET,
            'Foods/2024/08/21/Foods_12:03:10.csv',
        )
        tables = ['Foods']

        result = read_from_s3_subfolder_to_df(
            tables, bucket=BUCKET, client=s3_client
        )

        assert isinstance(result, dict)
        # Compare against a list: a list never equals the bare string 'Foods'.
        assert list(result.keys()) == ['Foods']
        assert isinstance(result['Foods'], pd.DataFrame)
\ No newline at end of file |
