aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--requirements.txt2
-rw-r--r--src/transform_lambda.py34
-rw-r--r--tests/test_transform_lambda.py34
3 files changed, 64 insertions, 6 deletions
diff --git a/requirements.txt b/requirements.txt
index 6f383f9..087d1c2 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,6 @@
asn1crypto==1.5.1
boto3==1.34.159
-botocore==1.34.159
+botocore==1.34.7
certifi==2024.7.4
cffi==1.17.0
charset-normalizer==3.3.2
diff --git a/src/transform_lambda.py b/src/transform_lambda.py
index 900bf4b..6f65728 100644
--- a/src/transform_lambda.py
+++ b/src/transform_lambda.py
@@ -1,3 +1,4 @@
+#from src.extract_lambda import extract_bucket
import json
import boto3
import re
@@ -10,9 +11,7 @@ import pandas as pd
## To use the pandas module inside a Lambda function, a Lambda layer providing pandas must be attached to the AWS Lambda function.
##need a function that normalises the data
-
-
-s3_resource = boto3.resource('s3') ##need this for a way of reuploading data after transformation
+#s3_resource = boto3.resource('s3') ##need this for a way of reuploading data after transformation
def lambda_handler(event, context):
s3_client = boto3.client('s3')
@@ -54,4 +53,31 @@ def lambda_handler(event, context):
## each csv file must be converted into a pandas df
## done via read_csv, where StringIO creates a file-like object from a string - it lets the string be treated like a file, since the data is not physically stored in a file on disk
-## each file needs its own panda df (?) to be normalised \ No newline at end of file
+## each file needs its own panda df (?) to be normalised
+# Table names, one per line. Presumably these are the values passed as the
+# `tables` argument of read_from_s3_subfolder_to_df -- confirm against caller.
+tables = [
+    'sales_order',
+    'transaction',
+    'payment',
+    'counterparty',
+    'address',
+    'staff',
+    'purchase_order',
+    'department',
+    'currency',
+    'design',
+    'payment_type',
+]
+
+def read_from_s3_subfolder_to_df(tables, bucket, client=None):
+    """Load every CSV object under each table's S3 "subfolder" into a DataFrame.
+
+    Args:
+        tables: Iterable of table names. Each name is used as the key prefix
+            "<table>/" inside ``bucket``.
+        bucket: Name of the S3 bucket to read from.
+        client: Optional boto3 S3 client. When omitted, a client is created
+            on first use rather than at import time, so importing this module
+            needs no AWS configuration and tests can inject a mocked client.
+
+    Returns:
+        dict mapping each table name to a single DataFrame built by
+        concatenating all CSV objects found under that table's prefix.
+        A table with no objects maps to an empty DataFrame.
+    """
+    import io  # local import: only needed to wrap the downloaded bytes
+
+    if client is None:
+        client = boto3.client('s3')
+
+    # list_objects_v2 returns at most 1000 keys per call; paginate to see all.
+    paginator = client.get_paginator('list_objects_v2')
+    table_dfs = {}
+    for table in tables:
+        frames = []
+        # The trailing slash stops 'payment' from also matching 'payment_type/...'.
+        for page in paginator.paginate(Bucket=bucket, Prefix=f'{table}/'):
+            # 'Contents' is absent from the response when nothing matches.
+            for entry in page.get('Contents', []):
+                key = entry['Key']
+                if key.endswith('/'):
+                    # Zero-byte "folder" placeholder object, not a CSV file.
+                    continue
+                # Read through the supplied client so the caller's credentials,
+                # region and any mocking are honoured.
+                body = client.get_object(Bucket=bucket, Key=key)['Body']
+                frames.append(pd.read_csv(io.BytesIO(body.read())))
+        # pd.concat raises ValueError on an empty list, so guard that case.
+        table_dfs[table] = pd.concat(frames) if frames else pd.DataFrame()
+    return table_dfs
+ # exec("%s = %d" % (table,pd.concat(list_of_df)))
+ # exec(f"{table} = {pd.concat(list_of_df)}")
+ # table_dfs = [sales_order, transaction, payment, counterparty, address,
+ # staff, purchase_order, department, currency, design, payment_type]
+
+
diff --git a/tests/test_transform_lambda.py b/tests/test_transform_lambda.py
index dd08b6a..a3ec4a8 100644
--- a/tests/test_transform_lambda.py
+++ b/tests/test_transform_lambda.py
@@ -1 +1,33 @@
-from src.transform_lambda import lambda_handler \ No newline at end of file
+from src.transform_lambda import read_from_s3_subfolder_to_df
+from moto import mock_aws
+import pytest
+import pandas as pd
+import os
+import boto3
+
+@pytest.fixture(scope='class')
+def aws_credentials():
+    """Set dummy AWS credentials and a default region in os.environ.
+
+    The values are placeholders ('testing'); the region is 'eu-west-2'.
+    The variables are written directly to os.environ and are not restored
+    afterwards.
+    """
+    os.environ["AWS_ACCESS_KEY_ID"] = 'testing'
+    os.environ["AWS_SECRET_ACCESS_KEY"] = 'testing'
+    # Fixed spelling: was "AWS_SECURIT_TOKEN", which nothing reads.
+    os.environ["AWS_SECURITY_TOKEN"] = 'testing'
+    os.environ["AWS_SESSION_TOKEN"] = 'testing'
+    os.environ["AWS_DEFAULT_REGION"] = 'eu-west-2'
+
+@pytest.fixture(scope='class')
+def s3_client(aws_credentials):
+    """Yield a boto3 S3 client created while moto's mock_aws() is active.
+
+    Requests the aws_credentials fixture first, so the client is built after
+    the dummy credentials and region have been placed in the environment.
+    The mock context stays open until the class-scoped fixture is torn down.
+    """
+    with mock_aws():
+        mocked_client = boto3.client('s3')
+        yield mocked_client
+class TestReadFromS3:
+    """Tests for read_from_s3_subfolder_to_df against a moto-mocked S3."""
+
+    def test_returns_dictionary_with_correct_value_pair(self, s3_client):
+        """One uploaded CSV under 'Foods/' yields {'Foods': DataFrame}."""
+        s3_client.create_bucket(
+            Bucket='dummy_buc',
+            CreateBucketConfiguration={'LocationConstraint': 'eu-west-2'},
+        )
+        s3_client.upload_file(
+            'tests/dummy_identical.csv',
+            'dummy_buc',
+            'Foods/2024/08/21/Foods_12:03:10.csv',
+        )
+        tables = ['Foods']
+        result = read_from_s3_subfolder_to_df(
+            tables, bucket='dummy_buc', client=s3_client
+        )
+        assert isinstance(result, dict)
+        # Compare against a list: a list never equals the bare string 'Foods',
+        # so the previous assertion could not pass.
+        assert list(result.keys()) == ['Foods']
+        assert isinstance(result['Foods'], pd.DataFrame)
+ \ No newline at end of file
git.ajschof.me — hosted by ajschofield — powered by cgit