From 3f40e96217418590ca66af6912f595cc04425849 Mon Sep 17 00:00:00 2001 From: lian-manonog Date: Mon, 19 Aug 2024 15:52:14 +0100 Subject: wip: setting up test files for transform_lambda --- tests/test_transform_lambda.py | 1 + 1 file changed, 1 insertion(+) create mode 100644 tests/test_transform_lambda.py (limited to 'tests/test_transform_lambda.py') diff --git a/tests/test_transform_lambda.py b/tests/test_transform_lambda.py new file mode 100644 index 0000000..dd08b6a --- /dev/null +++ b/tests/test_transform_lambda.py @@ -0,0 +1 @@ +from src.transform_lambda import lambda_handler \ No newline at end of file -- cgit v1.2.3 From b4fafcd9731f11f6f2efde843242b9c5cb84e85f Mon Sep 17 00:00:00 2001 From: Ang Bel Date: Wed, 21 Aug 2024 12:50:32 +0100 Subject: function to write files from s3 into a list of dataframes. Current test is failing due to AioClientCreator object has no attribute "_inject_s3_input_parameters" --- requirements.txt | 2 +- src/transform_lambda.py | 34 ++++++++++++++++++++++++++++++---- tests/test_transform_lambda.py | 34 +++++++++++++++++++++++++++++++++- 3 files changed, 64 insertions(+), 6 deletions(-) (limited to 'tests/test_transform_lambda.py') diff --git a/requirements.txt b/requirements.txt index 6f383f9..087d1c2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ asn1crypto==1.5.1 boto3==1.34.159 -botocore==1.34.159 +botocore==1.34.7 certifi==2024.7.4 cffi==1.17.0 charset-normalizer==3.3.2 diff --git a/src/transform_lambda.py b/src/transform_lambda.py index 900bf4b..6f65728 100644 --- a/src/transform_lambda.py +++ b/src/transform_lambda.py @@ -1,3 +1,4 @@ +#from src.extract_lambda import extract_bucket import json import boto3 import re @@ -10,9 +11,7 @@ import pandas as pd ##In-order to use PANDAS module in lambda function, a Lambda Layer needs to be attached to the AWS Lambda Function. ##need a function that normalises the data - - -s3_resource = boto3.resource('s3') ##need this for a way of reuploading data after transformation +#s3_resource = boto3.resource('s3') ##need this for a way of reuploading data after transformation def lambda_handler(event, context): s3_client = boto3.client('s3') @@ -54,4 +53,31 @@ def lambda_handler(event, context): ## each csv file must be converted into a pandas df ## done via read_csv, where stringIO creates an file-like-object from string - treats string like a file: as file is not physically stored in file -## each file needs its own panda df (?) to be normalised \ No newline at end of file +## each file needs its own panda df (?) to be normalised +tables = ['sales_order', + 'transaction', + 'payment', + 'counterparty', + 'address', + 'staff', + 'purchase_order', + 'department', + 'currency', + 'design', + 'payment_type'] + +def read_from_s3_subfolder_to_df(tables, bucket, client=boto3.client('s3')): + table_dfs = {} + for table in tables: + response = client.list_objects_v2(Bucket=bucket, Prefix=table) + list_of_keys = ['s3://'+object['Key'] for object in response['Contents']] + print(list_of_keys) + list_of_df = [pd.read_csv(key) for key in list_of_keys] + table_dfs[table] = pd.concat(list_of_df) + return table_dfs + # exec("%s = %d" % (table,pd.concat(list_of_df))) + # exec(f"{table} = {pd.concat(list_of_df)}") + # table_dfs = [sales_order, transaction, payment, counterparty, address, + # staff, purchase_order, department, currency, design, payment_type] + + diff --git a/tests/test_transform_lambda.py b/tests/test_transform_lambda.py index dd08b6a..a3ec4a8 100644 --- a/tests/test_transform_lambda.py +++ b/tests/test_transform_lambda.py @@ -1 +1,33 @@ -from src.transform_lambda import lambda_handler \ No newline at end of file +from src.transform_lambda import read_from_s3_subfolder_to_df +from moto import mock_aws +import pytest +import pandas as pd +import os +import boto3 + +@pytest.fixture(scope='class') +def aws_credentials(): + os.environ["AWS_ACCESS_KEY_ID"] = 'testing' + os.environ["AWS_SECRET_ACCESS_KEY"] = 'testing' + os.environ["AWS_SECURIT_TOKEN"] = 'testing' + os.environ["AWS_SESSION_TOKEN"] = 'testing' + os.environ["AWS_DEFAULT_REGION"]= 'eu-west-2' + +@pytest.fixture(scope='class') +def s3_client(aws_credentials): + with mock_aws(): + yield boto3.client('s3') +class TestReadFromS3: + + def test_returns_dictionary_with_correct_value_pair(self,s3_client): + s3_client.create_bucket(Bucket = 'dummy_buc',CreateBucketConfiguration={ + 'LocationConstraint': 'eu-west-2' + }) + s3_client.upload_file('tests/dummy_identical.csv', 'dummy_buc', 'Foods/2024/08/21/Foods_12:03:10.csv') + tables = ['Foods'] + result = read_from_s3_subfolder_to_df(tables,bucket='dummy_buc',client=s3_client) + print(result) + assert isinstance(result,dict) + assert list(result.keys()) == 'Foods' + assert isinstance(result['Foods'],pd.DataFrame) + \ No newline at end of file -- cgit v1.2.3 From 0c6e2f8486d1ec4d9b0bd4984e01baca3a159df0 Mon Sep 17 00:00:00 2001 From: Ang Bel Date: Wed, 21 Aug 2024 15:07:51 +0100 Subject: (tests) Read from s3 to df passes --- src/transform_lambda.py | 26 ++++---------------------- tests/dummy_2.csv | 5 +++++ tests/test_transform_lambda.py | 21 +++++++++++++++++++-- 3 files changed, 28 insertions(+), 24 deletions(-) create mode 100644 tests/dummy_2.csv (limited to 'tests/test_transform_lambda.py') diff --git a/src/transform_lambda.py b/src/transform_lambda.py index 6f65728..ea4e16f 100644 --- a/src/transform_lambda.py +++ b/src/transform_lambda.py @@ -15,18 +15,6 @@ import pandas as pd def lambda_handler(event, context): s3_client = boto3.client('s3') - - tables = ['sales_order', - 'transaction', - 'payment', - 'counterparty', - 'address', - 'staff', - 'purchase_order', - 'department', - 'currency', - 'design', - 'payment_type'] try: s3_bucket_name = event["Records"][0]["s3"]["bucket"]["name"] s3_file_name = event["Records"][0]["s3"]["object"]["key"] @@ -51,9 +39,8 @@ def lambda_handler(event, context): 'body': json.dumps('') } -## each csv file must be converted into a pandas df -## done via read_csv, where stringIO creates an file-like-object from string - treats string like a file: as file is not physically stored in file -## each file needs its own panda df (?) to be normalised +## Started from fresh on Wed 21st Aug: + tables = ['sales_order', 'transaction', 'payment', @@ -70,14 +57,9 @@ def read_from_s3_subfolder_to_df(tables, bucket, client=boto3.client('s3')): table_dfs = {} for table in tables: response = client.list_objects_v2(Bucket=bucket, Prefix=table) - list_of_keys = ['s3://'+object['Key'] for object in response['Contents']] - print(list_of_keys) + list_of_keys = ['s3://'+bucket+'/'+object['Key'] for object in response['Contents']] list_of_df = [pd.read_csv(key) for key in list_of_keys] table_dfs[table] = pd.concat(list_of_df) return table_dfs - # exec("%s = %d" % (table,pd.concat(list_of_df))) - # exec(f"{table} = {pd.concat(list_of_df)}") - # table_dfs = [sales_order, transaction, payment, counterparty, address, - # staff, purchase_order, department, currency, design, payment_type] - + diff --git a/tests/dummy_2.csv b/tests/dummy_2.csv new file mode 100644 index 0000000..8abc9bf --- /dev/null +++ b/tests/dummy_2.csv @@ -0,0 +1,5 @@ +Car_type,Brand,Colour +Truck,Chevrolet,Grey +Convertible,Mercedes,Red +Van,Volkswagen,Blue + diff --git a/tests/test_transform_lambda.py b/tests/test_transform_lambda.py index a3ec4a8..7de1bf3 100644 --- a/tests/test_transform_lambda.py +++ b/tests/test_transform_lambda.py @@ -4,6 +4,7 @@ import pytest import pandas as pd import os import boto3 +import numpy as np @pytest.fixture(scope='class') def aws_credentials(): @@ -27,7 +28,23 @@ class TestReadFromS3: tables = ['Foods'] result = read_from_s3_subfolder_to_df(tables,bucket='dummy_buc',client=s3_client) print(result) + expected_df = pd.DataFrame(np.array([['Vegetable', 'Sour', 'Green'], ['Berry', 'Sweet', 'Red']]), + columns=['Food_type', 'Flavour', 'Colour']) assert isinstance(result,dict) - assert list(result.keys()) == 'Foods' + assert list(result.keys())[0] == 'Foods' assert isinstance(result['Foods'],pd.DataFrame) - \ No newline at end of file + assert result['Foods'].eq(expected_df,axis='columns').all(axis=None) + + def test_returns_dictionary_of_dataframes_for_multiple_tables(self,s3_client): + s3_client.upload_file('tests/dummy_2.csv', 'dummy_buc', 'Cars/2024/08/21/Cars_14:03:56.csv') + tables = ['Foods','Cars'] + result = read_from_s3_subfolder_to_df(tables,bucket='dummy_buc',client=s3_client) + expected_foods_df = pd.DataFrame(np.array([['Vegetable', 'Sour', 'Green'], ['Berry', 'Sweet', 'Red']]), + columns=['Food_type', 'Flavour', 'Colour']) + expected_cars_df = pd.DataFrame(np.array([['Truck', 'Chevrolet', 'Grey'], ['Convertible', 'Mercedes','Red'],['Van','Volkswagen','Blue']]), + columns=['Car_type', 'Brand', 'Colour']) + assert list(result.keys()) == tables + assert result['Foods'].eq(expected_foods_df,axis='columns').all(axis=None) + assert result['Cars'].eq(expected_cars_df,axis='columns').all(axis=None) + + -- cgit v1.2.3 From c8e94530b65d6807b2b9bb246a542963839cce9d Mon Sep 17 00:00:00 2001 From: "deepsource-autofix[bot]" <62050782+deepsource-autofix[bot]@users.noreply.github.com> Date: Wed, 21 Aug 2024 14:49:56 +0000 Subject: style: format code with Autopep8, Black and Ruff Formatter This commit fixes the style issues introduced in b882bb0 according to the output from Autopep8, Black and Ruff Formatter. Details: https://github.com/ajschofield/de-project-bentley/pull/84 --- src/transform_lambda.py | 36 +++++++++------- tests/test_transform_lambda.py | 94 ++++++++++++++++++++++++++---------------- 2 files changed, 79 insertions(+), 51 deletions(-) (limited to 'tests/test_transform_lambda.py') diff --git a/src/transform_lambda.py b/src/transform_lambda.py index 3a7cf43..b176ccc 100644 --- a/src/transform_lambda.py +++ b/src/transform_lambda.py @@ -1,4 +1,4 @@ -#from src.extract_lambda import extract_bucket +# from src.extract_lambda import extract_bucket import json import boto3 import re @@ -6,29 +6,33 @@ import io from io import StringIO import pandas as pd + def lambda_handler(event, context): pass -tables = ['sales_order', - 'transaction', - 'payment', - 'counterparty', - 'address', - 'staff', - 'purchase_order', - 'department', - 'currency', - 'design', - 'payment_type'] +tables = [ + "sales_order", + "transaction", + "payment", + "counterparty", + "address", + "staff", + "purchase_order", + "department", + "currency", + "design", + "payment_type", +] + -def read_from_s3_subfolder_to_df(tables, bucket, client=boto3.client('s3')): +def read_from_s3_subfolder_to_df(tables, bucket, client=boto3.client("s3")): table_dfs = {} for table in tables: response = client.list_objects_v2(Bucket=bucket, Prefix=table) - list_of_keys = ['s3://'+bucket+'/'+object['Key'] for object in response['Contents']] + list_of_keys = [ + "s3://" + bucket + "/" + object["Key"] for object in response["Contents"] + ] list_of_df = [pd.read_csv(key) for key in list_of_keys] table_dfs[table] = pd.concat(list_of_df) return table_dfs - - diff --git a/tests/test_transform_lambda.py b/tests/test_transform_lambda.py index 7de1bf3..5121905 100644 --- a/tests/test_transform_lambda.py +++ b/tests/test_transform_lambda.py @@ -6,45 +6,69 @@ import os import boto3 import numpy as np -@pytest.fixture(scope='class') + +@pytest.fixture(scope="class") def aws_credentials(): - os.environ["AWS_ACCESS_KEY_ID"] = 'testing' - os.environ["AWS_SECRET_ACCESS_KEY"] = 'testing' - os.environ["AWS_SECURIT_TOKEN"] = 'testing' - os.environ["AWS_SESSION_TOKEN"] = 'testing' - os.environ["AWS_DEFAULT_REGION"]= 'eu-west-2' + os.environ["AWS_ACCESS_KEY_ID"] = "testing" + os.environ["AWS_SECRET_ACCESS_KEY"] = "testing" + os.environ["AWS_SECURIT_TOKEN"] = "testing" + os.environ["AWS_SESSION_TOKEN"] = "testing" + os.environ["AWS_DEFAULT_REGION"] = "eu-west-2" + -@pytest.fixture(scope='class') +@pytest.fixture(scope="class") def s3_client(aws_credentials): with mock_aws(): - yield boto3.client('s3') + yield boto3.client("s3") + + class TestReadFromS3: - - def test_returns_dictionary_with_correct_value_pair(self,s3_client): - s3_client.create_bucket(Bucket = 'dummy_buc',CreateBucketConfiguration={ - 'LocationConstraint': 'eu-west-2' - }) - s3_client.upload_file('tests/dummy_identical.csv', 'dummy_buc', 'Foods/2024/08/21/Foods_12:03:10.csv') - tables = ['Foods'] - result = read_from_s3_subfolder_to_df(tables,bucket='dummy_buc',client=s3_client) + def test_returns_dictionary_with_correct_value_pair(self, s3_client): + s3_client.create_bucket( + Bucket="dummy_buc", + CreateBucketConfiguration={"LocationConstraint": "eu-west-2"}, + ) + s3_client.upload_file( + "tests/dummy_identical.csv", + "dummy_buc", + "Foods/2024/08/21/Foods_12:03:10.csv", + ) + tables = ["Foods"] + result = read_from_s3_subfolder_to_df( + tables, bucket="dummy_buc", client=s3_client + ) print(result) - expected_df = pd.DataFrame(np.array([['Vegetable', 'Sour', 'Green'], ['Berry', 'Sweet', 'Red']]), - columns=['Food_type', 'Flavour', 'Colour']) - assert isinstance(result,dict) - assert list(result.keys())[0] == 'Foods' - assert isinstance(result['Foods'],pd.DataFrame) - assert result['Foods'].eq(expected_df,axis='columns').all(axis=None) - - def test_returns_dictionary_of_dataframes_for_multiple_tables(self,s3_client): - s3_client.upload_file('tests/dummy_2.csv', 'dummy_buc', 'Cars/2024/08/21/Cars_14:03:56.csv') - tables = ['Foods','Cars'] - result = read_from_s3_subfolder_to_df(tables,bucket='dummy_buc',client=s3_client) - expected_foods_df = pd.DataFrame(np.array([['Vegetable', 'Sour', 'Green'], ['Berry', 'Sweet', 'Red']]), - columns=['Food_type', 'Flavour', 'Colour']) - expected_cars_df = pd.DataFrame(np.array([['Truck', 'Chevrolet', 'Grey'], ['Convertible', 'Mercedes','Red'],['Van','Volkswagen','Blue']]), - columns=['Car_type', 'Brand', 'Colour']) - assert list(result.keys()) == tables - assert result['Foods'].eq(expected_foods_df,axis='columns').all(axis=None) - assert result['Cars'].eq(expected_cars_df,axis='columns').all(axis=None) - + expected_df = pd.DataFrame( + np.array([["Vegetable", "Sour", "Green"], ["Berry", "Sweet", "Red"]]), + columns=["Food_type", "Flavour", "Colour"], + ) + assert isinstance(result, dict) + assert list(result.keys())[0] == "Foods" + assert isinstance(result["Foods"], pd.DataFrame) + assert result["Foods"].eq(expected_df, axis="columns").all(axis=None) + def test_returns_dictionary_of_dataframes_for_multiple_tables(self, s3_client): + s3_client.upload_file( + "tests/dummy_2.csv", "dummy_buc", "Cars/2024/08/21/Cars_14:03:56.csv" + ) + tables = ["Foods", "Cars"] + result = read_from_s3_subfolder_to_df( + tables, bucket="dummy_buc", client=s3_client + ) + expected_foods_df = pd.DataFrame( + np.array([["Vegetable", "Sour", "Green"], ["Berry", "Sweet", "Red"]]), + columns=["Food_type", "Flavour", "Colour"], + ) + expected_cars_df = pd.DataFrame( + np.array( + [ + ["Truck", "Chevrolet", "Grey"], + ["Convertible", "Mercedes", "Red"], + ["Van", "Volkswagen", "Blue"], + ] + ), + columns=["Car_type", "Brand", "Colour"], + ) + assert list(result.keys()) == tables + assert result["Foods"].eq(expected_foods_df, axis="columns").all(axis=None) + assert result["Cars"].eq(expected_cars_df, axis="columns").all(axis=None) -- cgit v1.2.3