aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--.github/workflows/deploy.yml43
-rw-r--r--.github/workflows/dev-tests.yml48
-rw-r--r--README.md4
-rw-r--r--requirements.txt8
-rw-r--r--src/fact-sales-order.py48
-rw-r--r--src/transform_lambda.py37
-rw-r--r--tests/dummy_2.csv5
-rw-r--r--tests/test_transform_lambda.py74
8 files changed, 203 insertions, 64 deletions
diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml
deleted file mode 100644
index 09b8490..0000000
--- a/.github/workflows/deploy.yml
+++ /dev/null
@@ -1,43 +0,0 @@
-name: deploy-terraform
-
-on:
- pull_request:
- branches:
- - main
- push:
- branches:
- - main
-
-
-jobs:
- deploy-terraform:
- if: github.ref == 'refs/heads/main'
- name: Deploy Terraform
- runs-on: ubuntu-latest
- #needs: run-checks (must ref on-commit.yml file)
- environment: production
- steps:
- - name: Checkout Repo
- uses: actions/checkout@v4
-
- - name: Install Terraform
- uses: hashicorp/setup-terraform@v3
-
- - name: Configure AWS Credentials
- uses: aws-actions/configure-aws-credentials@v4
- with:
- aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
- aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
- aws-region: ${{ secrets.AWS_REGION }}
-
- - name: Terraform Init
- working-directory: terraform
- run: terraform init
-
- - name: Terraform Plan
- working-directory: terraform
- run: terraform plan
-
- - name: Terraform Apply
- working-directory: terraform
- run: terraform apply --auto-approve
diff --git a/.github/workflows/dev-tests.yml b/.github/workflows/dev-tests.yml
new file mode 100644
index 0000000..d66f1c6
--- /dev/null
+++ b/.github/workflows/dev-tests.yml
@@ -0,0 +1,48 @@
+name: dev-tests
+
+on:
+ pull_request:
+ branches:
+ - development
+ push:
+ branches:
+ - development
+
+jobs:
+ validate-and-test:
+ name: Validate Terraform and Run Tests
+ runs-on: ubuntu-latest
+ steps:
+ - name: Checkout Repo
+ uses: actions/checkout@v4
+
+ - name: Install Terraform
+ uses: hashicorp/setup-terraform@v3
+
+ - name: Terraform Init
+ working-directory: terraform
+ run: terraform init -backend=false
+
+ - name: Terraform Validate
+ working-directory: terraform
+ run: terraform validate
+
+ - name: Set up Python
+ uses: actions/setup-python@v5
+ with:
+ python-version: '3.11'
+
+ - name: Install Python dependencies
+ run: |
+ python -m pip install --upgrade pip
+ pip install pytest pytest-testdox
+ pip install -r requirements.txt
+
+ - name: Run pytest
+ run: pytest tests/ -vvrP --testdox
+ continue-on-error: true
+ id: pytest
+
+ - name: Check on failures
+ if: steps.pytest.outcome == 'failure'
+ run: exit 1
diff --git a/README.md b/README.md
index cbb446c..7d7e499 100644
--- a/README.md
+++ b/README.md
@@ -21,7 +21,7 @@ The solution showcases our skills in:
- Amazon Web Services (AWS)
- Agile methodologies
-# Main Objective
+# Main Objectives
Our goal is to create a reliable ETL (Extract, Transform, Load) pipeline that
can:
@@ -48,4 +48,4 @@ others.
TBA
# Contributors
-TBA \ No newline at end of file
+TBA
diff --git a/requirements.txt b/requirements.txt
index 6f383f9..62ebbf4 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,6 @@
asn1crypto==1.5.1
-boto3==1.34.159
-botocore==1.34.159
+boto3
+botocore
certifi==2024.7.4
cffi==1.17.0
charset-normalizer==3.3.2
@@ -27,4 +27,6 @@ scramp==1.4.5
six==1.16.0
urllib3==2.2.2
Werkzeug==3.0.3
-xmltodict==0.13.0 \ No newline at end of file
+xmltodict==0.13.0
+s3fs
+pandas \ No newline at end of file
diff --git a/src/fact-sales-order.py b/src/fact-sales-order.py
index a143889..30c958f 100644
--- a/src/fact-sales-order.py
+++ b/src/fact-sales-order.py
@@ -11,7 +11,7 @@ df_address = dict_of_df[address]
df_staff = dict_of_df[staff]
df_department = dict_of_df[department]
df_counterparty = dict_of_df[counterparty]
-
+df_sales = dict_of_df[sales]
# creates the dim_design dataframe
dim_design = df_design["design_id", "design_name", "file_name", "file_location"]
@@ -27,28 +27,46 @@ currency_names = pd.DataFrame(data=d)
join_currency = pd.merge(df_currency, currency_names, on="currency_name", how="outer")
dim_currency = join_currency["currency_id", "currency_code", "currency_name"]
-# creates the dim_location dataframe
-# need to change address id to location id
-"dim_location dataframe: (location_id, address_line_1, address_line_2, district, city, postal code, country, phone)"
-dim_location = df_address["address_id", "address_line_1", "address_line_2", "district", "city", "postal_code" "country", "phone"]
-
-
-
+# Using .map to add currency_name column and link it to the currency code
+# dim_currency = df_currency["currency_id", "currency_code"]
+# mappings = {
+# "GBP": "Pound",
+# "USD": "US Dollar",
+# "EUR": "Euro"
+# }
+# dim_currency["currency_name"] = dim_currency["currency_code"].map(mappings)
+# creates the dim_location dataframe
+# need to change address id to location id
+"dim_location dataframe: (location_id, address_line_1, address_line_2, district, city, postal code, country, phone)"
+df_address.rename(columns={"address_id": "location_id"})
+dim_location = df_address["location_id", "address_line_1", "address_line_2", "district", "city", "postal_code" "country", "phone"]
+# creates the dim_counterparty dataframe
+counterparty_address = pd.merge(df_counterparty, df_address, left_on="legal_address_id", right_on='address_id', how="outer")
+counterparty_address.rename(columns={"address_line_1": "counterparty_legal_address_line_1", "address_line_2": "counterparty_legal_address_line_2",
+ "district": "counterparty_legal_district", "city": "counterparty_legal_city", "postal_code": "counterparty_postal_code",
+ "country": "counterparty_legal_country", "phone": "counterparty_legal_phone_number"})
+dim_counterparty = df_counterparty["counterparty_id", "counterparty_legal_name", "counterparty_legal_address_line_1",
+ "counterparty_legal_address_line_2", "counterparty_legal_district", "counterpart_legal_city",
+ "counterparty_legal_postal_code", "counterparty_legal_country", "counterparty_legal_phone_number"]
-# creates the dim_counterparty dataframe
-# counterparty_address = pd.merge(df_counterparty, df_address, left_on="legal_address_id", right_on='address_id', how="outer")
+# creates the dim_date dataframe
+df_sales = df_sales["agreed_delivery_date"]
+df_sales["agreed_delivery_date"] = pd.to_datetime["agreed_delivery_date"]
+df_sales["year"] = df_sales["agreed_delivery_date"].dt.year
+df_sales["month"] = df_sales["agreed_delivery_date"].dt.month
+df_sales["day"] = df_sales["agreed_delivery_date"].dt.day
+df_sales["day_of_week"] = df_sales["agreed_delivery_date"].dt.dayofweek
+df_sales["day_name"] = df_sales["agreed_delivery_date"].dt.day_name()
+df_sales["month_name"] = df_sales["agreed_delivery_date"].dt.month_name()
+df_sales["quarter"] = df_sales["agreed_delivery_date"].dt.quarter()
-# dim_counterparty = df_counterparty["counterparty_id", "counterparty_legal_name", "counterparty_legal_address_line_1",
-# "counterparty_legal_address_line_2", "counterparty_legal_district", "counterpart_legal_city",
-# "counterparty_legal_postal_code", "counterparty_legal_country", "counterparty_legal_phone_number"]
+dim_date = ["date_id", "year", "month", "day", "day_of_week", "day_name", "month_name", "quarter"] #series.dt.quarter()
# TO DO:
-# dim_location
-# dim_date
# fact_sales_order \ No newline at end of file
diff --git a/src/transform_lambda.py b/src/transform_lambda.py
index c6a8e60..9238180 100644
--- a/src/transform_lambda.py
+++ b/src/transform_lambda.py
@@ -1,2 +1,37 @@
-def lambda_handler():
+import json
+import boto3
+import re
+import io
+from io import StringIO
+import pandas as pd
+
+
+def lambda_handler(event, context):
pass
+
+
+tables = [
+ "sales_order",
+ "transaction",
+ "payment",
+ "counterparty",
+ "address",
+ "staff",
+ "purchase_order",
+ "department",
+ "currency",
+ "design",
+ "payment_type",
+]
+
+
+def read_from_s3_subfolder_to_df(tables, bucket, client=boto3.client("s3")):
+ table_dfs = {}
+ for table in tables:
+ response = client.list_objects_v2(Bucket=bucket, Prefix=table)
+ list_of_keys = [
+ "s3://" + bucket + "/" + object["Key"] for object in response["Contents"]
+ ]
+ list_of_df = [pd.read_csv(key) for key in list_of_keys]
+ table_dfs[table] = pd.concat(list_of_df)
+ return table_dfs
diff --git a/tests/dummy_2.csv b/tests/dummy_2.csv
new file mode 100644
index 0000000..8abc9bf
--- /dev/null
+++ b/tests/dummy_2.csv
@@ -0,0 +1,5 @@
+Car_type,Brand,Colour
+Truck,Chevrolet,Grey
+Convertible,Mercedes,Red
+Van,Volkswagen,Blue
+
diff --git a/tests/test_transform_lambda.py b/tests/test_transform_lambda.py
new file mode 100644
index 0000000..5121905
--- /dev/null
+++ b/tests/test_transform_lambda.py
@@ -0,0 +1,74 @@
+from src.transform_lambda import read_from_s3_subfolder_to_df
+from moto import mock_aws
+import pytest
+import pandas as pd
+import os
+import boto3
+import numpy as np
+
+
+@pytest.fixture(scope="class")
+def aws_credentials():
+ os.environ["AWS_ACCESS_KEY_ID"] = "testing"
+ os.environ["AWS_SECRET_ACCESS_KEY"] = "testing"
+ os.environ["AWS_SECURIT_TOKEN"] = "testing"
+ os.environ["AWS_SESSION_TOKEN"] = "testing"
+ os.environ["AWS_DEFAULT_REGION"] = "eu-west-2"
+
+
+@pytest.fixture(scope="class")
+def s3_client(aws_credentials):
+ with mock_aws():
+ yield boto3.client("s3")
+
+
+class TestReadFromS3:
+ def test_returns_dictionary_with_correct_value_pair(self, s3_client):
+ s3_client.create_bucket(
+ Bucket="dummy_buc",
+ CreateBucketConfiguration={"LocationConstraint": "eu-west-2"},
+ )
+ s3_client.upload_file(
+ "tests/dummy_identical.csv",
+ "dummy_buc",
+ "Foods/2024/08/21/Foods_12:03:10.csv",
+ )
+ tables = ["Foods"]
+ result = read_from_s3_subfolder_to_df(
+ tables, bucket="dummy_buc", client=s3_client
+ )
+ print(result)
+ expected_df = pd.DataFrame(
+ np.array([["Vegetable", "Sour", "Green"], ["Berry", "Sweet", "Red"]]),
+ columns=["Food_type", "Flavour", "Colour"],
+ )
+ assert isinstance(result, dict)
+ assert list(result.keys())[0] == "Foods"
+ assert isinstance(result["Foods"], pd.DataFrame)
+ assert result["Foods"].eq(expected_df, axis="columns").all(axis=None)
+
+ def test_returns_dictionary_of_dataframes_for_multiple_tables(self, s3_client):
+ s3_client.upload_file(
+ "tests/dummy_2.csv", "dummy_buc", "Cars/2024/08/21/Cars_14:03:56.csv"
+ )
+ tables = ["Foods", "Cars"]
+ result = read_from_s3_subfolder_to_df(
+ tables, bucket="dummy_buc", client=s3_client
+ )
+ expected_foods_df = pd.DataFrame(
+ np.array([["Vegetable", "Sour", "Green"], ["Berry", "Sweet", "Red"]]),
+ columns=["Food_type", "Flavour", "Colour"],
+ )
+ expected_cars_df = pd.DataFrame(
+ np.array(
+ [
+ ["Truck", "Chevrolet", "Grey"],
+ ["Convertible", "Mercedes", "Red"],
+ ["Van", "Volkswagen", "Blue"],
+ ]
+ ),
+ columns=["Car_type", "Brand", "Colour"],
+ )
+ assert list(result.keys()) == tables
+ assert result["Foods"].eq(expected_foods_df, axis="columns").all(axis=None)
+ assert result["Cars"].eq(expected_cars_df, axis="columns").all(axis=None)
git.ajschof.me — hosted by ajschofield — powered by cgit