8 files changed, 203 insertions, 64 deletions
diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml
deleted file mode 100644
index 09b8490..0000000
--- a/.github/workflows/deploy.yml
+++ /dev/null
@@ -1,43 +0,0 @@
-name: deploy-terraform
-
-on:
-  pull_request:
-    branches:
-      - main
-  push:
-    branches:
-      - main
-      
-
-jobs:
-  deploy-terraform:
-    if: github.ref == 'refs/heads/main'
-    name: Deploy Terraform
-    runs-on: ubuntu-latest
-    #needs: run-checks (must ref on-commit.yml file)
-    environment: production
-    steps:
-      - name: Checkout Repo
-        uses: actions/checkout@v4
-      
-      - name: Install Terraform
-        uses: hashicorp/setup-terraform@v3
-
-      - name: Configure AWS Credentials
-        uses: aws-actions/configure-aws-credentials@v4
-        with:
-          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
-          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-          aws-region: ${{ secrets.AWS_REGION }}
-
-      - name: Terraform Init
-        working-directory: terraform
-        run: terraform init
-
-      - name: Terraform Plan
-        working-directory: terraform
-        run: terraform plan
-
-      - name: Terraform Apply
-        working-directory: terraform
-        run: terraform apply --auto-approve
diff --git a/.github/workflows/dev-tests.yml b/.github/workflows/dev-tests.yml
new file mode 100644
index 0000000..d66f1c6
--- /dev/null
+++ b/.github/workflows/dev-tests.yml
@@ -0,0 +1,48 @@
+name: dev-tests
+
+on:
+  pull_request:
+    branches:
+      - development
+  push:
+    branches:
+      - development
+
+jobs:
+  validate-and-test:
+    name: Validate Terraform and Run Tests
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout Repo
+        uses: actions/checkout@v4
+      
+      - name: Install Terraform
+        uses: hashicorp/setup-terraform@v3
+
+      - name: Terraform Init
+        working-directory: terraform
+        run: terraform init -backend=false
+
+      - name: Terraform Validate
+        working-directory: terraform
+        run: terraform validate
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+
+      - name: Install Python dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install pytest pytest-testdox
+          pip install -r requirements.txt
+
+      - name: Run pytest
+        run: pytest tests/ -vvrP --testdox
+        continue-on-error: true
+        id: pytest
+
+      - name: Check on failures
+        if: steps.pytest.outcome == 'failure'
+        run: exit 1
diff --git a/README.md b/README.md
index cbb446c..7d7e499 100644
--- a/README.md
+++ b/README.md
@@ -21,7 +21,7 @@ The solution showcases our skills in:
 - Amazon Web Services (AWS)
 - Agile methodologies
 
-# Main Objective
+# Main Objectives
 
 Our goal is to create a reliable ETL (Extract, Transform, Load) pipeline that
 can:
@@ -48,4 +48,4 @@ others.
 TBA
 
 # Contributors
-TBA
-\ No newline at end of file
+TBA
diff --git a/requirements.txt b/requirements.txt
index 6f383f9..62ebbf4 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,6 @@
 asn1crypto==1.5.1
-boto3==1.34.159
-botocore==1.34.159
+boto3
+botocore
 certifi==2024.7.4
 cffi==1.17.0
 charset-normalizer==3.3.2
@@ -27,4 +27,6 @@ scramp==1.4.5
 six==1.16.0
 urllib3==2.2.2
 Werkzeug==3.0.3
-xmltodict==0.13.0
-\ No newline at end of file
+xmltodict==0.13.0
+s3fs 
+pandas
+\ No newline at end of file
diff --git a/src/fact-sales-order.py b/src/fact-sales-order.py
index a143889..30c958f 100644
--- a/src/fact-sales-order.py
+++ b/src/fact-sales-order.py
@@ -11,7 +11,7 @@ df_address = dict_of_df[address]
 df_staff = dict_of_df[staff]
 df_department = dict_of_df[department]
 df_counterparty = dict_of_df[counterparty]
-
+df_sales = dict_of_df[sales]
 
 # creates the dim_design dataframe
 dim_design = df_design["design_id", "design_name", "file_name", "file_location"]
@@ -27,28 +27,46 @@ currency_names = pd.DataFrame(data=d)
 join_currency = pd.merge(df_currency, currency_names, on="currency_name", how="outer")
 dim_currency = join_currency["currency_id", "currency_code", "currency_name"]
 
-# creates the dim_location dataframe
-# need to change address id to location id 
-"dim_location dataframe: (location_id, address_line_1, address_line_2, district, city, postal code, country, phone)"
-dim_location = df_address["address_id", "address_line_1", "address_line_2", "district", "city", "postal_code" "country", "phone"]
-
-
-
+# Using .map to add currency_name column and link it to the currency code
+# dim_currency = df_currency["currency_id", "currency_code"]
+# mappings = {
+#     "GBP": "Pound",
+#     "USD": "US Dollar",
+#     "EUR": "Euro"
+# }
+# dim_currency["currency_name"] = dim_currency["currency_code"].map(mappings)
 
 
 
+# creates the dim_location dataframe
+# need to change address id to location id 
+"dim_location dataframe: (location_id, address_line_1, address_line_2, district, city, postal code, country, phone)"
+df_address.rename(columns={"address_id": "location_id"})
+dim_location = df_address["location_id", "address_line_1", "address_line_2", "district", "city", "postal_code" "country", "phone"]
 
+# creates the dim_counterparty dataframe
+counterparty_address = pd.merge(df_counterparty, df_address, left_on="legal_address_id", right_on='address_id', how="outer")
+counterparty_address.rename(columns={"address_line_1": "counterparty_legal_address_line_1", "address_line_2": "counterparty_legal_address_line_2",
+                                     "district": "counterparty_legal_district", "city": "counterparty_legal_city", "postal_code": "counterparty_postal_code",
+                                     "country": "counterparty_legal_country", "phone": "counterparty_legal_phone_number"})
 
+dim_counterparty = df_counterparty["counterparty_id", "counterparty_legal_name", "counterparty_legal_address_line_1",
+                                   "counterparty_legal_address_line_2", "counterparty_legal_district", "counterpart_legal_city",
+                                   "counterparty_legal_postal_code", "counterparty_legal_country", "counterparty_legal_phone_number"]
 
-# creates the dim_counterparty dataframe
-# counterparty_address = pd.merge(df_counterparty, df_address, left_on="legal_address_id", right_on='address_id', how="outer")
+# creates the dim_date dataframe
+df_sales = df_sales["agreed_delivery_date"]
+df_sales["agreed_delivery_date"] = pd.to_datetime["agreed_delivery_date"]
+df_sales["year"] = df_sales["agreed_delivery_date"].dt.year
+df_sales["month"] = df_sales["agreed_delivery_date"].dt.month
+df_sales["day"] = df_sales["agreed_delivery_date"].dt.day
+df_sales["day_of_week"] = df_sales["agreed_delivery_date"].dt.dayofweek
+df_sales["day_name"] = df_sales["agreed_delivery_date"].dt.day_name()
+df_sales["month_name"] = df_sales["agreed_delivery_date"].dt.month_name()
+df_sales["quarter"] = df_sales["agreed_delivery_date"].dt.quarter()
 
-# dim_counterparty = df_counterparty["counterparty_id", "counterparty_legal_name", "counterparty_legal_address_line_1",
-#                                    "counterparty_legal_address_line_2", "counterparty_legal_district", "counterpart_legal_city",
-#                                    "counterparty_legal_postal_code", "counterparty_legal_country", "counterparty_legal_phone_number"]
+dim_date = ["date_id", "year", "month", "day", "day_of_week", "day_name", "month_name", "quarter"]   #series.dt.quarter()
 
 
 # TO DO:
-# dim_location
-# dim_date
 # fact_sales_order
 \ No newline at end of file
diff --git a/src/transform_lambda.py b/src/transform_lambda.py
index c6a8e60..9238180 100644
--- a/src/transform_lambda.py
+++ b/src/transform_lambda.py
@@ -1,2 +1,37 @@
-def lambda_handler():
+import json
+import boto3
+import re
+import io
+from io import StringIO
+import pandas as pd
+
+
+def lambda_handler(event, context):
     pass
+
+
+tables = [
+    "sales_order",
+    "transaction",
+    "payment",
+    "counterparty",
+    "address",
+    "staff",
+    "purchase_order",
+    "department",
+    "currency",
+    "design",
+    "payment_type",
+]
+
+
+def read_from_s3_subfolder_to_df(tables, bucket, client=boto3.client("s3")):
+    table_dfs = {}
+    for table in tables:
+        response = client.list_objects_v2(Bucket=bucket, Prefix=table)
+        list_of_keys = [
+            "s3://" + bucket + "/" + object["Key"] for object in response["Contents"]
+        ]
+        list_of_df = [pd.read_csv(key) for key in list_of_keys]
+        table_dfs[table] = pd.concat(list_of_df)
+    return table_dfs
diff --git a/tests/dummy_2.csv b/tests/dummy_2.csv
new file mode 100644
index 0000000..8abc9bf
--- /dev/null
+++ b/tests/dummy_2.csv
@@ -0,0 +1,5 @@
+Car_type,Brand,Colour
+Truck,Chevrolet,Grey
+Convertible,Mercedes,Red
+Van,Volkswagen,Blue
+
diff --git a/tests/test_transform_lambda.py b/tests/test_transform_lambda.py
new file mode 100644
index 0000000..5121905
--- /dev/null
+++ b/tests/test_transform_lambda.py
@@ -0,0 +1,74 @@
+from src.transform_lambda import read_from_s3_subfolder_to_df
+from moto import mock_aws
+import pytest
+import pandas as pd
+import os
+import boto3
+import numpy as np
+
+
+@pytest.fixture(scope="class")
+def aws_credentials():
+    os.environ["AWS_ACCESS_KEY_ID"] = "testing"
+    os.environ["AWS_SECRET_ACCESS_KEY"] = "testing"
+    os.environ["AWS_SECURIT_TOKEN"] = "testing"
+    os.environ["AWS_SESSION_TOKEN"] = "testing"
+    os.environ["AWS_DEFAULT_REGION"] = "eu-west-2"
+
+
+@pytest.fixture(scope="class")
+def s3_client(aws_credentials):
+    with mock_aws():
+        yield boto3.client("s3")
+
+
+class TestReadFromS3:
+    def test_returns_dictionary_with_correct_value_pair(self, s3_client):
+        s3_client.create_bucket(
+            Bucket="dummy_buc",
+            CreateBucketConfiguration={"LocationConstraint": "eu-west-2"},
+        )
+        s3_client.upload_file(
+            "tests/dummy_identical.csv",
+            "dummy_buc",
+            "Foods/2024/08/21/Foods_12:03:10.csv",
+        )
+        tables = ["Foods"]
+        result = read_from_s3_subfolder_to_df(
+            tables, bucket="dummy_buc", client=s3_client
+        )
+        print(result)
+        expected_df = pd.DataFrame(
+            np.array([["Vegetable", "Sour", "Green"], ["Berry", "Sweet", "Red"]]),
+            columns=["Food_type", "Flavour", "Colour"],
+        )
+        assert isinstance(result, dict)
+        assert list(result.keys())[0] == "Foods"
+        assert isinstance(result["Foods"], pd.DataFrame)
+        assert result["Foods"].eq(expected_df, axis="columns").all(axis=None)
+
+    def test_returns_dictionary_of_dataframes_for_multiple_tables(self, s3_client):
+        s3_client.upload_file(
+            "tests/dummy_2.csv", "dummy_buc", "Cars/2024/08/21/Cars_14:03:56.csv"
+        )
+        tables = ["Foods", "Cars"]
+        result = read_from_s3_subfolder_to_df(
+            tables, bucket="dummy_buc", client=s3_client
+        )
+        expected_foods_df = pd.DataFrame(
+            np.array([["Vegetable", "Sour", "Green"], ["Berry", "Sweet", "Red"]]),
+            columns=["Food_type", "Flavour", "Colour"],
+        )
+        expected_cars_df = pd.DataFrame(
+            np.array(
+                [
+                    ["Truck", "Chevrolet", "Grey"],
+                    ["Convertible", "Mercedes", "Red"],
+                    ["Van", "Volkswagen", "Blue"],
+                ]
+            ),
+            columns=["Car_type", "Brand", "Colour"],
+        )
+        assert list(result.keys()) == tables
+        assert result["Foods"].eq(expected_foods_df, axis="columns").all(axis=None)
+        assert result["Cars"].eq(expected_cars_df, axis="columns").all(axis=None)