diff options
| -rw-r--r-- | src/dataframes.py | 234 | ||||
| -rw-r--r-- | tests/test_fact_sales_order.py | 168 |
2 files changed, 233 insertions, 169 deletions
diff --git a/src/dataframes.py b/src/dataframes.py index 18e1fac..f2cae5d 100644 --- a/src/dataframes.py +++ b/src/dataframes.py @@ -1,11 +1,5 @@ import pandas as pd from bs4 import BeautifulSoup -# from src.transform_lambda import read_from_s3_subfolder_to_df, tables -# from src.extract_lambda import extract_bucket -# import json -# import boto3 -# import re -# from datetime import datetime as dt import requests # Table names: @@ -22,19 +16,15 @@ import requests # dim_counterparty -def create_dim_transaction(dict_of_df): - pass - - def create_fact_sales_order(dict_of_df): df_sales = dict_of_df["sales_order"] df_sales.index.name = "sales_record_id" df_sales["created_date"] = pd.to_datetime(df_sales["created_at"]).dt.date df_sales["created_time"] = pd.to_datetime(df_sales["created_at"]).dt.time - df_sales["last_updated_date"] = pd.to_datetime(df_sales["last_updated"]).dt.date - df_sales["last_updated_time"] = pd.to_datetime(df_sales["last_updated"]).dt.time - pd.merge(dict_of_df["staff"], df_sales["sales_staff_id"], on="staff_id", how="left") - # df_sales.rename(columns={"staff_id": "sales_staff_id"}) + df_sales["last_updated_date"] = pd.to_datetime( + df_sales["last_updated"]).dt.date + df_sales["last_updated_time"] = pd.to_datetime( + df_sales["last_updated"]).dt.time fact_sales_order = df_sales.loc[ :, [ @@ -81,10 +71,14 @@ def create_fact_purchase_orders(dict_of_df): def create_fact_payment(dict_of_df): df_payment = dict_of_df["payment"] df_payment.index.name = "payment_record_id" - df_payment["created_date"] = pd.to_datetime(df_payment["created_at"]).dt.date - df_payment["created_time"] = pd.to_datetime(df_payment["created_at"]).dt.time - df_payment["last_updated_date"] = pd.to_datetime(df_payment["last_updated"]).dt.date - df_payment["last_updated_time"] = pd.to_datetime(df_payment["last_updated"]).dt.time + df_payment["created_date"] = pd.to_datetime( + df_payment["created_at"]).dt.date + df_payment["created_time"] = pd.to_datetime( + df_payment["created_at"]).dt.time + df_payment["last_updated_date"] = pd.to_datetime( + df_payment["last_updated"]).dt.date + df_payment["last_updated_time"] = pd.to_datetime( + df_payment["last_updated"]).dt.time fact_payment = df_payment.loc[ :, [ @@ -106,125 +100,126 @@ def create_fact_payment(dict_of_df): return fact_payment -# dim_location from address --> drops 2 columns +# test passed + +def create_dim_transaction(dict_of_df): + df_transaction = dict_of_df["transaction"].drop( + labels=["created_at", "last_updated"], axis=1 + ) + return df_transaction + +# test passed def create_dim_location(dict_of_df): df_loc = ( dict_of_df["address"] .drop(labels=["created_at", "last_updated"], axis=1) .rename(columns={"address_id": "location_id"}) - .set_index("location_id") - ) return df_loc -# dim_counterparty from address and counterparty - def create_dim_counterparty(dict_of_df): - df_prefixed_address = dict_of_df["address"].add_prefix( + df_prefixed_address=dict_of_df["address"].add_prefix( "counterparty_legal_", axis=1 ) - df_cp = pd.merge( + df_cp=pd.merge( dict_of_df["counterparty"], df_prefixed_address, left_on="legal_address_id", - right_on="address_id", + right_on="counterparty_legal_address_id", how="outer", - ).set_index("counterparty_id") + ) + df_cp.drop( + columns=["legal_address_id", "counterparty_legal_address_id"], inplace=True + ) return df_cp -# dim_date from purchase_order -def create_dim_date(dict_of_df): - sr_date = pd.concat( - [ - dict_of_df["created_date"], - dict_of_df["last_updated_date"], - dict_of_df["agreed_delivery_date"], - dict_of_df["agreed_payment_date"], - ] - ).sort() - df_date = pd.DataFrame(sr_date, columns="date_id") - df_date["year"] = df_date["date_id"].dt.year - df_date["month"] = df_date["date_id"].dt.month - df_date["day"] = df_date["date_id"].dt.day - df_date["day_of_week"] = df_date["date_id"].dt.dayofweek - df_date["day_name"] = df_date["date_id"].dt.day_name - df_date["month_name"] = df_date["date_id"].dt.month_name - df_date["quarter"] = df_date["date_id"].dt.quarter - df_date.set_index("date_id") +# test passed +def create_dim_date(dict_of_df): + fact_dfs=[ + create_fact_payment(dict_of_df), + create_fact_purchase_orders(dict_of_df), + create_fact_sales_order(dict_of_df), + ] + date_col_names=[ + col_name for col_name in list(fact_dfs[0].columns) if "date" in col_name + ] + list_of_date_columns=[] + for df in fact_dfs: + for col in date_col_names: + list_of_date_columns.append(df[col]) + sr_date=pd.array(pd.concat(list_of_date_columns), dtype="datetime64[ns]") + df_date=pd.DataFrame(data=sr_date, columns=["date_id"]) + df_date.drop_duplicates(inplace=True) + df_date["year"]=df_date["date_id"].dt.year + df_date["month"]=df_date["date_id"].dt.month + df_date["day"]=df_date["date_id"].dt.day + df_date["day_of_week"]=df_date["date_id"].dt.dayofweek + df_date["day_name"]=df_date["date_id"].dt.day_name() + df_date["month_name"]=df_date["date_id"].dt.month_name() + df_date["quarter"]=df_date["date_id"].dt.quarter + return df_date + + +# tests passed def scrape_currency_names(): - response = requests.get("https://www.xe.com/currency/").content - soup = BeautifulSoup(response, "html.parser") - currency = [ + response=requests.get("https://www.xe.com/currency/").content + soup=BeautifulSoup(response, "html.parser") + currency=[ item.text for item in soup.findAll("a", attrs={"class": "sc-299dec64-6 fZPTSw"}) ] - sr = pd.Series(currency) - df_cur = sr.str.split(pat=" - ", expand=True).rename( + sr=pd.Series(currency) + df_cur=sr.str.split(pat=" - ", expand=True).rename( {0: "currency_code", 1: "currency_name"}, axis=1 ) return df_cur +# tests passed + def create_dim_currency(dict_of_df, names=scrape_currency_names()): - df_cur = dict_of_df["currency"].drop(labels=["created_at", "last_updated"], axis=1) - dim_cur = pd.merge( + df_cur=dict_of_df["currency"].drop( + labels=["created_at", "last_updated"], axis=1) + dim_cur=pd.merge( df_cur, names, left_on="currency_code", right_on="currency_code", how="inner" - ).set_index("currency_id") + ) return dim_cur +# tests passed + def create_dim_payment_type(dict_of_df): - df_payment_type = dict_of_df["payment_type"] - dim_payment_type = df_payment_type.loc[:, ["payment_type_id", "payment_type_name"]] + df_payment_type=dict_of_df["payment_type"] + dim_payment_type=df_payment_type.loc[:, [ + "payment_type_id", "payment_type_name"]] return dim_payment_type -def create_fact_payment(dict_of_df): - df_payment = dict_of_df["payment"] - df_payment.index.name = "payment_record_id" - df_payment["created_date"] = pd.to_datetime(df_payment["created_at"]).dt.date - df_payment["created_time"] = pd.to_datetime(df_payment["created_at"]).dt.time - df_payment["last_updated_date"] = pd.to_datetime(df_payment["last_updated"]).dt.date - df_payment["last_updated_time"] = pd.to_datetime(df_payment["last_updated"]).dt.time - fact_payment = df_payment.loc[ - :, - [ - "payment_record_id", - "payment_id", - "created_date", - "created_time", - "last_updated_date", - "last_updated_time", - "transaction_id", - "counterparty_id", - "payment_amount", - "currency_id", - "payment_type_id", - "paid", - "payment_date", - ], - ] - return fact_payment + +# tests passed def create_dim_design(dict_of_df): - df_design = dict_of_df["design"] - dim_design = df_design.loc[ + df_design=dict_of_df["design"] + dim_design=df_design.loc[ :, ["design_id", "design_name", "file_name", "file_location"] ] return dim_design + +# tests passed + def create_dim_staff(dict_of_df): - staff_department = pd.merge( + staff_department=pd.merge( dict_of_df["staff"], dict_of_df["department"], on="department_id", how="left" ) - dim_staff = staff_department.loc[ + dim_staff=staff_department.loc[ :, [ "staff_id", @@ -236,70 +231,3 @@ def create_dim_staff(dict_of_df): ], ] return dim_staff - - -def create_dim_currency(dict_of_df): - df_currency = dict_of_df["currency"] - dim_currency = df_currency.loc[:, ["currency_id", "currency_code"]] - mappings = {"GBP": "Pound", "USD": "US Dollar", "EUR": "Euro"} - dim_currency["currency_name"] = dim_currency["currency_code"].map(mappings) - return dim_currency - - -def create_dim_date(dict_of_df): - df_sales = dict_of_df["sales"] - df_sales = df_sales.loc[:, ["agreed_delivery_date"]] - df_sales["agreed_delivery_date"] = pd.to_datetime["agreed_delivery_date"] - df_sales["year"] = df_sales["agreed_delivery_date"].dt.year - df_sales["month"] = df_sales["agreed_delivery_date"].dt.month - df_sales["day"] = df_sales["agreed_delivery_date"].dt.day - df_sales["day_of_week"] = df_sales["agreed_delivery_date"].dt.dayofweek - df_sales["day_name"] = df_sales["agreed_delivery_date"].dt.day_name() - df_sales["month_name"] = df_sales["agreed_delivery_date"].dt.month_name() - df_sales["quarter"] = df_sales["agreed_delivery_date"].dt.quarter() - dim_date = [ - "date_id", - "year", - "month", - "day", - "day_of_week", - "day_name", - "month_name", - "quarter", - ] # series.dt.quarter() - return dim_date - - -# TO DO: -# complete dim_date from merged fact table -# merge dataframes into one dataframe -# remove duplicates -# test dim_date and fact_sales_order - - -def create_sales_star_schema(dict_of_df): - dim_design = create_dim_design(dict_of_df) - dim_staff = create_dim_staff(dict_of_df) - dim_currency = create_dim_currency(dict_of_df) - dim_date = create_dim_date(dict_of_df) - - fact_sales_order = create_fact_sales_order(dict_of_df) - - fact_sales_order = fact_sales_order.merge(dim_design, on="design_id", how="left") - fact_sales_order = fact_sales_order.merge( - dim_staff, left_on="sales_staff_id", right_on="staff_id", how="left" - ) - fact_sales_order = fact_sales_order.merge( - dim_currency, on="currency_id", how="left" - ) - fact_sales_order = fact_sales_order.merge( - dim_date, left_on="agreed_delivery_date", right_on="date_id", how="left" - ) - - return fact_sales_order - - -def create_dim_payment_type(dict_of_df): - df_payment_type = dict_of_df["payment_type"] - dim_payment_type = df_payment_type.loc[:, ["payment_type_id", "payment_type_name"]] - return dim_payment_type diff --git a/tests/test_fact_sales_order.py b/tests/test_fact_sales_order.py index 48426b4..a245379 100644 --- a/tests/test_fact_sales_order.py +++ b/tests/test_fact_sales_order.py @@ -1,10 +1,7 @@ +from src.dataframes import * import pandas as pd -from fact_sales_order import create_dim_design, create_dim_staff, create_dim_currency -from src.fact_sales_order import ( - create_dim_design, - create_dim_staff, - create_dim_currency, -) +from unittest.mock import patch +from datetime import datetime as dt class TestCreateDimDesign: @@ -89,22 +86,161 @@ class TestCreateDimStaff: assert result.equals(expected_result) -class TestCreateDimCurrency: - def test_dim_currency_returns_dataframe(self): - d = {"currency_id": [1, 2, 3], "currency_code": ["USD", "EUR", "GBP"]} - test_df = {"currency": pd.DataFrame(data=d)} - result = create_dim_currency(test_df) +class TestCreatePaymentType: + def test_create_dim_payment_type_returns_correct_columns_and_values(self): + d = {"payment_type_id": ["Hello", "Bye"], "payment_type_name": ["Hello", "Bye"]} + test_df = {"payment_type": pd.DataFrame(data=d)} + result = create_dim_payment_type(test_df) + expected_columns = ["payment_type_id", "payment_type_name"] + expected_d = { + "payment_type_id": ["Hello", "Bye"], + "payment_type_name": ["Hello", "Bye"], + } + expected_df = pd.DataFrame(data=expected_d) assert isinstance(result, pd.DataFrame) + assert list(result.columns) == expected_columns + assert result.equals(expected_df) + +class TestCreateDimCounterparty: + def test_create_dim_counterparty_type_returns_correct_columns_and_object(self): + data_l = pd.DataFrame( + data={ + "counterparty_id": ["Hello", "Bye"], + "counterparty_legal_name": ["Hello", "Bye"], + "commercial_contact": ["Hello", "Bye"], + "legal_address_id": ["bond street", "regent street"], + } + ) + data_a = pd.DataFrame( + data={ + "address_id": ["bond street", "regent street"], + "postcode": [98365, 93753], + } + ) + test_df = {"address": data_a, "counterparty": data_l} + result = create_dim_counterparty(test_df) + + expected_columns = [ + "counterparty_id", + "counterparty_legal_name", + "commercial_contact", + "counterparty_legal_postcode", + ] + print(data_l) + print(data_a) + assert isinstance(result, pd.DataFrame) + assert list(result.columns) == expected_columns + + +class TestCreateDimCurrency: def test_dim_currency_returns_columns_and_values(self): - d = {"currency_id": [1, 2, 3], "currency_code": ["USD", "EUR", "GBP"]} + nones = [None, None, None] + d = { + "currency_id": [1, 2, 3], + "currency_code": ["USD", "EUR", "GBP"], + "created_at": nones, + "last_updated": nones, + } test_df = {"currency": pd.DataFrame(data=d)} - result = create_dim_currency(test_df) + scraper_output = pd.DataFrame( + { + "currency_code": ["RUS", "USD", "PHP", "GBP", "EUR"], + "currency_name": ["Rubble", "US Dollar", "Peso", "Pound", "Euro"], + } + ) + result = create_dim_currency(test_df, names=scraper_output).sort_values( + by="currency_code", axis=0 + ) expected_d = { "currency_id": [1, 2, 3], "currency_code": ["USD", "EUR", "GBP"], "currency_name": ["US Dollar", "Euro", "Pound"], } - expected_df = pd.DataFrame(data=expected_d) - expected_result = expected_df.copy() - assert result.equals(expected_result) + expected_df = pd.DataFrame(data=expected_d).sort_values( + by="currency_code", axis=0 + ) + assert isinstance(result, pd.DataFrame) + assert result.equals(expected_df) + + def test_scrape_currency_names_returns_dataframe_with_correct_collumns(self): + result = scrape_currency_names() + assert isinstance(result, pd.DataFrame) + assert list(result.columns) == ["currency_code", "currency_name"] + + +class TestCreateDimDate: + def test_returns_required_columns(self): + df_one = pd.DataFrame( + data={ + "updated_date": dt(2020, 5, 17), + "created_date": dt(2021, 5, 13), + "not_dat": None, + }, + index=[0], + ) + df_two = pd.DataFrame( + data={"updated_date": dt(2020, 5, 17), "created_date": dt(2021, 9, 13)}, + index=[0], + ) + df_three = pd.DataFrame( + data={"updated_date": dt(2022, 5, 17), "created_date": dt(2023, 5, 13)}, + index=[0], + ) + expected_df = pd.DataFrame( + data=[ + [dt(2020, 5, 17), 2020, 5, 17, 6, "Sunday", "May", 2], + [dt(2021, 5, 13), 2021, 5, 13, 3, "Thursday", "May", 2], + [dt(2021, 9, 13), 2021, 9, 13, 0, "Monday", "September", 3], + [dt(2022, 5, 17), 2022, 5, 17, 1, "Tuesday", "May", 2], + [dt(2023, 5, 13), 2023, 5, 13, 5, "Saturday", "May", 2], + ], + columns=[ + "date_id", + "year", + "month", + "day", + "day_of_week", + "day_name", + "month_name", + "quarter", + ], + ) + with patch("src.dataframes.create_fact_payment") as mock_fp: + with patch("src.dataframes.create_fact_purchase_orders") as mock_fpo: + with patch("src.dataframes.create_fact_sales_order") as mock_fso: + mock_fp.return_value = df_one + mock_fpo.return_value = df_two + mock_fso.return_value = df_three + result = create_dim_date({"dum": 0}) + result.reset_index(inplace=True, drop=True) + assert result.eq(expected_df, axis="columns").all(axis=None) + + +class TestCreateDimLocation: + def test_returns_correct_columns_lo(self): + dict_df = { + "address": pd.DataFrame( + data=[["some_time", "some_other_time", 1, "SE18 9QO"]], + columns=["created_at", "last_updated", "address_id", "postal_code"], + ) + } + result = create_dim_location(dict_df) + assert list(result.columns) == ["location_id", "postal_code"] + + +class TestCreateDimTransaction: + def test_returns_correct_columns_tr(self): + dict_df = { + "transaction": pd.DataFrame( + data=[["some_time", "some_other_time", 1, "SE18 9QO"]], + columns=[ + "created_at", + "last_updated", + "transaction_id", + "some_other_id", + ], + ) + } + result = create_dim_transaction(dict_df) + assert list(result.columns) == ["transaction_id", "some_other_id"] |
