diff options
Diffstat (limited to 'src/transform_lambda')
| -rw-r--r-- | src/transform_lambda/dataframes.py | 157 | ||||
| -rw-r--r-- | src/transform_lambda/transform_lambda.py | 5 |
2 files changed, 106 insertions, 56 deletions
diff --git a/src/transform_lambda/dataframes.py b/src/transform_lambda/dataframes.py index 2a46bd6..bf0556b 100644 --- a/src/transform_lambda/dataframes.py +++ b/src/transform_lambda/dataframes.py @@ -37,30 +37,28 @@ def create_fact_sales_order(dict_of_df): df_sales["agreed_payment_date"] = pd.to_datetime( df_sales["agreed_payment_date"], format="%Y-%m-%d" ) - df_sales = df_sales.drop(labels=["created_at", "last_updated"], axis=1) - - df_sales.reset_index(inplace=True) - return df_sales + fact_sales = df_sales.loc[:, + [ + "sales_order_id", + "created_date", + "created_time", + "last_updated_date", + "last_updated_time", + "staff_id", + "counterparty_id", + "units_sold", + "unit_price", + "currency_id", + "design_id", + "agreed_payment_date", + "agreed_delivery_date", + "agreed_delivery_location_id" + ], + ] + fact_sales.rename(columns={"staff_id": "sales_staff_id"}).reset_index(inplace=True) + - df_sales["created_date"] = df_sales["created_at"].astype("datetime64[ns]").dt.date - df_sales["created_time"] = ( - df_sales["created_at"].astype("datetime64[ns]").dt.floor("s").dt.time - ) - df_sales["last_updated_date"] = ( - df_sales["last_updated"].astype("datetime64[ns]").dt.date - ) - df_sales["last_updated_time"] = ( - df_sales["last_updated"].astype("datetime64[ns]").dt.floor("s").dt.time - ) - df_sales["agreed_delivery_date"] = pd.to_datetime( - df_sales["agreed_delivery_date"], format="%Y-%m-%d" - ) - df_sales["agreed_payment_date"] = pd.to_datetime( - df_sales["agreed_payment_date"], format="%Y-%m-%d" - ) - df_sales = df_sales.drop(labels=["created_at", "last_updated"], axis=1) - df_sales.reset_index(inplace=True) - return df_sales + return fact_sales # no test, same as fact_payment @@ -83,9 +81,27 @@ def create_fact_purchase_orders(dict_of_df): df_po["agreed_payment_date"] = pd.to_datetime( df_po["agreed_payment_date"], format="%Y-%m-%d" ) - df_po = df_po.drop(labels=["created_at", "last_updated"], axis=1) - df_po.reset_index(inplace=True) - return df_po + fact_purchase_order = df_po.loc[:, + [ + "purchase_order_id", + "created_date", + "created_time", + "last_updated_date", + "last_updated_time", + "staff_id", + "counterparty_id", + "item_code", + "item_quantity", + "item_unit_price", + "currency_id", + "agreed_delivery_date", + "agreed_payment_date", + "agreed_delivery_location_id" + ] + + ] + fact_purchase_order.reset_index(inplace=True) + return fact_purchase_order # test passed @@ -109,38 +125,57 @@ def create_fact_payment(dict_of_df): df_payment["payment_date"] = pd.to_datetime( df_payment["payment_date"], format="%Y-%m-%d" ) - df_payment = df_payment.drop(labels=["created_at", "last_updated"], axis=1) - - df_payment.reset_index(inplace=True) - return df_payment + fact_payment = df_payment.loc[:, + [ + "payment_id", + "created_date", + "created_time", + "last_updated_date", + "last_updated_time", + "transaction_id", + "counterparty_id", + "payment_amount", + "currency_id", + "payment_type_id", + "paid", + "payment_date" + ] + ] + fact_payment.reset_index(inplace=True) + return fact_payment # test passed def create_dim_transaction(dict_of_df): - df_transaction = dict_of_df["transaction"].drop( - labels=["created_at", "last_updated"], axis=1 - ) - return df_transaction + dim_transaction = dict_of_df["transaction"].loc[:, + [ + "transaction_id", + "transaction_type", + "sales_order_id", + "purchase_order_id" + ] + ] + return dim_transaction # test passed def create_dim_location(dict_of_df): - df_loc = ( - dict_of_df["address"] - .drop(labels=["created_at", "last_updated"], axis=1) + dim_location = ( + dict_of_df["address"].drop(labels=["created_at", "last_updated"], axis=1) .rename(columns={"address_id": "location_id"}) ) - return df_loc + return dim_location def create_dim_counterparty(dict_of_df): df_prefixed_address = ( dict_of_df["address"] .drop(labels=["created_at", "last_updated"], axis=1) + .rename(columns={"phone": "phone_number"}) .add_prefix("counterparty_legal_", axis=1) ) df_cp = pd.merge( @@ -149,15 +184,18 @@ def create_dim_counterparty(dict_of_df): left_on="legal_address_id", right_on="counterparty_legal_address_id", how="inner", - ) - df_cp.drop( - columns=[ + )#.dropna(inplace=True) + dim_counterparty = df_cp.drop( + labels=[ "legal_address_id", "counterparty_legal_address_id", - ], - inplace=True, + "created_at", + "last_updated", + "commercial_contact", + "delivery_contact" + ], axis=1 ) - return df_cp + return dim_counterparty # test passed @@ -179,6 +217,7 @@ def create_dim_date(dict_of_df): sr_date = pd.array(pd.concat(list_of_date_columns), dtype="datetime64[ns]") df_date = pd.DataFrame(data=sr_date, columns=["date_id"]) df_date.drop_duplicates(inplace=True) + # df_date.dropna(inplace=True) df_date["year"] = df_date["date_id"].dt.year df_date["month"] = df_date["date_id"].dt.month df_date["day"] = df_date["date_id"].dt.day @@ -210,10 +249,11 @@ def scrape_currency_names(): def create_dim_currency(dict_of_df, names=scrape_currency_names()): df_cur = dict_of_df["currency"].drop(labels=["created_at", "last_updated"], axis=1) - dim_cur = pd.merge( - df_cur, names, left_on="currency_code", right_on="currency_code", how="inner" + dim_currency = pd.merge( + df_cur, names, left_on="currency_code", right_on="currency_code", how="left" ) - return dim_cur + dim_currency.drop_duplicates(inplace=True) + return dim_currency # tests passed @@ -221,7 +261,12 @@ def create_dim_currency(dict_of_df, names=scrape_currency_names()): def create_dim_payment_type(dict_of_df): df_payment_type = dict_of_df["payment_type"] - dim_payment_type = df_payment_type.loc[:, ["payment_type_id", "payment_type_name"]] + dim_payment_type = df_payment_type.loc[:, + [ + "payment_type_id", + "payment_type_name" + ] + ] return dim_payment_type @@ -230,8 +275,13 @@ def create_dim_payment_type(dict_of_df): def create_dim_design(dict_of_df): df_design = dict_of_df["design"] - dim_design = df_design.loc[ - :, ["design_id", "design_name", "file_name", "file_location"] + dim_design = df_design.loc[:, + [ + "design_id", + "design_name", + "file_name", + "file_location" + ] ] return dim_design @@ -243,15 +293,14 @@ def create_dim_staff(dict_of_df): staff_department = pd.merge( dict_of_df["staff"], dict_of_df["department"], on="department_id", how="left" ) - dim_staff = staff_department.loc[ - :, + dim_staff = staff_department.loc[:, [ "staff_id", "first_name", "last_name", "department_name", "location", - "email_address", - ], + "email_address" + ] ] return dim_staff diff --git a/src/transform_lambda/transform_lambda.py b/src/transform_lambda/transform_lambda.py index 93b2284..1453c6c 100644 --- a/src/transform_lambda/transform_lambda.py +++ b/src/transform_lambda/transform_lambda.py @@ -42,7 +42,7 @@ TABLES = [ "department", "currency", "design", - "payment_type", + "payment_type" ] @@ -73,7 +73,8 @@ def lambda_handler(event, context): "fact_payment": create_fact_payment(dict_of_df), "dim_currency": create_dim_currency(dict_of_df), } - + print(immutable_df_dict.values()) + print(mutable_df_dict.values()) status = process_to_parquet_and_upload_to_s3( existing_s3_files, immutable_df_dict, mutable_df_dict, bucket ) |
