From 610261fec06ab3b6106465960d6935dd9df85df0 Mon Sep 17 00:00:00 2001 From: Ang Bel Date: Fri, 16 Aug 2024 09:46:53 +0100 Subject: Secrets manager integration into the extract lambda reviewed. --- src/extract_lambda.py | 29 +++++++++-------- tests/test_secrets_manager.py | 73 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 89 insertions(+), 13 deletions(-) create mode 100644 tests/test_secrets_manager.py diff --git a/src/extract_lambda.py b/src/extract_lambda.py index fb2d7e8..3055f63 100644 --- a/src/extract_lambda.py +++ b/src/extract_lambda.py @@ -1,5 +1,4 @@ -from pg8000.native import Connection, DatabaseError, InterfaceError -from dotenv import dotenv_values +from pg8000.native import Connection, InterfaceError import boto3 import csv from botocore.exceptions import ClientError @@ -42,31 +41,35 @@ def lambda_handler(event, context): 'statusCode': 200, 'body': json.dumps('CSV files processed and uploaded successfully.') } - except Exception as e: logger.error(f'Error: {e}') return { 'statusCode': 500, 'body': json.dumps('Internal server error.') } - finally: - if db: db.close() -def get_config(path: str = ".env") -> dict: - return dotenv_values(path) +def retrieve_secrets(sm_client=boto3.client('secretsmanager'), secret_name='bentley-secrets'): + try: + response = sm_client.get_secret_value(SecretId=secret_name) + if 'SecretString' in response: + secret = json.loads(response['SecretString']) + return secret + except ClientError as e: + logger.error(f'Could not retrieve secrets: {e}') + raise e def connect_to_database() -> Connection: try: - config = get_config() - host = config["host"] - port = config["port"] - user = config["user"] - password = config["password"] - database = config["database"] + secrets = retrieve_secrets() + host = secrets["host"] + port = secrets["port"] + user = secrets["user"] + password = secrets["password"] + database = secrets["database"] return Connection( database=database, diff --git a/tests/test_secrets_manager.py 
b/tests/test_secrets_manager.py new file mode 100644 index 0000000..a30be86 --- /dev/null +++ b/tests/test_secrets_manager.py @@ -0,0 +1,73 @@ +from src.secrets_manager import sm_client, retrieve_secrets +import boto3 +import botocore.exceptions +from moto import mock_aws +import json +import pytest +import os + +@pytest.fixture(scope='function') +def aws_credentials(): + """Mocked AWS Credentials for moto.""" + os.environ["AWS_ACCESS_KEY_ID"] = "testing" + os.environ["AWS_SECRET_ACCESS_KEY"] = "testing" + os.environ["AWS_SECURITY_TOKEN"] = "testing" + os.environ["AWS_SESSION_TOKEN"] = "testing" + os.environ["AWS_DEFAULT_REGION"] = "eu-west-2" + +@pytest.fixture(scope='function') +def mock_sm_client(aws_credentials): + with mock_aws(): + yield boto3.client("secretsmanager") + +@pytest.fixture(scope='function') +def mock_store_secret(mock_sm_client): + secret = { + "cohort_id": "test_cohort_id", + "user": "test_user_id", + "password": "test_password", + "host": "test_host", + "database": "test_database", + "port": "test_port" + } + + secret_name = "test_secret" + + response = mock_sm_client.create_secret(Name=secret_name, SecretString=json.dumps(secret)) + + return response + +def test_retrieves_secrets_returns_dictionary(mock_sm_client, mock_store_secret): + secret_name = "test_secret" + + result = retrieve_secrets(mock_sm_client, secret_name) + + assert isinstance(result, dict) + +def test_retrieves_secrets_returns_correct_keys_and_values(mock_sm_client, mock_store_secret): + + secret_name = "test_secret" + + result = retrieve_secrets(mock_sm_client, secret_name) + + assert result["cohort_id"] == "test_cohort_id" + assert result["user"] == "test_user_id" + assert result["password"] == "test_password" + assert result["host"] == "test_host" + assert result["database"] == "test_database" + assert result["port"] == "test_port" + +def test_retrieves_secrets_raises_error_if_secret_name_incorrect_data_type(mock_sm_client): + secret_name = [1, 2, 3] + + + with 
pytest.raises(botocore.exceptions.ParamValidationError) as error: + retrieve_secrets(mock_sm_client, secret_name) + + +def test_retrieves_secrets_raises_error_if_secret_name_does_not_exist(mock_sm_client, mock_store_secret): + secret_name = 'test_secret_2' + + + with pytest.raises(botocore.exceptions.ClientError) as error: + retrieve_secrets(mock_sm_client, secret_name) \ No newline at end of file -- cgit v1.2.3 From 938ddda10ff2f7d5360ca0a939fa2f16d6beb09d Mon Sep 17 00:00:00 2001 From: Ang Bel Date: Fri, 16 Aug 2024 10:01:06 +0100 Subject: extract bucket name retrieval helper function and replace the bucket name placeholders --- src/extract_lambda.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/extract_lambda.py b/src/extract_lambda.py index 3055f63..f4c0c1d 100644 --- a/src/extract_lambda.py +++ b/src/extract_lambda.py @@ -82,9 +82,12 @@ def connect_to_database() -> Connection: logger.error(f'Interface error: {i}') raise DBConnectionException("Failed to connect to database") +def extract_bucket(client=boto3.client('s3')): + response = client.list_buckets() + extract_bucket_filter = [bucket['Name'] for bucket in response['Buckets'] if 'extract' in bucket['Name']] + return extract_bucket_filter[0] - -def list_existing_s3_files(bucket_name='extract_bucket', client=boto3.client('s3')): +def list_existing_s3_files(bucket_name=extract_bucket(), client=boto3.client('s3')): """Creates a dictionary and populates it with the results of listing the contents of the s3 bucket, then returns the populated dictionary @@ -93,7 +96,7 @@ def list_existing_s3_files(bucket_name='extract_bucket', client=boto3.client('s3 existing_files = {} try: - response = client.list_objects_v2(Bucket='extract_bucket') + response = client.list_objects_v2(Bucket=bucket_name) if 'Contents' in response: for obj in response['Contents']: @@ -150,7 +153,7 @@ def process_and_upload_tables(db, existing_files, client=boto3.client('s3')): ## END OF NEW CODE if 
existing_files[latest_s3_object_key] != new_csv_content: try: - client.upload_file(csv_file_path, 'extract_bucket', s3_key) + client.upload_file(csv_file_path, extract_bucket(), s3_key) logger.info(f"Uploaded {s3_key} to S3.") except ClientError as e: logger.error(f'Error uploading to S3: {e}') -- cgit v1.2.3 From c937a7e098d818dadbc769b3c9eb9fd93cc05af2 Mon Sep 17 00:00:00 2001 From: Alex Schofield Date: Fri, 16 Aug 2024 10:01:28 +0100 Subject: docs: rm DEVNOTES.md basically redundant now --- DEVNOTES.md | 100 ------------------------------------------------------------ 1 file changed, 100 deletions(-) delete mode 100644 DEVNOTES.md diff --git a/DEVNOTES.md b/DEVNOTES.md deleted file mode 100644 index 00b4ddd..0000000 --- a/DEVNOTES.md +++ /dev/null @@ -1,100 +0,0 @@ -# Workflow - -## References - -https://nvie.com/posts/a-successful-git-branching-model/ \ -https://learn.microsoft.com/en-us/azure/devops/repos/git/merging-with-squash?view=azure-devops - - -## Branching - -*Based off GitFlow but slightly modified* - -- There are two main branches - - `main` - production-ready code - - `development` - integration branch for features - - `staging` - represents the current staging state -- In addition, there are additional branches - - Feature branches - for new features and non-urgent bugfixes - - Hotfix branches - probably won't be used but for critical bugs in production (this is what testing should prevent) - - Release branches - for preparation of production releases - -- Feature branches - e.g. `feature/short-description` -- Bugfix branches - e.g. `bugfix/short-description` -- Hotfix branches - e.g. `hotfix/short-description` -- Release branches - e.g. `release/vX.Y.Z` - -### Examples -``` -feature/add-data-extractor -bugfix/fix-s3-upload-error -hotfix/security-patch -release/v1.0.0 -``` - -## Environments - -1. Development - where active development and initial testing occur -2. Staging - for integration testing and final checks before production -3. 
Production - live and stable environment - -## Deployment - -1. `main` - represents the current production state -2. `develop` - represents the integration branch for features and non-urgent fixes -3. `staging` - represents the current staging state - -## Staging Flow - -1. Create feature branches from `develop` & merge completed features back into `develop` -2. When the `develop` branch is ready for testing, create a `staging` branch from `develop` -3. Deploy the `staging` branch to the staging environment and perform our unit-tests -4. If staging tests pass, create a `release/vX.Y.Z` branch from `staging` -5. Make any final adjustments in the `release/vX.Y.Z` branch -6. Once we have approved the changes in the `release/vX.Y.Z` branch, merge into `main` -7. Tag the release in `main` - -### Notes - -- No new features should be included in the release branches and any new features should be merged into `develop` for the next release cycle - -## Commit Messages - -Please follow the conventional commits specification: - -``` -<type>[optional scope]: <description> - -[optional body] - -[optional footer(s)] -``` - -### Types -- feat: new features -- fix: bugfixes -- docs: documentation-only changes -- style: changes that do not affect the meaning of the code -- refactor: code changes that neither fix bugs nor adds features -- perf: code changes that improve performance -- test: adding tests or correcting existing tests -- chore: changes to build process or tools/libraries (probably not needed) -- infra: changes to infrastructure configuration (e.g. Terraform) - -### Examples -``` -feat(extract): add automatic scheduling for data ingestion -docs: update README with project setup instructions -``` - -Configuration files for things such as Terraform isn't native to Conventional Commits, but we can add our own: - -``` -infra(tf): update S3 bucket policy -``` - -If the Terraform change involves a fix, you may combine `fix` and `infra`: - -``` -fix(infra): ... 
-``` -- cgit v1.2.3 From d25f05ba140cb85847ca604bef0e68b76a17ba62 Mon Sep 17 00:00:00 2001 From: Alex Schofield Date: Fri, 16 Aug 2024 10:34:50 +0100 Subject: docs: add draft summary section --- README.md | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 8ae0cb3..203482e 100644 --- a/README.md +++ b/README.md @@ -1 +1,14 @@ -# de-project-bentley \ No newline at end of file +# ToteSys - Data Engineering Project + +# Summary +The project aims to implement a data platform that can extract data from an +operational database, archive it in a data lake, and make it easily accessible +within a remodelled OLAP data warehouse. + +The solution showcases our skills in: + +- Python +- PostgreSQL +- Database modelling +- Amazon Web Services (AWS) +- Agile methodologies \ No newline at end of file -- cgit v1.2.3 From 9809e7ca1351d7b27f62b3c7c74db7124cab5dc9 Mon Sep 17 00:00:00 2001 From: Alex Schofield Date: Fri, 16 Aug 2024 10:40:00 +0100 Subject: docs: add draft main objective section --- README.md | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 203482e..e55cb16 100644 --- a/README.md +++ b/README.md @@ -11,4 +11,14 @@ The solution showcases our skills in: - PostgreSQL - Database modelling - Amazon Web Services (AWS) -- Agile methodologies \ No newline at end of file +- Agile methodologies + +# Main Objective + +Our goal is to create a reliable ETL (Extract, Transform, Load) pipeline that +can: + +1. Extract the data from the `totesys` operational database +2. Store the data in AWS S3 buckets, that will form our data lake +3. Transform the data into a suitable schema for the data warehouse +4. 
Load the data into the data warehouse hosted on AWS \ No newline at end of file -- cgit v1.2.3 From 37eb3bb7974904614867c7d0c2d4f6eccb39f22e Mon Sep 17 00:00:00 2001 From: Alex Schofield Date: Fri, 16 Aug 2024 10:41:01 +0100 Subject: docs(main_obj): clarify data being loaded into data warehouse --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index e55cb16..9c7baee 100644 --- a/README.md +++ b/README.md @@ -21,4 +21,4 @@ can: 1. Extract the data from the `totesys` operational database 2. Store the data in AWS S3 buckets, that will form our data lake 3. Transform the data into a suitable schema for the data warehouse -4. Load the data into the data warehouse hosted on AWS \ No newline at end of file +4. Load the transformed data into the data warehouse hosted on AWS \ No newline at end of file -- cgit v1.2.3 From 67a3caf058416718e9413520cb74be049af1e93e Mon Sep 17 00:00:00 2001 From: Alex Schofield Date: Fri, 16 Aug 2024 11:09:59 +0100 Subject: docs: add draft key features section --- README.md | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 9c7baee..0bf6b9d 100644 --- a/README.md +++ b/README.md @@ -21,4 +21,17 @@ can: 1. Extract the data from the `totesys` operational database 2. Store the data in AWS S3 buckets, that will form our data lake 3. Transform the data into a suitable schema for the data warehouse -4. Load the transformed data into the data warehouse hosted on AWS \ No newline at end of file +4. Load the transformed data into the data warehouse hosted on AWS + +# Key Features + +We aim for the project to have certain features. Some are more prioritised than +others. 
+ +- [ ] Automated data ingestion from `totesys` db +- [ ] Data storage for ingested and processed data in S3 buckets +- [ ] Data transformation for data warehouse schema +- [ ] Automated data loading into the data warehouse schema +- [ ] Logging and monitoring with CloudWatch +- [ ] Notifications for errors and successful runs (e.g. successful ingestion) +- [ ] Visualisation of warehouse data \ No newline at end of file -- cgit v1.2.3 From 9ad481989e7033735815df7c2fe7a277433587a6 Mon Sep 17 00:00:00 2001 From: Alex Date: Fri, 16 Aug 2024 12:36:46 +0100 Subject: ci: create .deepsource.toml for automated commit checking --- .deepsource.toml | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 .deepsource.toml diff --git a/.deepsource.toml b/.deepsource.toml new file mode 100644 index 0000000..a5002ab --- /dev/null +++ b/.deepsource.toml @@ -0,0 +1,25 @@ +version = 1 + +[[analyzers]] +name = "sql" + +[[analyzers]] +name = "terraform" + +[[analyzers]] +name = "python" + + [analyzers.meta] + runtime_version = "3.x.x" + +[[analyzers]] +name = "secrets" + +[[transformers]] +name = "black" + +[[transformers]] +name = "autopep8" + +[[transformers]] +name = "ruff" -- cgit v1.2.3 From ba82306f1646215f17b55099fd6342bca2a35c97 Mon Sep 17 00:00:00 2001 From: Alex Date: Fri, 16 Aug 2024 12:42:48 +0100 Subject: ci: update .deepsource.toml --- .deepsource.toml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.deepsource.toml b/.deepsource.toml index a5002ab..b435c8b 100644 --- a/.deepsource.toml +++ b/.deepsource.toml @@ -17,9 +17,14 @@ name = "secrets" [[transformers]] name = "black" +enabled = "true" [[transformers]] name = "autopep8" +enabled = "true" [[transformers]] name = "ruff" +enabled = "true" + + -- cgit v1.2.3 From b3aa2d97e4844b6c205fc9b85427f8d5f150388a Mon Sep 17 00:00:00 2001 From: Alex Date: Fri, 16 Aug 2024 12:48:06 +0100 Subject: fix(ci): fix .deepsource.toml config --- .deepsource.toml | 3 --- 1 file changed, 3 deletions(-) 
diff --git a/.deepsource.toml b/.deepsource.toml index b435c8b..a840b78 100644 --- a/.deepsource.toml +++ b/.deepsource.toml @@ -17,14 +17,11 @@ name = "secrets" [[transformers]] name = "black" -enabled = "true" [[transformers]] name = "autopep8" -enabled = "true" [[transformers]] name = "ruff" -enabled = "true" -- cgit v1.2.3 From a217da60ba75a226bf72a9fc680c4cbabe883aea Mon Sep 17 00:00:00 2001 From: Alex Schofield Date: Fri, 16 Aug 2024 12:53:22 +0100 Subject: docs: add empty sections --- README.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 0bf6b9d..6bc75dc 100644 --- a/README.md +++ b/README.md @@ -34,4 +34,10 @@ others. - [ ] Automated data loading into the data warehouse schema - [ ] Logging and monitoring with CloudWatch - [ ] Notifications for errors and successful runs (e.g. successful ingestion) -- [ ] Visualisation of warehouse data \ No newline at end of file +- [ ] Visualisation of warehouse data + +# Test Coverage +TBA + +# Contributors +TBA \ No newline at end of file -- cgit v1.2.3