diff options
| author | Alex <git@ajschof.me> | 2024-08-19 12:09:25 +0100 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2024-08-19 12:09:25 +0100 |
| commit | f28e4038d20b4630fafcae9a7825794e529bace2 (patch) | |
| tree | 0c378561e0dde843c0a281c692d137bb6bb0d0a7 | |
| parent | 5cc511d2afeea262db0db7039c8f83c123da77ea (diff) | |
| parent | 09b8b7903098a988a9a022d0ab607f8131c9c78f (diff) | |
| download | de-project-bentley-f28e4038d20b4630fafcae9a7825794e529bace2.tar.gz de-project-bentley-f28e4038d20b4630fafcae9a7825794e529bace2.zip | |
Merge branch 'development' into feature/test-extract-lambda
| -rw-r--r-- | .deepsource.toml | 27 | ||||
| -rw-r--r-- | .github/workflows/deploy.yml | 13 | ||||
| -rw-r--r-- | .github/workflows/on-commit.yml | 50 | ||||
| -rw-r--r-- | .gitignore | 1 | ||||
| -rw-r--r-- | DEVNOTES.md | 100 | ||||
| -rw-r--r-- | Makefile | 80 | ||||
| -rw-r--r-- | README.md | 52 | ||||
| -rw-r--r-- | src/extract_lambda.py | 196 | ||||
| -rw-r--r-- | src/load_lambda.py | 2 | ||||
| -rw-r--r-- | src/secrets_manager.py | 31 | ||||
| -rw-r--r-- | src/transform_lambda.py | 2 | ||||
| -rw-r--r-- | terraform/lambda.tf | 78 | ||||
| -rw-r--r-- | terraform/main.tf | 8 | ||||
| -rw-r--r-- | terraform/rds.tf | 128 | ||||
| -rw-r--r-- | terraform/s3.tf | 5 | ||||
| -rw-r--r-- | test/test_secrets_manager.py | 19 | ||||
| -rw-r--r-- | tests/test_extract_lambda.py | 6 | ||||
| -rw-r--r-- | tests/test_secrets_manager.py | 84 |
18 files changed, 531 insertions, 351 deletions
diff --git a/.deepsource.toml b/.deepsource.toml new file mode 100644 index 0000000..a840b78 --- /dev/null +++ b/.deepsource.toml @@ -0,0 +1,27 @@ +version = 1 + +[[analyzers]] +name = "sql" + +[[analyzers]] +name = "terraform" + +[[analyzers]] +name = "python" + + [analyzers.meta] + runtime_version = "3.x.x" + +[[analyzers]] +name = "secrets" + +[[transformers]] +name = "black" + +[[transformers]] +name = "autopep8" + +[[transformers]] +name = "ruff" + + diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index 372d0b3..5672048 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -1,19 +1,24 @@ name: deploy-terraform on: + pull_request: + branches: + - main push: branches: - - test-ci/** # Adjust the branch based on our deployment strategy + - main + jobs: deploy-terraform: name: Deploy Terraform runs-on: ubuntu-latest - environment: test-env + #needs: run-checks (must ref on-commit.yml file) + environment: production steps: - name: Checkout Repo uses: actions/checkout@v4 - + - name: Install Terraform uses: hashicorp/setup-terraform@v3 @@ -34,4 +39,4 @@ jobs: - name: Terraform Apply working-directory: terraform - run: terraform apply --auto-approve
\ No newline at end of file + run: terraform apply --auto-approve diff --git a/.github/workflows/on-commit.yml b/.github/workflows/on-commit.yml deleted file mode 100644 index fd9ffb8..0000000 --- a/.github/workflows/on-commit.yml +++ /dev/null @@ -1,50 +0,0 @@ -name: commit-qc-checks - -on: - push: - branches-ignore: - - 'main' - -jobs: - python-quality-checks: - runs-on: ubuntu-latest - steps: - - uses : actions/checkout@v4 - - name : 'Python: Setup' - uses : actions/setup-python@v5 - with: - python-version: 3.11 - - name : 'Python: Install Dependencies' - run: | - python -m pip install --upgrade pip - pip install flake8 pylint black bandit safety - continue-on-error: true - - name : 'Python: Linting' - run: | - flake8 . - find . -name "*.py" | xargs pylint - continue-on-error: true - - name : 'Python: Formatting' - run: | - black --check . - continue-on-error: true - terraform-quality-checks: - runs-on: ubuntu-latest - steps: - - uses : actions/checkout@v4 - - name: 'Terraform: Setup' - uses: hashicorp/setup-terraform@v3 - with: - terraform_version: latest - - name: 'Terraform: Formatting' - working-directory: terraform - run: terraform fmt -check -recursive - continue-on-error: true - - name: 'Terraform: Initialise' - working-directory: terraform - run: terraform init -backend=false - continue-on-error: true - - name: 'Terraform: Validate' - working-directory: terraform - run: terraform validate - continue-on-error: true
\ No newline at end of file @@ -14,3 +14,4 @@ __pycache__/ # OS-Related Files .DS_Store +venv
\ No newline at end of file diff --git a/DEVNOTES.md b/DEVNOTES.md deleted file mode 100644 index 00b4ddd..0000000 --- a/DEVNOTES.md +++ /dev/null @@ -1,100 +0,0 @@ -# Workflow - -## References - -https://nvie.com/posts/a-successful-git-branching-model/ \ -https://learn.microsoft.com/en-us/azure/devops/repos/git/merging-with-squash?view=azure-devops - - -## Branching - -*Based off GitFlow but slightly modified* - -- There are two main branches - - `main` - production-ready code - - `development` - integration branch for features - - `staging` - represents the current staging state -- In addition, there are additional branches - - Feature branches - for new features and non-urgent bugfixes - - Hotfix branches - probably won't be used but for critical bugs in production (this is what testing should prevent) - - Release branches - for preparation of production releases - -- Feature branches - e.g. `feature/short-description` -- Bugfix branches - e.g. `bugfix/short-description` -- Hotfix branches - e.g. `hotfix/short-description` -- Release branches - e.g. `release/vX.Y.Z` - -### Examples -``` -feature/add-data-extractor -bugfix/fix-s3-upload-error -hotfix/security-patch -release/v1.0.0 -``` - -## Environments - -1. Development - where active development and initial testing occur -2. Staging - for integration testing and final checks before production -3. Production - live and stable environment - -## Deployment - -1. `main` - represents the current production state -2. `develop` - represents the integration branch for features and non-urgent fixes -3. `staging` - represents the current staging state - -## Staging Flow - -1. Create feature branches from `develop` & merge completed features back into `develop` -2. When the `develop` branch is ready for testing, create a `staging` branch from `develop` -3. Deploy the `staging` branch to the staging environment and perform our unit-tests -4. If staging tests pass, create a `release/vX.Y.Z` branch from `staging` -5. Make any final adjustments in the `release/vX.Y.Z` branch -6. Once we have approved the changes in the `release/vX.Y.Z` branch, merge into `main` -7. Tag the release in `main` - -### Notes - -- No new features should be included in the release branches and any new features should be merged into `develop` for the next release cycle - -## Commit Messages - -Please follow the conventional commits specification: - -``` -<type>[optional scope]: <description> - -<optional body> - -[optional footer(s)] -``` - -### Types -- feat: new features -- fix: bugfixes -- docs: documentation-only changes -- style: changes that do not affect the meaning of the code -- refactor: code changes that neither fix bugs nor adds features -- perf: code changes that improve performance -- test: adding tests or correcting existing tests -- chore: changes to build process or tools/libraries (probably not needed) -- infra: changes to infrastructure configuration (e.g. Terraform) - -### Examples -``` -feat(extract): add automatic scheduling for data ingestion -docs: update README with project setup instructions -``` - -Configuration files for things such as Terraform isn't native to Conventional Commits, but we can add our own: - -``` -infra(tf): update S3 bucket policy -``` - -If the Terraform change involves a fix, you may combine `fix` and `infra`: - -``` -fix(infra): ... -``` diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..077cd98 --- /dev/null +++ b/Makefile @@ -0,0 +1,80 @@ +############################################## +# # +# MAKEFILE TO BUILD THE PROJECT # +# # +############################################## + +PROJECT_NAME = de-project-bentley +REGION = eu-west-2 +PYTHON_INTERPRETER = python +WD=$(shell pwd) +PYTHONPATH=${WD} +SHELL := /bin/bash +PROFILE = default +PIP:=pip + +## PYTHON INTERPRETER ENVIRONMENT +create-environment: + @echo ">>> About to create environment: $(PROJECT_NAME)..." + @echo ">>> check python3 version" + ( \ + $(PYTHON_INTERPRETER) --version; \ + ) + @echo ">>> Setting up VirtualEnv." + ( \ + $(PIP) install -q virtualenv virtualenvwrapper; \ + virtualenv venv --python=$(PYTHON_INTERPRETER); \ + ) + +ACTIVATE_ENV := source venv/bin/activate + +# Execute python related functionalities from within the project's environment +define execute_in_env + $(ACTIVATE_ENV) && $1 +endef + +## Build the environment requirements +requirements: create-environment + $(call execute_in_env, $(PIP) install -r ./requirements.txt) + +# Set Up +## Install bandit +bandit: + $(call execute_in_env, $(PIP) install bandit) + +## Install safety +safety: + $(call execute_in_env, $(PIP) install safety) + +## Install black +black: + $(call execute_in_env, $(PIP) install black) + +## Install coverage +coverage: + $(call execute_in_env, $(PIP) install coverage) + +## Set up dev requirements (bandit, safety, black) +dev-setup: bandit safety black coverage + +# Build / Run + +## Run the security test (bandit + safety) +security-test: + $(call execute_in_env, safety check -r ./requirements.txt) + $(call execute_in_env, bandit -lll */*.py *c/*/*.py) + +## Run the black code check +run-black: + $(call execute_in_env, black ./src/*/*.py ./test/*/*.py) + +## Run the unit tests +unit-test: + $(call execute_in_env, PYTHONPATH=${PYTHONPATH} pytest -v) + +## Run the coverage check +check-coverage: + $(call execute_in_env, PYTHONPATH=${PYTHONPATH} pytest --cov=src test/) + +## Run all checks +run-checks: security-test run-black unit-test check-coverage @@ -1 +1,51 @@ -# de-project-bentley
\ No newline at end of file +# ToteSys - Data Engineering Project + +[](https://www.python.org/) +[](https://aws.amazon.com/) +[](https://www.terraform.io/) +[](https://www.postgresql.org/) +[](https://github.com/features/actions) + +[](https://github.com/ajschofield/de-project-bentley/actions/workflows/deploy.yml?query=branch%3Amain) +[](https://github.com/ajschofield/de-project-bentley/deployments/production) +# Summary +The project aims to implement a data platform that can extract data from an +operational database, archive it in a data lake, and make it easily accessible +within a remodelled OLAP data warehouse. + +The solution showcases our skills in: + +- Python +- PostgreSQL +- Database modelling +- Amazon Web Services (AWS) +- Agile methodologies + +# Main Objective + +Our goal is to create a reliable ETL (Extract, Transform, Load) pipeline that +can: + +1. Extract the data from the `totesys` operational database +2. Store the data in AWS S3 buckets, that will form our data lake +3. Transform the data into a suitable schema for the data warehouse +4. Load the transformed data into the data warehouse hosted on AWS + +# Key Features + +We aim for the project to have certain features. Some are more prioritised than +others. + +- [ ] Automated data ingestion from `totesys` db +- [ ] Data storage for ingested and processed data in S3 buckets +- [ ] Data transformation for data warehouse schema +- [ ] Automated data loading into the data warehouse schema +- [ ] Logging and monitoring with CloudWatch +- [ ] Notifications for errors and successful runs (e.g. successful ingestion) +- [ ] Visualisation of warehouse data + +# Test Coverage +TBA + +# Contributors +TBA
\ No newline at end of file diff --git a/src/extract_lambda.py b/src/extract_lambda.py index fb2d7e8..4168e27 100644 --- a/src/extract_lambda.py +++ b/src/extract_lambda.py @@ -1,5 +1,4 @@ -from pg8000.native import Connection, DatabaseError, InterfaceError -from dotenv import dotenv_values +from pg8000.native import Connection, InterfaceError, identifier import boto3 import csv from botocore.exceptions import ClientError @@ -12,6 +11,8 @@ import re logger = logging.getLogger() logger.setLevel(logging.INFO) +# DB Exception class + class DBConnectionException(Exception): """Wraps pg8000.native Error or DatabaseError.""" @@ -21,136 +22,167 @@ class DBConnectionException(Exception): self.message = str(e) super().__init__(self.message) + def lambda_handler(event, context): """This lambda function connects to the Totesys database, lists the contents of the ingestion bucket, - and converts all tables to CSV and if any of those tables do not exist in, or are different to the ones in s3, it uploads them - it uses 3 helper functions to achieve these 3 functionalities + and converts all tables to CSV and if any of those tables do not exist in, or are different to the ones in s3, it uploads them + it uses 3 helper functions to achieve these 3 functionalities """ try: db = connect_to_database() existing_files = list_existing_s3_files() any_changes = process_and_upload_tables(db, existing_files) - - if not any_changes: + + if not any_changes["updated"]: logger.info("No changes detected in the database.") return { - 'statusCode': 200, - 'body': json.dumps('No changes detected, no CSV files were uploaded.') + "statusCode": 200, + "body": json.dumps("No changes detected, no CSV files were uploaded."), } else: return { - 'statusCode': 200, - 'body': json.dumps('CSV files processed and uploaded successfully.') + "statusCode": 200, + "body": json.dumps( + f"""CSV files processed for {', '.join(any_changes['updated'])} and uploaded successfully.{ + 'The following tables were not updated: '+', '.join(any_changes['no change']) if any_changes['no change'] else ''}""" + ), } - except Exception as e: - logger.error(f'Error: {e}') - return { - 'statusCode': 500, - 'body': json.dumps('Internal server error.') - } - + logger.error(f"Error: {e}") + return {"statusCode": 500, "body": json.dumps("Internal server error.")} finally: - if db: db.close() -def get_config(path: str = ".env") -> dict: - return dotenv_values(path) + +def retrieve_secrets( + sm_client=boto3.client("secretsmanager"), secret_name="bentley-secrets" +): + try: + response = sm_client.get_secret_value(SecretId=secret_name) + if "SecretString" in response: + secret = json.loads(response["SecretString"]) + return secret + except ClientError as e: + logger.error(f"Could not retrieve secrets: {e}") + raise e def connect_to_database() -> Connection: try: - config = get_config() - host = config["host"] - port = config["port"] - user = config["user"] - password = config["password"] - database = config["database"] + secrets = retrieve_secrets() + host = secrets["host"] + port = secrets["port"] + user = secrets["user"] + password = secrets["password"] + database = secrets["database"] return Connection( - database=database, - user=user, - password=password, - host=host, - port=port + database=database, user=user, password=password, host=host, port=port ) except InterfaceError as i: - logger.error(f'Interface error: {i}') + logger.error(f"Interface error: {i}") raise DBConnectionException("Failed to connect to database") +def extract_bucket(client=boto3.client("s3")): + response = client.list_buckets() + extract_bucket_filter = [ + bucket["Name"] for bucket in response["Buckets"] if "extract" in bucket["Name"] + ] + return extract_bucket_filter[0] -def list_existing_s3_files(bucket_name='extract_bucket', client=boto3.client('s3')): - """Creates a dictionary and populates it with the - results of listing the contents of the s3 bucket, then - returns the populated dictionary + +def list_existing_s3_files(bucket_name=extract_bucket(), client=boto3.client("s3")): + """Creates a dictionary and populates it with the + results of listing the contents of the s3 bucket, then + returns the populated dictionary """ - + existing_files = {} - + try: - response = client.list_objects_v2(Bucket='extract_bucket') - - if 'Contents' in response: - for obj in response['Contents']: - s3_key = obj['Key'] + response = client.list_objects_v2(Bucket=bucket_name) + + if "Contents" in response: + for obj in response["Contents"]: + s3_key = obj["Key"] try: file_obj = client.get_object(Bucket=bucket_name, Key=s3_key) - file_content = file_obj['Body'].read().decode('utf-8') + file_content = file_obj["Body"].read().decode("utf-8") existing_files[s3_key] = file_content except ClientError as e: - logger.error(f'Error retrieving S3 object {s3_key}: {e}') + logger.error(f"Error retrieving S3 object {s3_key}: {e}") else: - logger.error('The bucket is empty') - + logger.error("The bucket is empty") + except ClientError as e: - logger.error(f'Error listing S3 objects: {e}') - - return existing_files + logger.error(f"Error listing S3 objects: {e}") + return existing_files -def process_and_upload_tables(db, existing_files, client=boto3.client('s3')): - """Creates a list of the tables from a database query and - then selects everything from each table in individual queries - it then writes each table to CSV files and compares with the item - in the existing_files dictionary with the same name. If it finds any changes - to files, or new tables/files it uploads them to the s3 bucket +def process_and_upload_tables(db, existing_files, client=boto3.client("s3")): + """Creates a list of the tables from a database query and + then selects everything from each table in individual queries + it then writes each table to CSV files and compares with the item + in the existing_files dictionary with the same name. If it finds any changes + to files, or new tables/files it uploads them to the s3 bucket """ - ## NEW CODE + load_status = {"updated": [], "no change": []} + # Retrieving the latest file timestamp from S3 extract bucket all_datetimes = [] for file_names in existing_files.keys(): - datetime_str_on_s3 = ''.join(re.search(r'\/(.+/).+_(.+)\.csv',file_names).group(1,2)) - all_datetimes.append(datetime.strptime(datetime_str_on_s3, '%Y/%m/%d/%H:%M:%S')) + datetime_str_on_s3 = "".join( + re.search(r"\/(.+/).+_(.+)\.csv", file_names).group(1, 2) + ) + all_datetimes.append(datetime.strptime(datetime_str_on_s3, "%Y/%m/%d/%H:%M:%S")) latest_timestamp = max(all_datetimes) - ## END OF NEW CODE - tables = db.run("SELECT table_name FROM information_schema.tables WHERE table_schema='public' AND table_type='BASE TABLE';") - print(tables) + # Iterating through tables on the database and retrieving only latest changes vs previous file load + tables = db.run( + """ + SELECT table_name + FROM information_schema.tables + WHERE table_schema='public' AND table_type='BASE TABLE';""" + ) for table in tables: table_name = table[0] - rows = db.run(f"SELECT * FROM {table_name};") - - - csv_file_path = f"/tmp/{table_name}.csv" - with open(csv_file_path, "w", newline='') as file: - writer = csv.writer(file) - #column_names = [desc["name"] for desc in db.columns(f"SELECT * FROM {table_name};")] - column_names = [col_name[0] for col_name in db.run(f"SELECT column_name FROM INFORMATION_SCHEMA.COLUMNS where table_name = '{table_name}';")] - writer.writerow(column_names) - writer.writerows(rows) - s3_key = datetime.strftime(datetime.today(),f'{table_name}/%Y/%m/%d/{table_name}_%H:%M:%S.csv') - new_csv_content = open(csv_file_path, "r").read() - ## NEW CODE - latest_s3_object_key = datetime.strftime(latest_timestamp,f'{table_name}/%Y/%m/%d/{table_name}_%H:%M:%S.csv') - ## END OF NEW CODE - if existing_files[latest_s3_object_key] != new_csv_content: + rows = db.run( + f"SELECT * FROM {identifier(table_name)} " "WHERE last_updated >= :latest;", + latest={datetime.strftime(latest_timestamp, "%H-%m-%d %H:%M:%S")}, + ) + + # Creating a temporary file path and writing the column name to it followed by each row of data + if rows: + csv_file_path = f"/tmp/{table_name}.csv" + with open(csv_file_path, "w", newline="") as file: + writer = csv.writer(file) + # column_names = [desc["name"] for desc in db.columns(f"SELECT * FROM {table_name};")] + column_names = [ + col_name[0] + for col_name in db.run( + """SELECT column_name FROM INFORMATION_SCHEMA.COLUMNS + WHERE table_name = :table ;""", + table=table_name, + ) + ] + writer.writerow(column_names) + writer.writerows(rows) + s3_key = datetime.strftime( + datetime.today(), f"{table_name}/%Y/%m/%d/{table_name}_%H:%M:%S.csv" + ) + + # Writing the new file to S3 extract bucket: try: - client.upload_file(csv_file_path, 'extract_bucket', s3_key) + client.upload_file(csv_file_path, extract_bucket(), s3_key) + load_status["updated"].append(table_name) logger.info(f"Uploaded {s3_key} to S3.") except ClientError as e: - logger.error(f'Error uploading to S3: {e}') + logger.error(f"Error uploading to S3: {e}") else: - logger.info(f"No new data.") -
\ No newline at end of file + load_status["no change"].append(table_name) + logger.info( + f"No new data in {table_name} name. Latest data retrieved is from {latest_timestamp}." + ) + return load_status diff --git a/src/load_lambda.py b/src/load_lambda.py index 6ee681f..c6a8e60 100644 --- a/src/load_lambda.py +++ b/src/load_lambda.py @@ -1,2 +1,2 @@ def lambda_handler(): - pass
\ No newline at end of file + pass diff --git a/src/secrets_manager.py b/src/secrets_manager.py index c0fb61e..3484688 100644 --- a/src/secrets_manager.py +++ b/src/secrets_manager.py @@ -4,45 +4,46 @@ import json def sm_client(): - sm_client = boto3.client('secretsmanager') + sm_client = boto3.client("secretsmanager") yield sm_client -def create_secret(sm_client, secret_name, cohort_id, user, password, host, database, port): + +def create_secret( + sm_client, secret_name, cohort_id, user, password, host, database, port +): secret = { "cohort_id": cohort_id, "user": user, "password": password, "host": host, "database": database, - "port": port + "port": port, } response = sm_client.create_secret( - Name = secret_name, - SecretString = json.dumps(secret) + Name=secret_name, SecretString=json.dumps(secret) ) print(response) return response + def list_secret(sm_client): response = sm_client.list_secrets() - secret_dict = response['SecretList'] + secret_dict = response["SecretList"] secret_names = [] for items in secret_dict: - secret_names.append(items['Name']) - print(f'{len(secret_names)} secret(s) available') + secret_names.append(items["Name"]) + print(f"{len(secret_names)} secret(s) available") for name in secret_names: print(name) return secret_names -def retrieve_secrets(sm_client): - response = sm_client.get_secrets( - - ) +def retrieve_secrets(sm_client): + response = sm_client.get_secrets() -#retrieve secret -#so lambda can access totesy db -#so lambda connect to the db and then retrieve the data
\ No newline at end of file +# retrieve secret +# so lambda can access totesy db +# so lambda connect to the db and then retrieve the data diff --git a/src/transform_lambda.py b/src/transform_lambda.py index 6ee681f..c6a8e60 100644 --- a/src/transform_lambda.py +++ b/src/transform_lambda.py @@ -1,2 +1,2 @@ def lambda_handler(): - pass
\ No newline at end of file + pass diff --git a/terraform/lambda.tf b/terraform/lambda.tf index 72d1306..e33bc79 100644 --- a/terraform/lambda.tf +++ b/terraform/lambda.tf @@ -12,12 +12,14 @@ resource "aws_s3_object" "extract_lambda_code" { } resource "aws_lambda_function" "extract_lambda" { - function_name = var.extract_lambda_name - s3_bucket = aws_s3_bucket.lambda_code_bucket.bucket - s3_key = aws_s3_object.extract_lambda_code.key - role = aws_iam_role.multi_service_role.arn - handler = "extract_lambda.extract" - runtime = "python3.11" + function_name = var.extract_lambda_name + s3_bucket = aws_s3_bucket.lambda_code_bucket.bucket + s3_key = aws_s3_object.extract_lambda_code.key + layers = [aws_lambda_layer_version.lambda_layer.arn] + role = aws_iam_role.multi_service_role.arn + handler = "extract_lambda.lambda_handler" + runtime = "python3.11" + source_code_hash = data.archive_file.extract_lambda_zip.output_base64sha256 lifecycle { create_before_destroy = true @@ -40,12 +42,14 @@ resource "aws_s3_object" "transform_lambda_code" { } resource "aws_lambda_function" "transform_lambda" { - function_name = var.transform_lambda_name - s3_bucket = aws_s3_bucket.lambda_code_bucket.bucket - s3_key = aws_s3_object.transform_lambda_code.key - role = aws_iam_role.multi_service_role.arn - handler = "transform_lambda.transform" - runtime = "python3.11" + function_name = var.transform_lambda_name + s3_bucket = aws_s3_bucket.lambda_code_bucket.bucket + s3_key = aws_s3_object.transform_lambda_code.key + layers = [aws_lambda_layer_version.lambda_layer.arn] + role = aws_iam_role.multi_service_role.arn + handler = "transform_lambda.lambda_handler" + runtime = "python3.11" + source_code_hash = data.archive_file.transform_lambda_zip.output_base64sha256 lifecycle { create_before_destroy = true @@ -68,12 +72,14 @@ resource "aws_s3_object" "load_lambda_code" { } resource "aws_lambda_function" "load_lambda" { - function_name = var.load_lambda_name - s3_bucket = aws_s3_bucket.lambda_code_bucket.bucket - s3_key = aws_s3_object.load_lambda_code.key - role = aws_iam_role.multi_service_role.arn - handler = "load_lambda.load" - runtime = "python3.11" + function_name = var.load_lambda_name + s3_bucket = aws_s3_bucket.lambda_code_bucket.bucket + s3_key = aws_s3_object.load_lambda_code.key + layers = [aws_lambda_layer_version.lambda_layer.arn] + role = aws_iam_role.multi_service_role.arn + handler = "load_lambda.lambda_handler" + runtime = "python3.11" + source_code_hash = data.archive_file.load_lambda_zip.output_base64sha256 lifecycle { create_before_destroy = true @@ -81,3 +87,39 @@ resource "aws_lambda_function" "load_lambda" { depends_on = [aws_s3_object.load_lambda_code] } + +# Lambda Layer Specification +locals { + layer_dir = "lambda_layer" + requirements = "requirements.txt" + layer_zip = "layer.zip" + layer_name = "lambda_layer_dev" +} + +resource "null_resource" "prepare_layer" { + provisioner "local-exec" { + command = <<EOT + cd ${local.layer_dir} + rm -rf python + mkdir python + pip3 install -r ${local.requirements} -t python/ + zip -r ${local.layer_zip} python + EOT + } #removed / at the end of python in line 99 +} + +resource "aws_s3_object" "lambda_layer_zip" { + bucket = aws_s3_bucket.lambda_code_bucket.id #bucket instead of id + key = "lambda_layer/${local.layer_name}/${local.layer_zip}" + source = "${local.layer_dir}/${local.layer_zip}" + depends_on = [null_resource.prepare_layer] +} + +resource "aws_lambda_layer_version" "lambda_layer" { + layer_name = local.layer_name + compatible_runtimes = ["python3.11"] + s3_bucket = aws_s3_bucket.lambda_layer_bucket.id #bucket instead of id + s3_key = aws_s3_object.lambda_layer_zip.key + skip_destroy = true + depends_on = [aws_s3_object.lambda_layer_zip] +} diff --git a/terraform/main.tf b/terraform/main.tf index 3b06701..310a251 100644 --- a/terraform/main.tf +++ b/terraform/main.tf @@ -4,6 +4,14 @@ terraform { source = "hashicorp/aws" version = "~>5.0" } + null = { + source = "hashicorp/null" + version = "~>3.2.2" + } + archive = { + source = "hashicorp/archive" + version = "~>2.5.0" + } } backend "s3" { bucket = "bentley-project-secrets" diff --git a/terraform/rds.tf b/terraform/rds.tf index 88783b7..a013fb3 100644 --- a/terraform/rds.tf +++ b/terraform/rds.tf @@ -1,80 +1,70 @@ -data "aws_availability_zones" "available" {} +# data "aws_availability_zones" "available" {} -module "vpc" { - source = "terraform-aws-modules/vpc/aws" - version = "5.12.1" +# module "vpc" { +# source = "terraform-aws-modules/vpc/aws" +# version = "5.12.1" - name = var.project_name - cidr = "10.0.0.0/16" - azs = data.aws_availability_zones.available.names - public_subnets = ["10.0.4.0/24", "10.0.5.0/24", "10.0.6.0/24"] - enable_dns_hostnames = true - enable_dns_support = true -} +# name = var.project_name +# cidr = "10.0.0.0/16" +# azs = data.aws_availability_zones.available.names +# public_subnets = ["10.0.4.0/24", "10.0.5.0/24", "10.0.6.0/24"] +# enable_dns_hostnames = true +# enable_dns_support = true +# } -resource "aws_db_subnet_group" "Terrific-Totes-sub-gr" { - name = "tt-db-subnet" - subnet_ids = module.vpc.public_subnets +# resource "aws_db_subnet_group" "Terrific-Totes-sub-gr" { +# name = "tt-db-subnet" +# subnet_ids = module.vpc.public_subnets - tags = { - Name = "${var.project_name}" - } -} +# tags = { +# Name = "${var.project_name}" +# } +# } -resource "aws_security_group" "rds" { - name = "${var.project_name}-rds" - vpc_id = module.vpc.vpc_id +# resource "aws_security_group" "rds" { +# name = "${var.project_name}-rds" +# vpc_id = module.vpc.vpc_id - ingress { - from_port = 5432 - to_port = 5432 - protocol = "tcp" - cidr_blocks = ["0.0.0.0/0"] - } +# ingress { +# from_port = 5432 +# to_port = 5432 +# protocol = "tcp" +# cidr_blocks = ["0.0.0.0/0"] +# } - egress { - from_port = 5432 - to_port = 5432 - protocol = "tcp" - cidr_blocks = ["0.0.0.0/0"] - } +# egress { +# from_port = 5432 +# to_port = 5432 +# protocol = "tcp" +# cidr_blocks = ["0.0.0.0/0"] +# } - tags = { - Name = "${var.project_name}-rds" - } -} +# tags = { +# Name = "${var.project_name}-rds" +# } +# } -resource "aws_db_parameter_group" "Terrific-Totes-param-gr" { - name = "tt-db-param" - family = "postgres14" +# resource "aws_db_parameter_group" "Terrific-Totes-param-gr" { +# name = "tt-db-param" +# family = "postgres14" - parameter { - name = "log_connections" - value = "1" - } -} +# parameter { +# name = "log_connections" +# value = "1" +# } +# } -resource "aws_db_instance" "terrific-totes-rds" { - db_name = var.project_name - instance_class = "db.t3.micro" - allocated_storage = 5 - engine = "postgres" - engine_version = "14.10" - username = "totes" - password = "totes123" - # username = "user credentials for the root user" # we could use .env here - # password = "user password for the root user" # we could use .env here - ### alternatively to providing username nad password we can specify: - # resource "aws_kms_key" "example_key" { - # description = "Example KMS Key" - # } - # within the resource: - # manage_master_user_password = true - # master_user_secret_kms_key_id = aws_kms_key.example.key_id - # } - db_subnet_group_name = aws_db_subnet_group.Terrific-Totes-sub-gr.name - vpc_security_group_ids = [aws_security_group.rds.id] - parameter_group_name = aws_db_parameter_group.Terrific-Totes-param-gr.name - publicly_accessible = false - skip_final_snapshot = true -} +# resource "aws_db_instance" "terrific-totes-rds" { +# db_name = var.project_name +# instance_class = "db.t3.micro" +# allocated_storage = 5 +# engine = "postgres" +# engine_version = "14.10" +# username = "" +# password = "" +# db_subnet_group_name = aws_db_subnet_group.Terrific-Totes-sub-gr.name +# vpc_security_group_ids = [aws_security_group.rds.id] +# parameter_group_name = aws_db_parameter_group.Terrific-Totes-param-gr.name +# publicly_accessible = false +# skip_final_snapshot = true +# } diff --git a/terraform/s3.tf b/terraform/s3.tf index d5cdee3..b3a863c 100644 --- a/terraform/s3.tf +++ b/terraform/s3.tf @@ -12,3 +12,8 @@ resource "aws_s3_bucket" "transform_bucket" { resource "aws_s3_bucket" "lambda_code_bucket" { bucket_prefix = "${var.s3_code_bucket_name}-" } + +### LAMBDA LAYER BUCKET +resource "aws_s3_bucket" "lambda_layer_bucket" { + bucket_prefix = "lambda-layer-dev-" +}
\ No newline at end of file diff --git a/test/test_secrets_manager.py b/test/test_secrets_manager.py index 86533bc..cb4ec15 100644 --- a/test/test_secrets_manager.py +++ b/test/test_secrets_manager.py @@ -2,10 +2,12 @@ from src.secrets_manager import sm_client, create_secret, list_secret import boto3 from moto import mock_aws import json -import pytest +import pytest import os -pytest.fixture(scope='class') +pytest.fixture(scope="class") + + def mock_aws_credentials(): """Mocked AWS Credentials for moto.""" os.environ["AWS_ACCESS_KEY_ID"] = "testing" @@ -14,10 +16,11 @@ def mock_aws_credentials(): os.environ["AWS_SESSION_TOKEN"] = "testing" os.environ["AWS_DEFAULT_REGION"] = "eu-west-2" -@pytest.fixture(scope='class') + +@pytest.fixture(scope="class") def mock_sm_client(mock_aws_credentials): with mock_aws(): - yield boto3.client('secretsmanager') + yield boto3.client("secretsmanager") def test_create_secret_stores_secrets(mock_sm_client): @@ -29,6 +32,8 @@ def test_create_secret_stores_secrets(mock_sm_client): port = "test_port" secret_name = "test_secret" - response = create_secret(mock_sm_client, secret_name, cohort_id, user, password, host, database, port) - - assert response['Name'] == secret_name
\ No newline at end of file + response = create_secret( + mock_sm_client, secret_name, cohort_id, user, password, host, database, port + ) + + assert response["Name"] == secret_name diff --git a/tests/test_extract_lambda.py b/tests/test_extract_lambda.py index 67cb6d3..7707cbf 100644 --- a/tests/test_extract_lambda.py +++ b/tests/test_extract_lambda.py @@ -3,6 +3,9 @@ import boto3 from moto import mock_aws from unittest.mock import patch, MagicMock from unittest import TestCase +import os +import logging +import json from src.extract_lambda import ( list_existing_s3_files, connect_to_database, @@ -10,9 +13,6 @@ from src.extract_lambda import ( lambda_handler, process_and_upload_tables, ) -import os -import logging -import json @pytest.fixture(scope="class") diff --git a/tests/test_secrets_manager.py b/tests/test_secrets_manager.py new file mode 100644 index 0000000..609c572 --- /dev/null +++ b/tests/test_secrets_manager.py @@ -0,0 +1,84 @@ +from src.secrets_manager import sm_client, retrieve_secrets +import boto3 +import botocore.exceptions +from moto import mock_aws +import json +import pytest +import os + + +@pytest.fixture(scope="function") +def aws_credentials(): + """Mocked AWS Credentials for moto.""" + os.environ["AWS_ACCESS_KEY_ID"] = "testing" + os.environ["AWS_SECRET_ACCESS_KEY"] = "testing" + os.environ["AWS_SECURITY_TOKEN"] = "testing" + os.environ["AWS_SESSION_TOKEN"] = "testing" + os.environ["AWS_DEFAULT_REGION"] = "eu-west-2" + + +@pytest.fixture(scope="function") +def mock_sm_client(aws_credentials): + with mock_aws(): + yield boto3.client("secretsmanager") + + +@pytest.fixture(scope="function") +def mock_store_secret(mock_sm_client): + secret = { + "cohort_id": "test_cohort_id", + "user": "test_user_id", + "password": "test_password", + "host": "test_host", + "database": "test_database", + "port": "test_port", + } + + secret_name = "test_secret" + + response = mock_sm_client.create_secret( + Name=secret_name, SecretString=json.dumps(secret) + ) + + return response + + +def test_retrieves_secrets_returns_dictionary(mock_sm_client, mock_store_secret): + secret_name = "test_secret" + + result = retrieve_secrets(mock_sm_client, secret_name) + + assert isinstance(result, dict) + + +def test_retrieves_secrets_returns_correct_keys_and_values( + mock_sm_client, mock_store_secret +): + secret_name = "test_secret" + + result = retrieve_secrets(mock_sm_client, secret_name) + + assert result["cohort_id"] == "test_cohort_id" + assert result["user"] == "test_user_id" + assert result["password"] == "test_password" + assert result["host"] == "test_host" + assert result["database"] == "test_database" + assert result["port"] == "test_port" + + +def test_retrieves_secrets_raises_error_if_secret_name_incorrect_data_type( + mock_sm_client, +): + secret_name = [1, 2, 3] + + with pytest.raises(botocore.exceptions.ParamValidationError) as error: + retrieve_secrets(mock_sm_client, secret_name) + + +def test_retrieves_secrets_raises_error_if_secret_name_does_not_exist( + mock_sm_client, mock_store_secret +): + secret_name = "test_secret_2" + + with pytest.raises(botocore.exceptions.ClientError) as error: + retrieve_secrets(mock_sm_client, secret_name) |
