diff options
| -rw-r--r-- | .github/workflows/deploy.yml | 37 | ||||
| -rw-r--r-- | .github/workflows/on-commit.yml | 50 | ||||
| -rw-r--r-- | .github/workflows/python.yml | 50 | ||||
| -rw-r--r-- | .github/workflows/terraform.yml | 37 | ||||
| -rw-r--r-- | .gitignore | 1 | ||||
| -rw-r--r-- | requirements.txt | 30 | ||||
| -rw-r--r-- | src/extract_lambda.py | 166 | ||||
| -rw-r--r-- | src/secrets_manager.py | 48 | ||||
| -rw-r--r-- | terraform/rds.tf | 42 | ||||
| -rw-r--r-- | terraform/vars.tf | 4 | ||||
| -rw-r--r-- | test.py | 0 | ||||
| -rw-r--r-- | test/test_secrets_manager.py | 34 | ||||
| -rw-r--r-- | tests/dummy.txt | 1 | ||||
| -rw-r--r-- | tests/dummy_identical.csv | 4 | ||||
| -rw-r--r-- | tests/test_extract_lambda.py | 109 |
15 files changed, 483 insertions, 130 deletions
diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml new file mode 100644 index 0000000..372d0b3 --- /dev/null +++ b/.github/workflows/deploy.yml @@ -0,0 +1,37 @@ +name: deploy-terraform + +on: + push: + branches: + - test-ci/** # Adjust the branch based on our deployment strategy + +jobs: + deploy-terraform: + name: Deploy Terraform + runs-on: ubuntu-latest + environment: test-env + steps: + - name: Checkout Repo + uses: actions/checkout@v4 + + - name: Install Terraform + uses: hashicorp/setup-terraform@v3 + + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + aws-region: ${{ secrets.AWS_REGION }} + + - name: Terraform Init + working-directory: terraform + run: terraform init + + - name: Terraform Plan + working-directory: terraform + run: terraform plan + + - name: Terraform Apply + working-directory: terraform + run: terraform apply --auto-approve
\ No newline at end of file diff --git a/.github/workflows/on-commit.yml b/.github/workflows/on-commit.yml new file mode 100644 index 0000000..fd9ffb8 --- /dev/null +++ b/.github/workflows/on-commit.yml @@ -0,0 +1,50 @@ +name: commit-qc-checks + +on: + push: + branches-ignore: + - 'main' + +jobs: + python-quality-checks: + runs-on: ubuntu-latest + steps: + - uses : actions/checkout@v4 + - name : 'Python: Setup' + uses : actions/setup-python@v5 + with: + python-version: 3.11 + - name : 'Python: Install Dependencies' + run: | + python -m pip install --upgrade pip + pip install flake8 pylint black bandit safety + continue-on-error: true + - name : 'Python: Linting' + run: | + flake8 . + find . -name "*.py" | xargs pylint + continue-on-error: true + - name : 'Python: Formatting' + run: | + black --check . + continue-on-error: true + terraform-quality-checks: + runs-on: ubuntu-latest + steps: + - uses : actions/checkout@v4 + - name: 'Terraform: Setup' + uses: hashicorp/setup-terraform@v3 + with: + terraform_version: latest + - name: 'Terraform: Formatting' + working-directory: terraform + run: terraform fmt -check -recursive + continue-on-error: true + - name: 'Terraform: Initialise' + working-directory: terraform + run: terraform init -backend=false + continue-on-error: true + - name: 'Terraform: Validate' + working-directory: terraform + run: terraform validate + continue-on-error: true
\ No newline at end of file diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml deleted file mode 100644 index 7d5b5b1..0000000 --- a/.github/workflows/python.yml +++ /dev/null @@ -1,50 +0,0 @@ -name: python-quality-checks - -on: - push: - branches: [development] - pull_request: - branches: [development, staging] - -jobs: - - check-if-py-files-exist: - runs-on: ubuntu-latest - outputs: - py_files_exist: ${{ steps.check.outputs.py_files_exist }} - steps: - - uses: actions/checkout@v2 - - id: check_files - run: | - if [ -n "$(find . -name '*.py')" ]; then - echo "::set-output name=py_files_exist::true" - else - echo "::set-output name=py_files_exist::false" - fi - - quality-checks: - needs: check-if-py-files-exist - if: ${{ needs.check-if-py-files-exist.outputs.py_files_exist == 'true' }} - runs-on: ubuntu-latest - steps: - - uses : actions/checkout@v2 - - name : Setup - uses : actions/setup-python@v2 - with: - python-version: 3.11 - - name : Dependencies - run: | - python -m pip install --upgrade pip - pip install flake8 pylint black bandit safety - - name : Linting - run: | - flake8 . - find . -name "*.py" | xargs pylint - - name : Formatting - run: | - black --check . - - name: Security - run: | - bandit -r . - safety check -
\ No newline at end of file diff --git a/.github/workflows/terraform.yml b/.github/workflows/terraform.yml deleted file mode 100644 index c349756..0000000 --- a/.github/workflows/terraform.yml +++ /dev/null @@ -1,37 +0,0 @@ -name: terraform-quality-checks - -on: - push: - branches: [development] - paths: - - 'terraform/**.tf' - - 'terraform/**.tfvars' - pull_request: - branches: [development, staging] - paths: - - 'terraform/**.tf' - - 'terraform/**.tfvars' -jobs: - terraform-validation: - runs-on: ubuntu-latest - defaults: - run: - working-directory: ./terraform - steps: - - uses: actions/checkout@v2 - - name: Setup Terraform - uses: hashicorp/setup-terraform@v1 - with: - terraform_version: latest # Using the latest version, but not sure if it's the best practice - - name: Format - run: terraform fmt -check -recursive - - name: Init - run: terraform init -backend=false - - name: Validate - run: terraform validate - - name: Setup TFLint - uses: terraform-linters/setup-tflint@v2 - with: - tflint_version: latest - - name: Run TFLint - run: tflint -f compact
\ No newline at end of file @@ -10,6 +10,7 @@ # Output Files *.zip log* +__pycache__/ # OS-Related Files .DS_Store diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..6f383f9 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,30 @@ +asn1crypto==1.5.1 +boto3==1.34.159 +botocore==1.34.159 +certifi==2024.7.4 +cffi==1.17.0 +charset-normalizer==3.3.2 +cryptography==43.0.0 +idna==3.7 +iniconfig==2.0.0 +Jinja2==3.1.4 +jmespath==1.0.1 +MarkupSafe==2.1.5 +moto==5.0.12 +packaging==24.1 +pg8000==1.31.2 +pluggy==1.5.0 +pycparser==2.22 +pytest==8.3.2 +pytest-mock==3.14.0 +python-dateutil==2.9.0.post0 +python-dotenv==1.0.1 +PyYAML==6.0.2 +requests==2.32.3 +responses==0.25.3 +s3transfer==0.10.2 +scramp==1.4.5 +six==1.16.0 +urllib3==2.2.2 +Werkzeug==3.0.3 +xmltodict==0.13.0
\ No newline at end of file diff --git a/src/extract_lambda.py b/src/extract_lambda.py index 7d56c66..fb2d7e8 100644 --- a/src/extract_lambda.py +++ b/src/extract_lambda.py @@ -1,32 +1,156 @@ -from pg8000.native import Connection, Error, DatabaseError, InterfaceError -from dotenv import load_dotenv -import os +from pg8000.native import Connection, DatabaseError, InterfaceError +from dotenv import dotenv_values +import boto3 +import csv +from botocore.exceptions import ClientError +import logging +import json +from datetime import datetime +import re -load_dotenv() -def extract(): +logger = logging.getLogger() +logger.setLevel(logging.INFO) -# temporary credentials for dev- will not have access when uploaded - database = os.getenv('database') - user = os.getenv('user') - password = os.getenv('password') - host = os.getenv('host') - port = os.getenv('port') +class DBConnectionException(Exception): + """Wraps pg8000.native Error or DatabaseError.""" + def __init__(self, e): + """Initialise with provided error message.""" + self.message = str(e) + super().__init__(self.message) +def lambda_handler(event, context): + """This lambda function connects to the Totesys database, lists the contents of the ingestion bucket, + and converts all tables to CSV and if any of those tables do not exist in, or are different to the ones in s3, it uploads them + it uses 3 helper functions to achieve these 3 functionalities + """ try: - db = Connection.run( - database=database, - user=user, - password=password, - host=host, - port=port + db = connect_to_database() + existing_files = list_existing_s3_files() + any_changes = process_and_upload_tables(db, existing_files) + + if not any_changes: + logger.info("No changes detected in the database.") + return { + 'statusCode': 200, + 'body': json.dumps('No changes detected, no CSV files were uploaded.') + } + else: + return { + 'statusCode': 200, + 'body': json.dumps('CSV files processed and uploaded successfully.') + } + + except Exception as e: + logger.error(f'Error: {e}') + return { + 'statusCode': 500, + 'body': json.dumps('Internal server error.') + } + + finally: + + if db: + db.close() + +def get_config(path: str = ".env") -> dict: + return dotenv_values(path) + + +def connect_to_database() -> Connection: + try: + config = get_config() + host = config["host"] + port = config["port"] + user = config["user"] + password = config["password"] + database = config["database"] + + return Connection( + database=database, + user=user, + password=password, + host=host, + port=port ) - except DatabaseError as e: - print(e) except InterfaceError as i: - print(i) + logger.error(f'Interface error: {i}') + raise DBConnectionException("Failed to connect to database") + + + +def list_existing_s3_files(bucket_name='extract_bucket', client=boto3.client('s3')): + """Creates a dictionary and populates it with the + results of listing the contents of the s3 bucket, then + returns the populated dictionary + """ + + existing_files = {} + + try: + response = client.list_objects_v2(Bucket='extract_bucket') + + if 'Contents' in response: + for obj in response['Contents']: + s3_key = obj['Key'] + try: + file_obj = client.get_object(Bucket=bucket_name, Key=s3_key) + file_content = file_obj['Body'].read().decode('utf-8') + existing_files[s3_key] = file_content + except ClientError as e: + logger.error(f'Error retrieving S3 object {s3_key}: {e}') + else: + logger.error('The bucket is empty') + + except ClientError as e: + logger.error(f'Error listing S3 objects: {e}') + + return existing_files + + + +def process_and_upload_tables(db, existing_files, client=boto3.client('s3')): + """Creates a list of the tables from a database query and + then selects everything from each table in individual queries + it then writes each table to CSV files and compares with the item + in the existing_files dictionary with the same name. If it finds any changes + to files, or new tables/files it uploads them to the s3 bucket + """ + ## NEW CODE + all_datetimes = [] + for file_names in existing_files.keys(): + datetime_str_on_s3 = ''.join(re.search(r'\/(.+/).+_(.+)\.csv',file_names).group(1,2)) + all_datetimes.append(datetime.strptime(datetime_str_on_s3, '%Y/%m/%d/%H:%M:%S')) + latest_timestamp = max(all_datetimes) + ## END OF NEW CODE + tables = db.run("SELECT table_name FROM information_schema.tables WHERE table_schema='public' AND table_type='BASE TABLE';") + print(tables) + for table in tables: + table_name = table[0] + rows = db.run(f"SELECT * FROM {table_name};") + -
\ No newline at end of file + csv_file_path = f"/tmp/{table_name}.csv" + with open(csv_file_path, "w", newline='') as file: + writer = csv.writer(file) + #column_names = [desc["name"] for desc in db.columns(f"SELECT * FROM {table_name};")] + column_names = [col_name[0] for col_name in db.run(f"SELECT column_name FROM INFORMATION_SCHEMA.COLUMNS where table_name = '{table_name}';")] + writer.writerow(column_names) + writer.writerows(rows) + s3_key = datetime.strftime(datetime.today(),f'{table_name}/%Y/%m/%d/{table_name}_%H:%M:%S.csv') + new_csv_content = open(csv_file_path, "r").read() + ## NEW CODE + latest_s3_object_key = datetime.strftime(latest_timestamp,f'{table_name}/%Y/%m/%d/{table_name}_%H:%M:%S.csv') + ## END OF NEW CODE + if existing_files[latest_s3_object_key] != new_csv_content: + try: + client.upload_file(csv_file_path, 'extract_bucket', s3_key) + logger.info(f"Uploaded {s3_key} to S3.") + except ClientError as e: + logger.error(f'Error uploading to S3: {e}') + else: + logger.info(f"No new data.") +
\ No newline at end of file diff --git a/src/secrets_manager.py b/src/secrets_manager.py new file mode 100644 index 0000000..c0fb61e --- /dev/null +++ b/src/secrets_manager.py @@ -0,0 +1,48 @@ +import boto3 +from botocore.exceptions import ClientError +import json + + +def sm_client(): + sm_client = boto3.client('secretsmanager') + yield sm_client + +def create_secret(sm_client, secret_name, cohort_id, user, password, host, database, port): + secret = { + "cohort_id": cohort_id, + "user": user, + "password": password, + "host": host, + "database": database, + "port": port + } + + response = sm_client.create_secret( + Name = secret_name, + SecretString = json.dumps(secret) + ) + + print(response) + return response + +def list_secret(sm_client): + response = sm_client.list_secrets() + secret_dict = response['SecretList'] + secret_names = [] + for items in secret_dict: + secret_names.append(items['Name']) + print(f'{len(secret_names)} secret(s) available') + for name in secret_names: + print(name) + return secret_names + +def retrieve_secrets(sm_client): + response = sm_client.get_secrets( + + ) + + + +#retrieve secret +#so lambda can access totesy db +#so lambda connect to the db and then retrieve the data
\ No newline at end of file diff --git a/terraform/rds.tf b/terraform/rds.tf index 4b25c5f..88783b7 100644 --- a/terraform/rds.tf +++ b/terraform/rds.tf @@ -2,9 +2,9 @@ data "aws_availability_zones" "available" {} module "vpc" { source = "terraform-aws-modules/vpc/aws" - version = "2.77.0" + version = "5.12.1" - name = "${var.project_name}" + name = var.project_name cidr = "10.0.0.0/16" azs = data.aws_availability_zones.available.names public_subnets = ["10.0.4.0/24", "10.0.5.0/24", "10.0.6.0/24"] @@ -13,7 +13,7 @@ module "vpc" { } resource "aws_db_subnet_group" "Terrific-Totes-sub-gr" { - name = "TT-db-subnet" + name = "tt-db-subnet" subnet_ids = module.vpc.public_subnets tags = { @@ -45,7 +45,7 @@ resource "aws_security_group" "rds" { } resource "aws_db_parameter_group" "Terrific-Totes-param-gr" { - name = "TT-db-param" + name = "tt-db-param" family = "postgres14" parameter { @@ -54,25 +54,27 @@ resource "aws_db_parameter_group" "Terrific-Totes-param-gr" { } } -resource "aws_db_instance" "Terrific-Totes-rds" { - db_name = "${var.project_name}" - instance_class = "db.t3.micro" - allocated_storage = 5 - engine = "postgres" - engine_version = "14.1" - username = "user credentials for the root user" # we could use .env here - password = "user password for the root user" # we could use .env here +resource "aws_db_instance" "terrific-totes-rds" { + db_name = var.project_name + instance_class = "db.t3.micro" + allocated_storage = 5 + engine = "postgres" + engine_version = "14.10" + username = "totes" + password = "totes123" + # username = "user credentials for the root user" # we could use .env here + # password = "user password for the root user" # we could use .env here ### alternatively to providing username nad password we can specify: -# resource "aws_kms_key" "example_key" { -# description = "Example KMS Key" -# } -# within the resource: -# manage_master_user_password = true -# master_user_secret_kms_key_id = aws_kms_key.example.key_id -# } + # resource "aws_kms_key" "example_key" { + # description = "Example KMS Key" + # } + # within the resource: + # manage_master_user_password = true + # master_user_secret_kms_key_id = aws_kms_key.example.key_id + # } db_subnet_group_name = aws_db_subnet_group.Terrific-Totes-sub-gr.name vpc_security_group_ids = [aws_security_group.rds.id] parameter_group_name = aws_db_parameter_group.Terrific-Totes-param-gr.name publicly_accessible = false skip_final_snapshot = true -}
\ No newline at end of file +} diff --git a/terraform/vars.tf b/terraform/vars.tf index d5cdafb..3c88731 100644 --- a/terraform/vars.tf +++ b/terraform/vars.tf @@ -29,8 +29,8 @@ variable "load_lambda_name" { } variable "project_name" { - type = string - default = "Terrific-Totes" + type = string + default = "tt" } data "aws_caller_identity" "current" {} diff --git a/test/test_secrets_manager.py b/test/test_secrets_manager.py new file mode 100644 index 0000000..86533bc --- /dev/null +++ b/test/test_secrets_manager.py @@ -0,0 +1,34 @@ +from src.secrets_manager import sm_client, create_secret, list_secret +import boto3 +from moto import mock_aws +import json +import pytest +import os + +pytest.fixture(scope='class') +def mock_aws_credentials(): + """Mocked AWS Credentials for moto.""" + os.environ["AWS_ACCESS_KEY_ID"] = "testing" + os.environ["AWS_SECRET_ACCESS_KEY"] = "testing" + os.environ["AWS_SECURITY_TOKEN"] = "testing" + os.environ["AWS_SESSION_TOKEN"] = "testing" + os.environ["AWS_DEFAULT_REGION"] = "eu-west-2" + +@pytest.fixture(scope='class') +def mock_sm_client(mock_aws_credentials): + with mock_aws(): + yield boto3.client('secretsmanager') + + +def test_create_secret_stores_secrets(mock_sm_client): + cohort_id = "test_cohort_id" + user = "test_user_id" + password = "test_password" + host = "test_host" + database = "test_database" + port = "test_port" + + secret_name = "test_secret" + response = create_secret(mock_sm_client, secret_name, cohort_id, user, password, host, database, port) + + assert response['Name'] == secret_name
\ No newline at end of file diff --git a/tests/dummy.txt b/tests/dummy.txt new file mode 100644 index 0000000..af27ff4 --- /dev/null +++ b/tests/dummy.txt @@ -0,0 +1 @@ +This is a test file.
\ No newline at end of file diff --git a/tests/dummy_identical.csv b/tests/dummy_identical.csv new file mode 100644 index 0000000..fdd8993 --- /dev/null +++ b/tests/dummy_identical.csv @@ -0,0 +1,4 @@ +Food_type,Flavour,Colour +Vegetable,Sour,Green +Berry,Sweet,Red + diff --git a/tests/test_extract_lambda.py b/tests/test_extract_lambda.py new file mode 100644 index 0000000..e94a8a4 --- /dev/null +++ b/tests/test_extract_lambda.py @@ -0,0 +1,109 @@ +import pytest +import boto3 +from moto import mock_aws +from unittest.mock import patch, MagicMock +from unittest import TestCase +from src.extract_lambda import list_existing_s3_files, connect_to_database, DBConnectionException, process_and_upload_tables +import os +import logging + +@pytest.fixture(scope='class') +def mock_config(): + env_vars = { + "host": "abc", + "port": "5432", + "user": "def", + "password": "password", + "database": "db", + } + with patch("src.extract_lambda.get_config", return_value=env_vars) as mock_config: + yield mock_config + + +@pytest.fixture(scope='class') +def aws_credentials(): + os.environ["AWS_ACCESS_KEY_ID"] = 'testing' + os.environ["AWS_SECRET_ACCESS_KEY"] = 'testing' + os.environ["AWS_SECURIT_TOKEN"] = 'testing' + os.environ["AWS_SESSION_TOKEN"] = 'testing' + os.environ["AWS_DEFAULT_REGION"]= 'eu-west-2' + +@pytest.fixture(scope='class') +def s3_client(aws_credentials): + with mock_aws(): + yield boto3.client('s3') + +class TestListExistingS3Files: + def test_error_if_no_bucket(self, s3_client, caplog): + + logger = logging.getLogger() + logger.info('Testing now.') + caplog.set_level(logging.ERROR) + list_existing_s3_files(client=s3_client) + assert 'Error listing S3 objects' in caplog.text + + def test_error_if_bucket_is_empty(self, s3_client, caplog): + + s3_client.create_bucket(Bucket='extract_bucket', + CreateBucketConfiguration={ + 'LocationConstraint': 'eu-west-2' + }) + list_existing_s3_files(client=s3_client) + assert 'The bucket is empty' in caplog.text + + def test_error_retrieving_object(self, s3_client, caplog): + s3_client.upload_file('tests/dummy.txt', 'extract_bucket', 'dummy.txt') + list_existing_s3_files(bucket_name='test_bucket', client=s3_client) + + assert 'Error retrieving S3 object ' in caplog.text + + def test_retrieves_file_content(self, s3_client, caplog): + result = list_existing_s3_files(client=s3_client) + + assert list(result.values()) == ['This is a test file.'] + +class TestConnectToDatabase: + def test_connect_to_database(mock_conn, mock_config): + with patch("src.extract_lambda.Connection", autospec=True) as mock_conn: + connect_to_database() + mock_conn.assert_called_with( + host="abc", user="def", port="5432", password="password", database="db" + ) + + def test_database_error(self, mock_config): + with pytest.raises(DBConnectionException): + connect_to_database() + + def test_logs_interface_error(self, caplog): + logger = logging.getLogger() + logger.info('Testing now.') + caplog.set_level(logging.ERROR) + with pytest.raises(DBConnectionException): + connect_to_database() + assert 'Interface error' in caplog.text +''' +class TestProcessAndUploadTables: + def test_error_process_and_upload_tables(mock_conn, mock_config, s3_client, caplog): + logger = logging.getLogger() + logger.info('Testing now.') + caplog.set_level(logging.ERROR) + #### + queries = ["SELECT table_name FROM information_schema.tables WHERE table_schema='public' AND table_type='BASE TABLE';", + "SELECT * FROM Fruits;", + "SELECT column_name FROM INFORMATION_SCHEMA.COLUMNS where table_name = 'Fruits'"] + return_values = [[['Fruits']], + [['Vegetable','Sour','Green'],['Berry','Sweet','Red']], + [['Food_type'],['Flavour'],['Colour']]] + vals = dict(zip(queries,return_values)) + + #### + with patch('src.extract_lambda.connect_to_database') as mock_db: + mock_db().run.side_effects = return_values + s3_key = 'Fruits/2024/08/15/Fruits_16:46:30.csv' + existing_files = {s3_key: 'Food_type,Flavour,Colour\nFruit,Sour,Green\nBerry,Sweet,Red'} + s3_client.create_bucket(Bucket='extract_bucket', + CreateBucketConfiguration={'LocationConstraint': 'eu-west-2'}) + s3_client.upload_file('tests/dummy_identical.csv', 'extract_bucket', s3_key) + process_and_upload_tables(mock_db(), existing_files, client=s3_client) + assert 'No new data.' in caplog.text +'''
\ No newline at end of file |
