diff options
| author | Alex <git@ajschof.me> | 2025-02-18 23:08:06 +0000 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2025-02-18 23:08:06 +0000 |
| commit | eb0d30d0235dbadd1d5c385a0a49d4cd8aea021e (patch) | |
| tree | 4970d8999b622998800a99e2922b8639049169d9 | |
| parent | 362805c9354dc653442f4e144022cc577ebeb43e (diff) | |
| parent | 20572634aaab2b522128a88449b2a32b6c028fc4 (diff) | |
| download | gdpr-obfuscator-eb0d30d0235dbadd1d5c385a0a49d4cd8aea021e.tar.gz gdpr-obfuscator-eb0d30d0235dbadd1d5c385a0a49d4cd8aea021e.zip | |
Merge pull request #6 from ajschofield/feat/read_s3
implement reading from s3 bucket
| -rw-r--r-- | cli.py | 11 | ||||
| -rw-r--r-- | obfuscator/csv_reader.py | 35 | ||||
| -rw-r--r-- | obfuscator/utils.py | 15 | ||||
| -rw-r--r-- | pyproject.toml | 3 | ||||
| -rw-r--r-- | test/test_csv_reader.py | 143 |
5 files changed, 199 insertions, 8 deletions
@@ -26,21 +26,24 @@ def main(): # Parse the arguments args = parser.parse_args() + # Create the CSVReader object + reader = CSVReader() + # Read the CSV data based on the user's choice of local or S3 if args.local and not args.s3: logger.debug("User chose to read CSV from local path") - # Create a CSVReader object and read the local CSV file - reader = CSVReader() data = reader.read_local(args.local) # For debug purposes, log the data read from the CSV - logger.debug(data) + logger.debug("Contents: " + str(data)) else: logger.debug("User chose to read CSV from S3") + data = reader.read_s3(args.s3) + logger.debug("Contents: " + str(data)) # Obfuscate the data based on the user's choice of PII fields obfuscated_data = obfuscate(data, args.pii) # For debug purposes, log the obfuscated data as JSON for readability - logger.debug(json.dumps(obfuscated_data, indent=4)) + logger.debug("Obfuscated data (JSON): " + json.dumps(obfuscated_data, indent=4)) # If the script is run directly (as it should be), call the main function diff --git a/obfuscator/csv_reader.py b/obfuscator/csv_reader.py index eb93609..8f4ebea 100644 --- a/obfuscator/csv_reader.py +++ b/obfuscator/csv_reader.py @@ -1,7 +1,10 @@ import csv import io +import boto3 +import os from typing import List, Dict from obfuscator.logger import get_logger +from obfuscator.utils import get_s3_path # Create the logger logger = get_logger("CSVReader") @@ -47,8 +50,36 @@ class CSVReader: A method to read an S3 object containing CSV data and return the data as a list of dictionaries. """ - # Yet to be implemented. - return [] + bucket, key = get_s3_path(path) + logger.debug(f"Reading S3 CSV from: {bucket}/{key}") + + # If DEBUG=TRUE, use the localstack endpoint for testing + if os.getenv("DEBUG", "FALSE").upper() == "TRUE": + localstack_endpoint = "http://localhost.localstack.cloud:4566" + logger.debug("Using LocalStack endpoint for S3") + client = boto3.client( + "s3", + endpoint_url=localstack_endpoint, + aws_access_key_id="dummy", + aws_secret_access_key="dummy", + ) + logger.debug(f"endpoint_url: {localstack_endpoint}") + else: + client = boto3.client("s3") + + try: + # Attempt to read the S3 object and return the data as a list of dictionaries + response = client.get_object(Bucket=bucket, Key=key) + logger.info("S3 object read successfully") + # Read and decode the content + content = response["Body"].read().decode("utf-8") + # Even though the read_string method was only created for testing, + # it can be reused here to read and return the CSV data + return CSVReader.read_string(content) + # TODO: Add more specific exceptions to catch + except Exception as e: + logger.error(f"Error reading S3 object: {e}") + raise @staticmethod def read_string(content: str) -> List[Dict[str, str]]: diff --git a/obfuscator/utils.py b/obfuscator/utils.py new file mode 100644 index 0000000..2e4211f --- /dev/null +++ b/obfuscator/utils.py @@ -0,0 +1,15 @@ +# Utility functions +from obfuscator.logger import get_logger + +# Create the logger +logger = get_logger("CLI") + + +def get_s3_path(uri): + parts = uri.replace("s3://", "").split("/") + logger.debug(f"Parts: {parts}") + bucket = parts.pop(0) + logger.debug(f"Bucket: {bucket}") + key = "/".join(parts) + logger.debug(f"Key: {key}") + return bucket, key diff --git a/pyproject.toml b/pyproject.toml index cc923b9..46114bc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ description = "A Python library designed to detect and remove Personally Identif authors = [{ name = "Alex Schofield", email = "git@ajschof.me" }] readme = "README.md" requires-python = ">=3.13" -dependencies = ["tabulate (>=0.9.0,<0.10.0)"] +dependencies = ["tabulate (>=0.9.0,<0.10.0)", "boto3 (>=1.36.22,<2.0.0)"] [tool.poetry] package-mode = false @@ -13,6 +13,7 @@ package-mode = false [tool.poetry.group.dev.dependencies] pytest = "8.3.4" pytest-cov = "^6.0.0" +moto = "^5.0.28" [build-system] requires = ["poetry-core>=2.0.0,<3.0.0"] diff --git a/test/test_csv_reader.py b/test/test_csv_reader.py index af13cff..e4c135b 100644 --- a/test/test_csv_reader.py +++ b/test/test_csv_reader.py @@ -1,10 +1,15 @@ -# csv_reader.py - tests +# csv_reader.py - tests for read_string and read_s3 # Author: Alex Schofield +import boto3 +from moto import mock_aws from obfuscator.csv_reader import CSVReader +import pytest reader = CSVReader() +# CSVREADER: READ_STRING TESTS + # Check if the function can read a CSV string with no content and return # an empty list @@ -61,3 +66,139 @@ def test_csv_with_quoted_fields_should_run_as_expected(): {"student_id": "5678", "name": "Student 2", "course": "Course 2"}, ] assert result == expected + + +# CSVREADER: READ_S3 TESTS + + +def setup_s3(s3_client, bucket: str, key: str, content: str): + s3_client.create_bucket( + Bucket=bucket, + CreateBucketConfiguration={"LocationConstraint": "eu-west-2"}, + ) + s3_client.put_object(Bucket=bucket, Key=key, Body=content) + + +@pytest.fixture(autouse=True) +def s3_client(): + with mock_aws(): + yield boto3.client("s3", "eu-west-2") + + +def test_read_s3_valid_csv_returns_expected(): + with mock_aws(): + s3 = boto3.client("s3", region_name="eu-west-2") + bucket = "test-bucket" + key = "data/mock.csv" + + csv_content = ( + "student_id,name,course\n" + "1234,Student 1,Course 1\n" + "5678,Student 2,Course 2\n" + ) + + setup_s3(s3, bucket, key, csv_content) + path = f"s3://{bucket}/{key}" + + data = reader.read_s3(path) + + expected = [ + {"student_id": "1234", "name": "Student 1", "course": "Course 1"}, + {"student_id": "5678", "name": "Student 2", "course": "Course 2"}, + ] + + assert data == expected + + +def test_read_s3_empty_csv_returns_empty_list(): + with mock_aws(): + s3 = boto3.client("s3", region_name="eu-west-2") + bucket = "empty-bucket" + key = "data/empty.csv" + csv_content = "student_id,name,course\n" + setup_s3(s3, bucket, key, csv_content) + path = f"s3://{bucket}/{key}" + + data = CSVReader.read_s3(path) + assert data == [] + + +def test_read_s3_nonexistent_bucket_raises_exception(): + with mock_aws(): + bucket = "nonexistent-bucket" + key = "data/mock.csv" + path = f"s3://{bucket}/{key}" + with pytest.raises(Exception): + CSVReader.read_s3(path) + + +def test_read_s3_nonexistent_key_raises_exception(): + with mock_aws(): + s3 = boto3.client("s3", region_name="eu-west-2") + bucket = "test-bucket" + s3.create_bucket( + Bucket=bucket, + CreateBucketConfiguration={"LocationConstraint": "eu-west-2"}, + ) + key = "data/nonexistent.csv" + path = f"s3://{bucket}/{key}" + with pytest.raises(Exception): + CSVReader.read_s3(path) + + +def test_read_s3_malformed_csv_returns_expected(): + with mock_aws(): + s3 = boto3.client("s3", region_name="eu-west-2") + bucket = "test-bucket" + key = "data/malformed.csv" + csv_content = "1234,Student 1,Course 1\n" "5678,Student 2,Course 2\n" + setup_s3(s3, bucket, key, csv_content) + path = f"s3://{bucket}/{key}" + + data = CSVReader.read_s3(path) + expected = [{"1234": "5678", "Student 1": "Student 2", "Course 1": "Course 2"}] + assert data == expected + + +def test_read_s3_csv_with_extra_empty_lines(): + with mock_aws(): + s3 = boto3.client("s3", region_name="eu-west-2") + bucket = "test-bucket" + key = "data/extra_lines.csv" + csv_content = ( + "student_id,name,course\n" + "1234,Student 1,Course 1\n" + "\n" + "5678,Student 2,Course 2\n" + "\n" + ) + setup_s3(s3, bucket, key, csv_content) + path = f"s3://{bucket}/{key}" + + data = CSVReader.read_s3(path) + expected = [ + {"student_id": "1234", "name": "Student 1", "course": "Course 1"}, + {"student_id": "5678", "name": "Student 2", "course": "Course 2"}, + ] + assert data == expected + + +def test_read_s3_csv_with_whitespace_in_fields(): + with mock_aws(): + s3 = boto3.client("s3", region_name="eu-west-2") + bucket = "test-bucket" + key = "data/whitespace.csv" + csv_content = ( + "student_id, name , course \n" + " 1234 , Student 1 , Course 1 \n" + "5678,Student 2,Course 2\n" + ) + setup_s3(s3, bucket, key, csv_content) + path = f"s3://{bucket}/{key}" + + data = CSVReader.read_s3(path) + expected = [ + {"student_id": " 1234 ", " name ": " Student 1 ", " course ": " Course 1 "}, + {"student_id": "5678", " name ": "Student 2", " course ": "Course 2"}, + ] + assert data == expected |
