From a104e4a04c914eed6b2a9c52107cdbf7f164d106 Mon Sep 17 00:00:00 2001 From: Alex Schofield Date: Tue, 18 Feb 2025 17:02:38 +0000 Subject: add subheading comments for test sections in test_csv_reader.py --- test/test_csv_reader.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/test_csv_reader.py b/test/test_csv_reader.py index af13cff..271adae 100644 --- a/test/test_csv_reader.py +++ b/test/test_csv_reader.py @@ -5,6 +5,8 @@ from obfuscator.csv_reader import CSVReader reader = CSVReader() +# CSVREADER: READ_STRING TESTS + # Check if the function can read a CSV string with no content and return # an empty list @@ -61,3 +63,5 @@ def test_csv_with_quoted_fields_should_run_as_expected(): {"student_id": "5678", "name": "Student 2", "course": "Course 2"}, ] assert result == expected + +# CSVREADER: READ_S3 TESTS \ No newline at end of file -- cgit v1.2.3 From 4a2b7bbae7c95ade8bcb13ae1ea270469b685f0f Mon Sep 17 00:00:00 2001 From: Alex Schofield Date: Tue, 18 Feb 2025 17:05:19 +0000 Subject: add & import boto3 and moto to pyproject.toml/test_csv_reader.py --- pyproject.toml | 3 ++- test/test_csv_reader.py | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index cc923b9..46114bc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ description = "A Python library designed to detect and remove Personally Identif authors = [{ name = "Alex Schofield", email = "git@ajschof.me" }] readme = "README.md" requires-python = ">=3.13" -dependencies = ["tabulate (>=0.9.0,<0.10.0)"] +dependencies = ["tabulate (>=0.9.0,<0.10.0)", "boto3 (>=1.36.22,<2.0.0)"] [tool.poetry] package-mode = false @@ -13,6 +13,7 @@ package-mode = false [tool.poetry.group.dev.dependencies] pytest = "8.3.4" pytest-cov = "^6.0.0" +moto = "^5.0.28" [build-system] requires = ["poetry-core>=2.0.0,<3.0.0"] diff --git a/test/test_csv_reader.py b/test/test_csv_reader.py index 271adae..f59a5a7 100644 --- a/test/test_csv_reader.py +++ b/test/test_csv_reader.py @@ -1,6 +1,8 @@ # csv_reader.py - tests # Author: Alex Schofield +import boto3 +from moto import mock_s3 from obfuscator.csv_reader import CSVReader reader = CSVReader() -- cgit v1.2.3 From 81f7d60e1a20f2d504d810fb44b01c79bd6d55a0 Mon Sep 17 00:00:00 2001 From: Alex Schofield Date: Tue, 18 Feb 2025 19:44:44 +0000 Subject: add first test for read_s3 method in CSVReader" --- test/test_csv_reader.py | 45 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 43 insertions(+), 2 deletions(-) diff --git a/test/test_csv_reader.py b/test/test_csv_reader.py index f59a5a7..27abea0 100644 --- a/test/test_csv_reader.py +++ b/test/test_csv_reader.py @@ -2,8 +2,9 @@ # Author: Alex Schofield import boto3 -from moto import mock_s3 +from moto import mock_aws from obfuscator.csv_reader import CSVReader +import pytest reader = CSVReader() @@ -66,4 +67,44 @@ def test_csv_with_quoted_fields_should_run_as_expected(): ] assert result == expected -# CSVREADER: READ_S3 TESTS \ No newline at end of file + +# CSVREADER: READ_S3 TESTS + + +def setup_s3(s3_client, bucket: str, key: str, content: str): + s3_client.create_bucket( + Bucket="test-bucket", + CreateBucketConfiguration={"LocationConstraint": "eu-west-2"}, + ) + s3_client.put_object(Bucket=bucket, Key=key, Body=content) + + +@pytest.fixture(autouse=True) +def s3_client(): + with mock_aws(): + yield boto3.client("s3", "eu-west-2") + + +def test_read_s3_valid_csv_returns_expected(): + with mock_aws(): + s3 = boto3.client("s3", region_name="eu-west-2") + bucket = "test-bucket" + key = "data/mock.csv" + + csv_content = ( + "student_id,name,course\n" + "1234,Student 1,Course 1\n" + "5678,Student 2,Course 2\n" + ) + + setup_s3(s3, bucket, key, csv_content) + path = f"s3://{bucket}/{key}" + + data = reader.read_s3(path) + + expected = [ + {"student_id": "1234", "name": "Student 1", "course": "Course 1"}, + {"student_id": "5678", "name": "Student 2", "course": "Course 2"}, + ] + + assert data == expected -- cgit v1.2.3 From 878d6e0dfd0ce1bfe00ca0bcadce6dd16749e6d3 Mon Sep 17 00:00:00 2001 From: Alex Schofield Date: Tue, 18 Feb 2025 19:53:46 +0000 Subject: create utility function to parse S3 URI --- obfuscator/utils.py | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 obfuscator/utils.py diff --git a/obfuscator/utils.py b/obfuscator/utils.py new file mode 100644 index 0000000..f0174f8 --- /dev/null +++ b/obfuscator/utils.py @@ -0,0 +1,8 @@ +# Utility functions + + +def get_s3_path(uri): + parts = uri.replace("s3://", "").split("/") + bucket = parts.pop(0) + key = "/".join(parts) + return bucket, key -- cgit v1.2.3 From 2c19a941bb3afe4145761c4c6e54880490160aa2 Mon Sep 17 00:00:00 2001 From: Alex Schofield Date: Tue, 18 Feb 2025 21:19:51 +0000 Subject: call get_s3_path() in read_s3 and debug log path --- obfuscator/csv_reader.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/obfuscator/csv_reader.py b/obfuscator/csv_reader.py index eb93609..578b96b 100644 --- a/obfuscator/csv_reader.py +++ b/obfuscator/csv_reader.py @@ -2,6 +2,7 @@ import csv import io from typing import List, Dict from obfuscator.logger import get_logger +from obfuscator.utils import get_s3_path # Create the logger logger = get_logger("CSVReader") @@ -48,7 +49,8 @@ class CSVReader: and return the data as a list of dictionaries. """ # Yet to be implemented. - return [] + bucket, key = get_s3_path(path) + logger.debug(f"Reading S3 CSV from: {bucket}/{key}") @staticmethod def read_string(content: str) -> List[Dict[str, str]]: -- cgit v1.2.3 From 0022dba517a82cf15477281d8d02e0d7da8dbbe1 Mon Sep 17 00:00:00 2001 From: Alex Schofield Date: Tue, 18 Feb 2025 21:21:31 +0000 Subject: create CSVReader object outside of args if statement --- cli.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cli.py b/cli.py index a4ab4c3..9d003a7 100644 --- a/cli.py +++ b/cli.py @@ -26,11 +26,12 @@ def main(): # Parse the arguments args = parser.parse_args() + # Create the CSVReader object + reader = CSVReader() + # Read the CSV data based on the user's choice of local or S3 if args.local and not args.s3: logger.debug("User chose to read CSV from local path") - # Create a CSVReader object and read the local CSV file - reader = CSVReader() data = reader.read_local(args.local) # For debug purposes, log the data read from the CSV logger.debug(data) -- cgit v1.2.3 From 125225291481a7e9d383c037dc8c0fc720de48cd Mon Sep 17 00:00:00 2001 From: Alex Schofield Date: Tue, 18 Feb 2025 22:08:55 +0000 Subject: read data from S3 object path and debug log contents --- cli.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cli.py b/cli.py index 9d003a7..974ac3c 100644 --- a/cli.py +++ b/cli.py @@ -37,6 +37,8 @@ def main(): logger.debug(data) else: logger.debug("User chose to read CSV from S3") + data = reader.read_s3(args.s3) + logger.debug(data) # Obfuscate the data based on the user's choice of PII fields obfuscated_data = obfuscate(data, args.pii) -- cgit v1.2.3 From d0a8826f43a48e3db53f4ec3f62d5b6e5f3fd15d Mon Sep 17 00:00:00 2001 From: Alex Schofield Date: Tue, 18 Feb 2025 22:11:01 +0000 Subject: add logging to utils.py --- obfuscator/utils.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/obfuscator/utils.py b/obfuscator/utils.py index f0174f8..2e4211f 100644 --- a/obfuscator/utils.py +++ b/obfuscator/utils.py @@ -1,8 +1,15 @@ # Utility functions +from obfuscator.logger import get_logger + +# Create the logger +logger = get_logger("CLI") def get_s3_path(uri): parts = uri.replace("s3://", "").split("/") + logger.debug(f"Parts: {parts}") bucket = parts.pop(0) + logger.debug(f"Bucket: {bucket}") key = "/".join(parts) + logger.debug(f"Key: {key}") return bucket, key -- cgit v1.2.3 From 5875763f8e384a50004c4dd8ea08598d68f251ed Mon Sep 17 00:00:00 2001 From: Alex Schofield Date: Tue, 18 Feb 2025 22:13:51 +0000 Subject: make debug log messages clearer in output --- cli.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cli.py b/cli.py index 974ac3c..b1d4000 100644 --- a/cli.py +++ b/cli.py @@ -34,16 +34,16 @@ def main(): logger.debug("User chose to read CSV from local path") data = reader.read_local(args.local) # For debug purposes, log the data read from the CSV - logger.debug(data) + logger.debug("Contents: " + str(data)) else: logger.debug("User chose to read CSV from S3") data = reader.read_s3(args.s3) - logger.debug(data) + logger.debug("Contents: " + str(data)) # Obfuscate the data based on the user's choice of PII fields obfuscated_data = obfuscate(data, args.pii) # For debug purposes, log the obfuscated data as JSON for readability - logger.debug(json.dumps(obfuscated_data, indent=4)) + logger.debug("Obfuscated data (JSON): " + json.dumps(obfuscated_data, indent=4)) # If the script is run directly (as it should be), call the main function -- cgit v1.2.3 From 6e8c602b7cce9244e66fb0056eeba5e6ab697e6a Mon Sep 17 00:00:00 2001 From: Alex Schofield Date: Tue, 18 Feb 2025 22:38:48 +0000 Subject: add untested read_s3 logic to CSVReader --- obfuscator/csv_reader.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/obfuscator/csv_reader.py b/obfuscator/csv_reader.py index 578b96b..c777998 100644 --- a/obfuscator/csv_reader.py +++ b/obfuscator/csv_reader.py @@ -1,5 +1,6 @@ import csv import io +import boto3 from typing import List, Dict from obfuscator.logger import get_logger from obfuscator.utils import get_s3_path @@ -48,10 +49,25 @@ class CSVReader: A method to read an S3 object containing CSV data and return the data as a list of dictionaries. """ - # Yet to be implemented. bucket, key = get_s3_path(path) logger.debug(f"Reading S3 CSV from: {bucket}/{key}") + client = boto3.client("s3") + + try: + # Attempt to read the S3 object and return the data as a list of dictionaries + response = client.get_object(Bucket=bucket, Key=key) + logger.info("S3 object read successfully") + # Read and decode the content + content = response["Body"].read().decode("utf-8") + # Even though the read_string method was only created for testing, + # it can be reused here to read and return the CSV data + return CSVReader.read_string(content) + # TODO: Add more specific exceptions to catch + except Exception as e: + logger.error(f"Error reading S3 object: {e}") + raise + @staticmethod def read_string(content: str) -> List[Dict[str, str]]: """ -- cgit v1.2.3 From 31d01d5efbccbf923e9131c0b67aa916be873e9e Mon Sep 17 00:00:00 2001 From: Alex Schofield Date: Tue, 18 Feb 2025 22:50:57 +0000 Subject: change boto3 endpoint if debug mode is enabled should this go here in 'production' code, even though it's a testing tool? this may be changed in the near future. i've just got a gut feeling that this might not be right. --- obfuscator/csv_reader.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/obfuscator/csv_reader.py b/obfuscator/csv_reader.py index c777998..8f4ebea 100644 --- a/obfuscator/csv_reader.py +++ b/obfuscator/csv_reader.py @@ -1,6 +1,7 @@ import csv import io import boto3 +import os from typing import List, Dict from obfuscator.logger import get_logger from obfuscator.utils import get_s3_path @@ -52,7 +53,19 @@ class CSVReader: bucket, key = get_s3_path(path) logger.debug(f"Reading S3 CSV from: {bucket}/{key}") - client = boto3.client("s3") + # If DEBUG=TRUE, use the localstack endpoint for testing + if os.getenv("DEBUG", "FALSE").upper() == "TRUE": + localstack_endpoint = "http://localhost.localstack.cloud:4566" + logger.debug("Using LocalStack endpoint for S3") + client = boto3.client( + "s3", + endpoint_url=localstack_endpoint, + aws_access_key_id="dummy", + aws_secret_access_key="dummy", + ) + logger.debug(f"endpoint_url: {localstack_endpoint}") + else: + client = boto3.client("s3") try: # Attempt to read the S3 object and return the data as a list of dictionaries -- cgit v1.2.3 From ebb884f21106a574d1ab5b61e3d2a34080f0b9a3 Mon Sep 17 00:00:00 2001 From: Alex Schofield Date: Tue, 18 Feb 2025 23:00:19 +0000 Subject: add tests for read_s3 in CSVReader --- test/test_csv_reader.py | 96 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 95 insertions(+), 1 deletion(-) diff --git a/test/test_csv_reader.py b/test/test_csv_reader.py index 27abea0..540e172 100644 --- a/test/test_csv_reader.py +++ b/test/test_csv_reader.py @@ -73,7 +73,7 @@ def test_csv_with_quoted_fields_should_run_as_expected(): def setup_s3(s3_client, bucket: str, key: str, content: str): s3_client.create_bucket( - Bucket="test-bucket", + Bucket=bucket, CreateBucketConfiguration={"LocationConstraint": "eu-west-2"}, ) s3_client.put_object(Bucket=bucket, Key=key, Body=content) @@ -108,3 +108,97 @@ def test_read_s3_valid_csv_returns_expected(): ] assert data == expected + + +def test_read_s3_empty_csv_returns_empty_list(): + with mock_aws(): + s3 = boto3.client("s3", region_name="eu-west-2") + bucket = "empty-bucket" + key = "data/empty.csv" + csv_content = "student_id,name,course\n" + setup_s3(s3, bucket, key, csv_content) + path = f"s3://{bucket}/{key}" + + data = CSVReader.read_s3(path) + assert data == [] + + +def test_read_s3_nonexistent_bucket_raises_exception(): + with mock_aws(): + bucket = "nonexistent-bucket" + key = "data/mock.csv" + path = f"s3://{bucket}/{key}" + with pytest.raises(Exception): + CSVReader.read_s3(path) + + +def test_read_s3_nonexistent_key_raises_exception(): + with mock_aws(): + s3 = boto3.client("s3", region_name="eu-west-2") + bucket = "test-bucket" + s3.create_bucket( + Bucket=bucket, + CreateBucketConfiguration={"LocationConstraint": "eu-west-2"}, + ) + key = "data/nonexistent.csv" + path = f"s3://{bucket}/{key}" + with pytest.raises(Exception): + CSVReader.read_s3(path) + + +def test_read_s3_malformed_csv_returns_expected(): + with mock_aws(): + s3 = boto3.client("s3", region_name="eu-west-2") + bucket = "test-bucket" + key = "data/malformed.csv" + csv_content = "1234,Student 1,Course 1\n" "5678,Student 2,Course 2\n" + setup_s3(s3, bucket, key, csv_content) + path = f"s3://{bucket}/{key}" + + data = CSVReader.read_s3(path) + expected = [{"1234": "5678", "Student 1": "Student 2", "Course 1": "Course 2"}] + assert data == expected + + +def test_read_s3_csv_with_extra_empty_lines(): + with mock_aws(): + s3 = boto3.client("s3", region_name="eu-west-2") + bucket = "test-bucket" + key = "data/extra_lines.csv" + csv_content = ( + "student_id,name,course\n" + "1234,Student 1,Course 1\n" + "\n" + "5678,Student 2,Course 2\n" + "\n" + ) + setup_s3(s3, bucket, key, csv_content) + path = f"s3://{bucket}/{key}" + + data = CSVReader.read_s3(path) + expected = [ + {"student_id": "1234", "name": "Student 1", "course": "Course 1"}, + {"student_id": "5678", "name": "Student 2", "course": "Course 2"}, + ] + assert data == expected + + +def test_read_s3_csv_with_whitespace_in_fields(): + with mock_aws(): + s3 = boto3.client("s3", region_name="eu-west-2") + bucket = "test-bucket" + key = "data/whitespace.csv" + csv_content = ( + "student_id, name , course \n" + " 1234 , Student 1 , Course 1 \n" + "5678,Student 2,Course 2\n" + ) + setup_s3(s3, bucket, key, csv_content) + path = f"s3://{bucket}/{key}" + + data = CSVReader.read_s3(path) + expected = [ + {"student_id": " 1234 ", " name ": " Student 1 ", " course ": " Course 1 "}, + {"student_id": "5678", " name ": "Student 2", " course ": "Course 2"}, + ] + assert data == expected -- cgit v1.2.3 From 20572634aaab2b522128a88449b2a32b6c028fc4 Mon Sep 17 00:00:00 2001 From: Alex Schofield Date: Tue, 18 Feb 2025 23:05:33 +0000 Subject: update test_csv_reader.py header --- test/test_csv_reader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_csv_reader.py b/test/test_csv_reader.py index 540e172..e4c135b 100644 --- a/test/test_csv_reader.py +++ b/test/test_csv_reader.py @@ -1,4 +1,4 @@ -# csv_reader.py - tests +# csv_reader.py - tests for read_string and read_s3 # Author: Alex Schofield import boto3 -- cgit v1.2.3