about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Alex <git@ajschof.me> 2025-02-18 23:08:06 +0000
committer: GitHub <noreply@github.com> 2025-02-18 23:08:06 +0000
commit: eb0d30d0235dbadd1d5c385a0a49d4cd8aea021e (patch)
tree: 4970d8999b622998800a99e2922b8639049169d9
parent: 362805c9354dc653442f4e144022cc577ebeb43e (diff)
parent: 20572634aaab2b522128a88449b2a32b6c028fc4 (diff)
download: gdpr-obfuscator-eb0d30d0235dbadd1d5c385a0a49d4cd8aea021e.tar.gz
download: gdpr-obfuscator-eb0d30d0235dbadd1d5c385a0a49d4cd8aea021e.zip
Merge pull request #6 from ajschofield/feat/read_s3
implement reading from s3 bucket
-rw-r--r-- cli.py 11
-rw-r--r-- obfuscator/csv_reader.py 35
-rw-r--r-- obfuscator/utils.py 15
-rw-r--r-- pyproject.toml 3
-rw-r--r-- test/test_csv_reader.py 143
5 files changed, 199 insertions, 8 deletions
diff --git a/cli.py b/cli.py
index a4ab4c3..b1d4000 100644
--- a/cli.py
+++ b/cli.py
@@ -26,21 +26,24 @@ def main():
# Parse the arguments
args = parser.parse_args()
+ # Create the CSVReader object
+ reader = CSVReader()
+
# Read the CSV data based on the user's choice of local or S3
if args.local and not args.s3:
logger.debug("User chose to read CSV from local path")
- # Create a CSVReader object and read the local CSV file
- reader = CSVReader()
data = reader.read_local(args.local)
# For debug purposes, log the data read from the CSV
- logger.debug(data)
+ logger.debug("Contents: " + str(data))
else:
logger.debug("User chose to read CSV from S3")
+ data = reader.read_s3(args.s3)
+ logger.debug("Contents: " + str(data))
# Obfuscate the data based on the user's choice of PII fields
obfuscated_data = obfuscate(data, args.pii)
# For debug purposes, log the obfuscated data as JSON for readability
- logger.debug(json.dumps(obfuscated_data, indent=4))
+ logger.debug("Obfuscated data (JSON): " + json.dumps(obfuscated_data, indent=4))
# If the script is run directly (as it should be), call the main function
diff --git a/obfuscator/csv_reader.py b/obfuscator/csv_reader.py
index eb93609..8f4ebea 100644
--- a/obfuscator/csv_reader.py
+++ b/obfuscator/csv_reader.py
@@ -1,7 +1,10 @@
import csv
import io
+import boto3
+import os
from typing import List, Dict
from obfuscator.logger import get_logger
+from obfuscator.utils import get_s3_path
# Create the logger
logger = get_logger("CSVReader")
@@ -47,8 +50,36 @@ class CSVReader:
A method to read an S3 object containing CSV data
and return the data as a list of dictionaries.
"""
- # Yet to be implemented.
- return []
+ bucket, key = get_s3_path(path)
+ logger.debug(f"Reading S3 CSV from: {bucket}/{key}")
+
+ # If DEBUG=TRUE, use the localstack endpoint for testing
+ if os.getenv("DEBUG", "FALSE").upper() == "TRUE":
+ localstack_endpoint = "http://localhost.localstack.cloud:4566"
+ logger.debug("Using LocalStack endpoint for S3")
+ client = boto3.client(
+ "s3",
+ endpoint_url=localstack_endpoint,
+ aws_access_key_id="dummy",
+ aws_secret_access_key="dummy",
+ )
+ logger.debug(f"endpoint_url: {localstack_endpoint}")
+ else:
+ client = boto3.client("s3")
+
+ try:
+ # Attempt to read the S3 object and return the data as a list of dictionaries
+ response = client.get_object(Bucket=bucket, Key=key)
+ logger.info("S3 object read successfully")
+ # Read and decode the content
+ content = response["Body"].read().decode("utf-8")
+ # Even though the read_string method was only created for testing,
+ # it can be reused here to read and return the CSV data
+ return CSVReader.read_string(content)
+ # TODO: Add more specific exceptions to catch
+ except Exception as e:
+ logger.error(f"Error reading S3 object: {e}")
+ raise
@staticmethod
def read_string(content: str) -> List[Dict[str, str]]:
diff --git a/obfuscator/utils.py b/obfuscator/utils.py
new file mode 100644
index 0000000..2e4211f
--- /dev/null
+++ b/obfuscator/utils.py
@@ -0,0 +1,15 @@
+# Utility functions
+from obfuscator.logger import get_logger
+
+# Create the logger
+logger = get_logger("CLI")
+
+
+def get_s3_path(uri):
+ parts = uri.replace("s3://", "").split("/")
+ logger.debug(f"Parts: {parts}")
+ bucket = parts.pop(0)
+ logger.debug(f"Bucket: {bucket}")
+ key = "/".join(parts)
+ logger.debug(f"Key: {key}")
+ return bucket, key
diff --git a/pyproject.toml b/pyproject.toml
index cc923b9..46114bc 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,7 +5,7 @@ description = "A Python library designed to detect and remove Personally Identif
authors = [{ name = "Alex Schofield", email = "git@ajschof.me" }]
readme = "README.md"
requires-python = ">=3.13"
-dependencies = ["tabulate (>=0.9.0,<0.10.0)"]
+dependencies = ["tabulate (>=0.9.0,<0.10.0)", "boto3 (>=1.36.22,<2.0.0)"]
[tool.poetry]
package-mode = false
@@ -13,6 +13,7 @@ package-mode = false
[tool.poetry.group.dev.dependencies]
pytest = "8.3.4"
pytest-cov = "^6.0.0"
+moto = "^5.0.28"
[build-system]
requires = ["poetry-core>=2.0.0,<3.0.0"]
diff --git a/test/test_csv_reader.py b/test/test_csv_reader.py
index af13cff..e4c135b 100644
--- a/test/test_csv_reader.py
+++ b/test/test_csv_reader.py
@@ -1,10 +1,15 @@
-# csv_reader.py - tests
+# csv_reader.py - tests for read_string and read_s3
# Author: Alex Schofield
+import boto3
+from moto import mock_aws
from obfuscator.csv_reader import CSVReader
+import pytest
reader = CSVReader()
+# CSVREADER: READ_STRING TESTS
+
# Check if the function can read a CSV string with no content and return
# an empty list
@@ -61,3 +66,139 @@ def test_csv_with_quoted_fields_should_run_as_expected():
{"student_id": "5678", "name": "Student 2", "course": "Course 2"},
]
assert result == expected
+
+
+# CSVREADER: READ_S3 TESTS
+
+
+def setup_s3(s3_client, bucket: str, key: str, content: str):
+ s3_client.create_bucket(
+ Bucket=bucket,
+ CreateBucketConfiguration={"LocationConstraint": "eu-west-2"},
+ )
+ s3_client.put_object(Bucket=bucket, Key=key, Body=content)
+
+
+@pytest.fixture(autouse=True)
+def s3_client():
+ with mock_aws():
+ yield boto3.client("s3", "eu-west-2")
+
+
+def test_read_s3_valid_csv_returns_expected():
+ with mock_aws():
+ s3 = boto3.client("s3", region_name="eu-west-2")
+ bucket = "test-bucket"
+ key = "data/mock.csv"
+
+ csv_content = (
+ "student_id,name,course\n"
+ "1234,Student 1,Course 1\n"
+ "5678,Student 2,Course 2\n"
+ )
+
+ setup_s3(s3, bucket, key, csv_content)
+ path = f"s3://{bucket}/{key}"
+
+ data = reader.read_s3(path)
+
+ expected = [
+ {"student_id": "1234", "name": "Student 1", "course": "Course 1"},
+ {"student_id": "5678", "name": "Student 2", "course": "Course 2"},
+ ]
+
+ assert data == expected
+
+
+def test_read_s3_empty_csv_returns_empty_list():
+ with mock_aws():
+ s3 = boto3.client("s3", region_name="eu-west-2")
+ bucket = "empty-bucket"
+ key = "data/empty.csv"
+ csv_content = "student_id,name,course\n"
+ setup_s3(s3, bucket, key, csv_content)
+ path = f"s3://{bucket}/{key}"
+
+ data = CSVReader.read_s3(path)
+ assert data == []
+
+
+def test_read_s3_nonexistent_bucket_raises_exception():
+ with mock_aws():
+ bucket = "nonexistent-bucket"
+ key = "data/mock.csv"
+ path = f"s3://{bucket}/{key}"
+ with pytest.raises(Exception):
+ CSVReader.read_s3(path)
+
+
+def test_read_s3_nonexistent_key_raises_exception():
+ with mock_aws():
+ s3 = boto3.client("s3", region_name="eu-west-2")
+ bucket = "test-bucket"
+ s3.create_bucket(
+ Bucket=bucket,
+ CreateBucketConfiguration={"LocationConstraint": "eu-west-2"},
+ )
+ key = "data/nonexistent.csv"
+ path = f"s3://{bucket}/{key}"
+ with pytest.raises(Exception):
+ CSVReader.read_s3(path)
+
+
+def test_read_s3_malformed_csv_returns_expected():
+ with mock_aws():
+ s3 = boto3.client("s3", region_name="eu-west-2")
+ bucket = "test-bucket"
+ key = "data/malformed.csv"
+ csv_content = "1234,Student 1,Course 1\n" "5678,Student 2,Course 2\n"
+ setup_s3(s3, bucket, key, csv_content)
+ path = f"s3://{bucket}/{key}"
+
+ data = CSVReader.read_s3(path)
+ expected = [{"1234": "5678", "Student 1": "Student 2", "Course 1": "Course 2"}]
+ assert data == expected
+
+
+def test_read_s3_csv_with_extra_empty_lines():
+ with mock_aws():
+ s3 = boto3.client("s3", region_name="eu-west-2")
+ bucket = "test-bucket"
+ key = "data/extra_lines.csv"
+ csv_content = (
+ "student_id,name,course\n"
+ "1234,Student 1,Course 1\n"
+ "\n"
+ "5678,Student 2,Course 2\n"
+ "\n"
+ )
+ setup_s3(s3, bucket, key, csv_content)
+ path = f"s3://{bucket}/{key}"
+
+ data = CSVReader.read_s3(path)
+ expected = [
+ {"student_id": "1234", "name": "Student 1", "course": "Course 1"},
+ {"student_id": "5678", "name": "Student 2", "course": "Course 2"},
+ ]
+ assert data == expected
+
+
+def test_read_s3_csv_with_whitespace_in_fields():
+ with mock_aws():
+ s3 = boto3.client("s3", region_name="eu-west-2")
+ bucket = "test-bucket"
+ key = "data/whitespace.csv"
+ csv_content = (
+ "student_id, name , course \n"
+ " 1234 , Student 1 , Course 1 \n"
+ "5678,Student 2,Course 2\n"
+ )
+ setup_s3(s3, bucket, key, csv_content)
+ path = f"s3://{bucket}/{key}"
+
+ data = CSVReader.read_s3(path)
+ expected = [
+ {"student_id": " 1234 ", " name ": " Student 1 ", " course ": " Course 1 "},
+ {"student_id": "5678", " name ": "Student 2", " course ": "Course 2"},
+ ]
+ assert data == expected
git.ajschof.me — hosted by ajschofield — powered by cgit