about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Alex <git@ajschof.me> 2025-02-19 15:58:28 +0000
committer GitHub <noreply@github.com> 2025-02-19 15:58:28 +0000
commit 4066bf747e1e4c938526957c119f3f1485ee251e (patch)
tree 9a1e95f4ccbdd04e19d67a6c13641a19c4d4f3e0
parent f24955044c4c05e37aba4efb505ec63b44113912 (diff)
parent 5402af2c7198a685a57a05e29a869e1e72a6b877 (diff)
download gdpr-obfuscator-4066bf747e1e4c938526957c119f3f1485ee251e.tar.gz
download gdpr-obfuscator-4066bf747e1e4c938526957c119f3f1485ee251e.zip
Merge pull request #8 from ajschofield/refining-phase
mostly minor changes (fixing things up)
-rw-r--r-- TODO 0
-rw-r--r-- cli.py 61
-rw-r--r-- obfuscator/csv_reader.py 97
-rw-r--r-- obfuscator/csv_writer.py 26
-rw-r--r-- obfuscator/logger.py 40
-rw-r--r-- obfuscator/obfuscate.py 14
-rw-r--r-- obfuscator/read.py 89
-rw-r--r-- obfuscator/utils.py 21
-rw-r--r-- obfuscator/write.py 28
-rw-r--r-- test/test_obfuscator.py 15
-rw-r--r-- test/test_read.py (renamed from test/test_csv_reader.py) 39
-rw-r--r-- test/test_write.py (renamed from test/test_csv_writer.py) 15
12 files changed, 208 insertions, 237 deletions
diff --git a/TODO b/TODO
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/TODO
diff --git a/cli.py b/cli.py
index f2c8771..bd49707 100644
--- a/cli.py
+++ b/cli.py
@@ -1,53 +1,52 @@
import argparse
-import json
-from obfuscator.csv_reader import CSVReader
+from obfuscator.read import DataReader
+from obfuscator.write import DataWriter
from obfuscator.obfuscate import obfuscate
from obfuscator.logger import get_logger
-from obfuscator.csv_writer import create_byte_stream
-
-# Create the logger
-logger = get_logger("CLI")
def main():
- # Create an argument parser
- parser = argparse.ArgumentParser(description="gdpr-obfuscator")
- # Require user to either choose a local file or an S3 object
- # The user can only choose one of these options or the program will exit
- # If not provided, the program will exit
+ parser = argparse.ArgumentParser(
+ prog="GDPR-Obfuscator",
+ description="Obfuscate sensitive data stored locally or in an AWS environment",
+ )
+
+ parser.add_argument(
+ "-v", "--verbose", action="store_true", help="Enable verbose logging"
+ )
+
loc = parser.add_mutually_exclusive_group(required=True)
- loc.add_argument("--local")
- loc.add_argument("--s3")
+ loc.add_argument("-l", "--local", help="Local path to file")
+ loc.add_argument("-s", "--s3", help="URI path to file stored in S3")
- # Require user to provide a list of PII fields to obfuscate
- # e.g. --pii name email_address
- # If not provided, the program will exit
- parser.add_argument("--pii", nargs="+", required=True)
+ parser.add_argument(
+ "-p",
+ "--pii",
+ nargs="+",
+ required=True,
+ help="List of PII fields to obfuscate, separated by spaces",
+ )
- # Parse the arguments
args = parser.parse_args()
- # Create the CSVReader object
- reader = CSVReader()
+ log_level = "DEBUG" if args.verbose else "INFO"
+ logger = get_logger("CLI", log_level)
+
+ reader = DataReader(log_level)
- # Read the CSV data based on the user's choice of local or S3
if args.local and not args.s3:
- logger.debug("User chose to read CSV from local path")
+ logger.debug("Read data from local path")
data = reader.read_local(args.local)
- # For debug purposes, log the data read from the CSV
- logger.debug("Contents: " + str(data))
else:
- logger.debug("User chose to read CSV from S3")
+ logger.debug("Read data from S3")
data = reader.read_s3(args.s3)
- logger.debug("Contents: " + str(data))
- # Obfuscate the data based on the user's choice of PII fields
obfuscated_data = obfuscate(data, args.pii)
- # For debug purposes, log the obfuscated data as JSON for readability
- logger.debug("Obfuscated data (JSON): " + json.dumps(obfuscated_data, indent=4))
- return create_byte_stream(obfuscated_data)
+
+ writer = DataWriter()
+
+ return writer.create_byte_stream(obfuscated_data)
-# If the script is run directly (as it should be), call the main function
if __name__ == "__main__":
main()
diff --git a/obfuscator/csv_reader.py b/obfuscator/csv_reader.py
deleted file mode 100644
index 8f4ebea..0000000
--- a/obfuscator/csv_reader.py
+++ /dev/null
@@ -1,97 +0,0 @@
-import csv
-import io
-import boto3
-import os
-from typing import List, Dict
-from obfuscator.logger import get_logger
-from obfuscator.utils import get_s3_path
-
-# Create the logger
-logger = get_logger("CSVReader")
-
-# Putting the CSV reading components into a class may seem like overkill
-# for a simple script, but it allows for better organization and scalability.
-# @staticmethod is used to define the method without an instance of the class
-# being required. The methods could be defined just as functions, and this
-# may still be changed.
-
-
-class CSVReader:
- """
- A class to read CSV data from a local file, S3 object, or string. Near
- the project completion, support for JSON/Parquet files will be added.
- """
-
- @staticmethod
- def read_local(path) -> List[Dict[str, str]]:
- """
- A method to read a local CSV file and return the data as a list of
- dictionaries.
- """
- # Log the path of the file being read for debugging
- logger.debug(f"Reading local CSV from: {path}")
-
- # Attempt to read the file and return the data as a list of dictionaries
- # However, if the file isn't found or there is a generic exception, log
- # the error and raise an exception
- try:
- with open(path, mode="r", encoding="utf-8") as f:
- reader = csv.DictReader(f)
- return [dict(row) for row in reader]
- except FileNotFoundError:
- logger.error(f"File not found: {path}")
- raise
- except Exception as e:
- logger.error(f"Error reading file: {e}")
-
- @staticmethod
- def read_s3(path) -> List[Dict[str, str]]:
- """
- A method to read an S3 object containing CSV data
- and return the data as a list of dictionaries.
- """
- bucket, key = get_s3_path(path)
- logger.debug(f"Reading S3 CSV from: {bucket}/{key}")
-
- # If DEBUG=TRUE, use the localstack endpoint for testing
- if os.getenv("DEBUG", "FALSE").upper() == "TRUE":
- localstack_endpoint = "http://localhost.localstack.cloud:4566"
- logger.debug("Using LocalStack endpoint for S3")
- client = boto3.client(
- "s3",
- endpoint_url=localstack_endpoint,
- aws_access_key_id="dummy",
- aws_secret_access_key="dummy",
- )
- logger.debug(f"endpoint_url: {localstack_endpoint}")
- else:
- client = boto3.client("s3")
-
- try:
- # Attempt to read the S3 object and return the data as a list of dictionaries
- response = client.get_object(Bucket=bucket, Key=key)
- logger.info("S3 object read successfully")
- # Read and decode the content
- content = response["Body"].read().decode("utf-8")
- # Even though the read_string method was only created for testing,
- # it can be reused here to read and return the CSV data
- return CSVReader.read_string(content)
- # TODO: Add more specific exceptions to catch
- except Exception as e:
- logger.error(f"Error reading S3 object: {e}")
- raise
-
- @staticmethod
- def read_string(content: str) -> List[Dict[str, str]]:
- """
- A method to read CSV data from a string and return the data as a list
- of dictionaries.
- """
- # If the content is empty, return an empty list
- if not content.strip():
- return []
-
- # Treat the string as a file-like object and return as list of dictionaries
- f = io.StringIO(content)
- reader = csv.DictReader(f)
- return [dict(row) for row in reader]
diff --git a/obfuscator/csv_writer.py b/obfuscator/csv_writer.py
deleted file mode 100644
index aa5ac3f..0000000
--- a/obfuscator/csv_writer.py
+++ /dev/null
@@ -1,26 +0,0 @@
-import csv
-import io
-from typing import List, Dict
-from obfuscator.logger import get_logger
-
-# Create the logger
-logger = get_logger("CSVWriter")
-
-
-def create_byte_stream(data: List[Dict[str, str]]) -> bytes:
- if not data:
- logger.info("No valid data was provided to write")
- return b""
-
- output = io.StringIO()
-
- headers = list(data[0].keys())
-
- writer = csv.DictWriter(output, fieldnames=headers)
- writer.writeheader()
- writer.writerows(data)
-
- csv_string = output.getvalue()
- logger.debug(f"CSV data: {csv_string}")
-
- return csv_string.encode("utf-8")
diff --git a/obfuscator/logger.py b/obfuscator/logger.py
index ca41e95..140fa8f 100644
--- a/obfuscator/logger.py
+++ b/obfuscator/logger.py
@@ -1,24 +1,36 @@
import logging
import os
+from enum import Enum
-def get_logger(name: str) -> logging.Logger:
- logger = logging.getLogger(name)
+class LogLevel(Enum):
+ DEBUG = logging.DEBUG
+ INFO = logging.INFO
+ WARNING = logging.WARNING
+ ERROR = logging.ERROR
+ CRITICAL = logging.CRITICAL
+
- if not logger.hasHandlers():
- if os.getenv("DEBUG", "FALSE").upper() == "TRUE":
- log_level = logging.DEBUG
- else:
- log_level = logging.INFO
+def get_logger(name: str, level: LogLevel = LogLevel.INFO) -> logging.Logger:
+ if isinstance(level, str):
+ try:
+ level = LogLevel[level.upper()]
+ except KeyError:
+ raise ValueError(
+ f"Invalid log level '{level}'. Choose from: {', '.join(l.name for l in LogLevel)}"
+ )
- logger.setLevel(log_level)
+ logger = logging.getLogger(name)
- handler = logging.StreamHandler()
- formatting = logging.Formatter(
- "%(asctime)s - %(levelname)s - %(name)s - %(message)s"
- )
- handler.setFormatter(formatting)
+ if logger.hasHandlers():
+ logger.handlers.clear()
- logger.addHandler(handler)
+ handler = logging.StreamHandler()
+ logger.setLevel(level.value)
+ formatting = logging.Formatter(
+ "[%(asctime)s] - %(levelname)s::%(name)s - %(message)s"
+ )
+ handler.setFormatter(formatting)
+ logger.addHandler(handler)
return logger
diff --git a/obfuscator/obfuscate.py b/obfuscator/obfuscate.py
index 3f589cb..cd12b6d 100644
--- a/obfuscator/obfuscate.py
+++ b/obfuscator/obfuscate.py
@@ -1,8 +1,7 @@
from typing import List, Dict
from obfuscator.logger import get_logger
-# Create the logger
-logger = get_logger("Obfuscator")
+logger = get_logger("OBFUSCATE")
def obfuscate(
@@ -12,14 +11,15 @@ def obfuscate(
A function to obfuscate PII fields in a list of dictionaries, replacing
sensitive values with a string of asterisks.
"""
- # If no data is provided, log a message and return an empty list
if not data:
- logger.info("No valid data was provided to obfuscate")
+ logger.error(
+ "Invalid or empty data was provided to obfuscate. Returning empty list."
+ )
return []
+ if not pii_fields:
+ logger.error("No PII fields provided to obfuscate. Returning data unchanged.")
+ return data
- # Obfuscate the PII fields in each record using a list/dict comprehension
- # This code is good but makes debugging a bit tricky. I may consider
- # breaking it down into a for loop.
return [
{k: ("***" if k in pii_fields else v) for k, v in record.items()}
for record in data
diff --git a/obfuscator/read.py b/obfuscator/read.py
new file mode 100644
index 0000000..b704643
--- /dev/null
+++ b/obfuscator/read.py
@@ -0,0 +1,89 @@
+import csv
+import io
+import boto3
+import os
+from typing import List, Dict
+from obfuscator.logger import get_logger
+from obfuscator.utils import Utilities
+
+
+class DataReader:
+ """
+ A class to read CSV data from a local file, S3 object, or string. Near
+ the project completion, support for JSON/Parquet files will be added.
+ """
+
+ def __init__(self, log_level=None):
+ self.log_level = log_level
+ self.logger = get_logger("CSVREADER", log_level)
+
+ def read_local(self, path) -> List[Dict[str, str]]:
+ """
+ A method to read a local CSV file and return the data as a list of
+ dictionaries.
+ """
+ self.logger.debug(f"Reading local CSV from: {path}")
+
+ try:
+ with open(path, mode="r", encoding="utf-8") as f:
+ reader = csv.DictReader(f)
+ return [dict(row) for row in reader]
+ except FileNotFoundError:
+ self.logger.error(f"File not found: {path}")
+ raise
+ except Exception as e:
+ self.logger.error(f"Error reading file: {e}")
+
+ def read_s3(self, path) -> List[Dict[str, str]]:
+ """
+ A method to read an S3 object containing CSV data
+ and return the data as a list of dictionaries.
+ """
+ utils = Utilities(self.log_level)
+ bucket, key = utils.get_s3_path(path)
+ self.logger.debug(f"Reading S3 CSV from: {bucket}/{key}")
+
+ if os.getenv("LOCALSTACK", "FALSE").upper() == "TRUE":
+ localstack_endpoint = "http://localhost.localstack.cloud:4566"
+ self.logger.debug(
+ "Using LocalStack endpoint for S3 - ensure LocalStack is running"
+ )
+ client = boto3.client(
+ "s3",
+ endpoint_url=localstack_endpoint,
+ aws_access_key_id="dummy",
+ aws_secret_access_key="dummy",
+ )
+ self.logger.debug(f"endpoint_url: {localstack_endpoint}")
+ else:
+ client = boto3.client("s3")
+
+ try:
+ response = client.get_object(Bucket=bucket, Key=key)
+ self.logger.info("S3 object read successfully")
+ content = response["Body"].read().decode("utf-8")
+ return self.read_string(content)
+ except client.exceptions.NoSuchKey:
+ self.logger.error(f"Object not found: {bucket}/{key}")
+ raise
+ except client.exceptions.ClientError as e:
+ self.logger.error(f"Error reading S3 object: {e}")
+ raise
+ except UnicodeDecodeError as e:
+ self.logger.error(f"Error decoding S3 object: {e}")
+ raise
+ except Exception as e:
+ self.logger.error(f"Error reading S3 object: {e}")
+ raise
+
+ def read_string(self, content: str) -> List[Dict[str, str]]:
+ """
+ A method to read CSV data from a string and return the data as a list
+ of dictionaries.
+ """
+ if not content.strip():
+ return []
+
+ f = io.StringIO(content)
+ reader = csv.DictReader(f)
+ return [dict(row) for row in reader]
diff --git a/obfuscator/utils.py b/obfuscator/utils.py
index 2e4211f..77ca1cf 100644
--- a/obfuscator/utils.py
+++ b/obfuscator/utils.py
@@ -1,15 +1,16 @@
# Utility functions
from obfuscator.logger import get_logger
-# Create the logger
-logger = get_logger("CLI")
+class Utilities:
+ def __init__(self, logger=None):
+ self.logger = get_logger("UTILITIES", logger)
-def get_s3_path(uri):
- parts = uri.replace("s3://", "").split("/")
- logger.debug(f"Parts: {parts}")
- bucket = parts.pop(0)
- logger.debug(f"Bucket: {bucket}")
- key = "/".join(parts)
- logger.debug(f"Key: {key}")
- return bucket, key
+ def get_s3_path(self, uri):
+ parts = uri.replace("s3://", "").split("/")
+ self.logger.debug(f"Parts: {parts}")
+ bucket = parts.pop(0)
+ self.logger.debug(f"Bucket: {bucket}")
+ key = "/".join(parts)
+ self.logger.debug(f"Key: {key}")
+ return bucket, key
diff --git a/obfuscator/write.py b/obfuscator/write.py
new file mode 100644
index 0000000..451b073
--- /dev/null
+++ b/obfuscator/write.py
@@ -0,0 +1,28 @@
+import csv
+import io
+from typing import List, Dict
+from obfuscator.logger import get_logger
+
+logger = get_logger("CSVWRITER")
+
+
+class DataWriter:
+ def __init__(self):
+ pass
+
+ def create_byte_stream(self, data: List[Dict[str, str]]) -> bytes:
+ if not data:
+ logger.error("Invalid or empty data was provided to write")
+ return b""
+
+ output = io.StringIO()
+
+ headers = list(data[0].keys())
+
+ writer = csv.DictWriter(output, fieldnames=headers)
+ writer.writeheader()
+ writer.writerows(data)
+
+ csv_string = output.getvalue()
+
+ return csv_string.encode("utf-8")
diff --git a/test/test_obfuscator.py b/test/test_obfuscator.py
index 4f61b16..c77b6b4 100644
--- a/test/test_obfuscator.py
+++ b/test/test_obfuscator.py
@@ -1,8 +1,5 @@
from obfuscator.obfuscate import obfuscate
-# Check if the function does what its supposed to and can obfuscate
-# valid PII fields in a list of dictionaries
-
def test_obfuscate_data_with_valid_pii_fields():
data = [
@@ -39,11 +36,6 @@ def test_obfuscate_data_with_valid_pii_fields():
assert result == expected
-# Check if the function can obfuscate data even when some PII
-# fields are missing from some of the data, returning a list of dictionaries
-# but with the missing PII fields obfuscated and the rest of the data intact
-
-
def test_obfuscate_data_with_missing_pii_field():
data = [
{"student_id": "1234", "name": "John Smith", "course": "Software"},
@@ -69,9 +61,6 @@ def test_obfuscate_data_with_missing_pii_field():
assert result == expected
-# Check if the function can handle an empty list of data, returning an empty list
-
-
def test_obfuscate_data_with_no_data():
data = []
pii_fields = ["name", "email_address"]
@@ -81,10 +70,6 @@ def test_obfuscate_data_with_no_data():
assert result == expected
-# Check if the function can handle an empty list of PII fields, returning the data as is
-# without mutating it
-
-
def test_obfuscate_data_with_empty_pii_fields():
data = [
{
diff --git a/test/test_csv_reader.py b/test/test_read.py
index e4c135b..de425ce 100644
--- a/test/test_csv_reader.py
+++ b/test/test_read.py
@@ -1,17 +1,9 @@
-# csv_reader.py - tests for read_string and read_s3
-# Author: Alex Schofield
-
import boto3
from moto import mock_aws
-from obfuscator.csv_reader import CSVReader
+from obfuscator.read import DataReader
import pytest
-reader = CSVReader()
-
-# CSVREADER: READ_STRING TESTS
-
-# Check if the function can read a CSV string with no content and return
-# an empty list
+reader = DataReader(log_level="DEBUG")
def test_empty_csv_should_return_no_content():
@@ -21,10 +13,6 @@ def test_empty_csv_should_return_no_content():
assert result == expected
-# Check if the function can read a CSV string with only a header and return
-# an empty list
-
-
def test_csv_with_header_only_should_return_no_content():
content = "student_id,name,course\n"
result = reader.read_string(content)
@@ -32,10 +20,6 @@ def test_csv_with_header_only_should_return_no_content():
assert result == expected
-# Check if the function can read a CSV string with valid data and return
-# a list of dictionaries
-
-
def test_csv_with_valid_data():
content = (
"student_id,name,course\n"
@@ -50,10 +34,6 @@ def test_csv_with_valid_data():
assert result == expected
-# Check if the function can read a CSV string with quoted fields and return
-# a list of dictionaries with the quoted fields intact
-
-
def test_csv_with_quoted_fields_should_run_as_expected():
content = (
"student_id,name,course\n"
@@ -68,9 +48,6 @@ def test_csv_with_quoted_fields_should_run_as_expected():
assert result == expected
-# CSVREADER: READ_S3 TESTS
-
-
def setup_s3(s3_client, bucket: str, key: str, content: str):
s3_client.create_bucket(
Bucket=bucket,
@@ -119,7 +96,7 @@ def test_read_s3_empty_csv_returns_empty_list():
setup_s3(s3, bucket, key, csv_content)
path = f"s3://{bucket}/{key}"
- data = CSVReader.read_s3(path)
+ data = reader.read_s3(path)
assert data == []
@@ -129,7 +106,7 @@ def test_read_s3_nonexistent_bucket_raises_exception():
key = "data/mock.csv"
path = f"s3://{bucket}/{key}"
with pytest.raises(Exception):
- CSVReader.read_s3(path)
+ reader.read_s3(path)
def test_read_s3_nonexistent_key_raises_exception():
@@ -143,7 +120,7 @@ def test_read_s3_nonexistent_key_raises_exception():
key = "data/nonexistent.csv"
path = f"s3://{bucket}/{key}"
with pytest.raises(Exception):
- CSVReader.read_s3(path)
+ reader.read_s3(path)
def test_read_s3_malformed_csv_returns_expected():
@@ -155,7 +132,7 @@ def test_read_s3_malformed_csv_returns_expected():
setup_s3(s3, bucket, key, csv_content)
path = f"s3://{bucket}/{key}"
- data = CSVReader.read_s3(path)
+ data = reader.read_s3(path)
expected = [{"1234": "5678", "Student 1": "Student 2", "Course 1": "Course 2"}]
assert data == expected
@@ -175,7 +152,7 @@ def test_read_s3_csv_with_extra_empty_lines():
setup_s3(s3, bucket, key, csv_content)
path = f"s3://{bucket}/{key}"
- data = CSVReader.read_s3(path)
+ data = reader.read_s3(path)
expected = [
{"student_id": "1234", "name": "Student 1", "course": "Course 1"},
{"student_id": "5678", "name": "Student 2", "course": "Course 2"},
@@ -196,7 +173,7 @@ def test_read_s3_csv_with_whitespace_in_fields():
setup_s3(s3, bucket, key, csv_content)
path = f"s3://{bucket}/{key}"
- data = CSVReader.read_s3(path)
+ data = reader.read_s3(path)
expected = [
{"student_id": " 1234 ", " name ": " Student 1 ", " course ": " Course 1 "},
{"student_id": "5678", " name ": "Student 2", " course ": "Course 2"},
diff --git a/test/test_csv_writer.py b/test/test_write.py
index eceac28..4929b06 100644
--- a/test/test_csv_writer.py
+++ b/test/test_write.py
@@ -1,6 +1,8 @@
import io
import csv
-from obfuscator.csv_writer import create_byte_stream
+from obfuscator.write import DataWriter
+
+writer = DataWriter()
def csv_bytes_to_list(csv_bytes: bytes):
@@ -15,13 +17,14 @@ def test_create_byte_stream_valid_data():
{"student_id": "1234", "name": "Student 1", "course": "Course 1"},
{"student_id": "5678", "name": "Student 2", "course": "Course 2"},
]
- csv_bytes = create_byte_stream(data)
+ csv_bytes = writer.create_byte_stream(data)
result = csv_bytes_to_list(csv_bytes)
assert result == data
def test_create_byte_stream_empty_data():
- csv_bytes = create_byte_stream([])
+ data = []
+ csv_bytes = writer.create_byte_stream(data)
assert csv_bytes == b""
@@ -30,7 +33,7 @@ def test_create_byte_stream_handles_quoted_fields():
{"student_id": "1234", "name": 'Student "One"', "course": "Course, A"},
{"student_id": "5678", "name": 'Student "Two"', "course": "Course, B"},
]
- csv_bytes = create_byte_stream(data)
+ csv_bytes = writer.create_byte_stream(data)
result = csv_bytes_to_list(csv_bytes)
assert result == data
@@ -40,7 +43,7 @@ def test_create_byte_stream_consistent_header_order():
{"student_id": "1234", "name": "Alice", "course": "Math"},
{"student_id": "5678", "name": "Bob", "course": "Science"},
]
- csv_bytes = create_byte_stream(data)
+ csv_bytes = writer.create_byte_stream(data)
csv_string = csv_bytes.decode("utf-8")
header_line = csv_string.splitlines()[0]
expected_header = ",".join(data[0].keys())
@@ -52,6 +55,6 @@ def test_create_byte_stream_special_characters():
{"student_id": "1234", "name": "Student 1", "course": "Line1\nLine2"},
{"student_id": "5678", "name": "Student 2", "course": "Value with, comma"},
]
- csv_bytes = create_byte_stream(data)
+ csv_bytes = writer.create_byte_stream(data)
result = csv_bytes_to_list(csv_bytes)
assert result == data
git.ajschof.me — hosted by ajschofield — powered by cgit