diff options
| -rw-r--r-- | TODO | 0 | ||||
| -rw-r--r-- | cli.py | 61 | ||||
| -rw-r--r-- | obfuscator/csv_reader.py | 97 | ||||
| -rw-r--r-- | obfuscator/csv_writer.py | 26 | ||||
| -rw-r--r-- | obfuscator/logger.py | 40 | ||||
| -rw-r--r-- | obfuscator/obfuscate.py | 14 | ||||
| -rw-r--r-- | obfuscator/read.py | 89 | ||||
| -rw-r--r-- | obfuscator/utils.py | 21 | ||||
| -rw-r--r-- | obfuscator/write.py | 28 | ||||
| -rw-r--r-- | test/test_obfuscator.py | 15 | ||||
| -rw-r--r-- | test/test_read.py (renamed from test/test_csv_reader.py) | 39 | ||||
| -rw-r--r-- | test/test_write.py (renamed from test/test_csv_writer.py) | 15 |
12 files changed, 208 insertions, 237 deletions
@@ -1,53 +1,52 @@ import argparse -import json -from obfuscator.csv_reader import CSVReader +from obfuscator.read import DataReader +from obfuscator.write import DataWriter from obfuscator.obfuscate import obfuscate from obfuscator.logger import get_logger -from obfuscator.csv_writer import create_byte_stream - -# Create the logger -logger = get_logger("CLI") def main(): - # Create an argument parser - parser = argparse.ArgumentParser(description="gdpr-obfuscator") - # Require user to either choose a local file or an S3 object - # The user can only choose one of these options or the program will exit - # If not provided, the program will exit + parser = argparse.ArgumentParser( + prog="GDPR-Obfuscator", + description="Obfuscate sensitive data stored locally or in an AWS environment", + ) + + parser.add_argument( + "-v", "--verbose", action="store_true", help="Enable verbose logging" + ) + loc = parser.add_mutually_exclusive_group(required=True) - loc.add_argument("--local") - loc.add_argument("--s3") + loc.add_argument("-l", "--local", help="Local path to file") + loc.add_argument("-s", "--s3", help="URI path to file stored in S3") - # Require user to provide a list of PII fields to obfuscate - # e.g. --pii name email_address - # If not provided, the program will exit - parser.add_argument("--pii", nargs="+", required=True) + parser.add_argument( + "-p", + "--pii", + nargs="+", + required=True, + help="List of PII fields to obfuscate, separated by spaces", + ) - # Parse the arguments args = parser.parse_args() - # Create the CSVReader object - reader = CSVReader() + log_level = "DEBUG" if args.verbose else "INFO" + logger = get_logger("CLI", log_level) + + reader = DataReader(log_level) - # Read the CSV data based on the user's choice of local or S3 if args.local and not args.s3: - logger.debug("User chose to read CSV from local path") + logger.debug("Read data from local path") data = reader.read_local(args.local) - # For debug purposes, log the data read from the CSV - logger.debug("Contents: " + str(data)) else: - logger.debug("User chose to read CSV from S3") + logger.debug("Read data from S3") data = reader.read_s3(args.s3) - logger.debug("Contents: " + str(data)) - # Obfuscate the data based on the user's choice of PII fields obfuscated_data = obfuscate(data, args.pii) - # For debug purposes, log the obfuscated data as JSON for readability - logger.debug("Obfuscated data (JSON): " + json.dumps(obfuscated_data, indent=4)) - return create_byte_stream(obfuscated_data) + + writer = DataWriter() + + return writer.create_byte_stream(obfuscated_data) -# If the script is run directly (as it should be), call the main function if __name__ == "__main__": main() diff --git a/obfuscator/csv_reader.py b/obfuscator/csv_reader.py deleted file mode 100644 index 8f4ebea..0000000 --- a/obfuscator/csv_reader.py +++ /dev/null @@ -1,97 +0,0 @@ -import csv -import io -import boto3 -import os -from typing import List, Dict -from obfuscator.logger import get_logger -from obfuscator.utils import get_s3_path - -# Create the logger -logger = get_logger("CSVReader") - -# Putting the CSV reading components into a class may seem like overkill -# for a simple script, but it allows for better organization and scalability. -# @staticmethod is used to define the method without an instance of the class -# being required. The methods could be defined just as functions, and this -# may still be changed. - - -class CSVReader: - """ - A class to read CSV data from a local file, S3 object, or string. Near - the project completion, support for JSON/Parquet files will be added. - """ - - @staticmethod - def read_local(path) -> List[Dict[str, str]]: - """ - A method to read a local CSV file and return the data as a list of - dictionaries. - """ - # Log the path of the file being read for debugging - logger.debug(f"Reading local CSV from: {path}") - - # Attempt to read the file and return the data as a list of dictionaries - # However, if the file isn't found or there is a generic exception, log - # the error and raise an exception - try: - with open(path, mode="r", encoding="utf-8") as f: - reader = csv.DictReader(f) - return [dict(row) for row in reader] - except FileNotFoundError: - logger.error(f"File not found: {path}") - raise - except Exception as e: - logger.error(f"Error reading file: {e}") - - @staticmethod - def read_s3(path) -> List[Dict[str, str]]: - """ - A method to read an S3 object containing CSV data - and return the data as a list of dictionaries. - """ - bucket, key = get_s3_path(path) - logger.debug(f"Reading S3 CSV from: {bucket}/{key}") - - # If DEBUG=TRUE, use the localstack endpoint for testing - if os.getenv("DEBUG", "FALSE").upper() == "TRUE": - localstack_endpoint = "http://localhost.localstack.cloud:4566" - logger.debug("Using LocalStack endpoint for S3") - client = boto3.client( - "s3", - endpoint_url=localstack_endpoint, - aws_access_key_id="dummy", - aws_secret_access_key="dummy", - ) - logger.debug(f"endpoint_url: {localstack_endpoint}") - else: - client = boto3.client("s3") - - try: - # Attempt to read the S3 object and return the data as a list of dictionaries - response = client.get_object(Bucket=bucket, Key=key) - logger.info("S3 object read successfully") - # Read and decode the content - content = response["Body"].read().decode("utf-8") - # Even though the read_string method was only created for testing, - # it can be reused here to read and return the CSV data - return CSVReader.read_string(content) - # TODO: Add more specific exceptions to catch - except Exception as e: - logger.error(f"Error reading S3 object: {e}") - raise - - @staticmethod - def read_string(content: str) -> List[Dict[str, str]]: - """ - A method to read CSV data from a string and return the data as a list - of dictionaries. - """ - # If the content is empty, return an empty list - if not content.strip(): - return [] - - # Treat the string as a file-like object and return as list of dictionaries - f = io.StringIO(content) - reader = csv.DictReader(f) - return [dict(row) for row in reader] diff --git a/obfuscator/csv_writer.py b/obfuscator/csv_writer.py deleted file mode 100644 index aa5ac3f..0000000 --- a/obfuscator/csv_writer.py +++ /dev/null @@ -1,26 +0,0 @@ -import csv -import io -from typing import List, Dict -from obfuscator.logger import get_logger - -# Create the logger -logger = get_logger("CSVWriter") - - -def create_byte_stream(data: List[Dict[str, str]]) -> bytes: - if not data: - logger.info("No valid data was provided to write") - return b"" - - output = io.StringIO() - - headers = list(data[0].keys()) - - writer = csv.DictWriter(output, fieldnames=headers) - writer.writeheader() - writer.writerows(data) - - csv_string = output.getvalue() - logger.debug(f"CSV data: {csv_string}") - - return csv_string.encode("utf-8") diff --git a/obfuscator/logger.py b/obfuscator/logger.py index ca41e95..140fa8f 100644 --- a/obfuscator/logger.py +++ b/obfuscator/logger.py @@ -1,24 +1,36 @@ import logging import os +from enum import Enum -def get_logger(name: str) -> logging.Logger: - logger = logging.getLogger(name) +class LogLevel(Enum): + DEBUG = logging.DEBUG + INFO = logging.INFO + WARNING = logging.WARNING + ERROR = logging.ERROR + CRITICAL = logging.CRITICAL + - if not logger.hasHandlers(): - if os.getenv("DEBUG", "FALSE").upper() == "TRUE": - log_level = logging.DEBUG - else: - log_level = logging.INFO +def get_logger(name: str, level: LogLevel = LogLevel.INFO) -> logging.Logger: + if isinstance(level, str): + try: + level = LogLevel[level.upper()] + except KeyError: + raise ValueError( + f"Invalid log level '{level}'. Choose from: {', '.join(l.name for l in LogLevel)}" + ) - logger.setLevel(log_level) + logger = logging.getLogger(name) - handler = logging.StreamHandler() - formatting = logging.Formatter( - "%(asctime)s - %(levelname)s - %(name)s - %(message)s" - ) - handler.setFormatter(formatting) + if logger.hasHandlers(): + logger.handlers.clear() - logger.addHandler(handler) + handler = logging.StreamHandler() + logger.setLevel(level.value) + formatting = logging.Formatter( + "[%(asctime)s] - %(levelname)s::%(name)s - %(message)s" + ) + handler.setFormatter(formatting) + logger.addHandler(handler) return logger diff --git a/obfuscator/obfuscate.py b/obfuscator/obfuscate.py index 3f589cb..cd12b6d 100644 --- a/obfuscator/obfuscate.py +++ b/obfuscator/obfuscate.py @@ -1,8 +1,7 @@ from typing import List, Dict from obfuscator.logger import get_logger -# Create the logger -logger = get_logger("Obfuscator") +logger = get_logger("OBFUSCATE") def obfuscate( @@ -12,14 +11,15 @@ def obfuscate( A function to obfuscate PII fields in a list of dictionaries, replacing sensitive values with a string of asterisks. """ - # If no data is provided, log a message and return an empty list if not data: - logger.info("No valid data was provided to obfuscate") + logger.error( + "Invalid or empty data was provided to obfuscate. Returning empty list." + ) return [] + if not pii_fields: + logger.error("No PII fields provided to obfuscate. Returning data unchanged.") + return data - # Obfuscate the PII fields in each record using a list/dict comprehension - # This code is good but makes debugging a bit tricky. I may consider - # breaking it down into a for loop. return [ {k: ("***" if k in pii_fields else v) for k, v in record.items()} for record in data diff --git a/obfuscator/read.py b/obfuscator/read.py new file mode 100644 index 0000000..b704643 --- /dev/null +++ b/obfuscator/read.py @@ -0,0 +1,89 @@ +import csv +import io +import boto3 +import os +from typing import List, Dict +from obfuscator.logger import get_logger +from obfuscator.utils import Utilities + + +class DataReader: + """ + A class to read CSV data from a local file, S3 object, or string. Near + the project completion, support for JSON/Parquet files will be added. + """ + + def __init__(self, log_level=None): + self.log_level = log_level + self.logger = get_logger("CSVREADER", log_level) + + def read_local(self, path) -> List[Dict[str, str]]: + """ + A method to read a local CSV file and return the data as a list of + dictionaries. + """ + self.logger.debug(f"Reading local CSV from: {path}") + + try: + with open(path, mode="r", encoding="utf-8") as f: + reader = csv.DictReader(f) + return [dict(row) for row in reader] + except FileNotFoundError: + self.logger.error(f"File not found: {path}") + raise + except Exception as e: + self.logger.error(f"Error reading file: {e}") + + def read_s3(self, path) -> List[Dict[str, str]]: + """ + A method to read an S3 object containing CSV data + and return the data as a list of dictionaries. + """ + utils = Utilities(self.log_level) + bucket, key = utils.get_s3_path(path) + self.logger.debug(f"Reading S3 CSV from: {bucket}/{key}") + + if os.getenv("LOCALSTACK", "FALSE").upper() == "TRUE": + localstack_endpoint = "http://localhost.localstack.cloud:4566" + self.logger.debug( + "Using LocalStack endpoint for S3 - ensure LocalStack is running" + ) + client = boto3.client( + "s3", + endpoint_url=localstack_endpoint, + aws_access_key_id="dummy", + aws_secret_access_key="dummy", + ) + self.logger.debug(f"endpoint_url: {localstack_endpoint}") + else: + client = boto3.client("s3") + + try: + response = client.get_object(Bucket=bucket, Key=key) + self.logger.info("S3 object read successfully") + content = response["Body"].read().decode("utf-8") + return self.read_string(content) + except client.exceptions.NoSuchKey: + self.logger.error(f"Object not found: {bucket}/{key}") + raise + except client.exceptions.ClientError as e: + self.logger.error(f"Error reading S3 object: {e}") + raise + except UnicodeDecodeError as e: + self.logger.error(f"Error decoding S3 object: {e}") + raise + except Exception as e: + self.logger.error(f"Error reading S3 object: {e}") + raise + + def read_string(self, content: str) -> List[Dict[str, str]]: + """ + A method to read CSV data from a string and return the data as a list + of dictionaries. + """ + if not content.strip(): + return [] + + f = io.StringIO(content) + reader = csv.DictReader(f) + return [dict(row) for row in reader] diff --git a/obfuscator/utils.py b/obfuscator/utils.py index 2e4211f..77ca1cf 100644 --- a/obfuscator/utils.py +++ b/obfuscator/utils.py @@ -1,15 +1,16 @@ # Utility functions from obfuscator.logger import get_logger -# Create the logger -logger = get_logger("CLI") +class Utilities: + def __init__(self, logger=None): + self.logger = get_logger("UTILITIES", logger) -def get_s3_path(uri): - parts = uri.replace("s3://", "").split("/") - logger.debug(f"Parts: {parts}") - bucket = parts.pop(0) - logger.debug(f"Bucket: {bucket}") - key = "/".join(parts) - logger.debug(f"Key: {key}") - return bucket, key + def get_s3_path(self, uri): + parts = uri.replace("s3://", "").split("/") + self.logger.debug(f"Parts: {parts}") + bucket = parts.pop(0) + self.logger.debug(f"Bucket: {bucket}") + key = "/".join(parts) + self.logger.debug(f"Key: {key}") + return bucket, key diff --git a/obfuscator/write.py b/obfuscator/write.py new file mode 100644 index 0000000..451b073 --- /dev/null +++ b/obfuscator/write.py @@ -0,0 +1,28 @@ +import csv +import io +from typing import List, Dict +from obfuscator.logger import get_logger + +logger = get_logger("CSVWRITER") + + +class DataWriter: + def __init__(self): + pass + + def create_byte_stream(self, data: List[Dict[str, str]]) -> bytes: + if not data: + logger.error("Invalid or empty data was provided to write") + return b"" + + output = io.StringIO() + + headers = list(data[0].keys()) + + writer = csv.DictWriter(output, fieldnames=headers) + writer.writeheader() + writer.writerows(data) + + csv_string = output.getvalue() + + return csv_string.encode("utf-8") diff --git a/test/test_obfuscator.py b/test/test_obfuscator.py index 4f61b16..c77b6b4 100644 --- a/test/test_obfuscator.py +++ b/test/test_obfuscator.py @@ -1,8 +1,5 @@ from obfuscator.obfuscate import obfuscate -# Check if the function does what its supposed to and can obfuscate -# valid PII fields in a list of dictionaries - def test_obfuscate_data_with_valid_pii_fields(): data = [ @@ -39,11 +36,6 @@ def test_obfuscate_data_with_valid_pii_fields(): assert result == expected -# Check if the function can obfuscate data even when some PII -# fields are missing from some of the data, returning a list of dictionaries -# but with the missing PII fields obfuscated and the rest of the data intact - - def test_obfuscate_data_with_missing_pii_field(): data = [ {"student_id": "1234", "name": "John Smith", "course": "Software"}, @@ -69,9 +61,6 @@ def test_obfuscate_data_with_missing_pii_field(): assert result == expected -# Check if the function can handle an empty list of data, returning an empty list - - def test_obfuscate_data_with_no_data(): data = [] pii_fields = ["name", "email_address"] @@ -81,10 +70,6 @@ def test_obfuscate_data_with_no_data(): assert result == expected -# Check if the function can handle an empty list of PII fields, returning the data as is -# without mutating it - - def test_obfuscate_data_with_empty_pii_fields(): data = [ { diff --git a/test/test_csv_reader.py b/test/test_read.py index e4c135b..de425ce 100644 --- a/test/test_csv_reader.py +++ b/test/test_read.py @@ -1,17 +1,9 @@ -# csv_reader.py - tests for read_string and read_s3 -# Author: Alex Schofield - import boto3 from moto import mock_aws -from obfuscator.csv_reader import CSVReader +from obfuscator.read import DataReader import pytest -reader = CSVReader() - -# CSVREADER: READ_STRING TESTS - -# Check if the function can read a CSV string with no content and return -# an empty list +reader = DataReader(log_level="DEBUG") def test_empty_csv_should_return_no_content(): @@ -21,10 +13,6 @@ def test_empty_csv_should_return_no_content(): assert result == expected -# Check if the function can read a CSV string with only a header and return -# an empty list - - def test_csv_with_header_only_should_return_no_content(): content = "student_id,name,course\n" result = reader.read_string(content) @@ -32,10 +20,6 @@ def test_csv_with_header_only_should_return_no_content(): assert result == expected -# Check if the function can read a CSV string with valid data and return -# a list of dictionaries - - def test_csv_with_valid_data(): content = ( "student_id,name,course\n" @@ -50,10 +34,6 @@ def test_csv_with_valid_data(): assert result == expected -# Check if the function can read a CSV string with quoted fields and return -# a list of dictionaries with the quoted fields intact - - def test_csv_with_quoted_fields_should_run_as_expected(): content = ( "student_id,name,course\n" @@ -68,9 +48,6 @@ def test_csv_with_quoted_fields_should_run_as_expected(): assert result == expected -# CSVREADER: READ_S3 TESTS - - def setup_s3(s3_client, bucket: str, key: str, content: str): s3_client.create_bucket( Bucket=bucket, @@ -119,7 +96,7 @@ def test_read_s3_empty_csv_returns_empty_list(): setup_s3(s3, bucket, key, csv_content) path = f"s3://{bucket}/{key}" - data = CSVReader.read_s3(path) + data = reader.read_s3(path) assert data == [] @@ -129,7 +106,7 @@ def test_read_s3_nonexistent_bucket_raises_exception(): key = "data/mock.csv" path = f"s3://{bucket}/{key}" with pytest.raises(Exception): - CSVReader.read_s3(path) + reader.read_s3(path) def test_read_s3_nonexistent_key_raises_exception(): @@ -143,7 +120,7 @@ def test_read_s3_nonexistent_key_raises_exception(): key = "data/nonexistent.csv" path = f"s3://{bucket}/{key}" with pytest.raises(Exception): - CSVReader.read_s3(path) + reader.read_s3(path) def test_read_s3_malformed_csv_returns_expected(): @@ -155,7 +132,7 @@ def test_read_s3_malformed_csv_returns_expected(): setup_s3(s3, bucket, key, csv_content) path = f"s3://{bucket}/{key}" - data = CSVReader.read_s3(path) + data = reader.read_s3(path) expected = [{"1234": "5678", "Student 1": "Student 2", "Course 1": "Course 2"}] assert data == expected @@ -175,7 +152,7 @@ def test_read_s3_csv_with_extra_empty_lines(): setup_s3(s3, bucket, key, csv_content) path = f"s3://{bucket}/{key}" - data = CSVReader.read_s3(path) + data = reader.read_s3(path) expected = [ {"student_id": "1234", "name": "Student 1", "course": "Course 1"}, {"student_id": "5678", "name": "Student 2", "course": "Course 2"}, @@ -196,7 +173,7 @@ def test_read_s3_csv_with_whitespace_in_fields(): setup_s3(s3, bucket, key, csv_content) path = f"s3://{bucket}/{key}" - data = CSVReader.read_s3(path) + data = reader.read_s3(path) expected = [ {"student_id": " 1234 ", " name ": " Student 1 ", " course ": " Course 1 "}, {"student_id": "5678", " name ": "Student 2", " course ": "Course 2"}, diff --git a/test/test_csv_writer.py b/test/test_write.py index eceac28..4929b06 100644 --- a/test/test_csv_writer.py +++ b/test/test_write.py @@ -1,6 +1,8 @@ import io import csv -from obfuscator.csv_writer import create_byte_stream +from obfuscator.write import DataWriter + +writer = DataWriter() def csv_bytes_to_list(csv_bytes: bytes): @@ -15,13 +17,14 @@ def test_create_byte_stream_valid_data(): {"student_id": "1234", "name": "Student 1", "course": "Course 1"}, {"student_id": "5678", "name": "Student 2", "course": "Course 2"}, ] - csv_bytes = create_byte_stream(data) + csv_bytes = writer.create_byte_stream(data) result = csv_bytes_to_list(csv_bytes) assert result == data def test_create_byte_stream_empty_data(): - csv_bytes = create_byte_stream([]) + data = [] + csv_bytes = writer.create_byte_stream(data) assert csv_bytes == b"" @@ -30,7 +33,7 @@ def test_create_byte_stream_handles_quoted_fields(): {"student_id": "1234", "name": 'Student "One"', "course": "Course, A"}, {"student_id": "5678", "name": 'Student "Two"', "course": "Course, B"}, ] - csv_bytes = create_byte_stream(data) + csv_bytes = writer.create_byte_stream(data) result = csv_bytes_to_list(csv_bytes) assert result == data @@ -40,7 +43,7 @@ def test_create_byte_stream_consistent_header_order(): {"student_id": "1234", "name": "Alice", "course": "Math"}, {"student_id": "5678", "name": "Bob", "course": "Science"}, ] - csv_bytes = create_byte_stream(data) + csv_bytes = writer.create_byte_stream(data) csv_string = csv_bytes.decode("utf-8") header_line = csv_string.splitlines()[0] expected_header = ",".join(data[0].keys()) @@ -52,6 +55,6 @@ def test_create_byte_stream_special_characters(): {"student_id": "1234", "name": "Student 1", "course": "Line1\nLine2"}, {"student_id": "5678", "name": "Student 2", "course": "Value with, comma"}, ] - csv_bytes = create_byte_stream(data) + csv_bytes = writer.create_byte_stream(data) result = csv_bytes_to_list(csv_bytes) assert result == data |
