From 0a4060dc5b8740325c38c1a3fbaf3d0012cae6b4 Mon Sep 17 00:00:00 2001 From: Alex Schofield Date: Wed, 19 Feb 2025 02:20:11 +0000 Subject: update CLI parser description & prog --- cli.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'cli.py') diff --git a/cli.py b/cli.py index f2c8771..db9821b 100644 --- a/cli.py +++ b/cli.py @@ -11,7 +11,10 @@ logger = get_logger("CLI") def main(): # Create an argument parser - parser = argparse.ArgumentParser(description="gdpr-obfuscator") + parser = argparse.ArgumentParser( + prog="GDPR-Obfuscator", + description="Obfuscate sensitive data stored locally or in an AWS environment", + ) # Require user to either choose a local file or an S3 object # The user can only choose one of these options or the program will exit # If not provided, the program will exit -- cgit v1.2.3 From 7a24e4e5526163f1c90b3bd5be173c545bd283cb Mon Sep 17 00:00:00 2001 From: Alex Schofield Date: Wed, 19 Feb 2025 02:21:15 +0000 Subject: add shorthand options for choices in CLI --- cli.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'cli.py') diff --git a/cli.py b/cli.py index db9821b..12fd40a 100644 --- a/cli.py +++ b/cli.py @@ -19,13 +19,13 @@ def main(): # The user can only choose one of these options or the program will exit # If not provided, the program will exit loc = parser.add_mutually_exclusive_group(required=True) - loc.add_argument("--local") - loc.add_argument("--s3") + loc.add_argument("-l", "--local") + loc.add_argument("-s", "--s3") # Require user to provide a list of PII fields to obfuscate # e.g. --pii name email_address # If not provided, the program will exit - parser.add_argument("--pii", nargs="+", required=True) + parser.add_argument("-p", "--pii", nargs="+", required=True) # Parse the arguments args = parser.parse_args() -- cgit v1.2.3 From f2ee3e2815d084d92826606dab28fd261ccf1b6f Mon Sep 17 00:00:00 2001 From: Alex Schofield Date: Wed, 19 Feb 2025 02:28:06 +0000 Subject: add help information to CLI arguments --- cli.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) (limited to 'cli.py') diff --git a/cli.py b/cli.py index 12fd40a..e92148f 100644 --- a/cli.py +++ b/cli.py @@ -15,17 +15,30 @@ def main(): prog="GDPR-Obfuscator", description="Obfuscate sensitive data stored locally or in an AWS environment", ) + + parser.add_argument( + "-v", "--verbose", action="store_true", help="Enable verbose logging" + ) + # Require user to either choose a local file or an S3 object # The user can only choose one of these options or the program will exit # If not provided, the program will exit loc = parser.add_mutually_exclusive_group(required=True) - loc.add_argument("-l", "--local") - loc.add_argument("-s", "--s3") + loc.add_argument("-l", "--local", help="Path to local CSV file") + loc.add_argument( + "-s", "--s3", help="S3 object path (example: s3://bucket-name/file)" + ) # Require user to provide a list of PII fields to obfuscate # e.g. --pii name email_address # If not provided, the program will exit - parser.add_argument("-p", "--pii", nargs="+", required=True) + parser.add_argument( + "-p", + "--pii", + nargs="+", + required=True, + help="List of PII fields to obfuscate, separated by spaces", + ) # Parse the arguments args = parser.parse_args() -- cgit v1.2.3 From a4d1862161cf727447900f937c73ef77302a5d51 Mon Sep 17 00:00:00 2001 From: Alex Schofield Date: Wed, 19 Feb 2025 02:32:09 +0000 Subject: change help message for CLI --s3/--local choice --- cli.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'cli.py') diff --git a/cli.py b/cli.py index e92148f..40d8087 100644 --- a/cli.py +++ b/cli.py @@ -24,10 +24,8 @@ def main(): # The user can only choose one of these options or the program will exit # If not provided, the program will exit loc = parser.add_mutually_exclusive_group(required=True) - loc.add_argument("-l", "--local", help="Path to local CSV file") - loc.add_argument( - "-s", "--s3", help="S3 object path (example: s3://bucket-name/file)" - ) + loc.add_argument("-l", "--local", help="Local path to file") + loc.add_argument("-s", "--s3", help="URI path to file stored in S3") # Require user to provide a list of PII fields to obfuscate # e.g. --pii name email_address -- cgit v1.2.3 From a41d27a5db76f4e1989067db6b071a88efd5fa8b Mon Sep 17 00:00:00 2001 From: Alex Schofield Date: Wed, 19 Feb 2025 02:37:42 +0000 Subject: invoke debug logging using CLI argument --- cli.py | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'cli.py') diff --git a/cli.py b/cli.py index 40d8087..bf9b53f 100644 --- a/cli.py +++ b/cli.py @@ -41,6 +41,10 @@ def main(): # Parse the arguments args = parser.parse_args() + # If the user chose verbose logging, set the logger to debug + if args.verbose: + logger.setLevel("DEBUG") + # Create the CSVReader object reader = CSVReader() -- cgit v1.2.3 From ef05a027ffbf8bbee89bb031ccd6152de49762c6 Mon Sep 17 00:00:00 2001 From: Alex Schofield Date: Wed, 19 Feb 2025 03:19:33 +0000 Subject: workaround/fix annoying logging issues --- cli.py | 12 ++++++------ obfuscator/csv_reader.py | 38 +++++++++++++++++++------------------- obfuscator/logger.py | 41 +++++++++++++++++++++++++++-------------- obfuscator/utils.py | 27 +++++++++++++++------------ 4 files changed, 67 insertions(+), 51 deletions(-) (limited to 'cli.py') diff --git a/cli.py b/cli.py index bf9b53f..d19c18d 100644 --- a/cli.py +++ b/cli.py @@ -5,9 +5,6 @@ from obfuscator.obfuscate import obfuscate from obfuscator.logger import get_logger from obfuscator.csv_writer import create_byte_stream -# Create the logger -logger = get_logger("CLI") - def main(): # Create an argument parser @@ -42,11 +39,13 @@ def main(): args = parser.parse_args() # If the user chose verbose logging, set the logger to debug - if args.verbose: - logger.setLevel("DEBUG") + log_level = "DEBUG" if args.verbose else "INFO" + + # Create the logger + logger = get_logger("CLI", log_level) # Create the CSVReader object - reader = CSVReader() + reader = CSVReader(log_level) # Read the CSV data based on the user's choice of local or S3 if args.local and not args.s3: @@ -56,6 +55,7 @@ def main(): logger.debug("Contents: " + str(data)) else: logger.debug("User chose to read CSV from S3") + data = reader.read_s3(args.s3) logger.debug("Contents: " + str(data)) diff --git a/obfuscator/csv_reader.py b/obfuscator/csv_reader.py index 8fdf26f..f8dd7d3 100644 --- a/obfuscator/csv_reader.py +++ b/obfuscator/csv_reader.py @@ -4,10 +4,7 @@ import boto3 import os from typing import List, Dict from obfuscator.logger import get_logger -from obfuscator.utils import get_s3_path - -# Create the logger -logger = get_logger("CSVREADER") +from obfuscator.utils import Utilities # Putting the CSV reading components into a class may seem like overkill # for a simple script, but it allows for better organization and scalability. @@ -22,14 +19,18 @@ class CSVReader: the project completion, support for JSON/Parquet files will be added. """ - @staticmethod - def read_local(path) -> List[Dict[str, str]]: + def __init__(self, log_level=None): + self.log_level = log_level + # Create the logger + self.logger = get_logger("CSVREADER", log_level) + + def read_local(self, path) -> List[Dict[str, str]]: """ A method to read a local CSV file and return the data as a list of dictionaries. """ # Log the path of the file being read for debugging - logger.debug(f"Reading local CSV from: {path}") + self.logger.debug(f"Reading local CSV from: {path}") # Attempt to read the file and return the data as a list of dictionaries # However, if the file isn't found or there is a generic exception, log @@ -39,38 +40,38 @@ class CSVReader: reader = csv.DictReader(f) return [dict(row) for row in reader] except FileNotFoundError: - logger.error(f"File not found: {path}") + self.logger.error(f"File not found: {path}") raise except Exception as e: - logger.error(f"Error reading file: {e}") + self.logger.error(f"Error reading file: {e}") - @staticmethod - def read_s3(path) -> List[Dict[str, str]]: + def read_s3(self, path) -> List[Dict[str, str]]: """ A method to read an S3 object containing CSV data and return the data as a list of dictionaries. """ - bucket, key = get_s3_path(path) - logger.debug(f"Reading S3 CSV from: {bucket}/{key}") + utils = Utilities(self.log_level) + bucket, key = utils.get_s3_path(path) + self.logger.debug(f"Reading S3 CSV from: {bucket}/{key}") # If LOCALSTACK=TRUE, use the localstack endpoint for testing if os.getenv("LOCALSTACK", "FALSE").upper() == "TRUE": localstack_endpoint = "http://localhost.localstack.cloud:4566" - logger.debug("Using LocalStack endpoint for S3") + self.logger.debug("Using LocalStack endpoint for S3") client = boto3.client( "s3", endpoint_url=localstack_endpoint, aws_access_key_id="dummy", aws_secret_access_key="dummy", ) - logger.debug(f"endpoint_url: {localstack_endpoint}") + self.logger.debug(f"endpoint_url: {localstack_endpoint}") else: client = boto3.client("s3") try: # Attempt to read the S3 object and return the data as a list of dictionaries response = client.get_object(Bucket=bucket, Key=key) - logger.info("S3 object read successfully") + self.logger.info("S3 object read successfully") # Read and decode the content content = response["Body"].read().decode("utf-8") # Even though the read_string method was only created for testing, @@ -78,11 +79,10 @@ class CSVReader: return CSVReader.read_string(content) # TODO: Add more specific exceptions to catch except Exception as e: - logger.error(f"Error reading S3 object: {e}") + self.logger.error(f"Error reading S3 object: {e}") raise - @staticmethod - def read_string(content: str) -> List[Dict[str, str]]: + def read_string(self, content: str) -> List[Dict[str, str]]: """ A method to read CSV data from a string and return the data as a list of dictionaries. diff --git a/obfuscator/logger.py b/obfuscator/logger.py index 649dad7..2c5b988 100644 --- a/obfuscator/logger.py +++ b/obfuscator/logger.py @@ -1,24 +1,37 @@ import logging import os +from enum import Enum -def get_logger(name: str) -> logging.Logger: - logger = logging.getLogger(name) +class LogLevel(Enum): + DEBUG = logging.DEBUG + INFO = logging.INFO + WARNING = logging.WARNING + ERROR = logging.ERROR + CRITICAL = logging.CRITICAL + - if not logger.hasHandlers(): - if os.getenv("DEBUG", "FALSE").upper() == "TRUE": - log_level = logging.DEBUG - else: - log_level = logging.INFO +def get_logger(name: str, level: LogLevel = LogLevel.INFO) -> logging.Logger: - logger.setLevel(log_level) + if isinstance(level, str): + try: + level = LogLevel[level.upper()] + except KeyError: + raise ValueError( + f"Invalid log level '{level}'. Choose from: {', '.join(l.name for l in LogLevel)}" + ) + + logger = logging.getLogger(name) - handler = logging.StreamHandler() - formatting = logging.Formatter( - "[%(asctime)s] - %(levelname)s::%(name)s - %(message)s" - ) - handler.setFormatter(formatting) + if logger.hasHandlers(): + logger.handlers.clear() - logger.addHandler(handler) + handler = logging.StreamHandler() + logger.setLevel(level.value) + formatting = logging.Formatter( + "[%(asctime)s] - %(levelname)s::%(name)s - %(message)s" + ) + handler.setFormatter(formatting) + logger.addHandler(handler) return logger diff --git a/obfuscator/utils.py b/obfuscator/utils.py index 1d1c3fe..81eb04a 100644 --- a/obfuscator/utils.py +++ b/obfuscator/utils.py @@ -1,15 +1,18 @@ # Utility functions from obfuscator.logger import get_logger -# Create the logger -logger = get_logger("UTILS") - - -def get_s3_path(uri): - parts = uri.replace("s3://", "").split("/") - logger.debug(f"Parts: {parts}") - bucket = parts.pop(0) - logger.debug(f"Bucket: {bucket}") - key = "/".join(parts) - logger.debug(f"Key: {key}") - return bucket, key + +class Utilities: + + def __init__(self, logger=None): + # Create the logger + self.logger = get_logger("UTILITIES", logger) + + def get_s3_path(self, uri): + parts = uri.replace("s3://", "").split("/") + self.logger.debug(f"Parts: {parts}") + bucket = parts.pop(0) + self.logger.debug(f"Bucket: {bucket}") + key = "/".join(parts) + self.logger.debug(f"Key: {key}") + return bucket, key -- cgit v1.2.3 From 3c4b66e8490c6fdf93fb8fee735d52c76eb2853b Mon Sep 17 00:00:00 2001 From: Alex Schofield Date: Wed, 19 Feb 2025 03:22:26 +0000 Subject: remove annoying comments for better readability of code --- cli.py | 16 ---------------- obfuscator/csv_reader.py | 19 ------------------- obfuscator/csv_writer.py | 1 - obfuscator/obfuscate.py | 5 ----- obfuscator/utils.py | 1 - test/test_csv_reader.py | 23 ----------------------- test/test_obfuscator.py | 15 --------------- 7 files changed, 80 deletions(-) (limited to 'cli.py') diff --git a/cli.py b/cli.py index d19c18d..c03ffd1 100644 --- a/cli.py +++ b/cli.py @@ -7,7 +7,6 @@ from obfuscator.csv_writer import create_byte_stream def main(): - # Create an argument parser parser = argparse.ArgumentParser( prog="GDPR-Obfuscator", description="Obfuscate sensitive data stored locally or in an AWS environment", @@ -17,16 +16,10 @@ def main(): "-v", "--verbose", action="store_true", help="Enable verbose logging" ) - # Require user to either choose a local file or an S3 object - # The user can only choose one of these options or the program will exit - # If not provided, the program will exit loc = parser.add_mutually_exclusive_group(required=True) loc.add_argument("-l", "--local", help="Local path to file") loc.add_argument("-s", "--s3", help="URI path to file stored in S3") - # Require user to provide a list of PII fields to obfuscate - # e.g. --pii name email_address - # If not provided, the program will exit parser.add_argument( "-p", "--pii", @@ -35,23 +28,17 @@ def main(): help="List of PII fields to obfuscate, separated by spaces", ) - # Parse the arguments args = parser.parse_args() - # If the user chose verbose logging, set the logger to debug log_level = "DEBUG" if args.verbose else "INFO" - # Create the logger logger = get_logger("CLI", log_level) - # Create the CSVReader object reader = CSVReader(log_level) - # Read the CSV data based on the user's choice of local or S3 if args.local and not args.s3: logger.debug("User chose to read CSV from local path") data = reader.read_local(args.local) - # For debug purposes, log the data read from the CSV logger.debug("Contents: " + str(data)) else: logger.debug("User chose to read CSV from S3") @@ -59,13 +46,10 @@ def main(): data = reader.read_s3(args.s3) logger.debug("Contents: " + str(data)) - # Obfuscate the data based on the user's choice of PII fields obfuscated_data = obfuscate(data, args.pii) - # For debug purposes, log the obfuscated data as JSON for readability logger.debug("Obfuscated data (JSON): " + json.dumps(obfuscated_data, indent=4)) return create_byte_stream(obfuscated_data) -# If the script is run directly (as it should be), call the main function if __name__ == "__main__": main() diff --git a/obfuscator/csv_reader.py b/obfuscator/csv_reader.py index f8dd7d3..2b099c8 100644 --- a/obfuscator/csv_reader.py +++ b/obfuscator/csv_reader.py @@ -6,12 +6,6 @@ from typing import List, Dict from obfuscator.logger import get_logger from obfuscator.utils import Utilities -# Putting the CSV reading components into a class may seem like overkill -# for a simple script, but it allows for better organization and scalability. -# @staticmethod is used to define the method without an instance of the class -# being required. The methods could be defined just as functions, and this -# may still be changed. - class CSVReader: """ @@ -21,7 +15,6 @@ class CSVReader: def __init__(self, log_level=None): self.log_level = log_level - # Create the logger self.logger = get_logger("CSVREADER", log_level) def read_local(self, path) -> List[Dict[str, str]]: @@ -29,12 +22,8 @@ class CSVReader: A method to read a local CSV file and return the data as a list of dictionaries. """ - # Log the path of the file being read for debugging self.logger.debug(f"Reading local CSV from: {path}") - # Attempt to read the file and return the data as a list of dictionaries - # However, if the file isn't found or there is a generic exception, log - # the error and raise an exception try: with open(path, mode="r", encoding="utf-8") as f: reader = csv.DictReader(f) @@ -54,7 +43,6 @@ class CSVReader: bucket, key = utils.get_s3_path(path) self.logger.debug(f"Reading S3 CSV from: {bucket}/{key}") - # If LOCALSTACK=TRUE, use the localstack endpoint for testing if os.getenv("LOCALSTACK", "FALSE").upper() == "TRUE": localstack_endpoint = "http://localhost.localstack.cloud:4566" self.logger.debug("Using LocalStack endpoint for S3") @@ -69,15 +57,10 @@ class CSVReader: client = boto3.client("s3") try: - # Attempt to read the S3 object and return the data as a list of dictionaries response = client.get_object(Bucket=bucket, Key=key) self.logger.info("S3 object read successfully") - # Read and decode the content content = response["Body"].read().decode("utf-8") - # Even though the read_string method was only created for testing, - # it can be reused here to read and return the CSV data return CSVReader.read_string(content) - # TODO: Add more specific exceptions to catch except Exception as e: self.logger.error(f"Error reading S3 object: {e}") raise @@ -87,11 +70,9 @@ class CSVReader: A method to read CSV data from a string and return the data as a list of dictionaries. """ - # If the content is empty, return an empty list if not content.strip(): return [] - # Treat the string as a file-like object and return as list of dictionaries f = io.StringIO(content) reader = csv.DictReader(f) return [dict(row) for row in reader] diff --git a/obfuscator/csv_writer.py b/obfuscator/csv_writer.py index 099e910..56b3f1f 100644 --- a/obfuscator/csv_writer.py +++ b/obfuscator/csv_writer.py @@ -3,7 +3,6 @@ import io from typing import List, Dict from obfuscator.logger import get_logger -# Create the logger logger = get_logger("CSVWRITER") diff --git a/obfuscator/obfuscate.py b/obfuscator/obfuscate.py index 4f7e6c1..e964433 100644 --- a/obfuscator/obfuscate.py +++ b/obfuscator/obfuscate.py @@ -1,7 +1,6 @@ from typing import List, Dict from obfuscator.logger import get_logger -# Create the logger logger = get_logger("OBFUSCATE") @@ -12,14 +11,10 @@ def obfuscate( A function to obfuscate PII fields in a list of dictionaries, replacing sensitive values with a string of asterisks. """ - # If no data is provided, log a message and return an empty list if not data: logger.info("No valid data was provided to obfuscate") return [] - # Obfuscate the PII fields in each record using a list/dict comprehension - # This code is good but makes debugging a bit tricky. I may consider - # breaking it down into a for loop. return [ {k: ("***" if k in pii_fields else v) for k, v in record.items()} for record in data diff --git a/obfuscator/utils.py b/obfuscator/utils.py index 81eb04a..f61451b 100644 --- a/obfuscator/utils.py +++ b/obfuscator/utils.py @@ -5,7 +5,6 @@ from obfuscator.logger import get_logger class Utilities: def __init__(self, logger=None): - # Create the logger self.logger = get_logger("UTILITIES", logger) def get_s3_path(self, uri): diff --git a/test/test_csv_reader.py b/test/test_csv_reader.py index e4c135b..0206542 100644 --- a/test/test_csv_reader.py +++ b/test/test_csv_reader.py @@ -1,6 +1,3 @@ -# csv_reader.py - tests for read_string and read_s3 -# Author: Alex Schofield - import boto3 from moto import mock_aws from obfuscator.csv_reader import CSVReader @@ -8,11 +5,6 @@ import pytest reader = CSVReader() -# CSVREADER: READ_STRING TESTS - -# Check if the function can read a CSV string with no content and return -# an empty list - def test_empty_csv_should_return_no_content(): content = "" @@ -21,10 +13,6 @@ def test_empty_csv_should_return_no_content(): assert result == expected -# Check if the function can read a CSV string with only a header and return -# an empty list - - def test_csv_with_header_only_should_return_no_content(): content = "student_id,name,course\n" result = reader.read_string(content) @@ -32,10 +20,6 @@ def test_csv_with_header_only_should_return_no_content(): assert result == expected -# Check if the function can read a CSV string with valid data and return -# a list of dictionaries - - def test_csv_with_valid_data(): content = ( "student_id,name,course\n" @@ -50,10 +34,6 @@ def test_csv_with_valid_data(): assert result == expected -# Check if the function can read a CSV string with quoted fields and return -# a list of dictionaries with the quoted fields intact - - def test_csv_with_quoted_fields_should_run_as_expected(): content = ( "student_id,name,course\n" @@ -68,9 +48,6 @@ def test_csv_with_quoted_fields_should_run_as_expected(): assert result == expected -# CSVREADER: READ_S3 TESTS - - def setup_s3(s3_client, bucket: str, key: str, content: str): s3_client.create_bucket( Bucket=bucket, diff --git a/test/test_obfuscator.py b/test/test_obfuscator.py index 4f61b16..c77b6b4 100644 --- a/test/test_obfuscator.py +++ b/test/test_obfuscator.py @@ -1,8 +1,5 @@ from obfuscator.obfuscate import obfuscate -# Check if the function does what its supposed to and can obfuscate -# valid PII fields in a list of dictionaries - def test_obfuscate_data_with_valid_pii_fields(): data = [ @@ -39,11 +36,6 @@ def test_obfuscate_data_with_valid_pii_fields(): assert result == expected -# Check if the function can obfuscate data even when some PII -# fields are missing from some of the data, returning a list of dictionaries -# but with the missing PII fields obfuscated and the rest of the data intact - - def test_obfuscate_data_with_missing_pii_field(): data = [ {"student_id": "1234", "name": "John Smith", "course": "Software"}, @@ -69,9 +61,6 @@ def test_obfuscate_data_with_missing_pii_field(): assert result == expected -# Check if the function can handle an empty list of data, returning an empty list - - def test_obfuscate_data_with_no_data(): data = [] pii_fields = ["name", "email_address"] @@ -81,10 +70,6 @@ def test_obfuscate_data_with_no_data(): assert result == expected -# Check if the function can handle an empty list of PII fields, returning the data as is -# without mutating it - - def test_obfuscate_data_with_empty_pii_fields(): data = [ { -- cgit v1.2.3 From ad0328b2f292fe438a8a6a1f7ff2d36856dc578d Mon Sep 17 00:00:00 2001 From: Alex Schofield Date: Wed, 19 Feb 2025 03:27:54 +0000 Subject: modify logger messages to be more clear --- cli.py | 10 +++------- obfuscator/csv_reader.py | 4 +++- obfuscator/csv_writer.py | 4 +--- obfuscator/obfuscate.py | 2 +- 4 files changed, 8 insertions(+), 12 deletions(-) (limited to 'cli.py') diff --git a/cli.py b/cli.py index c03ffd1..40c777f 100644 --- a/cli.py +++ b/cli.py @@ -31,23 +31,19 @@ def main(): args = parser.parse_args() log_level = "DEBUG" if args.verbose else "INFO" - logger = get_logger("CLI", log_level) reader = CSVReader(log_level) if args.local and not args.s3: - logger.debug("User chose to read CSV from local path") + logger.debug("Read data from local path") data = reader.read_local(args.local) - logger.debug("Contents: " + str(data)) else: - logger.debug("User chose to read CSV from S3") - + logger.debug("Read data from S3") data = reader.read_s3(args.s3) - logger.debug("Contents: " + str(data)) obfuscated_data = obfuscate(data, args.pii) - logger.debug("Obfuscated data (JSON): " + json.dumps(obfuscated_data, indent=4)) + return create_byte_stream(obfuscated_data) diff --git a/obfuscator/csv_reader.py b/obfuscator/csv_reader.py index 2b099c8..3649681 100644 --- a/obfuscator/csv_reader.py +++ b/obfuscator/csv_reader.py @@ -45,7 +45,9 @@ class CSVReader: if os.getenv("LOCALSTACK", "FALSE").upper() == "TRUE": localstack_endpoint = "http://localhost.localstack.cloud:4566" - self.logger.debug("Using LocalStack endpoint for S3") + self.logger.debug( + "Using LocalStack endpoint for S3 - ensure LocalStack is running" + ) client = boto3.client( "s3", endpoint_url=localstack_endpoint, diff --git a/obfuscator/csv_writer.py b/obfuscator/csv_writer.py index 56b3f1f..de7cd4b 100644 --- a/obfuscator/csv_writer.py +++ b/obfuscator/csv_writer.py @@ -8,8 +8,7 @@ logger = get_logger("CSVWRITER") def create_byte_stream(data: List[Dict[str, str]]) -> bytes: if not data: - logger.info("No valid data was provided to write") - return b"" + logger.error("Invalid or empty data was provided to write") output = io.StringIO() @@ -20,6 +19,5 @@ def create_byte_stream(data: List[Dict[str, str]]) -> bytes: writer.writerows(data) csv_string = output.getvalue() - logger.debug(f"CSV data: {csv_string}") return csv_string.encode("utf-8") diff --git a/obfuscator/obfuscate.py b/obfuscator/obfuscate.py index e964433..9d43975 100644 --- a/obfuscator/obfuscate.py +++ b/obfuscator/obfuscate.py @@ -12,7 +12,7 @@ def obfuscate( sensitive values with a string of asterisks. """ if not data: - logger.info("No valid data was provided to obfuscate") + logger.error("Invalid or empty was provided to obfuscate") return [] return [ -- cgit v1.2.3 From 7e3553e936958a7fc80c7d4ebae4adeb0f634851 Mon Sep 17 00:00:00 2001 From: Alex Schofield Date: Wed, 19 Feb 2025 15:28:58 +0000 Subject: remove unused json import from cli.py --- cli.py | 1 - 1 file changed, 1 deletion(-) (limited to 'cli.py') diff --git a/cli.py b/cli.py index 40c777f..7ffccd8 100644 --- a/cli.py +++ b/cli.py @@ -1,5 +1,4 @@ import argparse -import json from obfuscator.csv_reader import CSVReader from obfuscator.obfuscate import obfuscate from obfuscator.logger import get_logger -- cgit v1.2.3 From ae89b05dbc8feebc1410f39143c0d829f8704235 Mon Sep 17 00:00:00 2001 From: Alex Schofield Date: Wed, 19 Feb 2025 15:37:16 +0000 Subject: rename CSVReader to DataReader & update references --- cli.py | 4 +- obfuscator/csv_reader.py | 89 ----------------------- obfuscator/read.py | 89 +++++++++++++++++++++++ test/test_csv_reader.py | 181 ----------------------------------------------- test/test_read.py | 181 +++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 272 insertions(+), 272 deletions(-) delete mode 100644 obfuscator/csv_reader.py create mode 100644 obfuscator/read.py delete mode 100644 test/test_csv_reader.py create mode 100644 test/test_read.py (limited to 'cli.py') diff --git a/cli.py b/cli.py index 7ffccd8..5100e2b 100644 --- a/cli.py +++ b/cli.py @@ -1,5 +1,5 @@ import argparse -from obfuscator.csv_reader import CSVReader +from obfuscator.read import DataReader from obfuscator.obfuscate import obfuscate from obfuscator.logger import get_logger from obfuscator.csv_writer import create_byte_stream @@ -32,7 +32,7 @@ def main(): log_level = "DEBUG" if args.verbose else "INFO" logger = get_logger("CLI", log_level) - reader = CSVReader(log_level) + reader = DataReader(log_level) if args.local and not args.s3: logger.debug("Read data from local path") diff --git a/obfuscator/csv_reader.py b/obfuscator/csv_reader.py deleted file mode 100644 index 1f503d7..0000000 --- a/obfuscator/csv_reader.py +++ /dev/null @@ -1,89 +0,0 @@ -import csv -import io -import boto3 -import os -from typing import List, Dict -from obfuscator.logger import get_logger -from obfuscator.utils import Utilities - - -class CSVReader: - """ - A class to read CSV data from a local file, S3 object, or string. Near - the project completion, support for JSON/Parquet files will be added. - """ - - def __init__(self, log_level=None): - self.log_level = log_level - self.logger = get_logger("CSVREADER", log_level) - - def read_local(self, path) -> List[Dict[str, str]]: - """ - A method to read a local CSV file and return the data as a list of - dictionaries. - """ - self.logger.debug(f"Reading local CSV from: {path}") - - try: - with open(path, mode="r", encoding="utf-8") as f: - reader = csv.DictReader(f) - return [dict(row) for row in reader] - except FileNotFoundError: - self.logger.error(f"File not found: {path}") - raise - except Exception as e: - self.logger.error(f"Error reading file: {e}") - - def read_s3(self, path) -> List[Dict[str, str]]: - """ - A method to read an S3 object containing CSV data - and return the data as a list of dictionaries. - """ - utils = Utilities(self.log_level) - bucket, key = utils.get_s3_path(path) - self.logger.debug(f"Reading S3 CSV from: {bucket}/{key}") - - if os.getenv("LOCALSTACK", "FALSE").upper() == "TRUE": - localstack_endpoint = "http://localhost.localstack.cloud:4566" - self.logger.debug( - "Using LocalStack endpoint for S3 - ensure LocalStack is running" - ) - client = boto3.client( - "s3", - endpoint_url=localstack_endpoint, - aws_access_key_id="dummy", - aws_secret_access_key="dummy", - ) - self.logger.debug(f"endpoint_url: {localstack_endpoint}") - else: - client = boto3.client("s3") - - try: - response = client.get_object(Bucket=bucket, Key=key) - self.logger.info("S3 object read successfully") - content = response["Body"].read().decode("utf-8") - return self.read_string(content) - except client.exceptions.NoSuchKey: - self.logger.error(f"Object not found: {bucket}/{key}") - raise - except client.exceptions.ClientError as e: - self.logger.error(f"Error reading S3 object: {e}") - raise - except UnicodeDecodeError as e: - self.logger.error(f"Error decoding S3 object: {e}") - raise - except Exception as e: - self.logger.error(f"Error reading S3 object: {e}") - raise - - def read_string(self, content: str) -> List[Dict[str, str]]: - """ - A method to read CSV data from a string and return the data as a list - of dictionaries. - """ - if not content.strip(): - return [] - - f = io.StringIO(content) - reader = csv.DictReader(f) - return [dict(row) for row in reader] diff --git a/obfuscator/read.py b/obfuscator/read.py new file mode 100644 index 0000000..b704643 --- /dev/null +++ b/obfuscator/read.py @@ -0,0 +1,89 @@ +import csv +import io +import boto3 +import os +from typing import List, Dict +from obfuscator.logger import get_logger +from obfuscator.utils import Utilities + + +class DataReader: + """ + A class to read CSV data from a local file, S3 object, or string. Near + the project completion, support for JSON/Parquet files will be added. + """ + + def __init__(self, log_level=None): + self.log_level = log_level + self.logger = get_logger("CSVREADER", log_level) + + def read_local(self, path) -> List[Dict[str, str]]: + """ + A method to read a local CSV file and return the data as a list of + dictionaries. + """ + self.logger.debug(f"Reading local CSV from: {path}") + + try: + with open(path, mode="r", encoding="utf-8") as f: + reader = csv.DictReader(f) + return [dict(row) for row in reader] + except FileNotFoundError: + self.logger.error(f"File not found: {path}") + raise + except Exception as e: + self.logger.error(f"Error reading file: {e}") + + def read_s3(self, path) -> List[Dict[str, str]]: + """ + A method to read an S3 object containing CSV data + and return the data as a list of dictionaries. + """ + utils = Utilities(self.log_level) + bucket, key = utils.get_s3_path(path) + self.logger.debug(f"Reading S3 CSV from: {bucket}/{key}") + + if os.getenv("LOCALSTACK", "FALSE").upper() == "TRUE": + localstack_endpoint = "http://localhost.localstack.cloud:4566" + self.logger.debug( + "Using LocalStack endpoint for S3 - ensure LocalStack is running" + ) + client = boto3.client( + "s3", + endpoint_url=localstack_endpoint, + aws_access_key_id="dummy", + aws_secret_access_key="dummy", + ) + self.logger.debug(f"endpoint_url: {localstack_endpoint}") + else: + client = boto3.client("s3") + + try: + response = client.get_object(Bucket=bucket, Key=key) + self.logger.info("S3 object read successfully") + content = response["Body"].read().decode("utf-8") + return self.read_string(content) + except client.exceptions.NoSuchKey: + self.logger.error(f"Object not found: {bucket}/{key}") + raise + except client.exceptions.ClientError as e: + self.logger.error(f"Error reading S3 object: {e}") + raise + except UnicodeDecodeError as e: + self.logger.error(f"Error decoding S3 object: {e}") + raise + except Exception as e: + self.logger.error(f"Error reading S3 object: {e}") + raise + + def read_string(self, content: str) -> List[Dict[str, str]]: + """ + A method to read CSV data from a string and return the data as a list + of dictionaries. + """ + if not content.strip(): + return [] + + f = io.StringIO(content) + reader = csv.DictReader(f) + return [dict(row) for row in reader] diff --git a/test/test_csv_reader.py b/test/test_csv_reader.py deleted file mode 100644 index d6129e7..0000000 --- a/test/test_csv_reader.py +++ /dev/null @@ -1,181 +0,0 @@ -import boto3 -from moto import mock_aws -from obfuscator.csv_reader import CSVReader -import pytest - -reader = CSVReader(log_level="DEBUG") - - -def test_empty_csv_should_return_no_content(): - content = "" - result = reader.read_string(content) - expected = [] - assert result == expected - - -def test_csv_with_header_only_should_return_no_content(): - content = "student_id,name,course\n" - result = reader.read_string(content) - expected = [] - assert result == expected - - -def test_csv_with_valid_data(): - content = ( - "student_id,name,course\n" - "1234,Student 1,Course 1\n" - "5678,Student 2,Course 2\n" - ) - result = reader.read_string(content) - expected = [ - {"student_id": "1234", "name": "Student 1", "course": "Course 1"}, - {"student_id": "5678", "name": "Student 2", "course": "Course 2"}, - ] - assert result == expected - - -def test_csv_with_quoted_fields_should_run_as_expected(): - content = ( - "student_id,name,course\n" - '1234,"Student 1","Course 1"\n' - '5678,"Student 2","Course 2"\n' - ) - result = reader.read_string(content) - expected = [ - {"student_id": "1234", "name": "Student 1", "course": "Course 1"}, - {"student_id": "5678", "name": "Student 2", "course": "Course 2"}, - ] - assert result == expected - - -def setup_s3(s3_client, bucket: str, key: str, content: str): - s3_client.create_bucket( - Bucket=bucket, - CreateBucketConfiguration={"LocationConstraint": "eu-west-2"}, - ) - s3_client.put_object(Bucket=bucket, Key=key, Body=content) - - -@pytest.fixture(autouse=True) -def s3_client(): - with mock_aws(): - yield boto3.client("s3", "eu-west-2") - - -def test_read_s3_valid_csv_returns_expected(): - with mock_aws(): - s3 = boto3.client("s3", region_name="eu-west-2") - bucket = "test-bucket" - key = "data/mock.csv" - - csv_content = ( - "student_id,name,course\n" - "1234,Student 1,Course 1\n" - "5678,Student 2,Course 2\n" - ) - - setup_s3(s3, bucket, key, csv_content) - path = f"s3://{bucket}/{key}" - - data = reader.read_s3(path) - - expected = [ - {"student_id": "1234", "name": "Student 1", "course": "Course 1"}, - {"student_id": "5678", "name": "Student 2", "course": "Course 2"}, - ] - - assert data == expected - - -def test_read_s3_empty_csv_returns_empty_list(): - with mock_aws(): - s3 = boto3.client("s3", region_name="eu-west-2") - bucket = "empty-bucket" - key = "data/empty.csv" - csv_content = "student_id,name,course\n" - setup_s3(s3, bucket, key, csv_content) - path = f"s3://{bucket}/{key}" - - data = reader.read_s3(path) - assert data == [] - - -def test_read_s3_nonexistent_bucket_raises_exception(): - with mock_aws(): - bucket = "nonexistent-bucket" - key = "data/mock.csv" - path = f"s3://{bucket}/{key}" - with pytest.raises(Exception): - reader.read_s3(path) - - -def test_read_s3_nonexistent_key_raises_exception(): - with mock_aws(): - s3 = boto3.client("s3", region_name="eu-west-2") - bucket = "test-bucket" - s3.create_bucket( - Bucket=bucket, - CreateBucketConfiguration={"LocationConstraint": "eu-west-2"}, - ) - key = "data/nonexistent.csv" - path = f"s3://{bucket}/{key}" - with pytest.raises(Exception): - reader.read_s3(path) - - -def test_read_s3_malformed_csv_returns_expected(): - with mock_aws(): - s3 = boto3.client("s3", region_name="eu-west-2") - bucket = "test-bucket" - key = "data/malformed.csv" - csv_content = "1234,Student 1,Course 1\n" "5678,Student 2,Course 2\n" - setup_s3(s3, bucket, key, csv_content) - path = f"s3://{bucket}/{key}" - - data = reader.read_s3(path) - expected = [{"1234": "5678", "Student 1": "Student 2", "Course 1": "Course 2"}] - assert data == expected - - -def test_read_s3_csv_with_extra_empty_lines(): - with mock_aws(): - s3 = boto3.client("s3", region_name="eu-west-2") - bucket = "test-bucket" - key = "data/extra_lines.csv" - csv_content = ( - "student_id,name,course\n" - "1234,Student 1,Course 1\n" - "\n" - "5678,Student 2,Course 2\n" - "\n" - ) - setup_s3(s3, bucket, key, csv_content) - path = f"s3://{bucket}/{key}" - - data = reader.read_s3(path) - expected = [ - {"student_id": "1234", "name": "Student 1", "course": "Course 1"}, - {"student_id": "5678", "name": "Student 2", "course": "Course 2"}, - ] - assert data == expected - - -def test_read_s3_csv_with_whitespace_in_fields(): - with mock_aws(): - s3 = boto3.client("s3", region_name="eu-west-2") - bucket = "test-bucket" - key = "data/whitespace.csv" - csv_content = ( - "student_id, name , course \n" - " 1234 , Student 1 , Course 1 \n" - "5678,Student 2,Course 2\n" - ) - setup_s3(s3, bucket, key, csv_content) - path = f"s3://{bucket}/{key}" - - data = reader.read_s3(path) - expected = [ - {"student_id": " 1234 ", " name ": " Student 1 ", " course ": " Course 1 "}, - {"student_id": "5678", " name ": "Student 2", " course ": "Course 2"}, - ] - assert data == expected diff --git a/test/test_read.py b/test/test_read.py new file mode 100644 index 0000000..903ab5d --- /dev/null +++ b/test/test_read.py @@ -0,0 +1,181 @@ +import boto3 +from moto import mock_aws +from obfuscator.read import CSVReader +import pytest + +reader = CSVReader(log_level="DEBUG") + + +def test_empty_csv_should_return_no_content(): + content = "" + result = reader.read_string(content) + expected = [] + assert result == expected + + +def test_csv_with_header_only_should_return_no_content(): + content = "student_id,name,course\n" + result = reader.read_string(content) + expected = [] + assert result == expected + + +def test_csv_with_valid_data(): + content = ( + "student_id,name,course\n" + "1234,Student 1,Course 1\n" + "5678,Student 2,Course 2\n" + ) + result = reader.read_string(content) + expected = [ + {"student_id": "1234", "name": "Student 1", "course": "Course 1"}, + {"student_id": "5678", "name": "Student 2", "course": "Course 2"}, + ] + assert result == expected + + +def test_csv_with_quoted_fields_should_run_as_expected(): + content = ( + "student_id,name,course\n" + '1234,"Student 1","Course 1"\n' + '5678,"Student 2","Course 2"\n' + ) + result = reader.read_string(content) + expected = [ + {"student_id": "1234", "name": "Student 1", "course": "Course 1"}, + {"student_id": "5678", "name": "Student 2", "course": "Course 2"}, + ] + assert result == expected + + +def setup_s3(s3_client, bucket: str, key: str, content: str): + s3_client.create_bucket( + Bucket=bucket, + CreateBucketConfiguration={"LocationConstraint": "eu-west-2"}, + ) + s3_client.put_object(Bucket=bucket, Key=key, Body=content) + + +@pytest.fixture(autouse=True) +def s3_client(): + with mock_aws(): + yield boto3.client("s3", "eu-west-2") + + +def test_read_s3_valid_csv_returns_expected(): + with mock_aws(): + s3 = boto3.client("s3", region_name="eu-west-2") + bucket = "test-bucket" + key = "data/mock.csv" + + csv_content = ( + "student_id,name,course\n" + "1234,Student 1,Course 1\n" + "5678,Student 2,Course 2\n" + ) + + setup_s3(s3, bucket, key, csv_content) + path = f"s3://{bucket}/{key}" + + data = reader.read_s3(path) + + expected = [ + {"student_id": "1234", "name": "Student 1", "course": "Course 1"}, + {"student_id": "5678", "name": "Student 2", "course": "Course 2"}, + ] + + assert data == expected + + +def test_read_s3_empty_csv_returns_empty_list(): + with mock_aws(): + s3 = boto3.client("s3", region_name="eu-west-2") + bucket = "empty-bucket" + key = "data/empty.csv" + csv_content = "student_id,name,course\n" + setup_s3(s3, bucket, key, csv_content) + path = f"s3://{bucket}/{key}" + + data = reader.read_s3(path) + assert data == [] + + +def test_read_s3_nonexistent_bucket_raises_exception(): + with mock_aws(): + bucket = "nonexistent-bucket" + key = "data/mock.csv" + path = f"s3://{bucket}/{key}" + with pytest.raises(Exception): + reader.read_s3(path) + + +def test_read_s3_nonexistent_key_raises_exception(): + with mock_aws(): + s3 = boto3.client("s3", region_name="eu-west-2") + bucket = "test-bucket" + s3.create_bucket( + Bucket=bucket, + CreateBucketConfiguration={"LocationConstraint": "eu-west-2"}, + ) + key = "data/nonexistent.csv" + path = f"s3://{bucket}/{key}" + with pytest.raises(Exception): + reader.read_s3(path) + + +def test_read_s3_malformed_csv_returns_expected(): + with mock_aws(): + s3 = boto3.client("s3", region_name="eu-west-2") + bucket = "test-bucket" + key = "data/malformed.csv" + csv_content = "1234,Student 1,Course 1\n" "5678,Student 2,Course 2\n" + setup_s3(s3, bucket, key, csv_content) + path = f"s3://{bucket}/{key}" + + data = reader.read_s3(path) + expected = [{"1234": "5678", "Student 1": "Student 2", "Course 1": "Course 2"}] + assert data == expected + + +def test_read_s3_csv_with_extra_empty_lines(): + with mock_aws(): + s3 = boto3.client("s3", region_name="eu-west-2") + bucket = "test-bucket" + key = "data/extra_lines.csv" + csv_content = ( + "student_id,name,course\n" + "1234,Student 1,Course 1\n" + "\n" + "5678,Student 2,Course 2\n" + "\n" + ) + setup_s3(s3, bucket, key, csv_content) + path = f"s3://{bucket}/{key}" + + data = reader.read_s3(path) + expected = [ + {"student_id": "1234", "name": "Student 1", "course": "Course 1"}, + {"student_id": "5678", "name": "Student 2", "course": "Course 2"}, + ] + assert data == expected + + +def test_read_s3_csv_with_whitespace_in_fields(): + with mock_aws(): + s3 = boto3.client("s3", region_name="eu-west-2") + bucket = "test-bucket" + key = "data/whitespace.csv" + csv_content = ( + "student_id, name , course \n" + " 1234 , Student 1 , Course 1 \n" + "5678,Student 2,Course 2\n" + ) + setup_s3(s3, bucket, key, csv_content) + path = f"s3://{bucket}/{key}" + + data = reader.read_s3(path) + expected = [ + {"student_id": " 1234 ", " name ": " Student 1 ", " course ": " Course 1 "}, + {"student_id": "5678", " name ": "Student 2", " course ": "Course 2"}, + ] + assert data == expected -- cgit v1.2.3 From ef3f16de8d93821d54344d5cdd16d8deee0b016c Mon Sep 17 00:00:00 2001 From: Alex Schofield Date: Wed, 19 Feb 2025 15:40:39 +0000 Subject: wrap write functions in class and update references --- cli.py | 6 ++++-- obfuscator/write.py | 24 ++++++++++++++---------- test/test_write.py | 2 +- 3 files changed, 19 insertions(+), 13 deletions(-) (limited to 'cli.py') diff --git a/cli.py b/cli.py index 5100e2b..bd49707 100644 --- a/cli.py +++ b/cli.py @@ -1,8 +1,8 @@ import argparse from obfuscator.read import DataReader +from obfuscator.write import DataWriter from obfuscator.obfuscate import obfuscate from obfuscator.logger import get_logger -from obfuscator.csv_writer import create_byte_stream def main(): @@ -43,7 +43,9 @@ def main(): obfuscated_data = obfuscate(data, args.pii) - return create_byte_stream(obfuscated_data) + writer = DataWriter() + + return writer.create_byte_stream(obfuscated_data) if __name__ == "__main__": diff --git a/obfuscator/write.py b/obfuscator/write.py index de7cd4b..4081f0f 100644 --- a/obfuscator/write.py +++ b/obfuscator/write.py @@ -6,18 +6,22 @@ from obfuscator.logger import get_logger logger = get_logger("CSVWRITER") -def create_byte_stream(data: List[Dict[str, str]]) -> bytes: - if not data: - logger.error("Invalid or empty data was provided to write") +class DataWriter: + def __init__(self): + pass - output = io.StringIO() + def create_byte_stream(self, data: List[Dict[str, str]]) -> bytes: + if not data: + logger.error("Invalid or empty data was provided to write") - headers = list(data[0].keys()) + output = io.StringIO() - writer = csv.DictWriter(output, fieldnames=headers) - writer.writeheader() - writer.writerows(data) + headers = list(data[0].keys()) - csv_string = output.getvalue() + writer = csv.DictWriter(output, fieldnames=headers) + writer.writeheader() + writer.writerows(data) - return csv_string.encode("utf-8") + csv_string = output.getvalue() + + return csv_string.encode("utf-8") diff --git a/test/test_write.py b/test/test_write.py index eceac28..f339799 100644 --- a/test/test_write.py +++ b/test/test_write.py @@ -1,6 +1,6 @@ import io import csv -from obfuscator.csv_writer import create_byte_stream +from obfuscator.write import create_byte_stream def csv_bytes_to_list(csv_bytes: bytes): -- cgit v1.2.3