From b25f05df2f269258cd685c9b41502f1cc40cc4b5 Mon Sep 17 00:00:00 2001 From: Alex Schofield Date: Tue, 18 Feb 2025 23:38:30 +0000 Subject: require LOCALSTACK environment variable instead of DEBUG for changing boto3 endpoint --- obfuscator/csv_reader.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'obfuscator') diff --git a/obfuscator/csv_reader.py b/obfuscator/csv_reader.py index 8f4ebea..cd0da0a 100644 --- a/obfuscator/csv_reader.py +++ b/obfuscator/csv_reader.py @@ -53,8 +53,8 @@ class CSVReader: bucket, key = get_s3_path(path) logger.debug(f"Reading S3 CSV from: {bucket}/{key}") - # If DEBUG=TRUE, use the localstack endpoint for testing - if os.getenv("DEBUG", "FALSE").upper() == "TRUE": + # If LOCALSTACK=TRUE, use the localstack endpoint for testing + if os.getenv("LOCALSTACK", "FALSE").upper() == "TRUE": localstack_endpoint = "http://localhost.localstack.cloud:4566" logger.debug("Using LocalStack endpoint for S3") client = boto3.client( -- cgit v1.2.3 From 6887c43a285f5cfeea9eaf552928bdb647a5a139 Mon Sep 17 00:00:00 2001 From: Alex Schofield Date: Wed, 19 Feb 2025 02:49:04 +0000 Subject: update log format --- obfuscator/logger.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'obfuscator') diff --git a/obfuscator/logger.py b/obfuscator/logger.py index ca41e95..649dad7 100644 --- a/obfuscator/logger.py +++ b/obfuscator/logger.py @@ -15,7 +15,7 @@ def get_logger(name: str) -> logging.Logger: handler = logging.StreamHandler() formatting = logging.Formatter( - "%(asctime)s - %(levelname)s - %(name)s - %(message)s" + "[%(asctime)s] - %(levelname)s::%(name)s - %(message)s" ) handler.setFormatter(formatting) -- cgit v1.2.3 From 0ac6d213c1b9cbc24313b9ac5d442e29f027b798 Mon Sep 17 00:00:00 2001 From: Alex Schofield Date: Wed, 19 Feb 2025 02:51:13 +0000 Subject: update logger names --- obfuscator/csv_reader.py | 2 +- obfuscator/csv_writer.py | 2 +- obfuscator/obfuscate.py | 2 +- obfuscator/utils.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) (limited to 'obfuscator') diff --git a/obfuscator/csv_reader.py b/obfuscator/csv_reader.py index cd0da0a..8fdf26f 100644 --- a/obfuscator/csv_reader.py +++ b/obfuscator/csv_reader.py @@ -7,7 +7,7 @@ from obfuscator.logger import get_logger from obfuscator.utils import get_s3_path # Create the logger -logger = get_logger("CSVReader") +logger = get_logger("CSVREADER") # Putting the CSV reading components into a class may seem like overkill # for a simple script, but it allows for better organization and scalability. diff --git a/obfuscator/csv_writer.py b/obfuscator/csv_writer.py index aa5ac3f..099e910 100644 --- a/obfuscator/csv_writer.py +++ b/obfuscator/csv_writer.py @@ -4,7 +4,7 @@ from typing import List, Dict from obfuscator.logger import get_logger # Create the logger -logger = get_logger("CSVWriter") +logger = get_logger("CSVWRITER") def create_byte_stream(data: List[Dict[str, str]]) -> bytes: diff --git a/obfuscator/obfuscate.py b/obfuscator/obfuscate.py index 3f589cb..4f7e6c1 100644 --- a/obfuscator/obfuscate.py +++ b/obfuscator/obfuscate.py @@ -2,7 +2,7 @@ from typing import List, Dict from obfuscator.logger import get_logger # Create the logger -logger = get_logger("Obfuscator") +logger = get_logger("OBFUSCATE") def obfuscate( diff --git a/obfuscator/utils.py b/obfuscator/utils.py index 2e4211f..1d1c3fe 100644 --- a/obfuscator/utils.py +++ b/obfuscator/utils.py @@ -2,7 +2,7 @@ from obfuscator.logger import get_logger # Create the logger -logger = get_logger("CLI") +logger = get_logger("UTILS") def get_s3_path(uri): -- cgit v1.2.3 From ef05a027ffbf8bbee89bb031ccd6152de49762c6 Mon Sep 17 00:00:00 2001 From: Alex Schofield Date: Wed, 19 Feb 2025 03:19:33 +0000 Subject: workaround/fix annoying logging issues --- cli.py | 12 ++++++------ obfuscator/csv_reader.py | 38 +++++++++++++++++++------------------- obfuscator/logger.py | 41 +++++++++++++++++++++++++++-------------- obfuscator/utils.py | 27 +++++++++++++++------------ 4 files changed, 67 insertions(+), 51 deletions(-) (limited to 'obfuscator') diff --git a/cli.py b/cli.py index bf9b53f..d19c18d 100644 --- a/cli.py +++ b/cli.py @@ -5,9 +5,6 @@ from obfuscator.obfuscate import obfuscate from obfuscator.logger import get_logger from obfuscator.csv_writer import create_byte_stream -# Create the logger -logger = get_logger("CLI") - def main(): # Create an argument parser @@ -42,11 +39,13 @@ def main(): args = parser.parse_args() # If the user chose verbose logging, set the logger to debug - if args.verbose: - logger.setLevel("DEBUG") + log_level = "DEBUG" if args.verbose else "INFO" + + # Create the logger + logger = get_logger("CLI", log_level) # Create the CSVReader object - reader = CSVReader() + reader = CSVReader(log_level) # Read the CSV data based on the user's choice of local or S3 if args.local and not args.s3: @@ -56,6 +55,7 @@ def main(): logger.debug("Contents: " + str(data)) else: logger.debug("User chose to read CSV from S3") + data = reader.read_s3(args.s3) logger.debug("Contents: " + str(data)) diff --git a/obfuscator/csv_reader.py b/obfuscator/csv_reader.py index 8fdf26f..f8dd7d3 100644 --- a/obfuscator/csv_reader.py +++ b/obfuscator/csv_reader.py @@ -4,10 +4,7 @@ import boto3 import os from typing import List, Dict from obfuscator.logger import get_logger -from obfuscator.utils import get_s3_path - -# Create the logger -logger = get_logger("CSVREADER") +from obfuscator.utils import Utilities # Putting the CSV reading components into a class may seem like overkill # for a simple script, but it allows for better organization and scalability. @@ -22,14 +19,18 @@ class CSVReader: the project completion, support for JSON/Parquet files will be added. """ - @staticmethod - def read_local(path) -> List[Dict[str, str]]: + def __init__(self, log_level=None): + self.log_level = log_level + # Create the logger + self.logger = get_logger("CSVREADER", log_level) + + def read_local(self, path) -> List[Dict[str, str]]: """ A method to read a local CSV file and return the data as a list of dictionaries. """ # Log the path of the file being read for debugging - logger.debug(f"Reading local CSV from: {path}") + self.logger.debug(f"Reading local CSV from: {path}") # Attempt to read the file and return the data as a list of dictionaries # However, if the file isn't found or there is a generic exception, log @@ -39,38 +40,38 @@ class CSVReader: reader = csv.DictReader(f) return [dict(row) for row in reader] except FileNotFoundError: - logger.error(f"File not found: {path}") + self.logger.error(f"File not found: {path}") raise except Exception as e: - logger.error(f"Error reading file: {e}") + self.logger.error(f"Error reading file: {e}") - @staticmethod - def read_s3(path) -> List[Dict[str, str]]: + def read_s3(self, path) -> List[Dict[str, str]]: """ A method to read an S3 object containing CSV data and return the data as a list of dictionaries. """ - bucket, key = get_s3_path(path) - logger.debug(f"Reading S3 CSV from: {bucket}/{key}") + utils = Utilities(self.log_level) + bucket, key = utils.get_s3_path(path) + self.logger.debug(f"Reading S3 CSV from: {bucket}/{key}") # If LOCALSTACK=TRUE, use the localstack endpoint for testing if os.getenv("LOCALSTACK", "FALSE").upper() == "TRUE": localstack_endpoint = "http://localhost.localstack.cloud:4566" - logger.debug("Using LocalStack endpoint for S3") + self.logger.debug("Using LocalStack endpoint for S3") client = boto3.client( "s3", endpoint_url=localstack_endpoint, aws_access_key_id="dummy", aws_secret_access_key="dummy", ) - logger.debug(f"endpoint_url: {localstack_endpoint}") + self.logger.debug(f"endpoint_url: {localstack_endpoint}") else: client = boto3.client("s3") try: # Attempt to read the S3 object and return the data as a list of dictionaries response = client.get_object(Bucket=bucket, Key=key) - logger.info("S3 object read successfully") + self.logger.info("S3 object read successfully") # Read and decode the content content = response["Body"].read().decode("utf-8") # Even though the read_string method was only created for testing, @@ -78,11 +79,10 @@ class CSVReader: return CSVReader.read_string(content) # TODO: Add more specific exceptions to catch except Exception as e: - logger.error(f"Error reading S3 object: {e}") + self.logger.error(f"Error reading S3 object: {e}") raise - @staticmethod - def read_string(content: str) -> List[Dict[str, str]]: + def read_string(self, content: str) -> List[Dict[str, str]]: """ A method to read CSV data from a string and return the data as a list of dictionaries. diff --git a/obfuscator/logger.py b/obfuscator/logger.py index 649dad7..2c5b988 100644 --- a/obfuscator/logger.py +++ b/obfuscator/logger.py @@ -1,24 +1,37 @@ import logging import os +from enum import Enum -def get_logger(name: str) -> logging.Logger: - logger = logging.getLogger(name) +class LogLevel(Enum): + DEBUG = logging.DEBUG + INFO = logging.INFO + WARNING = logging.WARNING + ERROR = logging.ERROR + CRITICAL = logging.CRITICAL + - if not logger.hasHandlers(): - if os.getenv("DEBUG", "FALSE").upper() == "TRUE": - log_level = logging.DEBUG - else: - log_level = logging.INFO +def get_logger(name: str, level: LogLevel = LogLevel.INFO) -> logging.Logger: - logger.setLevel(log_level) + if isinstance(level, str): + try: + level = LogLevel[level.upper()] + except KeyError: + raise ValueError( + f"Invalid log level '{level}'. Choose from: {', '.join(l.name for l in LogLevel)}" + ) + + logger = logging.getLogger(name) - handler = logging.StreamHandler() - formatting = logging.Formatter( - "[%(asctime)s] - %(levelname)s::%(name)s - %(message)s" - ) - handler.setFormatter(formatting) + if logger.hasHandlers(): + logger.handlers.clear() - logger.addHandler(handler) + handler = logging.StreamHandler() + logger.setLevel(level.value) + formatting = logging.Formatter( + "[%(asctime)s] - %(levelname)s::%(name)s - %(message)s" + ) + handler.setFormatter(formatting) + logger.addHandler(handler) return logger diff --git a/obfuscator/utils.py b/obfuscator/utils.py index 1d1c3fe..81eb04a 100644 --- a/obfuscator/utils.py +++ b/obfuscator/utils.py @@ -1,15 +1,18 @@ # Utility functions from obfuscator.logger import get_logger -# Create the logger -logger = get_logger("UTILS") - - -def get_s3_path(uri): - parts = uri.replace("s3://", "").split("/") - logger.debug(f"Parts: {parts}") - bucket = parts.pop(0) - logger.debug(f"Bucket: {bucket}") - key = "/".join(parts) - logger.debug(f"Key: {key}") - return bucket, key + +class Utilities: + + def __init__(self, logger=None): + # Create the logger + self.logger = get_logger("UTILITIES", logger) + + def get_s3_path(self, uri): + parts = uri.replace("s3://", "").split("/") + self.logger.debug(f"Parts: {parts}") + bucket = parts.pop(0) + self.logger.debug(f"Bucket: {bucket}") + key = "/".join(parts) + self.logger.debug(f"Key: {key}") + return bucket, key -- cgit v1.2.3 From 3c4b66e8490c6fdf93fb8fee735d52c76eb2853b Mon Sep 17 00:00:00 2001 From: Alex Schofield Date: Wed, 19 Feb 2025 03:22:26 +0000 Subject: remove annoying comments for better readability of code --- cli.py | 16 ---------------- obfuscator/csv_reader.py | 19 ------------------- obfuscator/csv_writer.py | 1 - obfuscator/obfuscate.py | 5 ----- obfuscator/utils.py | 1 - test/test_csv_reader.py | 23 ----------------------- test/test_obfuscator.py | 15 --------------- 7 files changed, 80 deletions(-) (limited to 'obfuscator') diff --git a/cli.py b/cli.py index d19c18d..c03ffd1 100644 --- a/cli.py +++ b/cli.py @@ -7,7 +7,6 @@ from obfuscator.csv_writer import create_byte_stream def main(): - # Create an argument parser parser = argparse.ArgumentParser( prog="GDPR-Obfuscator", description="Obfuscate sensitive data stored locally or in an AWS environment", @@ -17,16 +16,10 @@ def main(): "-v", "--verbose", action="store_true", help="Enable verbose logging" ) - # Require user to either choose a local file or an S3 object - # The user can only choose one of these options or the program will exit - # If not provided, the program will exit loc = parser.add_mutually_exclusive_group(required=True) loc.add_argument("-l", "--local", help="Local path to file") loc.add_argument("-s", "--s3", help="URI path to file stored in S3") - # Require user to provide a list of PII fields to obfuscate - # e.g. --pii name email_address - # If not provided, the program will exit parser.add_argument( "-p", "--pii", @@ -35,23 +28,17 @@ def main(): help="List of PII fields to obfuscate, separated by spaces", ) - # Parse the arguments args = parser.parse_args() - # If the user chose verbose logging, set the logger to debug log_level = "DEBUG" if args.verbose else "INFO" - # Create the logger logger = get_logger("CLI", log_level) - # Create the CSVReader object reader = CSVReader(log_level) - # Read the CSV data based on the user's choice of local or S3 if args.local and not args.s3: logger.debug("User chose to read CSV from local path") data = reader.read_local(args.local) - # For debug purposes, log the data read from the CSV logger.debug("Contents: " + str(data)) else: logger.debug("User chose to read CSV from S3") @@ -59,13 +46,10 @@ def main(): data = reader.read_s3(args.s3) logger.debug("Contents: " + str(data)) - # Obfuscate the data based on the user's choice of PII fields obfuscated_data = obfuscate(data, args.pii) - # For debug purposes, log the obfuscated data as JSON for readability logger.debug("Obfuscated data (JSON): " + json.dumps(obfuscated_data, indent=4)) return create_byte_stream(obfuscated_data) -# If the script is run directly (as it should be), call the main function if __name__ == "__main__": main() diff --git a/obfuscator/csv_reader.py b/obfuscator/csv_reader.py index f8dd7d3..2b099c8 100644 --- a/obfuscator/csv_reader.py +++ b/obfuscator/csv_reader.py @@ -6,12 +6,6 @@ from typing import List, Dict from obfuscator.logger import get_logger from obfuscator.utils import Utilities -# Putting the CSV reading components into a class may seem like overkill -# for a simple script, but it allows for better organization and scalability. -# @staticmethod is used to define the method without an instance of the class -# being required. The methods could be defined just as functions, and this -# may still be changed. - class CSVReader: """ @@ -21,7 +15,6 @@ class CSVReader: def __init__(self, log_level=None): self.log_level = log_level - # Create the logger self.logger = get_logger("CSVREADER", log_level) def read_local(self, path) -> List[Dict[str, str]]: @@ -29,12 +22,8 @@ class CSVReader: A method to read a local CSV file and return the data as a list of dictionaries. """ - # Log the path of the file being read for debugging self.logger.debug(f"Reading local CSV from: {path}") - # Attempt to read the file and return the data as a list of dictionaries - # However, if the file isn't found or there is a generic exception, log - # the error and raise an exception try: with open(path, mode="r", encoding="utf-8") as f: reader = csv.DictReader(f) @@ -54,7 +43,6 @@ class CSVReader: bucket, key = utils.get_s3_path(path) self.logger.debug(f"Reading S3 CSV from: {bucket}/{key}") - # If LOCALSTACK=TRUE, use the localstack endpoint for testing if os.getenv("LOCALSTACK", "FALSE").upper() == "TRUE": localstack_endpoint = "http://localhost.localstack.cloud:4566" self.logger.debug("Using LocalStack endpoint for S3") @@ -69,15 +57,10 @@ class CSVReader: client = boto3.client("s3") try: - # Attempt to read the S3 object and return the data as a list of dictionaries response = client.get_object(Bucket=bucket, Key=key) self.logger.info("S3 object read successfully") - # Read and decode the content content = response["Body"].read().decode("utf-8") - # Even though the read_string method was only created for testing, - # it can be reused here to read and return the CSV data return CSVReader.read_string(content) - # TODO: Add more specific exceptions to catch except Exception as e: self.logger.error(f"Error reading S3 object: {e}") raise @@ -87,11 +70,9 @@ class CSVReader: A method to read CSV data from a string and return the data as a list of dictionaries. """ - # If the content is empty, return an empty list if not content.strip(): return [] - # Treat the string as a file-like object and return as list of dictionaries f = io.StringIO(content) reader = csv.DictReader(f) return [dict(row) for row in reader] diff --git a/obfuscator/csv_writer.py b/obfuscator/csv_writer.py index 099e910..56b3f1f 100644 --- a/obfuscator/csv_writer.py +++ b/obfuscator/csv_writer.py @@ -3,7 +3,6 @@ import io from typing import List, Dict from obfuscator.logger import get_logger -# Create the logger logger = get_logger("CSVWRITER") diff --git a/obfuscator/obfuscate.py b/obfuscator/obfuscate.py index 4f7e6c1..e964433 100644 --- a/obfuscator/obfuscate.py +++ b/obfuscator/obfuscate.py @@ -1,7 +1,6 @@ from typing import List, Dict from obfuscator.logger import get_logger -# Create the logger logger = get_logger("OBFUSCATE") @@ -12,14 +11,10 @@ def obfuscate( A function to obfuscate PII fields in a list of dictionaries, replacing sensitive values with a string of asterisks. """ - # If no data is provided, log a message and return an empty list if not data: logger.info("No valid data was provided to obfuscate") return [] - # Obfuscate the PII fields in each record using a list/dict comprehension - # This code is good but makes debugging a bit tricky. I may consider - # breaking it down into a for loop. return [ {k: ("***" if k in pii_fields else v) for k, v in record.items()} for record in data diff --git a/obfuscator/utils.py b/obfuscator/utils.py index 81eb04a..f61451b 100644 --- a/obfuscator/utils.py +++ b/obfuscator/utils.py @@ -5,7 +5,6 @@ from obfuscator.logger import get_logger class Utilities: def __init__(self, logger=None): - # Create the logger self.logger = get_logger("UTILITIES", logger) def get_s3_path(self, uri): diff --git a/test/test_csv_reader.py b/test/test_csv_reader.py index e4c135b..0206542 100644 --- a/test/test_csv_reader.py +++ b/test/test_csv_reader.py @@ -1,6 +1,3 @@ -# csv_reader.py - tests for read_string and read_s3 -# Author: Alex Schofield - import boto3 from moto import mock_aws from obfuscator.csv_reader import CSVReader @@ -8,11 +5,6 @@ import pytest reader = CSVReader() -# CSVREADER: READ_STRING TESTS - -# Check if the function can read a CSV string with no content and return -# an empty list - def test_empty_csv_should_return_no_content(): content = "" @@ -21,10 +13,6 @@ def test_empty_csv_should_return_no_content(): assert result == expected -# Check if the function can read a CSV string with only a header and return -# an empty list - - def test_csv_with_header_only_should_return_no_content(): content = "student_id,name,course\n" result = reader.read_string(content) @@ -32,10 +20,6 @@ def test_csv_with_header_only_should_return_no_content(): assert result == expected -# Check if the function can read a CSV string with valid data and return -# a list of dictionaries - - def test_csv_with_valid_data(): content = ( "student_id,name,course\n" @@ -50,10 +34,6 @@ def test_csv_with_valid_data(): assert result == expected -# Check if the function can read a CSV string with quoted fields and return -# a list of dictionaries with the quoted fields intact - - def test_csv_with_quoted_fields_should_run_as_expected(): content = ( "student_id,name,course\n" @@ -68,9 +48,6 @@ def test_csv_with_quoted_fields_should_run_as_expected(): assert result == expected -# CSVREADER: READ_S3 TESTS - - def setup_s3(s3_client, bucket: str, key: str, content: str): s3_client.create_bucket( Bucket=bucket, diff --git a/test/test_obfuscator.py b/test/test_obfuscator.py index 4f61b16..c77b6b4 100644 --- a/test/test_obfuscator.py +++ b/test/test_obfuscator.py @@ -1,8 +1,5 @@ from obfuscator.obfuscate import obfuscate -# Check if the function does what its supposed to and can obfuscate -# valid PII fields in a list of dictionaries - def test_obfuscate_data_with_valid_pii_fields(): data = [ @@ -39,11 +36,6 @@ def test_obfuscate_data_with_valid_pii_fields(): assert result == expected -# Check if the function can obfuscate data even when some PII -# fields are missing from some of the data, returning a list of dictionaries -# but with the missing PII fields obfuscated and the rest of the data intact - - def test_obfuscate_data_with_missing_pii_field(): data = [ {"student_id": "1234", "name": "John Smith", "course": "Software"}, @@ -69,9 +61,6 @@ def test_obfuscate_data_with_missing_pii_field(): assert result == expected -# Check if the function can handle an empty list of data, returning an empty list - - def test_obfuscate_data_with_no_data(): data = [] pii_fields = ["name", "email_address"] @@ -81,10 +70,6 @@ def test_obfuscate_data_with_no_data(): assert result == expected -# Check if the function can handle an empty list of PII fields, returning the data as is -# without mutating it - - def test_obfuscate_data_with_empty_pii_fields(): data = [ { -- cgit v1.2.3 From ad0328b2f292fe438a8a6a1f7ff2d36856dc578d Mon Sep 17 00:00:00 2001 From: Alex Schofield Date: Wed, 19 Feb 2025 03:27:54 +0000 Subject: modify logger messages to be more clear --- cli.py | 10 +++------- obfuscator/csv_reader.py | 4 +++- obfuscator/csv_writer.py | 4 +--- obfuscator/obfuscate.py | 2 +- 4 files changed, 8 insertions(+), 12 deletions(-) (limited to 'obfuscator') diff --git a/cli.py b/cli.py index c03ffd1..40c777f 100644 --- a/cli.py +++ b/cli.py @@ -31,23 +31,19 @@ def main(): args = parser.parse_args() log_level = "DEBUG" if args.verbose else "INFO" - logger = get_logger("CLI", log_level) reader = CSVReader(log_level) if args.local and not args.s3: - logger.debug("User chose to read CSV from local path") + logger.debug("Read data from local path") data = reader.read_local(args.local) - logger.debug("Contents: " + str(data)) else: - logger.debug("User chose to read CSV from S3") - + logger.debug("Read data from S3") data = reader.read_s3(args.s3) - logger.debug("Contents: " + str(data)) obfuscated_data = obfuscate(data, args.pii) - logger.debug("Obfuscated data (JSON): " + json.dumps(obfuscated_data, indent=4)) + return create_byte_stream(obfuscated_data) diff --git a/obfuscator/csv_reader.py b/obfuscator/csv_reader.py index 2b099c8..3649681 100644 --- a/obfuscator/csv_reader.py +++ b/obfuscator/csv_reader.py @@ -45,7 +45,9 @@ class CSVReader: if os.getenv("LOCALSTACK", "FALSE").upper() == "TRUE": localstack_endpoint = "http://localhost.localstack.cloud:4566" - self.logger.debug("Using LocalStack endpoint for S3") + self.logger.debug( + "Using LocalStack endpoint for S3 - ensure LocalStack is running" + ) client = boto3.client( "s3", endpoint_url=localstack_endpoint, diff --git a/obfuscator/csv_writer.py b/obfuscator/csv_writer.py index 56b3f1f..de7cd4b 100644 --- a/obfuscator/csv_writer.py +++ b/obfuscator/csv_writer.py @@ -8,8 +8,7 @@ logger = get_logger("CSVWRITER") def create_byte_stream(data: List[Dict[str, str]]) -> bytes: if not data: - logger.info("No valid data was provided to write") - return b"" + logger.error("Invalid or empty data was provided to write") output = io.StringIO() @@ -20,6 +19,5 @@ def create_byte_stream(data: List[Dict[str, str]]) -> bytes: writer.writerows(data) csv_string = output.getvalue() - logger.debug(f"CSV data: {csv_string}") return csv_string.encode("utf-8") diff --git a/obfuscator/obfuscate.py b/obfuscator/obfuscate.py index e964433..9d43975 100644 --- a/obfuscator/obfuscate.py +++ b/obfuscator/obfuscate.py @@ -12,7 +12,7 @@ def obfuscate( sensitive values with a string of asterisks. """ if not data: - logger.info("No valid data was provided to obfuscate") + logger.error("Invalid or empty was provided to obfuscate") return [] return [ -- cgit v1.2.3 From d53acac501d43fa501f23efc961a90e5ef31dbbb Mon Sep 17 00:00:00 2001 From: Alex Schofield Date: Wed, 19 Feb 2025 03:28:49 +0000 Subject: improve error handling in obfuscate() --- obfuscator/obfuscate.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'obfuscator') diff --git a/obfuscator/obfuscate.py b/obfuscator/obfuscate.py index 9d43975..73448ce 100644 --- a/obfuscator/obfuscate.py +++ b/obfuscator/obfuscate.py @@ -13,7 +13,10 @@ def obfuscate( """ if not data: logger.error("Invalid or empty was provided to obfuscate") - return [] + raise + if not pii_fields: + logger.error("No PII fields provided to obfuscate") + raise return [ {k: ("***" if k in pii_fields else v) for k, v in record.items()} -- cgit v1.2.3 From f1edb55e4f3e2692cb6259cd658c70db6f0cadd4 Mon Sep 17 00:00:00 2001 From: Alex Schofield Date: Wed, 19 Feb 2025 03:30:21 +0000 Subject: improve error handling in create_byte_stream() --- obfuscator/csv_writer.py | 1 + 1 file changed, 1 insertion(+) (limited to 'obfuscator') diff --git a/obfuscator/csv_writer.py b/obfuscator/csv_writer.py index de7cd4b..2bff6e0 100644 --- a/obfuscator/csv_writer.py +++ b/obfuscator/csv_writer.py @@ -9,6 +9,7 @@ logger = get_logger("CSVWRITER") def create_byte_stream(data: List[Dict[str, str]]) -> bytes: if not data: logger.error("Invalid or empty data was provided to write") + raise output = io.StringIO() -- cgit v1.2.3 From bc3e976ff9b51ca47fa7ec6465b40e2e41ed6d0d Mon Sep 17 00:00:00 2001 From: Alex Schofield Date: Wed, 19 Feb 2025 03:32:58 +0000 Subject: improve error handling in read_s3() --- obfuscator/csv_reader.py | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'obfuscator') diff --git a/obfuscator/csv_reader.py b/obfuscator/csv_reader.py index 3649681..97f67b7 100644 --- a/obfuscator/csv_reader.py +++ b/obfuscator/csv_reader.py @@ -63,6 +63,15 @@ class CSVReader: self.logger.info("S3 object read successfully") content = response["Body"].read().decode("utf-8") return CSVReader.read_string(content) + except client.exceptions.NoSuchKey: + self.logger.error(f"Object not found: {bucket}/{key}") + raise + except client.exceptions.ClientError as e: + self.logger.error(f"Error reading S3 object: {e}") + raise + except UnicodeDecodeError as e: + self.logger.error(f"Error decoding S3 object: {e}") + raise except Exception as e: self.logger.error(f"Error reading S3 object: {e}") raise -- cgit v1.2.3 From 6b0fc64f71b65625ee59dffc8b8bd0459cde64fb Mon Sep 17 00:00:00 2001 From: Alex Schofield Date: Wed, 19 Feb 2025 03:37:14 +0000 Subject: call read_string via self in read_s3 --- obfuscator/csv_reader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'obfuscator') diff --git a/obfuscator/csv_reader.py b/obfuscator/csv_reader.py index 97f67b7..1f503d7 100644 --- a/obfuscator/csv_reader.py +++ b/obfuscator/csv_reader.py @@ -62,7 +62,7 @@ class CSVReader: response = client.get_object(Bucket=bucket, Key=key) self.logger.info("S3 object read successfully") content = response["Body"].read().decode("utf-8") - return CSVReader.read_string(content) + return self.read_string(content) except client.exceptions.NoSuchKey: self.logger.error(f"Object not found: {bucket}/{key}") raise -- cgit v1.2.3 From 422acef7a0762089298e9eae9944877e788fd94d Mon Sep 17 00:00:00 2001 From: Alex Schofield Date: Wed, 19 Feb 2025 03:50:09 +0000 Subject: fix some tests (some are still broken) --- obfuscator/csv_writer.py | 1 - obfuscator/obfuscate.py | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'obfuscator') diff --git a/obfuscator/csv_writer.py b/obfuscator/csv_writer.py index 2bff6e0..de7cd4b 100644 --- a/obfuscator/csv_writer.py +++ b/obfuscator/csv_writer.py @@ -9,7 +9,6 @@ logger = get_logger("CSVWRITER") def create_byte_stream(data: List[Dict[str, str]]) -> bytes: if not data: logger.error("Invalid or empty data was provided to write") - raise output = io.StringIO() diff --git a/obfuscator/obfuscate.py b/obfuscator/obfuscate.py index 73448ce..9cd3a03 100644 --- a/obfuscator/obfuscate.py +++ b/obfuscator/obfuscate.py @@ -13,10 +13,10 @@ def obfuscate( """ if not data: logger.error("Invalid or empty was provided to obfuscate") - raise + raise ValueError("Invalid data provided to obfuscate") if not pii_fields: logger.error("No PII fields provided to obfuscate") - raise + raise ValueError("No PII fields provided to obfuscate") return [ {k: ("***" if k in pii_fields else v) for k, v in record.items()} -- cgit v1.2.3 From ae89b05dbc8feebc1410f39143c0d829f8704235 Mon Sep 17 00:00:00 2001 From: Alex Schofield Date: Wed, 19 Feb 2025 15:37:16 +0000 Subject: rename CSVReader to DataReader & update references --- cli.py | 4 +- obfuscator/csv_reader.py | 89 ----------------------- obfuscator/read.py | 89 +++++++++++++++++++++++ test/test_csv_reader.py | 181 ----------------------------------------------- test/test_read.py | 181 +++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 272 insertions(+), 272 deletions(-) delete mode 100644 obfuscator/csv_reader.py create mode 100644 obfuscator/read.py delete mode 100644 test/test_csv_reader.py create mode 100644 test/test_read.py (limited to 'obfuscator') diff --git a/cli.py b/cli.py index 7ffccd8..5100e2b 100644 --- a/cli.py +++ b/cli.py @@ -1,5 +1,5 @@ import argparse -from obfuscator.csv_reader import CSVReader +from obfuscator.read import DataReader from obfuscator.obfuscate import obfuscate from obfuscator.logger import get_logger from obfuscator.csv_writer import create_byte_stream @@ -32,7 +32,7 @@ def main(): log_level = "DEBUG" if args.verbose else "INFO" logger = get_logger("CLI", log_level) - reader = CSVReader(log_level) + reader = DataReader(log_level) if args.local and not args.s3: logger.debug("Read data from local path") diff --git a/obfuscator/csv_reader.py b/obfuscator/csv_reader.py deleted file mode 100644 index 1f503d7..0000000 --- a/obfuscator/csv_reader.py +++ /dev/null @@ -1,89 +0,0 @@ -import csv -import io -import boto3 -import os -from typing import List, Dict -from obfuscator.logger import get_logger -from obfuscator.utils import Utilities - - -class CSVReader: - """ - A class to read CSV data from a local file, S3 object, or string. Near - the project completion, support for JSON/Parquet files will be added. - """ - - def __init__(self, log_level=None): - self.log_level = log_level - self.logger = get_logger("CSVREADER", log_level) - - def read_local(self, path) -> List[Dict[str, str]]: - """ - A method to read a local CSV file and return the data as a list of - dictionaries. - """ - self.logger.debug(f"Reading local CSV from: {path}") - - try: - with open(path, mode="r", encoding="utf-8") as f: - reader = csv.DictReader(f) - return [dict(row) for row in reader] - except FileNotFoundError: - self.logger.error(f"File not found: {path}") - raise - except Exception as e: - self.logger.error(f"Error reading file: {e}") - - def read_s3(self, path) -> List[Dict[str, str]]: - """ - A method to read an S3 object containing CSV data - and return the data as a list of dictionaries. - """ - utils = Utilities(self.log_level) - bucket, key = utils.get_s3_path(path) - self.logger.debug(f"Reading S3 CSV from: {bucket}/{key}") - - if os.getenv("LOCALSTACK", "FALSE").upper() == "TRUE": - localstack_endpoint = "http://localhost.localstack.cloud:4566" - self.logger.debug( - "Using LocalStack endpoint for S3 - ensure LocalStack is running" - ) - client = boto3.client( - "s3", - endpoint_url=localstack_endpoint, - aws_access_key_id="dummy", - aws_secret_access_key="dummy", - ) - self.logger.debug(f"endpoint_url: {localstack_endpoint}") - else: - client = boto3.client("s3") - - try: - response = client.get_object(Bucket=bucket, Key=key) - self.logger.info("S3 object read successfully") - content = response["Body"].read().decode("utf-8") - return self.read_string(content) - except client.exceptions.NoSuchKey: - self.logger.error(f"Object not found: {bucket}/{key}") - raise - except client.exceptions.ClientError as e: - self.logger.error(f"Error reading S3 object: {e}") - raise - except UnicodeDecodeError as e: - self.logger.error(f"Error decoding S3 object: {e}") - raise - except Exception as e: - self.logger.error(f"Error reading S3 object: {e}") - raise - - def read_string(self, content: str) -> List[Dict[str, str]]: - """ - A method to read CSV data from a string and return the data as a list - of dictionaries. - """ - if not content.strip(): - return [] - - f = io.StringIO(content) - reader = csv.DictReader(f) - return [dict(row) for row in reader] diff --git a/obfuscator/read.py b/obfuscator/read.py new file mode 100644 index 0000000..b704643 --- /dev/null +++ b/obfuscator/read.py @@ -0,0 +1,89 @@ +import csv +import io +import boto3 +import os +from typing import List, Dict +from obfuscator.logger import get_logger +from obfuscator.utils import Utilities + + +class DataReader: + """ + A class to read CSV data from a local file, S3 object, or string. Near + the project completion, support for JSON/Parquet files will be added. + """ + + def __init__(self, log_level=None): + self.log_level = log_level + self.logger = get_logger("CSVREADER", log_level) + + def read_local(self, path) -> List[Dict[str, str]]: + """ + A method to read a local CSV file and return the data as a list of + dictionaries. + """ + self.logger.debug(f"Reading local CSV from: {path}") + + try: + with open(path, mode="r", encoding="utf-8") as f: + reader = csv.DictReader(f) + return [dict(row) for row in reader] + except FileNotFoundError: + self.logger.error(f"File not found: {path}") + raise + except Exception as e: + self.logger.error(f"Error reading file: {e}") + + def read_s3(self, path) -> List[Dict[str, str]]: + """ + A method to read an S3 object containing CSV data + and return the data as a list of dictionaries. + """ + utils = Utilities(self.log_level) + bucket, key = utils.get_s3_path(path) + self.logger.debug(f"Reading S3 CSV from: {bucket}/{key}") + + if os.getenv("LOCALSTACK", "FALSE").upper() == "TRUE": + localstack_endpoint = "http://localhost.localstack.cloud:4566" + self.logger.debug( + "Using LocalStack endpoint for S3 - ensure LocalStack is running" + ) + client = boto3.client( + "s3", + endpoint_url=localstack_endpoint, + aws_access_key_id="dummy", + aws_secret_access_key="dummy", + ) + self.logger.debug(f"endpoint_url: {localstack_endpoint}") + else: + client = boto3.client("s3") + + try: + response = client.get_object(Bucket=bucket, Key=key) + self.logger.info("S3 object read successfully") + content = response["Body"].read().decode("utf-8") + return self.read_string(content) + except client.exceptions.NoSuchKey: + self.logger.error(f"Object not found: {bucket}/{key}") + raise + except client.exceptions.ClientError as e: + self.logger.error(f"Error reading S3 object: {e}") + raise + except UnicodeDecodeError as e: + self.logger.error(f"Error decoding S3 object: {e}") + raise + except Exception as e: + self.logger.error(f"Error reading S3 object: {e}") + raise + + def read_string(self, content: str) -> List[Dict[str, str]]: + """ + A method to read CSV data from a string and return the data as a list + of dictionaries. + """ + if not content.strip(): + return [] + + f = io.StringIO(content) + reader = csv.DictReader(f) + return [dict(row) for row in reader] diff --git a/test/test_csv_reader.py b/test/test_csv_reader.py deleted file mode 100644 index d6129e7..0000000 --- a/test/test_csv_reader.py +++ /dev/null @@ -1,181 +0,0 @@ -import boto3 -from moto import mock_aws -from obfuscator.csv_reader import CSVReader -import pytest - -reader = CSVReader(log_level="DEBUG") - - -def test_empty_csv_should_return_no_content(): - content = "" - result = reader.read_string(content) - expected = [] - assert result == expected - - -def test_csv_with_header_only_should_return_no_content(): - content = "student_id,name,course\n" - result = reader.read_string(content) - expected = [] - assert result == expected - - -def test_csv_with_valid_data(): - content = ( - "student_id,name,course\n" - "1234,Student 1,Course 1\n" - "5678,Student 2,Course 2\n" - ) - result = reader.read_string(content) - expected = [ - {"student_id": "1234", "name": "Student 1", "course": "Course 1"}, - {"student_id": "5678", "name": "Student 2", "course": "Course 2"}, - ] - assert result == expected - - -def test_csv_with_quoted_fields_should_run_as_expected(): - content = ( - "student_id,name,course\n" - '1234,"Student 1","Course 1"\n' - '5678,"Student 2","Course 2"\n' - ) - result = reader.read_string(content) - expected = [ - {"student_id": "1234", "name": "Student 1", "course": "Course 1"}, - {"student_id": "5678", "name": "Student 2", "course": "Course 2"}, - ] - assert result == expected - - -def setup_s3(s3_client, bucket: str, key: str, content: str): - s3_client.create_bucket( - Bucket=bucket, - CreateBucketConfiguration={"LocationConstraint": "eu-west-2"}, - ) - s3_client.put_object(Bucket=bucket, Key=key, Body=content) - - -@pytest.fixture(autouse=True) -def s3_client(): - with mock_aws(): - yield boto3.client("s3", "eu-west-2") - - -def test_read_s3_valid_csv_returns_expected(): - with mock_aws(): - s3 = boto3.client("s3", region_name="eu-west-2") - bucket = "test-bucket" - key = "data/mock.csv" - - csv_content = ( - "student_id,name,course\n" - "1234,Student 1,Course 1\n" - "5678,Student 2,Course 2\n" - ) - - setup_s3(s3, bucket, key, csv_content) - path = f"s3://{bucket}/{key}" - - data = reader.read_s3(path) - - expected = [ - {"student_id": "1234", "name": "Student 1", "course": "Course 1"}, - {"student_id": "5678", "name": "Student 2", "course": "Course 2"}, - ] - - assert data == expected - - -def test_read_s3_empty_csv_returns_empty_list(): - with mock_aws(): - s3 = boto3.client("s3", region_name="eu-west-2") - bucket = "empty-bucket" - key = "data/empty.csv" - csv_content = "student_id,name,course\n" - setup_s3(s3, bucket, key, csv_content) - path = f"s3://{bucket}/{key}" - - data = reader.read_s3(path) - assert data == [] - - -def test_read_s3_nonexistent_bucket_raises_exception(): - with mock_aws(): - bucket = "nonexistent-bucket" - key = "data/mock.csv" - path = f"s3://{bucket}/{key}" - with pytest.raises(Exception): - reader.read_s3(path) - - -def test_read_s3_nonexistent_key_raises_exception(): - with mock_aws(): - s3 = boto3.client("s3", region_name="eu-west-2") - bucket = "test-bucket" - s3.create_bucket( - Bucket=bucket, - CreateBucketConfiguration={"LocationConstraint": "eu-west-2"}, - ) - key = "data/nonexistent.csv" - path = f"s3://{bucket}/{key}" - with pytest.raises(Exception): - reader.read_s3(path) - - -def test_read_s3_malformed_csv_returns_expected(): - with mock_aws(): - s3 = boto3.client("s3", region_name="eu-west-2") - bucket = "test-bucket" - key = "data/malformed.csv" - csv_content = "1234,Student 1,Course 1\n" "5678,Student 2,Course 2\n" - setup_s3(s3, bucket, key, csv_content) - path = f"s3://{bucket}/{key}" - - data = reader.read_s3(path) - expected = [{"1234": "5678", "Student 1": "Student 2", "Course 1": "Course 2"}] - assert data == expected - - -def test_read_s3_csv_with_extra_empty_lines(): - with mock_aws(): - s3 = boto3.client("s3", region_name="eu-west-2") - bucket = "test-bucket" - key = "data/extra_lines.csv" - csv_content = ( - "student_id,name,course\n" - "1234,Student 1,Course 1\n" - "\n" - "5678,Student 2,Course 2\n" - "\n" - ) - setup_s3(s3, bucket, key, csv_content) - path = f"s3://{bucket}/{key}" - - data = reader.read_s3(path) - expected = [ - {"student_id": "1234", "name": "Student 1", "course": "Course 1"}, - {"student_id": "5678", "name": "Student 2", "course": "Course 2"}, - ] - assert data == expected - - -def test_read_s3_csv_with_whitespace_in_fields(): - with mock_aws(): - s3 = boto3.client("s3", region_name="eu-west-2") - bucket = "test-bucket" - key = "data/whitespace.csv" - csv_content = ( - "student_id, name , course \n" - " 1234 , Student 1 , Course 1 \n" - "5678,Student 2,Course 2\n" - ) - setup_s3(s3, bucket, key, csv_content) - path = f"s3://{bucket}/{key}" - - data = reader.read_s3(path) - expected = [ - {"student_id": " 1234 ", " name ": " Student 1 ", " course ": " Course 1 "}, - {"student_id": "5678", " name ": "Student 2", " course ": "Course 2"}, - ] - assert data == expected diff --git a/test/test_read.py b/test/test_read.py new file mode 100644 index 0000000..903ab5d --- /dev/null +++ b/test/test_read.py @@ -0,0 +1,181 @@ +import boto3 +from moto import mock_aws +from obfuscator.read import CSVReader +import pytest + +reader = CSVReader(log_level="DEBUG") + + +def test_empty_csv_should_return_no_content(): + content = "" + result = reader.read_string(content) + expected = [] + assert result == expected + + +def test_csv_with_header_only_should_return_no_content(): + content = "student_id,name,course\n" + result = reader.read_string(content) + expected = [] + assert result == expected + + +def test_csv_with_valid_data(): + content = ( + "student_id,name,course\n" + "1234,Student 1,Course 1\n" + "5678,Student 2,Course 2\n" + ) + result = reader.read_string(content) + expected = [ + {"student_id": "1234", "name": "Student 1", "course": "Course 1"}, + {"student_id": "5678", "name": "Student 2", "course": "Course 2"}, + ] + assert result == expected + + +def test_csv_with_quoted_fields_should_run_as_expected(): + content = ( + "student_id,name,course\n" + '1234,"Student 1","Course 1"\n' + '5678,"Student 2","Course 2"\n' + ) + result = reader.read_string(content) + expected = [ + {"student_id": "1234", "name": "Student 1", "course": "Course 1"}, + {"student_id": "5678", "name": "Student 2", "course": "Course 2"}, + ] + assert result == expected + + +def setup_s3(s3_client, bucket: str, key: str, content: str): + s3_client.create_bucket( + Bucket=bucket, + CreateBucketConfiguration={"LocationConstraint": "eu-west-2"}, + ) + s3_client.put_object(Bucket=bucket, Key=key, Body=content) + + +@pytest.fixture(autouse=True) +def s3_client(): + with mock_aws(): + yield boto3.client("s3", "eu-west-2") + + +def test_read_s3_valid_csv_returns_expected(): + with mock_aws(): + s3 = boto3.client("s3", region_name="eu-west-2") + bucket = "test-bucket" + key = "data/mock.csv" + + csv_content = ( + "student_id,name,course\n" + "1234,Student 1,Course 1\n" + "5678,Student 2,Course 2\n" + ) + + setup_s3(s3, bucket, key, csv_content) + path = f"s3://{bucket}/{key}" + + data = reader.read_s3(path) + + expected = [ + {"student_id": "1234", "name": "Student 1", "course": "Course 1"}, + {"student_id": "5678", "name": "Student 2", "course": "Course 2"}, + ] + + assert data == expected + + +def test_read_s3_empty_csv_returns_empty_list(): + with mock_aws(): + s3 = boto3.client("s3", region_name="eu-west-2") + bucket = "empty-bucket" + key = "data/empty.csv" + csv_content = "student_id,name,course\n" + setup_s3(s3, bucket, key, csv_content) + path = f"s3://{bucket}/{key}" + + data = reader.read_s3(path) + assert data == [] + + +def test_read_s3_nonexistent_bucket_raises_exception(): + with mock_aws(): + bucket = "nonexistent-bucket" + key = "data/mock.csv" + path = f"s3://{bucket}/{key}" + with pytest.raises(Exception): + reader.read_s3(path) + + +def test_read_s3_nonexistent_key_raises_exception(): + with mock_aws(): + s3 = boto3.client("s3", region_name="eu-west-2") + bucket = "test-bucket" + s3.create_bucket( + Bucket=bucket, + CreateBucketConfiguration={"LocationConstraint": "eu-west-2"}, + ) + key = "data/nonexistent.csv" + path = f"s3://{bucket}/{key}" + with pytest.raises(Exception): + reader.read_s3(path) + + +def test_read_s3_malformed_csv_returns_expected(): + with mock_aws(): + s3 = boto3.client("s3", region_name="eu-west-2") + bucket = "test-bucket" + key = "data/malformed.csv" + csv_content = "1234,Student 1,Course 1\n" "5678,Student 2,Course 2\n" + setup_s3(s3, bucket, key, csv_content) + path = f"s3://{bucket}/{key}" + + data = reader.read_s3(path) + expected = [{"1234": "5678", "Student 1": "Student 2", "Course 1": "Course 2"}] + assert data == expected + + +def test_read_s3_csv_with_extra_empty_lines(): + with mock_aws(): + s3 = boto3.client("s3", region_name="eu-west-2") + bucket = "test-bucket" + key = "data/extra_lines.csv" + csv_content = ( + "student_id,name,course\n" + "1234,Student 1,Course 1\n" + "\n" + "5678,Student 2,Course 2\n" + "\n" + ) + setup_s3(s3, bucket, key, csv_content) + path = f"s3://{bucket}/{key}" + + data = reader.read_s3(path) + expected = [ + {"student_id": "1234", "name": "Student 1", "course": "Course 1"}, + {"student_id": "5678", "name": "Student 2", "course": "Course 2"}, + ] + assert data == expected + + +def test_read_s3_csv_with_whitespace_in_fields(): + with mock_aws(): + s3 = boto3.client("s3", region_name="eu-west-2") + bucket = "test-bucket" + key = "data/whitespace.csv" + csv_content = ( + "student_id, name , course \n" + " 1234 , Student 1 , Course 1 \n" + "5678,Student 2,Course 2\n" + ) + setup_s3(s3, bucket, key, csv_content) + path = f"s3://{bucket}/{key}" + + data = reader.read_s3(path) + expected = [ + {"student_id": " 1234 ", " name ": " Student 1 ", " course ": " Course 1 "}, + {"student_id": "5678", " name ": "Student 2", " course ": "Course 2"}, + ] + assert data == expected -- cgit v1.2.3 From 1608d01bb68c1f6292b04c70caa609d34943b371 Mon Sep 17 00:00:00 2001 From: Alex Schofield Date: Wed, 19 Feb 2025 15:37:57 +0000 Subject: rename write function & update references --- obfuscator/csv_writer.py | 23 ------------------- obfuscator/write.py | 23 +++++++++++++++++++ test/test_csv_writer.py | 57 ------------------------------------------------ test/test_write.py | 57 ++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 80 insertions(+), 80 deletions(-) delete mode 100644 obfuscator/csv_writer.py create mode 100644 obfuscator/write.py delete mode 100644 test/test_csv_writer.py create mode 100644 test/test_write.py (limited to 'obfuscator') diff --git a/obfuscator/csv_writer.py b/obfuscator/csv_writer.py deleted file mode 100644 index de7cd4b..0000000 --- a/obfuscator/csv_writer.py +++ /dev/null @@ -1,23 +0,0 @@ -import csv -import io -from typing import List, Dict -from obfuscator.logger import get_logger - -logger = get_logger("CSVWRITER") - - -def create_byte_stream(data: List[Dict[str, str]]) -> bytes: - if not data: - logger.error("Invalid or empty data was provided to write") - - output = io.StringIO() - - headers = list(data[0].keys()) - - writer = csv.DictWriter(output, fieldnames=headers) - writer.writeheader() - writer.writerows(data) - - csv_string = output.getvalue() - - return csv_string.encode("utf-8") diff --git a/obfuscator/write.py b/obfuscator/write.py new file mode 100644 index 0000000..de7cd4b --- /dev/null +++ b/obfuscator/write.py @@ -0,0 +1,23 @@ +import csv +import io +from typing import List, Dict +from obfuscator.logger import get_logger + +logger = get_logger("CSVWRITER") + + +def create_byte_stream(data: List[Dict[str, str]]) -> bytes: + if not data: + logger.error("Invalid or empty data was provided to write") + + output = io.StringIO() + + headers = list(data[0].keys()) + + writer = csv.DictWriter(output, fieldnames=headers) + writer.writeheader() + writer.writerows(data) + + csv_string = output.getvalue() + + return csv_string.encode("utf-8") diff --git a/test/test_csv_writer.py b/test/test_csv_writer.py deleted file mode 100644 index eceac28..0000000 --- a/test/test_csv_writer.py +++ /dev/null @@ -1,57 +0,0 @@ -import io -import csv -from obfuscator.csv_writer import create_byte_stream - - -def csv_bytes_to_list(csv_bytes: bytes): - csv_string = csv_bytes.decode("utf-8") - f = io.StringIO(csv_string) - reader = csv.DictReader(f) - return [dict(row) for row in reader] - - -def test_create_byte_stream_valid_data(): - data = [ - {"student_id": "1234", "name": "Student 1", "course": "Course 1"}, - {"student_id": "5678", "name": "Student 2", "course": "Course 2"}, - ] - csv_bytes = create_byte_stream(data) - result = csv_bytes_to_list(csv_bytes) - assert result == data - - -def test_create_byte_stream_empty_data(): - csv_bytes = create_byte_stream([]) - assert csv_bytes == b"" - - -def test_create_byte_stream_handles_quoted_fields(): - data = [ - {"student_id": "1234", "name": 'Student "One"', "course": "Course, A"}, - {"student_id": "5678", "name": 'Student "Two"', "course": "Course, B"}, - ] - csv_bytes = create_byte_stream(data) - result = csv_bytes_to_list(csv_bytes) - assert result == data - - -def test_create_byte_stream_consistent_header_order(): - data = [ - {"student_id": "1234", "name": "Alice", "course": "Math"}, - {"student_id": "5678", "name": "Bob", "course": "Science"}, - ] - csv_bytes = create_byte_stream(data) - csv_string = csv_bytes.decode("utf-8") - header_line = csv_string.splitlines()[0] - expected_header = ",".join(data[0].keys()) - assert header_line == expected_header - - -def test_create_byte_stream_special_characters(): - data = [ - {"student_id": "1234", "name": "Student 1", "course": "Line1\nLine2"}, - {"student_id": "5678", "name": "Student 2", "course": "Value with, comma"}, - ] - csv_bytes = create_byte_stream(data) - result = csv_bytes_to_list(csv_bytes) - assert result == data diff --git a/test/test_write.py b/test/test_write.py new file mode 100644 index 0000000..eceac28 --- /dev/null +++ b/test/test_write.py @@ -0,0 +1,57 @@ +import io +import csv +from obfuscator.csv_writer import create_byte_stream + + +def csv_bytes_to_list(csv_bytes: bytes): + csv_string = csv_bytes.decode("utf-8") + f = io.StringIO(csv_string) + reader = csv.DictReader(f) + return [dict(row) for row in reader] + + +def test_create_byte_stream_valid_data(): + data = [ + {"student_id": "1234", "name": "Student 1", "course": "Course 1"}, + {"student_id": "5678", "name": "Student 2", "course": "Course 2"}, + ] + csv_bytes = create_byte_stream(data) + result = csv_bytes_to_list(csv_bytes) + assert result == data + + +def test_create_byte_stream_empty_data(): + csv_bytes = create_byte_stream([]) + assert csv_bytes == b"" + + +def test_create_byte_stream_handles_quoted_fields(): + data = [ + {"student_id": "1234", "name": 'Student "One"', "course": "Course, A"}, + {"student_id": "5678", "name": 'Student "Two"', "course": "Course, B"}, + ] + csv_bytes = create_byte_stream(data) + result = csv_bytes_to_list(csv_bytes) + assert result == data + + +def test_create_byte_stream_consistent_header_order(): + data = [ + {"student_id": "1234", "name": "Alice", "course": "Math"}, + {"student_id": "5678", "name": "Bob", "course": "Science"}, + ] + csv_bytes = create_byte_stream(data) + csv_string = csv_bytes.decode("utf-8") + header_line = csv_string.splitlines()[0] + expected_header = ",".join(data[0].keys()) + assert header_line == expected_header + + +def test_create_byte_stream_special_characters(): + data = [ + {"student_id": "1234", "name": "Student 1", "course": "Line1\nLine2"}, + {"student_id": "5678", "name": "Student 2", "course": "Value with, comma"}, + ] + csv_bytes = create_byte_stream(data) + result = csv_bytes_to_list(csv_bytes) + assert result == data -- cgit v1.2.3 From ef3f16de8d93821d54344d5cdd16d8deee0b016c Mon Sep 17 00:00:00 2001 From: Alex Schofield Date: Wed, 19 Feb 2025 15:40:39 +0000 Subject: wrap write functions in class and update references --- cli.py | 6 ++++-- obfuscator/write.py | 24 ++++++++++++++---------- test/test_write.py | 2 +- 3 files changed, 19 insertions(+), 13 deletions(-) (limited to 'obfuscator') diff --git a/cli.py b/cli.py index 5100e2b..bd49707 100644 --- a/cli.py +++ b/cli.py @@ -1,8 +1,8 @@ import argparse from obfuscator.read import DataReader +from obfuscator.write import DataWriter from obfuscator.obfuscate import obfuscate from obfuscator.logger import get_logger -from obfuscator.csv_writer import create_byte_stream def main(): @@ -43,7 +43,9 @@ def main(): obfuscated_data = obfuscate(data, args.pii) - return create_byte_stream(obfuscated_data) + writer = DataWriter() + + return writer.create_byte_stream(obfuscated_data) if __name__ == "__main__": diff --git a/obfuscator/write.py b/obfuscator/write.py index de7cd4b..4081f0f 100644 --- a/obfuscator/write.py +++ b/obfuscator/write.py @@ -6,18 +6,22 @@ from obfuscator.logger import get_logger logger = get_logger("CSVWRITER") -def create_byte_stream(data: List[Dict[str, str]]) -> bytes: - if not data: - logger.error("Invalid or empty data was provided to write") +class DataWriter: + def __init__(self): + pass - output = io.StringIO() + def create_byte_stream(self, data: List[Dict[str, str]]) -> bytes: + if not data: + logger.error("Invalid or empty data was provided to write") - headers = list(data[0].keys()) + output = io.StringIO() - writer = csv.DictWriter(output, fieldnames=headers) - writer.writeheader() - writer.writerows(data) + headers = list(data[0].keys()) - csv_string = output.getvalue() + writer = csv.DictWriter(output, fieldnames=headers) + writer.writeheader() + writer.writerows(data) - return csv_string.encode("utf-8") + csv_string = output.getvalue() + + return csv_string.encode("utf-8") diff --git a/test/test_write.py b/test/test_write.py index eceac28..f339799 100644 --- a/test/test_write.py +++ b/test/test_write.py @@ -1,6 +1,6 @@ import io import csv -from obfuscator.csv_writer import create_byte_stream +from obfuscator.write import create_byte_stream def csv_bytes_to_list(csv_bytes: bytes): -- cgit v1.2.3 From 2c7fb06247d3684d951579e75e3a99136f06415f Mon Sep 17 00:00:00 2001 From: Alex Schofield Date: Wed, 19 Feb 2025 15:50:15 +0000 Subject: return empty list instead of raising exceptions --- obfuscator/obfuscate.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'obfuscator') diff --git a/obfuscator/obfuscate.py b/obfuscator/obfuscate.py index 9cd3a03..0a33158 100644 --- a/obfuscator/obfuscate.py +++ b/obfuscator/obfuscate.py @@ -13,10 +13,10 @@ def obfuscate( """ if not data: logger.error("Invalid or empty was provided to obfuscate") - raise ValueError("Invalid data provided to obfuscate") + return [] if not pii_fields: logger.error("No PII fields provided to obfuscate") - raise ValueError("No PII fields provided to obfuscate") + return [] return [ {k: ("***" if k in pii_fields else v) for k, v in record.items()} -- cgit v1.2.3 From c851a52500a630d2ebcedc562ba6a83531dc9e0d Mon Sep 17 00:00:00 2001 From: Alex Schofield Date: Wed, 19 Feb 2025 15:51:14 +0000 Subject: update log messages for invalid data to reflect new behaviour --- obfuscator/obfuscate.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'obfuscator') diff --git a/obfuscator/obfuscate.py b/obfuscator/obfuscate.py index 0a33158..40f8493 100644 --- a/obfuscator/obfuscate.py +++ b/obfuscator/obfuscate.py @@ -12,10 +12,12 @@ def obfuscate( sensitive values with a string of asterisks. """ if not data: - logger.error("Invalid or empty was provided to obfuscate") + logger.error( + "Invalid or empty data was provided to obfuscate. Returning empty list." + ) return [] if not pii_fields: - logger.error("No PII fields provided to obfuscate") + logger.error("No PII fields provided to obfuscate. Returning empty list.") return [] return [ -- cgit v1.2.3 From 4be03e1fb3ec2d50ae08921948ecaf1bb12b28a3 Mon Sep 17 00:00:00 2001 From: Alex Schofield Date: Wed, 19 Feb 2025 15:52:06 +0000 Subject: if no PII fields provided, return original data unchanged --- obfuscator/obfuscate.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'obfuscator') diff --git a/obfuscator/obfuscate.py b/obfuscator/obfuscate.py index 40f8493..cd12b6d 100644 --- a/obfuscator/obfuscate.py +++ b/obfuscator/obfuscate.py @@ -17,8 +17,8 @@ def obfuscate( ) return [] if not pii_fields: - logger.error("No PII fields provided to obfuscate. Returning empty list.") - return [] + logger.error("No PII fields provided to obfuscate. Returning data unchanged.") + return data return [ {k: ("***" if k in pii_fields else v) for k, v in record.items()} -- cgit v1.2.3 From b402ee55d9f9e37c772c47b703ae4b66b66adda6 Mon Sep 17 00:00:00 2001 From: Alex Schofield Date: Wed, 19 Feb 2025 15:54:18 +0000 Subject: re-add original behaviour of returning empty byte stream when data empty in create_byte_stream() --- obfuscator/write.py | 1 + 1 file changed, 1 insertion(+) (limited to 'obfuscator') diff --git a/obfuscator/write.py b/obfuscator/write.py index 4081f0f..451b073 100644 --- a/obfuscator/write.py +++ b/obfuscator/write.py @@ -13,6 +13,7 @@ class DataWriter: def create_byte_stream(self, data: List[Dict[str, str]]) -> bytes: if not data: logger.error("Invalid or empty data was provided to write") + return b"" output = io.StringIO() -- cgit v1.2.3 From 5402af2c7198a685a57a05e29a869e1e72a6b877 Mon Sep 17 00:00:00 2001 From: "deepsource-autofix[bot]" <62050782+deepsource-autofix[bot]@users.noreply.github.com> Date: Wed, 19 Feb 2025 15:55:50 +0000 Subject: style: format code with Autopep8, Black and Ruff Formatter This commit fixes the style issues introduced in b402ee5 according to the output from Autopep8, Black and Ruff Formatter. Details: https://github.com/ajschofield/gdpr-obfuscator/pull/8 --- obfuscator/logger.py | 1 - obfuscator/utils.py | 1 - 2 files changed, 2 deletions(-) (limited to 'obfuscator') diff --git a/obfuscator/logger.py b/obfuscator/logger.py index 2c5b988..140fa8f 100644 --- a/obfuscator/logger.py +++ b/obfuscator/logger.py @@ -12,7 +12,6 @@ class LogLevel(Enum): def get_logger(name: str, level: LogLevel = LogLevel.INFO) -> logging.Logger: - if isinstance(level, str): try: level = LogLevel[level.upper()] diff --git a/obfuscator/utils.py b/obfuscator/utils.py index f61451b..77ca1cf 100644 --- a/obfuscator/utils.py +++ b/obfuscator/utils.py @@ -3,7 +3,6 @@ from obfuscator.logger import get_logger class Utilities: - def __init__(self, logger=None): self.logger = get_logger("UTILITIES", logger) -- cgit v1.2.3