From 0ac6d213c1b9cbc24313b9ac5d442e29f027b798 Mon Sep 17 00:00:00 2001 From: Alex Schofield Date: Wed, 19 Feb 2025 02:51:13 +0000 Subject: update logger names --- obfuscator/csv_writer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'obfuscator/csv_writer.py') diff --git a/obfuscator/csv_writer.py b/obfuscator/csv_writer.py index aa5ac3f..099e910 100644 --- a/obfuscator/csv_writer.py +++ b/obfuscator/csv_writer.py @@ -4,7 +4,7 @@ from typing import List, Dict from obfuscator.logger import get_logger # Create the logger -logger = get_logger("CSVWriter") +logger = get_logger("CSVWRITER") def create_byte_stream(data: List[Dict[str, str]]) -> bytes: -- cgit v1.2.3 From 3c4b66e8490c6fdf93fb8fee735d52c76eb2853b Mon Sep 17 00:00:00 2001 From: Alex Schofield Date: Wed, 19 Feb 2025 03:22:26 +0000 Subject: remove annoying comments for better readability of code --- cli.py | 16 ---------------- obfuscator/csv_reader.py | 19 ------------------- obfuscator/csv_writer.py | 1 - obfuscator/obfuscate.py | 5 ----- obfuscator/utils.py | 1 - test/test_csv_reader.py | 23 ----------------------- test/test_obfuscator.py | 15 --------------- 7 files changed, 80 deletions(-) (limited to 'obfuscator/csv_writer.py') diff --git a/cli.py b/cli.py index d19c18d..c03ffd1 100644 --- a/cli.py +++ b/cli.py @@ -7,7 +7,6 @@ from obfuscator.csv_writer import create_byte_stream def main(): - # Create an argument parser parser = argparse.ArgumentParser( prog="GDPR-Obfuscator", description="Obfuscate sensitive data stored locally or in an AWS environment", @@ -17,16 +16,10 @@ def main(): "-v", "--verbose", action="store_true", help="Enable verbose logging" ) - # Require user to either choose a local file or an S3 object - # The user can only choose one of these options or the program will exit - # If not provided, the program will exit loc = parser.add_mutually_exclusive_group(required=True) loc.add_argument("-l", "--local", help="Local path to file") loc.add_argument("-s", "--s3", help="URI path to file stored in S3") - # Require user to provide a list of PII fields to obfuscate - # e.g. --pii name email_address - # If not provided, the program will exit parser.add_argument( "-p", "--pii", @@ -35,23 +28,17 @@ def main(): help="List of PII fields to obfuscate, separated by spaces", ) - # Parse the arguments args = parser.parse_args() - # If the user chose verbose logging, set the logger to debug log_level = "DEBUG" if args.verbose else "INFO" - # Create the logger logger = get_logger("CLI", log_level) - # Create the CSVReader object reader = CSVReader(log_level) - # Read the CSV data based on the user's choice of local or S3 if args.local and not args.s3: logger.debug("User chose to read CSV from local path") data = reader.read_local(args.local) - # For debug purposes, log the data read from the CSV logger.debug("Contents: " + str(data)) else: logger.debug("User chose to read CSV from S3") @@ -59,13 +46,10 @@ def main(): data = reader.read_s3(args.s3) logger.debug("Contents: " + str(data)) - # Obfuscate the data based on the user's choice of PII fields obfuscated_data = obfuscate(data, args.pii) - # For debug purposes, log the obfuscated data as JSON for readability logger.debug("Obfuscated data (JSON): " + json.dumps(obfuscated_data, indent=4)) return create_byte_stream(obfuscated_data) -# If the script is run directly (as it should be), call the main function if __name__ == "__main__": main() diff --git a/obfuscator/csv_reader.py b/obfuscator/csv_reader.py index f8dd7d3..2b099c8 100644 --- a/obfuscator/csv_reader.py +++ b/obfuscator/csv_reader.py @@ -6,12 +6,6 @@ from typing import List, Dict from obfuscator.logger import get_logger from obfuscator.utils import Utilities -# Putting the CSV reading components into a class may seem like overkill -# for a simple script, but it allows for better organization and scalability. -# @staticmethod is used to define the method without an instance of the class -# being required. The methods could be defined just as functions, and this -# may still be changed. - class CSVReader: """ @@ -21,7 +15,6 @@ class CSVReader: def __init__(self, log_level=None): self.log_level = log_level - # Create the logger self.logger = get_logger("CSVREADER", log_level) def read_local(self, path) -> List[Dict[str, str]]: @@ -29,12 +22,8 @@ class CSVReader: A method to read a local CSV file and return the data as a list of dictionaries. """ - # Log the path of the file being read for debugging self.logger.debug(f"Reading local CSV from: {path}") - # Attempt to read the file and return the data as a list of dictionaries - # However, if the file isn't found or there is a generic exception, log - # the error and raise an exception try: with open(path, mode="r", encoding="utf-8") as f: reader = csv.DictReader(f) @@ -54,7 +43,6 @@ class CSVReader: bucket, key = utils.get_s3_path(path) self.logger.debug(f"Reading S3 CSV from: {bucket}/{key}") - # If LOCALSTACK=TRUE, use the localstack endpoint for testing if os.getenv("LOCALSTACK", "FALSE").upper() == "TRUE": localstack_endpoint = "http://localhost.localstack.cloud:4566" self.logger.debug("Using LocalStack endpoint for S3") @@ -69,15 +57,10 @@ class CSVReader: client = boto3.client("s3") try: - # Attempt to read the S3 object and return the data as a list of dictionaries response = client.get_object(Bucket=bucket, Key=key) self.logger.info("S3 object read successfully") - # Read and decode the content content = response["Body"].read().decode("utf-8") - # Even though the read_string method was only created for testing, - # it can be reused here to read and return the CSV data return CSVReader.read_string(content) - # TODO: Add more specific exceptions to catch except Exception as e: self.logger.error(f"Error reading S3 object: {e}") raise @@ -87,11 +70,9 @@ class CSVReader: A method to read CSV data from a string and return the data as a list of dictionaries. """ - # If the content is empty, return an empty list if not content.strip(): return [] - # Treat the string as a file-like object and return as list of dictionaries f = io.StringIO(content) reader = csv.DictReader(f) return [dict(row) for row in reader] diff --git a/obfuscator/csv_writer.py b/obfuscator/csv_writer.py index 099e910..56b3f1f 100644 --- a/obfuscator/csv_writer.py +++ b/obfuscator/csv_writer.py @@ -3,7 +3,6 @@ import io from typing import List, Dict from obfuscator.logger import get_logger -# Create the logger logger = get_logger("CSVWRITER") diff --git a/obfuscator/obfuscate.py b/obfuscator/obfuscate.py index 4f7e6c1..e964433 100644 --- a/obfuscator/obfuscate.py +++ b/obfuscator/obfuscate.py @@ -1,7 +1,6 @@ from typing import List, Dict from obfuscator.logger import get_logger -# Create the logger logger = get_logger("OBFUSCATE") @@ -12,14 +11,10 @@ def obfuscate( A function to obfuscate PII fields in a list of dictionaries, replacing sensitive values with a string of asterisks. """ - # If no data is provided, log a message and return an empty list if not data: logger.info("No valid data was provided to obfuscate") return [] - # Obfuscate the PII fields in each record using a list/dict comprehension - # This code is good but makes debugging a bit tricky. I may consider - # breaking it down into a for loop. return [ {k: ("***" if k in pii_fields else v) for k, v in record.items()} for record in data diff --git a/obfuscator/utils.py b/obfuscator/utils.py index 81eb04a..f61451b 100644 --- a/obfuscator/utils.py +++ b/obfuscator/utils.py @@ -5,7 +5,6 @@ from obfuscator.logger import get_logger class Utilities: def __init__(self, logger=None): - # Create the logger self.logger = get_logger("UTILITIES", logger) def get_s3_path(self, uri): diff --git a/test/test_csv_reader.py b/test/test_csv_reader.py index e4c135b..0206542 100644 --- a/test/test_csv_reader.py +++ b/test/test_csv_reader.py @@ -1,6 +1,3 @@ -# csv_reader.py - tests for read_string and read_s3 -# Author: Alex Schofield - import boto3 from moto import mock_aws from obfuscator.csv_reader import CSVReader @@ -8,11 +5,6 @@ import pytest reader = CSVReader() -# CSVREADER: READ_STRING TESTS - -# Check if the function can read a CSV string with no content and return -# an empty list - def test_empty_csv_should_return_no_content(): content = "" @@ -21,10 +13,6 @@ def test_empty_csv_should_return_no_content(): assert result == expected -# Check if the function can read a CSV string with only a header and return -# an empty list - - def test_csv_with_header_only_should_return_no_content(): content = "student_id,name,course\n" result = reader.read_string(content) @@ -32,10 +20,6 @@ def test_csv_with_header_only_should_return_no_content(): assert result == expected -# Check if the function can read a CSV string with valid data and return -# a list of dictionaries - - def test_csv_with_valid_data(): content = ( "student_id,name,course\n" @@ -50,10 +34,6 @@ def test_csv_with_valid_data(): assert result == expected -# Check if the function can read a CSV string with quoted fields and return -# a list of dictionaries with the quoted fields intact - - def test_csv_with_quoted_fields_should_run_as_expected(): content = ( "student_id,name,course\n" @@ -68,9 +48,6 @@ def test_csv_with_quoted_fields_should_run_as_expected(): assert result == expected -# CSVREADER: READ_S3 TESTS - - def setup_s3(s3_client, bucket: str, key: str, content: str): s3_client.create_bucket( Bucket=bucket, diff --git a/test/test_obfuscator.py b/test/test_obfuscator.py index 4f61b16..c77b6b4 100644 --- a/test/test_obfuscator.py +++ b/test/test_obfuscator.py @@ -1,8 +1,5 @@ from obfuscator.obfuscate import obfuscate -# Check if the function does what its supposed to and can obfuscate -# valid PII fields in a list of dictionaries - def test_obfuscate_data_with_valid_pii_fields(): data = [ @@ -39,11 +36,6 @@ def test_obfuscate_data_with_valid_pii_fields(): assert result == expected -# Check if the function can obfuscate data even when some PII -# fields are missing from some of the data, returning a list of dictionaries -# but with the missing PII fields obfuscated and the rest of the data intact - - def test_obfuscate_data_with_missing_pii_field(): data = [ {"student_id": "1234", "name": "John Smith", "course": "Software"}, @@ -69,9 +61,6 @@ def test_obfuscate_data_with_missing_pii_field(): assert result == expected -# Check if the function can handle an empty list of data, returning an empty list - - def test_obfuscate_data_with_no_data(): data = [] pii_fields = ["name", "email_address"] @@ -81,10 +70,6 @@ def test_obfuscate_data_with_no_data(): assert result == expected -# Check if the function can handle an empty list of PII fields, returning the data as is -# without mutating it - - def test_obfuscate_data_with_empty_pii_fields(): data = [ { -- cgit v1.2.3 From ad0328b2f292fe438a8a6a1f7ff2d36856dc578d Mon Sep 17 00:00:00 2001 From: Alex Schofield Date: Wed, 19 Feb 2025 03:27:54 +0000 Subject: modify logger messages to be more clear --- cli.py | 10 +++------- obfuscator/csv_reader.py | 4 +++- obfuscator/csv_writer.py | 4 +--- obfuscator/obfuscate.py | 2 +- 4 files changed, 8 insertions(+), 12 deletions(-) (limited to 'obfuscator/csv_writer.py') diff --git a/cli.py b/cli.py index c03ffd1..40c777f 100644 --- a/cli.py +++ b/cli.py @@ -31,23 +31,19 @@ def main(): args = parser.parse_args() log_level = "DEBUG" if args.verbose else "INFO" - logger = get_logger("CLI", log_level) reader = CSVReader(log_level) if args.local and not args.s3: - logger.debug("User chose to read CSV from local path") + logger.debug("Read data from local path") data = reader.read_local(args.local) - logger.debug("Contents: " + str(data)) else: - logger.debug("User chose to read CSV from S3") - + logger.debug("Read data from S3") data = reader.read_s3(args.s3) - logger.debug("Contents: " + str(data)) obfuscated_data = obfuscate(data, args.pii) - logger.debug("Obfuscated data (JSON): " + json.dumps(obfuscated_data, indent=4)) + return create_byte_stream(obfuscated_data) diff --git a/obfuscator/csv_reader.py b/obfuscator/csv_reader.py index 2b099c8..3649681 100644 --- a/obfuscator/csv_reader.py +++ b/obfuscator/csv_reader.py @@ -45,7 +45,9 @@ class CSVReader: if os.getenv("LOCALSTACK", "FALSE").upper() == "TRUE": localstack_endpoint = "http://localhost.localstack.cloud:4566" - self.logger.debug("Using LocalStack endpoint for S3") + self.logger.debug( + "Using LocalStack endpoint for S3 - ensure LocalStack is running" + ) client = boto3.client( "s3", endpoint_url=localstack_endpoint, diff --git a/obfuscator/csv_writer.py b/obfuscator/csv_writer.py index 56b3f1f..de7cd4b 100644 --- a/obfuscator/csv_writer.py +++ b/obfuscator/csv_writer.py @@ -8,8 +8,7 @@ logger = get_logger("CSVWRITER") def create_byte_stream(data: List[Dict[str, str]]) -> bytes: if not data: - logger.info("No valid data was provided to write") - return b"" + logger.error("Invalid or empty data was provided to write") output = io.StringIO() @@ -20,6 +19,5 @@ def create_byte_stream(data: List[Dict[str, str]]) -> bytes: writer.writerows(data) csv_string = output.getvalue() - logger.debug(f"CSV data: {csv_string}") return csv_string.encode("utf-8") diff --git a/obfuscator/obfuscate.py b/obfuscator/obfuscate.py index e964433..9d43975 100644 --- a/obfuscator/obfuscate.py +++ b/obfuscator/obfuscate.py @@ -12,7 +12,7 @@ def obfuscate( sensitive values with a string of asterisks. """ if not data: - logger.info("No valid data was provided to obfuscate") + logger.error("Invalid or empty was provided to obfuscate") return [] return [ -- cgit v1.2.3 From f1edb55e4f3e2692cb6259cd658c70db6f0cadd4 Mon Sep 17 00:00:00 2001 From: Alex Schofield Date: Wed, 19 Feb 2025 03:30:21 +0000 Subject: improve error handling in create_byte_stream() --- obfuscator/csv_writer.py | 1 + 1 file changed, 1 insertion(+) (limited to 'obfuscator/csv_writer.py') diff --git a/obfuscator/csv_writer.py b/obfuscator/csv_writer.py index de7cd4b..2bff6e0 100644 --- a/obfuscator/csv_writer.py +++ b/obfuscator/csv_writer.py @@ -9,6 +9,7 @@ logger = get_logger("CSVWRITER") def create_byte_stream(data: List[Dict[str, str]]) -> bytes: if not data: logger.error("Invalid or empty data was provided to write") + raise output = io.StringIO() -- cgit v1.2.3 From 422acef7a0762089298e9eae9944877e788fd94d Mon Sep 17 00:00:00 2001 From: Alex Schofield Date: Wed, 19 Feb 2025 03:50:09 +0000 Subject: fix some tests (some are still broken) --- obfuscator/csv_writer.py | 1 - obfuscator/obfuscate.py | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'obfuscator/csv_writer.py') diff --git a/obfuscator/csv_writer.py b/obfuscator/csv_writer.py index 2bff6e0..de7cd4b 100644 --- a/obfuscator/csv_writer.py +++ b/obfuscator/csv_writer.py @@ -9,7 +9,6 @@ logger = get_logger("CSVWRITER") def create_byte_stream(data: List[Dict[str, str]]) -> bytes: if not data: logger.error("Invalid or empty data was provided to write") - raise output = io.StringIO() diff --git a/obfuscator/obfuscate.py b/obfuscator/obfuscate.py index 73448ce..9cd3a03 100644 --- a/obfuscator/obfuscate.py +++ b/obfuscator/obfuscate.py @@ -13,10 +13,10 @@ def obfuscate( """ if not data: logger.error("Invalid or empty was provided to obfuscate") - raise + raise ValueError("Invalid data provided to obfuscate") if not pii_fields: logger.error("No PII fields provided to obfuscate") - raise + raise ValueError("No PII fields provided to obfuscate") return [ {k: ("***" if k in pii_fields else v) for k, v in record.items()} -- cgit v1.2.3 From 1608d01bb68c1f6292b04c70caa609d34943b371 Mon Sep 17 00:00:00 2001 From: Alex Schofield Date: Wed, 19 Feb 2025 15:37:57 +0000 Subject: rename write function & update references --- obfuscator/csv_writer.py | 23 ------------------- obfuscator/write.py | 23 +++++++++++++++++++ test/test_csv_writer.py | 57 ------------------------------------------------ test/test_write.py | 57 ++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 80 insertions(+), 80 deletions(-) delete mode 100644 obfuscator/csv_writer.py create mode 100644 obfuscator/write.py delete mode 100644 test/test_csv_writer.py create mode 100644 test/test_write.py (limited to 'obfuscator/csv_writer.py') diff --git a/obfuscator/csv_writer.py b/obfuscator/csv_writer.py deleted file mode 100644 index de7cd4b..0000000 --- a/obfuscator/csv_writer.py +++ /dev/null @@ -1,23 +0,0 @@ -import csv -import io -from typing import List, Dict -from obfuscator.logger import get_logger - -logger = get_logger("CSVWRITER") - - -def create_byte_stream(data: List[Dict[str, str]]) -> bytes: - if not data: - logger.error("Invalid or empty data was provided to write") - - output = io.StringIO() - - headers = list(data[0].keys()) - - writer = csv.DictWriter(output, fieldnames=headers) - writer.writeheader() - writer.writerows(data) - - csv_string = output.getvalue() - - return csv_string.encode("utf-8") diff --git a/obfuscator/write.py b/obfuscator/write.py new file mode 100644 index 0000000..de7cd4b --- /dev/null +++ b/obfuscator/write.py @@ -0,0 +1,23 @@ +import csv +import io +from typing import List, Dict +from obfuscator.logger import get_logger + +logger = get_logger("CSVWRITER") + + +def create_byte_stream(data: List[Dict[str, str]]) -> bytes: + if not data: + logger.error("Invalid or empty data was provided to write") + + output = io.StringIO() + + headers = list(data[0].keys()) + + writer = csv.DictWriter(output, fieldnames=headers) + writer.writeheader() + writer.writerows(data) + + csv_string = output.getvalue() + + return csv_string.encode("utf-8") diff --git a/test/test_csv_writer.py b/test/test_csv_writer.py deleted file mode 100644 index eceac28..0000000 --- a/test/test_csv_writer.py +++ /dev/null @@ -1,57 +0,0 @@ -import io -import csv -from obfuscator.csv_writer import create_byte_stream - - -def csv_bytes_to_list(csv_bytes: bytes): - csv_string = csv_bytes.decode("utf-8") - f = io.StringIO(csv_string) - reader = csv.DictReader(f) - return [dict(row) for row in reader] - - -def test_create_byte_stream_valid_data(): - data = [ - {"student_id": "1234", "name": "Student 1", "course": "Course 1"}, - {"student_id": "5678", "name": "Student 2", "course": "Course 2"}, - ] - csv_bytes = create_byte_stream(data) - result = csv_bytes_to_list(csv_bytes) - assert result == data - - -def test_create_byte_stream_empty_data(): - csv_bytes = create_byte_stream([]) - assert csv_bytes == b"" - - -def test_create_byte_stream_handles_quoted_fields(): - data = [ - {"student_id": "1234", "name": 'Student "One"', "course": "Course, A"}, - {"student_id": "5678", "name": 'Student "Two"', "course": "Course, B"}, - ] - csv_bytes = create_byte_stream(data) - result = csv_bytes_to_list(csv_bytes) - assert result == data - - -def test_create_byte_stream_consistent_header_order(): - data = [ - {"student_id": "1234", "name": "Alice", "course": "Math"}, - {"student_id": "5678", "name": "Bob", "course": "Science"}, - ] - csv_bytes = create_byte_stream(data) - csv_string = csv_bytes.decode("utf-8") - header_line = csv_string.splitlines()[0] - expected_header = ",".join(data[0].keys()) - assert header_line == expected_header - - -def test_create_byte_stream_special_characters(): - data = [ - {"student_id": "1234", "name": "Student 1", "course": "Line1\nLine2"}, - {"student_id": "5678", "name": "Student 2", "course": "Value with, comma"}, - ] - csv_bytes = create_byte_stream(data) - result = csv_bytes_to_list(csv_bytes) - assert result == data diff --git a/test/test_write.py b/test/test_write.py new file mode 100644 index 0000000..eceac28 --- /dev/null +++ b/test/test_write.py @@ -0,0 +1,57 @@ +import io +import csv +from obfuscator.csv_writer import create_byte_stream + + +def csv_bytes_to_list(csv_bytes: bytes): + csv_string = csv_bytes.decode("utf-8") + f = io.StringIO(csv_string) + reader = csv.DictReader(f) + return [dict(row) for row in reader] + + +def test_create_byte_stream_valid_data(): + data = [ + {"student_id": "1234", "name": "Student 1", "course": "Course 1"}, + {"student_id": "5678", "name": "Student 2", "course": "Course 2"}, + ] + csv_bytes = create_byte_stream(data) + result = csv_bytes_to_list(csv_bytes) + assert result == data + + +def test_create_byte_stream_empty_data(): + csv_bytes = create_byte_stream([]) + assert csv_bytes == b"" + + +def test_create_byte_stream_handles_quoted_fields(): + data = [ + {"student_id": "1234", "name": 'Student "One"', "course": "Course, A"}, + {"student_id": "5678", "name": 'Student "Two"', "course": "Course, B"}, + ] + csv_bytes = create_byte_stream(data) + result = csv_bytes_to_list(csv_bytes) + assert result == data + + +def test_create_byte_stream_consistent_header_order(): + data = [ + {"student_id": "1234", "name": "Alice", "course": "Math"}, + {"student_id": "5678", "name": "Bob", "course": "Science"}, + ] + csv_bytes = create_byte_stream(data) + csv_string = csv_bytes.decode("utf-8") + header_line = csv_string.splitlines()[0] + expected_header = ",".join(data[0].keys()) + assert header_line == expected_header + + +def test_create_byte_stream_special_characters(): + data = [ + {"student_id": "1234", "name": "Student 1", "course": "Line1\nLine2"}, + {"student_id": "5678", "name": "Student 2", "course": "Value with, comma"}, + ] + csv_bytes = create_byte_stream(data) + result = csv_bytes_to_list(csv_bytes) + assert result == data -- cgit v1.2.3