def create_byte_stream(data: List[Dict[str, str]]) -> bytes:
    """Serialise a list of row dictionaries to UTF-8 encoded CSV bytes.

    The header row is the union of the keys of every row, in first-seen
    order, so a row carrying a key that is absent from the first row is
    written instead of being rejected by ``csv.DictWriter`` with a
    ``ValueError``. A key missing from a given row is emitted as an empty
    field (the ``DictWriter`` default ``restval``). When every row has the
    same keys the output equals what a first-row-only header would produce.

    Args:
        data: Rows to write; each dict maps a column name to its value.

    Returns:
        The CSV text (header line followed by one line per row) encoded as
        UTF-8, or ``b""`` when ``data`` is empty.
    """
    if not data:
        logger.info("No valid data was provided to write")
        return b""

    # dict.fromkeys de-duplicates while keeping insertion (first-seen) order.
    headers = list(dict.fromkeys(key for row in data for key in row))

    output = io.StringIO()
    writer = csv.DictWriter(output, fieldnames=headers)
    writer.writeheader()
    writer.writerows(data)

    csv_string = output.getvalue()
    logger.debug(f"CSV data: {csv_string}")

    return csv_string.encode("utf-8")
def csv_bytes_to_list(csv_bytes: bytes):
    """Decode UTF-8 CSV bytes and return the data rows as plain dicts.

    The first CSV line is consumed as the header by ``csv.DictReader``;
    each following record becomes one ``dict`` keyed by those header names.
    """
    text_buffer = io.StringIO(csv_bytes.decode("utf-8"))
    return list(map(dict, csv.DictReader(text_buffer)))


def test_create_byte_stream_valid_data():
    """Plain rows come back unchanged after writing and re-reading."""
    rows = [
        {"student_id": "1234", "name": "Student 1", "course": "Course 1"},
        {"student_id": "5678", "name": "Student 2", "course": "Course 2"},
    ]
    assert csv_bytes_to_list(create_byte_stream(rows)) == rows


def test_create_byte_stream_empty_data():
    """An empty row list produces an empty byte string."""
    assert create_byte_stream([]) == b""


def test_create_byte_stream_handles_quoted_fields():
    """Values containing double quotes and commas survive the round trip."""
    rows = [
        {"student_id": "1234", "name": 'Student "One"', "course": "Course, A"},
        {"student_id": "5678", "name": 'Student "Two"', "course": "Course, B"},
    ]
    assert csv_bytes_to_list(create_byte_stream(rows)) == rows


def test_create_byte_stream_consistent_header_order():
    """The first output line lists the first row's keys in their dict order."""
    rows = [
        {"student_id": "1234", "name": "Alice", "course": "Math"},
        {"student_id": "5678", "name": "Bob", "course": "Science"},
    ]
    first_line = create_byte_stream(rows).decode("utf-8").splitlines()[0]
    assert first_line == ",".join(rows[0])


def test_create_byte_stream_special_characters():
    """An embedded newline and an embedded comma survive the round trip."""
    rows = [
        {"student_id": "1234", "name": "Student 1", "course": "Line1\nLine2"},
        {"student_id": "5678", "name": "Student 2", "course": "Value with, comma"},
    ]
    assert csv_bytes_to_list(create_byte_stream(rows)) == rows