From 792b3589535e747ad25151ed98f3729d1e356aca Mon Sep 17 00:00:00 2001 From: Alex Schofield Date: Mon, 17 Feb 2025 12:37:35 +0000 Subject: create initial function for obfuscation --- obfuscator/obfuscate.py | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 obfuscator/obfuscate.py diff --git a/obfuscator/obfuscate.py b/obfuscator/obfuscate.py new file mode 100644 index 0000000..6c0414e --- /dev/null +++ b/obfuscator/obfuscate.py @@ -0,0 +1,7 @@ +from typing import List, Dict +from obfuscator.logger import get_logger + +logger = get_logger("Obfuscator") + +def obfuscate(data: List[Dict[str, str]], pii_fields: List[str]) -> List[Dict[str,str]]: + pass \ No newline at end of file -- cgit v1.2.3 From 4555ccc1bb60cd3624c69bde536b905bafa3a19f Mon Sep 17 00:00:00 2001 From: Alex Schofield Date: Mon, 17 Feb 2025 12:45:27 +0000 Subject: add tests for obfuscate() using mock data --- test/test_obfuscator.py | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 test/test_obfuscator.py diff --git a/test/test_obfuscator.py b/test/test_obfuscator.py new file mode 100644 index 0000000..cfc88e5 --- /dev/null +++ b/test/test_obfuscator.py @@ -0,0 +1,47 @@ +from obfuscator.obfuscate import obfuscate + +def test_obfuscate_data_with_valid_pii_fields(): + data = [ + {"student_id": "1234", "name": "John Smith", "course": "Software", "email_address": "j.smith@email.com"}, + {"student_id": "5678", "name": "Jane Doe", "course": "Data Science", "email_address": "j.doe@email.com"} + ] + pii_fields = ["name", "email_address"] + expected = [ + {"student_id": "1234", "name": "***", "course": "Software", "email_address": "***"}, + {"student_id": "5678", "name": "***", "course": "Data Science", "email_address": "***"} + ] + + result = obfuscate(data, pii_fields) + assert result == expected + +def test_obfuscate_data_with_missing_pii_field(): + data = [ + {"student_id": "1234", "name": "John Smith", "course": "Software"}, + {"student_id": "5678", "name": "Jane Doe", "course": "Data Science", "email_address": "j.doe@email.com"} + ] + pii_fields = ["name", "email_address"] + expected = [ + {"student_id": "1234", "name": "***", "course": "Software"}, + {"student_id": "5678", "name": "***", "course": "Data Science", "email_address": "***"} + ] + + result = obfuscate(data, pii_fields) + assert result == expected + +def test_obfuscate_data_with_no_data(): + data = [] + pii_fields = ["name", "email_address"] + expected = [] + + result = obfuscate(data, pii_fields) + assert result == expected + +def test_obfuscate_data_with_empty_pii_fields(): + data = [ + {"student_id": "1234", "name": "John Smith", "course": "Software", "email_address": "j.smith@email.com"} + ] + pii_fields = [] + expected = data.copy() + + result = obfuscate(data, pii_fields) + assert result == expected \ No newline at end of file -- cgit v1.2.3 From 97b345545b950615ebd1df77fd0c1a8a4300495a Mon Sep 17 00:00:00 2001 From: Alex Schofield Date: Mon, 17 Feb 2025 12:50:36 +0000 Subject: use list/dictionary comprehension to obfuscate pii data --- obfuscator/obfuscate.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/obfuscator/obfuscate.py b/obfuscator/obfuscate.py index 6c0414e..c9c116c 100644 --- a/obfuscator/obfuscate.py +++ b/obfuscator/obfuscate.py @@ -4,4 +4,12 @@ from obfuscator.logger import get_logger logger = get_logger("Obfuscator") def obfuscate(data: List[Dict[str, str]], pii_fields: List[str]) -> List[Dict[str,str]]: - pass \ No newline at end of file + if not data: + logger.info("No valid data was provided to obfuscate") + return [] + + return [ + {k: ("***" if k in pii_fields else v) for k, v in record.items()} + for record in data + ] + \ No newline at end of file -- cgit v1.2.3 From 82ea5c29c444bc9d050d80c0ad24ed7975c49595 Mon Sep 17 00:00:00 2001 From: Alex Schofield Date: Mon, 17 Feb 2025 12:57:35 +0000 Subject: add cli argument for pii fields to obfuscate --- cli.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cli.py b/cli.py index 19e64d8..7fb4393 100644 --- a/cli.py +++ b/cli.py @@ -1,5 +1,6 @@ import argparse from obfuscator.csv_reader import CSVReader +from obfuscator.obfuscate import obfuscate from obfuscator.logger import get_logger logger = get_logger("CLI") @@ -11,6 +12,9 @@ def main(): loc = parser.add_mutually_exclusive_group(required=True) loc.add_argument("--local") loc.add_argument("--s3") + + parser.add_argument("--pii", nargs="+", required=True) + args = parser.parse_args() if args.local and not args.s3: -- cgit v1.2.3 From 08ab3d77f1a36c72340858a8ff8c3e57246dc8f4 Mon Sep 17 00:00:00 2001 From: Alex Schofield Date: Mon, 17 Feb 2025 12:58:44 +0000 Subject: debug log csv data instead of printing --- cli.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cli.py b/cli.py index 7fb4393..f5590e6 100644 --- a/cli.py +++ b/cli.py @@ -14,14 +14,14 @@ def main(): loc.add_argument("--s3") parser.add_argument("--pii", nargs="+", required=True) - + args = parser.parse_args() if args.local and not args.s3: logger.debug("User chose to read CSV from local path") reader = CSVReader() data = reader.read_local(args.local) - print(data) + logger.debug(data) else: logger.debug("User chose to read CSV from S3") -- cgit v1.2.3 From 74843c48b7aa6f862b2965d590a711aa4cfc5f42 Mon Sep 17 00:00:00 2001 From: Alex Schofield Date: Mon, 17 Feb 2025 13:00:39 +0000 Subject: obfuscate the data in cli.py and debug log as json --- cli.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cli.py b/cli.py index f5590e6..c6442c7 100644 --- a/cli.py +++ b/cli.py @@ -1,4 +1,5 @@ import argparse +import json from obfuscator.csv_reader import CSVReader from obfuscator.obfuscate import obfuscate from obfuscator.logger import get_logger @@ -25,6 +26,9 @@ def main(): else: logger.debug("User chose to read CSV from S3") + obfuscated_data = obfuscate(data, args.pii) + logger.debug(json.dumps(obfuscated_data, indent=4)) + if __name__ == "__main__": main() -- cgit v1.2.3 From e796c7bb6cc6de6368c2d195e233d0b11cf7e699 Mon Sep 17 00:00:00 2001 From: "deepsource-autofix[bot]" <62050782+deepsource-autofix[bot]@users.noreply.github.com> Date: Mon, 17 Feb 2025 13:04:43 +0000 Subject: style: format code with Autopep8, Black and Ruff Formatter This commit fixes the style issues introduced in 74843c4 according to the output from Autopep8, Black and Ruff Formatter. Details: https://github.com/ajschofield/gdpr-obfuscator/pull/2 --- obfuscator/obfuscate.py | 8 ++++--- test/test_obfuscator.py | 63 +++++++++++++++++++++++++++++++++++++++---------- 2 files changed, 56 insertions(+), 15 deletions(-) diff --git a/obfuscator/obfuscate.py b/obfuscator/obfuscate.py index c9c116c..ac0bd21 100644 --- a/obfuscator/obfuscate.py +++ b/obfuscator/obfuscate.py @@ -3,13 +3,15 @@ from obfuscator.logger import get_logger logger = get_logger("Obfuscator") -def obfuscate(data: List[Dict[str, str]], pii_fields: List[str]) -> List[Dict[str,str]]: + +def obfuscate( + data: List[Dict[str, str]], pii_fields: List[str] +) -> List[Dict[str, str]]: if not data: logger.info("No valid data was provided to obfuscate") return [] - + return [ {k: ("***" if k in pii_fields else v) for k, v in record.items()} for record in data ] - \ No newline at end of file diff --git a/test/test_obfuscator.py b/test/test_obfuscator.py index cfc88e5..c77b6b4 100644 --- a/test/test_obfuscator.py +++ b/test/test_obfuscator.py @@ -1,47 +1,86 @@ from obfuscator.obfuscate import obfuscate + def test_obfuscate_data_with_valid_pii_fields(): data = [ - {"student_id": "1234", "name": "John Smith", "course": "Software", "email_address": "j.smith@email.com"}, - {"student_id": "5678", "name": "Jane Doe", "course": "Data Science", "email_address": "j.doe@email.com"} + { + "student_id": "1234", + "name": "John Smith", + "course": "Software", + "email_address": "j.smith@email.com", + }, + { + "student_id": "5678", + "name": "Jane Doe", + "course": "Data Science", + "email_address": "j.doe@email.com", + }, ] pii_fields = ["name", "email_address"] expected = [ - {"student_id": "1234", "name": "***", "course": "Software", "email_address": "***"}, - {"student_id": "5678", "name": "***", "course": "Data Science", "email_address": "***"} + { + "student_id": "1234", + "name": "***", + "course": "Software", + "email_address": "***", + }, + { + "student_id": "5678", + "name": "***", + "course": "Data Science", + "email_address": "***", + }, ] - + result = obfuscate(data, pii_fields) assert result == expected + def test_obfuscate_data_with_missing_pii_field(): data = [ {"student_id": "1234", "name": "John Smith", "course": "Software"}, - {"student_id": "5678", "name": "Jane Doe", "course": "Data Science", "email_address": "j.doe@email.com"} + { + "student_id": "5678", + "name": "Jane Doe", + "course": "Data Science", + "email_address": "j.doe@email.com", + }, ] pii_fields = ["name", "email_address"] expected = [ {"student_id": "1234", "name": "***", "course": "Software"}, - {"student_id": "5678", "name": "***", "course": "Data Science", "email_address": "***"} + { + "student_id": "5678", + "name": "***", + "course": "Data Science", + "email_address": "***", + }, ] - + result = obfuscate(data, pii_fields) assert result == expected + def test_obfuscate_data_with_no_data(): data = [] pii_fields = ["name", "email_address"] expected = [] - + result = obfuscate(data, pii_fields) assert result == expected + def test_obfuscate_data_with_empty_pii_fields(): data = [ - {"student_id": "1234", "name": "John Smith", "course": "Software", "email_address": "j.smith@email.com"} + { + "student_id": "1234", + "name": "John Smith", + "course": "Software", + "email_address": "j.smith@email.com", + } ] pii_fields = [] expected = data.copy() - + result = obfuscate(data, pii_fields) - assert result == expected \ No newline at end of file + assert result == expected -- cgit v1.2.3