diff options
| author | Alex <git@ajschof.me> | 2025-02-17 16:47:47 +0000 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2025-02-17 16:47:47 +0000 |
| commit | 00917b8ecf67de9e955479be555d74fcc8257020 (patch) | |
| tree | 17dd9b2e85866f85bdbb3702185463b13c911a28 | |
| parent | bf323b8c2ebd47bb446ba773027f389a0887e325 (diff) | |
| parent | e2b0f2553b8dfcbe39f6e6fdc86ca68cc63f5705 (diff) | |
| download | gdpr-obfuscator-00917b8ecf67de9e955479be555d74fcc8257020.tar.gz gdpr-obfuscator-00917b8ecf67de9e955479be555d74fcc8257020.zip | |
Merge pull request #3 from ajschofield/add-docs
update README & add comments in src code
| -rw-r--r-- | README.md | 54 | ||||
| -rw-r--r-- | cli.py | 16 | ||||
| -rw-r--r-- | obfuscator/csv_reader.py | 33 | ||||
| -rw-r--r-- | obfuscator/obfuscate.py | 10 | ||||
| -rw-r--r-- | test/test_csv_reader.py | 20 | ||||
| -rw-r--r-- | test/test_obfuscator.py | 12 |
6 files changed, 119 insertions, 26 deletions
@@ -1,14 +1,58 @@ -- [gdpr-obfuscator](#gdpr-obfuscator) - * [Minimum Viable Product (MVP)](#minimum-viable-product--mvp-) - * [Setup](#setup) - * [Usage](#usage) +# GDPR Obfuscator - Launchpad Project + +1. [Overview](#overview) +2. [Minimum Viable Product (MVP)](#minimum-viable-product-mvp) + 1. [Additional Features](#additional-features) +3. [Setup](#setup) + 1. [Prerequisites](#prerequisites) + 2. [Installation](#installation) +4. [Usage](#usage) ## Overview -A Python library designed to detect and remove Personally Identifiable Information (PII) from CSV files stored in an AWS S3 bucket. +A Python library designed to detect and remove Personally Identifiable Information (PII) from data in formats such as CSV, JSON and Parquet. ## Minimum Viable Product (MVP) +The MVP covers: +1. Reading a JSON string containing the S3 location of the CSV file and the names of the fields that are required to be obfuscated +2. Ingesting the CSV file containing data records (with a primary key) from an AWS S3 bucket +3. Obfuscating chosen PII fields (e.g. `name`, `email_address`) by replacing their values with an obfuscated string (`***`) +4. Producing an output CSV file (or a byte-stream) that maintains the original structure but with sensitive fields changed + +This meets the requirement under the General Data Protection Regulation [(GDPR)](https://ico.org.uk/media/for-organisations/guide-to-data-protection/guide-to-the-general-data-protection-regulation-gdpr-1-1.pdf) that all data containing information that can be used to identify an individual be anonymised. 
+ +### Additional Features + +*(Ranked in order of priority from high to low)* + +- [ ] **Support for JSON and Parquet formats**: Extend the library to support reading and writing data in JSON and Parquet formats +- [ ] **Command-line interface**: Create a command-line interface to allow users to run the obfuscation process from the terminal +- [ ] **Support for multiple sources**: Extend the library to support reading data from multiple sources (e.g. local file system) + ## Setup +### Prerequisites + +- Python >= 3.13 +- Poetry >= 2.0.1 + +### Installation + +1. Clone the repository: + +```bash +git clone --recurse-submodules https://github.com/ajschofield/gdpr-obfuscator.git +cd gdpr-obfuscator +``` + +2. Install dependencies using poetry + +```bash +# Production +poetry install +# Developer (optional) +poetry install --dev +``` + ## Usage @@ -4,31 +4,43 @@ from obfuscator.csv_reader import CSVReader from obfuscator.obfuscate import obfuscate from obfuscator.logger import get_logger +# Create the logger logger = get_logger("CLI") - def main(): + # Create an argument parser parser = argparse.ArgumentParser(description="gdpr-obfuscator") # Require user to either choose a local file or an S3 object + # The user can only choose one of these options or the program will exit + # If not provided, the program will exit loc = parser.add_mutually_exclusive_group(required=True) loc.add_argument("--local") loc.add_argument("--s3") + # Require user to provide a list of PII fields to obfuscate + # e.g. 
--pii name email_address + # If not provided, the program will exit parser.add_argument("--pii", nargs="+", required=True) + # Parse the arguments args = parser.parse_args() + # Read the CSV data based on the user's choice of local or S3 if args.local and not args.s3: logger.debug("User chose to read CSV from local path") + # Create a CSVReader object and read the local CSV file reader = CSVReader() data = reader.read_local(args.local) + # For debug purposes, log the data read from the CSV logger.debug(data) else: logger.debug("User chose to read CSV from S3") + # Obfuscate the data based on the user's choice of PII fields obfuscated_data = obfuscate(data, args.pii) + # For debug purposes, log the obfuscated data as JSON for readability logger.debug(json.dumps(obfuscated_data, indent=4)) - +# If the script is run directly (as it should be), call the main function if __name__ == "__main__": main() diff --git a/obfuscator/csv_reader.py b/obfuscator/csv_reader.py index b9dccdb..23a34fc 100644 --- a/obfuscator/csv_reader.py +++ b/obfuscator/csv_reader.py @@ -3,32 +3,61 @@ import io from typing import List, Dict from obfuscator.logger import get_logger +# Create the logger logger = get_logger("CSVReader") - +# Putting the CSV reading components into a class may seem like overkill +# for a simple script, but it allows for better organization and scalability. +# @staticmethod is used to define the method without an instance of the class +# being required. The methods could be defined just as functions, and this +# may still be changed. class CSVReader: + """ + A class to read CSV data from a local file, S3 object, or string. Near + the project completion, support for JSON/Parquet files will be added. + """ @staticmethod def read_local(path) -> List[Dict[str, str]]: + """ + A method to read a local CSV file and return the data as a list of + dictionaries. 
+ """ + # Log the path of the file being read for debugging logger.debug(f"Reading local CSV from: {path}") - + + # Attempt to read the file and return the data as a list of dictionaries + # If the file isn't found, log the error and re-raise it; any other exception + # is only logged and not re-raised, so the method implicitly returns None try: with open(path, mode="r", encoding="utf-8") as f: reader = csv.DictReader(f) return [dict(row) for row in reader] except FileNotFoundError: logger.error(f"File not found: {path}") + raise except Exception as e: logger.error(f"Error reading file: {e}") @staticmethod def read_s3(path) -> List[Dict[str, str]]: + """ + A method to read an S3 object containing CSV data + and return the data as a list of dictionaries. + """ + # Yet to be implemented. return [] @staticmethod def read_string(content: str) -> List[Dict[str, str]]: + """ + A method to read CSV data from a string and return the data as a list + of dictionaries. + """ + # If the content is empty, return an empty list if not content.strip(): return [] + # Treat the string as a file-like object and return as list of dictionaries f = io.StringIO(content) reader = csv.DictReader(f) return [dict(row) for row in reader] diff --git a/obfuscator/obfuscate.py b/obfuscator/obfuscate.py index ac0bd21..3da9155 100644 --- a/obfuscator/obfuscate.py +++ b/obfuscator/obfuscate.py @@ -1,16 +1,24 @@ from typing import List, Dict from obfuscator.logger import get_logger +# Create the logger logger = get_logger("Obfuscator") - def obfuscate( data: List[Dict[str, str]], pii_fields: List[str] ) -> List[Dict[str, str]]: + """ + A function to obfuscate PII fields in a list of dictionaries, replacing + sensitive values with a string of asterisks. + """ + # If no data is provided, log a message and return an empty list if not data: logger.info("No valid data was provided to obfuscate") return [] + # Obfuscate the PII fields in each record using a list/dict comprehension + # This code is good but makes debugging a bit tricky. 
I may consider + # breaking it down into a for loop. return [ {k: ("***" if k in pii_fields else v) for k, v in record.items()} for record in data diff --git a/test/test_csv_reader.py b/test/test_csv_reader.py index e62c093..1b3d071 100644 --- a/test/test_csv_reader.py +++ b/test/test_csv_reader.py @@ -2,25 +2,27 @@ # Author: Alex Schofield from obfuscator.csv_reader import CSVReader -import pytest reader = CSVReader() - +# Check if the function can read a CSV string with no content and return +# an empty list def test_empty_csv_should_return_no_content(): content = "" result = reader.read_string(content) expected = [] assert result == expected - +# Check if the function can read a CSV string with only a header and return +# an empty list def test_csv_with_header_only_should_return_no_content(): content = "student_id,name,course\n" result = reader.read_string(content) expected = [] assert result == expected - +# Check if the function can read a CSV string with valid data and return +# a list of dictionaries def test_csv_with_valid_data(): content = ( "student_id,name,course\n" @@ -34,7 +36,8 @@ def test_csv_with_valid_data(): ] assert result == expected - +# Check if the function can read a CSV string with quoted fields and return +# a list of dictionaries with the quoted fields intact def test_csv_with_quoted_fields_should_run_as_expected(): content = ( "student_id,name,course\n" @@ -47,10 +50,3 @@ def test_csv_with_quoted_fields_should_run_as_expected(): {"student_id": "5678", "name": "Student 2", "course": "Course 2"}, ] assert result == expected - - -def test_non_csv_file_should_return_no_content(): - content = "" - result = reader.read_string(content) - expected = [] - assert result == expected diff --git a/test/test_obfuscator.py b/test/test_obfuscator.py index c77b6b4..cc7d2c1 100644 --- a/test/test_obfuscator.py +++ b/test/test_obfuscator.py @@ -1,6 +1,7 @@ from obfuscator.obfuscate import obfuscate - +# Check if the function does what it's supposed to and 
can obfuscate +# valid PII fields in a list of dictionaries def test_obfuscate_data_with_valid_pii_fields(): data = [ { @@ -35,7 +36,9 @@ def test_obfuscate_data_with_valid_pii_fields(): result = obfuscate(data, pii_fields) assert result == expected - +# Check if the function can obfuscate data even when some PII +# fields are missing from some of the data, returning a list of dictionaries +# with the PII fields that are present obfuscated and the rest of the data intact def test_obfuscate_data_with_missing_pii_field(): data = [ {"student_id": "1234", "name": "John Smith", "course": "Software"}, @@ -60,7 +63,7 @@ def test_obfuscate_data_with_missing_pii_field(): result = obfuscate(data, pii_fields) assert result == expected - +# Check if the function can handle an empty list of data, returning an empty list def test_obfuscate_data_with_no_data(): data = [] pii_fields = ["name", "email_address"] @@ -69,7 +72,8 @@ def test_obfuscate_data_with_no_data(): result = obfuscate(data, pii_fields) assert result == expected - +# Check if the function can handle an empty list of PII fields, returning the data as is +# without mutating it def test_obfuscate_data_with_empty_pii_fields(): data = [ { |
