aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--README.md54
-rw-r--r--cli.py16
-rw-r--r--obfuscator/csv_reader.py33
-rw-r--r--obfuscator/obfuscate.py10
-rw-r--r--test/test_csv_reader.py20
-rw-r--r--test/test_obfuscator.py12
6 files changed, 119 insertions, 26 deletions
diff --git a/README.md b/README.md
index 430fcdc..5cd4bfb 100644
--- a/README.md
+++ b/README.md
@@ -1,14 +1,58 @@
-- [gdpr-obfuscator](#gdpr-obfuscator)
- * [Minimum Viable Product (MVP)](#minimum-viable-product--mvp-)
- * [Setup](#setup)
- * [Usage](#usage)
+# GDPR Obfuscator - Launchpad Project
+
+1. [Overview](#overview)
+2. [Minimum Viable Product (MVP)](#minimum-viable-product-mvp)
+ 1. [Additional Features](#additional-features)
+3. [Setup](#setup)
+ 1. [Prerequisites](#prerequisites)
+ 2. [Installation](#installation)
+4. [Usage](#usage)
## Overview
-A Python library designed to detect and remove Personally Identifiable Information (PII) from CSV files stored in an AWS S3 bucket.
+A Python library designed to detect and remove Personally Identifiable Information (PII) from data in formats such as CSV, JSON and Parquet.
## Minimum Viable Product (MVP)
+The MVP covers:
+1. Reading a JSON string containing the S3 location of the CSV file and the names of the fields that are required to be obfuscated
+2. Ingesting the CSV file containing data records (with a primary key) from an AWS S3 bucket
+3. Obfuscating chosen PII fields (e.g. `name`, `email_address`) by replacing their values with an obfuscated string (`***`)
+4. Producing an output CSV file (or a byte-stream) that maintains the original structure but with sensitive fields changed
+
+This meets the requirement under the General Data Protection Regulation [(GDPR)](https://ico.org.uk/media/for-organisations/guide-to-data-protection/guide-to-the-general-data-protection-regulation-gdpr-1-1.pdf) that all data containing information that can be used to identify an individual be anonymised.
+
+### Additional Features
+
+*(Ranked in order of priority from high to low)*
+
+- [ ] **Support for JSON and Parquet formats**: Extend the library to support reading and writing data in JSON and Parquet formats
+- [ ] **Command-line interface**: Create a command-line interface to allow users to run the obfuscation process from the terminal
+- [ ] **Support for multiple sources**: Extend the library to support reading data from multiple sources (e.g. local file system)
+
## Setup
+### Prerequisites
+
+- Python >= 3.13
+- Poetry >= 2.0.1
+
+### Installation
+
+1. Clone the repository:
+
+```bash
+git clone --recurse-submodules https://github.com/ajschofield/gdpr-obfuscator.git
+cd gdpr-obfuscator
+```
+
+2. Install dependencies using poetry
+
+```bash
+# Production
+poetry install
+# Developer (optional)
+poetry install --with dev
+```
+
## Usage
diff --git a/cli.py b/cli.py
index c6442c7..bd4f79d 100644
--- a/cli.py
+++ b/cli.py
@@ -4,31 +4,43 @@ from obfuscator.csv_reader import CSVReader
from obfuscator.obfuscate import obfuscate
from obfuscator.logger import get_logger
+# Create the logger
logger = get_logger("CLI")
-
def main():
+ # Create an argument parser
parser = argparse.ArgumentParser(description="gdpr-obfuscator")
# Require user to either choose a local file or an S3 object
+ # The user can only choose one of these options or the program will exit
+ # If not provided, the program will exit
loc = parser.add_mutually_exclusive_group(required=True)
loc.add_argument("--local")
loc.add_argument("--s3")
+ # Require user to provide a list of PII fields to obfuscate
+ # e.g. --pii name email_address
+ # If not provided, the program will exit
parser.add_argument("--pii", nargs="+", required=True)
+ # Parse the arguments
args = parser.parse_args()
+ # Read the CSV data based on the user's choice of local or S3
if args.local and not args.s3:
logger.debug("User chose to read CSV from local path")
+ # Create a CSVReader object and read the local CSV file
reader = CSVReader()
data = reader.read_local(args.local)
+ # For debug purposes, log the data read from the CSV
logger.debug(data)
else:
logger.debug("User chose to read CSV from S3")
+ # Obfuscate the data based on the user's choice of PII fields
obfuscated_data = obfuscate(data, args.pii)
+ # For debug purposes, log the obfuscated data as JSON for readability
logger.debug(json.dumps(obfuscated_data, indent=4))
-
+# If the script is run directly (as it should be), call the main function
if __name__ == "__main__":
main()
diff --git a/obfuscator/csv_reader.py b/obfuscator/csv_reader.py
index b9dccdb..23a34fc 100644
--- a/obfuscator/csv_reader.py
+++ b/obfuscator/csv_reader.py
@@ -3,32 +3,61 @@ import io
from typing import List, Dict
from obfuscator.logger import get_logger
+# Create the logger
logger = get_logger("CSVReader")
-
+# Putting the CSV reading components into a class may seem like overkill
+# for a simple script, but it allows for better organization and scalability.
+# @staticmethod is used to define the method without an instance of the class
+# being required. The methods could be defined just as functions, and this
+# may still be changed.
class CSVReader:
+ """
+ A class to read CSV data from a local file, S3 object, or string.
+ Support for JSON/Parquet files will be added closer to project completion.
+ """
@staticmethod
def read_local(path) -> List[Dict[str, str]]:
+ """
+ A method to read a local CSV file and return the data as a list of
+ dictionaries.
+ """
+ # Log the path of the file being read for debugging
logger.debug(f"Reading local CSV from: {path}")
-
+
+ # Attempt to read the file and return the data as a list of dictionaries
+ # If the file isn't found, log the error and re-raise it; any other
+ # exception is logged but not re-raised
try:
with open(path, mode="r", encoding="utf-8") as f:
reader = csv.DictReader(f)
return [dict(row) for row in reader]
except FileNotFoundError:
logger.error(f"File not found: {path}")
+ raise
except Exception as e:
logger.error(f"Error reading file: {e}")
@staticmethod
def read_s3(path) -> List[Dict[str, str]]:
+ """
+ A method to read an S3 object containing CSV data
+ and return the data as a list of dictionaries.
+ """
+ # Yet to be implemented.
return []
@staticmethod
def read_string(content: str) -> List[Dict[str, str]]:
+ """
+ A method to read CSV data from a string and return the data as a list
+ of dictionaries.
+ """
+ # If the content is empty, return an empty list
if not content.strip():
return []
+ # Treat the string as a file-like object and return as list of dictionaries
f = io.StringIO(content)
reader = csv.DictReader(f)
return [dict(row) for row in reader]
diff --git a/obfuscator/obfuscate.py b/obfuscator/obfuscate.py
index ac0bd21..3da9155 100644
--- a/obfuscator/obfuscate.py
+++ b/obfuscator/obfuscate.py
@@ -1,16 +1,24 @@
from typing import List, Dict
from obfuscator.logger import get_logger
+# Create the logger
logger = get_logger("Obfuscator")
-
def obfuscate(
data: List[Dict[str, str]], pii_fields: List[str]
) -> List[Dict[str, str]]:
+ """
+ A function to obfuscate PII fields in a list of dictionaries, replacing
+ sensitive values with a string of asterisks.
+ """
+ # If no data is provided, log a message and return an empty list
if not data:
logger.info("No valid data was provided to obfuscate")
return []
+ # Obfuscate the PII fields in each record using a list/dict comprehension
+ # This code is good but makes debugging a bit tricky. I may consider
+ # breaking it down into a for loop.
return [
{k: ("***" if k in pii_fields else v) for k, v in record.items()}
for record in data
diff --git a/test/test_csv_reader.py b/test/test_csv_reader.py
index e62c093..1b3d071 100644
--- a/test/test_csv_reader.py
+++ b/test/test_csv_reader.py
@@ -2,25 +2,27 @@
# Author: Alex Schofield
from obfuscator.csv_reader import CSVReader
-import pytest
reader = CSVReader()
-
+# Check if the function can read a CSV string with no content and return
+# an empty list
def test_empty_csv_should_return_no_content():
content = ""
result = reader.read_string(content)
expected = []
assert result == expected
-
+# Check if the function can read a CSV string with only a header and return
+# an empty list
def test_csv_with_header_only_should_return_no_content():
content = "student_id,name,course\n"
result = reader.read_string(content)
expected = []
assert result == expected
-
+# Check if the function can read a CSV string with valid data and return
+# a list of dictionaries
def test_csv_with_valid_data():
content = (
"student_id,name,course\n"
@@ -34,7 +36,8 @@ def test_csv_with_valid_data():
]
assert result == expected
-
+# Check if the function can read a CSV string with quoted fields and return
+# a list of dictionaries with the quoted fields intact
def test_csv_with_quoted_fields_should_run_as_expected():
content = (
"student_id,name,course\n"
@@ -47,10 +50,3 @@ def test_csv_with_quoted_fields_should_run_as_expected():
{"student_id": "5678", "name": "Student 2", "course": "Course 2"},
]
assert result == expected
-
-
-def test_non_csv_file_should_return_no_content():
- content = ""
- result = reader.read_string(content)
- expected = []
- assert result == expected
diff --git a/test/test_obfuscator.py b/test/test_obfuscator.py
index c77b6b4..cc7d2c1 100644
--- a/test/test_obfuscator.py
+++ b/test/test_obfuscator.py
@@ -1,6 +1,7 @@
from obfuscator.obfuscate import obfuscate
-
+# Check if the function does what it's supposed to and can obfuscate
+# valid PII fields in a list of dictionaries
def test_obfuscate_data_with_valid_pii_fields():
data = [
{
@@ -35,7 +36,9 @@ def test_obfuscate_data_with_valid_pii_fields():
result = obfuscate(data, pii_fields)
assert result == expected
-
+# Check if the function can obfuscate data even when some PII
+# fields are missing from some of the records, returning a list of dictionaries
+# with the PII fields that are present obfuscated and the rest of the data intact
def test_obfuscate_data_with_missing_pii_field():
data = [
{"student_id": "1234", "name": "John Smith", "course": "Software"},
@@ -60,7 +63,7 @@ def test_obfuscate_data_with_missing_pii_field():
result = obfuscate(data, pii_fields)
assert result == expected
-
+# Check if the function can handle an empty list of data, returning an empty list
def test_obfuscate_data_with_no_data():
data = []
pii_fields = ["name", "email_address"]
@@ -69,7 +72,8 @@ def test_obfuscate_data_with_no_data():
result = obfuscate(data, pii_fields)
assert result == expected
-
+# Check if the function can handle an empty list of PII fields, returning the data as is
+# without mutating it
def test_obfuscate_data_with_empty_pii_fields():
data = [
{
git.ajschof.me — hosted by ajschofield — powered by cgit