about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- .gitignore 5
-rw-r--r-- cli.py 26
-rw-r--r-- obfuscator/__init__.py 0
-rw-r--r-- obfuscator/csv_reader.py 34
-rw-r--r-- obfuscator/logger.py 24
-rw-r--r-- pyproject.toml 25
-rw-r--r-- pytest.ini 2
-rw-r--r-- test/test_csv_reader.py 56
8 files changed, 171 insertions, 1 deletions
diff --git a/.gitignore b/.gitignore
index ecda235..42385eb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -105,7 +105,7 @@ Pipfile.lock
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
-#poetry.lock
+poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
@@ -172,3 +172,6 @@ cython_debug/
# Private notes
*.private
+
+# CSV Files
+*.csv
diff --git a/cli.py b/cli.py
new file mode 100644
index 0000000..19e64d8
--- /dev/null
+++ b/cli.py
@@ -0,0 +1,26 @@
+import argparse
+from obfuscator.csv_reader import CSVReader
+from obfuscator.logger import get_logger
+
+# Module-level logger for the command-line entry point.
+logger = get_logger("CLI")
+
+
+def main():
+    """Parse the command line and print the rows of the chosen CSV source.
+
+    Exactly one of ``--local`` or ``--s3`` must be supplied.  For ``--local``
+    the file is read through ``CSVReader.read_local`` and the resulting list
+    of row dictionaries is printed.  For ``--s3`` only the choice is logged;
+    no data is read here.
+
+    Raises:
+        SystemExit: With status 1 when the local file could not be read
+            (``read_local`` has already logged the reason), or from
+            ``argparse`` when the arguments are invalid.
+    """
+    parser = argparse.ArgumentParser(description="gdpr-obfuscator")
+    # Require user to either choose a local file or an S3 object
+    source = parser.add_mutually_exclusive_group(required=True)
+    source.add_argument(
+        "--local",
+        metavar="PATH",
+        help="read the CSV from a file on the local filesystem",
+    )
+    source.add_argument(
+        "--s3",
+        metavar="LOCATION",
+        help="read the CSV from an S3 object",
+    )
+    args = parser.parse_args()
+
+    # The group already guarantees that exactly one option is present, so
+    # test for presence rather than truthiness: an empty ``--local ""`` must
+    # be reported as a bad path, not silently treated as an S3 request.
+    if args.local is not None:
+        logger.debug("User chose to read CSV from local path")
+        data = CSVReader.read_local(args.local)
+        if data is None:
+            # A failed read yields None; exit non-zero instead of printing it.
+            raise SystemExit(1)
+        print(data)
+    else:
+        logger.debug("User chose to read CSV from S3")
+
+
+# Run the CLI only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+ main()
diff --git a/obfuscator/__init__.py b/obfuscator/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/obfuscator/__init__.py
diff --git a/obfuscator/csv_reader.py b/obfuscator/csv_reader.py
new file mode 100644
index 0000000..b9dccdb
--- /dev/null
+++ b/obfuscator/csv_reader.py
@@ -0,0 +1,34 @@
+import csv
+import io
+from typing import List, Dict
+from obfuscator.logger import get_logger
+
+# Module-level logger used by CSVReader for its debug and error messages.
+logger = get_logger("CSVReader")
+
+
+class CSVReader:
+    """Load CSV content as a list of dictionaries keyed by the header row."""
+
+    @staticmethod
+    def read_local(path) -> List[Dict[str, str]] | None:
+        """Read a UTF-8 CSV file from the local filesystem.
+
+        Args:
+            path: Location of the file, passed straight to ``open``.
+
+        Returns:
+            One dictionary per data row, or ``None`` when the file could not
+            be read.  Failures are logged rather than raised.
+        """
+        logger.debug(f"Reading local CSV from: {path}")
+
+        try:
+            # newline="" hands line-ending handling to the csv module, which
+            # is required for quoted fields containing embedded newlines.
+            with open(path, mode="r", encoding="utf-8", newline="") as f:
+                return [dict(row) for row in csv.DictReader(f)]
+        except FileNotFoundError:
+            logger.error(f"File not found: {path}")
+        except Exception as e:
+            # Best-effort boundary: report the problem and signal failure to
+            # the caller through the None return value.
+            logger.error(f"Error reading file: {e}")
+        return None
+
+    @staticmethod
+    def read_s3(path) -> List[Dict[str, str]]:
+        """Placeholder for reading from S3; currently always returns ``[]``."""
+        return []
+
+    @staticmethod
+    def read_string(content: str) -> List[Dict[str, str]]:
+        """Parse CSV text that is already held in memory.
+
+        Args:
+            content: The full CSV text, header line first.
+
+        Returns:
+            One dictionary per data row; an empty list when *content* is
+            empty, whitespace only, or consists of a header with no rows.
+        """
+        if not content.strip():
+            return []
+
+        return [dict(row) for row in csv.DictReader(io.StringIO(content))]
diff --git a/obfuscator/logger.py b/obfuscator/logger.py
new file mode 100644
index 0000000..ca41e95
--- /dev/null
+++ b/obfuscator/logger.py
@@ -0,0 +1,24 @@
+import logging
+import os
+
+
+def get_logger(name: str) -> logging.Logger:
+    """Return the logger called *name*, configuring it on first use.
+
+    The level is set to DEBUG when the ``DEBUG`` environment variable equals
+    ``"TRUE"`` (case-insensitive) and to INFO otherwise.  A stream handler
+    with a timestamped format is attached only when neither the logger nor
+    any of its ancestors already has a handler.
+
+    Args:
+        name: Name passed to ``logging.getLogger``.
+
+    Returns:
+        The ``logging.Logger`` registered under *name*.
+    """
+    logger = logging.getLogger(name)
+
+    # Choose the level independently of the handler check: hasHandlers() is
+    # also true when only an ancestor (e.g. the root logger) has a handler,
+    # and skipping the level in that case would leave this logger at NOTSET
+    # and silently ignore the DEBUG setting.  An explicitly set level is
+    # left alone so repeated calls do not undo later customisation.
+    if logger.level == logging.NOTSET:
+        debug_enabled = os.getenv("DEBUG", "FALSE").upper() == "TRUE"
+        logger.setLevel(logging.DEBUG if debug_enabled else logging.INFO)
+
+    if not logger.hasHandlers():
+        handler = logging.StreamHandler()
+        handler.setFormatter(
+            logging.Formatter(
+                "%(asctime)s - %(levelname)s - %(name)s - %(message)s"
+            )
+        )
+        logger.addHandler(handler)
+
+    return logger
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..d5db843
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,25 @@
+[project]
+name = "gdpr-obfuscator"
+version = "0.1.0"
+description = "A Python library designed to detect and remove Personally Identifiable Information (PII) from CSV files stored in an AWS S3 bucket."
+authors = [
+ {name = "Alex Schofield",email = "git@ajschof.me"}
+]
+readme = "README.md"
+requires-python = ">=3.13"
+dependencies = [
+ "tabulate (>=0.9.0,<0.10.0)"
+]
+
+[tool.poetry]
+# The importable package is the top-level "obfuscator" directory.
+packages = [
+    { include = "obfuscator" }
+]
+
+[tool.poetry.group.dev.dependencies]
+pytest = "8.3.4"
+pytest-cov = "^6.0.0"
+
+[build-system]
+requires = ["poetry-core>=2.0.0,<3.0.0"]
+build-backend = "poetry.core.masonry.api"
diff --git a/pytest.ini b/pytest.ini
new file mode 100644
index 0000000..ad5c7cc
--- /dev/null
+++ b/pytest.ini
@@ -0,0 +1,2 @@
+[pytest]
+pythonpath = . src
diff --git a/test/test_csv_reader.py b/test/test_csv_reader.py
new file mode 100644
index 0000000..e62c093
--- /dev/null
+++ b/test/test_csv_reader.py
@@ -0,0 +1,56 @@
+# csv_reader.py - tests
+# Author: Alex Schofield
+
+from obfuscator.csv_reader import CSVReader
+import pytest
+
+# Reader instance shared by every test in this module.
+reader = CSVReader()
+
+
+def test_empty_csv_should_return_no_content():
+    """An empty string has neither header nor rows, so nothing is returned."""
+    assert reader.read_string("") == []
+
+
+def test_csv_with_header_only_should_return_no_content():
+    """A header line with no data rows beneath it yields an empty list."""
+    header_only = "student_id,name,course\n"
+    assert reader.read_string(header_only) == []
+
+
+def test_csv_with_valid_data():
+    """Each data row becomes a dictionary keyed by the header names."""
+    rows = reader.read_string(
+        "student_id,name,course\n"
+        "1234,Student 1,Course 1\n"
+        "5678,Student 2,Course 2\n"
+    )
+    assert rows == [
+        {"student_id": "1234", "name": "Student 1", "course": "Course 1"},
+        {"student_id": "5678", "name": "Student 2", "course": "Course 2"},
+    ]
+
+
+def test_csv_with_quoted_fields_should_run_as_expected():
+    """Double quotes around a field are removed by the parser."""
+    quoted_csv = (
+        "student_id,name,course\n"
+        '1234,"Student 1","Course 1"\n'
+        '5678,"Student 2","Course 2"\n'
+    )
+    first, second = reader.read_string(quoted_csv)
+    assert first == {"student_id": "1234", "name": "Student 1", "course": "Course 1"}
+    assert second == {"student_id": "5678", "name": "Student 2", "course": "Course 2"}
+
+
+def test_non_csv_file_should_return_no_content():
+    """Free text that is not tabular data produces no rows.
+
+    A single line of prose is consumed as the header row, so there are no
+    data rows left to return.  This input differs from the empty-string
+    case, which is covered by its own test.
+    """
+    content = "This is just a sentence and not tabular data."
+    result = reader.read_string(content)
+    assert result == []
git.ajschof.me — hosted by ajschofield — powered by cgit