diff options
| author | Alex <git@ajschof.me> | 2025-02-17 01:19:11 +0000 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2025-02-17 01:19:11 +0000 |
| commit | 73462d62d48cd3cf061697f9f6a390437ee29f2d (patch) | |
| tree | 4331016762f96c0861b46ac67343a0769f038fa6 | |
| parent | e6bedc5c9d391b761fec6529de8537a991125c26 (diff) | |
| parent | cf1376862fb2f58c2e837338ed9c765439ffa1b9 (diff) | |
| download | gdpr-obfuscator-73462d62d48cd3cf061697f9f6a390437ee29f2d.tar.gz gdpr-obfuscator-73462d62d48cd3cf061697f9f6a390437ee29f2d.zip | |
Merge pull request #1 from ajschofield/feat/csv-reader
merge csvreader into stable
| -rw-r--r-- | .gitignore | 5 | ||||
| -rw-r--r-- | cli.py | 26 | ||||
| -rw-r--r-- | obfuscator/__init__.py | 0 | ||||
| -rw-r--r-- | obfuscator/csv_reader.py | 34 | ||||
| -rw-r--r-- | obfuscator/logger.py | 24 | ||||
| -rw-r--r-- | pyproject.toml | 25 | ||||
| -rw-r--r-- | pytest.ini | 2 | ||||
| -rw-r--r-- | test/test_csv_reader.py | 56 |
8 files changed, 171 insertions, 1 deletions
@@ -105,7 +105,7 @@ Pipfile.lock
 # This is especially recommended for binary packages to ensure reproducibility, and is more
 # commonly ignored for libraries.
 # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
-#poetry.lock
+poetry.lock
 
 # pdm
 # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
@@ -172,3 +172,6 @@ cython_debug/
 
 # Private notes
 *.private
+
+# CSV Files
+*.csv
@@ -0,0 +1,26 @@
+import argparse
+from obfuscator.csv_reader import CSVReader
+from obfuscator.logger import get_logger
+
+logger = get_logger("CLI")
+
+
+def main():
+    parser = argparse.ArgumentParser(description="gdpr-obfuscator")
+    # Require user to either choose a local file or an S3 object
+    loc = parser.add_mutually_exclusive_group(required=True)
+    loc.add_argument("--local")
+    loc.add_argument("--s3")
+    args = parser.parse_args()
+
+    if args.local and not args.s3:
+        logger.debug("User chose to read CSV from local path")
+        reader = CSVReader()
+        data = reader.read_local(args.local)
+        print(data)
+    else:
+        logger.debug("User chose to read CSV from S3")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/obfuscator/__init__.py b/obfuscator/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/obfuscator/__init__.py
diff --git a/obfuscator/csv_reader.py b/obfuscator/csv_reader.py
new file mode 100644
index 0000000..b9dccdb
--- /dev/null
+++ b/obfuscator/csv_reader.py
@@ -0,0 +1,34 @@
+import csv
+import io
+from typing import List, Dict
+from obfuscator.logger import get_logger
+
+logger = get_logger("CSVReader")
+
+
+class CSVReader:
+    @staticmethod
+    def read_local(path) -> List[Dict[str, str]]:
+        logger.debug(f"Reading local CSV from: {path}")
+
+        try:
+            with open(path, mode="r", encoding="utf-8") as f:
+                reader = csv.DictReader(f)
+                return [dict(row) for row in reader]
+        except FileNotFoundError:
+            logger.error(f"File not found: {path}")
+        except Exception as e:
+            logger.error(f"Error reading file: {e}")
+
+    @staticmethod
+    def read_s3(path) -> List[Dict[str, str]]:
+        return []
+
+    @staticmethod
+    def read_string(content: str) -> List[Dict[str, str]]:
+        if not content.strip():
+            return []
+
+        f = io.StringIO(content)
+        reader = csv.DictReader(f)
+        return [dict(row) for row in reader]
diff --git a/obfuscator/logger.py b/obfuscator/logger.py
new file mode 100644
index 0000000..ca41e95
--- /dev/null
+++ b/obfuscator/logger.py
@@ -0,0 +1,24 @@
+import logging
+import os
+
+
+def get_logger(name: str) -> logging.Logger:
+    logger = logging.getLogger(name)
+
+    if not logger.hasHandlers():
+        if os.getenv("DEBUG", "FALSE").upper() == "TRUE":
+            log_level = logging.DEBUG
+        else:
+            log_level = logging.INFO
+
+        logger.setLevel(log_level)
+
+        handler = logging.StreamHandler()
+        formatting = logging.Formatter(
+            "%(asctime)s - %(levelname)s - %(name)s - %(message)s"
+        )
+        handler.setFormatter(formatting)
+
+        logger.addHandler(handler)
+
+    return logger
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..d5db843
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,25 @@
+[project]
+name = "gdpr-obfuscator"
+version = "0.1.0"
+description = "A Python library designed to detect and remove Personally Identifiable Information (PII) from CSV files stored in an AWS S3 bucket."
+authors = [
+    {name = "Alex Schofield",email = "git@ajschof.me"}
+]
+readme = "README.md"
+requires-python = ">=3.13"
+dependencies = [
+    "tabulate (>=0.9.0,<0.10.0)"
+]
+
+[tool.poetry]
+packages = [
+    { include = "gdpr-obfuscator", from = "obfuscator" }
+]
+
+[tool.poetry.group.dev.dependencies]
+pytest = "8.3.4"
+pytest-cov = "^6.0.0"
+
+[build-system]
+requires = ["poetry-core>=2.0.0,<3.0.0"]
+build-backend = "poetry.core.masonry.api"
diff --git a/pytest.ini b/pytest.ini
new file mode 100644
index 0000000..ad5c7cc
--- /dev/null
+++ b/pytest.ini
@@ -0,0 +1,2 @@
+[pytest]
+pythonpath = . src
diff --git a/test/test_csv_reader.py b/test/test_csv_reader.py
new file mode 100644
index 0000000..e62c093
--- /dev/null
+++ b/test/test_csv_reader.py
@@ -0,0 +1,56 @@
+# csv_reader.py - tests
+# Author: Alex Schofield
+
+from obfuscator.csv_reader import CSVReader
+import pytest
+
+reader = CSVReader()
+
+
+def test_empty_csv_should_return_no_content():
+    content = ""
+    result = reader.read_string(content)
+    expected = []
+    assert result == expected
+
+
+def test_csv_with_header_only_should_return_no_content():
+    content = "student_id,name,course\n"
+    result = reader.read_string(content)
+    expected = []
+    assert result == expected
+
+
+def test_csv_with_valid_data():
+    content = (
+        "student_id,name,course\n"
+        "1234,Student 1,Course 1\n"
+        "5678,Student 2,Course 2\n"
+    )
+    result = reader.read_string(content)
+    expected = [
+        {"student_id": "1234", "name": "Student 1", "course": "Course 1"},
+        {"student_id": "5678", "name": "Student 2", "course": "Course 2"},
+    ]
+    assert result == expected
+
+
+def test_csv_with_quoted_fields_should_run_as_expected():
+    content = (
+        "student_id,name,course\n"
+        '1234,"Student 1","Course 1"\n'
+        '5678,"Student 2","Course 2"\n'
+    )
+    result = reader.read_string(content)
+    expected = [
+        {"student_id": "1234", "name": "Student 1", "course": "Course 1"},
+        {"student_id": "5678", "name": "Student 2", "course": "Course 2"},
+    ]
+    assert result == expected
+
+
+def test_non_csv_file_should_return_no_content():
+    content = ""
+    result = reader.read_string(content)
+    expected = []
+    assert result == expected
|
