From 70a16769761450b9c2aa63cda86a30a73bc0217c Mon Sep 17 00:00:00 2001
From: Alex Schofield
Date: Thu, 20 Feb 2025 18:27:12 +0000
Subject: update pyproject.toml & references with new src folder name

---
 gdpr_obfuscator/__init__.py  |  28 ++++++++++++++
 gdpr_obfuscator/logger.py    |  61 ++++++++++++++++++++++++++++++
 gdpr_obfuscator/obfuscate.py |  33 ++++++++++++++++
 gdpr_obfuscator/read.py      | 102 +++++++++++++++++++++++++++++++++++++++++++++++++++
 gdpr_obfuscator/utils.py     |  31 +++++++++++++++
 gdpr_obfuscator/write.py     |  38 +++++++++++++++++++
 6 files changed, 293 insertions(+)
 create mode 100644 gdpr_obfuscator/__init__.py
 create mode 100644 gdpr_obfuscator/logger.py
 create mode 100644 gdpr_obfuscator/obfuscate.py
 create mode 100644 gdpr_obfuscator/read.py
 create mode 100644 gdpr_obfuscator/utils.py
 create mode 100644 gdpr_obfuscator/write.py

(limited to 'gdpr_obfuscator')

diff --git a/gdpr_obfuscator/__init__.py b/gdpr_obfuscator/__init__.py
new file mode 100644
index 0000000..2ee3b96
--- /dev/null
+++ b/gdpr_obfuscator/__init__.py
@@ -0,0 +1,28 @@
+"""
+Public entry point for the GDPR obfuscator package.
+"""
+
+from typing import List
+
+from .obfuscate import obfuscate
+from .read import DataReader
+from .write import DataWriter
+
+
+def main(s3_source: str, pii_fields: List[str], log_level: str = "INFO") -> bytes:
+    """
+    Read CSV data from S3 and return it as bytes with PII fields obfuscated.
+
+    Args:
+        s3_source: Location of the CSV object, e.g. "s3://bucket/key.csv".
+        pii_fields: Names of the fields whose values are replaced.
+        log_level: Name of the log level handed to the reader.
+
+    Returns:
+        The obfuscated records as UTF-8 encoded CSV.
+    """
+    reader = DataReader(log_level)
+    writer = DataWriter()
+    data = reader.read_s3(s3_source)
+    obfuscated_data = obfuscate(data, pii_fields)
+    return writer.create_byte_stream(obfuscated_data)
diff --git a/gdpr_obfuscator/logger.py b/gdpr_obfuscator/logger.py
new file mode 100644
index 0000000..140fa8f
--- /dev/null
+++ b/gdpr_obfuscator/logger.py
@@ -0,0 +1,61 @@
+import logging
+import os
+from enum import Enum
+from typing import Optional, Union
+
+
+class LogLevel(Enum):
+    """
+    Supported log levels, mapped to the logging module's constants.
+    """
+
+    DEBUG = logging.DEBUG
+    INFO = logging.INFO
+    WARNING = logging.WARNING
+    ERROR = logging.ERROR
+    CRITICAL = logging.CRITICAL
+
+
+def get_logger(
+    name: str, level: Optional[Union[LogLevel, str]] = LogLevel.INFO
+) -> logging.Logger:
+    """
+    Return the logger called `name` with one fresh stream handler attached.
+
+    Args:
+        name: Name passed to logging.getLogger.
+        level: A LogLevel member, a case-insensitive level name such as
+            "debug", or None to fall back to LogLevel.INFO.
+
+    Returns:
+        The configured logger.
+
+    Raises:
+        ValueError: If `level` is a string that names no LogLevel member.
+    """
+    if level is None:
+        # DataReader() and Utilities() pass None when no level is given.
+        level = LogLevel.INFO
+    elif isinstance(level, str):
+        try:
+            level = LogLevel[level.upper()]
+        except KeyError:
+            raise ValueError(
+                f"Invalid log level '{level}'. Choose from: "
+                f"{', '.join(lvl.name for lvl in LogLevel)}"
+            )
+
+    logger = logging.getLogger(name)
+
+    # Drop handlers left by earlier calls so records are not emitted twice.
+    logger.handlers.clear()
+
+    handler = logging.StreamHandler()
+    logger.setLevel(level.value)
+    formatting = logging.Formatter(
+        "[%(asctime)s] - %(levelname)s::%(name)s - %(message)s"
+    )
+    handler.setFormatter(formatting)
+    logger.addHandler(handler)
+
+    return logger
diff --git a/gdpr_obfuscator/obfuscate.py b/gdpr_obfuscator/obfuscate.py
new file mode 100644
index 0000000..cd12b6d
--- /dev/null
+++ b/gdpr_obfuscator/obfuscate.py
@@ -0,0 +1,33 @@
+from typing import List, Dict
+
+from .logger import get_logger
+
+logger = get_logger("OBFUSCATE")
+
+
+def obfuscate(
+    data: List[Dict[str, str]], pii_fields: List[str]
+) -> List[Dict[str, str]]:
+    """
+    A function to obfuscate PII fields in a list of dictionaries, replacing
+    sensitive values with the string "***".
+
+    Returns an empty list when `data` is empty and `data` itself, unchanged,
+    when `pii_fields` is empty; otherwise new dictionaries are built and the
+    input records are not modified.
+    """
+    if not data:
+        logger.error(
+            "Invalid or empty data was provided to obfuscate. Returning empty list."
+        )
+        return []
+    if not pii_fields:
+        logger.error("No PII fields provided to obfuscate. Returning data unchanged.")
+        return data
+
+    # Build the set once so each key check is a hash lookup, not a list scan.
+    sensitive = frozenset(pii_fields)
+    return [
+        {k: ("***" if k in sensitive else v) for k, v in record.items()}
+        for record in data
+    ]
diff --git a/gdpr_obfuscator/read.py b/gdpr_obfuscator/read.py
new file mode 100644
index 0000000..b704643
--- /dev/null
+++ b/gdpr_obfuscator/read.py
@@ -0,0 +1,102 @@
+import csv
+import io
+import boto3
+import os
+from typing import List, Dict
+
+from .logger import get_logger
+from .utils import Utilities
+
+
+class DataReader:
+    """
+    A class to read CSV data from a local file, S3 object, or string.
+    Support for JSON and Parquet files is planned.
+    """
+
+    def __init__(self, log_level=None):
+        """
+        Store `log_level` and create this reader's logger with it.
+        """
+        self.log_level = log_level
+        self.logger = get_logger("CSVREADER", log_level)
+
+    def read_local(self, path) -> List[Dict[str, str]]:
+        """
+        A method to read a local CSV file and return the data as a list of
+        dictionaries. Errors are logged and then re-raised.
+        """
+        self.logger.debug(f"Reading local CSV from: {path}")
+
+        try:
+            # newline="" leaves line-ending handling to the csv module, as
+            # its documentation recommends for file objects.
+            with open(path, mode="r", encoding="utf-8", newline="") as f:
+                reader = csv.DictReader(f)
+                return [dict(row) for row in reader]
+        except FileNotFoundError:
+            self.logger.error(f"File not found: {path}")
+            raise
+        except Exception as e:
+            self.logger.error(f"Error reading file: {e}")
+            # Re-raise instead of falling through and returning None.
+            raise
+
+    def read_s3(self, path) -> List[Dict[str, str]]:
+        """
+        A method to read an S3 object containing CSV data and return the
+        data as a list of dictionaries. Errors are logged and then re-raised.
+
+        If the LOCALSTACK environment variable equals "TRUE" (in any letter
+        case), the client is pointed at the LocalStack endpoint with dummy
+        credentials instead of using the default configuration.
+        """
+        utils = Utilities(self.log_level)
+        bucket, key = utils.get_s3_path(path)
+        self.logger.debug(f"Reading S3 CSV from: {bucket}/{key}")
+
+        if os.getenv("LOCALSTACK", "FALSE").upper() == "TRUE":
+            localstack_endpoint = "http://localhost.localstack.cloud:4566"
+            self.logger.debug(
+                "Using LocalStack endpoint for S3 - ensure LocalStack is running"
+            )
+            client = boto3.client(
+                "s3",
+                endpoint_url=localstack_endpoint,
+                aws_access_key_id="dummy",
+                aws_secret_access_key="dummy",
+            )
+            self.logger.debug(f"endpoint_url: {localstack_endpoint}")
+        else:
+            client = boto3.client("s3")
+
+        try:
+            response = client.get_object(Bucket=bucket, Key=key)
+            self.logger.info("S3 object read successfully")
+            content = response["Body"].read().decode("utf-8")
+            return self.read_string(content)
+        except client.exceptions.NoSuchKey:
+            self.logger.error(f"Object not found: {bucket}/{key}")
+            raise
+        except client.exceptions.ClientError as e:
+            self.logger.error(f"Error reading S3 object: {e}")
+            raise
+        except UnicodeDecodeError as e:
+            self.logger.error(f"Error decoding S3 object: {e}")
+            raise
+        except Exception as e:
+            self.logger.error(f"Error reading S3 object: {e}")
+            raise
+
+    def read_string(self, content: str) -> List[Dict[str, str]]:
+        """
+        A method to read CSV data from a string and return the data as a list
+        of dictionaries. Blank or whitespace-only content gives an empty list.
+        """
+        if not content.strip():
+            return []
+
+        f = io.StringIO(content)
+        reader = csv.DictReader(f)
+        return [dict(row) for row in reader]
diff --git a/gdpr_obfuscator/utils.py b/gdpr_obfuscator/utils.py
new file mode 100644
index 0000000..77ca1cf
--- /dev/null
+++ b/gdpr_obfuscator/utils.py
@@ -0,0 +1,31 @@
+# Utility functions
+from .logger import get_logger
+
+
+class Utilities:
+    """
+    Small helpers for the package; currently S3 URI parsing.
+    """
+
+    def __init__(self, logger=None):
+        # NOTE: despite its name, `logger` is forwarded to get_logger as the
+        # log level; the name is kept so existing callers keep working.
+        self.logger = get_logger("UTILITIES", logger)
+
+    def get_s3_path(self, uri):
+        """
+        Split an S3 URI such as "s3://bucket/some/key.csv" into a
+        (bucket, key) tuple. The key is empty if the URI names only a bucket.
+        """
+        # Strip the scheme only where it leads the URI; str.replace would
+        # also remove an "s3://" occurring inside the object key.
+        prefix = "s3://"
+        if uri.startswith(prefix):
+            uri = uri[len(prefix):]
+        parts = uri.split("/")
+        self.logger.debug(f"Parts: {parts}")
+        bucket = parts.pop(0)
+        self.logger.debug(f"Bucket: {bucket}")
+        key = "/".join(parts)
+        self.logger.debug(f"Key: {key}")
+        return bucket, key
diff --git a/gdpr_obfuscator/write.py b/gdpr_obfuscator/write.py
new file mode 100644
index 0000000..451b073
--- /dev/null
+++ b/gdpr_obfuscator/write.py
@@ -0,0 +1,38 @@
+import csv
+import io
+from typing import List, Dict
+
+from .logger import get_logger
+
+logger = get_logger("CSVWRITER")
+
+
+class DataWriter:
+    """
+    A class to serialise records to CSV.
+    """
+
+    def __init__(self):
+        pass
+
+    def create_byte_stream(self, data: List[Dict[str, str]]) -> bytes:
+        """
+        A method to serialise a list of dictionaries to UTF-8 encoded CSV
+        bytes. The keys of the first record are used as the header row.
+        Empty data is logged as an error and gives b"".
+        """
+        if not data:
+            logger.error("Invalid or empty data was provided to write")
+            return b""
+
+        output = io.StringIO()
+
+        headers = list(data[0].keys())
+
+        writer = csv.DictWriter(output, fieldnames=headers)
+        writer.writeheader()
+        writer.writerows(data)
+
+        csv_string = output.getvalue()
+
+        return csv_string.encode("utf-8")
-- 
cgit v1.2.3