From 0ac6d213c1b9cbc24313b9ac5d442e29f027b798 Mon Sep 17 00:00:00 2001 From: Alex Schofield Date: Wed, 19 Feb 2025 02:51:13 +0000 Subject: update logger names --- obfuscator/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'obfuscator/utils.py') diff --git a/obfuscator/utils.py b/obfuscator/utils.py index 2e4211f..1d1c3fe 100644 --- a/obfuscator/utils.py +++ b/obfuscator/utils.py @@ -2,7 +2,7 @@ from obfuscator.logger import get_logger # Create the logger -logger = get_logger("CLI") +logger = get_logger("UTILS") def get_s3_path(uri): -- cgit v1.2.3 From ef05a027ffbf8bbee89bb031ccd6152de49762c6 Mon Sep 17 00:00:00 2001 From: Alex Schofield Date: Wed, 19 Feb 2025 03:19:33 +0000 Subject: workaround/fix annoying logging issues --- cli.py | 12 ++++++------ obfuscator/csv_reader.py | 38 +++++++++++++++++++------------------- obfuscator/logger.py | 41 +++++++++++++++++++++++++++-------------- obfuscator/utils.py | 27 +++++++++++++++------------ 4 files changed, 67 insertions(+), 51 deletions(-) (limited to 'obfuscator/utils.py') diff --git a/cli.py b/cli.py index bf9b53f..d19c18d 100644 --- a/cli.py +++ b/cli.py @@ -5,9 +5,6 @@ from obfuscator.obfuscate import obfuscate from obfuscator.logger import get_logger from obfuscator.csv_writer import create_byte_stream -# Create the logger -logger = get_logger("CLI") - def main(): # Create an argument parser @@ -42,11 +39,13 @@ def main(): args = parser.parse_args() # If the user chose verbose logging, set the logger to debug - if args.verbose: - logger.setLevel("DEBUG") + log_level = "DEBUG" if args.verbose else "INFO" + + # Create the logger + logger = get_logger("CLI", log_level) # Create the CSVReader object - reader = CSVReader() + reader = CSVReader(log_level) # Read the CSV data based on the user's choice of local or S3 if args.local and not args.s3: @@ -56,6 +55,7 @@ def main(): logger.debug("Contents: " + str(data)) else: logger.debug("User chose to read CSV from S3") + data = reader.read_s3(args.s3) logger.debug("Contents: " + str(data)) diff --git a/obfuscator/csv_reader.py b/obfuscator/csv_reader.py index 8fdf26f..f8dd7d3 100644 --- a/obfuscator/csv_reader.py +++ b/obfuscator/csv_reader.py @@ -4,10 +4,7 @@ import boto3 import os from typing import List, Dict from obfuscator.logger import get_logger -from obfuscator.utils import get_s3_path - -# Create the logger -logger = get_logger("CSVREADER") +from obfuscator.utils import Utilities # Putting the CSV reading components into a class may seem like overkill # for a simple script, but it allows for better organization and scalability. @@ -22,14 +19,18 @@ class CSVReader: the project completion, support for JSON/Parquet files will be added. """ - @staticmethod - def read_local(path) -> List[Dict[str, str]]: + def __init__(self, log_level=None): + self.log_level = log_level + # Create the logger + self.logger = get_logger("CSVREADER", log_level) + + def read_local(self, path) -> List[Dict[str, str]]: """ A method to read a local CSV file and return the data as a list of dictionaries. """ # Log the path of the file being read for debugging - logger.debug(f"Reading local CSV from: {path}") + self.logger.debug(f"Reading local CSV from: {path}") # Attempt to read the file and return the data as a list of dictionaries # However, if the file isn't found or there is a generic exception, log @@ -39,38 +40,38 @@ class CSVReader: reader = csv.DictReader(f) return [dict(row) for row in reader] except FileNotFoundError: - logger.error(f"File not found: {path}") + self.logger.error(f"File not found: {path}") raise except Exception as e: - logger.error(f"Error reading file: {e}") + self.logger.error(f"Error reading file: {e}") - @staticmethod - def read_s3(path) -> List[Dict[str, str]]: + def read_s3(self, path) -> List[Dict[str, str]]: """ A method to read an S3 object containing CSV data and return the data as a list of dictionaries. """ - bucket, key = get_s3_path(path) - logger.debug(f"Reading S3 CSV from: {bucket}/{key}") + utils = Utilities(self.log_level) + bucket, key = utils.get_s3_path(path) + self.logger.debug(f"Reading S3 CSV from: {bucket}/{key}") # If LOCALSTACK=TRUE, use the localstack endpoint for testing if os.getenv("LOCALSTACK", "FALSE").upper() == "TRUE": localstack_endpoint = "http://localhost.localstack.cloud:4566" - logger.debug("Using LocalStack endpoint for S3") + self.logger.debug("Using LocalStack endpoint for S3") client = boto3.client( "s3", endpoint_url=localstack_endpoint, aws_access_key_id="dummy", aws_secret_access_key="dummy", ) - logger.debug(f"endpoint_url: {localstack_endpoint}") + self.logger.debug(f"endpoint_url: {localstack_endpoint}") else: client = boto3.client("s3") try: # Attempt to read the S3 object and return the data as a list of dictionaries response = client.get_object(Bucket=bucket, Key=key) - logger.info("S3 object read successfully") + self.logger.info("S3 object read successfully") # Read and decode the content content = response["Body"].read().decode("utf-8") # Even though the read_string method was only created for testing, @@ -78,11 +79,10 @@ class CSVReader: return CSVReader.read_string(content) # TODO: Add more specific exceptions to catch except Exception as e: - logger.error(f"Error reading S3 object: {e}") + self.logger.error(f"Error reading S3 object: {e}") raise - @staticmethod - def read_string(content: str) -> List[Dict[str, str]]: + def read_string(self, content: str) -> List[Dict[str, str]]: """ A method to read CSV data from a string and return the data as a list of dictionaries. diff --git a/obfuscator/logger.py b/obfuscator/logger.py index 649dad7..2c5b988 100644 --- a/obfuscator/logger.py +++ b/obfuscator/logger.py @@ -1,24 +1,37 @@ import logging import os +from enum import Enum -def get_logger(name: str) -> logging.Logger: - logger = logging.getLogger(name) +class LogLevel(Enum): + DEBUG = logging.DEBUG + INFO = logging.INFO + WARNING = logging.WARNING + ERROR = logging.ERROR + CRITICAL = logging.CRITICAL + - if not logger.hasHandlers(): - if os.getenv("DEBUG", "FALSE").upper() == "TRUE": - log_level = logging.DEBUG - else: - log_level = logging.INFO +def get_logger(name: str, level: LogLevel = LogLevel.INFO) -> logging.Logger: - logger.setLevel(log_level) + if isinstance(level, str): + try: + level = LogLevel[level.upper()] + except KeyError: + raise ValueError( + f"Invalid log level '{level}'. Choose from: {', '.join(l.name for l in LogLevel)}" + ) + + logger = logging.getLogger(name) - handler = logging.StreamHandler() - formatting = logging.Formatter( - "[%(asctime)s] - %(levelname)s::%(name)s - %(message)s" - ) - handler.setFormatter(formatting) + if logger.hasHandlers(): + logger.handlers.clear() - logger.addHandler(handler) + handler = logging.StreamHandler() + logger.setLevel(level.value) + formatting = logging.Formatter( + "[%(asctime)s] - %(levelname)s::%(name)s - %(message)s" + ) + handler.setFormatter(formatting) + logger.addHandler(handler) return logger diff --git a/obfuscator/utils.py b/obfuscator/utils.py index 1d1c3fe..81eb04a 100644 --- a/obfuscator/utils.py +++ b/obfuscator/utils.py @@ -1,15 +1,18 @@ # Utility functions from obfuscator.logger import get_logger -# Create the logger -logger = get_logger("UTILS") - - -def get_s3_path(uri): - parts = uri.replace("s3://", "").split("/") - logger.debug(f"Parts: {parts}") - bucket = parts.pop(0) - logger.debug(f"Bucket: {bucket}") - key = "/".join(parts) - logger.debug(f"Key: {key}") - return bucket, key + +class Utilities: + + def __init__(self, logger=None): + # Create the logger + self.logger = get_logger("UTILITIES", logger) + + def get_s3_path(self, uri): + parts = uri.replace("s3://", "").split("/") + self.logger.debug(f"Parts: {parts}") + bucket = parts.pop(0) + self.logger.debug(f"Bucket: {bucket}") + key = "/".join(parts) + self.logger.debug(f"Key: {key}") + return bucket, key -- cgit v1.2.3 From 3c4b66e8490c6fdf93fb8fee735d52c76eb2853b Mon Sep 17 00:00:00 2001 From: Alex Schofield Date: Wed, 19 Feb 2025 03:22:26 +0000 Subject: remove annoying comments for better readability of code --- cli.py | 16 ---------------- obfuscator/csv_reader.py | 19 ------------------- obfuscator/csv_writer.py | 1 - obfuscator/obfuscate.py | 5 ----- obfuscator/utils.py | 1 - test/test_csv_reader.py | 23 ----------------------- test/test_obfuscator.py | 15 --------------- 7 files changed, 80 deletions(-) (limited to 'obfuscator/utils.py') diff --git a/cli.py b/cli.py index d19c18d..c03ffd1 100644 --- a/cli.py +++ b/cli.py @@ -7,7 +7,6 @@ from obfuscator.csv_writer import create_byte_stream def main(): - # Create an argument parser parser = argparse.ArgumentParser( prog="GDPR-Obfuscator", description="Obfuscate sensitive data stored locally or in an AWS environment", @@ -17,16 +16,10 @@ def main(): "-v", "--verbose", action="store_true", help="Enable verbose logging" ) - # Require user to either choose a local file or an S3 object - # The user can only choose one of these options or the program will exit - # If not provided, the program will exit loc = parser.add_mutually_exclusive_group(required=True) loc.add_argument("-l", "--local", help="Local path to file") loc.add_argument("-s", "--s3", help="URI path to file stored in S3") - # Require user to provide a list of PII fields to obfuscate - # e.g. --pii name email_address - # If not provided, the program will exit parser.add_argument( "-p", "--pii", @@ -35,23 +28,17 @@ def main(): help="List of PII fields to obfuscate, separated by spaces", ) - # Parse the arguments args = parser.parse_args() - # If the user chose verbose logging, set the logger to debug log_level = "DEBUG" if args.verbose else "INFO" - # Create the logger logger = get_logger("CLI", log_level) - # Create the CSVReader object reader = CSVReader(log_level) - # Read the CSV data based on the user's choice of local or S3 if args.local and not args.s3: logger.debug("User chose to read CSV from local path") data = reader.read_local(args.local) - # For debug purposes, log the data read from the CSV logger.debug("Contents: " + str(data)) else: logger.debug("User chose to read CSV from S3") @@ -59,13 +46,10 @@ def main(): data = reader.read_s3(args.s3) logger.debug("Contents: " + str(data)) - # Obfuscate the data based on the user's choice of PII fields obfuscated_data = obfuscate(data, args.pii) - # For debug purposes, log the obfuscated data as JSON for readability logger.debug("Obfuscated data (JSON): " + json.dumps(obfuscated_data, indent=4)) return create_byte_stream(obfuscated_data) -# If the script is run directly (as it should be), call the main function if __name__ == "__main__": main() diff --git a/obfuscator/csv_reader.py b/obfuscator/csv_reader.py index f8dd7d3..2b099c8 100644 --- a/obfuscator/csv_reader.py +++ b/obfuscator/csv_reader.py @@ -6,12 +6,6 @@ from typing import List, Dict from obfuscator.logger import get_logger from obfuscator.utils import Utilities -# Putting the CSV reading components into a class may seem like overkill -# for a simple script, but it allows for better organization and scalability. -# @staticmethod is used to define the method without an instance of the class -# being required. The methods could be defined just as functions, and this -# may still be changed. - class CSVReader: """ @@ -21,7 +15,6 @@ class CSVReader: def __init__(self, log_level=None): self.log_level = log_level - # Create the logger self.logger = get_logger("CSVREADER", log_level) def read_local(self, path) -> List[Dict[str, str]]: @@ -29,12 +22,8 @@ class CSVReader: A method to read a local CSV file and return the data as a list of dictionaries. """ - # Log the path of the file being read for debugging self.logger.debug(f"Reading local CSV from: {path}") - # Attempt to read the file and return the data as a list of dictionaries - # However, if the file isn't found or there is a generic exception, log - # the error and raise an exception try: with open(path, mode="r", encoding="utf-8") as f: reader = csv.DictReader(f) @@ -54,7 +43,6 @@ class CSVReader: bucket, key = utils.get_s3_path(path) self.logger.debug(f"Reading S3 CSV from: {bucket}/{key}") - # If LOCALSTACK=TRUE, use the localstack endpoint for testing if os.getenv("LOCALSTACK", "FALSE").upper() == "TRUE": localstack_endpoint = "http://localhost.localstack.cloud:4566" self.logger.debug("Using LocalStack endpoint for S3") @@ -69,15 +57,10 @@ class CSVReader: client = boto3.client("s3") try: - # Attempt to read the S3 object and return the data as a list of dictionaries response = client.get_object(Bucket=bucket, Key=key) self.logger.info("S3 object read successfully") - # Read and decode the content content = response["Body"].read().decode("utf-8") - # Even though the read_string method was only created for testing, - # it can be reused here to read and return the CSV data return CSVReader.read_string(content) - # TODO: Add more specific exceptions to catch except Exception as e: self.logger.error(f"Error reading S3 object: {e}") raise @@ -87,11 +70,9 @@ class CSVReader: A method to read CSV data from a string and return the data as a list of dictionaries. """ - # If the content is empty, return an empty list if not content.strip(): return [] - # Treat the string as a file-like object and return as list of dictionaries f = io.StringIO(content) reader = csv.DictReader(f) return [dict(row) for row in reader] diff --git a/obfuscator/csv_writer.py b/obfuscator/csv_writer.py index 099e910..56b3f1f 100644 --- a/obfuscator/csv_writer.py +++ b/obfuscator/csv_writer.py @@ -3,7 +3,6 @@ import io from typing import List, Dict from obfuscator.logger import get_logger -# Create the logger logger = get_logger("CSVWRITER") diff --git a/obfuscator/obfuscate.py b/obfuscator/obfuscate.py index 4f7e6c1..e964433 100644 --- a/obfuscator/obfuscate.py +++ b/obfuscator/obfuscate.py @@ -1,7 +1,6 @@ from typing import List, Dict from obfuscator.logger import get_logger -# Create the logger logger = get_logger("OBFUSCATE") @@ -12,14 +11,10 @@ def obfuscate( A function to obfuscate PII fields in a list of dictionaries, replacing sensitive values with a string of asterisks. """ - # If no data is provided, log a message and return an empty list if not data: logger.info("No valid data was provided to obfuscate") return [] - # Obfuscate the PII fields in each record using a list/dict comprehension - # This code is good but makes debugging a bit tricky. I may consider - # breaking it down into a for loop. return [ {k: ("***" if k in pii_fields else v) for k, v in record.items()} for record in data diff --git a/obfuscator/utils.py b/obfuscator/utils.py index 81eb04a..f61451b 100644 --- a/obfuscator/utils.py +++ b/obfuscator/utils.py @@ -5,7 +5,6 @@ from obfuscator.logger import get_logger class Utilities: def __init__(self, logger=None): - # Create the logger self.logger = get_logger("UTILITIES", logger) def get_s3_path(self, uri): diff --git a/test/test_csv_reader.py b/test/test_csv_reader.py index e4c135b..0206542 100644 --- a/test/test_csv_reader.py +++ b/test/test_csv_reader.py @@ -1,6 +1,3 @@ -# csv_reader.py - tests for read_string and read_s3 -# Author: Alex Schofield - import boto3 from moto import mock_aws from obfuscator.csv_reader import CSVReader @@ -8,11 +5,6 @@ import pytest reader = CSVReader() -# CSVREADER: READ_STRING TESTS - -# Check if the function can read a CSV string with no content and return -# an empty list - def test_empty_csv_should_return_no_content(): content = "" @@ -21,10 +13,6 @@ def test_empty_csv_should_return_no_content(): assert result == expected -# Check if the function can read a CSV string with only a header and return -# an empty list - - def test_csv_with_header_only_should_return_no_content(): content = "student_id,name,course\n" result = reader.read_string(content) @@ -32,10 +20,6 @@ def test_csv_with_header_only_should_return_no_content(): assert result == expected -# Check if the function can read a CSV string with valid data and return -# a list of dictionaries - - def test_csv_with_valid_data(): content = ( "student_id,name,course\n" @@ -50,10 +34,6 @@ def test_csv_with_valid_data(): assert result == expected -# Check if the function can read a CSV string with quoted fields and return -# a list of dictionaries with the quoted fields intact - - def test_csv_with_quoted_fields_should_run_as_expected(): content = ( "student_id,name,course\n" @@ -68,9 +48,6 @@ def test_csv_with_quoted_fields_should_run_as_expected(): assert result == expected -# CSVREADER: READ_S3 TESTS - - def setup_s3(s3_client, bucket: str, key: str, content: str): s3_client.create_bucket( Bucket=bucket, diff --git a/test/test_obfuscator.py b/test/test_obfuscator.py index 4f61b16..c77b6b4 100644 --- a/test/test_obfuscator.py +++ b/test/test_obfuscator.py @@ -1,8 +1,5 @@ from obfuscator.obfuscate import obfuscate -# Check if the function does what its supposed to and can obfuscate -# valid PII fields in a list of dictionaries - def test_obfuscate_data_with_valid_pii_fields(): data = [ @@ -39,11 +36,6 @@ def test_obfuscate_data_with_valid_pii_fields(): assert result == expected -# Check if the function can obfuscate data even when some PII -# fields are missing from some of the data, returning a list of dictionaries -# but with the missing PII fields obfuscated and the rest of the data intact - - def test_obfuscate_data_with_missing_pii_field(): data = [ {"student_id": "1234", "name": "John Smith", "course": "Software"}, @@ -69,9 +61,6 @@ def test_obfuscate_data_with_missing_pii_field(): assert result == expected -# Check if the function can handle an empty list of data, returning an empty list - - def test_obfuscate_data_with_no_data(): data = [] pii_fields = ["name", "email_address"] @@ -81,10 +70,6 @@ def test_obfuscate_data_with_no_data(): assert result == expected -# Check if the function can handle an empty list of PII fields, returning the data as is -# without mutating it - - def test_obfuscate_data_with_empty_pii_fields(): data = [ { -- cgit v1.2.3 From 5402af2c7198a685a57a05e29a869e1e72a6b877 Mon Sep 17 00:00:00 2001 From: "deepsource-autofix[bot]" <62050782+deepsource-autofix[bot]@users.noreply.github.com> Date: Wed, 19 Feb 2025 15:55:50 +0000 Subject: style: format code with Autopep8, Black and Ruff Formatter This commit fixes the style issues introduced in b402ee5 according to the output from Autopep8, Black and Ruff Formatter. Details: https://github.com/ajschofield/gdpr-obfuscator/pull/8 --- obfuscator/logger.py | 1 - obfuscator/utils.py | 1 - 2 files changed, 2 deletions(-) (limited to 'obfuscator/utils.py') diff --git a/obfuscator/logger.py b/obfuscator/logger.py index 2c5b988..140fa8f 100644 --- a/obfuscator/logger.py +++ b/obfuscator/logger.py @@ -12,7 +12,6 @@ class LogLevel(Enum): def get_logger(name: str, level: LogLevel = LogLevel.INFO) -> logging.Logger: - if isinstance(level, str): try: level = LogLevel[level.upper()] diff --git a/obfuscator/utils.py b/obfuscator/utils.py index f61451b..77ca1cf 100644 --- a/obfuscator/utils.py +++ b/obfuscator/utils.py @@ -3,7 +3,6 @@ from obfuscator.logger import get_logger class Utilities: - def __init__(self, logger=None): self.logger = get_logger("UTILITIES", logger) -- cgit v1.2.3