diff options
| author | Alex <git@ajschof.me> | 2025-02-19 15:58:28 +0000 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2025-02-19 15:58:28 +0000 |
| commit | 4066bf747e1e4c938526957c119f3f1485ee251e (patch) | |
| tree | 9a1e95f4ccbdd04e19d67a6c13641a19c4d4f3e0 /obfuscator/csv_reader.py | |
| parent | f24955044c4c05e37aba4efb505ec63b44113912 (diff) | |
| parent | 5402af2c7198a685a57a05e29a869e1e72a6b877 (diff) | |
| download | gdpr-obfuscator-4066bf747e1e4c938526957c119f3f1485ee251e.tar.gz gdpr-obfuscator-4066bf747e1e4c938526957c119f3f1485ee251e.zip | |
Merge pull request #8 from ajschofield/refining-phase
mostly minor changes (fixing things up)
Diffstat (limited to 'obfuscator/csv_reader.py')
| -rw-r--r-- | obfuscator/csv_reader.py | 97 |
1 files changed, 0 insertions, 97 deletions
```diff
diff --git a/obfuscator/csv_reader.py b/obfuscator/csv_reader.py
deleted file mode 100644
index 8f4ebea..0000000
--- a/obfuscator/csv_reader.py
+++ /dev/null
@@ -1,97 +0,0 @@
-import csv
-import io
-import boto3
-import os
-from typing import List, Dict
-from obfuscator.logger import get_logger
-from obfuscator.utils import get_s3_path
-
-# Create the logger
-logger = get_logger("CSVReader")
-
-# Putting the CSV reading components into a class may seem like overkill
-# for a simple script, but it allows for better organization and scalability.
-# @staticmethod is used to define the method without an instance of the class
-# being required. The methods could be defined just as functions, and this
-# may still be changed.
-
-
-class CSVReader:
-    """
-    A class to read CSV data from a local file, S3 object, or string. Near
-    the project completion, support for JSON/Parquet files will be added.
-    """
-
-    @staticmethod
-    def read_local(path) -> List[Dict[str, str]]:
-        """
-        A method to read a local CSV file and return the data as a list of
-        dictionaries.
-        """
-        # Log the path of the file being read for debugging
-        logger.debug(f"Reading local CSV from: {path}")
-
-        # Attempt to read the file and return the data as a list of dictionaries
-        # However, if the file isn't found or there is a generic exception, log
-        # the error and raise an exception
-        try:
-            with open(path, mode="r", encoding="utf-8") as f:
-                reader = csv.DictReader(f)
-                return [dict(row) for row in reader]
-        except FileNotFoundError:
-            logger.error(f"File not found: {path}")
-            raise
-        except Exception as e:
-            logger.error(f"Error reading file: {e}")
-
-    @staticmethod
-    def read_s3(path) -> List[Dict[str, str]]:
-        """
-        A method to read an S3 object containing CSV data
-        and return the data as a list of dictionaries.
-        """
-        bucket, key = get_s3_path(path)
-        logger.debug(f"Reading S3 CSV from: {bucket}/{key}")
-
-        # If DEBUG=TRUE, use the localstack endpoint for testing
-        if os.getenv("DEBUG", "FALSE").upper() == "TRUE":
-            localstack_endpoint = "http://localhost.localstack.cloud:4566"
-            logger.debug("Using LocalStack endpoint for S3")
-            client = boto3.client(
-                "s3",
-                endpoint_url=localstack_endpoint,
-                aws_access_key_id="dummy",
-                aws_secret_access_key="dummy",
-            )
-            logger.debug(f"endpoint_url: {localstack_endpoint}")
-        else:
-            client = boto3.client("s3")
-
-        try:
-            # Attempt to read the S3 object and return the data as a list of dictionaries
-            response = client.get_object(Bucket=bucket, Key=key)
-            logger.info("S3 object read successfully")
-            # Read and decode the content
-            content = response["Body"].read().decode("utf-8")
-            # Even though the read_string method was only created for testing,
-            # it can be reused here to read and return the CSV data
-            return CSVReader.read_string(content)
-        # TODO: Add more specific exceptions to catch
-        except Exception as e:
-            logger.error(f"Error reading S3 object: {e}")
-            raise
-
-    @staticmethod
-    def read_string(content: str) -> List[Dict[str, str]]:
-        """
-        A method to read CSV data from a string and return the data as a list
-        of dictionaries.
-        """
-        # If the content is empty, return an empty list
-        if not content.strip():
-            return []
-
-        # Treat the string as a file-like object and return as list of dictionaries
-        f = io.StringIO(content)
-        reader = csv.DictReader(f)
-        return [dict(row) for row in reader]
```
