about | summary | refs | log | tree | commit | diff | stats
path: root/obfuscator/read.py
diff options
context:
space:
mode:
author: Alex <git@ajschof.me> 2025-02-19 15:58:28 +0000
committer: GitHub <noreply@github.com> 2025-02-19 15:58:28 +0000
commit: 4066bf747e1e4c938526957c119f3f1485ee251e (patch)
tree: 9a1e95f4ccbdd04e19d67a6c13641a19c4d4f3e0 /obfuscator/read.py
parent: f24955044c4c05e37aba4efb505ec63b44113912 (diff)
parent: 5402af2c7198a685a57a05e29a869e1e72a6b877 (diff)
download: gdpr-obfuscator-4066bf747e1e4c938526957c119f3f1485ee251e.tar.gz
download: gdpr-obfuscator-4066bf747e1e4c938526957c119f3f1485ee251e.zip
Merge pull request #8 from ajschofield/refining-phase
mostly minor changes (fixing things up)
Diffstat (limited to 'obfuscator/read.py')
-rw-r--r-- obfuscator/read.py 89
1 files changed, 89 insertions, 0 deletions
diff --git a/obfuscator/read.py b/obfuscator/read.py
new file mode 100644
index 0000000..b704643
--- /dev/null
+++ b/obfuscator/read.py
@@ -0,0 +1,89 @@
+import csv
+import io
+import boto3
+import os
+from typing import List, Dict
+from obfuscator.logger import get_logger
+from obfuscator.utils import Utilities
+
+
class DataReader:
    """
    Read CSV data from a local file, an S3 object, or a string.

    Each ``read_*`` method returns the parsed rows as a list of dictionaries
    produced by ``csv.DictReader``. Near the project completion, support for
    JSON/Parquet files will be added.
    """

    def __init__(self, log_level=None):
        """
        Remember the log level and create this reader's logger.

        Args:
            log_level: Forwarded to ``get_logger`` here and to ``Utilities``
                in ``read_s3``.
        """
        self.log_level = log_level
        self.logger = get_logger("CSVREADER", log_level)

    @staticmethod
    def _rows(stream) -> List[Dict[str, str]]:
        """Parse an open text stream of CSV data into a list of row dicts."""
        return [dict(row) for row in csv.DictReader(stream)]

    def read_local(self, path) -> List[Dict[str, str]]:
        """
        Read a local CSV file and return the data as a list of dictionaries.

        Args:
            path: Location of the CSV file; it is opened as UTF-8 text.

        Returns:
            One dictionary per data row, as produced by ``csv.DictReader``.

        Raises:
            FileNotFoundError: If ``path`` does not exist.
            Exception: Any other error raised while opening, decoding or
                parsing the file is logged and then re-raised.
        """
        self.logger.debug(f"Reading local CSV from: {path}")

        try:
            # newline="" hands line-ending handling to the csv module, so
            # newlines embedded in quoted fields are preserved unchanged.
            with open(path, mode="r", encoding="utf-8", newline="") as f:
                return self._rows(f)
        except FileNotFoundError:
            self.logger.error(f"File not found: {path}")
            raise
        except Exception as e:
            self.logger.error(f"Error reading file: {e}")
            # Re-raise instead of silently returning None, so callers never
            # receive a non-list result (matches the behaviour of read_s3).
            raise

    def read_s3(self, path) -> List[Dict[str, str]]:
        """
        Read an S3 object containing CSV data and return the data as a list
        of dictionaries.

        When the ``LOCALSTACK`` environment variable equals ``TRUE``
        (case-insensitive), the S3 client targets the LocalStack endpoint
        with dummy credentials; otherwise a default ``boto3`` S3 client is
        used.

        Args:
            path: Passed to ``Utilities.get_s3_path`` to obtain the bucket
                and key (presumably an ``s3://bucket/key`` style location;
                confirm against ``Utilities``).

        Returns:
            One dictionary per data row, as produced by ``read_string``.

        Raises:
            Exception: Every failure (missing key, client error, decode
                error, or anything else) is logged and re-raised unchanged.
        """
        utils = Utilities(self.log_level)
        bucket, key = utils.get_s3_path(path)
        self.logger.debug(f"Reading S3 CSV from: {bucket}/{key}")

        if os.getenv("LOCALSTACK", "FALSE").upper() == "TRUE":
            localstack_endpoint = "http://localhost.localstack.cloud:4566"
            self.logger.debug(
                "Using LocalStack endpoint for S3 - ensure LocalStack is running"
            )
            client = boto3.client(
                "s3",
                endpoint_url=localstack_endpoint,
                aws_access_key_id="dummy",
                aws_secret_access_key="dummy",
            )
            self.logger.debug(f"endpoint_url: {localstack_endpoint}")
        else:
            client = boto3.client("s3")

        try:
            response = client.get_object(Bucket=bucket, Key=key)
            content = response["Body"].read().decode("utf-8")
            # Report success only once the body has been fetched and decoded.
            self.logger.info("S3 object read successfully")
            return self.read_string(content)
        except client.exceptions.NoSuchKey:
            self.logger.error(f"Object not found: {bucket}/{key}")
            raise
        except client.exceptions.ClientError as e:
            self.logger.error(f"Error reading S3 object: {e}")
            raise
        except UnicodeDecodeError as e:
            self.logger.error(f"Error decoding S3 object: {e}")
            raise
        except Exception as e:
            self.logger.error(f"Error reading S3 object: {e}")
            raise

    def read_string(self, content: str) -> List[Dict[str, str]]:
        """
        Read CSV data from a string and return the data as a list of
        dictionaries.

        Args:
            content: CSV text whose first line is the header row.

        Returns:
            One dictionary per data row; an empty list when ``content`` is
            ``None``, empty, or contains only whitespace.
        """
        if content is None or not content.strip():
            return []

        return self._rows(io.StringIO(content))
git.ajschof.me — hosted by ajschofield — powered by cgit