1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
|
import csv
import io
import boto3
from typing import List, Dict
from .utils import Utilities
class FileHandler:
    """
    Read CSV data from a local file, an S3 object, or a string.

    Every reader returns the rows as a list of dictionaries keyed by the
    CSV header. Currently only CSV is supported; support for JSON and
    Parquet files may be added in the future.
    """

    def __init__(self):
        """
        Initialise the FileHandler with a Utilities instance.
        """
        self.utils = Utilities()

    def read_local(self, file_path) -> List[Dict[str, str]]:
        """
        Read a local CSV file and return the data as a list of dictionaries.

        The file is opened with the built-in `open` as UTF-8 text and its
        whole content is handed to `read_string` for parsing. There is no
        logic to convert file paths between operating systems since
        `read_s3` is the main method to be used, so this method will only
        work reliably on MacOS and Linux systems.

        Args:
            file_path: Local path to the CSV file (anything `open` accepts).

        Returns:
            List[Dict[str, str]]: A list of dictionaries representing the CSV data rows

        Raises:
            OSError: If the file cannot be opened (e.g. it does not exist).
            UnicodeDecodeError: If the file is not valid UTF-8.
        """
        # newline="" disables newline translation, as the csv module
        # recommends: line breaks inside quoted fields are kept verbatim, so
        # a file yields the same rows whether read locally or from S3.
        with open(file_path, mode="r", encoding="utf-8", newline="") as f:
            return self.read_string(f.read())

    def read_s3(self, file_path: str) -> List[Dict[str, str]]:
        """
        Read a CSV file within an S3 bucket and return the data as a list of dictionaries.

        The S3 URI should be in the format "s3://bucket/key". It is split
        into bucket and key by `Utilities.get_s3_path`, the object is fetched
        with boto3's `get_object`, decoded as UTF-8 and parsed with
        `read_string`.

        Args:
            file_path (str): The S3 URI of the CSV file

        Returns:
            List[Dict[str, str]]: A list of dictionaries representing the CSV data rows

        Raises:
            ValueError: If the key or bucket does not exist, if S3 reports
                any other client error, or if the object body cannot be
                read or is not valid UTF-8. The underlying exception is
                attached as `__cause__`.
        """
        bucket, key = self.utils.get_s3_path(file_path)
        client = boto3.client("s3")

        # The specific handlers are listed before the generic ClientError
        # handler so they get the more precise message.
        try:
            response = client.get_object(Bucket=bucket, Key=key)
        except client.exceptions.NoSuchKey as e:
            raise ValueError(f"File not found in S3 bucket: {bucket}/{key}") from e
        except client.exceptions.NoSuchBucket as e:
            raise ValueError(f"Bucket not found in S3: {bucket}") from e
        except client.exceptions.ClientError as e:
            raise ValueError(f"Error accessing S3: {e}") from e

        try:
            content = response["Body"].read().decode("utf-8")
        except UnicodeDecodeError as e:
            raise ValueError("File is not UTF-8 encoded or malformed") from e
        except Exception as e:
            # Boundary handler: any failure while streaming the body is
            # reported to callers as a ValueError, like the errors above.
            raise ValueError(f"Error reading file from S3: {e}") from e

        return self.read_string(content)

    @staticmethod
    def read_string(content: str) -> List[Dict[str, str]]:
        """
        Parse raw CSV text provided by the read helpers into a list of dictionaries.

        The first row is used as the header. A leading UTF-8 byte order mark
        is ignored, and "\\n", "\\r\\n" and "\\r" are all accepted as row
        terminators. If the provided string is empty or contains only
        whitespace, an empty list is returned.

        Args:
            content (str): The raw CSV data as a string

        Returns:
            List[Dict[str, str]]: A list of dictionaries representing the CSV data rows
        """
        # Drop a leading BOM so it does not end up inside the first header
        # name (e.g. "\ufeffid" instead of "id").
        if content.startswith("\ufeff"):
            content = content[1:]

        if not content.strip():
            return []

        # newline="" makes StringIO split on any of \n, \r\n or \r without
        # rewriting them, which is the input the csv module expects; the
        # default would feed CR-only data to the parser as one single line.
        buffer = io.StringIO(content, newline="")
        reader = csv.DictReader(buffer)
        return [dict(row) for row in reader]
|