aboutsummaryrefslogtreecommitdiffstats
path: root/gdpr_obfuscator/__init__.py
blob: f5440a66e66cc0cfec6a8573fd6879add3ff308c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
from .read import FileHandler
from .obfuscate import obfuscate_data
from .utils import Utilities


class Obfuscator:
    """
    Provides the functionality to obfuscate the sensitive PII data in CSV files.

    It integrates the file handler to load the CSV data and the obfuscation logic to
    replace the PII fields with obfuscated values. The obfuscated data is then returned
    as a byte stream.

    The input is expected to be a JSON string containing the file path and the PII fields,
    and the user should utilise either the `process_s3` or `process_local` methods to
    obfuscate the data, depending on the file location.
    """

    def __init__(self):
        """
        Initialise the Obfuscator with a FileHandler and Utilities instance.
        """
        self.reader = FileHandler()
        self.utils = Utilities()

    def process_s3(self, input: str) -> bytes:
        """
        Process a CSV file stored in an S3 bucket and obfuscate the PII fields.

        The method expects a JSON string input that contains:
        - "file_path": an S3 URI (e.g., "s3://bucket/key")
        - "pii_fields": a list of column names that contain PII

        Args:
            input (str): A JSON string containing the S3 URI and PII fields, respectively

        Returns:
            bytes: The obfuscated CSV data as a byte stream
        """

        path, pii_fields = self.utils.process_json_input(input)
        obfuscated_data = obfuscate_data(self.reader.read_s3(path), pii_fields)
        return self.utils.create_byte_stream(obfuscated_data)

    def process_local(self, input: str) -> bytes:
        """
        Process a CSV file stored locally and obfuscate the PII fields.

        The method expects a JSON string input that contains:
        - "file_path": a local file path
        - "pii_fields": a list of column names that contain PII

        NOTE: Since the scope of the project is to obfuscate data stored in S3, this method
        has only been tested to work on MacOS and Linux systems.

        Args:
            input (str): A JSON string containing the local file path and PII fields, respectively

        Returns:
            bytes: The obfuscated CSV data as a byte stream
        """

        path, pii_fields = self.utils.process_json_input(input)
        obfuscated_data = obfuscate_data(self.reader.read_local(path), pii_fields)
        return self.utils.create_byte_stream(obfuscated_data)
git.ajschof.me — hosted by ajschofield — powered by cgit