1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
|
from gdpr_obfuscator import Obfuscator
import pytest
from moto import mock_aws
import boto3
import csv
import random
import json
import time
# Single Obfuscator instance shared by every test in this module.
obfuscator = Obfuscator()
def setup_s3(s3_client, bucket: str, key: str, content: str):
    """Create ``bucket`` in eu-west-2 and upload ``content`` under ``key``.

    Args:
        s3_client: Client object exposing ``create_bucket`` and ``put_object``.
        bucket: Name of the bucket to create.
        key: Object key the content is stored under.
        content: Body of the object to upload.
    """
    location = {"LocationConstraint": "eu-west-2"}
    s3_client.create_bucket(Bucket=bucket, CreateBucketConfiguration=location)
    s3_client.put_object(Body=content, Bucket=bucket, Key=key)
@pytest.fixture(autouse=True)
def s3_client():
    """Yield a boto3 S3 client for eu-west-2 created inside ``mock_aws``.

    The fixture is autouse, so the ``mock_aws`` context stays active for the
    duration of every test in this module.
    """
    with mock_aws():
        client = boto3.client("s3", region_name="eu-west-2")
        yield client
def test_imported_module_runs_successfully_with_local_data():
    """Check that a ``name`` value from the mock data is absent after obfuscation.

    The local CSV is uploaded to a mocked S3 bucket, ``process_s3`` is asked
    to obfuscate the ``name`` field, and the name of one randomly chosen row
    must not appear anywhere in the decoded output.
    """
    with mock_aws():
        s3 = boto3.client("s3", region_name="eu-west-2")
        bucket = "test-bucket"
        key = "data/mock.csv"

        # One open serves both purposes: the raw text is uploaded to S3 and,
        # after rewinding, the same handle is parsed into rows.
        with open("test/data/mock_data.csv", "r") as f:
            csv_content = f.read()
            f.seek(0)
            rows = list(csv.DictReader(f))

        # Fail with a clear message instead of an obscure "empty range" error
        # from the random selection when the data file has no rows.
        assert rows, "test/data/mock_data.csv contains no data rows"
        rand_name = random.choice(rows)["name"]

        setup_s3(s3, bucket, key, csv_content)
        path = f"s3://{bucket}/{key}"
        json_input = json.dumps({"file_path": path, "pii_fields": ["name"]})

        result = obfuscator.process_s3(json_input)
        result_str = result.decode("utf-8")

        # The row is random, so name the offending value to make a failure
        # reproducible.
        assert rand_name not in result_str, (
            f"name {rand_name!r} is still present in the obfuscated output"
        )
def test_imported_module_completes_in_under_one_minute():
    """Check that ``process_s3`` handles the large dataset in under 60 seconds.

    The large CSV is uploaded to a mocked S3 bucket and only the
    ``process_s3`` call itself is timed; file reading and bucket setup are
    excluded from the measurement.
    """
    with mock_aws():
        s3 = boto3.client("s3", region_name="eu-west-2")
        bucket = "test-bucket"
        key = "data/large_dataset.csv"

        with open("test/data/large_dataset.csv", "r") as f:
            csv_content = f.read()

        setup_s3(s3, bucket, key, csv_content)
        path = f"s3://{bucket}/{key}"
        json_input = json.dumps(
            {"file_path": path, "pii_fields": ["full_name", "email_address"]}
        )

        # perf_counter is monotonic, so a system clock adjustment during the
        # run cannot distort the measured duration (time.time() can jump).
        start = time.perf_counter()
        obfuscator.process_s3(json_input)
        elapsed = time.perf_counter() - start

        assert elapsed < 60, f"process_s3 took {elapsed:.2f}s (limit: 60s)"
def test_output_compatible_with_s3_put_object():
    """Check that the output of ``process_s3`` can be stored with ``put_object``.

    The obfuscated result is uploaded to a mocked bucket under a new key,
    the upload must report HTTP 200, and reading the object back must return
    exactly the same content.
    """
    with mock_aws():
        s3 = boto3.client("s3", region_name="eu-west-2")
        bucket = "test-bucket"
        key = "data/mock.csv"
        output_key = "data/obfuscated.csv"

        with open("test/data/mock_data.csv", "r") as f:
            csv_content = f.read()

        setup_s3(s3, bucket, key, csv_content)
        path = f"s3://{bucket}/{key}"
        json_input = json.dumps({"file_path": path, "pii_fields": ["name"]})
        result_bytes = obfuscator.process_s3(json_input)

        # Guard only the upload. Keeping the assertions outside the try block
        # stops an AssertionError from being caught and misreported as a
        # put_object incompatibility.
        try:
            response = s3.put_object(Bucket=bucket, Key=output_key, Body=result_bytes)
        except Exception as e:
            pytest.fail(f"put_object did not like the output from process_s3: {e}")

        assert response["ResponseMetadata"]["HTTPStatusCode"] == 200

        get_response = s3.get_object(Bucket=bucket, Key=output_key)
        retrieved_content = get_response["Body"].read()
        assert retrieved_content == result_bytes
|