Roman Isecke 7ad8e88a95
feat: leverage logger to hide sensitive data in ingest logs (#2175)
### Description
Modify the logger being used by ingest to leverage a new class
implemented inheriting `logging.Formatter` which adds in some middleware
to update the message being logged to omit any sensitive content. It
does this by dynamically pulled out any valid json from the string being
logged and runs that through a `hide_sensitive_fields` method which
updated any values that are considered sensitive. Replaces the original
json strings with the `json.dumps` version of the new dictionary.
2023-11-29 18:16:23 +00:00

79 lines
2.3 KiB
Python

import json
import pytest
from unstructured.ingest.logger import (
default_is_data_sensitive,
hide_sensitive_fields,
redact_jsons,
)
@pytest.mark.parametrize(
("key", "value", "is_sensitive"),
[
("username", "john_smith", False),
("password", "13?H%", True),
("token", "123", True),
("AWS_CREDENTIAL", "aws_credential", True),
("AWS_KEY", None, False),
],
)
def test_default_is_sensitive(key, value, is_sensitive):
assert default_is_data_sensitive(key, value) == is_sensitive
def test_hide_sensitive_fields():
d = {
"username": "john_smith",
"password": "13?H%",
"inner": {
"token": "123",
"AWS_KEY": None,
"inner_j_string": json.dumps(
{"account_name": "secret name", "client_id": 123, "timestamp": 123}
),
},
}
redacted_d = hide_sensitive_fields(d)
expected_d = {
"password": "*******",
"username": "john_smith",
"inner": {
"token": "*******",
"AWS_KEY": None,
"inner_j_string": json.dumps(
{"account_name": "*******", "client_id": "*******", "timestamp": 123}
),
},
}
assert redacted_d == expected_d
def test_redact_jsons():
d1 = {
"username": "john_smith",
"password": "13?H%",
"inner": {
"token": "123",
"AWS_KEY": None,
"inner_j_string": json.dumps(
{"account_name": "secret name", "client_id": 123, "timestamp": 123}
),
},
}
d2 = {"username": "tim67", "update_time": 456}
d3 = {"account_name": "top secret", "host": "http://localhost:8888"}
sensitive_string = f"Some topic secret info ({json.dumps(d1)} regarding {d2} and {d3})"
expected_string = (
'Some topic secret info ({"username": "john_smith", "password": "*******", '
'"inner": {"token": "*******", "AWS_KEY": null, "inner_j_string": '
'"{\\"account_name\\": \\"*******\\", \\"client_id\\": \\"*******\\", '
'\\"timestamp\\": 123}"}} regarding {"username": "tim67", "update_time": 456} '
'and {"account_name": "*******", "host": "http://localhost:8888"})'
)
redacted_string = redact_jsons(sensitive_string)
assert redacted_string == expected_string