mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-07-03 15:11:30 +00:00

### Description Modify the logger used by ingest to leverage a new class inheriting from `logging.Formatter`, which adds middleware that updates the message being logged to omit any sensitive content. It does this by dynamically pulling out any valid JSON from the string being logged and running it through a `hide_sensitive_fields` method, which updates any values that are considered sensitive. It then replaces the original JSON strings with the `json.dumps` version of the new dictionary.
79 lines
2.3 KiB
Python
79 lines
2.3 KiB
Python
import json
|
|
|
|
import pytest
|
|
|
|
from unstructured.ingest.logger import (
|
|
default_is_data_sensitive,
|
|
hide_sensitive_fields,
|
|
redact_jsons,
|
|
)
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("key", "value", "is_sensitive"),
    [
        ("username", "john_smith", False),
        ("password", "13?H%", True),
        ("token", "123", True),
        ("AWS_CREDENTIAL", "aws_credential", True),
        ("AWS_KEY", None, False),
    ],
)
def test_default_is_sensitive(key, value, is_sensitive):
    """Check the verdict of `default_is_data_sensitive` for each (key, value) row above."""
    verdict = default_is_data_sensitive(key, value)
    assert verdict == is_sensitive
|
|
|
|
|
|
def test_hide_sensitive_fields():
    """Check that `hide_sensitive_fields` masks values in a nested dict.

    The input carries values at the top level, inside a nested dict, and inside
    a JSON-encoded string value; the expected result masks some of them with
    "*******" at every one of those levels.
    """
    # JSON document stored as a *string* value inside the nested dict.
    embedded_json = json.dumps(
        {"account_name": "secret name", "client_id": 123, "timestamp": 123}
    )
    payload = {
        "username": "john_smith",
        "password": "13?H%",
        "inner": {
            "token": "123",
            "AWS_KEY": None,
            "inner_j_string": embedded_json,
        },
    }

    masked_payload = hide_sensitive_fields(payload)

    # Same embedded document, re-serialized with the masked values.
    embedded_json_masked = json.dumps(
        {"account_name": "*******", "client_id": "*******", "timestamp": 123}
    )
    wanted = {
        "password": "*******",
        "username": "john_smith",
        "inner": {
            "token": "*******",
            "AWS_KEY": None,
            "inner_j_string": embedded_json_masked,
        },
    }
    assert masked_payload == wanted
|
|
|
|
|
|
def test_redact_jsons():
    """Check that `redact_jsons` masks values in objects embedded in free text.

    The message mixes one `json.dumps`-serialized object with two dicts
    interpolated directly by the f-string; the expected output shows all three
    rendered as JSON with the masked values replaced by "*******".
    """
    embedded_json = json.dumps(
        {"account_name": "secret name", "client_id": 123, "timestamp": 123}
    )
    nested_payload = {
        "username": "john_smith",
        "password": "13?H%",
        "inner": {
            "token": "123",
            "AWS_KEY": None,
            "inner_j_string": embedded_json,
        },
    }
    plain_payload = {"username": "tim67", "update_time": 456}
    account_payload = {"account_name": "top secret", "host": "http://localhost:8888"}

    # Only the first object goes through json.dumps; the other two are formatted
    # by the f-string itself.
    message = (
        f"Some topic secret info ({json.dumps(nested_payload)} "
        f"regarding {plain_payload} and {account_payload})"
    )
    wanted = (
        'Some topic secret info ({"username": "john_smith", "password": "*******", '
        '"inner": {"token": "*******", "AWS_KEY": null, "inner_j_string": '
        '"{\\"account_name\\": \\"*******\\", \\"client_id\\": \\"*******\\", '
        '\\"timestamp\\": 123}"}} regarding {"username": "tim67", "update_time": 456} '
        'and {"account_name": "*******", "host": "http://localhost:8888"})'
    )

    redacted_message = redact_jsons(message)
    assert redacted_message == wanted
|