unstructured/test_unstructured_ingest/unit/test_logger.py

import json

import pytest

from unstructured.ingest.logger import (
    default_is_data_sensitive,
    hide_sensitive_fields,
    redact_jsons,
)


@pytest.mark.parametrize(
    ("key", "value", "is_sensitive"),
    [
        ("username", "john_smith", False),
        ("password", "13?H%", True),
        ("token", "123", True),
        ("AWS_CREDENTIAL", "aws_credential", True),
        ("AWS_KEY", None, False),
    ],
)
def test_default_is_sensitive(key, value, is_sensitive):
    assert default_is_data_sensitive(key, value) == is_sensitive


def test_hide_sensitive_fields():
    d = {
        "username": "john_smith",
        "password": "13?H%",
        "inner": {
            "token": "123",
            "AWS_KEY": None,
            "inner_j_string": json.dumps(
                {"account_name": "secret name", "client_id": 123, "timestamp": 123}
            ),
        },
    }
    redacted_d = hide_sensitive_fields(d)
    expected_d = {
        "password": "*******",
        "username": "john_smith",
        "inner": {
            "token": "*******",
            "AWS_KEY": None,
            "inner_j_string": json.dumps(
                {"account_name": "*******", "client_id": "*******", "timestamp": 123}
            ),
        },
    }
    assert redacted_d == expected_d


def test_redact_jsons():
    d1 = {
        "username": "john_smith",
        "password": "13?H%",
        "inner": {
            "token": "123",
            "AWS_KEY": None,
            "inner_j_string": json.dumps(
                {"account_name": "secret name", "client_id": 123, "timestamp": 123}
            ),
        },
    }

    d2 = {"username": "tim67", "update_time": 456}
    d3 = {"account_name": "top secret", "host": "http://localhost:8888"}

    sensitive_string = f"Some topic secret info ({json.dumps(d1)} regarding {d2} and {d3})"
    expected_string = (
        'Some topic secret info ({"username": "john_smith", "password": "*******", '
        '"inner": {"token": "*******", "AWS_KEY": null, "inner_j_string": '
        '"{\\"account_name\\": \\"*******\\", \\"client_id\\": \\"*******\\", '
        '\\"timestamp\\": 123}"}} regarding {"username": "tim67", "update_time": 456} '
        'and {"account_name": "*******", "host": "http://localhost:8888"})'
    )
    redacted_string = redact_jsons(sensitive_string)
    assert redacted_string == expected_string