mirror of
				https://github.com/Unstructured-IO/unstructured.git
				synced 2025-10-31 10:03:07 +00:00 
			
		
		
		
	 7ad8e88a95
			
		
	
	
		7ad8e88a95
		
			
		
	
	
	
	
		
			
			### Description Modify the logger used by ingest to leverage a new class that inherits from `logging.Formatter` and adds middleware that updates the message being logged to omit any sensitive content. It does this by dynamically pulling out any valid JSON from the string being logged and running it through a `hide_sensitive_fields` method, which updates any values that are considered sensitive. It then replaces the original JSON strings with the `json.dumps` version of the new dictionary.
		
			
				
	
	
		
			79 lines
		
	
	
		
			2.3 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			79 lines
		
	
	
		
			2.3 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| import json
 | |
| 
 | |
| import pytest
 | |
| 
 | |
| from unstructured.ingest.logger import (
 | |
|     default_is_data_sensitive,
 | |
|     hide_sensitive_fields,
 | |
|     redact_jsons,
 | |
| )
 | |
| 
 | |
| 
 | |
@pytest.mark.parametrize(
    ("key", "value", "is_sensitive"),
    [
        ("username", "john_smith", False),
        ("password", "13?H%", True),
        ("token", "123", True),
        ("AWS_CREDENTIAL", "aws_credential", True),
        ("AWS_KEY", None, False),
    ],
)
def test_default_is_sensitive(key, value, is_sensitive):
    """Check the sensitivity verdict for each parametrized key/value pair.

    Each case supplies a field name, its value, and the boolean that
    ``default_is_data_sensitive`` is expected to return for that pair.
    """
    verdict = default_is_data_sensitive(key, value)
    assert verdict == is_sensitive
 | |
| 
 | |
| 
 | |
def test_hide_sensitive_fields():
    """Check that ``hide_sensitive_fields`` masks values at every nesting level.

    The input holds sensitive entries at the top level, inside a nested dict,
    and inside a JSON-encoded string stored as a value; the expected output
    has each of those replaced by ``"*******"`` and everything else untouched.
    """
    # JSON document embedded as a string value, before and after masking.
    raw_inner_json = json.dumps(
        {"account_name": "secret name", "client_id": 123, "timestamp": 123}
    )
    masked_inner_json = json.dumps(
        {"account_name": "*******", "client_id": "*******", "timestamp": 123}
    )

    payload = {
        "username": "john_smith",
        "password": "13?H%",
        "inner": {
            "token": "123",
            "AWS_KEY": None,
            "inner_j_string": raw_inner_json,
        },
    }
    expected = {
        "username": "john_smith",
        "password": "*******",
        "inner": {
            "token": "*******",
            "AWS_KEY": None,
            "inner_j_string": masked_inner_json,
        },
    }

    assert hide_sensitive_fields(payload) == expected
 | |
| 
 | |
| 
 | |
def test_redact_jsons():
    """Check that ``redact_jsons`` masks sensitive values embedded in free text.

    The message mixes one ``json.dumps``-encoded dict with two dicts rendered
    through f-string interpolation (i.e. their Python ``repr``); the expected
    result has every dict emitted as JSON with sensitive values masked.
    """
    nested_secret = json.dumps(
        {"account_name": "secret name", "client_id": 123, "timestamp": 123}
    )
    credentials = {
        "username": "john_smith",
        "password": "13?H%",
        "inner": {
            "token": "123",
            "AWS_KEY": None,
            "inner_j_string": nested_secret,
        },
    }
    harmless = {"username": "tim67", "update_time": 456}
    account = {"account_name": "top secret", "host": "http://localhost:8888"}

    # Only the first dict is serialized with json.dumps; the other two are
    # deliberately interpolated as-is so they appear in Python repr form.
    credentials_json = json.dumps(credentials)
    message = f"Some topic secret info ({credentials_json} regarding {harmless} and {account})"

    expected_message = (
        'Some topic secret info ({"username": "john_smith", "password": "*******", '
        '"inner": {"token": "*******", "AWS_KEY": null, "inner_j_string": '
        '"{\\"account_name\\": \\"*******\\", \\"client_id\\": \\"*******\\", '
        '\\"timestamp\\": 123}"}} regarding {"username": "tim67", "update_time": 456} '
        'and {"account_name": "*******", "host": "http://localhost:8888"})'
    )

    assert redact_jsons(message) == expected_message
 |