Nick Franck b8c14a7a4f
fix: replace UnicodeDecodeError to prevent large payload logging (#4071)
Replace UnicodeDecodeError with UnprocessableEntityError in encoding
detection to avoid logging entire file contents.
UnicodeDecodeError.object automatically stores complete input data,
causing memory issues with large files in logging and error reporting
systems.
2025-07-25 21:01:37 +00:00
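For context, a minimal sketch (not part of the commit) of the behavior described in the message above: decoding a large invalid payload raises a UnicodeDecodeError whose .object attribute holds the entire input, while a plain exception type carries only its message. The UnprocessableEntityError class below is a local stand-in for unstructured.errors.UnprocessableEntityError, not the library's definition.

# Sketch only: why UnicodeDecodeError is heavy for large inputs.
large_payload = b"\x80" * 1_000_000  # 1 MB of bytes that are not valid UTF-8

try:
    large_payload.decode("utf-8")
except UnicodeDecodeError as exc:
    # The exception keeps the entire input on exc.object, so any logger or
    # error-reporting system that serializes it also captures the full 1 MB.
    assert len(exc.object) == len(large_payload)


class UnprocessableEntityError(Exception):  # stand-in for unstructured.errors
    """Carries only a message; no reference to the offending payload."""


lightweight = UnprocessableEntityError("Unable to determine file encoding")
assert not hasattr(lightweight, "object")
assert len(str(lightweight)) < 100  # size is independent of the input data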

"""Test encoding detection error handling (PR #4071)."""
import os
import pickle
import sys
import tempfile
from unittest.mock import patch
import pytest
from unstructured.errors import UnprocessableEntityError
from unstructured.file_utils.encoding import detect_file_encoding
def test_charset_detection_failure():
"""Test encoding detection failure with memory safety checks."""
large_data = b"\x80\x81\x82\x83" * 250_000 # 1MB of invalid UTF-8
with tempfile.NamedTemporaryFile(mode="wb", delete=False) as f:
f.write(large_data)
temp_file_path = f.name
try:
detect_result = {"encoding": None, "confidence": None}
with patch("unstructured.file_utils.encoding.detect", return_value=detect_result):
with patch("unstructured.file_utils.encoding.COMMON_ENCODINGS", ["utf_8"]): # Will fail
with pytest.raises(UnprocessableEntityError) as exc_info:
detect_file_encoding(filename=temp_file_path)
exception = exc_info.value
assert "Unable to determine file encoding" in str(exception)
# Ensure no .object attribute that would store file content (prevents memory bloat)
# See: https://docs.python.org/3/library/exceptions.html#UnicodeError.object
assert not hasattr(exception, "object")
# Exception should be lightweight regardless of file size
exception_memory = sys.getsizeof(exception)
serialized_size = len(pickle.dumps(exception))
assert exception_memory < 10_000 # Small in-memory footprint
assert serialized_size < 10_000 # Small serialization footprint
finally:
os.unlink(temp_file_path)
def test_decode_failure():
"""Test decode failure with memory safety checks."""
# Invalid UTF-16: BOM followed by odd number of bytes
invalid_utf16 = b"\xff\xfe" + b"A\x00B\x00" + b"\x00"
detect_result = {"encoding": "utf-16", "confidence": 0.95}
with patch("unstructured.file_utils.encoding.detect", return_value=detect_result):
with pytest.raises(UnprocessableEntityError) as exc_info:
detect_file_encoding(file=invalid_utf16)
exception = exc_info.value
assert "detected 'utf-16' but decode failed" in str(exception)
# Ensure no .object attribute that would store file content (prevents memory bloat)
# See: https://docs.python.org/3/library/exceptions.html#UnicodeError.object
assert not hasattr(exception, "object")
# Exception should be lightweight
exception_memory = sys.getsizeof(exception)
serialized_size = len(pickle.dumps(exception))
assert exception_memory < 10_000 # Small in-memory footprint
assert serialized_size < 10_000 # Small serialization footprint
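Assuming the file lives somewhere under the repository's test tree (the path below is hypothetical), the tests can be run directly with pytest:

python -m pytest test_unstructured/file_utils/test_encoding_errors.py -v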