72 lines
2.8 KiB
Python
Raw Normal View History

"""Test encoding detection error handling (PR #4071)."""
import os
import pickle
import sys
import tempfile
from unittest.mock import patch
import pytest
from unstructured.errors import UnprocessableEntityError
from unstructured.file_utils.encoding import detect_file_encoding
def test_charset_detection_failure():
    """Check that a failed charset detection raises a small, content-free error.

    A 1 MB file of bytes that are not valid UTF-8 is written to disk, the
    detector is patched to report no encoding, and the candidate encoding
    list is narrowed to ``utf_8`` only.  ``detect_file_encoding`` is then
    expected to raise ``UnprocessableEntityError`` whose message names the
    failure and which does not carry the file content along with it.
    """
    # 4 bytes repeated 250_000 times -> 1 MB, none of it decodable as UTF-8.
    undecodable_payload = b"\x80\x81\x82\x83" * 250_000
    size_limit = 10_000

    # delete=False keeps the file on disk after the handle is closed, so it
    # can be reopened by path; the ``finally`` below removes it.
    with tempfile.NamedTemporaryFile(mode="wb", delete=False) as tmp:
        tmp.write(undecodable_payload)
        tmp_path = tmp.name

    try:
        no_guess = {"encoding": None, "confidence": None}
        detector_patch = patch(
            "unstructured.file_utils.encoding.detect",
            return_value=no_guess,
        )
        # Only utf_8 is offered as a candidate, which will fail on this payload.
        candidates_patch = patch(
            "unstructured.file_utils.encoding.COMMON_ENCODINGS",
            ["utf_8"],
        )
        with detector_patch, candidates_patch, pytest.raises(UnprocessableEntityError) as caught:
            detect_file_encoding(filename=tmp_path)

        error = caught.value
        assert "Unable to determine file encoding" in str(error)

        # Ensure no .object attribute that would store file content (prevents memory bloat)
        # See: https://docs.python.org/3/library/exceptions.html#UnicodeError.object
        assert not hasattr(error, "object")

        # The error must stay lightweight regardless of file size.  Note that
        # sys.getsizeof is shallow; the pickled length is the measurement that
        # also covers whatever the exception references.
        shallow_size, pickled_size = sys.getsizeof(error), len(pickle.dumps(error))
        assert shallow_size < size_limit  # Small in-memory footprint
        assert pickled_size < size_limit  # Small serialization footprint
    finally:
        os.unlink(tmp_path)
def test_decode_failure():
    """Check that a failed decode raises a small, content-free error.

    The detector is patched to report ``utf-16`` with high confidence while
    the supplied bytes are malformed UTF-16.  ``detect_file_encoding`` is
    expected to raise ``UnprocessableEntityError`` whose message mentions the
    detected encoding and which does not carry the input bytes with it.
    """
    # Invalid UTF-16: a BOM, two complete 2-byte units, then one dangling
    # byte, so an odd number of bytes follows the BOM.
    bom = b"\xff\xfe"
    complete_units = b"A\x00B\x00"
    dangling_byte = b"\x00"
    malformed_utf16 = bom + complete_units + dangling_byte
    size_limit = 10_000

    confident_guess = {"encoding": "utf-16", "confidence": 0.95}
    detector_patch = patch(
        "unstructured.file_utils.encoding.detect",
        return_value=confident_guess,
    )
    with detector_patch, pytest.raises(UnprocessableEntityError) as caught:
        detect_file_encoding(file=malformed_utf16)

    error = caught.value
    assert "detected 'utf-16' but decode failed" in str(error)

    # Ensure no .object attribute that would store file content (prevents memory bloat)
    # See: https://docs.python.org/3/library/exceptions.html#UnicodeError.object
    assert not hasattr(error, "object")

    # The error must stay lightweight.  sys.getsizeof is shallow; the pickled
    # length also accounts for whatever the exception references.
    shallow_size, pickled_size = sys.getsizeof(error), len(pickle.dumps(error))
    assert shallow_size < size_limit  # Small in-memory footprint
    assert pickled_size < size_limit  # Small serialization footprint