mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-08-12 02:39:37 +00:00
72 lines
2.8 KiB
Python
72 lines
2.8 KiB
Python
![]() |
"""Test encoding detection error handling (PR #4071)."""
|
||
|
|
||
|
import os
|
||
|
import pickle
|
||
|
import sys
|
||
|
import tempfile
|
||
|
from unittest.mock import patch
|
||
|
|
||
|
import pytest
|
||
|
|
||
|
from unstructured.errors import UnprocessableEntityError
|
||
|
from unstructured.file_utils.encoding import detect_file_encoding
|
||
|
|
||
|
|
||
|
def test_charset_detection_failure():
|
||
|
"""Test encoding detection failure with memory safety checks."""
|
||
|
large_data = b"\x80\x81\x82\x83" * 250_000 # 1MB of invalid UTF-8
|
||
|
|
||
|
with tempfile.NamedTemporaryFile(mode="wb", delete=False) as f:
|
||
|
f.write(large_data)
|
||
|
temp_file_path = f.name
|
||
|
|
||
|
try:
|
||
|
detect_result = {"encoding": None, "confidence": None}
|
||
|
with patch("unstructured.file_utils.encoding.detect", return_value=detect_result):
|
||
|
with patch("unstructured.file_utils.encoding.COMMON_ENCODINGS", ["utf_8"]): # Will fail
|
||
|
with pytest.raises(UnprocessableEntityError) as exc_info:
|
||
|
detect_file_encoding(filename=temp_file_path)
|
||
|
|
||
|
exception = exc_info.value
|
||
|
|
||
|
assert "Unable to determine file encoding" in str(exception)
|
||
|
|
||
|
# Ensure no .object attribute that would store file content (prevents memory bloat)
|
||
|
# See: https://docs.python.org/3/library/exceptions.html#UnicodeError.object
|
||
|
assert not hasattr(exception, "object")
|
||
|
|
||
|
# Exception should be lightweight regardless of file size
|
||
|
exception_memory = sys.getsizeof(exception)
|
||
|
serialized_size = len(pickle.dumps(exception))
|
||
|
|
||
|
assert exception_memory < 10_000 # Small in-memory footprint
|
||
|
assert serialized_size < 10_000 # Small serialization footprint
|
||
|
finally:
|
||
|
os.unlink(temp_file_path)
|
||
|
|
||
|
|
||
|
def test_decode_failure():
|
||
|
"""Test decode failure with memory safety checks."""
|
||
|
# Invalid UTF-16: BOM followed by odd number of bytes
|
||
|
invalid_utf16 = b"\xff\xfe" + b"A\x00B\x00" + b"\x00"
|
||
|
|
||
|
detect_result = {"encoding": "utf-16", "confidence": 0.95}
|
||
|
with patch("unstructured.file_utils.encoding.detect", return_value=detect_result):
|
||
|
with pytest.raises(UnprocessableEntityError) as exc_info:
|
||
|
detect_file_encoding(file=invalid_utf16)
|
||
|
|
||
|
exception = exc_info.value
|
||
|
|
||
|
assert "detected 'utf-16' but decode failed" in str(exception)
|
||
|
|
||
|
# Ensure no .object attribute that would store file content (prevents memory bloat)
|
||
|
# See: https://docs.python.org/3/library/exceptions.html#UnicodeError.object
|
||
|
assert not hasattr(exception, "object")
|
||
|
|
||
|
# Exception should be lightweight
|
||
|
exception_memory = sys.getsizeof(exception)
|
||
|
serialized_size = len(pickle.dumps(exception))
|
||
|
|
||
|
assert exception_memory < 10_000 # Small in-memory footprint
|
||
|
assert serialized_size < 10_000 # Small serialization footprint
|