unstructured/test_unstructured/file_utils/test_encoding.py

"""Test encoding detection error handling (PR #4071)."""

import os
import pickle
import sys
import tempfile
from unittest.mock import patch

import pytest

from unstructured.errors import UnprocessableEntityError
from unstructured.file_utils.encoding import detect_file_encoding


def test_charset_detection_failure():
    """Verify that a failed charset detection raises a small, content-free error.

    A 1 MB file of bytes that are not valid UTF-8 is written to disk. ``detect``
    is patched to report no encoding and ``COMMON_ENCODINGS`` is narrowed to
    ``utf_8``, which cannot decode this data. The ``UnprocessableEntityError``
    that ``detect_file_encoding`` raises must describe the failure without
    holding on to the file content.
    """
    large_data = b"\x80\x81\x82\x83" * 250_000  # 1 MB of invalid UTF-8

    # delete=False so the file can be reopened by path after this handle closes;
    # it is removed explicitly in the ``finally`` below.
    with tempfile.NamedTemporaryFile(mode="wb", delete=False) as f:
        f.write(large_data)
        temp_file_path = f.name

    detect_patch = patch(
        "unstructured.file_utils.encoding.detect",
        return_value={"encoding": None, "confidence": None},
    )
    encodings_patch = patch("unstructured.file_utils.encoding.COMMON_ENCODINGS", ["utf_8"])

    try:
        with detect_patch, encodings_patch:
            with pytest.raises(UnprocessableEntityError) as exc_info:
                detect_file_encoding(filename=temp_file_path)
    finally:
        os.unlink(temp_file_path)

    exception = exc_info.value

    assert "Unable to determine file encoding" in str(exception)

    # Ensure no .object attribute that would store file content (prevents memory bloat)
    # See: https://docs.python.org/3/library/exceptions.html#UnicodeError.object
    assert not hasattr(exception, "object")

    # sys.getsizeof() is shallow: it ignores everything the exception refers to.
    # Add the objects held in ``args`` and in the instance ``__dict__`` so that a
    # payload stored there would actually push the measurement over the limit.
    held_objects = (*exception.args, *vars(exception).values())
    exception_memory = sys.getsizeof(exception) + sum(
        sys.getsizeof(item) for item in held_objects
    )
    serialized_size = len(pickle.dumps(exception))

    # Exception should be lightweight regardless of file size
    assert exception_memory < 10_000  # Small in-memory footprint
    assert serialized_size < 10_000  # Small serialization footprint


def test_decode_failure():
    """Verify that a decode failure after detection raises a small, content-free error.

    ``detect`` is patched to report ``utf-16`` with high confidence for a byte
    string that cannot be decoded as UTF-16. The ``UnprocessableEntityError``
    that ``detect_file_encoding`` raises must name the detected encoding without
    holding on to the input bytes.
    """
    # Invalid UTF-16: BOM followed by odd number of bytes
    invalid_utf16 = b"\xff\xfe" + b"A\x00B\x00" + b"\x00"

    detect_patch = patch(
        "unstructured.file_utils.encoding.detect",
        return_value={"encoding": "utf-16", "confidence": 0.95},
    )
    with detect_patch, pytest.raises(UnprocessableEntityError) as exc_info:
        detect_file_encoding(file=invalid_utf16)

    exception = exc_info.value

    assert "detected 'utf-16' but decode failed" in str(exception)

    # Ensure no .object attribute that would store file content (prevents memory bloat)
    # See: https://docs.python.org/3/library/exceptions.html#UnicodeError.object
    assert not hasattr(exception, "object")

    # sys.getsizeof() is shallow: it ignores everything the exception refers to.
    # Add the objects held in ``args`` and in the instance ``__dict__`` so that a
    # payload stored there would actually push the measurement over the limit.
    held_objects = (*exception.args, *vars(exception).values())
    exception_memory = sys.getsizeof(exception) + sum(
        sys.getsizeof(item) for item in held_objects
    )
    serialized_size = len(pickle.dumps(exception))

    # Exception should be lightweight
    assert exception_memory < 10_000  # Small in-memory footprint
    assert serialized_size < 10_000  # Small serialization footprint