mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-25 14:14:30 +00:00
fix: replace UnicodeDecodeError to prevent large payload logging (#4071)
Replace UnicodeDecodeError with UnprocessableEntityError in encoding detection to avoid logging entire file contents. The `object` attribute of a UnicodeDecodeError holds the complete input data, which causes memory issues in logging and error reporting systems when large files are processed.
This commit is contained in:
parent
591729c0b8
commit
b8c14a7a4f
@ -1,3 +1,12 @@
|
||||
## 0.18.12
|
||||
|
||||
### Enhancements
|
||||
|
||||
### Features
|
||||
|
||||
### Fixes
|
||||
- **Prevent large file content in encoding exceptions** Replace UnicodeDecodeError with UnprocessableEntityError in encoding detection to avoid storing entire file content in exception objects, which can cause issues in logging and error reporting systems when processing large files.
|
||||
|
||||
## 0.18.11
|
||||
|
||||
### Enhancements
|
||||
|
||||
71
test_unstructured/file_utils/test_encoding.py
Normal file
71
test_unstructured/file_utils/test_encoding.py
Normal file
@ -0,0 +1,71 @@
|
||||
"""Test encoding detection error handling (PR #4071)."""
|
||||
|
||||
import os
|
||||
import pickle
|
||||
import sys
|
||||
import tempfile
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
|
||||
from unstructured.errors import UnprocessableEntityError
|
||||
from unstructured.file_utils.encoding import detect_file_encoding
|
||||
|
||||
|
||||
def test_charset_detection_failure():
    """Check that an undetectable encoding raises a small, content-free error.

    `detect` is patched to report no encoding and `COMMON_ENCODINGS` is patched
    down to UTF-8 only, which cannot decode the payload written to the temp file.
    The resulting `UnprocessableEntityError` must not expose an `object`
    attribute and must stay small both in memory and when pickled.
    """
    # 1MB of invalid UTF-8
    payload = b"\x80\x81\x82\x83" * 250_000

    with tempfile.NamedTemporaryFile(mode="wb", delete=False) as tmp:
        tmp.write(payload)
        temp_file_path = tmp.name

    try:
        # The patchers are built here but only take effect inside the `with` below.
        no_detection = patch(
            "unstructured.file_utils.encoding.detect",
            return_value={"encoding": None, "confidence": None},
        )
        # UTF-8 is the only fallback candidate, and it will fail on this payload.
        only_utf8 = patch("unstructured.file_utils.encoding.COMMON_ENCODINGS", ["utf_8"])

        with no_detection, only_utf8, pytest.raises(UnprocessableEntityError) as raised:
            detect_file_encoding(filename=temp_file_path)

        error = raised.value

        assert "Unable to determine file encoding" in str(error)

        # Ensure no .object attribute that would store file content (prevents memory bloat)
        # See: https://docs.python.org/3/library/exceptions.html#UnicodeError.object
        assert not hasattr(error, "object")

        # The error must stay lightweight no matter how large the file was.
        # NOTE: sys.getsizeof is shallow; the pickled size is the check that would
        # catch file content carried inside the exception.
        in_memory_bytes = sys.getsizeof(error)
        pickled_bytes = len(pickle.dumps(error))

        assert in_memory_bytes < 10_000  # Small in-memory footprint
        assert pickled_bytes < 10_000  # Small serialization footprint
    finally:
        # delete=False above means the temp file has to be removed by hand.
        os.unlink(temp_file_path)
|
||||
|
||||
|
||||
def test_decode_failure():
    """Check that a failed decode with the detected encoding raises a small error.

    `detect` is patched to report UTF-16 with high confidence for bytes that are
    not valid UTF-16. The resulting `UnprocessableEntityError` must name the
    detected encoding, must not expose an `object` attribute, and must stay small
    both in memory and when pickled.
    """
    # Invalid UTF-16: BOM followed by odd number of bytes
    bom = b"\xff\xfe"
    truncated_utf16 = bom + b"A\x00B\x00" + b"\x00"

    # The patcher is built here but only takes effect inside the `with` below.
    utf16_detection = patch(
        "unstructured.file_utils.encoding.detect",
        return_value={"encoding": "utf-16", "confidence": 0.95},
    )

    with utf16_detection, pytest.raises(UnprocessableEntityError) as raised:
        detect_file_encoding(file=truncated_utf16)

    error = raised.value

    assert "detected 'utf-16' but decode failed" in str(error)

    # Ensure no .object attribute that would store file content (prevents memory bloat)
    # See: https://docs.python.org/3/library/exceptions.html#UnicodeError.object
    assert not hasattr(error, "object")

    # The error must stay lightweight.
    # NOTE: sys.getsizeof is shallow; the pickled size is the check that would
    # catch input bytes carried inside the exception.
    in_memory_bytes = sys.getsizeof(error)
    pickled_bytes = len(pickle.dumps(error))

    assert in_memory_bytes < 10_000  # Small in-memory footprint
    assert pickled_bytes < 10_000  # Small serialization footprint
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.18.11" # pragma: no cover
|
||||
__version__ = "0.18.12" # pragma: no cover
|
||||
|
||||
@ -2,6 +2,7 @@ from typing import IO, Optional, Tuple, Union
|
||||
|
||||
from charset_normalizer import detect
|
||||
|
||||
from unstructured.errors import UnprocessableEntityError
|
||||
from unstructured.partition.common.common import convert_to_bytes
|
||||
|
||||
ENCODE_REC_THRESHOLD = 0.8
|
||||
@ -88,17 +89,26 @@ def detect_file_encoding(
|
||||
except (UnicodeDecodeError, UnicodeError):
|
||||
continue
|
||||
else:
|
||||
raise UnicodeDecodeError(
|
||||
"Unable to determine the encoding of the file or match it with any "
|
||||
"of the specified encodings.",
|
||||
byte_data,
|
||||
0,
|
||||
len(byte_data),
|
||||
"Invalid encoding",
|
||||
)
|
||||
# NOTE: Use UnprocessableEntityError instead of UnicodeDecodeError to avoid
|
||||
# logging the entire file content. UnicodeDecodeError automatically stores
|
||||
# the complete input data, which can be problematic for large files.
|
||||
raise UnprocessableEntityError(
|
||||
"Unable to determine file encoding after trying all common encodings. "
|
||||
"File may be corrupted or in an unsupported format."
|
||||
) from None
|
||||
|
||||
else:
|
||||
file_text = byte_data.decode(encoding)
|
||||
# NOTE: Catch UnicodeDecodeError to avoid logging the entire file content.
|
||||
# UnicodeDecodeError automatically stores the complete input data in its
|
||||
# 'object' attribute, which can cause issues with large files in logging
|
||||
# and error reporting systems.
|
||||
try:
|
||||
file_text = byte_data.decode(encoding)
|
||||
except (UnicodeDecodeError, UnicodeError):
|
||||
raise UnprocessableEntityError(
|
||||
f"File encoding detection failed: detected '{encoding}' but decode failed. "
|
||||
f"File may be corrupted or in an unsupported format."
|
||||
) from None
|
||||
|
||||
formatted_encoding = format_encoding_str(encoding)
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user