diff --git a/CHANGELOG.md b/CHANGELOG.md index 5c0938be2..828a738e0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,13 @@ +## 0.6.6-dev1 + +### Enhancements + +### Features + +### Fixes + +* fix: fileutils/file_type check json and eml decode ignore error + ## 0.6.6-dev0 ### Enhancements diff --git a/test_unstructured/file_utils/test_filetype.py b/test_unstructured/file_utils/test_filetype.py index fee59fd33..9985280db 100644 --- a/test_unstructured/file_utils/test_filetype.py +++ b/test_unstructured/file_utils/test_filetype.py @@ -10,6 +10,7 @@ from unstructured.file_utils.filetype import ( DOCX_MIME_TYPES, XLSX_MIME_TYPES, FileType, + _is_text_file_a_json, detect_filetype, ) @@ -285,3 +286,16 @@ def test_detect_filetype_raises_with_none_specified(): def test_filetype_order(): assert FileType.HTML < FileType.XML + + +@pytest.mark.parametrize( + ("content", "expected"), + [ + (b"d\xe2\x80", False), + ], +) +def test_is_text_file_a_json(content, expected): + from io import BytesIO + + with BytesIO(content) as f: + assert _is_text_file_a_json(file=f) == expected diff --git a/unstructured/__version__.py b/unstructured/__version__.py index d7d6a9bd3..490af064a 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.6.6-dev0" # pragma: no cover +__version__ = "0.6.6-dev1" # pragma: no cover diff --git a/unstructured/file_utils/filetype.py b/unstructured/file_utils/filetype.py index 05e6e1ce7..6a18177f4 100644 --- a/unstructured/file_utils/filetype.py +++ b/unstructured/file_utils/filetype.py @@ -352,7 +352,7 @@ def _is_text_file_a_json( if isinstance(file_content, str): file_text = file_content else: - file_text = file_content.decode() + file_text = file_content.decode(errors="ignore") file.seek(0) elif filename is not None: with open(filename) as f: @@ -367,7 +367,7 @@ def _check_eml_from_buffer(file: IO) -> bool: file.seek(0) file_content = file.read(4096) if isinstance(file_content, bytes): - file_head = file_content.decode("utf-8") + file_head = file_content.decode("utf-8", errors="ignore") else: file_head = file_content