fix: check json and eml decode ignore error (#574)

This commit is contained in:
Yida Liu 2023-05-11 13:00:11 +08:00 committed by GitHub
parent 328863375e
commit f46eb06e2d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 27 additions and 3 deletions

View File

@ -1,3 +1,13 @@
## 0.6.6-dev1
### Enhancements
### Features
### Fixes
* fix: the JSON and EML checks in file_utils/filetype now ignore decode errors instead of raising
## 0.6.6-dev0
### Enhancements

View File

@ -10,6 +10,7 @@ from unstructured.file_utils.filetype import (
DOCX_MIME_TYPES,
XLSX_MIME_TYPES,
FileType,
_is_text_file_a_json,
detect_filetype,
)
@ -285,3 +286,16 @@ def test_detect_filetype_raises_with_none_specified():
def test_filetype_order():
    """FileType.HTML must compare as less than FileType.XML."""
    html_type, xml_type = FileType.HTML, FileType.XML
    assert html_type < xml_type
@pytest.mark.parametrize(
    ("content", "expected"),
    [
        # "d" followed by the first two bytes of a three-byte UTF-8 sequence:
        # a strict decode of this input raises UnicodeDecodeError.
        (b"d\xe2\x80", False),
    ],
)
def test_is_text_file_a_json(content, expected):
    """Check the JSON sniffing result for raw byte input read from a file object.

    Each case wraps ``content`` in an in-memory binary stream, hands it to
    ``_is_text_file_a_json`` through the ``file`` keyword, and compares the
    returned value with ``expected``.
    """
    from io import BytesIO

    with BytesIO(content) as stream:
        detected = _is_text_file_a_json(file=stream)
        assert detected == expected

View File

@ -1 +1 @@
__version__ = "0.6.6-dev0" # pragma: no cover
__version__ = "0.6.6-dev1" # pragma: no cover

View File

@ -352,7 +352,7 @@ def _is_text_file_a_json(
if isinstance(file_content, str):
file_text = file_content
else:
file_text = file_content.decode()
file_text = file_content.decode(errors="ignore")
file.seek(0)
elif filename is not None:
with open(filename) as f:
@ -367,7 +367,7 @@ def _check_eml_from_buffer(file: IO) -> bool:
file.seek(0)
file_content = file.read(4096)
if isinstance(file_content, bytes):
file_head = file_content.decode("utf-8")
file_head = file_content.decode("utf-8", errors="ignore")
else:
file_head = file_content