mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-12 15:42:19 +00:00
fix: check json and eml decode ignore error (#574)
This commit is contained in:
parent
328863375e
commit
f46eb06e2d
10
CHANGELOG.md
10
CHANGELOG.md
@ -1,3 +1,13 @@
|
||||
## 0.6.6-dev1
|
||||
|
||||
### Enhancements
|
||||
|
||||
### Features
|
||||
|
||||
### Fixes
|
||||
|
||||
* fix: `file_utils/filetype` JSON and EML checks now ignore decode errors instead of raising
|
||||
|
||||
## 0.6.6-dev0
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -10,6 +10,7 @@ from unstructured.file_utils.filetype import (
|
||||
DOCX_MIME_TYPES,
|
||||
XLSX_MIME_TYPES,
|
||||
FileType,
|
||||
_is_text_file_a_json,
|
||||
detect_filetype,
|
||||
)
|
||||
|
||||
@ -285,3 +286,16 @@ def test_detect_filetype_raises_with_none_specified():
|
||||
|
||||
def test_filetype_order():
|
||||
assert FileType.HTML < FileType.XML
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("content", "expected"),
|
||||
[
|
||||
(b"d\xe2\x80", False),
|
||||
],
|
||||
)
|
||||
def test_is_text_file_a_json(content, expected):
|
||||
from io import BytesIO
|
||||
|
||||
with BytesIO(content) as f:
|
||||
assert _is_text_file_a_json(file=f) == expected
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.6.6-dev0" # pragma: no cover
|
||||
__version__ = "0.6.6-dev1" # pragma: no cover
|
||||
|
||||
@ -352,7 +352,7 @@ def _is_text_file_a_json(
|
||||
if isinstance(file_content, str):
|
||||
file_text = file_content
|
||||
else:
|
||||
file_text = file_content.decode()
|
||||
file_text = file_content.decode(errors="ignore")
|
||||
file.seek(0)
|
||||
elif filename is not None:
|
||||
with open(filename) as f:
|
||||
@ -367,7 +367,7 @@ def _check_eml_from_buffer(file: IO) -> bool:
|
||||
file.seek(0)
|
||||
file_content = file.read(4096)
|
||||
if isinstance(file_content, bytes):
|
||||
file_head = file_content.decode("utf-8")
|
||||
file_head = file_content.decode("utf-8", errors="ignore")
|
||||
else:
|
||||
file_head = file_content
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user