mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-12 23:51:47 +00:00
fix: check json and eml decode ignore error (#574)
This commit is contained in:
parent
328863375e
commit
f46eb06e2d
10
CHANGELOG.md
10
CHANGELOG.md
@ -1,3 +1,13 @@
|
|||||||
|
## 0.6.6-dev1
|
||||||
|
|
||||||
|
### Enhancements
|
||||||
|
|
||||||
|
### Features
|
||||||
|
|
||||||
|
### Fixes
|
||||||
|
|
||||||
|
* fix: fileutils/file_type check json and eml decode ignore error
|
||||||
|
|
||||||
## 0.6.6-dev0
|
## 0.6.6-dev0
|
||||||
|
|
||||||
### Enhancements
|
### Enhancements
|
||||||
|
|||||||
@ -10,6 +10,7 @@ from unstructured.file_utils.filetype import (
|
|||||||
DOCX_MIME_TYPES,
|
DOCX_MIME_TYPES,
|
||||||
XLSX_MIME_TYPES,
|
XLSX_MIME_TYPES,
|
||||||
FileType,
|
FileType,
|
||||||
|
_is_text_file_a_json,
|
||||||
detect_filetype,
|
detect_filetype,
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -285,3 +286,16 @@ def test_detect_filetype_raises_with_none_specified():
|
|||||||
|
|
||||||
def test_filetype_order():
|
def test_filetype_order():
|
||||||
assert FileType.HTML < FileType.XML
|
assert FileType.HTML < FileType.XML
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
("content", "expected"),
|
||||||
|
[
|
||||||
|
(b"d\xe2\x80", False),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_is_text_file_a_json(content, expected):
|
||||||
|
from io import BytesIO
|
||||||
|
|
||||||
|
with BytesIO(content) as f:
|
||||||
|
assert _is_text_file_a_json(file=f) == expected
|
||||||
|
|||||||
@ -1 +1 @@
|
|||||||
__version__ = "0.6.6-dev0" # pragma: no cover
|
__version__ = "0.6.6-dev1" # pragma: no cover
|
||||||
|
|||||||
@ -352,7 +352,7 @@ def _is_text_file_a_json(
|
|||||||
if isinstance(file_content, str):
|
if isinstance(file_content, str):
|
||||||
file_text = file_content
|
file_text = file_content
|
||||||
else:
|
else:
|
||||||
file_text = file_content.decode()
|
file_text = file_content.decode(errors="ignore")
|
||||||
file.seek(0)
|
file.seek(0)
|
||||||
elif filename is not None:
|
elif filename is not None:
|
||||||
with open(filename) as f:
|
with open(filename) as f:
|
||||||
@ -367,7 +367,7 @@ def _check_eml_from_buffer(file: IO) -> bool:
|
|||||||
file.seek(0)
|
file.seek(0)
|
||||||
file_content = file.read(4096)
|
file_content = file.read(4096)
|
||||||
if isinstance(file_content, bytes):
|
if isinstance(file_content, bytes):
|
||||||
file_head = file_content.decode("utf-8")
|
file_head = file_content.decode("utf-8", errors="ignore")
|
||||||
else:
|
else:
|
||||||
file_head = file_content
|
file_head = file_content
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user