Mirror of https://github.com/Unstructured-IO/unstructured.git, synced 2025-11-09 23:17:21 +00:00
refactor: simplifies JSON detection and add tests (#975)

* refactor json detection
* version and changelog
* fix mock in test

This commit is contained in:
parent f282a10715
commit d694cd53bf
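
For orientation, a condensed sketch of the detection flow after this change, assembled from the hunks below. The `json.loads` fallback, the `LIST_OF_DICTS_PATTERN` import, and the behavior are taken from the patch; the standalone helper names and simplified signatures here are illustrative only.

```python
import json
import re

from unstructured.nlp.patterns import LIST_OF_DICTS_PATTERN  # regex used by the patch


def looks_like_json(file_text: str) -> bool:
    # Replaces the old VALID_JSON_CHARACTERS / JSON_PATTERN heuristics:
    # a text/plain file is treated as JSON iff it actually parses as JSON.
    try:
        json.loads(file_text)
        return True
    except json.JSONDecodeError:
        return False


def is_processable(file_text: str) -> bool:
    # The Unstructured ISD format is a list of dicts, so partitioning is only
    # attempted when the text matches LIST_OF_DICTS_PATTERN.
    return re.match(LIST_OF_DICTS_PATTERN, file_text) is not None
```
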
@@ -1,11 +1,12 @@
-## 0.8.2-dev6
+## 0.8.2-dev7

 ### Enhancements

+* Additional tests and refactor of JSON detection.
 * Update functionality to retrieve image metadata from a page for `document_to_element_list`
 * Links are now tracked in `partition_html` output.
 * Set the file's current position to the beginning after reading the file in `convert_to_bytes`
-* Add min_partition kwarg to that combines elements below a specified threshold and modifies splitting of strings longer than max partition so words are not split.
+* Add `min_partition` kwarg to that combines elements below a specified threshold and modifies splitting of strings longer than max partition so words are not split.
 * set the file's current position to the beginning after reading the file in `convert_to_bytes`
 * Add slide notes to pptx
 * Add `--encoding` directive to ingest
@@ -77,7 +77,7 @@ class MockDocumentLayout(layout.DocumentLayout):
         ("README.rst", FileType.RST),
         ("README.md", FileType.MD),
         ("fake.odt", FileType.ODT),
-        ("fake-incomplete-json.txt", FileType.JSON),
+        ("fake-incomplete-json.txt", FileType.TXT),
     ],
 )
 def test_detect_filetype_from_filename(file, expected):
@@ -141,7 +141,7 @@ def test_detect_filetype_from_filename_with_extension(monkeypatch, file, expected):
         ("stanley-cups.tsv", FileType.TSV),
         ("fake-power-point.pptx", FileType.PPTX),
         ("winter-sports.epub", FileType.EPUB),
-        ("fake-incomplete-json.txt", FileType.JSON),
+        ("fake-incomplete-json.txt", FileType.TXT),
     ],
 )
 def test_detect_filetype_from_file(file, expected):
@@ -220,6 +220,19 @@ def test_auto_partition_json_from_filename():
     assert json_data == json_elems


+def test_auto_partition_json_raises_with_unprocessable_json(tmpdir):
+    # NOTE(robinson) - This is unprocessable because it is not a list of dicts,
+    # per the Unstructured ISD format
+    text = '{"hi": "there"}'
+
+    filename = os.path.join(tmpdir, "unprocessable.json")
+    with open(filename, "w") as f:
+        f.write(text)
+
+    with pytest.raises(ValueError):
+        partition(filename=filename)
+
+
 @pytest.mark.xfail(
     reason="parsed as text not json, https://github.com/Unstructured-IO/unstructured/issues/492",
 )
@@ -525,7 +538,7 @@ def test_auto_partition_odt_from_file():
 @pytest.mark.parametrize(
     ("content_type", "routing_func", "expected"),
     [
-        ("application/json", "json", "application/json"),
+        ("text/csv", "csv", "text/csv"),
         ("text/html", "html", "text/html"),
         ("jdsfjdfsjkds", "pdf", None),
     ],
@@ -204,3 +204,17 @@ def test_partition_json_from_text_exclude_metadata(filename: str):

     for i in range(len(test_elements)):
         assert any(test_elements[i].metadata.to_dict()) is False
+
+
+def test_partition_json_raises_with_unprocessable_json():
+    # NOTE(robinson) - This is unprocessable because it is not a list of dicts,
+    # per the Unstructured ISD format
+    text = '{"hi": "there"}'
+    with pytest.raises(ValueError):
+        partition_json(text=text)
+
+
+def test_partition_json_raises_with_invalid_json():
+    text = '[{"hi": "there"}]]'
+    with pytest.raises(ValueError):
+        partition_json(text=text)
@@ -1 +1 @@
-__version__ = "0.8.2-dev6"  # pragma: no cover
+__version__ = "0.8.2-dev7"  # pragma: no cover
@@ -1,6 +1,7 @@
 from __future__ import annotations

 import inspect
+import json
 import os
 import re
 import zipfile
@@ -11,7 +12,7 @@ from typing import IO, TYPE_CHECKING, Callable, List, Optional
 from unstructured.documents.coordinates import PixelSpace
 from unstructured.documents.elements import Element, PageBreak
 from unstructured.file_utils.encoding import detect_file_encoding, format_encoding_str
-from unstructured.nlp.patterns import JSON_PATTERN, VALID_JSON_CHARACTERS
+from unstructured.nlp.patterns import LIST_OF_DICTS_PATTERN
 from unstructured.partition.common import (
     _add_element_metadata,
     _remove_element_metadata,
@@ -417,15 +418,23 @@ def _is_text_file_a_json(
 ):
     """Detects if a file that has a text/plain MIME type is a JSON file."""
     file_text = _read_file_start_for_type_check(file=file, filename=filename, encoding=encoding)
-    text_without_strings = re.sub(r'"(?:\\.|[^"\\])*"', "", file_text)
-    if not re.match(VALID_JSON_CHARACTERS, text_without_strings):
+    try:
+        json.loads(file_text)
+        return True
+    except json.JSONDecodeError:
         return False

-    if not re.match(JSON_PATTERN, file_text):
-        return False

-    return True
+def is_json_processable(
+    filename: Optional[str] = None,
+    file: Optional[IO[bytes]] = None,
+    file_text: Optional[str] = None,
+    encoding: Optional[str] = "utf-8",
+) -> bool:
+    exactly_one(filename=filename, file=file, file_text=file_text)
+    if file_text is None:
+        file_text = _read_file_start_for_type_check(file=file, filename=filename, encoding=encoding)
+    return re.match(LIST_OF_DICTS_PATTERN, file_text) is not None


 def _count_commas(text: str):
@@ -9,6 +9,7 @@ from unstructured.file_utils.filetype import (
     STR_TO_FILETYPE,
     FileType,
     detect_filetype,
+    is_json_processable,
 )
 from unstructured.logger import logger
 from unstructured.partition.common import exactly_one
@@ -227,6 +228,11 @@ def partition(
             **kwargs,
         )
     elif filetype == FileType.JSON:
+        if not is_json_processable(filename=filename, file=file):
+            raise ValueError(
+                "Detected a JSON file that does not conform to the Unstructured schema. "
+                "partition_json currently only processes serialized Unstructured output.",
+            )
         elements = partition_json(filename=filename, file=file, **kwargs)
     elif (filetype == FileType.XLSX) or (filetype == FileType.XLS):
         elements = partition_xlsx(filename=filename, file=file, **kwargs)
@@ -1,10 +1,12 @@
 import json
-import re
 from typing import IO, List, Optional

 from unstructured.documents.elements import Element, process_metadata
-from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
-from unstructured.nlp.patterns import LIST_OF_DICTS_PATTERN
+from unstructured.file_utils.filetype import (
+    FileType,
+    add_metadata_with_filetype,
+    is_json_processable,
+)
 from unstructured.partition.common import exactly_one
 from unstructured.staging.base import dict_to_elements

@@ -48,9 +50,10 @@ def partition_json(
     elif text is not None:
         file_text = str(text)

-    # NOTE(Nathan): we expect file_text to be a list of dicts (optimization)
-    if not re.match(LIST_OF_DICTS_PATTERN, file_text):
-        raise ValueError("Json schema does not match the Unstructured schema")
+    if not is_json_processable(file_text=file_text):
+        raise ValueError(
+            "JSON cannot be partitioned. Schema does not match the Unstructured schema.",
+        )

     try:
         dict = json.loads(file_text)
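
As the new tests exercise, JSON that is valid but not a list of element dicts is now rejected with a `ValueError` instead of being partially parsed. A minimal usage sketch, assuming the package from this commit is installed; the error message is the one introduced in `partition_json` above.

```python
from unstructured.partition.json import partition_json

# '{"hi": "there"}' is valid JSON but not a list of dicts, so it does not
# conform to the Unstructured ISD schema and is rejected.
try:
    partition_json(text='{"hi": "there"}')
except ValueError as err:
    print(err)  # JSON cannot be partitioned. Schema does not match the Unstructured schema.
```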