diff --git a/CHANGELOG.md b/CHANGELOG.md index 3eb3a1c45..8da12fc17 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,11 +1,12 @@ -## 0.8.2-dev6 +## 0.8.2-dev7 ### Enhancements +* Additional tests and refactor of JSON detection. * Update functionality to retrieve image metadata from a page for `document_to_element_list` * Links are now tracked in `partition_html` output. * Set the file's current position to the beginning after reading the file in `convert_to_bytes` -* Add min_partition kwarg to that combines elements below a specified threshold and modifies splitting of strings longer than max partition so words are not split. +* Add `min_partition` kwarg that combines elements below a specified threshold and modifies splitting of strings longer than max partition so words are not split. * set the file's current position to the beginning after reading the file in `convert_to_bytes` * Add slide notes to pptx * Add `--encoding` directive to ingest diff --git a/test_unstructured/file_utils/test_filetype.py b/test_unstructured/file_utils/test_filetype.py index 4a0ac9424..6aee31a73 100644 --- a/test_unstructured/file_utils/test_filetype.py +++ b/test_unstructured/file_utils/test_filetype.py @@ -77,7 +77,7 @@ class MockDocumentLayout(layout.DocumentLayout): ("README.rst", FileType.RST), ("README.md", FileType.MD), ("fake.odt", FileType.ODT), - ("fake-incomplete-json.txt", FileType.JSON), + ("fake-incomplete-json.txt", FileType.TXT), ], ) def test_detect_filetype_from_filename(file, expected): @@ -141,7 +141,7 @@ def test_detect_filetype_from_filename_with_extension(monkeypatch, file, expecte ("stanley-cups.tsv", FileType.TSV), ("fake-power-point.pptx", FileType.PPTX), ("winter-sports.epub", FileType.EPUB), - ("fake-incomplete-json.txt", FileType.JSON), + ("fake-incomplete-json.txt", FileType.TXT), ], ) def test_detect_filetype_from_file(file, expected): diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py index 
dccd2bbae..0d36a002e 100644 --- a/test_unstructured/partition/test_auto.py +++ b/test_unstructured/partition/test_auto.py @@ -220,6 +220,19 @@ def test_auto_partition_json_from_filename(): assert json_data == json_elems +def test_auto_partition_json_raises_with_unprocessable_json(tmpdir): + # NOTE(robinson) - This is unprocessable because it is not a list of dicts, + # per the Unstructured ISD format + text = '{"hi": "there"}' + + filename = os.path.join(tmpdir, "unprocessable.json") + with open(filename, "w") as f: + f.write(text) + + with pytest.raises(ValueError): + partition(filename=filename) + + @pytest.mark.xfail( reason="parsed as text not json, https://github.com/Unstructured-IO/unstructured/issues/492", ) @@ -525,7 +538,7 @@ def test_auto_partition_odt_from_file(): @pytest.mark.parametrize( ("content_type", "routing_func", "expected"), [ - ("application/json", "json", "application/json"), + ("text/csv", "csv", "text/csv"), ("text/html", "html", "text/html"), ("jdsfjdfsjkds", "pdf", None), ], diff --git a/test_unstructured/partition/test_json.py b/test_unstructured/partition/test_json.py index 735c539cb..ceee14c28 100644 --- a/test_unstructured/partition/test_json.py +++ b/test_unstructured/partition/test_json.py @@ -204,3 +204,17 @@ def test_partition_json_from_text_exclude_metadata(filename: str): for i in range(len(test_elements)): assert any(test_elements[i].metadata.to_dict()) is False + + +def test_partition_json_raises_with_unprocessable_json(): + # NOTE(robinson) - This is unprocessable because it is not a list of dicts, + # per the Unstructured ISD format + text = '{"hi": "there"}' + with pytest.raises(ValueError): + partition_json(text=text) + + +def test_partition_json_raises_with_invalid_json(): + text = '[{"hi": "there"}]]' + with pytest.raises(ValueError): + partition_json(text=text) diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 1041e8072..079a22828 100644 --- a/unstructured/__version__.py +++ 
b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.8.2-dev6" # pragma: no cover +__version__ = "0.8.2-dev7" # pragma: no cover diff --git a/unstructured/file_utils/filetype.py b/unstructured/file_utils/filetype.py index 78372af83..9f272ebaf 100644 --- a/unstructured/file_utils/filetype.py +++ b/unstructured/file_utils/filetype.py @@ -1,6 +1,7 @@ from __future__ import annotations import inspect +import json import os import re import zipfile @@ -11,7 +12,7 @@ from typing import IO, TYPE_CHECKING, Callable, List, Optional from unstructured.documents.coordinates import PixelSpace from unstructured.documents.elements import Element, PageBreak from unstructured.file_utils.encoding import detect_file_encoding, format_encoding_str -from unstructured.nlp.patterns import JSON_PATTERN, VALID_JSON_CHARACTERS +from unstructured.nlp.patterns import LIST_OF_DICTS_PATTERN from unstructured.partition.common import ( _add_element_metadata, _remove_element_metadata, @@ -417,15 +418,23 @@ def _is_text_file_a_json( ): """Detects if a file that has a text/plain MIME type is a JSON file.""" file_text = _read_file_start_for_type_check(file=file, filename=filename, encoding=encoding) - text_without_strings = re.sub(r'"(?:\\.|[^"\\])*"', "", file_text) - - if not re.match(VALID_JSON_CHARACTERS, text_without_strings): + try: + json.loads(file_text) + return True + except json.JSONDecodeError: return False - if not re.match(JSON_PATTERN, file_text): - return False - return True +def is_json_processable( + filename: Optional[str] = None, + file: Optional[IO[bytes]] = None, + file_text: Optional[str] = None, + encoding: Optional[str] = "utf-8", +) -> bool: + exactly_one(filename=filename, file=file, file_text=file_text) + if file_text is None: + file_text = _read_file_start_for_type_check(file=file, filename=filename, encoding=encoding) + return re.match(LIST_OF_DICTS_PATTERN, file_text) is not None def _count_commas(text: str): diff --git a/unstructured/partition/auto.py 
b/unstructured/partition/auto.py index 3dbc1a37c..e4ddcc8ff 100644 --- a/unstructured/partition/auto.py +++ b/unstructured/partition/auto.py @@ -9,6 +9,7 @@ from unstructured.file_utils.filetype import ( STR_TO_FILETYPE, FileType, detect_filetype, + is_json_processable, ) from unstructured.logger import logger from unstructured.partition.common import exactly_one @@ -227,6 +228,11 @@ def partition( **kwargs, ) elif filetype == FileType.JSON: + if not is_json_processable(filename=filename, file=file): + raise ValueError( + "Detected a JSON file that does not conform to the Unstructured schema. " + "partition_json currently only processes serialized Unstructured output.", + ) elements = partition_json(filename=filename, file=file, **kwargs) elif (filetype == FileType.XLSX) or (filetype == FileType.XLS): elements = partition_xlsx(filename=filename, file=file, **kwargs) diff --git a/unstructured/partition/json.py b/unstructured/partition/json.py index a293e6e31..1391a0fd0 100644 --- a/unstructured/partition/json.py +++ b/unstructured/partition/json.py @@ -1,10 +1,12 @@ import json -import re from typing import IO, List, Optional from unstructured.documents.elements import Element, process_metadata -from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype -from unstructured.nlp.patterns import LIST_OF_DICTS_PATTERN +from unstructured.file_utils.filetype import ( + FileType, + add_metadata_with_filetype, + is_json_processable, +) from unstructured.partition.common import exactly_one from unstructured.staging.base import dict_to_elements @@ -48,9 +50,10 @@ def partition_json( elif text is not None: file_text = str(text) - # NOTE(Nathan): we expect file_text to be a list of dicts (optimization) - if not re.match(LIST_OF_DICTS_PATTERN, file_text): - raise ValueError("Json schema does not match the Unstructured schema") + if not is_json_processable(file_text=file_text): + raise ValueError( + "JSON cannot be partitioned. 
Schema does not match the Unstructured schema.", + ) try: dict = json.loads(file_text)