mirror of
				https://github.com/Unstructured-IO/unstructured.git
				synced 2025-11-04 12:03:15 +00:00 
			
		
		
		
	refactor: simplify JSON detection and add tests (#975)
* refactor json detection * version and changelog * fix mock in test
This commit is contained in:
		
							parent
							
								
									f282a10715
								
							
						
					
					
						commit
						d694cd53bf
					
				@ -1,11 +1,12 @@
 | 
			
		||||
## 0.8.2-dev6
 | 
			
		||||
## 0.8.2-dev7
 | 
			
		||||
 | 
			
		||||
### Enhancements
 | 
			
		||||
 | 
			
		||||
* Additional tests and refactor of JSON detection.
 | 
			
		||||
* Update functionality to retrieve image metadata from a page for `document_to_element_list`
 | 
			
		||||
* Links are now tracked in `partition_html` output.
 | 
			
		||||
* Set the file's current position to the beginning after reading the file in `convert_to_bytes`
 | 
			
		||||
* Add min_partition kwarg that combines elements below a specified threshold and modifies splitting of strings longer than max partition so words are not split.
 | 
			
		||||
* Add `min_partition` kwarg that combines elements below a specified threshold and modifies splitting of strings longer than max partition so words are not split.
 | 
			
		||||
* Set the file's current position to the beginning after reading the file in `convert_to_bytes`
 | 
			
		||||
* Add slide notes to pptx
 | 
			
		||||
* Add `--encoding` directive to ingest
 | 
			
		||||
 | 
			
		||||
@ -77,7 +77,7 @@ class MockDocumentLayout(layout.DocumentLayout):
 | 
			
		||||
        ("README.rst", FileType.RST),
 | 
			
		||||
        ("README.md", FileType.MD),
 | 
			
		||||
        ("fake.odt", FileType.ODT),
 | 
			
		||||
        ("fake-incomplete-json.txt", FileType.JSON),
 | 
			
		||||
        ("fake-incomplete-json.txt", FileType.TXT),
 | 
			
		||||
    ],
 | 
			
		||||
)
 | 
			
		||||
def test_detect_filetype_from_filename(file, expected):
 | 
			
		||||
@ -141,7 +141,7 @@ def test_detect_filetype_from_filename_with_extension(monkeypatch, file, expecte
 | 
			
		||||
        ("stanley-cups.tsv", FileType.TSV),
 | 
			
		||||
        ("fake-power-point.pptx", FileType.PPTX),
 | 
			
		||||
        ("winter-sports.epub", FileType.EPUB),
 | 
			
		||||
        ("fake-incomplete-json.txt", FileType.JSON),
 | 
			
		||||
        ("fake-incomplete-json.txt", FileType.TXT),
 | 
			
		||||
    ],
 | 
			
		||||
)
 | 
			
		||||
def test_detect_filetype_from_file(file, expected):
 | 
			
		||||
 | 
			
		||||
@ -220,6 +220,19 @@ def test_auto_partition_json_from_filename():
 | 
			
		||||
    assert json_data == json_elems
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_auto_partition_json_raises_with_unprocessable_json(tmpdir):
 | 
			
		||||
    # NOTE(robinson) - This is unprocessable because it is not a list of dicts,
 | 
			
		||||
    # per the Unstructured ISD format
 | 
			
		||||
    text = '{"hi": "there"}'
 | 
			
		||||
 | 
			
		||||
    filename = os.path.join(tmpdir, "unprocessable.json")
 | 
			
		||||
    with open(filename, "w") as f:
 | 
			
		||||
        f.write(text)
 | 
			
		||||
 | 
			
		||||
    with pytest.raises(ValueError):
 | 
			
		||||
        partition(filename=filename)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@pytest.mark.xfail(
 | 
			
		||||
    reason="parsed as text not json, https://github.com/Unstructured-IO/unstructured/issues/492",
 | 
			
		||||
)
 | 
			
		||||
@ -525,7 +538,7 @@ def test_auto_partition_odt_from_file():
 | 
			
		||||
@pytest.mark.parametrize(
 | 
			
		||||
    ("content_type", "routing_func", "expected"),
 | 
			
		||||
    [
 | 
			
		||||
        ("application/json", "json", "application/json"),
 | 
			
		||||
        ("text/csv", "csv", "text/csv"),
 | 
			
		||||
        ("text/html", "html", "text/html"),
 | 
			
		||||
        ("jdsfjdfsjkds", "pdf", None),
 | 
			
		||||
    ],
 | 
			
		||||
 | 
			
		||||
@ -204,3 +204,17 @@ def test_partition_json_from_text_exclude_metadata(filename: str):
 | 
			
		||||
 | 
			
		||||
    for i in range(len(test_elements)):
 | 
			
		||||
        assert any(test_elements[i].metadata.to_dict()) is False
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_partition_json_raises_with_unprocessable_json():
 | 
			
		||||
    # NOTE(robinson) - This is unprocessable because it is not a list of dicts,
 | 
			
		||||
    # per the Unstructured ISD format
 | 
			
		||||
    text = '{"hi": "there"}'
 | 
			
		||||
    with pytest.raises(ValueError):
 | 
			
		||||
        partition_json(text=text)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_partition_json_raises_with_invalid_json():
 | 
			
		||||
    text = '[{"hi": "there"}]]'
 | 
			
		||||
    with pytest.raises(ValueError):
 | 
			
		||||
        partition_json(text=text)
 | 
			
		||||
 | 
			
		||||
@ -1 +1 @@
 | 
			
		||||
__version__ = "0.8.2-dev6"  # pragma: no cover
 | 
			
		||||
__version__ = "0.8.2-dev7"  # pragma: no cover
 | 
			
		||||
 | 
			
		||||
@ -1,6 +1,7 @@
 | 
			
		||||
from __future__ import annotations
 | 
			
		||||
 | 
			
		||||
import inspect
 | 
			
		||||
import json
 | 
			
		||||
import os
 | 
			
		||||
import re
 | 
			
		||||
import zipfile
 | 
			
		||||
@ -11,7 +12,7 @@ from typing import IO, TYPE_CHECKING, Callable, List, Optional
 | 
			
		||||
from unstructured.documents.coordinates import PixelSpace
 | 
			
		||||
from unstructured.documents.elements import Element, PageBreak
 | 
			
		||||
from unstructured.file_utils.encoding import detect_file_encoding, format_encoding_str
 | 
			
		||||
from unstructured.nlp.patterns import JSON_PATTERN, VALID_JSON_CHARACTERS
 | 
			
		||||
from unstructured.nlp.patterns import LIST_OF_DICTS_PATTERN
 | 
			
		||||
from unstructured.partition.common import (
 | 
			
		||||
    _add_element_metadata,
 | 
			
		||||
    _remove_element_metadata,
 | 
			
		||||
@ -417,15 +418,23 @@ def _is_text_file_a_json(
 | 
			
		||||
):
 | 
			
		||||
    """Detects if a file that has a text/plain MIME type is a JSON file."""
 | 
			
		||||
    file_text = _read_file_start_for_type_check(file=file, filename=filename, encoding=encoding)
 | 
			
		||||
    text_without_strings = re.sub(r'"(?:\\.|[^"\\])*"', "", file_text)
 | 
			
		||||
 | 
			
		||||
    if not re.match(VALID_JSON_CHARACTERS, text_without_strings):
 | 
			
		||||
    try:
 | 
			
		||||
        json.loads(file_text)
 | 
			
		||||
        return True
 | 
			
		||||
    except json.JSONDecodeError:
 | 
			
		||||
        return False
 | 
			
		||||
 | 
			
		||||
    if not re.match(JSON_PATTERN, file_text):
 | 
			
		||||
        return False
 | 
			
		||||
 | 
			
		||||
    return True
 | 
			
		||||
def is_json_processable(
 | 
			
		||||
    filename: Optional[str] = None,
 | 
			
		||||
    file: Optional[IO[bytes]] = None,
 | 
			
		||||
    file_text: Optional[str] = None,
 | 
			
		||||
    encoding: Optional[str] = "utf-8",
 | 
			
		||||
) -> bool:
 | 
			
		||||
    exactly_one(filename=filename, file=file, file_text=file_text)
 | 
			
		||||
    if file_text is None:
 | 
			
		||||
        file_text = _read_file_start_for_type_check(file=file, filename=filename, encoding=encoding)
 | 
			
		||||
    return re.match(LIST_OF_DICTS_PATTERN, file_text) is not None
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def _count_commas(text: str):
 | 
			
		||||
 | 
			
		||||
@ -9,6 +9,7 @@ from unstructured.file_utils.filetype import (
 | 
			
		||||
    STR_TO_FILETYPE,
 | 
			
		||||
    FileType,
 | 
			
		||||
    detect_filetype,
 | 
			
		||||
    is_json_processable,
 | 
			
		||||
)
 | 
			
		||||
from unstructured.logger import logger
 | 
			
		||||
from unstructured.partition.common import exactly_one
 | 
			
		||||
@ -227,6 +228,11 @@ def partition(
 | 
			
		||||
            **kwargs,
 | 
			
		||||
        )
 | 
			
		||||
    elif filetype == FileType.JSON:
 | 
			
		||||
        if not is_json_processable(filename=filename, file=file):
 | 
			
		||||
            raise ValueError(
 | 
			
		||||
                "Detected a JSON file that does not conform to the Unstructured schema. "
 | 
			
		||||
                "partition_json currently only processes serialized Unstructured output.",
 | 
			
		||||
            )
 | 
			
		||||
        elements = partition_json(filename=filename, file=file, **kwargs)
 | 
			
		||||
    elif (filetype == FileType.XLSX) or (filetype == FileType.XLS):
 | 
			
		||||
        elements = partition_xlsx(filename=filename, file=file, **kwargs)
 | 
			
		||||
 | 
			
		||||
@ -1,10 +1,12 @@
 | 
			
		||||
import json
 | 
			
		||||
import re
 | 
			
		||||
from typing import IO, List, Optional
 | 
			
		||||
 | 
			
		||||
from unstructured.documents.elements import Element, process_metadata
 | 
			
		||||
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
 | 
			
		||||
from unstructured.nlp.patterns import LIST_OF_DICTS_PATTERN
 | 
			
		||||
from unstructured.file_utils.filetype import (
 | 
			
		||||
    FileType,
 | 
			
		||||
    add_metadata_with_filetype,
 | 
			
		||||
    is_json_processable,
 | 
			
		||||
)
 | 
			
		||||
from unstructured.partition.common import exactly_one
 | 
			
		||||
from unstructured.staging.base import dict_to_elements
 | 
			
		||||
 | 
			
		||||
@ -48,9 +50,10 @@ def partition_json(
 | 
			
		||||
    elif text is not None:
 | 
			
		||||
        file_text = str(text)
 | 
			
		||||
 | 
			
		||||
    # NOTE(Nathan): we expect file_text to be a list of dicts (optimization)
 | 
			
		||||
    if not re.match(LIST_OF_DICTS_PATTERN, file_text):
 | 
			
		||||
        raise ValueError("Json schema does not match the Unstructured schema")
 | 
			
		||||
    if not is_json_processable(file_text=file_text):
 | 
			
		||||
        raise ValueError(
 | 
			
		||||
            "JSON cannot be partitioned. Schema does not match the Unstructured schema.",
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
    try:
 | 
			
		||||
        dict = json.loads(file_text)
 | 
			
		||||
 | 
			
		||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user