feat: update auto.partition() function to recognize Unstructured json (#337)

2025-12-26 22:55:07 +00:00 · 2023-03-09 03:36:01 +09:00 · 2023-03-09 03:36:01 +09:00 · 6be07a5260
commit 6be07a5260
parent 1580c1bf8e
5 changed files with 162 additions and 0 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -2,6 +2,7 @@

 ### Enhancements

+* `auto.partition()` can now load Unstructured ISD json documents.
 * Simplify partitioning functions.
 * Improve logging for ingest CLI.

--- a/test_unstructured/partition/test_json.py
+++ b/test_unstructured/partition/test_json.py
@ -0,0 +1,107 @@
+import os
+import pathlib
+import tempfile
+
+import pytest
+
+from unstructured.partition.auto import partition
+from unstructured.partition.json import partition_json
+from unstructured.staging.base import elements_to_json
+
+DIRECTORY = pathlib.Path(__file__).parent.resolve()
+
+test_files = [  # example-docs inputs, one per source format, used by the round-trip tests
+    "fake-text.txt",
+    "layout-parser-paper-fast.pdf",
+    "fake-html.html",
+    "fake.doc",
+    "fake-email.eml",
+    "fake-power-point.ppt",
+    "fake.docx",
+    "fake-power-point.pptx",
+]
+
+
+@pytest.mark.parametrize("filename", test_files)
+def test_partition_json_from_filename(filename: str):
+    """Round-trips elements through a json file that is loaded back by filename."""
+    source_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
+    elements = partition(filename=source_path)
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        json_path = os.path.join(tmpdir, filename + ".json")
+        elements_to_json(elements, filename=json_path, indent=2)
+        reloaded = partition_json(filename=json_path)
+
+    assert len(elements) > 0
+    assert len(str(elements[0])) > 0
+    assert len(elements) == len(reloaded)
+    for expected, actual in zip(elements, reloaded):
+        assert expected == actual
+
+
+@pytest.mark.parametrize("filename", test_files)
+def test_partition_json_from_file(filename: str):
+    """Round-trips elements through a json file that is loaded from an open file."""
+    source_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
+    elements = partition(filename=source_path)
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        json_path = os.path.join(tmpdir, filename + ".json")
+        elements_to_json(elements, filename=json_path, indent=2)
+        with open(json_path) as json_file:
+            reloaded = partition_json(file=json_file)
+
+    assert len(elements) > 0
+    assert len(str(elements[0])) > 0
+    assert len(elements) == len(reloaded)
+    for expected, actual in zip(elements, reloaded):
+        assert expected == actual
+
+
+@pytest.mark.parametrize("filename", test_files)
+def test_partition_json_from_text(filename: str):
+    """Round-trips elements through json that is passed back in as a string."""
+    source_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
+    elements = partition(filename=source_path)
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        json_path = os.path.join(tmpdir, filename + ".json")
+        elements_to_json(elements, filename=json_path, indent=2)
+        with open(json_path) as json_file:
+            json_text = json_file.read()
+        reloaded = partition_json(text=json_text)
+
+    assert len(elements) > 0
+    assert len(str(elements[0])) > 0
+    assert len(elements) == len(reloaded)
+    for expected, actual in zip(elements, reloaded):
+        assert expected == actual
+
+
+def test_partition_json_raises_with_none_specified():
+    """Calling partition_json with no filename, file, or text raises ValueError."""
+    pytest.raises(ValueError, partition_json)
+
+
+def test_partition_json_raises_with_too_many_specified():
+    """Supplying more than one of filename, file, and text must raise ValueError."""
+    path = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
+    elements = partition(filename=path)
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        test_path = os.path.join(tmpdir, "fake-text.txt.json")
+        elements_to_json(elements, filename=test_path, indent=2)
+        # Keep the handle open while asserting: reading a closed file also raises
+        # ValueError, which would hide a missing argument check.
+        with open(test_path) as f:
+            text = f.read()
+            conflicting_inputs = [
+                {"filename": test_path, "file": f},
+                {"filename": test_path, "text": text},
+                {"file": f, "text": text},
+                {"filename": test_path, "file": f, "text": text},
+            ]
+            for kwargs in conflicting_inputs:
+                with pytest.raises(ValueError, match="Only one of"):
+                    partition_json(**kwargs)
--- a/unstructured/file_utils/filetype.py
+++ b/unstructured/file_utils/filetype.py
@ -83,6 +83,7 @@ class FileType(Enum):
    EML = 40
    RTF = 41
    TXT = 42
+    JSON = 43

    # Markup Types
    HTML = 50
@ -116,6 +117,7 @@ EXT_TO_FILETYPE = {
    ".xls": FileType.XLS,
    ".ppt": FileType.PPT,
    ".rtf": FileType.RTF,
+    ".json": FileType.JSON,
 }


@ -154,6 +156,9 @@ def detect_filetype(
    if mime_type == "application/pdf":
        return FileType.PDF

+    elif mime_type == "application/json":
+        return FileType.JSON
+
    elif mime_type in DOCX_MIME_TYPES:
        return FileType.DOCX

--- a/unstructured/partition/auto.py
+++ b/unstructured/partition/auto.py
@ -6,6 +6,7 @@ from unstructured.partition.docx import partition_docx
 from unstructured.partition.email import partition_email
 from unstructured.partition.html import partition_html
 from unstructured.partition.image import partition_image
+from unstructured.partition.json import partition_json
 from unstructured.partition.md import partition_md
 from unstructured.partition.pdf import partition_pdf
 from unstructured.partition.ppt import partition_ppt
@ -67,6 +68,8 @@ def partition(
        return partition_ppt(filename=filename, file=file, include_page_breaks=include_page_breaks)
    elif filetype == FileType.PPTX:
        return partition_pptx(filename=filename, file=file, include_page_breaks=include_page_breaks)
+    elif filetype == FileType.JSON:
+        return partition_json(filename=filename, file=file)
    else:
        msg = "Invalid file" if not filename else f"Invalid file {filename}"
        raise ValueError(f"{msg}. The {filetype} file type is not supported in partition.")
--- a/unstructured/partition/json.py
+++ b/unstructured/partition/json.py
@ -0,0 +1,46 @@
+import json
+import re
+from typing import IO, List, Optional
+
+from unstructured.documents.elements import Element
+from unstructured.staging.base import dict_to_elements
+
+LIST_OF_DICTS_PATTERN = r"\A\s*\[\s*{"  # text opens with "[" then "{", whitespace allowed
+
+
+def partition_json(
+    filename: Optional[str] = None,
+    file: Optional[IO] = None,
+    text: Optional[str] = None,
+) -> List[Element]:
+    """Partitions a .json document in the Unstructured schema into its elements.
+
+    Exactly one of filename, file, or text must be given; file may be text or binary.
+    Raises ValueError on missing/conflicting inputs, non-list-of-dicts text, or bad JSON.
+    """
+    if not any([filename, file, text]):
+        raise ValueError("One of filename, file, or text must be specified.")
+
+    if filename is not None and not file and not text:
+        with open(filename, encoding="utf8") as f:
+            file_text = f.read()
+    elif file is not None and not filename and not text:
+        raw = file.read()
+        # A binary-mode handle yields bytes; decode so the checks below see str.
+        file_text = raw if isinstance(raw, str) else raw.decode("utf8")
+    elif text is not None and not filename and not file:
+        file_text = str(text)
+    else:
+        raise ValueError("Only one of filename, file, or text can be specified.")
+
+    # NOTE(Nathan): we expect file_text to be a list of dicts (optimization)
+    if not re.match(LIST_OF_DICTS_PATTERN, file_text):
+        raise ValueError("Json schema does not match the Unstructured schema")
+
+    try:
+        element_dicts = json.loads(file_text)
+    except json.JSONDecodeError as err:
+        raise ValueError("Not a valid json") from err
+
+    # NOTE(Nathan): in a future PR, extract text-like items from valid non-Unstructured json
+    return dict_to_elements(element_dicts)