From 6be07a5260917c6e2dc7d134f789d9d046afdb1b Mon Sep 17 00:00:00 2001
From: natygyoon <59327875+natygyoon@users.noreply.github.com>
Date: Thu, 9 Mar 2023 03:36:01 +0900
Subject: [PATCH] feat: update auto.partition() function to recognize
 Unstructured json (#337)

---
 CHANGELOG.md                             |   1 +
 test_unstructured/partition/test_json.py | 107 +++++++++++++++++++++++
 unstructured/file_utils/filetype.py      |   5 ++
 unstructured/partition/auto.py           |   3 +
 unstructured/partition/json.py           |  46 ++++++++++
 5 files changed, 162 insertions(+)
 create mode 100644 test_unstructured/partition/test_json.py
 create mode 100644 unstructured/partition/json.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 23f3a2833..d10118640 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,7 @@
 
 ### Enhancements
 
+* `auto.partition()` can now load Unstructured ISD JSON documents.
 * Simplify partitioning functions.
 * Improve logging for ingest CLI.
 
diff --git a/test_unstructured/partition/test_json.py b/test_unstructured/partition/test_json.py
new file mode 100644
index 000000000..a41ebcef0
--- /dev/null
+++ b/test_unstructured/partition/test_json.py
@@ -0,0 +1,107 @@
+import os
+import pathlib
+import tempfile
+
+import pytest
+
+from unstructured.partition.auto import partition
+from unstructured.partition.json import partition_json
+from unstructured.staging.base import elements_to_json
+
+DIRECTORY = pathlib.Path(__file__).parent.resolve()
+
+test_files = [
+    "fake-text.txt",
+    "layout-parser-paper-fast.pdf",
+    "fake-html.html",
+    "fake.doc",
+    "fake-email.eml",
+    "fake-power-point.ppt",
+    "fake.docx",
+    "fake-power-point.pptx",
+]
+
+
+@pytest.mark.parametrize("filename", test_files)
+def test_partition_json_from_filename(filename: str):
+    """Elements serialized to a .json file round-trip when loaded by filename."""
+    source_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
+    expected = partition(filename=source_path)
+
+    with tempfile.TemporaryDirectory() as workdir:
+        json_path = os.path.join(workdir, f"{filename}.json")
+        elements_to_json(expected, filename=json_path, indent=2)
+        reloaded = partition_json(filename=json_path)
+
+    assert len(expected) > 0
+    assert len(str(expected[0])) > 0
+    assert len(expected) == len(reloaded)
+    for original, restored in zip(expected, reloaded):
+        assert original == restored
+
+
+@pytest.mark.parametrize("filename", test_files)
+def test_partition_json_from_file(filename: str):
+    """Elements serialized to a .json file round-trip when loaded from a handle."""
+    source_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
+    expected = partition(filename=source_path)
+
+    with tempfile.TemporaryDirectory() as workdir:
+        json_path = os.path.join(workdir, f"{filename}.json")
+        elements_to_json(expected, filename=json_path, indent=2)
+        with open(json_path) as handle:
+            reloaded = partition_json(file=handle)
+
+    assert len(expected) > 0
+    assert len(str(expected[0])) > 0
+    assert len(expected) == len(reloaded)
+    for original, restored in zip(expected, reloaded):
+        assert original == restored
+
+
+@pytest.mark.parametrize("filename", test_files)
+def test_partition_json_from_text(filename: str):
+    """Elements serialized to JSON round-trip when passed in as a text string."""
+    source_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
+    expected = partition(filename=source_path)
+
+    with tempfile.TemporaryDirectory() as workdir:
+        json_path = os.path.join(workdir, f"{filename}.json")
+        elements_to_json(expected, filename=json_path, indent=2)
+        with open(json_path) as handle:
+            json_text = handle.read()
+        reloaded = partition_json(text=json_text)
+
+    assert len(expected) > 0
+    assert len(str(expected[0])) > 0
+    assert len(expected) == len(reloaded)
+    for original, restored in zip(expected, reloaded):
+        assert original == restored
+
+
+def test_partition_json_raises_with_none_specified():
+    with pytest.raises(ValueError):  # no filename, file, or text is supplied
+        partition_json()
+
+
+def test_partition_json_raises_with_too_many_specified():
+    """Supplying more than one of filename, file, and text raises ValueError."""
+    path = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
+    elements = partition(filename=path)
+
+    # Run every check while the temp file exists and its handle is open: reading
+    # a closed handle also raises ValueError, which would mask a validation bug.
+    with tempfile.TemporaryDirectory() as tmpdir:
+        test_path = os.path.join(tmpdir, "fake-text.txt.json")
+        elements_to_json(elements, filename=test_path, indent=2)
+        with open(test_path) as f:
+            text = f.read()
+
+            with pytest.raises(ValueError):
+                partition_json(filename=test_path, file=f)
+            with pytest.raises(ValueError):
+                partition_json(filename=test_path, text=text)
+            with pytest.raises(ValueError):
+                partition_json(file=f, text=text)
+            with pytest.raises(ValueError):
+                partition_json(filename=test_path, file=f, text=text)
diff --git a/unstructured/file_utils/filetype.py b/unstructured/file_utils/filetype.py
index 4f069ee79..5685f7e9d 100644
--- a/unstructured/file_utils/filetype.py
+++ b/unstructured/file_utils/filetype.py
@@ -83,6 +83,7 @@ class FileType(Enum):
     EML = 40
     RTF = 41
     TXT = 42
+    JSON = 43
 
     # Markup Types
     HTML = 50
@@ -116,6 +117,7 @@ EXT_TO_FILETYPE = {
     ".xls": FileType.XLS,
     ".ppt": FileType.PPT,
     ".rtf": FileType.RTF,
+    ".json": FileType.JSON,
 }
 
 
@@ -154,6 +156,9 @@ def detect_filetype(
     if mime_type == "application/pdf":
         return FileType.PDF
 
+    elif mime_type == "application/json":
+        return FileType.JSON
+
     elif mime_type in DOCX_MIME_TYPES:
         return FileType.DOCX
 
diff --git a/unstructured/partition/auto.py b/unstructured/partition/auto.py
index f28c66d9d..b286ed541 100644
--- a/unstructured/partition/auto.py
+++ b/unstructured/partition/auto.py
@@ -6,6 +6,7 @@ from unstructured.partition.docx import partition_docx
 from unstructured.partition.email import partition_email
 from unstructured.partition.html import partition_html
 from unstructured.partition.image import partition_image
+from unstructured.partition.json import partition_json
 from unstructured.partition.md import partition_md
 from unstructured.partition.pdf import partition_pdf
 from unstructured.partition.ppt import partition_ppt
@@ -67,6 +68,8 @@ def partition(
         return partition_ppt(filename=filename, file=file, include_page_breaks=include_page_breaks)
     elif filetype == FileType.PPTX:
         return partition_pptx(filename=filename, file=file, include_page_breaks=include_page_breaks)
+    elif filetype == FileType.JSON:
+        return partition_json(filename=filename, file=file)
     else:
         msg = "Invalid file" if not filename else f"Invalid file {filename}"
         raise ValueError(f"{msg}. The {filetype} file type is not supported in partition.")
diff --git a/unstructured/partition/json.py b/unstructured/partition/json.py
new file mode 100644
index 000000000..5ec8fa6e5
--- /dev/null
+++ b/unstructured/partition/json.py
@@ -0,0 +1,46 @@
+import json
+import re
+from typing import IO, List, Optional
+
+from unstructured.documents.elements import Element
+from unstructured.staging.base import dict_to_elements
+
+LIST_OF_DICTS_PATTERN = r"\A\s*\[\s*{"
+
+
+def partition_json(
+    filename: Optional[str] = None,
+    file: Optional[IO] = None,
+    text: Optional[str] = None,
+) -> List[Element]:
+    """Partitions a .json document of serialized elements into its elements.
+
+    Exactly one of filename, file, or text must be given. Raises ValueError if
+    none or several are given, or the content is not JSON opening a list of dicts.
+    """
+    if not any([filename, file, text]):
+        raise ValueError("One of filename, file, or text must be specified.")
+    if filename is not None and not file and not text:
+        with open(filename, encoding="utf8") as f:
+            file_text = f.read()
+    elif file is not None and not filename and not text:
+        file_text = file.read()
+        if isinstance(file_text, bytes):  # binary-mode handles return bytes
+            file_text = file_text.decode("utf8")
+    elif text is not None and not filename and not file:
+        file_text = str(text)
+    else:
+        raise ValueError("Only one of filename, file, or text can be specified.")
+
+    # NOTE(Nathan): we expect file_text to be a list of dicts (optimization)
+    if not re.match(LIST_OF_DICTS_PATTERN, file_text):
+        raise ValueError("Json schema does not match the Unstructured schema")
+
+    try:
+        element_dicts = json.loads(file_text)
+    except json.JSONDecodeError as error:
+        raise ValueError("Not a valid json") from error
+
+    # NOTE(Nathan): in future PR, try extracting items that look like text
+    #               if file_text is a valid json but not an unstructured json
+    return dict_to_elements(element_dicts)