From 6be07a5260917c6e2dc7d134f789d9d046afdb1b Mon Sep 17 00:00:00 2001
From: natygyoon <59327875+natygyoon@users.noreply.github.com>
Date: Thu, 9 Mar 2023 03:36:01 +0900
Subject: [PATCH] feat: update auto.partition() function to recognize Unstructured json (#337)

---
 CHANGELOG.md                             |   1 +
 test_unstructured/partition/test_json.py | 107 +++++++++++++++++++++++
 unstructured/file_utils/filetype.py      |   5 ++
 unstructured/partition/auto.py           |   3 +
 unstructured/partition/json.py           |  58 ++++++++++++
 5 files changed, 174 insertions(+)
 create mode 100644 test_unstructured/partition/test_json.py
 create mode 100644 unstructured/partition/json.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 23f3a2833..d10118640 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,7 @@
 
 ### Enhancements
 
+* `auto.partition()` can now load Unstructured ISD json documents.
 * Simplify partitioning functions.
 * Improve logging for ingest CLI.
 
diff --git a/test_unstructured/partition/test_json.py b/test_unstructured/partition/test_json.py
new file mode 100644
index 000000000..a41ebcef0
--- /dev/null
+++ b/test_unstructured/partition/test_json.py
@@ -0,0 +1,107 @@
+import os
+import pathlib
+import tempfile
+
+import pytest
+
+from unstructured.partition.auto import partition
+from unstructured.partition.json import partition_json
+from unstructured.staging.base import elements_to_json
+
+DIRECTORY = pathlib.Path(__file__).parent.resolve()
+
+test_files = [
+    "fake-text.txt",
+    "layout-parser-paper-fast.pdf",
+    "fake-html.html",
+    "fake.doc",
+    "fake-email.eml",
+    "fake-power-point.ppt",
+    "fake.docx",
+    "fake-power-point.pptx",
+]
+
+
+@pytest.mark.parametrize("filename", test_files)
+def test_partition_json_from_filename(filename: str):
+    path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
+    elements = partition(filename=path)
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        test_path = os.path.join(tmpdir, filename + ".json")
+        elements_to_json(elements, filename=test_path, indent=2)
+        test_elements = partition_json(filename=test_path)
+
+    assert len(elements) > 0
+    assert len(str(elements[0])) > 0
+
+    assert len(elements) == len(test_elements)
+    for i in range(len(elements)):
+        assert elements[i] == test_elements[i]
+
+
+@pytest.mark.parametrize("filename", test_files)
+def test_partition_json_from_file(filename: str):
+    path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
+    elements = partition(filename=path)
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        test_path = os.path.join(tmpdir, filename + ".json")
+        elements_to_json(elements, filename=test_path, indent=2)
+        with open(test_path) as f:
+            test_elements = partition_json(file=f)
+
+    assert len(elements) > 0
+    assert len(str(elements[0])) > 0
+
+    assert len(elements) == len(test_elements)
+    for i in range(len(elements)):
+        assert elements[i] == test_elements[i]
+
+
+@pytest.mark.parametrize("filename", test_files)
+def test_partition_json_from_text(filename: str):
+    path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
+    elements = partition(filename=path)
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        test_path = os.path.join(tmpdir, filename + ".json")
+        elements_to_json(elements, filename=test_path, indent=2)
+        with open(test_path) as f:
+            text = f.read()
+        test_elements = partition_json(text=text)
+
+    assert len(elements) > 0
+    assert len(str(elements[0])) > 0
+
+    assert len(elements) == len(test_elements)
+    for i in range(len(elements)):
+        assert elements[i] == test_elements[i]
+
+
+def test_partition_json_raises_with_none_specified():
+    with pytest.raises(ValueError):
+        partition_json()
+
+
+def test_partition_json_raises_with_too_many_specified():
+    path = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
+    elements = partition(filename=path)
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        test_path = os.path.join(tmpdir, "fake-text.txt.json")
+        elements_to_json(elements, filename=test_path, indent=2)
+        with open(test_path) as f:
+            text = f.read()
+
+    with pytest.raises(ValueError):
+        partition_json(filename=test_path, file=f)
+
+    with pytest.raises(ValueError):
+        partition_json(filename=test_path, text=text)
+
+    with pytest.raises(ValueError):
+        partition_json(file=f, text=text)
+
+    with pytest.raises(ValueError):
+        partition_json(filename=test_path, file=f, text=text)
diff --git a/unstructured/file_utils/filetype.py b/unstructured/file_utils/filetype.py
index 4f069ee79..5685f7e9d 100644
--- a/unstructured/file_utils/filetype.py
+++ b/unstructured/file_utils/filetype.py
@@ -83,6 +83,7 @@ class FileType(Enum):
     EML = 40
     RTF = 41
     TXT = 42
+    JSON = 43
 
     # Markup Types
     HTML = 50
@@ -116,6 +117,7 @@ EXT_TO_FILETYPE = {
     ".xls": FileType.XLS,
     ".ppt": FileType.PPT,
     ".rtf": FileType.RTF,
+    ".json": FileType.JSON,
 }
 
 
@@ -154,6 +156,9 @@ def detect_filetype(
     if mime_type == "application/pdf":
         return FileType.PDF
 
+    elif mime_type == "application/json":
+        return FileType.JSON
+
     elif mime_type in DOCX_MIME_TYPES:
         return FileType.DOCX
 
diff --git a/unstructured/partition/auto.py b/unstructured/partition/auto.py
index f28c66d9d..b286ed541 100644
--- a/unstructured/partition/auto.py
+++ b/unstructured/partition/auto.py
@@ -6,6 +6,7 @@ from unstructured.partition.docx import partition_docx
 from unstructured.partition.email import partition_email
 from unstructured.partition.html import partition_html
 from unstructured.partition.image import partition_image
+from unstructured.partition.json import partition_json
 from unstructured.partition.md import partition_md
 from unstructured.partition.pdf import partition_pdf
 from unstructured.partition.ppt import partition_ppt
@@ -67,6 +68,8 @@ def partition(
         return partition_ppt(filename=filename, file=file, include_page_breaks=include_page_breaks)
     elif filetype == FileType.PPTX:
         return partition_pptx(filename=filename, file=file, include_page_breaks=include_page_breaks)
+    elif filetype == FileType.JSON:
+        return partition_json(filename=filename, file=file)
     else:
         msg = "Invalid file" if not filename else f"Invalid file {filename}"
         raise ValueError(f"{msg}. The {filetype} file type is not supported in partition.")
diff --git a/unstructured/partition/json.py b/unstructured/partition/json.py
new file mode 100644
index 000000000..5ec8fa6e5
--- /dev/null
+++ b/unstructured/partition/json.py
@@ -0,0 +1,58 @@
+import json
+import re
+from typing import IO, List, Optional
+
+from unstructured.documents.elements import Element
+from unstructured.staging.base import dict_to_elements
+
+LIST_OF_DICTS_PATTERN = r"\A\s*\[\s*{"
+
+
+def partition_json(
+    filename: Optional[str] = None,
+    file: Optional[IO] = None,
+    text: Optional[str] = None,
+) -> List[Element]:
+    """Partitions a .json document into its constituent elements.
+
+    One of filename, file, or text must be specified, and only one. The document
+    must start with a list of dicts; the parsed list is handed to dict_to_elements.
+
+    Raises ValueError if no source or more than one source is given, if the
+    document does not start with a list of dicts, or if it is not valid json.
+    """
+    if not any([filename, file, text]):
+        raise ValueError("One of filename, file, or text must be specified.")
+
+    if filename is not None and not file and not text:
+        with open(filename, encoding="utf8") as f:
+            file_text = f.read()
+
+    elif file is not None and not filename and not text:
+        file_content = file.read()
+        # NOTE: IO may be a binary stream; the str pattern below cannot be
+        # matched against bytes, so decode them like the filename branch does.
+        if isinstance(file_content, bytes):
+            file_text = file_content.decode("utf8")
+        else:
+            file_text = file_content
+
+    elif text is not None and not filename and not file:
+        file_text = str(text)
+
+    else:
+        raise ValueError("Only one of filename, file, or text can be specified.")
+
+    # NOTE(Nathan): we expect file_text to be a list of dicts (optimization)
+    if not re.match(LIST_OF_DICTS_PATTERN, file_text):
+        raise ValueError("Json schema does not match the Unstructured schema")
+
+    try:
+        element_dicts = json.loads(file_text)
+    except json.JSONDecodeError as error:
+        raise ValueError("Not a valid json") from error
+
+    # NOTE(Nathan): in future PR, try extracting items that look like text
+    # if file_text is a valid json but not an unstructured json
+
+    return dict_to_elements(element_dicts)