feat: update auto.partition() function to recognize Unstructured json (#337)

This commit is contained in:
natygyoon 2023-03-09 03:36:01 +09:00 committed by GitHub
parent 1580c1bf8e
commit 6be07a5260
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 162 additions and 0 deletions

View File

@ -2,6 +2,7 @@
### Enhancements
* `auto.partition()` can now load Unstructured ISD JSON documents.
* Simplify partitioning functions.
* Improve logging for ingest CLI.

View File

@ -0,0 +1,107 @@
import os
import pathlib
import tempfile
import pytest
from unstructured.partition.auto import partition
from unstructured.partition.json import partition_json
from unstructured.staging.base import elements_to_json
# Absolute path of the directory that contains this test module.
DIRECTORY = pathlib.Path(__file__).parent.resolve()
# Example documents used to parametrize the JSON round-trip tests in this
# module; each name is joined onto the "example-docs" directory located two
# levels above DIRECTORY.
test_files = [
"fake-text.txt",
"layout-parser-paper-fast.pdf",
"fake-html.html",
"fake.doc",
"fake-email.eml",
"fake-power-point.ppt",
"fake.docx",
"fake-power-point.pptx",
]
@pytest.mark.parametrize("filename", test_files)
def test_partition_json_from_filename(filename: str):
    """Round-trip an example document through JSON and reload it by filename.

    The document is partitioned, serialized to a temporary ``.json`` file with
    ``elements_to_json`` and read back with ``partition_json(filename=...)``.
    The reloaded elements must match the originals one-to-one.
    """
    source_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
    expected = partition(filename=source_path)

    with tempfile.TemporaryDirectory() as tmpdir:
        json_path = os.path.join(tmpdir, f"{filename}.json")
        elements_to_json(expected, filename=json_path, indent=2)
        reloaded = partition_json(filename=json_path)

    # The source document must have produced at least one non-empty element.
    assert len(expected) > 0
    assert len(str(expected[0])) > 0

    # Same number of elements, and each pair compares equal.
    assert len(expected) == len(reloaded)
    for original, restored in zip(expected, reloaded):
        assert original == restored
@pytest.mark.parametrize("filename", test_files)
def test_partition_json_from_file(filename: str):
    """Round-trip an example document through JSON and reload it from a file object.

    The document is partitioned, serialized to a temporary ``.json`` file with
    ``elements_to_json`` and read back with ``partition_json(file=...)`` using
    an open file handle. The reloaded elements must match the originals.
    """
    source_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
    expected = partition(filename=source_path)

    with tempfile.TemporaryDirectory() as tmpdir:
        json_path = os.path.join(tmpdir, f"{filename}.json")
        elements_to_json(expected, filename=json_path, indent=2)
        with open(json_path) as f:
            reloaded = partition_json(file=f)

    # The source document must have produced at least one non-empty element.
    assert len(expected) > 0
    assert len(str(expected[0])) > 0

    # Same number of elements, and each pair compares equal.
    assert len(expected) == len(reloaded)
    for original, restored in zip(expected, reloaded):
        assert original == restored
@pytest.mark.parametrize("filename", test_files)
def test_partition_json_from_text(filename: str):
    """Round-trip an example document through JSON and reload it from a string.

    The document is partitioned, serialized to a temporary ``.json`` file with
    ``elements_to_json``, and the file contents are passed to
    ``partition_json(text=...)``. The reloaded elements must match the
    originals.
    """
    source_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
    expected = partition(filename=source_path)

    with tempfile.TemporaryDirectory() as tmpdir:
        json_path = os.path.join(tmpdir, f"{filename}.json")
        elements_to_json(expected, filename=json_path, indent=2)
        with open(json_path) as f:
            text = f.read()
    reloaded = partition_json(text=text)

    # The source document must have produced at least one non-empty element.
    assert len(expected) > 0
    assert len(str(expected[0])) > 0

    # Same number of elements, and each pair compares equal.
    assert len(expected) == len(reloaded)
    for original, restored in zip(expected, reloaded):
        assert original == restored
def test_partition_json_raises_with_none_specified():
    """Calling ``partition_json`` without any input source raises ``ValueError``."""
    with pytest.raises(ValueError):
        # Spelled out explicitly; these are the parameter defaults.
        partition_json(filename=None, file=None, text=None)
def test_partition_json_raises_with_too_many_specified():
    """Supplying more than one of filename, file and text raises ``ValueError``."""
    source_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
    expected = partition(filename=source_path)

    with tempfile.TemporaryDirectory() as tmpdir:
        json_path = os.path.join(tmpdir, "fake-text.txt.json")
        elements_to_json(expected, filename=json_path, indent=2)
        with open(json_path) as f:
            text = f.read()

            # Every combination of two or more input sources must be rejected.
            conflicting_inputs = [
                {"filename": json_path, "file": f},
                {"filename": json_path, "text": text},
                {"file": f, "text": text},
                {"filename": json_path, "file": f, "text": text},
            ]
            for kwargs in conflicting_inputs:
                with pytest.raises(ValueError):
                    partition_json(**kwargs)

View File

@ -83,6 +83,7 @@ class FileType(Enum):
EML = 40
RTF = 41
TXT = 42
JSON = 43
# Markup Types
HTML = 50
@ -116,6 +117,7 @@ EXT_TO_FILETYPE = {
".xls": FileType.XLS,
".ppt": FileType.PPT,
".rtf": FileType.RTF,
".json": FileType.JSON,
}
@ -154,6 +156,9 @@ def detect_filetype(
if mime_type == "application/pdf":
return FileType.PDF
elif mime_type == "application/json":
return FileType.JSON
elif mime_type in DOCX_MIME_TYPES:
return FileType.DOCX

View File

@ -6,6 +6,7 @@ from unstructured.partition.docx import partition_docx
from unstructured.partition.email import partition_email
from unstructured.partition.html import partition_html
from unstructured.partition.image import partition_image
from unstructured.partition.json import partition_json
from unstructured.partition.md import partition_md
from unstructured.partition.pdf import partition_pdf
from unstructured.partition.ppt import partition_ppt
@ -67,6 +68,8 @@ def partition(
return partition_ppt(filename=filename, file=file, include_page_breaks=include_page_breaks)
elif filetype == FileType.PPTX:
return partition_pptx(filename=filename, file=file, include_page_breaks=include_page_breaks)
elif filetype == FileType.JSON:
return partition_json(filename=filename, file=file)
else:
msg = "Invalid file" if not filename else f"Invalid file {filename}"
raise ValueError(f"{msg}. The {filetype} file type is not supported in partition.")

View File

@ -0,0 +1,46 @@
import json
import re
from typing import IO, List, Optional
from unstructured.documents.elements import Element
from unstructured.staging.base import dict_to_elements
# Matches text whose first non-whitespace characters open a JSON list of
# objects, i.e. "[" followed (after optional whitespace) by "{".
LIST_OF_DICTS_PATTERN = r"\A\s*\[\s*{"


def partition_json(
    filename: Optional[str] = None,
    file: Optional[IO] = None,
    text: Optional[str] = None,
) -> List[Element]:
    """Partition an Unstructured JSON document into its constituent elements.

    Exactly one of ``filename``, ``file`` or ``text`` must be provided.

    Parameters
    ----------
    filename
        Path of the JSON file to read; it is opened as UTF-8 text.
    file
        An open file-like object. Handles opened in either text or binary
        mode are accepted; binary content is decoded as UTF-8.
    text
        The JSON document as a string.

    Returns
    -------
    The elements produced by ``dict_to_elements`` from the parsed JSON.

    Raises
    ------
    ValueError
        If none or more than one input source is given, if the text does not
        start like a JSON list of objects, or if it is not valid JSON.
    """
    if not any([filename, file, text]):
        raise ValueError("One of filename, file, or text must be specified.")

    if filename is not None and not file and not text:
        with open(filename, encoding="utf8") as f:
            file_text = f.read()
    elif file is not None and not filename and not text:
        file_content = file.read()
        # A handle opened in binary mode (e.g. "rb") yields bytes, which the
        # str regex below cannot match against; decode it first.
        if isinstance(file_content, (bytes, bytearray)):
            file_text = bytes(file_content).decode("utf8")
        else:
            file_text = file_content
    elif text is not None and not filename and not file:
        file_text = str(text)
    else:
        raise ValueError("Only one of filename, file, or text can be specified.")

    # NOTE(Nathan): we expect file_text to be a list of dicts (optimization)
    if not re.match(LIST_OF_DICTS_PATTERN, file_text):
        raise ValueError("Json schema does not match the Unstructured schema")

    # Only the parse itself is guarded so that unrelated errors are not masked.
    try:
        element_dicts = json.loads(file_text)
    except json.JSONDecodeError as err:
        raise ValueError("Not a valid json") from err

    # NOTE(Nathan): in future PR, try extracting items that look like text
    #               if file_text is a valid json but not an unstructured json
    return dict_to_elements(element_dicts)