mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-25 22:23:24 +00:00
feat: update auto.partition() function to recognize Unstructured json (#337)
This commit is contained in:
parent
1580c1bf8e
commit
6be07a5260
@ -2,6 +2,7 @@
|
||||
|
||||
### Enhancements
|
||||
|
||||
* `auto.partition()` can now load Unstructured ISD json documents.
|
||||
* Simplify partitioning functions.
|
||||
* Improve logging for ingest CLI.
|
||||
|
||||
|
||||
107
test_unstructured/partition/test_json.py
Normal file
107
test_unstructured/partition/test_json.py
Normal file
@ -0,0 +1,107 @@
|
||||
import os
|
||||
import pathlib
|
||||
import tempfile
|
||||
|
||||
import pytest
|
||||
|
||||
from unstructured.partition.auto import partition
|
||||
from unstructured.partition.json import partition_json
|
||||
from unstructured.staging.base import elements_to_json
|
||||
|
||||
# Resolved path of the directory that contains this test module; the tests
# build the example-docs path relative to it.
DIRECTORY = pathlib.Path(__file__).parent.resolve()

# File names (one per supported input format) that the parametrized tests in
# this module partition, serialize to JSON, and load back with partition_json.
test_files = [
    "fake-text.txt",
    "layout-parser-paper-fast.pdf",
    "fake-html.html",
    "fake.doc",
    "fake-email.eml",
    "fake-power-point.ppt",
    "fake.docx",
    "fake-power-point.pptx",
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("filename", test_files)
def test_partition_json_from_filename(filename: str):
    """Round-trip a document through JSON and reload it by file name.

    The example document is partitioned, the elements are written to a JSON
    file in a temporary directory, and partition_json(filename=...) must give
    back an equal list of elements.
    """
    source_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
    elements = partition(filename=source_path)

    with tempfile.TemporaryDirectory() as tmpdir:
        json_path = os.path.join(tmpdir, filename + ".json")
        elements_to_json(elements, filename=json_path, indent=2)
        test_elements = partition_json(filename=json_path)

    # The source document must have produced at least one non-empty element.
    assert len(elements) > 0
    assert len(str(elements[0])) > 0

    # Lengths are checked first, so the pairwise zip below covers every element.
    assert len(elements) == len(test_elements)
    for expected, reloaded in zip(elements, test_elements):
        assert expected == reloaded
|
||||
|
||||
|
||||
@pytest.mark.parametrize("filename", test_files)
def test_partition_json_from_file(filename: str):
    """Round-trip a document through JSON and reload it from an open file.

    The example document is partitioned, the elements are written to a JSON
    file in a temporary directory, and partition_json(file=...) must give back
    an equal list of elements.
    """
    source_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
    elements = partition(filename=source_path)

    with tempfile.TemporaryDirectory() as tmpdir:
        json_path = os.path.join(tmpdir, filename + ".json")
        elements_to_json(elements, filename=json_path, indent=2)
        with open(json_path) as f:
            test_elements = partition_json(file=f)

    # The source document must have produced at least one non-empty element.
    assert len(elements) > 0
    assert len(str(elements[0])) > 0

    # Lengths are checked first, so the pairwise zip below covers every element.
    assert len(elements) == len(test_elements)
    for expected, reloaded in zip(elements, test_elements):
        assert expected == reloaded
|
||||
|
||||
|
||||
@pytest.mark.parametrize("filename", test_files)
def test_partition_json_from_text(filename: str):
    """Round-trip a document through JSON and reload it from a string.

    The example document is partitioned, the elements are written to a JSON
    file in a temporary directory, the file is read back as text, and
    partition_json(text=...) must give back an equal list of elements.
    """
    source_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
    elements = partition(filename=source_path)

    with tempfile.TemporaryDirectory() as tmpdir:
        json_path = os.path.join(tmpdir, filename + ".json")
        elements_to_json(elements, filename=json_path, indent=2)
        with open(json_path) as f:
            text = f.read()
        test_elements = partition_json(text=text)

    # The source document must have produced at least one non-empty element.
    assert len(elements) > 0
    assert len(str(elements[0])) > 0

    # Lengths are checked first, so the pairwise zip below covers every element.
    assert len(elements) == len(test_elements)
    for expected, reloaded in zip(elements, test_elements):
        assert expected == reloaded
|
||||
|
||||
|
||||
def test_partition_json_raises_with_none_specified():
    """partition_json must reject a call that supplies no input source."""
    # No filename, file, or text is passed at all.
    with pytest.raises(ValueError):
        partition_json()
|
||||
|
||||
|
||||
def test_partition_json_raises_with_too_many_specified():
    """partition_json must reject any call that supplies more than one source."""
    source_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
    elements = partition(filename=source_path)

    with tempfile.TemporaryDirectory() as tmpdir:
        test_path = os.path.join(tmpdir, "fake-text.txt.json")
        elements_to_json(elements, filename=test_path, indent=2)
        with open(test_path) as f:
            text = f.read()

    # Every combination of two or three sources, in the same order as before.
    # The argument check raises before any source is opened or read.
    conflicting_sources = (
        {"filename": test_path, "file": f},
        {"filename": test_path, "text": text},
        {"file": f, "text": text},
        {"filename": test_path, "file": f, "text": text},
    )
    for sources in conflicting_sources:
        with pytest.raises(ValueError):
            partition_json(**sources)
|
||||
@ -83,6 +83,7 @@ class FileType(Enum):
|
||||
EML = 40
|
||||
RTF = 41
|
||||
TXT = 42
|
||||
JSON = 43
|
||||
|
||||
# Markup Types
|
||||
HTML = 50
|
||||
@ -116,6 +117,7 @@ EXT_TO_FILETYPE = {
|
||||
".xls": FileType.XLS,
|
||||
".ppt": FileType.PPT,
|
||||
".rtf": FileType.RTF,
|
||||
".json": FileType.JSON,
|
||||
}
|
||||
|
||||
|
||||
@ -154,6 +156,9 @@ def detect_filetype(
|
||||
if mime_type == "application/pdf":
|
||||
return FileType.PDF
|
||||
|
||||
elif mime_type == "application/json":
|
||||
return FileType.JSON
|
||||
|
||||
elif mime_type in DOCX_MIME_TYPES:
|
||||
return FileType.DOCX
|
||||
|
||||
|
||||
@ -6,6 +6,7 @@ from unstructured.partition.docx import partition_docx
|
||||
from unstructured.partition.email import partition_email
|
||||
from unstructured.partition.html import partition_html
|
||||
from unstructured.partition.image import partition_image
|
||||
from unstructured.partition.json import partition_json
|
||||
from unstructured.partition.md import partition_md
|
||||
from unstructured.partition.pdf import partition_pdf
|
||||
from unstructured.partition.ppt import partition_ppt
|
||||
@ -67,6 +68,8 @@ def partition(
|
||||
return partition_ppt(filename=filename, file=file, include_page_breaks=include_page_breaks)
|
||||
elif filetype == FileType.PPTX:
|
||||
return partition_pptx(filename=filename, file=file, include_page_breaks=include_page_breaks)
|
||||
elif filetype == FileType.JSON:
|
||||
return partition_json(filename=filename, file=file)
|
||||
else:
|
||||
msg = "Invalid file" if not filename else f"Invalid file {filename}"
|
||||
raise ValueError(f"{msg}. The {filetype} file type is not supported in partition.")
|
||||
|
||||
46
unstructured/partition/json.py
Normal file
46
unstructured/partition/json.py
Normal file
@ -0,0 +1,46 @@
|
||||
import json
|
||||
import re
|
||||
from typing import IO, List, Optional
|
||||
|
||||
from unstructured.documents.elements import Element
|
||||
from unstructured.staging.base import dict_to_elements
|
||||
|
||||
# Matches text whose first non-whitespace characters open a JSON list of
# objects, i.e. "[" followed by "{".
LIST_OF_DICTS_PATTERN = r"\A\s*\[\s*{"


def partition_json(
    filename: Optional[str] = None,
    file: Optional[IO] = None,
    text: Optional[str] = None,
) -> List[Element]:
    """Partitions a .json document into its constituent elements.

    Exactly one of ``filename``, ``file``, or ``text`` must be provided.

    Parameters
    ----------
    filename
        Path of a JSON file; it is opened and read as UTF-8 text.
    file
        An open file-like object. It may yield ``str`` (text mode) or
        ``bytes`` (binary mode); bytes are decoded as UTF-8.
    text
        The JSON document as a string.

    Returns
    -------
    The result of passing the parsed JSON to ``dict_to_elements``.

    Raises
    ------
    ValueError
        If no source or more than one source is given, if the text does not
        start like a list of objects, or if it is not valid JSON.
    """
    if not any([filename, file, text]):
        raise ValueError("One of filename, file, or text must be specified.")

    if filename is not None and not file and not text:
        with open(filename, encoding="utf8") as f:
            file_text = f.read()

    elif file is not None and not filename and not text:
        file_content = file.read()
        # A handle opened in binary mode yields bytes. Decode them so the
        # str-based regex and error handling below work for both modes.
        if isinstance(file_content, (bytes, bytearray)):
            file_text = bytes(file_content).decode("utf8")
        else:
            file_text = file_content

    elif text is not None and not filename and not file:
        file_text = str(text)

    else:
        raise ValueError("Only one of filename, file, or text can be specified.")

    # NOTE(Nathan): we expect file_text to be a list of dicts (optimization)
    if not re.match(LIST_OF_DICTS_PATTERN, file_text):
        raise ValueError("Json schema does not match the Unstructured schema")

    # Only json.loads can raise JSONDecodeError, so keep the try body minimal
    # and chain the cause for easier debugging.
    try:
        element_dicts = json.loads(file_text)
    except json.JSONDecodeError as err:
        raise ValueError("Not a valid json") from err

    # NOTE(Nathan): in future PR, try extracting items that look like text
    # if file_text is a valid json but not an unstructured json

    return dict_to_elements(element_dicts)
|
||||
Loading…
x
Reference in New Issue
Block a user