mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-17 10:14:36 +00:00
rfctr(part): extract partition.common submodules (#3649)
**Summary** In preparation for consolidating post-partitioning metadata decorators, extract `partition.common` module into a sub-package (directory) and extract `partition.common.metadata` module to house metadata-specific objects shared by partitioners. **Additional Context** - This new module will be the home of the new consolidated metadata decorator. - The consolidated decorator is a step toward removing post-processing decorators from _delegating_ partitioners. A delegating partitioner is one that converts its file to a different format and "delegates" actual partitioning to the partitioner for that target format. 10 of the 20 partitioners are delegating partitioners. - Removing decorators from delegating partitioners will allow us to avoid "double-decorating", i.e. running those decorators twice, once on the principal partitioner and again on the proxy partitioner. - This will allow us to send `**kwargs` to either partitioner, removing the knowledge of which arguments to send for each file-type from auto-partition. - And this will allow pluggable auto-partitioners which all have a `partition_x(filename, *, file, **kwargs) -> list[Element]` interface.
This commit is contained in:
parent
7d66a236f1
commit
03c2bf8f1f
12
CHANGELOG.md
12
CHANGELOG.md
@ -1,5 +1,17 @@
|
|||||||
|
## 0.15.14-dev0
|
||||||
|
|
||||||
|
### Enhancements
|
||||||
|
|
||||||
|
### Features
|
||||||
|
|
||||||
|
### Fixes
|
||||||
|
|
||||||
## 0.15.13
|
## 0.15.13
|
||||||
|
|
||||||
|
### BREAKING CHANGES
|
||||||
|
|
||||||
|
* **Remove dead experimental code.** Unused code in `file_utils.experimantal` and `file_utils.metadata` was removed. These functions were never published in the documentation, but if a client dug these out and used them this removal could break client code.
|
||||||
|
|
||||||
### Enhancements
|
### Enhancements
|
||||||
|
|
||||||
* **Improve `pdfminer` image cleanup process**. Optimized the removal of duplicated pdfminer images by performing the cleanup before merging elements, rather than after. This improvement reduces execution time and enhances overall processing speed of PDF documents.
|
* **Improve `pdfminer` image cleanup process**. Optimized the removal of duplicated pdfminer images by performing the cleanup before merging elements, rather than after. This improvement reduces execution time and enhances overall processing speed of PDF documents.
|
||||||
|
|||||||
@ -3,13 +3,4 @@
|
|||||||
set -e
|
set -e
|
||||||
|
|
||||||
# $1 is the path for chroma to write the contents to. The symbol "&" runs process in background
|
# $1 is the path for chroma to write the contents to. The symbol "&" runs process in background
|
||||||
echo "Current venv is:"
|
|
||||||
echo "$VIRTUAL_ENV"
|
|
||||||
echo "Current path is:"
|
|
||||||
echo "$PATH"
|
|
||||||
ls -l "$VIRTUAL_ENV/bin/chroma"
|
|
||||||
echo "================"
|
|
||||||
cat "$VIRTUAL_ENV/bin/chroma"
|
|
||||||
echo "================"
|
|
||||||
# chroma run --path "$1" &
|
|
||||||
python "$VIRTUAL_ENV/bin/chroma" run --path "$1" &
|
python "$VIRTUAL_ENV/bin/chroma" run --path "$1" &
|
||||||
|
|||||||
0
test_unstructured/partition/common/__init__.py
Normal file
0
test_unstructured/partition/common/__init__.py
Normal file
@ -1,6 +1,3 @@
|
|||||||
import datetime as dt
|
|
||||||
import io
|
|
||||||
import os
|
|
||||||
import pathlib
|
import pathlib
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from multiprocessing import Pool
|
from multiprocessing import Pool
|
||||||
@ -20,7 +17,6 @@ from unstructured.documents.elements import (
|
|||||||
TYPE_TO_TEXT_ELEMENT_MAP,
|
TYPE_TO_TEXT_ELEMENT_MAP,
|
||||||
CheckBox,
|
CheckBox,
|
||||||
CoordinatesMetadata,
|
CoordinatesMetadata,
|
||||||
ElementMetadata,
|
|
||||||
ElementType,
|
ElementType,
|
||||||
FigureCaption,
|
FigureCaption,
|
||||||
Header,
|
Header,
|
||||||
@ -32,7 +28,7 @@ from unstructured.documents.elements import (
|
|||||||
from unstructured.documents.elements import (
|
from unstructured.documents.elements import (
|
||||||
Image as ImageElement,
|
Image as ImageElement,
|
||||||
)
|
)
|
||||||
from unstructured.partition import common
|
from unstructured.partition.common import common
|
||||||
from unstructured.partition.utils.constants import SORT_MODE_BASIC, SORT_MODE_DONT, SORT_MODE_XY_CUT
|
from unstructured.partition.utils.constants import SORT_MODE_BASIC, SORT_MODE_DONT, SORT_MODE_XY_CUT
|
||||||
|
|
||||||
|
|
||||||
@ -347,7 +343,7 @@ class MockRunOutput:
|
|||||||
|
|
||||||
|
|
||||||
def test_convert_office_doc_captures_errors(monkeypatch, caplog):
|
def test_convert_office_doc_captures_errors(monkeypatch, caplog):
|
||||||
from unstructured.partition.common import subprocess
|
from unstructured.partition.common.common import subprocess
|
||||||
|
|
||||||
def mock_run(*args, **kwargs):
|
def mock_run(*args, **kwargs):
|
||||||
return MockRunOutput(1, "an error occurred".encode(), "error details".encode())
|
return MockRunOutput(1, "an error occurred".encode(), "error details".encode())
|
||||||
@ -429,75 +425,6 @@ def test_get_page_image_metadata_and_coordinate_system():
|
|||||||
assert isinstance(metadata, dict)
|
assert isinstance(metadata, dict)
|
||||||
|
|
||||||
|
|
||||||
def test_set_element_hierarchy():
|
|
||||||
elements_to_set = [
|
|
||||||
Title(text="Title"), # 0
|
|
||||||
NarrativeText(text="NarrativeText"), # 1
|
|
||||||
FigureCaption(text="FigureCaption"), # 2
|
|
||||||
ListItem(text="ListItem"), # 3
|
|
||||||
ListItem(text="ListItem", metadata=ElementMetadata(category_depth=1)), # 4
|
|
||||||
ListItem(text="ListItem", metadata=ElementMetadata(category_depth=1)), # 5
|
|
||||||
ListItem(text="ListItem"), # 6
|
|
||||||
CheckBox(element_id="some-id-1", checked=True), # 7
|
|
||||||
Title(text="Title 2"), # 8
|
|
||||||
ListItem(text="ListItem"), # 9
|
|
||||||
ListItem(text="ListItem"), # 10
|
|
||||||
Text(text="Text"), # 11
|
|
||||||
]
|
|
||||||
elements = common.set_element_hierarchy(elements_to_set)
|
|
||||||
|
|
||||||
assert (
|
|
||||||
elements[1].metadata.parent_id == elements[0].id
|
|
||||||
), "NarrativeText should be child of Title"
|
|
||||||
assert (
|
|
||||||
elements[2].metadata.parent_id == elements[0].id
|
|
||||||
), "FigureCaption should be child of Title"
|
|
||||||
assert elements[3].metadata.parent_id == elements[0].id, "ListItem should be child of Title"
|
|
||||||
assert elements[4].metadata.parent_id == elements[3].id, "ListItem should be child of Title"
|
|
||||||
assert elements[5].metadata.parent_id == elements[3].id, "ListItem should be child of Title"
|
|
||||||
assert elements[6].metadata.parent_id == elements[0].id, "ListItem should be child of Title"
|
|
||||||
# NOTE(Hubert): moving the category field to Element, caused this to fail.
|
|
||||||
# Checkboxes will soon be deprecated, then we can remove the test.
|
|
||||||
# assert (
|
|
||||||
# elements[7].metadata.parent_id is None
|
|
||||||
# ), "CheckBox should be None, as it's not a Text based element"
|
|
||||||
assert elements[8].metadata.parent_id is None, "Title 2 should be child of None"
|
|
||||||
assert elements[9].metadata.parent_id == elements[8].id, "ListItem should be child of Title 2"
|
|
||||||
assert elements[10].metadata.parent_id == elements[8].id, "ListItem should be child of Title 2"
|
|
||||||
assert elements[11].metadata.parent_id == elements[8].id, "Text should be child of Title 2"
|
|
||||||
|
|
||||||
|
|
||||||
def test_set_element_hierarchy_custom_rule_set():
|
|
||||||
elements_to_set = [
|
|
||||||
Header(text="Header"), # 0
|
|
||||||
Title(text="Title"), # 1
|
|
||||||
NarrativeText(text="NarrativeText"), # 2
|
|
||||||
Text(text="Text"), # 3
|
|
||||||
Title(text="Title 2"), # 4
|
|
||||||
FigureCaption(text="FigureCaption"), # 5
|
|
||||||
]
|
|
||||||
|
|
||||||
custom_rule_set = {
|
|
||||||
"Header": ["Title", "Text"],
|
|
||||||
"Title": ["NarrativeText", "UncategorizedText", "FigureCaption"],
|
|
||||||
}
|
|
||||||
|
|
||||||
elements = common.set_element_hierarchy(
|
|
||||||
elements=elements_to_set,
|
|
||||||
ruleset=custom_rule_set,
|
|
||||||
)
|
|
||||||
|
|
||||||
assert elements[1].metadata.parent_id == elements[0].id, "Title should be child of Header"
|
|
||||||
assert (
|
|
||||||
elements[2].metadata.parent_id == elements[1].id
|
|
||||||
), "NarrativeText should be child of Title"
|
|
||||||
assert elements[3].metadata.parent_id == elements[1].id, "Text should be child of Title"
|
|
||||||
assert elements[4].metadata.parent_id == elements[0].id, "Title 2 should be child of Header"
|
|
||||||
assert (
|
|
||||||
elements[5].metadata.parent_id == elements[4].id
|
|
||||||
), "FigureCaption should be child of Title 2"
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class MockImage:
|
class MockImage:
|
||||||
width = 640
|
width = 640
|
||||||
@ -607,99 +534,3 @@ def test_ocr_data_to_elements(
|
|||||||
points=layout_el.bbox.coordinates,
|
points=layout_el.bbox.coordinates,
|
||||||
system=coordinate_system,
|
system=coordinate_system,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
class Describe_get_last_modified:
|
|
||||||
"""Isolated unit-tests for `unstructured.partition.common.get_last_modified()."""
|
|
||||||
|
|
||||||
def it_pulls_last_modified_from_the_filesystem_when_a_path_is_provided(
|
|
||||||
self, file_and_last_modified: tuple[str, str]
|
|
||||||
):
|
|
||||||
file_path, last_modified = file_and_last_modified
|
|
||||||
last_modified_date = common.get_last_modified(str(file_path), None, False)
|
|
||||||
assert last_modified_date == last_modified
|
|
||||||
|
|
||||||
def and_it_pulls_last_modified_from_the_file_like_object_when_one_is_provided(
|
|
||||||
self, file_and_last_modified: tuple[str, str]
|
|
||||||
):
|
|
||||||
file_path, last_modified = file_and_last_modified
|
|
||||||
with open(file_path, "rb") as f:
|
|
||||||
last_modified_date = common.get_last_modified(None, f, True)
|
|
||||||
assert last_modified_date == last_modified
|
|
||||||
|
|
||||||
def but_not_when_date_from_file_object_is_False(self, file_and_last_modified: tuple[str, str]):
|
|
||||||
file_path, _ = file_and_last_modified
|
|
||||||
with open(file_path, "rb") as f:
|
|
||||||
last_modified_date = common.get_last_modified(None, f, False)
|
|
||||||
assert last_modified_date is None
|
|
||||||
|
|
||||||
# -- fixtures --------------------------------------------------------------------------------
|
|
||||||
|
|
||||||
@pytest.fixture()
|
|
||||||
def file_and_last_modified(self, tmp_path: pathlib.Path) -> tuple[str, str]:
|
|
||||||
modified_timestamp = dt.datetime(
|
|
||||||
year=2024, month=6, day=14, hour=15, minute=39, second=25
|
|
||||||
).timestamp()
|
|
||||||
file_path = tmp_path / "some_file.txt"
|
|
||||||
file_path.write_text("abcdefg")
|
|
||||||
os.utime(file_path, (modified_timestamp, modified_timestamp))
|
|
||||||
return str(file_path), "2024-06-14T15:39:25"
|
|
||||||
|
|
||||||
|
|
||||||
class Describe_get_last_modified_date:
|
|
||||||
def it_gets_the_modified_time_of_a_file_identified_by_a_path(self, tmp_path: pathlib.Path):
|
|
||||||
modified_timestamp = dt.datetime(
|
|
||||||
year=2024, month=3, day=5, hour=17, minute=43, second=40
|
|
||||||
).timestamp()
|
|
||||||
file_path = tmp_path / "some_file.txt"
|
|
||||||
file_path.write_text("abcdefg")
|
|
||||||
os.utime(file_path, (modified_timestamp, modified_timestamp))
|
|
||||||
|
|
||||||
last_modified_date = common.get_last_modified_date(str(file_path))
|
|
||||||
|
|
||||||
assert last_modified_date == "2024-03-05T17:43:40"
|
|
||||||
|
|
||||||
def but_it_returns_None_when_there_is_no_file_at_that_path(self, tmp_path: pathlib.Path):
|
|
||||||
file_path = tmp_path / "some_file_that_does_not_exist.txt"
|
|
||||||
|
|
||||||
last_modified_date = common.get_last_modified_date(str(file_path))
|
|
||||||
|
|
||||||
assert last_modified_date is None
|
|
||||||
|
|
||||||
|
|
||||||
class Describe_get_last_modified_date_from_file:
|
|
||||||
def it_gets_the_modified_time_of_a_file_like_object_corresponding_to_a_filesystem_file(
|
|
||||||
self, tmp_path: pathlib.Path
|
|
||||||
):
|
|
||||||
modified_timestamp = dt.datetime(
|
|
||||||
year=2024, month=3, day=5, hour=20, minute=48, second=26
|
|
||||||
).timestamp()
|
|
||||||
file_path = tmp_path / "some_file_2.txt"
|
|
||||||
file_path.write_text("abcdefg")
|
|
||||||
os.utime(file_path, (modified_timestamp, modified_timestamp))
|
|
||||||
|
|
||||||
with open(file_path, "rb") as f:
|
|
||||||
last_modified_date = common.get_last_modified_date_from_file(f)
|
|
||||||
|
|
||||||
assert last_modified_date == "2024-03-05T20:48:26"
|
|
||||||
|
|
||||||
def but_it_returns_None_when_the_argument_is_a_bytes_object(self):
|
|
||||||
assert common.get_last_modified_date_from_file(b"abcdefg") is None
|
|
||||||
|
|
||||||
def and_it_returns_None_when_the_file_like_object_has_no_name_attribute(self):
|
|
||||||
file = io.BytesIO(b"abcdefg")
|
|
||||||
assert hasattr(file, "name") is False
|
|
||||||
|
|
||||||
last_modified_date = common.get_last_modified_date_from_file(file)
|
|
||||||
|
|
||||||
assert last_modified_date is None
|
|
||||||
|
|
||||||
def and_it_returns_None_when_the_file_like_object_name_is_not_a_path_to_a_file(
|
|
||||||
self, tmp_path: pathlib.Path
|
|
||||||
):
|
|
||||||
file = io.BytesIO(b"abcdefg")
|
|
||||||
file.name = str(tmp_path / "a_file_that_isn't_here.txt")
|
|
||||||
|
|
||||||
last_modified_date = common.get_last_modified_date_from_file(file)
|
|
||||||
|
|
||||||
assert last_modified_date is None
|
|
||||||
201
test_unstructured/partition/common/test_metadata.py
Normal file
201
test_unstructured/partition/common/test_metadata.py
Normal file
@ -0,0 +1,201 @@
|
|||||||
|
"""Test-suite for `unstructured.partition.common.metadata` module."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import datetime as dt
|
||||||
|
import io
|
||||||
|
import os
|
||||||
|
import pathlib
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from unstructured.documents.elements import (
|
||||||
|
CheckBox,
|
||||||
|
ElementMetadata,
|
||||||
|
FigureCaption,
|
||||||
|
Header,
|
||||||
|
ListItem,
|
||||||
|
NarrativeText,
|
||||||
|
Text,
|
||||||
|
Title,
|
||||||
|
)
|
||||||
|
from unstructured.partition.common.metadata import (
|
||||||
|
get_last_modified,
|
||||||
|
get_last_modified_date,
|
||||||
|
get_last_modified_date_from_file,
|
||||||
|
set_element_hierarchy,
|
||||||
|
)
|
||||||
|
|
||||||
|
# ================================================================================================
|
||||||
|
# LAST-MODIFIED
|
||||||
|
# ================================================================================================
|
||||||
|
|
||||||
|
|
||||||
|
class Describe_get_last_modified:
|
||||||
|
"""Isolated unit-tests for `unstructured.partition.common.metadata.get_last_modified()."""
|
||||||
|
|
||||||
|
def it_pulls_last_modified_from_the_filesystem_when_a_path_is_provided(
|
||||||
|
self, file_and_last_modified: tuple[str, str]
|
||||||
|
):
|
||||||
|
file_path, last_modified = file_and_last_modified
|
||||||
|
last_modified_date = get_last_modified(str(file_path), None, False)
|
||||||
|
assert last_modified_date == last_modified
|
||||||
|
|
||||||
|
def and_it_pulls_last_modified_from_the_file_like_object_when_one_is_provided(
|
||||||
|
self, file_and_last_modified: tuple[str, str]
|
||||||
|
):
|
||||||
|
file_path, last_modified = file_and_last_modified
|
||||||
|
with open(file_path, "rb") as f:
|
||||||
|
last_modified_date = get_last_modified(None, f, True)
|
||||||
|
assert last_modified_date == last_modified
|
||||||
|
|
||||||
|
def but_not_when_date_from_file_object_is_False(self, file_and_last_modified: tuple[str, str]):
|
||||||
|
file_path, _ = file_and_last_modified
|
||||||
|
with open(file_path, "rb") as f:
|
||||||
|
last_modified_date = get_last_modified(None, f, False)
|
||||||
|
assert last_modified_date is None
|
||||||
|
|
||||||
|
# -- fixtures --------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
@pytest.fixture()
|
||||||
|
def file_and_last_modified(self, tmp_path: pathlib.Path) -> tuple[str, str]:
|
||||||
|
modified_timestamp = dt.datetime(
|
||||||
|
year=2024, month=6, day=14, hour=15, minute=39, second=25
|
||||||
|
).timestamp()
|
||||||
|
file_path = tmp_path / "some_file.txt"
|
||||||
|
file_path.write_text("abcdefg")
|
||||||
|
os.utime(file_path, (modified_timestamp, modified_timestamp))
|
||||||
|
return str(file_path), "2024-06-14T15:39:25"
|
||||||
|
|
||||||
|
|
||||||
|
class Describe_get_last_modified_date:
|
||||||
|
def it_gets_the_modified_time_of_a_file_identified_by_a_path(self, tmp_path: pathlib.Path):
|
||||||
|
modified_timestamp = dt.datetime(
|
||||||
|
year=2024, month=3, day=5, hour=17, minute=43, second=40
|
||||||
|
).timestamp()
|
||||||
|
file_path = tmp_path / "some_file.txt"
|
||||||
|
file_path.write_text("abcdefg")
|
||||||
|
os.utime(file_path, (modified_timestamp, modified_timestamp))
|
||||||
|
|
||||||
|
last_modified_date = get_last_modified_date(str(file_path))
|
||||||
|
|
||||||
|
assert last_modified_date == "2024-03-05T17:43:40"
|
||||||
|
|
||||||
|
def but_it_returns_None_when_there_is_no_file_at_that_path(self, tmp_path: pathlib.Path):
|
||||||
|
file_path = tmp_path / "some_file_that_does_not_exist.txt"
|
||||||
|
|
||||||
|
last_modified_date = get_last_modified_date(str(file_path))
|
||||||
|
|
||||||
|
assert last_modified_date is None
|
||||||
|
|
||||||
|
|
||||||
|
class Describe_get_last_modified_date_from_file:
|
||||||
|
def it_gets_the_modified_time_of_a_file_like_object_corresponding_to_a_filesystem_file(
|
||||||
|
self, tmp_path: pathlib.Path
|
||||||
|
):
|
||||||
|
modified_timestamp = dt.datetime(
|
||||||
|
year=2024, month=3, day=5, hour=20, minute=48, second=26
|
||||||
|
).timestamp()
|
||||||
|
file_path = tmp_path / "some_file_2.txt"
|
||||||
|
file_path.write_text("abcdefg")
|
||||||
|
os.utime(file_path, (modified_timestamp, modified_timestamp))
|
||||||
|
|
||||||
|
with open(file_path, "rb") as f:
|
||||||
|
last_modified_date = get_last_modified_date_from_file(f)
|
||||||
|
|
||||||
|
assert last_modified_date == "2024-03-05T20:48:26"
|
||||||
|
|
||||||
|
def but_it_returns_None_when_the_argument_is_a_bytes_object(self):
|
||||||
|
assert get_last_modified_date_from_file(b"abcdefg") is None
|
||||||
|
|
||||||
|
def and_it_returns_None_when_the_file_like_object_has_no_name_attribute(self):
|
||||||
|
file = io.BytesIO(b"abcdefg")
|
||||||
|
assert hasattr(file, "name") is False
|
||||||
|
|
||||||
|
last_modified_date = get_last_modified_date_from_file(file)
|
||||||
|
|
||||||
|
assert last_modified_date is None
|
||||||
|
|
||||||
|
def and_it_returns_None_when_the_file_like_object_name_is_not_a_path_to_a_file(
|
||||||
|
self, tmp_path: pathlib.Path
|
||||||
|
):
|
||||||
|
file = io.BytesIO(b"abcdefg")
|
||||||
|
file.name = str(tmp_path / "a_file_that_isn't_here.txt")
|
||||||
|
|
||||||
|
last_modified_date = get_last_modified_date_from_file(file)
|
||||||
|
|
||||||
|
assert last_modified_date is None
|
||||||
|
|
||||||
|
|
||||||
|
# ================================================================================================
|
||||||
|
# ELEMENT HIERARCHY
|
||||||
|
# ================================================================================================
|
||||||
|
|
||||||
|
|
||||||
|
def test_set_element_hierarchy():
|
||||||
|
elements_to_set = [
|
||||||
|
Title(text="Title"), # 0
|
||||||
|
NarrativeText(text="NarrativeText"), # 1
|
||||||
|
FigureCaption(text="FigureCaption"), # 2
|
||||||
|
ListItem(text="ListItem"), # 3
|
||||||
|
ListItem(text="ListItem", metadata=ElementMetadata(category_depth=1)), # 4
|
||||||
|
ListItem(text="ListItem", metadata=ElementMetadata(category_depth=1)), # 5
|
||||||
|
ListItem(text="ListItem"), # 6
|
||||||
|
CheckBox(element_id="some-id-1", checked=True), # 7
|
||||||
|
Title(text="Title 2"), # 8
|
||||||
|
ListItem(text="ListItem"), # 9
|
||||||
|
ListItem(text="ListItem"), # 10
|
||||||
|
Text(text="Text"), # 11
|
||||||
|
]
|
||||||
|
elements = set_element_hierarchy(elements_to_set)
|
||||||
|
|
||||||
|
assert (
|
||||||
|
elements[1].metadata.parent_id == elements[0].id
|
||||||
|
), "NarrativeText should be child of Title"
|
||||||
|
assert (
|
||||||
|
elements[2].metadata.parent_id == elements[0].id
|
||||||
|
), "FigureCaption should be child of Title"
|
||||||
|
assert elements[3].metadata.parent_id == elements[0].id, "ListItem should be child of Title"
|
||||||
|
assert elements[4].metadata.parent_id == elements[3].id, "ListItem should be child of Title"
|
||||||
|
assert elements[5].metadata.parent_id == elements[3].id, "ListItem should be child of Title"
|
||||||
|
assert elements[6].metadata.parent_id == elements[0].id, "ListItem should be child of Title"
|
||||||
|
# NOTE(Hubert): moving the category field to Element, caused this to fail.
|
||||||
|
# Checkboxes will soon be deprecated, then we can remove the test.
|
||||||
|
# assert (
|
||||||
|
# elements[7].metadata.parent_id is None
|
||||||
|
# ), "CheckBox should be None, as it's not a Text based element"
|
||||||
|
assert elements[8].metadata.parent_id is None, "Title 2 should be child of None"
|
||||||
|
assert elements[9].metadata.parent_id == elements[8].id, "ListItem should be child of Title 2"
|
||||||
|
assert elements[10].metadata.parent_id == elements[8].id, "ListItem should be child of Title 2"
|
||||||
|
assert elements[11].metadata.parent_id == elements[8].id, "Text should be child of Title 2"
|
||||||
|
|
||||||
|
|
||||||
|
def test_set_element_hierarchy_custom_rule_set():
|
||||||
|
elements_to_set = [
|
||||||
|
Header(text="Header"), # 0
|
||||||
|
Title(text="Title"), # 1
|
||||||
|
NarrativeText(text="NarrativeText"), # 2
|
||||||
|
Text(text="Text"), # 3
|
||||||
|
Title(text="Title 2"), # 4
|
||||||
|
FigureCaption(text="FigureCaption"), # 5
|
||||||
|
]
|
||||||
|
|
||||||
|
custom_rule_set = {
|
||||||
|
"Header": ["Title", "Text"],
|
||||||
|
"Title": ["NarrativeText", "UncategorizedText", "FigureCaption"],
|
||||||
|
}
|
||||||
|
|
||||||
|
elements = set_element_hierarchy(
|
||||||
|
elements=elements_to_set,
|
||||||
|
ruleset=custom_rule_set,
|
||||||
|
)
|
||||||
|
|
||||||
|
assert elements[1].metadata.parent_id == elements[0].id, "Title should be child of Header"
|
||||||
|
assert (
|
||||||
|
elements[2].metadata.parent_id == elements[1].id
|
||||||
|
), "NarrativeText should be child of Title"
|
||||||
|
assert elements[3].metadata.parent_id == elements[1].id, "Text should be child of Title"
|
||||||
|
assert elements[4].metadata.parent_id == elements[0].id, "Title 2 should be child of Header"
|
||||||
|
assert (
|
||||||
|
elements[5].metadata.parent_id == elements[4].id
|
||||||
|
), "FigureCaption should be child of Title 2"
|
||||||
@ -1 +1 @@
|
|||||||
__version__ = "0.15.13" # pragma: no cover
|
__version__ = "0.15.14-dev0" # pragma: no cover
|
||||||
|
|||||||
@ -2,7 +2,7 @@ from typing import IO, Optional, Tuple, Union
|
|||||||
|
|
||||||
import chardet
|
import chardet
|
||||||
|
|
||||||
from unstructured.partition.common import convert_to_bytes
|
from unstructured.partition.common.common import convert_to_bytes
|
||||||
|
|
||||||
ENCODE_REC_THRESHOLD = 0.8
|
ENCODE_REC_THRESHOLD = 0.8
|
||||||
|
|
||||||
|
|||||||
@ -4,7 +4,7 @@ import os
|
|||||||
import tempfile
|
import tempfile
|
||||||
from typing import IO
|
from typing import IO
|
||||||
|
|
||||||
from unstructured.partition.common import exactly_one
|
from unstructured.partition.common.common import exactly_one
|
||||||
from unstructured.utils import requires_dependencies
|
from unstructured.utils import requires_dependencies
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -47,12 +47,12 @@ from unstructured.file_utils.encoding import detect_file_encoding, format_encodi
|
|||||||
from unstructured.file_utils.model import FileType
|
from unstructured.file_utils.model import FileType
|
||||||
from unstructured.logger import logger
|
from unstructured.logger import logger
|
||||||
from unstructured.nlp.patterns import EMAIL_HEAD_RE, LIST_OF_DICTS_PATTERN
|
from unstructured.nlp.patterns import EMAIL_HEAD_RE, LIST_OF_DICTS_PATTERN
|
||||||
from unstructured.partition.common import (
|
from unstructured.partition.common.common import (
|
||||||
add_element_metadata,
|
add_element_metadata,
|
||||||
exactly_one,
|
exactly_one,
|
||||||
remove_element_metadata,
|
remove_element_metadata,
|
||||||
set_element_hierarchy,
|
|
||||||
)
|
)
|
||||||
|
from unstructured.partition.common.metadata import set_element_hierarchy
|
||||||
from unstructured.utils import get_call_args_applying_defaults, lazyproperty
|
from unstructured.utils import get_call_args_applying_defaults, lazyproperty
|
||||||
|
|
||||||
LIBMAGIC_AVAILABLE = bool(importlib.util.find_spec("magic"))
|
LIBMAGIC_AVAILABLE = bool(importlib.util.find_spec("magic"))
|
||||||
@ -500,8 +500,8 @@ class _OleFileDifferentiator:
|
|||||||
@staticmethod
|
@staticmethod
|
||||||
def _check_ole_file_type(ctx: _FileTypeDetectionContext) -> FileType | None:
|
def _check_ole_file_type(ctx: _FileTypeDetectionContext) -> FileType | None:
|
||||||
with ctx.open() as f:
|
with ctx.open() as f:
|
||||||
ole = OleFileIO(f)
|
ole = OleFileIO(f) # pyright: ignore[reportUnknownVariableType]
|
||||||
root_storage = Storage.from_ole(ole)
|
root_storage = Storage.from_ole(ole) # pyright: ignore[reportUnknownMemberType]
|
||||||
|
|
||||||
for stream in root_storage.streams:
|
for stream in root_storage.streams:
|
||||||
if stream.name == "WordDocument":
|
if stream.name == "WordDocument":
|
||||||
|
|||||||
@ -9,7 +9,7 @@ from unstructured_client.models import shared
|
|||||||
|
|
||||||
from unstructured.documents.elements import Element
|
from unstructured.documents.elements import Element
|
||||||
from unstructured.logger import logger
|
from unstructured.logger import logger
|
||||||
from unstructured.partition.common import exactly_one
|
from unstructured.partition.common.common import exactly_one
|
||||||
from unstructured.staging.base import elements_from_dicts, elements_from_json
|
from unstructured.staging.base import elements_from_dicts, elements_from_json
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -13,7 +13,7 @@ from unstructured.documents.elements import DataSourceMetadata, Element
|
|||||||
from unstructured.file_utils.filetype import detect_filetype, is_json_processable
|
from unstructured.file_utils.filetype import detect_filetype, is_json_processable
|
||||||
from unstructured.file_utils.model import FileType
|
from unstructured.file_utils.model import FileType
|
||||||
from unstructured.logger import logger
|
from unstructured.logger import logger
|
||||||
from unstructured.partition.common import exactly_one
|
from unstructured.partition.common.common import exactly_one
|
||||||
from unstructured.partition.lang import check_language_args
|
from unstructured.partition.lang import check_language_args
|
||||||
from unstructured.partition.utils.constants import PartitionStrategy
|
from unstructured.partition.utils.constants import PartitionStrategy
|
||||||
from unstructured.utils import dependency_exists
|
from unstructured.utils import dependency_exists
|
||||||
|
|||||||
0
unstructured/partition/common/__init__.py
Normal file
0
unstructured/partition/common/__init__.py
Normal file
@ -1,9 +1,7 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import numbers
|
import numbers
|
||||||
import os
|
|
||||||
import subprocess
|
import subprocess
|
||||||
from datetime import datetime
|
|
||||||
from io import BufferedReader, BytesIO, TextIOWrapper
|
from io import BufferedReader, BytesIO, TextIOWrapper
|
||||||
from tempfile import SpooledTemporaryFile
|
from tempfile import SpooledTemporaryFile
|
||||||
from time import sleep
|
from time import sleep
|
||||||
@ -41,76 +39,6 @@ if TYPE_CHECKING:
|
|||||||
from unstructured_inference.inference.layout import DocumentLayout, PageLayout
|
from unstructured_inference.inference.layout import DocumentLayout, PageLayout
|
||||||
from unstructured_inference.inference.layoutelement import LayoutElement
|
from unstructured_inference.inference.layoutelement import LayoutElement
|
||||||
|
|
||||||
HIERARCHY_RULE_SET = {
|
|
||||||
"Title": [
|
|
||||||
"Text",
|
|
||||||
"UncategorizedText",
|
|
||||||
"NarrativeText",
|
|
||||||
"ListItem",
|
|
||||||
"BulletedText",
|
|
||||||
"Table",
|
|
||||||
"FigureCaption",
|
|
||||||
"CheckBox",
|
|
||||||
"Table",
|
|
||||||
],
|
|
||||||
"Header": [
|
|
||||||
"Title",
|
|
||||||
"Text",
|
|
||||||
"UncategorizedText",
|
|
||||||
"NarrativeText",
|
|
||||||
"ListItem",
|
|
||||||
"BulletedText",
|
|
||||||
"Table",
|
|
||||||
"FigureCaption",
|
|
||||||
"CheckBox",
|
|
||||||
"Table",
|
|
||||||
],
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
def get_last_modified(
|
|
||||||
filename: str | None, file: IO[bytes] | None, date_from_file_object: bool
|
|
||||||
) -> str | None:
|
|
||||||
"""Determine best available last-modified date from file or filename."""
|
|
||||||
if filename is not None:
|
|
||||||
return get_last_modified_date(filename)
|
|
||||||
|
|
||||||
if file is not None:
|
|
||||||
return get_last_modified_date_from_file(file) if date_from_file_object else None
|
|
||||||
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
def get_last_modified_date(filename: str) -> Optional[str]:
|
|
||||||
"""Modification time of file at path `filename`, if it exists.
|
|
||||||
|
|
||||||
Returns `None` when `filename` is not a path to a file on the local filesystem.
|
|
||||||
|
|
||||||
Otherwise returns date and time in ISO 8601 string format (YYYY-MM-DDTHH:MM:SS) like
|
|
||||||
"2024-03-05T17:02:53".
|
|
||||||
"""
|
|
||||||
if not os.path.isfile(filename):
|
|
||||||
return None
|
|
||||||
|
|
||||||
modify_date = datetime.fromtimestamp(os.path.getmtime(filename))
|
|
||||||
return modify_date.strftime("%Y-%m-%dT%H:%M:%S%z")
|
|
||||||
|
|
||||||
|
|
||||||
def get_last_modified_date_from_file(file: IO[bytes] | bytes) -> Optional[str]:
|
|
||||||
"""Modified timestamp of `file` if it corresponds to a file on the local filesystem."""
|
|
||||||
# -- a file-like object will have a name attribute if created by `open()` or if a name is
|
|
||||||
# -- assigned to it for metadata purposes. Use "" as default because the empty string is never
|
|
||||||
# -- a path to an actual file.
|
|
||||||
filename = str(getattr(file, "name", ""))
|
|
||||||
|
|
||||||
# -- there's no guarantee the path corresponds to an actual file on the filesystem. In
|
|
||||||
# -- particular, a user can set the `.name` attribute of an e.g. `io.BytesIO` object to
|
|
||||||
# -- populate the `.metadata.filename` fields for a payload perhaps downloaded via HTTP.
|
|
||||||
if not os.path.isfile(filename):
|
|
||||||
return None
|
|
||||||
|
|
||||||
return get_last_modified_date(filename)
|
|
||||||
|
|
||||||
|
|
||||||
def normalize_layout_element(
|
def normalize_layout_element(
|
||||||
layout_element: LayoutElement | Element | dict[str, Any],
|
layout_element: LayoutElement | Element | dict[str, Any],
|
||||||
@ -230,54 +158,6 @@ def layout_list_to_list_items(
|
|||||||
return list_items
|
return list_items
|
||||||
|
|
||||||
|
|
||||||
def set_element_hierarchy(
|
|
||||||
elements: list[Element], ruleset: dict[str, list[str]] = HIERARCHY_RULE_SET
|
|
||||||
) -> list[Element]:
|
|
||||||
"""Sets the parent_id for each element in the list of elements
|
|
||||||
based on the element's category, depth and a ruleset
|
|
||||||
|
|
||||||
"""
|
|
||||||
stack: list[Element] = []
|
|
||||||
for element in elements:
|
|
||||||
if element.metadata.parent_id is not None:
|
|
||||||
continue
|
|
||||||
parent_id = None
|
|
||||||
element_category = getattr(element, "category", None)
|
|
||||||
element_category_depth = getattr(element.metadata, "category_depth", 0) or 0
|
|
||||||
|
|
||||||
if not element_category:
|
|
||||||
continue
|
|
||||||
|
|
||||||
while stack:
|
|
||||||
top_element: Element = stack[-1]
|
|
||||||
top_element_category = getattr(top_element, "category")
|
|
||||||
top_element_category_depth = (
|
|
||||||
getattr(
|
|
||||||
top_element.metadata,
|
|
||||||
"category_depth",
|
|
||||||
0,
|
|
||||||
)
|
|
||||||
or 0
|
|
||||||
)
|
|
||||||
|
|
||||||
if (
|
|
||||||
top_element_category == element_category
|
|
||||||
and top_element_category_depth < element_category_depth
|
|
||||||
) or (
|
|
||||||
top_element_category != element_category
|
|
||||||
and element_category in ruleset.get(top_element_category, [])
|
|
||||||
):
|
|
||||||
parent_id = top_element.id
|
|
||||||
break
|
|
||||||
|
|
||||||
stack.pop()
|
|
||||||
|
|
||||||
element.metadata.parent_id = parent_id
|
|
||||||
stack.append(element)
|
|
||||||
|
|
||||||
return elements
|
|
||||||
|
|
||||||
|
|
||||||
def add_element_metadata(
|
def add_element_metadata(
|
||||||
element: Element,
|
element: Element,
|
||||||
filename: Optional[str] = None,
|
filename: Optional[str] = None,
|
||||||
@ -580,7 +460,7 @@ def _get_page_image_metadata(page: PageLayout) -> dict[str, Any]:
|
|||||||
# FIXME: document here can be either DocumentLayout or HTMLDocument; HTMLDocument is defined in
|
# FIXME: document here can be either DocumentLayout or HTMLDocument; HTMLDocument is defined in
|
||||||
# unstructured.documents.html, which imports this module so we can't import the class for type
|
# unstructured.documents.html, which imports this module so we can't import the class for type
|
||||||
# hints. Moreover, those two types of documents have different lists of attributes
|
# hints. Moreover, those two types of documents have different lists of attributes
|
||||||
# UPDATE(scanny): HTMLDocument no longer uses this function, so it can be optimized for use by
|
# UPDATE(scanny): HTMLDocument no longer exists, so this function can be optimized for use by
|
||||||
# DocumentLayout only.
|
# DocumentLayout only.
|
||||||
def document_to_element_list(
|
def document_to_element_list(
|
||||||
document: DocumentLayout,
|
document: DocumentLayout,
|
||||||
128
unstructured/partition/common/metadata.py
Normal file
128
unstructured/partition/common/metadata.py
Normal file
@ -0,0 +1,128 @@
|
|||||||
|
"""Helpers used across multiple partitioners to compute metadata."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import datetime as dt
|
||||||
|
import os
|
||||||
|
from typing import IO, Optional, Sequence
|
||||||
|
|
||||||
|
from unstructured.documents.elements import Element
|
||||||
|
|
||||||
|
|
||||||
|
def get_last_modified(
    filename: str | None, file: IO[bytes] | None, date_from_file_object: bool
) -> str | None:
    """Determine best available last-modified date from file or filename.

    `filename` takes precedence: when it is not `None`, the result is whatever
    `get_last_modified_date()` reports for that path and `file` is not consulted.

    When only `file` is provided, its last-modified date is looked up only when
    `date_from_file_object` is True; otherwise `None` is returned.

    Returns `None` when neither `filename` nor `file` is provided.
    """
    if filename is not None:
        return get_last_modified_date(filename)

    # -- a file-like object only contributes a date when the caller has opted in --
    if file is not None and date_from_file_object:
        return get_last_modified_date_from_file(file)

    return None
|
||||||
|
|
||||||
|
|
||||||
|
def get_last_modified_date(filename: str) -> Optional[str]:
    """Modification time of file at path `filename`, if it exists.

    Returns `None` when `filename` is not a path to a file on the local filesystem.

    Otherwise returns date and time in ISO 8601 string format (YYYY-MM-DDTHH:MM:SS) like
    "2024-03-05T17:02:53". The value is a naive timestamp, so it carries no UTC offset.
    """
    if not os.path.isfile(filename):
        return None

    try:
        mtime = os.path.getmtime(filename)
    except OSError:
        # -- the file was removed or became inaccessible between the `isfile()` check and
        # -- this call; honor the "None when not a file" contract rather than raising.
        return None

    # -- `fromtimestamp()` without a `tz` argument produces a naive datetime, for which a
    # -- "%z" directive always renders as the empty string, so it is left out of the format.
    return dt.datetime.fromtimestamp(mtime).strftime("%Y-%m-%dT%H:%M:%S")
|
||||||
|
|
||||||
|
|
||||||
|
def get_last_modified_date_from_file(file: IO[bytes] | bytes) -> Optional[str]:
    """Modified timestamp of `file` if it corresponds to a file on the local filesystem.

    Returns `None` when `file` has no `.name` attribute (e.g. raw `bytes` or an unnamed
    `io.BytesIO`), when that name is an integer file descriptor, or when the name is not a
    path to an existing file. Otherwise returns the result of `get_last_modified_date()`
    for that path.
    """
    # -- a file-like object will have a name attribute if created by `open()` or if a name is
    # -- assigned to it for metadata purposes. Use "" as default because the empty string is never
    # -- a path to an actual file.
    name = getattr(file, "name", "")

    # -- a file opened from a file descriptor reports that integer as its name; it is not a
    # -- path and must not be stringified into one (a file named "5" could match by accident).
    if isinstance(name, int):
        return None

    # -- a file opened with a bytes path reports a bytes name; `str()` on that would produce
    # -- "b'...'", so decode it with the filesystem encoding instead.
    filename = os.fsdecode(name) if isinstance(name, bytes) else str(name)

    # -- there's no guarantee the path corresponds to an actual file on the filesystem. In
    # -- particular, a user can set the `.name` attribute of an e.g. `io.BytesIO` object to
    # -- populate the `.metadata.filename` fields for a payload perhaps downloaded via HTTP.
    if not os.path.isfile(filename):
        return None

    return get_last_modified_date(filename)
|
||||||
|
|
||||||
|
|
||||||
|
# -- Maps a parent element category to the categories of elements that may be nested beneath
# -- it. Used as the default `ruleset` of `set_element_hierarchy()`.
HIERARCHY_RULE_SET: dict[str, list[str]] = {
    "Title": [
        "Text",
        "UncategorizedText",
        "NarrativeText",
        "ListItem",
        "BulletedText",
        "Table",
        "FigureCaption",
        "CheckBox",
    ],
    # -- same children as "Title", plus "Title" itself --
    "Header": [
        "Title",
        "Text",
        "UncategorizedText",
        "NarrativeText",
        "ListItem",
        "BulletedText",
        "Table",
        "FigureCaption",
        "CheckBox",
    ],
}
|
||||||
|
|
||||||
|
|
||||||
|
def set_element_hierarchy(
    elements: Sequence[Element], ruleset: dict[str, list[str]] = HIERARCHY_RULE_SET
) -> list[Element]:
    """Sets the parent_id for each element in the list of elements
    based on the element's category, depth and a ruleset

    Elements are visited in order while a stack of candidate ancestors is maintained. For
    each element, candidates are popped off the stack until one qualifies as its parent:

    - a candidate of the same category qualifies when its `category_depth` is smaller than
      the element's, and
    - a candidate of a different category qualifies when `ruleset` lists the element's
      category under the candidate's category.

    Elements that already have a `parent_id`, or that have no category, are skipped; they
    are neither modified nor considered as a parent for later elements.

    `element.metadata.parent_id` is assigned in place (`None` when no candidate qualifies).
    The return value is a new list containing the same element objects in the same order.
    """
    stack: list[Element] = []

    for element in elements:
        if element.metadata.parent_id is not None:
            continue

        category = getattr(element, "category", None)
        # -- a missing or `None` depth is treated as depth 0 --
        depth = getattr(element.metadata, "category_depth", 0) or 0

        if not category:
            continue

        parent_id = None
        while stack:
            candidate: Element = stack[-1]
            candidate_category = getattr(candidate, "category")
            candidate_depth = getattr(candidate.metadata, "category_depth", 0) or 0

            # -- the two ways of qualifying are mutually exclusive on category equality --
            if candidate_category == category:
                is_parent = candidate_depth < depth
            else:
                is_parent = category in ruleset.get(candidate_category, [])

            if is_parent:
                parent_id = candidate.id
                break

            # -- this candidate cannot be the parent of this or any later element --
            stack.pop()

        element.metadata.parent_id = parent_id
        stack.append(element)

    return list(elements)
|
||||||
@ -16,7 +16,10 @@ from unstructured.documents.elements import (
|
|||||||
)
|
)
|
||||||
from unstructured.file_utils.filetype import add_metadata_with_filetype
|
from unstructured.file_utils.filetype import add_metadata_with_filetype
|
||||||
from unstructured.file_utils.model import FileType
|
from unstructured.file_utils.model import FileType
|
||||||
from unstructured.partition.common import get_last_modified_date, get_last_modified_date_from_file
|
from unstructured.partition.common.metadata import (
|
||||||
|
get_last_modified_date,
|
||||||
|
get_last_modified_date_from_file,
|
||||||
|
)
|
||||||
from unstructured.partition.lang import apply_lang_metadata
|
from unstructured.partition.lang import apply_lang_metadata
|
||||||
from unstructured.utils import is_temp_file_path, lazyproperty
|
from unstructured.utils import is_temp_file_path, lazyproperty
|
||||||
|
|
||||||
|
|||||||
@ -8,11 +8,8 @@ from unstructured.chunking import add_chunking_strategy
|
|||||||
from unstructured.documents.elements import Element, process_metadata
|
from unstructured.documents.elements import Element, process_metadata
|
||||||
from unstructured.file_utils.filetype import add_metadata_with_filetype
|
from unstructured.file_utils.filetype import add_metadata_with_filetype
|
||||||
from unstructured.file_utils.model import FileType
|
from unstructured.file_utils.model import FileType
|
||||||
from unstructured.partition.common import (
|
from unstructured.partition.common.common import convert_office_doc, exactly_one
|
||||||
convert_office_doc,
|
from unstructured.partition.common.metadata import get_last_modified
|
||||||
exactly_one,
|
|
||||||
get_last_modified,
|
|
||||||
)
|
|
||||||
from unstructured.partition.docx import partition_docx
|
from unstructured.partition.docx import partition_docx
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -46,7 +46,7 @@ from unstructured.documents.elements import (
|
|||||||
)
|
)
|
||||||
from unstructured.file_utils.filetype import add_metadata_with_filetype
|
from unstructured.file_utils.filetype import add_metadata_with_filetype
|
||||||
from unstructured.file_utils.model import FileType
|
from unstructured.file_utils.model import FileType
|
||||||
from unstructured.partition.common import (
|
from unstructured.partition.common.metadata import (
|
||||||
get_last_modified_date,
|
get_last_modified_date,
|
||||||
get_last_modified_date_from_file,
|
get_last_modified_date_from_file,
|
||||||
)
|
)
|
||||||
|
|||||||
@ -46,9 +46,8 @@ from unstructured.file_utils.filetype import add_metadata_with_filetype
|
|||||||
from unstructured.file_utils.model import FileType
|
from unstructured.file_utils.model import FileType
|
||||||
from unstructured.logger import logger
|
from unstructured.logger import logger
|
||||||
from unstructured.nlp.patterns import EMAIL_DATETIMETZ_PATTERN_RE
|
from unstructured.nlp.patterns import EMAIL_DATETIMETZ_PATTERN_RE
|
||||||
from unstructured.partition.common import (
|
from unstructured.partition.common.common import convert_to_bytes, exactly_one
|
||||||
convert_to_bytes,
|
from unstructured.partition.common.metadata import (
|
||||||
exactly_one,
|
|
||||||
get_last_modified_date,
|
get_last_modified_date,
|
||||||
get_last_modified_date_from_file,
|
get_last_modified_date_from_file,
|
||||||
)
|
)
|
||||||
|
|||||||
@ -7,7 +7,8 @@ from unstructured.documents.elements import Element, process_metadata
|
|||||||
from unstructured.file_utils.file_conversion import convert_file_to_html_text_using_pandoc
|
from unstructured.file_utils.file_conversion import convert_file_to_html_text_using_pandoc
|
||||||
from unstructured.file_utils.filetype import add_metadata_with_filetype
|
from unstructured.file_utils.filetype import add_metadata_with_filetype
|
||||||
from unstructured.file_utils.model import FileType
|
from unstructured.file_utils.model import FileType
|
||||||
from unstructured.partition.common import exactly_one, get_last_modified
|
from unstructured.partition.common.common import exactly_one
|
||||||
|
from unstructured.partition.common.metadata import get_last_modified
|
||||||
from unstructured.partition.html import partition_html
|
from unstructured.partition.html import partition_html
|
||||||
|
|
||||||
DETECTION_ORIGIN: str = "epub"
|
DETECTION_ORIGIN: str = "epub"
|
||||||
|
|||||||
@ -14,7 +14,10 @@ from unstructured.documents.elements import Element, process_metadata
|
|||||||
from unstructured.file_utils.encoding import read_txt_file
|
from unstructured.file_utils.encoding import read_txt_file
|
||||||
from unstructured.file_utils.filetype import add_metadata_with_filetype
|
from unstructured.file_utils.filetype import add_metadata_with_filetype
|
||||||
from unstructured.file_utils.model import FileType
|
from unstructured.file_utils.model import FileType
|
||||||
from unstructured.partition.common import get_last_modified_date, get_last_modified_date_from_file
|
from unstructured.partition.common.metadata import (
|
||||||
|
get_last_modified_date,
|
||||||
|
get_last_modified_date_from_file,
|
||||||
|
)
|
||||||
from unstructured.partition.html.parser import Flow, html_parser
|
from unstructured.partition.html.parser import Flow, html_parser
|
||||||
from unstructured.partition.lang import apply_lang_metadata
|
from unstructured.partition.lang import apply_lang_metadata
|
||||||
from unstructured.utils import is_temp_file_path, lazyproperty
|
from unstructured.utils import is_temp_file_path, lazyproperty
|
||||||
|
|||||||
@ -5,7 +5,7 @@ from typing import IO, Any, Optional
|
|||||||
from unstructured.chunking import add_chunking_strategy
|
from unstructured.chunking import add_chunking_strategy
|
||||||
from unstructured.documents.elements import Element, process_metadata
|
from unstructured.documents.elements import Element, process_metadata
|
||||||
from unstructured.file_utils.filetype import add_metadata
|
from unstructured.file_utils.filetype import add_metadata
|
||||||
from unstructured.partition.common import exactly_one
|
from unstructured.partition.common.common import exactly_one
|
||||||
from unstructured.partition.lang import check_language_args
|
from unstructured.partition.lang import check_language_args
|
||||||
from unstructured.partition.pdf import partition_pdf_or_image
|
from unstructured.partition.pdf import partition_pdf_or_image
|
||||||
from unstructured.partition.utils.constants import PartitionStrategy
|
from unstructured.partition.utils.constants import PartitionStrategy
|
||||||
|
|||||||
@ -19,8 +19,8 @@ from unstructured.file_utils.filetype import (
|
|||||||
add_metadata_with_filetype,
|
add_metadata_with_filetype,
|
||||||
is_json_processable,
|
is_json_processable,
|
||||||
)
|
)
|
||||||
from unstructured.partition.common import (
|
from unstructured.partition.common.common import exactly_one
|
||||||
exactly_one,
|
from unstructured.partition.common.metadata import (
|
||||||
get_last_modified_date,
|
get_last_modified_date,
|
||||||
get_last_modified_date_from_file,
|
get_last_modified_date_from_file,
|
||||||
)
|
)
|
||||||
|
|||||||
@ -9,8 +9,8 @@ from unstructured.chunking import add_chunking_strategy
|
|||||||
from unstructured.documents.elements import Element, process_metadata
|
from unstructured.documents.elements import Element, process_metadata
|
||||||
from unstructured.file_utils.filetype import add_metadata_with_filetype
|
from unstructured.file_utils.filetype import add_metadata_with_filetype
|
||||||
from unstructured.file_utils.model import FileType
|
from unstructured.file_utils.model import FileType
|
||||||
from unstructured.partition.common import (
|
from unstructured.partition.common.common import exactly_one
|
||||||
exactly_one,
|
from unstructured.partition.common.metadata import (
|
||||||
get_last_modified_date,
|
get_last_modified_date,
|
||||||
get_last_modified_date_from_file,
|
get_last_modified_date_from_file,
|
||||||
)
|
)
|
||||||
|
|||||||
@ -14,7 +14,7 @@ from unstructured.documents.elements import Element, ElementMetadata, process_me
|
|||||||
from unstructured.file_utils.filetype import add_metadata_with_filetype
|
from unstructured.file_utils.filetype import add_metadata_with_filetype
|
||||||
from unstructured.file_utils.model import FileType
|
from unstructured.file_utils.model import FileType
|
||||||
from unstructured.logger import logger
|
from unstructured.logger import logger
|
||||||
from unstructured.partition.common import (
|
from unstructured.partition.common.metadata import (
|
||||||
get_last_modified_date,
|
get_last_modified_date,
|
||||||
get_last_modified_date_from_file,
|
get_last_modified_date_from_file,
|
||||||
)
|
)
|
||||||
|
|||||||
@ -8,7 +8,8 @@ from unstructured.chunking import add_chunking_strategy
|
|||||||
from unstructured.documents.elements import Element, process_metadata
|
from unstructured.documents.elements import Element, process_metadata
|
||||||
from unstructured.file_utils.filetype import add_metadata_with_filetype
|
from unstructured.file_utils.filetype import add_metadata_with_filetype
|
||||||
from unstructured.file_utils.model import FileType
|
from unstructured.file_utils.model import FileType
|
||||||
from unstructured.partition.common import exactly_one, get_last_modified
|
from unstructured.partition.common.common import exactly_one
|
||||||
|
from unstructured.partition.common.metadata import get_last_modified
|
||||||
from unstructured.partition.docx import partition_docx
|
from unstructured.partition.docx import partition_docx
|
||||||
from unstructured.utils import requires_dependencies
|
from unstructured.utils import requires_dependencies
|
||||||
|
|
||||||
|
|||||||
@ -7,7 +7,8 @@ from unstructured.documents.elements import Element
|
|||||||
from unstructured.file_utils.file_conversion import convert_file_to_html_text_using_pandoc
|
from unstructured.file_utils.file_conversion import convert_file_to_html_text_using_pandoc
|
||||||
from unstructured.file_utils.filetype import add_metadata_with_filetype
|
from unstructured.file_utils.filetype import add_metadata_with_filetype
|
||||||
from unstructured.file_utils.model import FileType
|
from unstructured.file_utils.model import FileType
|
||||||
from unstructured.partition.common import exactly_one, get_last_modified
|
from unstructured.partition.common.common import exactly_one
|
||||||
|
from unstructured.partition.common.metadata import get_last_modified
|
||||||
from unstructured.partition.html import partition_html
|
from unstructured.partition.html import partition_html
|
||||||
|
|
||||||
DETECTION_ORIGIN: str = "org"
|
DETECTION_ORIGIN: str = "org"
|
||||||
|
|||||||
@ -42,7 +42,7 @@ from unstructured.file_utils.filetype import add_metadata_with_filetype
|
|||||||
from unstructured.file_utils.model import FileType
|
from unstructured.file_utils.model import FileType
|
||||||
from unstructured.logger import logger, trace_logger
|
from unstructured.logger import logger, trace_logger
|
||||||
from unstructured.nlp.patterns import PARAGRAPH_PATTERN
|
from unstructured.nlp.patterns import PARAGRAPH_PATTERN
|
||||||
from unstructured.partition.common import (
|
from unstructured.partition.common.common import (
|
||||||
document_to_element_list,
|
document_to_element_list,
|
||||||
exactly_one,
|
exactly_one,
|
||||||
ocr_data_to_elements,
|
ocr_data_to_elements,
|
||||||
|
|||||||
@ -17,16 +17,16 @@ from PIL import Image
|
|||||||
|
|
||||||
from unstructured.documents.elements import ElementType
|
from unstructured.documents.elements import ElementType
|
||||||
from unstructured.logger import logger
|
from unstructured.logger import logger
|
||||||
from unstructured.partition.common import (
|
from unstructured.partition.common.common import convert_to_bytes, exactly_one
|
||||||
convert_to_bytes,
|
from unstructured.partition.common.metadata import (
|
||||||
exactly_one,
|
|
||||||
get_last_modified_date,
|
get_last_modified_date,
|
||||||
get_last_modified_date_from_file,
|
get_last_modified_date_from_file,
|
||||||
)
|
)
|
||||||
from unstructured.partition.utils.config import env_config
|
from unstructured.partition.utils.config import env_config
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from unstructured_inference.inference.layout import DocumentLayout, PageLayout, TextRegion
|
from unstructured_inference.inference.elements import TextRegion
|
||||||
|
from unstructured_inference.inference.layout import DocumentLayout, PageLayout
|
||||||
from unstructured_inference.inference.layoutelement import LayoutElement
|
from unstructured_inference.inference.layoutelement import LayoutElement
|
||||||
|
|
||||||
from unstructured.documents.elements import Element
|
from unstructured.documents.elements import Element
|
||||||
|
|||||||
@ -8,9 +8,8 @@ from unstructured.chunking import add_chunking_strategy
|
|||||||
from unstructured.documents.elements import Element, process_metadata
|
from unstructured.documents.elements import Element, process_metadata
|
||||||
from unstructured.file_utils.filetype import add_metadata_with_filetype
|
from unstructured.file_utils.filetype import add_metadata_with_filetype
|
||||||
from unstructured.file_utils.model import FileType
|
from unstructured.file_utils.model import FileType
|
||||||
from unstructured.partition.common import (
|
from unstructured.partition.common.common import convert_office_doc, exactly_one
|
||||||
convert_office_doc,
|
from unstructured.partition.common.metadata import (
|
||||||
exactly_one,
|
|
||||||
get_last_modified_date,
|
get_last_modified_date,
|
||||||
get_last_modified_date_from_file,
|
get_last_modified_date_from_file,
|
||||||
)
|
)
|
||||||
|
|||||||
@ -36,8 +36,8 @@ from unstructured.documents.elements import (
|
|||||||
)
|
)
|
||||||
from unstructured.file_utils.filetype import add_metadata_with_filetype
|
from unstructured.file_utils.filetype import add_metadata_with_filetype
|
||||||
from unstructured.file_utils.model import FileType
|
from unstructured.file_utils.model import FileType
|
||||||
from unstructured.partition.common import (
|
from unstructured.partition.common.common import convert_ms_office_table_to_text
|
||||||
convert_ms_office_table_to_text,
|
from unstructured.partition.common.metadata import (
|
||||||
get_last_modified_date,
|
get_last_modified_date,
|
||||||
get_last_modified_date_from_file,
|
get_last_modified_date_from_file,
|
||||||
)
|
)
|
||||||
|
|||||||
@ -7,7 +7,8 @@ from unstructured.documents.elements import Element, process_metadata
|
|||||||
from unstructured.file_utils.file_conversion import convert_file_to_html_text_using_pandoc
|
from unstructured.file_utils.file_conversion import convert_file_to_html_text_using_pandoc
|
||||||
from unstructured.file_utils.filetype import add_metadata_with_filetype
|
from unstructured.file_utils.filetype import add_metadata_with_filetype
|
||||||
from unstructured.file_utils.model import FileType
|
from unstructured.file_utils.model import FileType
|
||||||
from unstructured.partition.common import exactly_one, get_last_modified
|
from unstructured.partition.common.common import exactly_one
|
||||||
|
from unstructured.partition.common.metadata import get_last_modified
|
||||||
from unstructured.partition.html import partition_html
|
from unstructured.partition.html import partition_html
|
||||||
|
|
||||||
DETECTION_ORIGIN: str = "rst"
|
DETECTION_ORIGIN: str = "rst"
|
||||||
|
|||||||
@ -7,7 +7,8 @@ from unstructured.documents.elements import Element, process_metadata
|
|||||||
from unstructured.file_utils.file_conversion import convert_file_to_html_text_using_pandoc
|
from unstructured.file_utils.file_conversion import convert_file_to_html_text_using_pandoc
|
||||||
from unstructured.file_utils.filetype import add_metadata_with_filetype
|
from unstructured.file_utils.filetype import add_metadata_with_filetype
|
||||||
from unstructured.file_utils.model import FileType
|
from unstructured.file_utils.model import FileType
|
||||||
from unstructured.partition.common import exactly_one, get_last_modified
|
from unstructured.partition.common.common import exactly_one
|
||||||
|
from unstructured.partition.common.metadata import get_last_modified
|
||||||
from unstructured.partition.html import partition_html
|
from unstructured.partition.html import partition_html
|
||||||
|
|
||||||
DETECTION_ORIGIN: str = "rtf"
|
DETECTION_ORIGIN: str = "rtf"
|
||||||
|
|||||||
@ -29,8 +29,8 @@ from unstructured.file_utils.filetype import add_metadata_with_filetype
|
|||||||
from unstructured.file_utils.model import FileType
|
from unstructured.file_utils.model import FileType
|
||||||
from unstructured.nlp.patterns import PARAGRAPH_PATTERN, UNICODE_BULLETS_RE
|
from unstructured.nlp.patterns import PARAGRAPH_PATTERN, UNICODE_BULLETS_RE
|
||||||
from unstructured.nlp.tokenize import sent_tokenize
|
from unstructured.nlp.tokenize import sent_tokenize
|
||||||
from unstructured.partition.common import (
|
from unstructured.partition.common.common import exactly_one
|
||||||
exactly_one,
|
from unstructured.partition.common.metadata import (
|
||||||
get_last_modified_date,
|
get_last_modified_date,
|
||||||
get_last_modified_date_from_file,
|
get_last_modified_date_from_file,
|
||||||
)
|
)
|
||||||
|
|||||||
@ -14,11 +14,13 @@ from unstructured.documents.elements import (
|
|||||||
)
|
)
|
||||||
from unstructured.file_utils.filetype import add_metadata_with_filetype
|
from unstructured.file_utils.filetype import add_metadata_with_filetype
|
||||||
from unstructured.file_utils.model import FileType
|
from unstructured.file_utils.model import FileType
|
||||||
from unstructured.partition.common import (
|
from unstructured.partition.common.common import (
|
||||||
exactly_one,
|
exactly_one,
|
||||||
|
spooled_to_bytes_io_if_needed,
|
||||||
|
)
|
||||||
|
from unstructured.partition.common.metadata import (
|
||||||
get_last_modified_date,
|
get_last_modified_date,
|
||||||
get_last_modified_date_from_file,
|
get_last_modified_date_from_file,
|
||||||
spooled_to_bytes_io_if_needed,
|
|
||||||
)
|
)
|
||||||
from unstructured.partition.lang import apply_lang_metadata
|
from unstructured.partition.lang import apply_lang_metadata
|
||||||
|
|
||||||
|
|||||||
@ -26,7 +26,10 @@ from unstructured.documents.elements import (
|
|||||||
)
|
)
|
||||||
from unstructured.file_utils.filetype import add_metadata_with_filetype
|
from unstructured.file_utils.filetype import add_metadata_with_filetype
|
||||||
from unstructured.file_utils.model import FileType
|
from unstructured.file_utils.model import FileType
|
||||||
from unstructured.partition.common import get_last_modified_date, get_last_modified_date_from_file
|
from unstructured.partition.common.metadata import (
|
||||||
|
get_last_modified_date,
|
||||||
|
get_last_modified_date_from_file,
|
||||||
|
)
|
||||||
from unstructured.partition.lang import apply_lang_metadata
|
from unstructured.partition.lang import apply_lang_metadata
|
||||||
from unstructured.partition.text_type import (
|
from unstructured.partition.text_type import (
|
||||||
is_bulleted_text,
|
is_bulleted_text,
|
||||||
|
|||||||
@ -16,11 +16,13 @@ from unstructured.documents.elements import (
|
|||||||
from unstructured.file_utils.encoding import read_txt_file
|
from unstructured.file_utils.encoding import read_txt_file
|
||||||
from unstructured.file_utils.filetype import add_metadata_with_filetype
|
from unstructured.file_utils.filetype import add_metadata_with_filetype
|
||||||
from unstructured.file_utils.model import FileType
|
from unstructured.file_utils.model import FileType
|
||||||
from unstructured.partition.common import (
|
from unstructured.partition.common.common import (
|
||||||
exactly_one,
|
exactly_one,
|
||||||
|
spooled_to_bytes_io_if_needed,
|
||||||
|
)
|
||||||
|
from unstructured.partition.common.metadata import (
|
||||||
get_last_modified_date,
|
get_last_modified_date,
|
||||||
get_last_modified_date_from_file,
|
get_last_modified_date_from_file,
|
||||||
spooled_to_bytes_io_if_needed,
|
|
||||||
)
|
)
|
||||||
from unstructured.partition.lang import apply_lang_metadata
|
from unstructured.partition.lang import apply_lang_metadata
|
||||||
from unstructured.partition.text import element_from_text
|
from unstructured.partition.text import element_from_text
|
||||||
|
|||||||
@ -16,7 +16,7 @@ from unstructured.documents.elements import (
|
|||||||
Element,
|
Element,
|
||||||
ElementMetadata,
|
ElementMetadata,
|
||||||
)
|
)
|
||||||
from unstructured.partition.common import exactly_one
|
from unstructured.partition.common.common import exactly_one
|
||||||
from unstructured.utils import Point, dependency_exists, requires_dependencies
|
from unstructured.utils import Point, dependency_exists, requires_dependencies
|
||||||
|
|
||||||
if dependency_exists("pandas"):
|
if dependency_exists("pandas"):
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user