rfctr(part): extract partition.common submodules (#3649)

**Summary**
In preparation for consolidating post-partitioning metadata decorators,
extract `partition.common` module into a sub-package (directory) and
extract `partition.common.metadata` module to house metadata-specific
objects shared by partitioners.

**Additional Context**
- This new module will be the home of the new consolidated metadata
decorator.
- The consolidated decorator is a step toward removing post-processing
decorators from _delegating_ partitioners. A delegating partitioner is
one that converts its file to a different format and "delegates" actual
partitioning to the partitioner for that target format. 10 of the 20
partitioners are delegating partitioners.
- Removing decorators from delegating partitioners will allow us to
avoid "double-decorating", i.e. running those decorators twice, once on
the principal partitioner and again on the proxy partitioner.
- This will allow us to send `**kwargs` to either partitioner, removing
the knowledge of which arguments to send for each file-type from
auto-partition.
- And this will allow pluggable auto-partitioners which all have a
`partition_x(filename, *, file, **kwargs) -> list[Element]` interface.
This commit is contained in:
Steve Canny 2024-09-20 13:35:28 -07:00 committed by GitHub
parent 7d66a236f1
commit 03c2bf8f1f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
37 changed files with 406 additions and 350 deletions

View File

@ -1,5 +1,17 @@
## 0.15.14-dev0
### Enhancements
### Features
### Fixes
## 0.15.13
### BREAKING CHANGES
* **Remove dead experimental code.** Unused code in `file_utils.experimental` and `file_utils.metadata` was removed. These functions were never published in the documentation, but if a client dug these out and used them this removal could break client code.
### Enhancements
* **Improve `pdfminer` image cleanup process**. Optimized the removal of duplicated pdfminer images by performing the cleanup before merging elements, rather than after. This improvement reduces execution time and enhances overall processing speed of PDF documents.

View File

@ -3,13 +3,4 @@
set -e
# $1 is the path for chroma to write the contents to. The symbol "&" runs process in background
echo "Current venv is:"
echo "$VIRTUAL_ENV"
echo "Current path is:"
echo "$PATH"
ls -l "$VIRTUAL_ENV/bin/chroma"
echo "================"
cat "$VIRTUAL_ENV/bin/chroma"
echo "================"
# chroma run --path "$1" &
python "$VIRTUAL_ENV/bin/chroma" run --path "$1" &

View File

@ -1,6 +1,3 @@
import datetime as dt
import io
import os
import pathlib
from dataclasses import dataclass
from multiprocessing import Pool
@ -20,7 +17,6 @@ from unstructured.documents.elements import (
TYPE_TO_TEXT_ELEMENT_MAP,
CheckBox,
CoordinatesMetadata,
ElementMetadata,
ElementType,
FigureCaption,
Header,
@ -32,7 +28,7 @@ from unstructured.documents.elements import (
from unstructured.documents.elements import (
Image as ImageElement,
)
from unstructured.partition import common
from unstructured.partition.common import common
from unstructured.partition.utils.constants import SORT_MODE_BASIC, SORT_MODE_DONT, SORT_MODE_XY_CUT
@ -347,7 +343,7 @@ class MockRunOutput:
def test_convert_office_doc_captures_errors(monkeypatch, caplog):
from unstructured.partition.common import subprocess
from unstructured.partition.common.common import subprocess
def mock_run(*args, **kwargs):
return MockRunOutput(1, "an error occurred".encode(), "error details".encode())
@ -429,75 +425,6 @@ def test_get_page_image_metadata_and_coordinate_system():
assert isinstance(metadata, dict)
def test_set_element_hierarchy():
elements_to_set = [
Title(text="Title"), # 0
NarrativeText(text="NarrativeText"), # 1
FigureCaption(text="FigureCaption"), # 2
ListItem(text="ListItem"), # 3
ListItem(text="ListItem", metadata=ElementMetadata(category_depth=1)), # 4
ListItem(text="ListItem", metadata=ElementMetadata(category_depth=1)), # 5
ListItem(text="ListItem"), # 6
CheckBox(element_id="some-id-1", checked=True), # 7
Title(text="Title 2"), # 8
ListItem(text="ListItem"), # 9
ListItem(text="ListItem"), # 10
Text(text="Text"), # 11
]
elements = common.set_element_hierarchy(elements_to_set)
assert (
elements[1].metadata.parent_id == elements[0].id
), "NarrativeText should be child of Title"
assert (
elements[2].metadata.parent_id == elements[0].id
), "FigureCaption should be child of Title"
assert elements[3].metadata.parent_id == elements[0].id, "ListItem should be child of Title"
assert elements[4].metadata.parent_id == elements[3].id, "ListItem should be child of Title"
assert elements[5].metadata.parent_id == elements[3].id, "ListItem should be child of Title"
assert elements[6].metadata.parent_id == elements[0].id, "ListItem should be child of Title"
# NOTE(Hubert): moving the category field to Element, caused this to fail.
# Checkboxes will soon be deprecated, then we can remove the test.
# assert (
# elements[7].metadata.parent_id is None
# ), "CheckBox should be None, as it's not a Text based element"
assert elements[8].metadata.parent_id is None, "Title 2 should be child of None"
assert elements[9].metadata.parent_id == elements[8].id, "ListItem should be child of Title 2"
assert elements[10].metadata.parent_id == elements[8].id, "ListItem should be child of Title 2"
assert elements[11].metadata.parent_id == elements[8].id, "Text should be child of Title 2"
def test_set_element_hierarchy_custom_rule_set():
elements_to_set = [
Header(text="Header"), # 0
Title(text="Title"), # 1
NarrativeText(text="NarrativeText"), # 2
Text(text="Text"), # 3
Title(text="Title 2"), # 4
FigureCaption(text="FigureCaption"), # 5
]
custom_rule_set = {
"Header": ["Title", "Text"],
"Title": ["NarrativeText", "UncategorizedText", "FigureCaption"],
}
elements = common.set_element_hierarchy(
elements=elements_to_set,
ruleset=custom_rule_set,
)
assert elements[1].metadata.parent_id == elements[0].id, "Title should be child of Header"
assert (
elements[2].metadata.parent_id == elements[1].id
), "NarrativeText should be child of Title"
assert elements[3].metadata.parent_id == elements[1].id, "Text should be child of Title"
assert elements[4].metadata.parent_id == elements[0].id, "Title 2 should be child of Header"
assert (
elements[5].metadata.parent_id == elements[4].id
), "FigureCaption should be child of Title 2"
@dataclass
class MockImage:
width = 640
@ -607,99 +534,3 @@ def test_ocr_data_to_elements(
points=layout_el.bbox.coordinates,
system=coordinate_system,
)
class Describe_get_last_modified:
"""Isolated unit-tests for `unstructured.partition.common.get_last_modified()."""
def it_pulls_last_modified_from_the_filesystem_when_a_path_is_provided(
self, file_and_last_modified: tuple[str, str]
):
file_path, last_modified = file_and_last_modified
last_modified_date = common.get_last_modified(str(file_path), None, False)
assert last_modified_date == last_modified
def and_it_pulls_last_modified_from_the_file_like_object_when_one_is_provided(
self, file_and_last_modified: tuple[str, str]
):
file_path, last_modified = file_and_last_modified
with open(file_path, "rb") as f:
last_modified_date = common.get_last_modified(None, f, True)
assert last_modified_date == last_modified
def but_not_when_date_from_file_object_is_False(self, file_and_last_modified: tuple[str, str]):
file_path, _ = file_and_last_modified
with open(file_path, "rb") as f:
last_modified_date = common.get_last_modified(None, f, False)
assert last_modified_date is None
# -- fixtures --------------------------------------------------------------------------------
@pytest.fixture()
def file_and_last_modified(self, tmp_path: pathlib.Path) -> tuple[str, str]:
modified_timestamp = dt.datetime(
year=2024, month=6, day=14, hour=15, minute=39, second=25
).timestamp()
file_path = tmp_path / "some_file.txt"
file_path.write_text("abcdefg")
os.utime(file_path, (modified_timestamp, modified_timestamp))
return str(file_path), "2024-06-14T15:39:25"
class Describe_get_last_modified_date:
def it_gets_the_modified_time_of_a_file_identified_by_a_path(self, tmp_path: pathlib.Path):
modified_timestamp = dt.datetime(
year=2024, month=3, day=5, hour=17, minute=43, second=40
).timestamp()
file_path = tmp_path / "some_file.txt"
file_path.write_text("abcdefg")
os.utime(file_path, (modified_timestamp, modified_timestamp))
last_modified_date = common.get_last_modified_date(str(file_path))
assert last_modified_date == "2024-03-05T17:43:40"
def but_it_returns_None_when_there_is_no_file_at_that_path(self, tmp_path: pathlib.Path):
file_path = tmp_path / "some_file_that_does_not_exist.txt"
last_modified_date = common.get_last_modified_date(str(file_path))
assert last_modified_date is None
class Describe_get_last_modified_date_from_file:
def it_gets_the_modified_time_of_a_file_like_object_corresponding_to_a_filesystem_file(
self, tmp_path: pathlib.Path
):
modified_timestamp = dt.datetime(
year=2024, month=3, day=5, hour=20, minute=48, second=26
).timestamp()
file_path = tmp_path / "some_file_2.txt"
file_path.write_text("abcdefg")
os.utime(file_path, (modified_timestamp, modified_timestamp))
with open(file_path, "rb") as f:
last_modified_date = common.get_last_modified_date_from_file(f)
assert last_modified_date == "2024-03-05T20:48:26"
def but_it_returns_None_when_the_argument_is_a_bytes_object(self):
assert common.get_last_modified_date_from_file(b"abcdefg") is None
def and_it_returns_None_when_the_file_like_object_has_no_name_attribute(self):
file = io.BytesIO(b"abcdefg")
assert hasattr(file, "name") is False
last_modified_date = common.get_last_modified_date_from_file(file)
assert last_modified_date is None
def and_it_returns_None_when_the_file_like_object_name_is_not_a_path_to_a_file(
self, tmp_path: pathlib.Path
):
file = io.BytesIO(b"abcdefg")
file.name = str(tmp_path / "a_file_that_isn't_here.txt")
last_modified_date = common.get_last_modified_date_from_file(file)
assert last_modified_date is None

View File

@ -0,0 +1,201 @@
"""Test-suite for `unstructured.partition.common.metadata` module."""
from __future__ import annotations
import datetime as dt
import io
import os
import pathlib
import pytest
from unstructured.documents.elements import (
CheckBox,
ElementMetadata,
FigureCaption,
Header,
ListItem,
NarrativeText,
Text,
Title,
)
from unstructured.partition.common.metadata import (
get_last_modified,
get_last_modified_date,
get_last_modified_date_from_file,
set_element_hierarchy,
)
# ================================================================================================
# LAST-MODIFIED
# ================================================================================================
class Describe_get_last_modified:
    """Isolated unit-tests for `unstructured.partition.common.metadata.get_last_modified()`."""

    def it_pulls_last_modified_from_the_filesystem_when_a_path_is_provided(
        self, file_and_last_modified: tuple[str, str]
    ):
        path, expected = file_and_last_modified

        assert get_last_modified(str(path), None, False) == expected

    def and_it_pulls_last_modified_from_the_file_like_object_when_one_is_provided(
        self, file_and_last_modified: tuple[str, str]
    ):
        path, expected = file_and_last_modified

        with open(path, "rb") as stream:
            actual = get_last_modified(None, stream, True)

        assert actual == expected

    def but_not_when_date_from_file_object_is_False(self, file_and_last_modified: tuple[str, str]):
        path, _ = file_and_last_modified

        with open(path, "rb") as stream:
            actual = get_last_modified(None, stream, False)

        assert actual is None

    # -- fixtures --------------------------------------------------------------------------------

    @pytest.fixture()
    def file_and_last_modified(self, tmp_path: pathlib.Path) -> tuple[str, str]:
        """Path of a temp file with a pinned mtime, paired with that mtime as a string."""
        path = tmp_path / "some_file.txt"
        path.write_text("abcdefg")
        # -- pin both atime and mtime so the expected string below is deterministic --
        mtime = dt.datetime(year=2024, month=6, day=14, hour=15, minute=39, second=25).timestamp()
        os.utime(path, (mtime, mtime))
        return str(path), "2024-06-14T15:39:25"
class Describe_get_last_modified_date:
    """Unit-tests for `get_last_modified_date()`, which takes a filesystem path."""

    def it_gets_the_modified_time_of_a_file_identified_by_a_path(self, tmp_path: pathlib.Path):
        path = tmp_path / "some_file.txt"
        path.write_text("abcdefg")
        # -- pin the file's timestamps so the formatted result is known in advance --
        mtime = dt.datetime(year=2024, month=3, day=5, hour=17, minute=43, second=40).timestamp()
        os.utime(path, (mtime, mtime))

        assert get_last_modified_date(str(path)) == "2024-03-05T17:43:40"

    def but_it_returns_None_when_there_is_no_file_at_that_path(self, tmp_path: pathlib.Path):
        missing_path = tmp_path / "some_file_that_does_not_exist.txt"

        assert get_last_modified_date(str(missing_path)) is None
class Describe_get_last_modified_date_from_file:
    """Unit-tests for `get_last_modified_date_from_file()`, which takes a file-like object."""

    def it_gets_the_modified_time_of_a_file_like_object_corresponding_to_a_filesystem_file(
        self, tmp_path: pathlib.Path
    ):
        path = tmp_path / "some_file_2.txt"
        path.write_text("abcdefg")
        # -- pin the file's timestamps so the formatted result is known in advance --
        mtime = dt.datetime(year=2024, month=3, day=5, hour=20, minute=48, second=26).timestamp()
        os.utime(path, (mtime, mtime))

        with open(path, "rb") as stream:
            actual = get_last_modified_date_from_file(stream)

        assert actual == "2024-03-05T20:48:26"

    def but_it_returns_None_when_the_argument_is_a_bytes_object(self):
        assert get_last_modified_date_from_file(b"abcdefg") is None

    def and_it_returns_None_when_the_file_like_object_has_no_name_attribute(self):
        stream = io.BytesIO(b"abcdefg")
        # -- precondition: an in-memory stream carries no `.name` unless one is assigned --
        assert not hasattr(stream, "name")

        assert get_last_modified_date_from_file(stream) is None

    def and_it_returns_None_when_the_file_like_object_name_is_not_a_path_to_a_file(
        self, tmp_path: pathlib.Path
    ):
        stream = io.BytesIO(b"abcdefg")
        # -- a name assigned for metadata purposes need not point at a real file --
        stream.name = str(tmp_path / "a_file_that_isn't_here.txt")

        assert get_last_modified_date_from_file(stream) is None
# ================================================================================================
# ELEMENT HIERARCHY
# ================================================================================================
def test_set_element_hierarchy():
    """`set_element_hierarchy()` assigns `parent_id` using the default ruleset.

    Covers both parenting paths: a different-category parent permitted by the ruleset (e.g.
    `Title` -> `ListItem`) and a same-category parent selected by a smaller `category_depth`
    (depth-0 `ListItem` -> depth-1 `ListItem`).
    """
    elements_to_set = [
        Title(text="Title"),  # 0
        NarrativeText(text="NarrativeText"),  # 1
        FigureCaption(text="FigureCaption"),  # 2
        ListItem(text="ListItem"),  # 3
        ListItem(text="ListItem", metadata=ElementMetadata(category_depth=1)),  # 4
        ListItem(text="ListItem", metadata=ElementMetadata(category_depth=1)),  # 5
        ListItem(text="ListItem"),  # 6
        CheckBox(element_id="some-id-1", checked=True),  # 7
        Title(text="Title 2"),  # 8
        ListItem(text="ListItem"),  # 9
        ListItem(text="ListItem"),  # 10
        Text(text="Text"),  # 11
    ]

    elements = set_element_hierarchy(elements_to_set)

    assert (
        elements[1].metadata.parent_id == elements[0].id
    ), "NarrativeText should be child of Title"
    assert (
        elements[2].metadata.parent_id == elements[0].id
    ), "FigureCaption should be child of Title"
    assert elements[3].metadata.parent_id == elements[0].id, "ListItem should be child of Title"
    # -- depth-1 list items nest under the preceding depth-0 list item, not under the title --
    assert (
        elements[4].metadata.parent_id == elements[3].id
    ), "nested ListItem should be child of the depth-0 ListItem"
    assert (
        elements[5].metadata.parent_id == elements[3].id
    ), "nested ListItem should be child of the depth-0 ListItem"
    assert elements[6].metadata.parent_id == elements[0].id, "ListItem should be child of Title"
    # NOTE(Hubert): moving the category field to Element, caused this to fail.
    # Checkboxes will soon be deprecated, then we can remove the test.
    # assert (
    #     elements[7].metadata.parent_id is None
    # ), "CheckBox should be None, as it's not a Text based element"
    assert elements[8].metadata.parent_id is None, "Title 2 should be child of None"
    assert elements[9].metadata.parent_id == elements[8].id, "ListItem should be child of Title 2"
    assert elements[10].metadata.parent_id == elements[8].id, "ListItem should be child of Title 2"
    assert elements[11].metadata.parent_id == elements[8].id, "Text should be child of Title 2"
def test_set_element_hierarchy_custom_rule_set():
    """`set_element_hierarchy()` honors a caller-supplied ruleset in place of the default."""
    custom_rule_set = {
        "Header": ["Title", "Text"],
        "Title": ["NarrativeText", "UncategorizedText", "FigureCaption"],
    }

    header, title, narrative_text, text, title_2, figure_caption = set_element_hierarchy(
        elements=[
            Header(text="Header"),
            Title(text="Title"),
            NarrativeText(text="NarrativeText"),
            Text(text="Text"),
            Title(text="Title 2"),
            FigureCaption(text="FigureCaption"),
        ],
        ruleset=custom_rule_set,
    )

    assert title.metadata.parent_id == header.id, "Title should be child of Header"
    assert narrative_text.metadata.parent_id == title.id, "NarrativeText should be child of Title"
    assert text.metadata.parent_id == title.id, "Text should be child of Title"
    assert title_2.metadata.parent_id == header.id, "Title 2 should be child of Header"
    assert (
        figure_caption.metadata.parent_id == title_2.id
    ), "FigureCaption should be child of Title 2"

View File

@ -1 +1 @@
__version__ = "0.15.13" # pragma: no cover
__version__ = "0.15.14-dev0" # pragma: no cover

View File

@ -2,7 +2,7 @@ from typing import IO, Optional, Tuple, Union
import chardet
from unstructured.partition.common import convert_to_bytes
from unstructured.partition.common.common import convert_to_bytes
ENCODE_REC_THRESHOLD = 0.8

View File

@ -4,7 +4,7 @@ import os
import tempfile
from typing import IO
from unstructured.partition.common import exactly_one
from unstructured.partition.common.common import exactly_one
from unstructured.utils import requires_dependencies

View File

@ -47,12 +47,12 @@ from unstructured.file_utils.encoding import detect_file_encoding, format_encodi
from unstructured.file_utils.model import FileType
from unstructured.logger import logger
from unstructured.nlp.patterns import EMAIL_HEAD_RE, LIST_OF_DICTS_PATTERN
from unstructured.partition.common import (
from unstructured.partition.common.common import (
add_element_metadata,
exactly_one,
remove_element_metadata,
set_element_hierarchy,
)
from unstructured.partition.common.metadata import set_element_hierarchy
from unstructured.utils import get_call_args_applying_defaults, lazyproperty
LIBMAGIC_AVAILABLE = bool(importlib.util.find_spec("magic"))
@ -500,8 +500,8 @@ class _OleFileDifferentiator:
@staticmethod
def _check_ole_file_type(ctx: _FileTypeDetectionContext) -> FileType | None:
with ctx.open() as f:
ole = OleFileIO(f)
root_storage = Storage.from_ole(ole)
ole = OleFileIO(f) # pyright: ignore[reportUnknownVariableType]
root_storage = Storage.from_ole(ole) # pyright: ignore[reportUnknownMemberType]
for stream in root_storage.streams:
if stream.name == "WordDocument":

View File

@ -9,7 +9,7 @@ from unstructured_client.models import shared
from unstructured.documents.elements import Element
from unstructured.logger import logger
from unstructured.partition.common import exactly_one
from unstructured.partition.common.common import exactly_one
from unstructured.staging.base import elements_from_dicts, elements_from_json

View File

@ -13,7 +13,7 @@ from unstructured.documents.elements import DataSourceMetadata, Element
from unstructured.file_utils.filetype import detect_filetype, is_json_processable
from unstructured.file_utils.model import FileType
from unstructured.logger import logger
from unstructured.partition.common import exactly_one
from unstructured.partition.common.common import exactly_one
from unstructured.partition.lang import check_language_args
from unstructured.partition.utils.constants import PartitionStrategy
from unstructured.utils import dependency_exists

View File

@ -1,9 +1,7 @@
from __future__ import annotations
import numbers
import os
import subprocess
from datetime import datetime
from io import BufferedReader, BytesIO, TextIOWrapper
from tempfile import SpooledTemporaryFile
from time import sleep
@ -41,76 +39,6 @@ if TYPE_CHECKING:
from unstructured_inference.inference.layout import DocumentLayout, PageLayout
from unstructured_inference.inference.layoutelement import LayoutElement
HIERARCHY_RULE_SET = {
"Title": [
"Text",
"UncategorizedText",
"NarrativeText",
"ListItem",
"BulletedText",
"Table",
"FigureCaption",
"CheckBox",
"Table",
],
"Header": [
"Title",
"Text",
"UncategorizedText",
"NarrativeText",
"ListItem",
"BulletedText",
"Table",
"FigureCaption",
"CheckBox",
"Table",
],
}
def get_last_modified(
filename: str | None, file: IO[bytes] | None, date_from_file_object: bool
) -> str | None:
"""Determine best available last-modified date from file or filename."""
if filename is not None:
return get_last_modified_date(filename)
if file is not None:
return get_last_modified_date_from_file(file) if date_from_file_object else None
return None
def get_last_modified_date(filename: str) -> Optional[str]:
"""Modification time of file at path `filename`, if it exists.
Returns `None` when `filename` is not a path to a file on the local filesystem.
Otherwise returns date and time in ISO 8601 string format (YYYY-MM-DDTHH:MM:SS) like
"2024-03-05T17:02:53".
"""
if not os.path.isfile(filename):
return None
modify_date = datetime.fromtimestamp(os.path.getmtime(filename))
return modify_date.strftime("%Y-%m-%dT%H:%M:%S%z")
def get_last_modified_date_from_file(file: IO[bytes] | bytes) -> Optional[str]:
"""Modified timestamp of `file` if it corresponds to a file on the local filesystem."""
# -- a file-like object will have a name attribute if created by `open()` or if a name is
# -- assigned to it for metadata purposes. Use "" as default because the empty string is never
# -- a path to an actual file.
filename = str(getattr(file, "name", ""))
# -- there's no guarantee the path corresponds to an actual file on the filesystem. In
# -- particular, a user can set the `.name` attribute of an e.g. `io.BytesIO` object to
# -- populate the `.metadata.filename` fields for a payload perhaps downloaded via HTTP.
if not os.path.isfile(filename):
return None
return get_last_modified_date(filename)
def normalize_layout_element(
layout_element: LayoutElement | Element | dict[str, Any],
@ -230,54 +158,6 @@ def layout_list_to_list_items(
return list_items
def set_element_hierarchy(
elements: list[Element], ruleset: dict[str, list[str]] = HIERARCHY_RULE_SET
) -> list[Element]:
"""Sets the parent_id for each element in the list of elements
based on the element's category, depth and a ruleset
"""
stack: list[Element] = []
for element in elements:
if element.metadata.parent_id is not None:
continue
parent_id = None
element_category = getattr(element, "category", None)
element_category_depth = getattr(element.metadata, "category_depth", 0) or 0
if not element_category:
continue
while stack:
top_element: Element = stack[-1]
top_element_category = getattr(top_element, "category")
top_element_category_depth = (
getattr(
top_element.metadata,
"category_depth",
0,
)
or 0
)
if (
top_element_category == element_category
and top_element_category_depth < element_category_depth
) or (
top_element_category != element_category
and element_category in ruleset.get(top_element_category, [])
):
parent_id = top_element.id
break
stack.pop()
element.metadata.parent_id = parent_id
stack.append(element)
return elements
def add_element_metadata(
element: Element,
filename: Optional[str] = None,
@ -580,7 +460,7 @@ def _get_page_image_metadata(page: PageLayout) -> dict[str, Any]:
# FIXME: document here can be either DocumentLayout or HTMLDocument; HTMLDocument is defined in
# unstructured.documents.html, which imports this module so we can't import the class for type
# hints. Moreover, those two types of documents have different lists of attributes
# UPDATE(scanny): HTMLDocument no longer uses this function, so it can be optimized for use by
# UPDATE(scanny): HTMLDocument no longer exists, so this function can be optimized for use by
# DocumentLayout only.
def document_to_element_list(
document: DocumentLayout,

View File

@ -0,0 +1,128 @@
"""Helpers used across multiple partitioners to compute metadata."""
from __future__ import annotations
import datetime as dt
import os
from typing import IO, Optional, Sequence
from unstructured.documents.elements import Element
def get_last_modified(
    filename: str | None, file: IO[bytes] | None, date_from_file_object: bool
) -> str | None:
    """Best available last-modified date for a document given as a path or a file-like object.

    `filename` takes precedence when it is provided. `file` is only consulted when `filename` is
    `None`, and then only when `date_from_file_object` is true. Returns `None` in all other
    cases.
    """
    # -- a path always wins over a file-like object --
    if filename is not None:
        return get_last_modified_date(filename)

    # -- no source at all, or caller opted out of reading the date from the file object --
    if file is None or not date_from_file_object:
        return None

    return get_last_modified_date_from_file(file)
def get_last_modified_date(filename: str) -> Optional[str]:
    """Modification time of file at path `filename`, if it exists.

    Returns `None` when `filename` is not a path to a file on the local filesystem, including
    when the file is removed or becomes unreadable while this function is running.

    Otherwise returns date and time in ISO 8601 string format (YYYY-MM-DDTHH:MM:SS) like
    "2024-03-05T17:02:53".
    """
    # -- rejects missing paths as well as directories --
    if not os.path.isfile(filename):
        return None

    # -- the file can disappear between the check above and this call; treat that the same as
    # -- "no file at that path" rather than letting the OSError escape.
    try:
        modified_timestamp = os.path.getmtime(filename)
    except OSError:
        return None

    # -- `fromtimestamp()` without a `tz` argument produces a naive local-time datetime, for
    # -- which `%z` renders as the empty string, so no UTC offset appears in the result.
    return dt.datetime.fromtimestamp(modified_timestamp).strftime("%Y-%m-%dT%H:%M:%S%z")
def get_last_modified_date_from_file(file: IO[bytes] | bytes) -> Optional[str]:
    """Last-modified timestamp for `file` when it is backed by a file on the local filesystem.

    Returns `None` when `file` has no `.name` attribute (e.g. a `bytes` object or a fresh
    `io.BytesIO`) or when that name is not a path to an existing file.
    """
    # -- `open()` sets `.name` on the object it returns, and a caller may also assign one purely
    # -- for metadata purposes. Fall back to "" because the empty string is never a file path.
    candidate_path = str(getattr(file, "name", ""))

    # -- the name is not guaranteed to be a real path; for instance a caller can label an
    # -- `io.BytesIO` payload downloaded via HTTP so `.metadata.filename` gets populated.
    return get_last_modified_date(candidate_path) if os.path.isfile(candidate_path) else None
# -- Element categories that may be parented by a "Title". NOTE: "Table" is listed twice; the
# -- repeat has no effect on membership tests and is retained so the resulting lists are unchanged.
_TITLE_CHILD_CATEGORIES = (
    "Text",
    "UncategorizedText",
    "NarrativeText",
    "ListItem",
    "BulletedText",
    "Table",
    "FigureCaption",
    "CheckBox",
    "Table",
)

# -- Maps a parent category to the categories allowed as its children. A "Header" accepts
# -- everything a "Title" does and can additionally parent a "Title" itself.
HIERARCHY_RULE_SET = {
    "Title": list(_TITLE_CHILD_CATEGORIES),
    "Header": ["Title", *_TITLE_CHILD_CATEGORIES],
}
def set_element_hierarchy(
    elements: Sequence[Element], ruleset: dict[str, list[str]] = HIERARCHY_RULE_SET
) -> list[Element]:
    """Assign `metadata.parent_id` to each element in `elements`, in place.

    A stack of candidate ancestors is maintained while walking `elements` in order. For each
    element the stack is popped until its top qualifies as the parent, which happens when either:

    - the top has the same category and a strictly smaller `category_depth`, or
    - the top has a different category and `ruleset` lists the element's category under the
      top's category.

    When the stack empties without a match the element's `parent_id` is set to `None`. Elements
    that already have a `parent_id`, and elements with no (or an empty) `category`, are left
    untouched and are never considered as ancestors.

    Returns a new list containing the same (mutated) element objects.
    """
    ancestors: list[Element] = []

    for element in elements:
        # -- respect a parent assigned upstream --
        if element.metadata.parent_id is not None:
            continue

        category = getattr(element, "category", None)
        if not category:
            continue
        # -- a missing or `None` depth is treated as depth 0 --
        depth = getattr(element.metadata, "category_depth", 0) or 0

        parent_id = None
        while ancestors:
            candidate = ancestors[-1]
            candidate_category = getattr(candidate, "category")

            if candidate_category == category:
                # -- same category: only a shallower element can be the parent --
                candidate_depth = getattr(candidate.metadata, "category_depth", 0) or 0
                is_parent = candidate_depth < depth
            else:
                # -- different category: defer to the ruleset --
                is_parent = category in ruleset.get(candidate_category, [])

            if is_parent:
                parent_id = candidate.id
                break

            # -- this candidate cannot parent the current element; discard it --
            ancestors.pop()

        element.metadata.parent_id = parent_id
        ancestors.append(element)

    return list(elements)

View File

@ -16,7 +16,10 @@ from unstructured.documents.elements import (
)
from unstructured.file_utils.filetype import add_metadata_with_filetype
from unstructured.file_utils.model import FileType
from unstructured.partition.common import get_last_modified_date, get_last_modified_date_from_file
from unstructured.partition.common.metadata import (
get_last_modified_date,
get_last_modified_date_from_file,
)
from unstructured.partition.lang import apply_lang_metadata
from unstructured.utils import is_temp_file_path, lazyproperty

View File

@ -8,11 +8,8 @@ from unstructured.chunking import add_chunking_strategy
from unstructured.documents.elements import Element, process_metadata
from unstructured.file_utils.filetype import add_metadata_with_filetype
from unstructured.file_utils.model import FileType
from unstructured.partition.common import (
convert_office_doc,
exactly_one,
get_last_modified,
)
from unstructured.partition.common.common import convert_office_doc, exactly_one
from unstructured.partition.common.metadata import get_last_modified
from unstructured.partition.docx import partition_docx

View File

@ -46,7 +46,7 @@ from unstructured.documents.elements import (
)
from unstructured.file_utils.filetype import add_metadata_with_filetype
from unstructured.file_utils.model import FileType
from unstructured.partition.common import (
from unstructured.partition.common.metadata import (
get_last_modified_date,
get_last_modified_date_from_file,
)

View File

@ -46,9 +46,8 @@ from unstructured.file_utils.filetype import add_metadata_with_filetype
from unstructured.file_utils.model import FileType
from unstructured.logger import logger
from unstructured.nlp.patterns import EMAIL_DATETIMETZ_PATTERN_RE
from unstructured.partition.common import (
convert_to_bytes,
exactly_one,
from unstructured.partition.common.common import convert_to_bytes, exactly_one
from unstructured.partition.common.metadata import (
get_last_modified_date,
get_last_modified_date_from_file,
)

View File

@ -7,7 +7,8 @@ from unstructured.documents.elements import Element, process_metadata
from unstructured.file_utils.file_conversion import convert_file_to_html_text_using_pandoc
from unstructured.file_utils.filetype import add_metadata_with_filetype
from unstructured.file_utils.model import FileType
from unstructured.partition.common import exactly_one, get_last_modified
from unstructured.partition.common.common import exactly_one
from unstructured.partition.common.metadata import get_last_modified
from unstructured.partition.html import partition_html
DETECTION_ORIGIN: str = "epub"

View File

@ -14,7 +14,10 @@ from unstructured.documents.elements import Element, process_metadata
from unstructured.file_utils.encoding import read_txt_file
from unstructured.file_utils.filetype import add_metadata_with_filetype
from unstructured.file_utils.model import FileType
from unstructured.partition.common import get_last_modified_date, get_last_modified_date_from_file
from unstructured.partition.common.metadata import (
get_last_modified_date,
get_last_modified_date_from_file,
)
from unstructured.partition.html.parser import Flow, html_parser
from unstructured.partition.lang import apply_lang_metadata
from unstructured.utils import is_temp_file_path, lazyproperty

View File

@ -5,7 +5,7 @@ from typing import IO, Any, Optional
from unstructured.chunking import add_chunking_strategy
from unstructured.documents.elements import Element, process_metadata
from unstructured.file_utils.filetype import add_metadata
from unstructured.partition.common import exactly_one
from unstructured.partition.common.common import exactly_one
from unstructured.partition.lang import check_language_args
from unstructured.partition.pdf import partition_pdf_or_image
from unstructured.partition.utils.constants import PartitionStrategy

View File

@ -19,8 +19,8 @@ from unstructured.file_utils.filetype import (
add_metadata_with_filetype,
is_json_processable,
)
from unstructured.partition.common import (
exactly_one,
from unstructured.partition.common.common import exactly_one
from unstructured.partition.common.metadata import (
get_last_modified_date,
get_last_modified_date_from_file,
)

View File

@ -9,8 +9,8 @@ from unstructured.chunking import add_chunking_strategy
from unstructured.documents.elements import Element, process_metadata
from unstructured.file_utils.filetype import add_metadata_with_filetype
from unstructured.file_utils.model import FileType
from unstructured.partition.common import (
exactly_one,
from unstructured.partition.common.common import exactly_one
from unstructured.partition.common.metadata import (
get_last_modified_date,
get_last_modified_date_from_file,
)

View File

@ -14,7 +14,7 @@ from unstructured.documents.elements import Element, ElementMetadata, process_me
from unstructured.file_utils.filetype import add_metadata_with_filetype
from unstructured.file_utils.model import FileType
from unstructured.logger import logger
from unstructured.partition.common import (
from unstructured.partition.common.metadata import (
get_last_modified_date,
get_last_modified_date_from_file,
)

View File

@ -8,7 +8,8 @@ from unstructured.chunking import add_chunking_strategy
from unstructured.documents.elements import Element, process_metadata
from unstructured.file_utils.filetype import add_metadata_with_filetype
from unstructured.file_utils.model import FileType
from unstructured.partition.common import exactly_one, get_last_modified
from unstructured.partition.common.common import exactly_one
from unstructured.partition.common.metadata import get_last_modified
from unstructured.partition.docx import partition_docx
from unstructured.utils import requires_dependencies

View File

@ -7,7 +7,8 @@ from unstructured.documents.elements import Element
from unstructured.file_utils.file_conversion import convert_file_to_html_text_using_pandoc
from unstructured.file_utils.filetype import add_metadata_with_filetype
from unstructured.file_utils.model import FileType
from unstructured.partition.common import exactly_one, get_last_modified
from unstructured.partition.common.common import exactly_one
from unstructured.partition.common.metadata import get_last_modified
from unstructured.partition.html import partition_html
DETECTION_ORIGIN: str = "org"

View File

@ -42,7 +42,7 @@ from unstructured.file_utils.filetype import add_metadata_with_filetype
from unstructured.file_utils.model import FileType
from unstructured.logger import logger, trace_logger
from unstructured.nlp.patterns import PARAGRAPH_PATTERN
from unstructured.partition.common import (
from unstructured.partition.common.common import (
document_to_element_list,
exactly_one,
ocr_data_to_elements,

View File

@ -17,16 +17,16 @@ from PIL import Image
from unstructured.documents.elements import ElementType
from unstructured.logger import logger
from unstructured.partition.common import (
convert_to_bytes,
exactly_one,
from unstructured.partition.common.common import convert_to_bytes, exactly_one
from unstructured.partition.common.metadata import (
get_last_modified_date,
get_last_modified_date_from_file,
)
from unstructured.partition.utils.config import env_config
if TYPE_CHECKING:
from unstructured_inference.inference.layout import DocumentLayout, PageLayout, TextRegion
from unstructured_inference.inference.elements import TextRegion
from unstructured_inference.inference.layout import DocumentLayout, PageLayout
from unstructured_inference.inference.layoutelement import LayoutElement
from unstructured.documents.elements import Element

View File

@ -8,9 +8,8 @@ from unstructured.chunking import add_chunking_strategy
from unstructured.documents.elements import Element, process_metadata
from unstructured.file_utils.filetype import add_metadata_with_filetype
from unstructured.file_utils.model import FileType
from unstructured.partition.common import (
convert_office_doc,
exactly_one,
from unstructured.partition.common.common import convert_office_doc, exactly_one
from unstructured.partition.common.metadata import (
get_last_modified_date,
get_last_modified_date_from_file,
)

View File

@ -36,8 +36,8 @@ from unstructured.documents.elements import (
)
from unstructured.file_utils.filetype import add_metadata_with_filetype
from unstructured.file_utils.model import FileType
from unstructured.partition.common import (
convert_ms_office_table_to_text,
from unstructured.partition.common.common import convert_ms_office_table_to_text
from unstructured.partition.common.metadata import (
get_last_modified_date,
get_last_modified_date_from_file,
)

View File

@ -7,7 +7,8 @@ from unstructured.documents.elements import Element, process_metadata
from unstructured.file_utils.file_conversion import convert_file_to_html_text_using_pandoc
from unstructured.file_utils.filetype import add_metadata_with_filetype
from unstructured.file_utils.model import FileType
from unstructured.partition.common import exactly_one, get_last_modified
from unstructured.partition.common.common import exactly_one
from unstructured.partition.common.metadata import get_last_modified
from unstructured.partition.html import partition_html
DETECTION_ORIGIN: str = "rst"

View File

@ -7,7 +7,8 @@ from unstructured.documents.elements import Element, process_metadata
from unstructured.file_utils.file_conversion import convert_file_to_html_text_using_pandoc
from unstructured.file_utils.filetype import add_metadata_with_filetype
from unstructured.file_utils.model import FileType
from unstructured.partition.common import exactly_one, get_last_modified
from unstructured.partition.common.common import exactly_one
from unstructured.partition.common.metadata import get_last_modified
from unstructured.partition.html import partition_html
DETECTION_ORIGIN: str = "rtf"

View File

@ -29,8 +29,8 @@ from unstructured.file_utils.filetype import add_metadata_with_filetype
from unstructured.file_utils.model import FileType
from unstructured.nlp.patterns import PARAGRAPH_PATTERN, UNICODE_BULLETS_RE
from unstructured.nlp.tokenize import sent_tokenize
from unstructured.partition.common import (
exactly_one,
from unstructured.partition.common.common import exactly_one
from unstructured.partition.common.metadata import (
get_last_modified_date,
get_last_modified_date_from_file,
)

View File

@ -14,11 +14,13 @@ from unstructured.documents.elements import (
)
from unstructured.file_utils.filetype import add_metadata_with_filetype
from unstructured.file_utils.model import FileType
from unstructured.partition.common import (
from unstructured.partition.common.common import (
exactly_one,
spooled_to_bytes_io_if_needed,
)
from unstructured.partition.common.metadata import (
get_last_modified_date,
get_last_modified_date_from_file,
spooled_to_bytes_io_if_needed,
)
from unstructured.partition.lang import apply_lang_metadata

View File

@ -26,7 +26,10 @@ from unstructured.documents.elements import (
)
from unstructured.file_utils.filetype import add_metadata_with_filetype
from unstructured.file_utils.model import FileType
from unstructured.partition.common import get_last_modified_date, get_last_modified_date_from_file
from unstructured.partition.common.metadata import (
get_last_modified_date,
get_last_modified_date_from_file,
)
from unstructured.partition.lang import apply_lang_metadata
from unstructured.partition.text_type import (
is_bulleted_text,

View File

@ -16,11 +16,13 @@ from unstructured.documents.elements import (
from unstructured.file_utils.encoding import read_txt_file
from unstructured.file_utils.filetype import add_metadata_with_filetype
from unstructured.file_utils.model import FileType
from unstructured.partition.common import (
from unstructured.partition.common.common import (
exactly_one,
spooled_to_bytes_io_if_needed,
)
from unstructured.partition.common.metadata import (
get_last_modified_date,
get_last_modified_date_from_file,
spooled_to_bytes_io_if_needed,
)
from unstructured.partition.lang import apply_lang_metadata
from unstructured.partition.text import element_from_text

View File

@ -16,7 +16,7 @@ from unstructured.documents.elements import (
Element,
ElementMetadata,
)
from unstructured.partition.common import exactly_one
from unstructured.partition.common.common import exactly_one
from unstructured.utils import Point, dependency_exists, requires_dependencies
if dependency_exists("pandas"):