rfctr(part): extract partition.common submodules (#3649)

**Summary** In preparation for consolidating post-partitioning metadata decorators, extract `partition.common` module into a sub-package (directory) and extract `partition.common.metadata` module to house metadata-specific objects shared by partitioners. **Additional Context** - This new module will be the home of the new consolidated metadata decorator. - The consolidated decorator is a step toward removing post-processing decorators from _delegating_ partitioners. A delegating partitioner is one that converts its file to a different format and "delegates" actual partitioning to the partitioner for that target format. 10 of the 20 partitioners are delegating partitioners. - Removing decorators from delegating partitioners will allow us to avoid "double-decorating", i.e. running those decorators twice, once on the principal partitioner and again on the proxy partitioner. - This will allow us to send `**kwargs` to either partitioner, removing the knowledge of which arguments to send for each file-type from auto-partition. - And this will allow pluggable auto-partitioners which all have a `partition_x(filename, *, file, **kwargs) -> list[Element]` interface.
2025-12-11 07:01:24 +00:00 · 2024-09-20 13:35:28 -07:00 · 2024-09-20 13:35:28 -07:00 · 03c2bf8f1f
commit 03c2bf8f1f
parent 7d66a236f1
37 changed files with 406 additions and 350 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,5 +1,17 @@
+## 0.15.14-dev0
+
+### Enhancements
+
+### Features
+
+### Fixes
+
 ## 0.15.13

+### BREAKING CHANGES
+
+* **Remove dead experimental code.** Unused code in `file_utils.experimental` and `file_utils.metadata` was removed. These functions were never published in the documentation, but if a client dug these out and used them this removal could break client code.
+
 ### Enhancements

 * **Improve `pdfminer` image cleanup process**. Optimized the removal of duplicated pdfminer images by performing the cleanup before merging elements, rather than after. This improvement reduces execution time and enhances overall processing speed of PDF documents.
--- a/scripts/chroma-test-helpers/create-and-check-chroma.sh
+++ b/scripts/chroma-test-helpers/create-and-check-chroma.sh
@ -3,13 +3,4 @@
 set -e

 # $1 is the path for chroma to write the contents to. The symbol "&" runs process in background
-echo "Current venv is:"
-echo "$VIRTUAL_ENV"
-echo "Current path is:"
-echo "$PATH"
-ls -l "$VIRTUAL_ENV/bin/chroma"
-echo "================"
-cat "$VIRTUAL_ENV/bin/chroma"
-echo "================"
-# chroma run --path "$1" &
 python "$VIRTUAL_ENV/bin/chroma" run --path "$1" &
--- a/test_unstructured/partition/common/init.py
+++ b/test_unstructured/partition/common/init.py
--- a/test_unstructured/partition/common/test_common.py
+++ b/test_unstructured/partition/common/test_common.py
@ -1,6 +1,3 @@
-import datetime as dt
-import io
-import os
 import pathlib
 from dataclasses import dataclass
 from multiprocessing import Pool
@ -20,7 +17,6 @@ from unstructured.documents.elements import (
    TYPE_TO_TEXT_ELEMENT_MAP,
    CheckBox,
    CoordinatesMetadata,
-    ElementMetadata,
    ElementType,
    FigureCaption,
    Header,
@ -32,7 +28,7 @@ from unstructured.documents.elements import (
 from unstructured.documents.elements import (
    Image as ImageElement,
 )
-from unstructured.partition import common
+from unstructured.partition.common import common
 from unstructured.partition.utils.constants import SORT_MODE_BASIC, SORT_MODE_DONT, SORT_MODE_XY_CUT


@ -347,7 +343,7 @@ class MockRunOutput:


 def test_convert_office_doc_captures_errors(monkeypatch, caplog):
-    from unstructured.partition.common import subprocess
+    from unstructured.partition.common.common import subprocess

    def mock_run(*args, **kwargs):
        return MockRunOutput(1, "an error occurred".encode(), "error details".encode())
@ -429,75 +425,6 @@ def test_get_page_image_metadata_and_coordinate_system():
    assert isinstance(metadata, dict)


-def test_set_element_hierarchy():
-    elements_to_set = [
-        Title(text="Title"),  # 0
-        NarrativeText(text="NarrativeText"),  # 1
-        FigureCaption(text="FigureCaption"),  # 2
-        ListItem(text="ListItem"),  # 3
-        ListItem(text="ListItem", metadata=ElementMetadata(category_depth=1)),  # 4
-        ListItem(text="ListItem", metadata=ElementMetadata(category_depth=1)),  # 5
-        ListItem(text="ListItem"),  # 6
-        CheckBox(element_id="some-id-1", checked=True),  # 7
-        Title(text="Title 2"),  # 8
-        ListItem(text="ListItem"),  # 9
-        ListItem(text="ListItem"),  # 10
-        Text(text="Text"),  # 11
-    ]
-    elements = common.set_element_hierarchy(elements_to_set)
-
-    assert (
-        elements[1].metadata.parent_id == elements[0].id
-    ), "NarrativeText should be child of Title"
-    assert (
-        elements[2].metadata.parent_id == elements[0].id
-    ), "FigureCaption should be child of Title"
-    assert elements[3].metadata.parent_id == elements[0].id, "ListItem should be child of Title"
-    assert elements[4].metadata.parent_id == elements[3].id, "ListItem should be child of Title"
-    assert elements[5].metadata.parent_id == elements[3].id, "ListItem should be child of Title"
-    assert elements[6].metadata.parent_id == elements[0].id, "ListItem should be child of Title"
-    # NOTE(Hubert): moving the category field to Element, caused this to fail.
-    # Checkboxes will soon be deprecated, then we can remove the test.
-    # assert (
-    #         elements[7].metadata.parent_id is None
-    # ), "CheckBox should be None, as it's not a Text based element"
-    assert elements[8].metadata.parent_id is None, "Title 2 should be child of None"
-    assert elements[9].metadata.parent_id == elements[8].id, "ListItem should be child of Title 2"
-    assert elements[10].metadata.parent_id == elements[8].id, "ListItem should be child of Title 2"
-    assert elements[11].metadata.parent_id == elements[8].id, "Text should be child of Title 2"
-
-
-def test_set_element_hierarchy_custom_rule_set():
-    elements_to_set = [
-        Header(text="Header"),  # 0
-        Title(text="Title"),  # 1
-        NarrativeText(text="NarrativeText"),  # 2
-        Text(text="Text"),  # 3
-        Title(text="Title 2"),  # 4
-        FigureCaption(text="FigureCaption"),  # 5
-    ]
-
-    custom_rule_set = {
-        "Header": ["Title", "Text"],
-        "Title": ["NarrativeText", "UncategorizedText", "FigureCaption"],
-    }
-
-    elements = common.set_element_hierarchy(
-        elements=elements_to_set,
-        ruleset=custom_rule_set,
-    )
-
-    assert elements[1].metadata.parent_id == elements[0].id, "Title should be child of Header"
-    assert (
-        elements[2].metadata.parent_id == elements[1].id
-    ), "NarrativeText should be child of Title"
-    assert elements[3].metadata.parent_id == elements[1].id, "Text should be child of Title"
-    assert elements[4].metadata.parent_id == elements[0].id, "Title 2 should be child of Header"
-    assert (
-        elements[5].metadata.parent_id == elements[4].id
-    ), "FigureCaption should be child of Title 2"
-
-
@dataclass
 class MockImage:
    width = 640
@ -607,99 +534,3 @@ def test_ocr_data_to_elements(
            points=layout_el.bbox.coordinates,
            system=coordinate_system,
        )
-
-
-class Describe_get_last_modified:
-    """Isolated unit-tests for `unstructured.partition.common.get_last_modified()."""
-
-    def it_pulls_last_modified_from_the_filesystem_when_a_path_is_provided(
-        self, file_and_last_modified: tuple[str, str]
-    ):
-        file_path, last_modified = file_and_last_modified
-        last_modified_date = common.get_last_modified(str(file_path), None, False)
-        assert last_modified_date == last_modified
-
-    def and_it_pulls_last_modified_from_the_file_like_object_when_one_is_provided(
-        self, file_and_last_modified: tuple[str, str]
-    ):
-        file_path, last_modified = file_and_last_modified
-        with open(file_path, "rb") as f:
-            last_modified_date = common.get_last_modified(None, f, True)
-        assert last_modified_date == last_modified
-
-    def but_not_when_date_from_file_object_is_False(self, file_and_last_modified: tuple[str, str]):
-        file_path, _ = file_and_last_modified
-        with open(file_path, "rb") as f:
-            last_modified_date = common.get_last_modified(None, f, False)
-        assert last_modified_date is None
-
-    # -- fixtures --------------------------------------------------------------------------------
-
-    @pytest.fixture()
-    def file_and_last_modified(self, tmp_path: pathlib.Path) -> tuple[str, str]:
-        modified_timestamp = dt.datetime(
-            year=2024, month=6, day=14, hour=15, minute=39, second=25
-        ).timestamp()
-        file_path = tmp_path / "some_file.txt"
-        file_path.write_text("abcdefg")
-        os.utime(file_path, (modified_timestamp, modified_timestamp))
-        return str(file_path), "2024-06-14T15:39:25"
-
-
-class Describe_get_last_modified_date:
-    def it_gets_the_modified_time_of_a_file_identified_by_a_path(self, tmp_path: pathlib.Path):
-        modified_timestamp = dt.datetime(
-            year=2024, month=3, day=5, hour=17, minute=43, second=40
-        ).timestamp()
-        file_path = tmp_path / "some_file.txt"
-        file_path.write_text("abcdefg")
-        os.utime(file_path, (modified_timestamp, modified_timestamp))
-
-        last_modified_date = common.get_last_modified_date(str(file_path))
-
-        assert last_modified_date == "2024-03-05T17:43:40"
-
-    def but_it_returns_None_when_there_is_no_file_at_that_path(self, tmp_path: pathlib.Path):
-        file_path = tmp_path / "some_file_that_does_not_exist.txt"
-
-        last_modified_date = common.get_last_modified_date(str(file_path))
-
-        assert last_modified_date is None
-
-
-class Describe_get_last_modified_date_from_file:
-    def it_gets_the_modified_time_of_a_file_like_object_corresponding_to_a_filesystem_file(
-        self, tmp_path: pathlib.Path
-    ):
-        modified_timestamp = dt.datetime(
-            year=2024, month=3, day=5, hour=20, minute=48, second=26
-        ).timestamp()
-        file_path = tmp_path / "some_file_2.txt"
-        file_path.write_text("abcdefg")
-        os.utime(file_path, (modified_timestamp, modified_timestamp))
-
-        with open(file_path, "rb") as f:
-            last_modified_date = common.get_last_modified_date_from_file(f)
-
-        assert last_modified_date == "2024-03-05T20:48:26"
-
-    def but_it_returns_None_when_the_argument_is_a_bytes_object(self):
-        assert common.get_last_modified_date_from_file(b"abcdefg") is None
-
-    def and_it_returns_None_when_the_file_like_object_has_no_name_attribute(self):
-        file = io.BytesIO(b"abcdefg")
-        assert hasattr(file, "name") is False
-
-        last_modified_date = common.get_last_modified_date_from_file(file)
-
-        assert last_modified_date is None
-
-    def and_it_returns_None_when_the_file_like_object_name_is_not_a_path_to_a_file(
-        self, tmp_path: pathlib.Path
-    ):
-        file = io.BytesIO(b"abcdefg")
-        file.name = str(tmp_path / "a_file_that_isn't_here.txt")
-
-        last_modified_date = common.get_last_modified_date_from_file(file)
-
-        assert last_modified_date is None
--- a/test_unstructured/partition/common/test_metadata.py
+++ b/test_unstructured/partition/common/test_metadata.py
@ -0,0 +1,201 @@
+"""Test-suite for `unstructured.partition.common.metadata` module."""
+
+from __future__ import annotations
+
+import datetime as dt
+import io
+import os
+import pathlib
+
+import pytest
+
+from unstructured.documents.elements import (
+    CheckBox,
+    ElementMetadata,
+    FigureCaption,
+    Header,
+    ListItem,
+    NarrativeText,
+    Text,
+    Title,
+)
+from unstructured.partition.common.metadata import (
+    get_last_modified,
+    get_last_modified_date,
+    get_last_modified_date_from_file,
+    set_element_hierarchy,
+)
+
+# ================================================================================================
+# LAST-MODIFIED
+# ================================================================================================
+
+
+class Describe_get_last_modified:
+    """Isolated unit-tests for `unstructured.partition.common.metadata.get_last_modified()."""
+
+    def it_pulls_last_modified_from_the_filesystem_when_a_path_is_provided(
+        self, file_and_last_modified: tuple[str, str]
+    ):
+        file_path, last_modified = file_and_last_modified
+        last_modified_date = get_last_modified(str(file_path), None, False)
+        assert last_modified_date == last_modified
+
+    def and_it_pulls_last_modified_from_the_file_like_object_when_one_is_provided(
+        self, file_and_last_modified: tuple[str, str]
+    ):
+        file_path, last_modified = file_and_last_modified
+        with open(file_path, "rb") as f:
+            last_modified_date = get_last_modified(None, f, True)
+        assert last_modified_date == last_modified
+
+    def but_not_when_date_from_file_object_is_False(self, file_and_last_modified: tuple[str, str]):
+        file_path, _ = file_and_last_modified
+        with open(file_path, "rb") as f:
+            last_modified_date = get_last_modified(None, f, False)
+        assert last_modified_date is None
+
+    # -- fixtures --------------------------------------------------------------------------------
+
+    @pytest.fixture()
+    def file_and_last_modified(self, tmp_path: pathlib.Path) -> tuple[str, str]:
+        modified_timestamp = dt.datetime(
+            year=2024, month=6, day=14, hour=15, minute=39, second=25
+        ).timestamp()
+        file_path = tmp_path / "some_file.txt"
+        file_path.write_text("abcdefg")
+        os.utime(file_path, (modified_timestamp, modified_timestamp))
+        return str(file_path), "2024-06-14T15:39:25"
+
+
+class Describe_get_last_modified_date:
+    def it_gets_the_modified_time_of_a_file_identified_by_a_path(self, tmp_path: pathlib.Path):
+        modified_timestamp = dt.datetime(
+            year=2024, month=3, day=5, hour=17, minute=43, second=40
+        ).timestamp()
+        file_path = tmp_path / "some_file.txt"
+        file_path.write_text("abcdefg")
+        os.utime(file_path, (modified_timestamp, modified_timestamp))
+
+        last_modified_date = get_last_modified_date(str(file_path))
+
+        assert last_modified_date == "2024-03-05T17:43:40"
+
+    def but_it_returns_None_when_there_is_no_file_at_that_path(self, tmp_path: pathlib.Path):
+        file_path = tmp_path / "some_file_that_does_not_exist.txt"
+
+        last_modified_date = get_last_modified_date(str(file_path))
+
+        assert last_modified_date is None
+
+
+class Describe_get_last_modified_date_from_file:
+    def it_gets_the_modified_time_of_a_file_like_object_corresponding_to_a_filesystem_file(
+        self, tmp_path: pathlib.Path
+    ):
+        modified_timestamp = dt.datetime(
+            year=2024, month=3, day=5, hour=20, minute=48, second=26
+        ).timestamp()
+        file_path = tmp_path / "some_file_2.txt"
+        file_path.write_text("abcdefg")
+        os.utime(file_path, (modified_timestamp, modified_timestamp))
+
+        with open(file_path, "rb") as f:
+            last_modified_date = get_last_modified_date_from_file(f)
+
+        assert last_modified_date == "2024-03-05T20:48:26"
+
+    def but_it_returns_None_when_the_argument_is_a_bytes_object(self):
+        assert get_last_modified_date_from_file(b"abcdefg") is None
+
+    def and_it_returns_None_when_the_file_like_object_has_no_name_attribute(self):
+        file = io.BytesIO(b"abcdefg")
+        assert hasattr(file, "name") is False
+
+        last_modified_date = get_last_modified_date_from_file(file)
+
+        assert last_modified_date is None
+
+    def and_it_returns_None_when_the_file_like_object_name_is_not_a_path_to_a_file(
+        self, tmp_path: pathlib.Path
+    ):
+        file = io.BytesIO(b"abcdefg")
+        file.name = str(tmp_path / "a_file_that_isn't_here.txt")
+
+        last_modified_date = get_last_modified_date_from_file(file)
+
+        assert last_modified_date is None
+
+
+# ================================================================================================
+# ELEMENT HIERARCHY
+# ================================================================================================
+
+
+def test_set_element_hierarchy():
+    elements_to_set = [
+        Title(text="Title"),  # 0
+        NarrativeText(text="NarrativeText"),  # 1
+        FigureCaption(text="FigureCaption"),  # 2
+        ListItem(text="ListItem"),  # 3
+        ListItem(text="ListItem", metadata=ElementMetadata(category_depth=1)),  # 4
+        ListItem(text="ListItem", metadata=ElementMetadata(category_depth=1)),  # 5
+        ListItem(text="ListItem"),  # 6
+        CheckBox(element_id="some-id-1", checked=True),  # 7
+        Title(text="Title 2"),  # 8
+        ListItem(text="ListItem"),  # 9
+        ListItem(text="ListItem"),  # 10
+        Text(text="Text"),  # 11
+    ]
+    elements = set_element_hierarchy(elements_to_set)
+
+    assert (
+        elements[1].metadata.parent_id == elements[0].id
+    ), "NarrativeText should be child of Title"
+    assert (
+        elements[2].metadata.parent_id == elements[0].id
+    ), "FigureCaption should be child of Title"
+    assert elements[3].metadata.parent_id == elements[0].id, "ListItem should be child of Title"
+    assert elements[4].metadata.parent_id == elements[3].id, "ListItem should be child of Title"
+    assert elements[5].metadata.parent_id == elements[3].id, "ListItem should be child of Title"
+    assert elements[6].metadata.parent_id == elements[0].id, "ListItem should be child of Title"
+    # NOTE(Hubert): moving the category field to Element, caused this to fail.
+    # Checkboxes will soon be deprecated, then we can remove the test.
+    # assert (
+    #         elements[7].metadata.parent_id is None
+    # ), "CheckBox should be None, as it's not a Text based element"
+    assert elements[8].metadata.parent_id is None, "Title 2 should be child of None"
+    assert elements[9].metadata.parent_id == elements[8].id, "ListItem should be child of Title 2"
+    assert elements[10].metadata.parent_id == elements[8].id, "ListItem should be child of Title 2"
+    assert elements[11].metadata.parent_id == elements[8].id, "Text should be child of Title 2"
+
+
+def test_set_element_hierarchy_custom_rule_set():
+    elements_to_set = [
+        Header(text="Header"),  # 0
+        Title(text="Title"),  # 1
+        NarrativeText(text="NarrativeText"),  # 2
+        Text(text="Text"),  # 3
+        Title(text="Title 2"),  # 4
+        FigureCaption(text="FigureCaption"),  # 5
+    ]
+
+    custom_rule_set = {
+        "Header": ["Title", "Text"],
+        "Title": ["NarrativeText", "UncategorizedText", "FigureCaption"],
+    }
+
+    elements = set_element_hierarchy(
+        elements=elements_to_set,
+        ruleset=custom_rule_set,
+    )
+
+    assert elements[1].metadata.parent_id == elements[0].id, "Title should be child of Header"
+    assert (
+        elements[2].metadata.parent_id == elements[1].id
+    ), "NarrativeText should be child of Title"
+    assert elements[3].metadata.parent_id == elements[1].id, "Text should be child of Title"
+    assert elements[4].metadata.parent_id == elements[0].id, "Title 2 should be child of Header"
+    assert (
+        elements[5].metadata.parent_id == elements[4].id
+    ), "FigureCaption should be child of Title 2"
--- a/unstructured/version.py
+++ b/unstructured/version.py
@ -1 +1 @@
-__version__ = "0.15.13"  # pragma: no cover
+__version__ = "0.15.14-dev0"  # pragma: no cover
--- a/unstructured/file_utils/encoding.py
+++ b/unstructured/file_utils/encoding.py
@ -2,7 +2,7 @@ from typing import IO, Optional, Tuple, Union

 import chardet

-from unstructured.partition.common import convert_to_bytes
+from unstructured.partition.common.common import convert_to_bytes

 ENCODE_REC_THRESHOLD = 0.8

--- a/unstructured/file_utils/file_conversion.py
+++ b/unstructured/file_utils/file_conversion.py
@ -4,7 +4,7 @@ import os
 import tempfile
 from typing import IO

-from unstructured.partition.common import exactly_one
+from unstructured.partition.common.common import exactly_one
 from unstructured.utils import requires_dependencies


--- a/unstructured/file_utils/filetype.py
+++ b/unstructured/file_utils/filetype.py
@ -47,12 +47,12 @@ from unstructured.file_utils.encoding import detect_file_encoding, format_encodi
 from unstructured.file_utils.model import FileType
 from unstructured.logger import logger
 from unstructured.nlp.patterns import EMAIL_HEAD_RE, LIST_OF_DICTS_PATTERN
-from unstructured.partition.common import (
+from unstructured.partition.common.common import (
    add_element_metadata,
    exactly_one,
    remove_element_metadata,
-    set_element_hierarchy,
 )
+from unstructured.partition.common.metadata import set_element_hierarchy
 from unstructured.utils import get_call_args_applying_defaults, lazyproperty

 LIBMAGIC_AVAILABLE = bool(importlib.util.find_spec("magic"))
@ -500,8 +500,8 @@ class _OleFileDifferentiator:
    @staticmethod
    def _check_ole_file_type(ctx: _FileTypeDetectionContext) -> FileType | None:
        with ctx.open() as f:
-            ole = OleFileIO(f)
-            root_storage = Storage.from_ole(ole)
+            ole = OleFileIO(f)  # pyright: ignore[reportUnknownVariableType]
+            root_storage = Storage.from_ole(ole)  # pyright: ignore[reportUnknownMemberType]

        for stream in root_storage.streams:
            if stream.name == "WordDocument":
--- a/unstructured/partition/api.py
+++ b/unstructured/partition/api.py
@ -9,7 +9,7 @@ from unstructured_client.models import shared

 from unstructured.documents.elements import Element
 from unstructured.logger import logger
-from unstructured.partition.common import exactly_one
+from unstructured.partition.common.common import exactly_one
 from unstructured.staging.base import elements_from_dicts, elements_from_json


--- a/unstructured/partition/auto.py
+++ b/unstructured/partition/auto.py
@ -13,7 +13,7 @@ from unstructured.documents.elements import DataSourceMetadata, Element
 from unstructured.file_utils.filetype import detect_filetype, is_json_processable
 from unstructured.file_utils.model import FileType
 from unstructured.logger import logger
-from unstructured.partition.common import exactly_one
+from unstructured.partition.common.common import exactly_one
 from unstructured.partition.lang import check_language_args
 from unstructured.partition.utils.constants import PartitionStrategy
 from unstructured.utils import dependency_exists
--- a/unstructured/partition/common/init.py
+++ b/unstructured/partition/common/init.py
--- a/unstructured/partition/common/common.py
+++ b/unstructured/partition/common/common.py
@ -1,9 +1,7 @@
 from __future__ import annotations

 import numbers
-import os
 import subprocess
-from datetime import datetime
 from io import BufferedReader, BytesIO, TextIOWrapper
 from tempfile import SpooledTemporaryFile
 from time import sleep
@ -41,76 +39,6 @@ if TYPE_CHECKING:
    from unstructured_inference.inference.layout import DocumentLayout, PageLayout
    from unstructured_inference.inference.layoutelement import LayoutElement

-HIERARCHY_RULE_SET = {
-    "Title": [
-        "Text",
-        "UncategorizedText",
-        "NarrativeText",
-        "ListItem",
-        "BulletedText",
-        "Table",
-        "FigureCaption",
-        "CheckBox",
-        "Table",
-    ],
-    "Header": [
-        "Title",
-        "Text",
-        "UncategorizedText",
-        "NarrativeText",
-        "ListItem",
-        "BulletedText",
-        "Table",
-        "FigureCaption",
-        "CheckBox",
-        "Table",
-    ],
-}
-
-
-def get_last_modified(
-    filename: str | None, file: IO[bytes] | None, date_from_file_object: bool
-) -> str | None:
-    """Determine best available last-modified date from file or filename."""
-    if filename is not None:
-        return get_last_modified_date(filename)
-
-    if file is not None:
-        return get_last_modified_date_from_file(file) if date_from_file_object else None
-
-    return None
-
-
-def get_last_modified_date(filename: str) -> Optional[str]:
-    """Modification time of file at path `filename`, if it exists.
-
-    Returns `None` when `filename` is not a path to a file on the local filesystem.
-
-    Otherwise returns date and time in ISO 8601 string format (YYYY-MM-DDTHH:MM:SS) like
-    "2024-03-05T17:02:53".
-    """
-    if not os.path.isfile(filename):
-        return None
-
-    modify_date = datetime.fromtimestamp(os.path.getmtime(filename))
-    return modify_date.strftime("%Y-%m-%dT%H:%M:%S%z")
-
-
-def get_last_modified_date_from_file(file: IO[bytes] | bytes) -> Optional[str]:
-    """Modified timestamp of `file` if it corresponds to a file on the local filesystem."""
-    # -- a file-like object will have a name attribute if created by `open()` or if a name is
-    # -- assigned to it for metadata purposes. Use "" as default because the empty string is never
-    # -- a path to an actual file.
-    filename = str(getattr(file, "name", ""))
-
-    # -- there's no guarantee the path corresponds to an actual file on the filesystem. In
-    # -- particular, a user can set the `.name` attribute of an e.g. `io.BytesIO` object to
-    # -- populate the `.metadata.filename` fields for a payload perhaps downloaded via HTTP.
-    if not os.path.isfile(filename):
-        return None
-
-    return get_last_modified_date(filename)
-

 def normalize_layout_element(
    layout_element: LayoutElement | Element | dict[str, Any],
@ -230,54 +158,6 @@ def layout_list_to_list_items(
    return list_items


-def set_element_hierarchy(
-    elements: list[Element], ruleset: dict[str, list[str]] = HIERARCHY_RULE_SET
-) -> list[Element]:
-    """Sets the parent_id for each element in the list of elements
-    based on the element's category, depth and a ruleset
-
-    """
-    stack: list[Element] = []
-    for element in elements:
-        if element.metadata.parent_id is not None:
-            continue
-        parent_id = None
-        element_category = getattr(element, "category", None)
-        element_category_depth = getattr(element.metadata, "category_depth", 0) or 0
-
-        if not element_category:
-            continue
-
-        while stack:
-            top_element: Element = stack[-1]
-            top_element_category = getattr(top_element, "category")
-            top_element_category_depth = (
-                getattr(
-                    top_element.metadata,
-                    "category_depth",
-                    0,
-                )
-                or 0
-            )
-
-            if (
-                top_element_category == element_category
-                and top_element_category_depth < element_category_depth
-            ) or (
-                top_element_category != element_category
-                and element_category in ruleset.get(top_element_category, [])
-            ):
-                parent_id = top_element.id
-                break
-
-            stack.pop()
-
-        element.metadata.parent_id = parent_id
-        stack.append(element)
-
-    return elements
-
-
 def add_element_metadata(
    element: Element,
    filename: Optional[str] = None,
@ -580,7 +460,7 @@ def _get_page_image_metadata(page: PageLayout) -> dict[str, Any]:
 # FIXME: document here can be either DocumentLayout or HTMLDocument; HTMLDocument is defined in
 # unstructured.documents.html, which imports this module so we can't import the class for type
 # hints. Moreover, those two types of documents have different lists of attributes
-# UPDATE(scanny): HTMLDocument no longer uses this function, so it can be optimized for use by
+# UPDATE(scanny): HTMLDocument no longer exists, so this function can be optimized for use by
 # DocumentLayout only.
 def document_to_element_list(
    document: DocumentLayout,
--- a/unstructured/partition/common/metadata.py
+++ b/unstructured/partition/common/metadata.py
@ -0,0 +1,128 @@
+"""Helpers used across multiple partitioners to compute metadata."""
+
+from __future__ import annotations
+
+import datetime as dt
+import os
+from typing import IO, Optional, Sequence
+
+from unstructured.documents.elements import Element
+
+
+def get_last_modified(
+    filename: str | None, file: IO[bytes] | None, date_from_file_object: bool
+) -> str | None:
+    """Determine best available last-modified date from file or filename."""
+    if filename is not None:
+        return get_last_modified_date(filename)
+
+    if file is not None:
+        return get_last_modified_date_from_file(file) if date_from_file_object else None
+
+    return None
+
+
+def get_last_modified_date(filename: str) -> Optional[str]:
+    """Modification time of file at path `filename`, if it exists.
+
+    Returns `None` when `filename` is not a path to a file on the local filesystem.
+
+    Otherwise returns date and time in ISO 8601 string format (YYYY-MM-DDTHH:MM:SS) like
+    "2024-03-05T17:02:53".
+    """
+    if not os.path.isfile(filename):
+        return None
+
+    modify_date = dt.datetime.fromtimestamp(os.path.getmtime(filename))
+    return modify_date.strftime("%Y-%m-%dT%H:%M:%S%z")
+
+
+def get_last_modified_date_from_file(file: IO[bytes] | bytes) -> Optional[str]:
+    """Modified timestamp of `file` if it corresponds to a file on the local filesystem."""
+    # -- a file-like object will have a name attribute if created by `open()` or if a name is
+    # -- assigned to it for metadata purposes. Use "" as default because the empty string is never
+    # -- a path to an actual file.
+    filename = str(getattr(file, "name", ""))
+
+    # -- there's no guarantee the path corresponds to an actual file on the filesystem. In
+    # -- particular, a user can set the `.name` attribute of an e.g. `io.BytesIO` object to
+    # -- populate the `.metadata.filename` fields for a payload perhaps downloaded via HTTP.
+    if not os.path.isfile(filename):
+        return None
+
+    return get_last_modified_date(filename)
+
+
+HIERARCHY_RULE_SET = {
+    "Title": [
+        "Text",
+        "UncategorizedText",
+        "NarrativeText",
+        "ListItem",
+        "BulletedText",
+        "Table",
+        "FigureCaption",
+        "CheckBox",
+        "Table",
+    ],
+    "Header": [
+        "Title",
+        "Text",
+        "UncategorizedText",
+        "NarrativeText",
+        "ListItem",
+        "BulletedText",
+        "Table",
+        "FigureCaption",
+        "CheckBox",
+        "Table",
+    ],
+}
+
+
+def set_element_hierarchy(
+    elements: Sequence[Element], ruleset: dict[str, list[str]] = HIERARCHY_RULE_SET
+) -> list[Element]:
+    """Sets the parent_id for each element in the list of elements
+    based on the element's category, depth and a ruleset
+
+    """
+    stack: list[Element] = []
+    for element in elements:
+        if element.metadata.parent_id is not None:
+            continue
+        parent_id = None
+        element_category = getattr(element, "category", None)
+        element_category_depth = getattr(element.metadata, "category_depth", 0) or 0
+
+        if not element_category:
+            continue
+
+        while stack:
+            top_element: Element = stack[-1]
+            top_element_category = getattr(top_element, "category")
+            top_element_category_depth = (
+                getattr(
+                    top_element.metadata,
+                    "category_depth",
+                    0,
+                )
+                or 0
+            )
+
+            if (
+                top_element_category == element_category
+                and top_element_category_depth < element_category_depth
+            ) or (
+                top_element_category != element_category
+                and element_category in ruleset.get(top_element_category, [])
+            ):
+                parent_id = top_element.id
+                break
+
+            stack.pop()
+
+        element.metadata.parent_id = parent_id
+        stack.append(element)
+
+    return list(elements)
--- a/unstructured/partition/csv.py
+++ b/unstructured/partition/csv.py
@ -16,7 +16,10 @@ from unstructured.documents.elements import (
 )
 from unstructured.file_utils.filetype import add_metadata_with_filetype
 from unstructured.file_utils.model import FileType
-from unstructured.partition.common import get_last_modified_date, get_last_modified_date_from_file
+from unstructured.partition.common.metadata import (
+    get_last_modified_date,
+    get_last_modified_date_from_file,
+)
 from unstructured.partition.lang import apply_lang_metadata
 from unstructured.utils import is_temp_file_path, lazyproperty

--- a/unstructured/partition/doc.py
+++ b/unstructured/partition/doc.py
@ -8,11 +8,8 @@ from unstructured.chunking import add_chunking_strategy
 from unstructured.documents.elements import Element, process_metadata
 from unstructured.file_utils.filetype import add_metadata_with_filetype
 from unstructured.file_utils.model import FileType
-from unstructured.partition.common import (
-    convert_office_doc,
-    exactly_one,
-    get_last_modified,
-)
+from unstructured.partition.common.common import convert_office_doc, exactly_one
+from unstructured.partition.common.metadata import get_last_modified
 from unstructured.partition.docx import partition_docx


--- a/unstructured/partition/docx.py
+++ b/unstructured/partition/docx.py
@ -46,7 +46,7 @@ from unstructured.documents.elements import (
 )
 from unstructured.file_utils.filetype import add_metadata_with_filetype
 from unstructured.file_utils.model import FileType
-from unstructured.partition.common import (
+from unstructured.partition.common.metadata import (
    get_last_modified_date,
    get_last_modified_date_from_file,
 )
--- a/unstructured/partition/email.py
+++ b/unstructured/partition/email.py
@ -46,9 +46,8 @@ from unstructured.file_utils.filetype import add_metadata_with_filetype
 from unstructured.file_utils.model import FileType
 from unstructured.logger import logger
 from unstructured.nlp.patterns import EMAIL_DATETIMETZ_PATTERN_RE
-from unstructured.partition.common import (
-    convert_to_bytes,
-    exactly_one,
+from unstructured.partition.common.common import convert_to_bytes, exactly_one
+from unstructured.partition.common.metadata import (
    get_last_modified_date,
    get_last_modified_date_from_file,
 )
--- a/unstructured/partition/epub.py
+++ b/unstructured/partition/epub.py
@ -7,7 +7,8 @@ from unstructured.documents.elements import Element, process_metadata
 from unstructured.file_utils.file_conversion import convert_file_to_html_text_using_pandoc
 from unstructured.file_utils.filetype import add_metadata_with_filetype
 from unstructured.file_utils.model import FileType
-from unstructured.partition.common import exactly_one, get_last_modified
+from unstructured.partition.common.common import exactly_one
+from unstructured.partition.common.metadata import get_last_modified
 from unstructured.partition.html import partition_html

 DETECTION_ORIGIN: str = "epub"
--- a/unstructured/partition/html/partition.py
+++ b/unstructured/partition/html/partition.py
@ -14,7 +14,10 @@ from unstructured.documents.elements import Element, process_metadata
 from unstructured.file_utils.encoding import read_txt_file
 from unstructured.file_utils.filetype import add_metadata_with_filetype
 from unstructured.file_utils.model import FileType
-from unstructured.partition.common import get_last_modified_date, get_last_modified_date_from_file
+from unstructured.partition.common.metadata import (
+    get_last_modified_date,
+    get_last_modified_date_from_file,
+)
 from unstructured.partition.html.parser import Flow, html_parser
 from unstructured.partition.lang import apply_lang_metadata
 from unstructured.utils import is_temp_file_path, lazyproperty
--- a/unstructured/partition/image.py
+++ b/unstructured/partition/image.py
@ -5,7 +5,7 @@ from typing import IO, Any, Optional
 from unstructured.chunking import add_chunking_strategy
 from unstructured.documents.elements import Element, process_metadata
 from unstructured.file_utils.filetype import add_metadata
-from unstructured.partition.common import exactly_one
+from unstructured.partition.common.common import exactly_one
 from unstructured.partition.lang import check_language_args
 from unstructured.partition.pdf import partition_pdf_or_image
 from unstructured.partition.utils.constants import PartitionStrategy
--- a/unstructured/partition/json.py
+++ b/unstructured/partition/json.py
@ -19,8 +19,8 @@ from unstructured.file_utils.filetype import (
    add_metadata_with_filetype,
    is_json_processable,
 )
-from unstructured.partition.common import (
-    exactly_one,
+from unstructured.partition.common.common import exactly_one
+from unstructured.partition.common.metadata import (
    get_last_modified_date,
    get_last_modified_date_from_file,
 )
--- a/unstructured/partition/md.py
+++ b/unstructured/partition/md.py
@ -9,8 +9,8 @@ from unstructured.chunking import add_chunking_strategy
 from unstructured.documents.elements import Element, process_metadata
 from unstructured.file_utils.filetype import add_metadata_with_filetype
 from unstructured.file_utils.model import FileType
-from unstructured.partition.common import (
-    exactly_one,
+from unstructured.partition.common.common import exactly_one
+from unstructured.partition.common.metadata import (
    get_last_modified_date,
    get_last_modified_date_from_file,
 )
--- a/unstructured/partition/msg.py
+++ b/unstructured/partition/msg.py
@ -14,7 +14,7 @@ from unstructured.documents.elements import Element, ElementMetadata, process_me
 from unstructured.file_utils.filetype import add_metadata_with_filetype
 from unstructured.file_utils.model import FileType
 from unstructured.logger import logger
-from unstructured.partition.common import (
+from unstructured.partition.common.metadata import (
    get_last_modified_date,
    get_last_modified_date_from_file,
 )
--- a/unstructured/partition/odt.py
+++ b/unstructured/partition/odt.py
@ -8,7 +8,8 @@ from unstructured.chunking import add_chunking_strategy
 from unstructured.documents.elements import Element, process_metadata
 from unstructured.file_utils.filetype import add_metadata_with_filetype
 from unstructured.file_utils.model import FileType
-from unstructured.partition.common import exactly_one, get_last_modified
+from unstructured.partition.common.common import exactly_one
+from unstructured.partition.common.metadata import get_last_modified
 from unstructured.partition.docx import partition_docx
 from unstructured.utils import requires_dependencies

--- a/unstructured/partition/org.py
+++ b/unstructured/partition/org.py
@ -7,7 +7,8 @@ from unstructured.documents.elements import Element
 from unstructured.file_utils.file_conversion import convert_file_to_html_text_using_pandoc
 from unstructured.file_utils.filetype import add_metadata_with_filetype
 from unstructured.file_utils.model import FileType
-from unstructured.partition.common import exactly_one, get_last_modified
+from unstructured.partition.common.common import exactly_one
+from unstructured.partition.common.metadata import get_last_modified
 from unstructured.partition.html import partition_html

 DETECTION_ORIGIN: str = "org"
--- a/unstructured/partition/pdf.py
+++ b/unstructured/partition/pdf.py
@ -42,7 +42,7 @@ from unstructured.file_utils.filetype import add_metadata_with_filetype
 from unstructured.file_utils.model import FileType
 from unstructured.logger import logger, trace_logger
 from unstructured.nlp.patterns import PARAGRAPH_PATTERN
-from unstructured.partition.common import (
+from unstructured.partition.common.common import (
    document_to_element_list,
    exactly_one,
    ocr_data_to_elements,
--- a/unstructured/partition/pdf_image/pdf_image_utils.py
+++ b/unstructured/partition/pdf_image/pdf_image_utils.py
@ -17,16 +17,16 @@ from PIL import Image

 from unstructured.documents.elements import ElementType
 from unstructured.logger import logger
-from unstructured.partition.common import (
-    convert_to_bytes,
-    exactly_one,
+from unstructured.partition.common.common import convert_to_bytes, exactly_one
+from unstructured.partition.common.metadata import (
    get_last_modified_date,
    get_last_modified_date_from_file,
 )
 from unstructured.partition.utils.config import env_config

 if TYPE_CHECKING:
-    from unstructured_inference.inference.layout import DocumentLayout, PageLayout, TextRegion
+    from unstructured_inference.inference.elements import TextRegion
+    from unstructured_inference.inference.layout import DocumentLayout, PageLayout
    from unstructured_inference.inference.layoutelement import LayoutElement

    from unstructured.documents.elements import Element
--- a/unstructured/partition/ppt.py
+++ b/unstructured/partition/ppt.py
@ -8,9 +8,8 @@ from unstructured.chunking import add_chunking_strategy
 from unstructured.documents.elements import Element, process_metadata
 from unstructured.file_utils.filetype import add_metadata_with_filetype
 from unstructured.file_utils.model import FileType
-from unstructured.partition.common import (
-    convert_office_doc,
-    exactly_one,
+from unstructured.partition.common.common import convert_office_doc, exactly_one
+from unstructured.partition.common.metadata import (
    get_last_modified_date,
    get_last_modified_date_from_file,
 )
--- a/unstructured/partition/pptx.py
+++ b/unstructured/partition/pptx.py
@ -36,8 +36,8 @@ from unstructured.documents.elements import (
 )
 from unstructured.file_utils.filetype import add_metadata_with_filetype
 from unstructured.file_utils.model import FileType
-from unstructured.partition.common import (
-    convert_ms_office_table_to_text,
+from unstructured.partition.common.common import convert_ms_office_table_to_text
+from unstructured.partition.common.metadata import (
    get_last_modified_date,
    get_last_modified_date_from_file,
 )
--- a/unstructured/partition/rst.py
+++ b/unstructured/partition/rst.py
@ -7,7 +7,8 @@ from unstructured.documents.elements import Element, process_metadata
 from unstructured.file_utils.file_conversion import convert_file_to_html_text_using_pandoc
 from unstructured.file_utils.filetype import add_metadata_with_filetype
 from unstructured.file_utils.model import FileType
-from unstructured.partition.common import exactly_one, get_last_modified
+from unstructured.partition.common.common import exactly_one
+from unstructured.partition.common.metadata import get_last_modified
 from unstructured.partition.html import partition_html

 DETECTION_ORIGIN: str = "rst"
--- a/unstructured/partition/rtf.py
+++ b/unstructured/partition/rtf.py
@ -7,7 +7,8 @@ from unstructured.documents.elements import Element, process_metadata
 from unstructured.file_utils.file_conversion import convert_file_to_html_text_using_pandoc
 from unstructured.file_utils.filetype import add_metadata_with_filetype
 from unstructured.file_utils.model import FileType
-from unstructured.partition.common import exactly_one, get_last_modified
+from unstructured.partition.common.common import exactly_one
+from unstructured.partition.common.metadata import get_last_modified
 from unstructured.partition.html import partition_html

 DETECTION_ORIGIN: str = "rtf"
--- a/unstructured/partition/text.py
+++ b/unstructured/partition/text.py
@ -29,8 +29,8 @@ from unstructured.file_utils.filetype import add_metadata_with_filetype
 from unstructured.file_utils.model import FileType
 from unstructured.nlp.patterns import PARAGRAPH_PATTERN, UNICODE_BULLETS_RE
 from unstructured.nlp.tokenize import sent_tokenize
-from unstructured.partition.common import (
-    exactly_one,
+from unstructured.partition.common.common import exactly_one
+from unstructured.partition.common.metadata import (
    get_last_modified_date,
    get_last_modified_date_from_file,
 )
--- a/unstructured/partition/tsv.py
+++ b/unstructured/partition/tsv.py
@ -14,11 +14,13 @@ from unstructured.documents.elements import (
 )
 from unstructured.file_utils.filetype import add_metadata_with_filetype
 from unstructured.file_utils.model import FileType
-from unstructured.partition.common import (
+from unstructured.partition.common.common import (
    exactly_one,
+    spooled_to_bytes_io_if_needed,
+)
+from unstructured.partition.common.metadata import (
    get_last_modified_date,
    get_last_modified_date_from_file,
-    spooled_to_bytes_io_if_needed,
 )
 from unstructured.partition.lang import apply_lang_metadata

--- a/unstructured/partition/xlsx.py
+++ b/unstructured/partition/xlsx.py
@ -26,7 +26,10 @@ from unstructured.documents.elements import (
 )
 from unstructured.file_utils.filetype import add_metadata_with_filetype
 from unstructured.file_utils.model import FileType
-from unstructured.partition.common import get_last_modified_date, get_last_modified_date_from_file
+from unstructured.partition.common.metadata import (
+    get_last_modified_date,
+    get_last_modified_date_from_file,
+)
 from unstructured.partition.lang import apply_lang_metadata
 from unstructured.partition.text_type import (
    is_bulleted_text,
--- a/unstructured/partition/xml.py
+++ b/unstructured/partition/xml.py
@ -16,11 +16,13 @@ from unstructured.documents.elements import (
 from unstructured.file_utils.encoding import read_txt_file
 from unstructured.file_utils.filetype import add_metadata_with_filetype
 from unstructured.file_utils.model import FileType
-from unstructured.partition.common import (
+from unstructured.partition.common.common import (
    exactly_one,
+    spooled_to_bytes_io_if_needed,
+)
+from unstructured.partition.common.metadata import (
    get_last_modified_date,
    get_last_modified_date_from_file,
-    spooled_to_bytes_io_if_needed,
 )
 from unstructured.partition.lang import apply_lang_metadata
 from unstructured.partition.text import element_from_text
--- a/unstructured/staging/base.py
+++ b/unstructured/staging/base.py
@ -16,7 +16,7 @@ from unstructured.documents.elements import (
    Element,
    ElementMetadata,
 )
-from unstructured.partition.common import exactly_one
+from unstructured.partition.common.common import exactly_one
 from unstructured.utils import Point, dependency_exists, requires_dependencies

 if dependency_exists("pandas"):