rfctr(docx): extract DocxPartitionerOptions (#3018)

**Reviewers:** Probably easier to review first and second commits separately as the first one adds all the new code and tests (without installing it), and the second one installs it into the partitioner along with the required changes to code and tests. **Summary** Enable communication of partitioning options to sub-partitioners, in particular to the pluggable `PicturePartitioner` coming in a closely subsequent PR to implement image-extraction and OCR for DOCX, DOC, and ODT formats. **Additional Context** In general, validation of partitioning options as well as assigning default values and computing derived partitioning settings can be extracted from partitioners into a neatly encapsulated separate object. This simplifies the core partitioning code by removing the noise associated with computing metadata values and deciding how to access the source document, etc. However, better factoring aside, having the partition-time "settings" available in a single object allows partitioning of certain document features, for example images, to be readily _delegated_ to a sub-partitioner while still giving it access to all the relevant partitioning settings for the current document. This is particularly important when a sub-partitioner is "pluggable" at runtime and must rely on a clearly-defined (and as simple as possible) interface to operate smoothly.
2025-12-27 07:03:52 +00:00 · 2024-05-14 17:50:31 -07:00 · 2024-05-14 17:50:31 -07:00 · 12b30d2810
commit 12b30d2810
parent db186dc23b
4 changed files with 543 additions and 200 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,8 +1,8 @@
-## 0.13.8-dev7
+## 0.13.8-dev8

 ### Enhancements

-**Faster evaluation** Support for concurrent processing of documents during evaluation
+* **Faster evaluation** Support for concurrent processing of documents during evaluation

 ### Features

--- a/test_unstructured/partition/docx/test_docx.py
+++ b/test_unstructured/partition/docx/test_docx.py
@ -4,16 +4,26 @@

 from __future__ import annotations

+import io
 import pathlib
 import re
 import tempfile
+from typing import Any

 import docx
 import pytest
 from docx.document import Document
 from pytest_mock import MockFixture

-from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path
+from test_unstructured.unit_utils import (
+    FixtureRequest,
+    Mock,
+    assert_round_trips_through_JSON,
+    example_doc_path,
+    function_mock,
+    instance_mock,
+    property_mock,
+)
 from unstructured.chunking.title import chunk_by_title
 from unstructured.documents.elements import (
    Address,
@ -29,7 +39,7 @@ from unstructured.documents.elements import (
    Text,
    Title,
 )
-from unstructured.partition.docx import _DocxPartitioner, partition_docx
+from unstructured.partition.docx import DocxPartitionerOptions, _DocxPartitioner, partition_docx
 from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA

 # -- docx-file loading behaviors -----------------------------------------------------------------
@ -89,14 +99,16 @@ def test_partition_docx_from_file_with_metadata_filename(
        assert element.metadata.filename == "test"


-def test_partition_docx_raises_with_both_specified(mock_document_file_path: str):
-    with open(mock_document_file_path, "rb") as f:
-        with pytest.raises(ValueError, match="Exactly one of filename and file must be specified"):
-            partition_docx(filename=mock_document_file_path, file=f)
+def test_partition_docx_uses_file_path_when_both_are_specified(
+    mock_document_file_path: str, expected_elements: list[Text]
+):
+    f = io.BytesIO(b"abcde")
+    elements = partition_docx(filename=mock_document_file_path, file=f)
+    assert elements == expected_elements


 def test_partition_docx_raises_with_neither():
-    with pytest.raises(ValueError, match="Exactly one of filename and file must be specified"):
+    with pytest.raises(ValueError, match="either `filename` or `file` argument must be provided"):
        partition_docx()


@ -292,15 +304,13 @@ def test_partition_docx_from_file_without_metadata_date():
    assert elements[0].metadata.last_modified is None


-def test_get_emphasized_texts_from_paragraph(expected_emphasized_texts: list[dict[str, str]]):
-    partitioner = _DocxPartitioner(
-        example_doc_path("fake-doc-emphasized-text.docx"),
-        None,
-        None,
-        False,
-        True,
-        None,
-    )
+def test_get_emphasized_texts_from_paragraph(
+    opts_args: dict[str, Any], expected_emphasized_texts: list[dict[str, str]]
+):
+    opts_args["file_path"] = example_doc_path("fake-doc-emphasized-text.docx")
+    opts = DocxPartitionerOptions(**opts_args)
+    partitioner = _DocxPartitioner(opts)
+
    paragraph = partitioner._document.paragraphs[1]
    emphasized_texts = list(partitioner._iter_paragraph_emphasis(paragraph))
    assert paragraph.text == "I am a bold italic bold-italic text."
@ -317,34 +327,31 @@ def test_get_emphasized_texts_from_paragraph(expected_emphasized_texts: list[dic
    assert emphasized_texts == []


-def test_iter_table_emphasis(expected_emphasized_texts: list[dict[str, str]]):
-    partitioner = _DocxPartitioner(
-        example_doc_path("fake-doc-emphasized-text.docx"),
-        None,
-        None,
-        False,
-        True,
-        None,
-    )
+def test_iter_table_emphasis(
+    opts_args: dict[str, Any], expected_emphasized_texts: list[dict[str, str]]
+):
+    opts_args["file_path"] = example_doc_path("fake-doc-emphasized-text.docx")
+    opts = DocxPartitionerOptions(**opts_args)
+    partitioner = _DocxPartitioner(opts)
    table = partitioner._document.tables[0]
+
    emphasized_texts = list(partitioner._iter_table_emphasis(table))
+
    assert emphasized_texts == expected_emphasized_texts


 def test_table_emphasis(
+    opts_args: dict[str, Any],
    expected_emphasized_text_contents: list[str],
    expected_emphasized_text_tags: list[str],
 ):
-    partitioner = _DocxPartitioner(
-        example_doc_path("fake-doc-emphasized-text.docx"),
-        None,
-        None,
-        False,
-        True,
-        None,
-    )
+    opts_args["file_path"] = example_doc_path("fake-doc-emphasized-text.docx")
+    opts = DocxPartitionerOptions(**opts_args)
+    partitioner = _DocxPartitioner(opts)
    table = partitioner._document.tables[0]
+
    emphasized_text_contents, emphasized_text_tags = partitioner._table_emphasis(table)
+
    assert emphasized_text_contents == expected_emphasized_text_contents
    assert emphasized_text_tags == expected_emphasized_text_tags

@ -373,15 +380,10 @@ def test_partition_docx_with_json(mock_document_file_path: str):
    assert_round_trips_through_JSON(elements)


-def test_parse_category_depth_by_style():
-    partitioner = _DocxPartitioner(
-        example_doc_path("category-level.docx"),
-        None,
-        None,
-        False,
-        True,
-        None,
-    )
+def test_parse_category_depth_by_style(opts_args: dict[str, Any]):
+    opts_args["file_path"] = example_doc_path("category-level.docx")
+    opts = DocxPartitionerOptions(**opts_args)
+    partitioner = _DocxPartitioner(opts)

    # Category depths are 0-indexed and relative to the category type
    # Title, list item, bullet, narrative text, etc.
@ -411,9 +413,9 @@ def test_parse_category_depth_by_style():
        ), f"expected paragraph[{idx}] to have depth=={depth}, got {actual_depth}"


-def test_parse_category_depth_by_style_name():
-    partitioner = _DocxPartitioner(None, None, None, False, True, None)
-
+def test_parse_category_depth_by_style_name(opts_args: dict[str, Any]):
+    opts = DocxPartitionerOptions(**opts_args)
+    partitioner = _DocxPartitioner(opts)
    test_cases = [
        (0, "Heading 1"),
        (1, "Heading 2"),
@ -436,8 +438,9 @@ def test_parse_category_depth_by_style_name():
        ), f"test case {test_cases[idx]} failed"


-def test_parse_category_depth_by_style_ilvl():
-    partitioner = _DocxPartitioner(None, None, None, False, True, None)
+def test_parse_category_depth_by_style_ilvl(opts_args: dict[str, Any]):
+    opts = DocxPartitionerOptions(**opts_args)
+    partitioner = _DocxPartitioner(opts)
    assert partitioner._parse_category_depth_by_style_ilvl() == 0


@ -683,6 +686,24 @@ def mock_document_file_path(mock_document: Document, tmp_path: pathlib.Path) ->
    return filename


+@pytest.fixture()
+def opts_args() -> dict[str, Any]:
+    """All default arguments for `DocxPartitionerOptions`.
+
+    Individual argument values can be changed to suit each test. Makes construction of opts more
+    compact for testing purposes.
+    """
+    return {
+        "date_from_file_object": False,
+        "file": None,
+        "file_path": None,
+        "include_page_breaks": True,
+        "infer_table_structure": True,
+        "metadata_file_path": None,
+        "metadata_last_modified": None,
+    }
+
+
 # ================================================================================================
 # ISOLATED UNIT TESTS
 # ================================================================================================
@ -691,14 +712,280 @@ def mock_document_file_path(mock_document: Document, tmp_path: pathlib.Path) ->
 # ================================================================================================


+class DescribeDocxPartitionerOptions:
+    """Unit-test suite for `unstructured.partition.docx.DocxPartitionerOptions` objects."""
+
+    # -- .document -------------------------------
+
+    def it_loads_the_docx_document(
+        self,
+        request: FixtureRequest,
+        opts_args: dict[str, Any],
+    ):
+        document_ = instance_mock(request, Document)
+        docx_Document_ = function_mock(
+            request, "unstructured.partition.docx.docx.Document", return_value=document_
+        )
+        _docx_file_prop_ = property_mock(
+            request, DocxPartitionerOptions, "_docx_file", return_value="abcde.docx"
+        )
+        opts = DocxPartitionerOptions(**opts_args)
+
+        document = opts.document
+
+        _docx_file_prop_.assert_called_once_with()
+        docx_Document_.assert_called_once_with("abcde.docx")
+        assert document is document_
+
+    # -- .include_page_breaks --------------------
+
+    @pytest.mark.parametrize("arg_value", [True, False])
+    def it_knows_whether_to_emit_PageBreak_elements_as_part_of_the_output_element_stream(
+        self, arg_value: bool, opts_args: dict[str, Any]
+    ):
+        opts_args["include_page_breaks"] = arg_value
+        opts = DocxPartitionerOptions(**opts_args)
+
+        assert opts.include_page_breaks is arg_value
+
+    # -- .infer_table_structure ------------------
+
+    @pytest.mark.parametrize("arg_value", [True, False])
+    def it_knows_whether_to_include_text_as_html_in_Table_metadata(
+        self, arg_value: bool, opts_args: dict[str, Any]
+    ):
+        opts_args["infer_table_structure"] = arg_value
+        opts = DocxPartitionerOptions(**opts_args)
+
+        assert opts.infer_table_structure is arg_value
+
+    # -- .increment_page_number() ----------------
+
+    def it_generates_a_PageBreak_element_when_the_page_number_is_incremented(
+        self, opts_args: dict[str, Any]
+    ):
+        opts = DocxPartitionerOptions(**opts_args)
+
+        page_break_iter = opts.increment_page_number()
+
+        assert isinstance(next(page_break_iter, None), PageBreak)
+        assert opts.page_number == 2
+        with pytest.raises(StopIteration):
+            next(page_break_iter)
+
+    def but_it_does_not_generate_a_PageBreak_element_when_include_page_breaks_option_is_off(
+        self, opts_args: dict[str, Any]
+    ):
+        opts_args["include_page_breaks"] = False
+        opts = DocxPartitionerOptions(**opts_args)
+
+        page_break_iter = opts.increment_page_number()
+
+        with pytest.raises(StopIteration):
+            next(page_break_iter)
+        assert opts.page_number == 2
+
+    # -- .last_modified --------------------------
+
+    def it_gets_the_last_modified_date_of_the_document_from_the_caller_when_provided(
+        self, opts_args: dict[str, Any]
+    ):
+        opts_args["metadata_last_modified"] = "2024-03-05T17:02:53"
+        opts = DocxPartitionerOptions(**opts_args)
+
+        assert opts.last_modified == "2024-03-05T17:02:53"
+
+    def and_it_falls_back_to_the_last_modified_date_of_the_file_when_a_path_is_provided(
+        self, opts_args: dict[str, Any], get_last_modified_date_: Mock
+    ):
+        opts_args["file_path"] = "a/b/document.docx"
+        get_last_modified_date_.return_value = "2024-04-02T20:32:35"
+        opts = DocxPartitionerOptions(**opts_args)
+
+        last_modified = opts.last_modified
+
+        get_last_modified_date_.assert_called_once_with("a/b/document.docx")
+        assert last_modified == "2024-04-02T20:32:35"
+
+    def and_it_falls_back_to_the_last_modified_date_of_the_file_when_a_file_like_object_is_provided(
+        self, opts_args: dict[str, Any], get_last_modified_date_from_file_: Mock
+    ):
+        file = io.BytesIO(b"abcdefg")
+        opts_args["file"] = file
+        opts_args["date_from_file_object"] = True
+        get_last_modified_date_from_file_.return_value = "2024-04-02T20:42:07"
+        opts = DocxPartitionerOptions(**opts_args)
+
+        last_modified = opts.last_modified
+
+        get_last_modified_date_from_file_.assert_called_once_with(file)
+        assert last_modified == "2024-04-02T20:42:07"
+
+    def but_it_falls_back_to_None_for_the_last_modified_date_when_date_from_file_object_is_False(
+        self, opts_args: dict[str, Any], get_last_modified_date_from_file_: Mock
+    ):
+        file = io.BytesIO(b"abcdefg")
+        opts_args["file"] = file
+        opts_args["date_from_file_object"] = False
+        get_last_modified_date_from_file_.return_value = "2024-04-02T20:42:07"
+        opts = DocxPartitionerOptions(**opts_args)
+
+        last_modified = opts.last_modified
+
+        get_last_modified_date_from_file_.assert_not_called()
+        assert last_modified is None
+
+    # -- .metadata_file_path ---------------------
+
+    def it_uses_the_user_provided_file_path_in_the_metadata_when_provided(
+        self, opts_args: dict[str, Any]
+    ):
+        opts_args["file_path"] = "x/y/z.docx"
+        opts_args["metadata_file_path"] = "a/b/c.docx"
+        opts = DocxPartitionerOptions(**opts_args)
+
+        assert opts.metadata_file_path == "a/b/c.docx"
+
+    @pytest.mark.parametrize("file_path", ["u/v/w.docx", None])
+    def and_it_falls_back_to_the_document_file_path_otherwise(
+        self, file_path: str | None, opts_args: dict[str, Any]
+    ):
+        opts_args["file_path"] = file_path
+        opts_args["metadata_file_path"] = None
+        opts = DocxPartitionerOptions(**opts_args)
+
+        assert opts.metadata_file_path == file_path
+
+    # -- .metadata_page_number -------------------
+
+    @pytest.mark.parametrize(
+        ("page_count", "document_contains_pagebreaks", "expected_value"),
+        [(7, True, 7), (1, False, None)],
+    )
+    def it_reports_None_when_no_rendered_page_breaks_are_found_in_document(
+        self,
+        request: FixtureRequest,
+        opts_args: dict[str, Any],
+        page_count: int,
+        document_contains_pagebreaks: bool,
+        expected_value: int | None,
+    ):
+        _document_contains_pagebreaks_prop_ = property_mock(
+            request,
+            DocxPartitionerOptions,
+            "_document_contains_pagebreaks",
+            return_value=document_contains_pagebreaks,
+        )
+        opts = DocxPartitionerOptions(**opts_args)
+        opts._page_counter = page_count
+
+        metadata_page_number = opts.metadata_page_number
+
+        _document_contains_pagebreaks_prop_.assert_called_once_with()
+        assert metadata_page_number is expected_value
+
+    # -- .page_number ----------------------------
+
+    def it_keeps_track_of_the_page_number(self, opts_args: dict[str, Any]):
+        """Page-number starts at 1 and advances on each `.increment_page_number()` call."""
+        opts = DocxPartitionerOptions(**opts_args)
+
+        assert opts.page_number == 1
+        list(opts.increment_page_number())
+        assert opts.page_number == 2
+        list(opts.increment_page_number())
+        assert opts.page_number == 3
+
+    def it_assigns_the_correct_page_number_when_starting_page_number_is_given(
+        self, opts_args: dict[str, Any]
+    ):
+        opts = DocxPartitionerOptions(**opts_args, starting_page_number=3)
+
+        assert opts.page_number == 3
+        list(opts.increment_page_number())
+        assert opts.page_number == 4
+
+    # -- ._document_contains_pagebreaks ----------
+
+    @pytest.mark.parametrize(
+        ("file_name", "expected_value"), [("page-breaks.docx", True), ("teams_chat.docx", False)]
+    )
+    def it_knows_whether_the_document_contains_page_breaks(
+        self, opts_args: dict[str, Any], file_name: str, expected_value: bool
+    ):
+        opts_args["file_path"] = example_doc_path(file_name)
+        opts = DocxPartitionerOptions(**opts_args)
+
+        assert opts._document_contains_pagebreaks is expected_value
+
+    # -- ._docx_file -----------------------------
+
+    def it_uses_the_path_to_open_the_presentation_when_file_path_is_provided(
+        self, opts_args: dict[str, Any]
+    ):
+        opts_args["file_path"] = "l/m/n.docx"
+        opts = DocxPartitionerOptions(**opts_args)
+
+        assert opts._docx_file == "l/m/n.docx"
+
+    def and_it_uses_a_BytesIO_file_to_replaces_a_SpooledTemporaryFile_provided(
+        self, opts_args: dict[str, Any]
+    ):
+        spooled_temp_file = tempfile.SpooledTemporaryFile()
+        spooled_temp_file.write(b"abcdefg")
+        opts_args["file"] = spooled_temp_file
+        opts = DocxPartitionerOptions(**opts_args)
+
+        docx_file = opts._docx_file
+
+        assert docx_file is not spooled_temp_file
+        assert isinstance(docx_file, io.BytesIO)
+        assert docx_file.getvalue() == b"abcdefg"
+
+    def and_it_uses_the_provided_file_directly_when_not_a_SpooledTemporaryFile(
+        self, opts_args: dict[str, Any]
+    ):
+        file = io.BytesIO(b"abcdefg")
+        opts_args["file"] = file
+        opts = DocxPartitionerOptions(**opts_args)
+
+        docx_file = opts._docx_file
+
+        assert docx_file is file
+        assert isinstance(docx_file, io.BytesIO)
+        assert docx_file.getvalue() == b"abcdefg"
+
+    def but_it_raises_ValueError_when_neither_a_file_path_or_file_is_provided(
+        self, opts_args: dict[str, Any]
+    ):
+        opts = DocxPartitionerOptions(**opts_args)
+
+        with pytest.raises(ValueError, match="No DOCX document specified, either `filename` or "):
+            opts._docx_file
+
+    # -- fixtures --------------------------------------------------------------------------------
+
+    @pytest.fixture()
+    def get_last_modified_date_(self, request: FixtureRequest) -> Mock:
+        return function_mock(request, "unstructured.partition.docx.get_last_modified_date")
+
+    @pytest.fixture()
+    def get_last_modified_date_from_file_(self, request: FixtureRequest):
+        return function_mock(
+            request, "unstructured.partition.docx.get_last_modified_date_from_file"
+        )
+
+
 class Describe_DocxPartitioner:
    """Unit-test suite for `unstructured.partition.docx._DocxPartitioner`."""

    # -- table behaviors -------------------------------------------------------------------------

-    def it_can_convert_a_table_to_html(self):
+    def it_can_convert_a_table_to_html(self, opts_args: dict[str, Any]):
+        opts = DocxPartitionerOptions(**opts_args)
        table = docx.Document(example_doc_path("docx-tables.docx")).tables[0]
-        assert _DocxPartitioner()._convert_table_to_html(table) == (
+
+        assert _DocxPartitioner(opts)._convert_table_to_html(table) == (
            "<table>\n"
            "<thead>\n"
            "<tr><th>Header Col 1  </th><th>Header Col 2  </th></tr>\n"
@ -709,7 +996,7 @@ class Describe_DocxPartitioner:
            "</table>"
        )

-    def and_it_can_convert_a_nested_table_to_html(self):
+    def and_it_can_convert_a_nested_table_to_html(self, opts_args: dict[str, Any]):
        """
        Fixture table is:

@ -725,10 +1012,11 @@ class Describe_DocxPartitioner:
            | j |      k      | l |
            +---+-------------+---+
        """
+        opts = DocxPartitionerOptions(**opts_args)
        table = docx.Document(example_doc_path("docx-tables.docx")).tables[1]

        # -- re.sub() strips out the extra padding inserted by tabulate --
-        html = re.sub(r" +<", "<", _DocxPartitioner()._convert_table_to_html(table))
+        html = re.sub(r" +<", "<", _DocxPartitioner(opts)._convert_table_to_html(table))

        expected_lines = [
            "<table>",
@ -750,13 +1038,15 @@ class Describe_DocxPartitioner:
        for expected, actual in zip(expected_lines, actual_lines):
            assert actual == expected, f"\nexpected: {repr(expected)}\nactual:   {repr(actual)}"

-    def it_can_convert_a_table_to_plain_text(self):
+    def it_can_convert_a_table_to_plain_text(self, opts_args: dict[str, Any]):
+        opts = DocxPartitionerOptions(**opts_args)
        table = docx.Document(example_doc_path("docx-tables.docx")).tables[0]
-        assert " ".join(_DocxPartitioner()._iter_table_texts(table)) == (
+
+        assert " ".join(_DocxPartitioner(opts)._iter_table_texts(table)) == (
            "Header Col 1 Header Col 2 Lorem ipsum A link example"
        )

-    def and_it_can_convert_a_nested_table_to_plain_text(self):
+    def and_it_can_convert_a_nested_table_to_plain_text(self, opts_args: dict[str, Any]):
        """
        Fixture table is:

@ -772,12 +1062,14 @@ class Describe_DocxPartitioner:
            | j |      k      | l |
            +---+-------------+---+
        """
+        opts = DocxPartitionerOptions(**opts_args)
        table = docx.Document(example_doc_path("docx-tables.docx")).tables[1]
-        assert " ".join(_DocxPartitioner()._iter_table_texts(table)) == (
+
+        assert " ".join(_DocxPartitioner(opts)._iter_table_texts(table)) == (
            "a >b< c d e f g&t h i j k l"
        )

-    def but_the_text_of_a_merged_cell_appears_only_once(self):
+    def but_the_text_of_a_merged_cell_appears_only_once(self, opts_args: dict[str, Any]):
        """
        Fixture table is:

@ -789,8 +1081,9 @@ class Describe_DocxPartitioner:
            | e     |   |
            +-------+---+
        """
+        opts = DocxPartitionerOptions(**opts_args)
        table = docx.Document(example_doc_path("docx-tables.docx")).tables[2]
-        assert " ".join(_DocxPartitioner()._iter_table_texts(table)) == "a b c d e"
+        assert " ".join(_DocxPartitioner(opts)._iter_table_texts(table)) == "a b c d e"

    def it_can_partition_tables_with_incomplete_rows(self):
        """DOCX permits table rows to start late and end early.
@ -921,7 +1214,7 @@ class Describe_DocxPartitioner:

    # -- page-break behaviors --------------------------------------------------------------------

-    def it_places_page_breaks_precisely_where_they_occur(self):
+    def it_places_page_breaks_precisely_where_they_occur(self, opts_args: dict[str, Any]):
        """Page-break behavior has some subtleties.

        * A hard page-break does not generate a PageBreak element (because that would double-count
@ -940,6 +1233,8 @@ class Describe_DocxPartitioner:
            """A more detailed `repr()` to aid debugging when assertion fails."""
            return f"{e.__class__.__name__}('{e}')"

+        opts_args["file_path"] = example_doc_path("page-breaks.docx")
+        opts = DocxPartitionerOptions(**opts_args)
        expected = [
            # NOTE(scanny) - -- page 1 --
            NarrativeText(
@ -975,7 +1270,7 @@ class Describe_DocxPartitioner:
            Title("<<and then more text proceeds."),
        ]

-        elements = _DocxPartitioner.iter_document_elements(example_doc_path("page-breaks.docx"))
+        elements = _DocxPartitioner.iter_document_elements(opts)

        for idx, e in enumerate(elements):
            assert e == expected[idx], (
@ -986,8 +1281,10 @@ class Describe_DocxPartitioner:

    # -- header/footer behaviors -----------------------------------------------------------------

-    def it_includes_table_cell_text_in_Header_text(self):
-        partitioner = _DocxPartitioner(example_doc_path("docx-hdrftr.docx"))
+    def it_includes_table_cell_text_in_Header_text(self, opts_args: dict[str, Any]):
+        opts_args["file_path"] = example_doc_path("docx-hdrftr.docx")
+        opts = DocxPartitionerOptions(**opts_args)
+        partitioner = _DocxPartitioner(opts)
        section = partitioner._document.sections[0]

        header_iter = partitioner._iter_section_headers(section)
@ -995,9 +1292,11 @@ class Describe_DocxPartitioner:
        element = next(header_iter)
        assert element.text == "First header para\nTable cell1 Table cell2\nLast header para"

-    def it_includes_table_cell_text_in_Footer_text(self):
+    def it_includes_table_cell_text_in_Footer_text(self, opts_args: dict[str, Any]):
        """This case also verifies nested-table and merged-cell behaviors."""
-        partitioner = _DocxPartitioner(example_doc_path("docx-hdrftr.docx"))
+        opts_args["file_path"] = example_doc_path("docx-hdrftr.docx")
+        opts = DocxPartitionerOptions(**opts_args)
+        partitioner = _DocxPartitioner(opts)
        section = partitioner._document.sections[0]

        footer_iter = partitioner._iter_section_footers(section)
--- a/unstructured/version.py
+++ b/unstructured/version.py
@ -1 +1 @@
-__version__ = "0.13.8-dev7"  # pragma: no cover
+__version__ = "0.13.8-dev8"  # pragma: no cover
--- a/unstructured/partition/docx.py
+++ b/unstructured/partition/docx.py
@ -217,19 +217,19 @@ def partition_docx(
        Assign this number to the first page of this document and increment the page number from
        there.
    """
-    # -- verify that only one file-specifier argument was provided --
-    exactly_one(filename=filename, file=file)
-
-    elements = _DocxPartitioner.iter_document_elements(
-        filename,
-        file,
-        metadata_filename,
-        include_page_breaks,
-        infer_table_structure,
-        metadata_last_modified,
-        date_from_file_object,
+    opts = DocxPartitionerOptions(
+        date_from_file_object=date_from_file_object,
+        file=file,
+        file_path=filename,
+        include_page_breaks=include_page_breaks,
+        infer_table_structure=infer_table_structure,
+        metadata_file_path=metadata_filename,
+        metadata_last_modified=metadata_last_modified,
        starting_page_number=starting_page_number,
    )
+
+    elements = _DocxPartitioner.iter_document_elements(opts)
+
    elements = apply_lang_metadata(
        elements=elements,
        languages=languages,
@ -238,56 +238,169 @@ def partition_docx(
    return list(elements)


-class _DocxPartitioner:
-    """Provides `.partition()` for MS-Word 2007+ (.docx) files."""
+class DocxPartitionerOptions:
+    """Encapsulates partitioning option validation, computation, and application of defaults."""

    def __init__(
        self,
-        # -- NOTE(scanny): default values here are unnecessary for production use because
-        # -- `.iter_document_elements()` is the only interface method and always calls with all
-        # -- args. However, providing defaults eases unit-testing and decouples unit-tests from
-        # -- future changes to args.
-        filename: Optional[str] = None,
-        file: Optional[IO[bytes]] = None,
-        metadata_filename: Optional[str] = None,
-        include_page_breaks: bool = True,
-        infer_table_structure: bool = True,
-        metadata_last_modified: Optional[str] = None,
-        date_from_file_object: bool = False,
+        *,
+        date_from_file_object: bool,
+        file: IO[bytes] | None,
+        file_path: str | None,
+        include_page_breaks: bool,
+        infer_table_structure: bool,
+        metadata_file_path: Optional[str],
+        metadata_last_modified: Optional[str],
        starting_page_number: int = 1,
-    ) -> None:
-        self._filename = filename
+    ):
+        self._date_from_file_object = date_from_file_object
        self._file = file
-        self._metadata_filename = metadata_filename
+        self._file_path = file_path
        self._include_page_breaks = include_page_breaks
        self._infer_table_structure = infer_table_structure
+        self._metadata_file_path = metadata_file_path
        self._metadata_last_modified = metadata_last_modified
+        # -- options object maintains page-number state --
        self._page_counter = starting_page_number
-        self._date_from_file_object = date_from_file_object
+
+    @lazyproperty
+    def document(self) -> Document:
+        """The python-docx `Document` object loaded from file or filename."""
+        return docx.Document(self._docx_file)
+
+    @lazyproperty
+    def include_page_breaks(self) -> bool:
+        """When True, include `PageBreak` elements in element-stream.
+
+        Note that regardless of this setting, page-breaks are detected, and page-number is tracked
+        and included in element metadata. Only the presence of distinct `PageBreak` elements (which
+        contain no text) in the element stream is affected.
+        """
+        return self._include_page_breaks
+
+    def increment_page_number(self) -> Iterator[PageBreak]:
+        """Increment page-number by 1 and generate a PageBreak element if enabled."""
+        self._page_counter += 1
+        # -- only emit page-breaks when enabled --
+        if self._include_page_breaks:
+            yield PageBreak("", detection_origin=DETECTION_ORIGIN)
+
+    @lazyproperty
+    def infer_table_structure(self) -> bool:
+        """True when partitioner should compute and apply `text_as_html` metadata for tables."""
+        return self._infer_table_structure
+
+    @lazyproperty
+    def last_modified(self) -> Optional[str]:
+        """The best last-modified date available, None if no sources are available."""
+        # -- Value explicitly specified by caller takes precedence. This is used for example when
+        # -- this file was converted from another format, and any last-modified date for the file
+        # -- would be just now.
+        if self._metadata_last_modified:
+            return self._metadata_last_modified
+
+        if self._file_path:
+            return (
+                None
+                if is_temp_file_path(self._file_path)
+                else get_last_modified_date(self._file_path)
+            )
+
+        if self._file:
+            return (
+                get_last_modified_date_from_file(self._file)
+                if self._date_from_file_object
+                else None
+            )
+
+        return None
+
+    @lazyproperty
+    def metadata_file_path(self) -> str | None:
+        """The best available file-path for this document or `None` if unavailable."""
+        return self._metadata_file_path or self._file_path
+
+    @property
+    def metadata_page_number(self) -> Optional[int]:
+        """Page number suitable for element metadata, `None` when it can't be determined.
+
+        When no page-breaks can be found in the document (possibly a common case), no page
+        number is added to element metadata, so `None` is returned.
+
+        Page numbering for DOCX is best-efforts only. Actual page-breaks are decided at
+        rendering time (e.g. printing) from the font-metrics of the target device. Explicit
+        (hard) page-breaks are always recorded in the docx file, whereas rendered page-breaks are
+        only added optionally.
+        """
+        if not self._document_contains_pagebreaks:
+            return None
+        return self._page_counter
+
+    @property
+    def page_number(self) -> int:
+        """Current value of the running page counter.
+
+        This may differ from the actual rendered page number when the document carries no
+        rendered page-break indicators (not uncommon). For metadata purposes use
+        `.metadata_page_number` instead, which is `None` when rendered page-breaks are not
+        present in this document.
+        """
+        return self._page_counter
+
+    @lazyproperty
+    def _document_contains_pagebreaks(self) -> bool:
+        """True when at least one page-break is detected in the document.
+
+        `w:lastRenderedPageBreak` elements are the only reliable page-break indicator. Microsoft
+        Word inserts them reliably, but they probably don't appear in documents converted into
+        .docx format from, for example, .odt format.
+        """
+        # NOTE(scanny) - w:lastRenderedPageBreak (lrpb) is run (w:r) inner content. `w:r` can
+        # appear in a paragraph (w:p). w:r can also appear in a hyperlink (w:hyperlink), which
+        # is w:p inner-content and both of these can occur inside a table-cell as well as the
+        # document body
+        lrpb_paths = (
+            "./w:body/w:p/w:r/w:lastRenderedPageBreak",
+            "./w:body/w:p/w:hyperlink/w:r/w:lastRenderedPageBreak",
+            "./w:body/w:tbl/w:tr/w:tc/w:p/w:r/w:lastRenderedPageBreak",
+            "./w:body/w:tbl/w:tr/w:tc/w:p/w:hyperlink/w:r/w:lastRenderedPageBreak",
+        )
+        # -- a single XPath union expression covering all four locations --
+        xpath = " | ".join(lrpb_paths)
+
+        rendered_page_breaks = self.document.element.xpath(xpath)
+        return bool(rendered_page_breaks)
+
+    @lazyproperty
+    def _docx_file(self) -> str | IO[bytes]:
+        """The Word 2007+ document file to be partitioned, as a path or a file-like object.
+
+        `python-docx` accepts either a `str` path or a file-like object for opening a document
+        file. The path is preferred when both are available.
+
+        Raises `ValueError` when neither a file-path nor a file-like object was provided.
+        """
+        path = self._file_path
+        if path:
+            return path
+
+        file = self._file
+
+        # -- In Python <3.11 SpooledTemporaryFile does not implement ".seekable" which triggers an
+        # -- exception when Zipfile tries to open it. The docx format is a zip archive so the
+        # -- contents are copied into a BytesIO object to work around that bug.
+        if isinstance(file, tempfile.SpooledTemporaryFile):
+            file.seek(0)
+            return io.BytesIO(file.read())
+
+        if not file:
+            raise ValueError(
+                "No DOCX document specified, either `filename` or `file` argument must be provided"
+            )
+
+        return file
+
+
+class _DocxPartitioner:
+    """Provides `.partition()` for MS-Word 2007+ (.docx) files."""
+
+    def __init__(self, opts: DocxPartitionerOptions) -> None:
+        # -- partition-time options, consulted by this class's methods via `self._opts` --
+        self._opts = opts

    @classmethod
-    def iter_document_elements(
-        cls,
-        filename: Optional[str] = None,
-        file: Optional[IO[bytes]] = None,
-        metadata_filename: Optional[str] = None,
-        include_page_breaks: bool = True,
-        infer_table_structure: bool = True,
-        metadata_last_modified: Optional[str] = None,
-        date_from_file_object: bool = False,
-        starting_page_number: int = 1,
-    ) -> Iterator[Element]:
+    def iter_document_elements(cls, opts: DocxPartitionerOptions) -> Iterator[Element]:
        """Partition MS Word documents (.docx format) into its document elements."""
-        self = cls(
-            filename=filename,
-            file=file,
-            metadata_filename=metadata_filename,
-            include_page_breaks=include_page_breaks,
-            infer_table_structure=infer_table_structure,
-            metadata_last_modified=metadata_last_modified,
-            date_from_file_object=date_from_file_object,
-            starting_page_number=starting_page_number,
-        )
+        self = cls(opts)
        # NOTE(scanny): It's possible for a Word document to have no sections. In particular, a
        # Microsoft Teams chat transcript exported to DOCX contains no sections. Such a
        # "section-less" document has to be interated differently and has no headers or footers and
@ -452,37 +565,7 @@ class _DocxPartitioner:
    @lazyproperty
    def _document(self) -> Document:
        """The python-docx `Document` object loaded from file or filename."""
-        filename, file = self._filename, self._file
-
-        if filename is not None:
-            return docx.Document(filename)
-
-        assert file is not None
-        if isinstance(file, tempfile.SpooledTemporaryFile):
-            file.seek(0)
-            file = io.BytesIO(file.read())
-        return docx.Document(file)
-
-    @lazyproperty
-    def _document_contains_pagebreaks(self) -> bool:
-        """True when there is at least one page-break detected in the document.
-
-        Only `w:lastRenderedPageBreak` elements reliably indicate a page-break. These are reliably
-        inserted by Microsoft Word, but probably don't appear in documents converted into .docx
-        format from for example .odt format.
-        """
-        xpath = (
-            # NOTE(scanny) - w:lastRenderedPageBreak (lrpb) is run (w:r) inner content. `w:r` can
-            # appear in a paragraph (w:p). w:r can also appear in a hyperlink (w:hyperlink), which
-            # is w:p inner-content and both of these can occur inside a table-cell as well as the
-            # document body
-            "./w:body/w:p/w:r/w:lastRenderedPageBreak"
-            " | ./w:body/w:p/w:hyperlink/w:r/w:lastRenderedPageBreak"
-            " | ./w:body/w:tbl/w:tr/w:tc/w:p/w:r/w:lastRenderedPageBreak"
-            " | ./w:body/w:tbl/w:tr/w:tc/w:p/w:hyperlink/w:r/w:lastRenderedPageBreak"
-        )
-
-        return bool(self._document.element.xpath(xpath))
+        return self._opts.document

    @lazyproperty
    def _document_contains_sections(self) -> bool:
@ -524,12 +607,6 @@ class _DocxPartitioner:

        return "\n".join(text for text in iter_hdrftr_texts(hdrftr) if text)

-    def _increment_page_number(self) -> Iterator[PageBreak]:
-        """Increment page-number by 1 and generate a PageBreak element if enabled."""
-        self._page_counter += 1
-        if self._include_page_breaks:
-            yield PageBreak("", detection_origin=DETECTION_ORIGIN)
-
    def _is_list_item(self, paragraph: Paragraph) -> bool:
        """True when `paragraph` can be identified as a list-item."""
        if is_bulleted_text(paragraph.text):
@ -581,7 +658,7 @@ class _DocxPartitioner:
            if isinstance(item, Paragraph):
                yield from self._classify_paragraph_to_element(item)
            else:
-                yield from self._increment_page_number()
+                yield from self._opts.increment_page_number()

    def _iter_paragraph_emphasis(self, paragraph: Paragraph) -> Iterator[dict[str, str]]:
        """Generate e.g. {"text": "MUST", "tag": "b"} for each emphasis in `paragraph`."""
@ -616,7 +693,7 @@ class _DocxPartitioner:
                text=text,
                detection_origin=DETECTION_ORIGIN,
                metadata=ElementMetadata(
-                    filename=self._metadata_filename,
+                    filename=self._opts.metadata_file_path,
                    header_footer_type=header_footer_type,
                    category_depth=0,
                ),
@ -645,7 +722,7 @@ class _DocxPartitioner:
                text=text,
                detection_origin=DETECTION_ORIGIN,
                metadata=ElementMetadata(
-                    filename=self._metadata_filename,
+                    filename=self._opts.metadata_file_path,
                    header_footer_type=header_footer_type,
                    category_depth=0,  # -- headers are always at the root level}
                ),
@ -668,7 +745,7 @@ class _DocxPartitioner:
        """

        def page_is_odd() -> bool:
-            return self._page_counter % 2 == 1
+            return self._opts.page_number % 2 == 1

        start_type = section.start_type

@ -682,14 +759,14 @@ class _DocxPartitioner:
            # -- on an even page we need two total, add one to supplement the rendered page break
            # -- to follow. There is no "first-document-page" special case because 1 is odd.
            if not page_is_odd():
-                yield from self._increment_page_number()
+                yield from self._opts.increment_page_number()

        elif start_type == WD_SECTION_START.ODD_PAGE:
            # -- the first page of the document is an implicit "new" odd-page, so no page-break --
            if section_idx == 0:
                return
            if page_is_odd():
-                yield from self._increment_page_number()
+                yield from self._opts.increment_page_number()

        # -- otherwise, start-type is one of "continuous", "new-column", or "next-page", none of
        # -- which need our help to get the page-breaks right.
@ -699,7 +776,9 @@ class _DocxPartitioner:
        """Generate zero-or-one Table element for a DOCX `w:tbl` XML element."""
        # -- at present, we always generate exactly one Table element, but we might want
        # -- to skip, for example, an empty table.
-        html_table = self._convert_table_to_html(table) if self._infer_table_structure else None
+        html_table = (
+            self._convert_table_to_html(table) if self._opts.infer_table_structure else None
+        )
        text_table = " ".join(self._iter_table_texts(table))
        emphasized_text_contents, emphasized_text_tags = self._table_emphasis(table)

@ -708,9 +787,9 @@ class _DocxPartitioner:
            detection_origin=DETECTION_ORIGIN,
            metadata=ElementMetadata(
                text_as_html=html_table,
-                filename=self._metadata_filename,
-                page_number=self._page_number,
-                last_modified=self._last_modified,
+                filename=self._opts.metadata_file_path,
+                page_number=self._opts.metadata_page_number,
+                last_modified=self._opts.last_modified,
                emphasized_text_contents=emphasized_text_contents or None,
                emphasized_text_tags=emphasized_text_tags or None,
            ),
@ -753,41 +832,6 @@ class _DocxPartitioner:
                # -- do not generate empty strings --
                yield from (text for text in iter_cell_texts(_Cell(tc, table)) if text)

-    @lazyproperty
-    def _last_modified(self) -> Optional[str]:
-        """Last-modified date suitable for use in element metadata."""
-        # -- if this file was converted from another format, any last-modified date for the file
-        # -- will be today, so we get it from the conversion step in `._metadata_last_modified`.
-        if self._metadata_last_modified:
-            return self._metadata_last_modified
-
-        file_path, file = self._filename, self._file
-
-        # -- if the file is on the filesystem, get its date from there --
-        if file_path is not None:
-            return None if is_temp_file_path(file_path) else get_last_modified_date(file_path)
-
-        # -- otherwise, as long as user explicitly requested it, try getting it from the file-like
-        # -- object (unlikely since BytesIO and its brethren have no such metadata).
-        assert file is not None
-        if self._date_from_file_object:
-            return get_last_modified_date_from_file(file)
-        return None
-
-    @property
-    def _page_number(self) -> Optional[int]:
-        """The current page number, or None if we can't really tell.
-
-        Page numbers are not added to element metadata if we can't find any page-breaks in the
-        document (which may be a common case).
-
-        In the DOCX format, determining page numbers is strictly a best-efforts attempt since actual
-        page-breaks are determined at rendering time (e.g. printing) based on the fontmetrics of the
-        target device. Explicit (hard) page-breaks are always recorded in the docx file but the
-        rendered page-breaks are only added optionally.
-        """
-        return self._page_counter if self._document_contains_pagebreaks else None
-
    def _paragraph_emphasis(self, paragraph: Paragraph) -> tuple[list[str], list[str]]:
        """[contents, tags] pair describing emphasized text in `paragraph`."""
        iter_p_emph, iter_p_emph_2 = itertools.tee(self._iter_paragraph_emphasis(paragraph))
@ -842,12 +886,12 @@ class _DocxPartitioner:
            category_depth=category_depth,
            emphasized_text_contents=emphasized_text_contents or None,
            emphasized_text_tags=emphasized_text_tags or None,
-            filename=self._metadata_filename,
-            last_modified=self._last_modified,
+            filename=self._opts.metadata_file_path,
+            last_modified=self._opts.last_modified,
            link_texts=link_texts or None,
            link_urls=link_urls or None,
            links=links or None,
-            page_number=self._page_number,
+            page_number=self._opts.metadata_page_number,
        )
        element_metadata.detection_origin = "docx"
        return element_metadata