rfctr(xlsx): extract _XlsxPartitionerOptions (#2838)

**Summary** As an initial step in reducing the complexity of the monolithic `partition_xlsx()` function, extract all argument-handling to a separate `_XlsxPartitionerOptions` object which can be fully covered by isolated unit tests. **Additional Context** This code was from a prior XLSX bug-fix branch that did not get committed because of time constraints. I wanted to revisit it here because I need the benefits of this as part of some new work on PPTX that will require a separate options object that can be passed to delegate objects. This approach was incubated in the chunking context and has produced a lot of opportunities there to decompose the logic into smaller components that are more understandable and isolated-test-able, without having to pass an extended list of option values in ever sub-call. As well as decluttering the code, this removes coupling where the caller needs to know which options a subroutine might need to reference.
2025-10-16 10:33:57 +00:00 · 2024-04-03 16:27:33 -07:00 · 2024-04-03 16:27:33 -07:00 · 1ce60f2bba
commit 1ce60f2bba
parent e49c35933d
4 changed files with 335 additions and 68 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,4 +1,4 @@
-## 0.13.1-dev1
+## 0.13.1-dev2

 ### Enhancements

--- a/test_unstructured/partition/xlsx/test_xlsx.py
+++ b/test_unstructured/partition/xlsx/test_xlsx.py
@ -7,7 +7,7 @@ from __future__ import annotations
 import io
 import sys
 import tempfile
-from typing import cast
+from typing import Any, cast

 import pandas as pd
 import pandas.testing as pdt
@ -19,13 +19,20 @@ from test_unstructured.partition.test_constants import (
    EXPECTED_TEXT_XLSX,
    EXPECTED_TITLE,
 )
-from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path
+from test_unstructured.unit_utils import (
+    FixtureRequest,
+    Mock,
+    assert_round_trips_through_JSON,
+    example_doc_path,
+    function_mock,
+)
 from unstructured.cleaners.core import clean_extra_whitespace
 from unstructured.documents.elements import ListItem, Table, Text, Title
 from unstructured.partition.xlsx import (
    _CellCoordinate,
    _ConnectedComponent,
    _SubtableParser,
+    _XlsxPartitionerOptions,
    partition_xlsx,
 )

@ -116,13 +123,7 @@ def test_partition_xlsx_from_filename_with_metadata_filename():
    assert elements[0].metadata.filename == "test"


-@pytest.mark.parametrize(
-    "infer_table_structure",
-    [
-        True,
-        False,
-    ],
-)
+@pytest.mark.parametrize("infer_table_structure", [True, False])
 def test_partition_xlsx_infer_table_structure(infer_table_structure: bool):
    elements = partition_xlsx(
        "example-docs/stanley-cups.xlsx", infer_table_structure=infer_table_structure
@ -395,6 +396,176 @@ def test_partition_xlsx_with_more_than_1k_cells():
 # ------------------------------------------------------------------------------------------------


+class Describe_XlsxPartitionerOptions:
+    """Unit-test suite for `unstructured.partition.xlsx._XlsxPartitionerOptions` objects."""
+
+    @pytest.mark.parametrize("arg_value", [True, False])
+    def it_knows_whether_to_detect_language_for_each_element_individually(
+        self, arg_value: bool, opts_args: dict[str, Any]
+    ):
+        opts_args["detect_language_per_element"] = arg_value
+        opts = _XlsxPartitionerOptions(**opts_args)
+
+        assert opts.detect_language_per_element is arg_value
+
+    @pytest.mark.parametrize("arg_value", [True, False])
+    def it_knows_whether_to_find_subtables_within_each_worksheet_or_return_table_per_worksheet(
+        self, arg_value: bool, opts_args: dict[str, Any]
+    ):
+        opts_args["find_subtable"] = arg_value
+        opts = _XlsxPartitionerOptions(**opts_args)
+
+        assert opts.find_subtable is arg_value
+
+    @pytest.mark.parametrize(("arg_value", "expected_value"), [(True, 0), (False, None)])
+    def it_knows_the_header_row_index_for_Pandas(
+        self, arg_value: bool, expected_value: int | None, opts_args: dict[str, Any]
+    ):
+        opts_args["include_header"] = arg_value
+        opts = _XlsxPartitionerOptions(**opts_args)
+
+        assert opts.header_row_idx == expected_value
+
+    @pytest.mark.parametrize("arg_value", [True, False])
+    def it_knows_whether_to_include_column_headings_in_Table_text_as_html(
+        self, arg_value: bool, opts_args: dict[str, Any]
+    ):
+        opts_args["include_header"] = arg_value
+        opts = _XlsxPartitionerOptions(**opts_args)
+
+        assert opts.include_header is arg_value
+
+    @pytest.mark.parametrize("arg_value", [True, False])
+    def it_knows_whether_to_include_metadata_on_elements(
+        self, arg_value: bool, opts_args: dict[str, Any]
+    ):
+        opts_args["include_metadata"] = arg_value
+        opts = _XlsxPartitionerOptions(**opts_args)
+
+        assert opts.include_metadata is arg_value
+
+    @pytest.mark.parametrize("arg_value", [True, False])
+    def it_knows_whether_to_include_text_as_html_in_Table_metadata(
+        self, arg_value: bool, opts_args: dict[str, Any]
+    ):
+        opts_args["infer_table_structure"] = arg_value
+        opts = _XlsxPartitionerOptions(**opts_args)
+
+        assert opts.infer_table_structure is arg_value
+
+    @pytest.mark.parametrize(
+        ("arg_value", "expected_value"),
+        [(None, None), (["eng"], ["eng"]), (["eng", "spa"], ["eng", "spa"])],
+    )
+    def it_knows_what_languages_the_caller_expects_to_appear_in_the_text(
+        self, arg_value: bool, expected_value: int | None, opts_args: dict[str, Any]
+    ):
+        opts_args["languages"] = arg_value
+        opts = _XlsxPartitionerOptions(**opts_args)
+
+        assert opts.languages == expected_value
+
+    def it_gets_the_last_modified_date_of_the_document_from_the_caller_when_provided(
+        self, opts_args: dict[str, Any]
+    ):
+        opts_args["metadata_last_modified"] = "2024-03-05T17:02:53"
+        opts = _XlsxPartitionerOptions(**opts_args)
+
+        assert opts.last_modified == "2024-03-05T17:02:53"
+
+    def and_it_falls_back_to_the_last_modified_date_of_the_file_when_a_path_is_provided(
+        self, opts_args: dict[str, Any], get_last_modified_date_: Mock
+    ):
+        opts_args["file_path"] = "a/b/spreadsheet.xlsx"
+        get_last_modified_date_.return_value = "2024-04-02T20:32:35"
+        opts = _XlsxPartitionerOptions(**opts_args)
+
+        last_modified = opts.last_modified
+
+        get_last_modified_date_.assert_called_once_with("a/b/spreadsheet.xlsx")
+        assert last_modified == "2024-04-02T20:32:35"
+
+    def and_it_falls_back_to_the_last_modified_date_of_the_open_file_when_a_file_is_provided(
+        self, opts_args: dict[str, Any], get_last_modified_date_from_file_: Mock
+    ):
+        file = io.BytesIO(b"abcdefg")
+        opts_args["file"] = file
+        opts_args["date_from_file_object"] = True
+        get_last_modified_date_from_file_.return_value = "2024-04-02T20:42:07"
+        opts = _XlsxPartitionerOptions(**opts_args)
+
+        last_modified = opts.last_modified
+
+        get_last_modified_date_from_file_.assert_called_once_with(file)
+        assert last_modified == "2024-04-02T20:42:07"
+
+    def but_it_falls_back_to_None_for_the_last_modified_date_when_date_from_file_object_is_False(
+        self, opts_args: dict[str, Any], get_last_modified_date_from_file_: Mock
+    ):
+        file = io.BytesIO(b"abcdefg")
+        opts_args["file"] = file
+        opts_args["date_from_file_object"] = False
+        get_last_modified_date_from_file_.return_value = "2024-04-02T20:42:07"
+        opts = _XlsxPartitionerOptions(**opts_args)
+
+        last_modified = opts.last_modified
+
+        get_last_modified_date_from_file_.assert_not_called()
+        assert last_modified is None
+
+    def it_uses_the_user_provided_file_path_in_the_metadata_when_provided(
+        self, opts_args: dict[str, Any]
+    ):
+        opts_args["file_path"] = "x/y/z.xlsx"
+        opts_args["metadata_file_path"] = "a/b/c.xlsx"
+        opts = _XlsxPartitionerOptions(**opts_args)
+
+        assert opts.metadata_file_path == "a/b/c.xlsx"
+
+    @pytest.mark.parametrize("file_path", ["u/v/w.xlsx", None])
+    def and_it_falls_back_to_the_document_file_path_otherwise(
+        self, file_path: str | None, opts_args: dict[str, Any]
+    ):
+        opts_args["file_path"] = file_path
+        opts_args["metadata_file_path"] = None
+        opts = _XlsxPartitionerOptions(**opts_args)
+
+        assert opts.metadata_file_path == file_path
+
+    # -- fixtures --------------------------------------------------------------------------------
+
+    @pytest.fixture()
+    def get_last_modified_date_(self, request: FixtureRequest):
+        return function_mock(request, "unstructured.partition.xlsx.get_last_modified_date")
+
+    @pytest.fixture()
+    def get_last_modified_date_from_file_(self, request: FixtureRequest):
+        return function_mock(
+            request, "unstructured.partition.xlsx.get_last_modified_date_from_file"
+        )
+
+    @pytest.fixture()
+    def opts_args(self) -> dict[str, Any]:
+        """All default arguments for `_XlsxPartitionerOptions`.
+
+        Individual argument values can be changed to suit each test. Makes construction of opts more
+        compact for testing purposes.
+        """
+        return {
+            "date_from_file_object": False,
+            "detect_language_per_element": False,
+            "file": None,
+            "file_path": None,
+            "find_subtable": True,
+            "include_header": False,
+            "include_metadata": True,
+            "infer_table_structure": True,
+            "languages": ["auto"],
+            "metadata_file_path": None,
+            "metadata_last_modified": None,
+        }
+
+
 class Describe_ConnectedComponent:
    """Unit-test suite for `unstructured.partition.xlsx._ConnectedComponent` objects."""

--- a/unstructured/version.py
+++ b/unstructured/version.py
@ -1 +1 @@
-__version__ = "0.13.1-dev1"  # pragma: no cover
+__version__ = "0.13.1-dev2"  # pragma: no cover
--- a/unstructured/partition/xlsx.py
+++ b/unstructured/partition/xlsx.py
@ -88,39 +88,28 @@ def partition_xlsx(
        Applies only when providing file via `file` parameter. If this option is True, attempt
        infer last_modified metadata from bytes, otherwise set it to None.
    """
-    last_modification_date = None
-    header = 0 if include_header else None
-
-    sheets: dict[str, pd.DataFrame] = {}
-    if filename:
-        sheets = pd.read_excel(  # pyright: ignore[reportUnknownMemberType]
-            filename, sheet_name=None, header=header
-        )
-        last_modification_date = get_last_modified_date(filename)
-
-    elif file:
-        if isinstance(file, SpooledTemporaryFile):
-            file.seek(0)
-            f = io.BytesIO(file.read())
-        else:
-            f = file
-        sheets = pd.read_excel(  # pyright: ignore[reportUnknownMemberType]
-            f, sheet_name=None, header=header
-        )
-        last_modification_date = (
-            get_last_modified_date_from_file(file) if date_from_file_object else None
-        )
-    else:
-        raise ValueError("Either 'filename' or 'file' argument must be specified")
+    opts = _XlsxPartitionerOptions(
+        date_from_file_object=date_from_file_object,
+        detect_language_per_element=detect_language_per_element,
+        file=file,
+        file_path=filename,
+        find_subtable=find_subtable,
+        include_header=include_header,
+        include_metadata=include_metadata,
+        infer_table_structure=infer_table_structure,
+        languages=languages,
+        metadata_file_path=metadata_filename,
+        metadata_last_modified=metadata_last_modified,
+    )

    elements: list[Element] = []
-    for page_number, (sheet_name, sheet) in enumerate(sheets.items(), start=1):
-        if not find_subtable:
+    for page_number, (sheet_name, sheet) in enumerate(opts.sheets.items(), start=1):
+        if not opts.find_subtable:
            html_text = (
                sheet.to_html(  # pyright: ignore[reportUnknownMemberType]
-                    index=False, header=include_header, na_rep=""
+                    index=False, header=opts.include_header, na_rep=""
                )
-                if infer_table_structure
+                if opts.infer_table_structure
                else None
            )
            # XXX: `html_text` can be `None`. What happens on this call in that case?
@ -131,13 +120,13 @@ def partition_xlsx(
                ).text_content(),
            )

-            if include_metadata:
+            if opts.include_metadata:
                metadata = ElementMetadata(
                    text_as_html=html_text,
                    page_name=sheet_name,
                    page_number=page_number,
-                    filename=metadata_filename or filename,
-                    last_modified=metadata_last_modified or last_modification_date,
+                    filename=opts.metadata_file_path,
+                    last_modified=opts.last_modified,
                )
                metadata.detection_origin = DETECTION_ORIGIN
            else:
@ -149,17 +138,11 @@ def partition_xlsx(
            for component in _ConnectedComponents.from_worksheet_df(sheet):
                subtable_parser = _SubtableParser(component.subtable)

-                metadata = _get_metadata(
-                    include_metadata,
-                    sheet_name,
-                    page_number,
-                    metadata_filename or filename,
-                    metadata_last_modified or last_modification_date,
-                )
+                metadata = _get_metadata(sheet_name, page_number, opts)

                # -- emit each leading single-cell row as its own `Text`-subtype element --
                for content in subtable_parser.iter_leading_single_cell_rows_texts():
-                    element = _check_content_element_type(str(content))
+                    element = _create_element(str(content))
                    element.metadata = metadata
                    elements.append(element)

@ -167,7 +150,7 @@ def partition_xlsx(
                core_table = subtable_parser.core_table
                if core_table is not None:
                    html_text = core_table.to_html(  # pyright: ignore[reportUnknownMemberType]
-                        index=False, header=include_header, na_rep=""
+                        index=False, header=opts.include_header, na_rep=""
                    )
                    text = cast(
                        str,
@ -177,27 +160,144 @@ def partition_xlsx(
                    )
                    element = Table(text=text)
                    element.metadata = metadata
-                    element.metadata.text_as_html = html_text if infer_table_structure else None
+                    element.metadata.text_as_html = (
+                        html_text if opts.infer_table_structure else None
+                    )
                    elements.append(element)

                # -- no core-table is emitted if it's empty (all rows are single-cell rows) --

                # -- emit each trailing single-cell row as its own `Text`-subtype element --
                for content in subtable_parser.iter_trailing_single_cell_rows_texts():
-                    element = _check_content_element_type(str(content))
+                    element = _create_element(str(content))
                    element.metadata = metadata
                    elements.append(element)

    elements = list(
        apply_lang_metadata(
            elements=elements,
-            languages=languages,
-            detect_language_per_element=detect_language_per_element,
+            languages=opts.languages,
+            detect_language_per_element=opts.detect_language_per_element,
        ),
    )
    return elements


+class _XlsxPartitionerOptions:
+    """Encapsulates partitioning option validation, computation, and application of defaults."""
+
+    def __init__(
+        self,
+        *,
+        date_from_file_object: bool,
+        detect_language_per_element: bool,
+        file: Optional[IO[bytes]],
+        file_path: Optional[str],
+        find_subtable: bool,
+        include_header: bool,
+        include_metadata: bool,
+        infer_table_structure: bool,
+        languages: Optional[list[str]],
+        metadata_file_path: Optional[str],
+        metadata_last_modified: Optional[str],
+    ):
+        self._date_from_file_object = date_from_file_object
+        self._detect_language_per_element = detect_language_per_element
+        self._file = file
+        self._file_path = file_path
+        self._find_subtable = find_subtable
+        self._include_header = include_header
+        self._include_metadata = include_metadata
+        self._infer_table_structure = infer_table_structure
+        self._languages = languages
+        self._metadata_file_path = metadata_file_path
+        self._metadata_last_modified = metadata_last_modified
+
+    @lazyproperty
+    def detect_language_per_element(self) -> bool:
+        """When True, detect language on element-by-element basis instead of document level."""
+        return self._detect_language_per_element
+
+    @lazyproperty
+    def find_subtable(self) -> bool:
+        """True when partitioner should detect and emit separate `Table` elements for subtables.
+
+        A subtable is (roughly) a contiguous rectangle of populated cells bounded by empty rows.
+        """
+        return self._find_subtable
+
+    @lazyproperty
+    def header_row_idx(self) -> int | None:
+        """The index of the row Pandas should treat as column-headings. Either 0 or None."""
+        return 0 if self._include_header else None
+
+    @lazyproperty
+    def include_header(self) -> bool:
+        """True when column headers should be included in tables."""
+        return self._include_header
+
+    @lazyproperty
+    def include_metadata(self) -> bool:
+        """True when partitioner should apply metadata to emitted elements."""
+        return self._include_metadata
+
+    @lazyproperty
+    def infer_table_structure(self) -> bool:
+        """True when partitioner should compute and apply `text_as_html` metadata."""
+        return self._infer_table_structure
+
+    @lazyproperty
+    def languages(self) -> Optional[list[str]]:
+        """User-specified language(s) of this document.
+
+        When `None`, language is detected using naive Bayesian filter via `langdetect`. Multiple
+        language codes indicate text could be in any of those languages.
+        """
+        return self._languages
+
+    @lazyproperty
+    def last_modified(self) -> Optional[str]:
+        """The best last-modified date available, None if no sources are available."""
+        # -- value explicitly specified by caller takes precedence --
+        if self._metadata_last_modified:
+            return self._metadata_last_modified
+
+        if self._file_path:
+            return get_last_modified_date(self._file_path)
+
+        if self._file:
+            return (
+                get_last_modified_date_from_file(self._file)
+                if self._date_from_file_object
+                else None
+            )
+
+        return None
+
+    @lazyproperty
+    def metadata_file_path(self) -> str | None:
+        """The best available file-path for this document or `None` if unavailable."""
+        return self._metadata_file_path or self._file_path
+
+    @lazyproperty
+    def sheets(self) -> dict[str, pd.DataFrame]:
+        """The spreadsheet worksheets, each as a data-frame mapped by sheet-name."""
+        if file_path := self._file_path:
+            return pd.read_excel(  # pyright: ignore[reportUnknownMemberType]
+                file_path, sheet_name=None, header=self.header_row_idx
+            )
+
+        if f := self._file:
+            if isinstance(f, SpooledTemporaryFile):
+                f.seek(0)
+                f = io.BytesIO(f.read())
+            return pd.read_excel(  # pyright: ignore[reportUnknownMemberType]
+                f, sheet_name=None, header=self.header_row_idx
+            )
+
+        raise ValueError("Either 'filename' or 'file' argument must be specified.")
+
+
 class _ConnectedComponent:
    """A collection of cells that are "2d-connected" in a worksheet.

@ -423,7 +523,7 @@ class _SubtableParser:
        return tuple(reversed(list(iter_trailing_single_cell_row_indices())))


-def _check_content_element_type(text: str) -> Element:
+def _create_element(text: str) -> Element:
    """Create `Text`-subtype document element appropriate to `text`."""
    if is_bulleted_text(text):
        return ListItem(text=clean_bullets(text))
@ -438,20 +538,16 @@ def _check_content_element_type(text: str) -> Element:


 def _get_metadata(
-    include_metadata: bool = True,
-    sheet_name: Optional[str] = None,
-    page_number: Optional[int] = -1,
-    filename: Optional[str] = None,
-    last_modification_date: Optional[str] = None,
+    sheet_name: str, page_number: int, opts: _XlsxPartitionerOptions
 ) -> ElementMetadata:
    """Returns metadata depending on `include_metadata` flag"""
-    if include_metadata:
-        metadata = ElementMetadata(
+    return (
+        ElementMetadata(
            page_name=sheet_name,
            page_number=page_number,
-            filename=filename,
-            last_modified=last_modification_date,
+            filename=opts.metadata_file_path,
+            last_modified=opts.last_modified,
        )
-    else:
-        metadata = ElementMetadata()
-    return metadata
+        if opts.include_metadata
+        else ElementMetadata()
+    )