rfctr(xlsx): extract _XlsxPartitionerOptions (#2838)

**Summary**
As an initial step in reducing the complexity of the monolithic
`partition_xlsx()` function, extract all argument-handling to a separate
`_XlsxPartitionerOptions` object which can be fully covered by isolated
unit tests.
    
**Additional Context**
This code was from a prior XLSX bug-fix branch that did not get
committed because of time constraints. I wanted to revisit it here
because I need the benefits of this as part of some new work on PPTX
that will require a separate options object that can be passed to
delegate objects.

This approach was incubated in the chunking context and has produced a
lot of opportunities there to decompose the logic into smaller
components that are more understandable and isolated-test-able, without
having to pass an extended list of option values in ever sub-call. As
well as decluttering the code, this removes coupling where the caller
needs to know which options a subroutine might need to reference.
This commit is contained in:
Steve Canny 2024-04-03 16:27:33 -07:00 committed by GitHub
parent e49c35933d
commit 1ce60f2bba
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 335 additions and 68 deletions

View File

@ -1,4 +1,4 @@
## 0.13.1-dev1 ## 0.13.1-dev2
### Enhancements ### Enhancements

View File

@ -7,7 +7,7 @@ from __future__ import annotations
import io import io
import sys import sys
import tempfile import tempfile
from typing import cast from typing import Any, cast
import pandas as pd import pandas as pd
import pandas.testing as pdt import pandas.testing as pdt
@ -19,13 +19,20 @@ from test_unstructured.partition.test_constants import (
EXPECTED_TEXT_XLSX, EXPECTED_TEXT_XLSX,
EXPECTED_TITLE, EXPECTED_TITLE,
) )
from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path from test_unstructured.unit_utils import (
FixtureRequest,
Mock,
assert_round_trips_through_JSON,
example_doc_path,
function_mock,
)
from unstructured.cleaners.core import clean_extra_whitespace from unstructured.cleaners.core import clean_extra_whitespace
from unstructured.documents.elements import ListItem, Table, Text, Title from unstructured.documents.elements import ListItem, Table, Text, Title
from unstructured.partition.xlsx import ( from unstructured.partition.xlsx import (
_CellCoordinate, _CellCoordinate,
_ConnectedComponent, _ConnectedComponent,
_SubtableParser, _SubtableParser,
_XlsxPartitionerOptions,
partition_xlsx, partition_xlsx,
) )
@ -116,13 +123,7 @@ def test_partition_xlsx_from_filename_with_metadata_filename():
assert elements[0].metadata.filename == "test" assert elements[0].metadata.filename == "test"
@pytest.mark.parametrize( @pytest.mark.parametrize("infer_table_structure", [True, False])
"infer_table_structure",
[
True,
False,
],
)
def test_partition_xlsx_infer_table_structure(infer_table_structure: bool): def test_partition_xlsx_infer_table_structure(infer_table_structure: bool):
elements = partition_xlsx( elements = partition_xlsx(
"example-docs/stanley-cups.xlsx", infer_table_structure=infer_table_structure "example-docs/stanley-cups.xlsx", infer_table_structure=infer_table_structure
@ -395,6 +396,176 @@ def test_partition_xlsx_with_more_than_1k_cells():
# ------------------------------------------------------------------------------------------------ # ------------------------------------------------------------------------------------------------
class Describe_XlsxPartitionerOptions:
"""Unit-test suite for `unstructured.partition.xlsx._XlsxPartitionerOptions` objects."""
@pytest.mark.parametrize("arg_value", [True, False])
def it_knows_whether_to_detect_language_for_each_element_individually(
self, arg_value: bool, opts_args: dict[str, Any]
):
opts_args["detect_language_per_element"] = arg_value
opts = _XlsxPartitionerOptions(**opts_args)
assert opts.detect_language_per_element is arg_value
@pytest.mark.parametrize("arg_value", [True, False])
def it_knows_whether_to_find_subtables_within_each_worksheet_or_return_table_per_worksheet(
self, arg_value: bool, opts_args: dict[str, Any]
):
opts_args["find_subtable"] = arg_value
opts = _XlsxPartitionerOptions(**opts_args)
assert opts.find_subtable is arg_value
@pytest.mark.parametrize(("arg_value", "expected_value"), [(True, 0), (False, None)])
def it_knows_the_header_row_index_for_Pandas(
self, arg_value: bool, expected_value: int | None, opts_args: dict[str, Any]
):
opts_args["include_header"] = arg_value
opts = _XlsxPartitionerOptions(**opts_args)
assert opts.header_row_idx == expected_value
@pytest.mark.parametrize("arg_value", [True, False])
def it_knows_whether_to_include_column_headings_in_Table_text_as_html(
self, arg_value: bool, opts_args: dict[str, Any]
):
opts_args["include_header"] = arg_value
opts = _XlsxPartitionerOptions(**opts_args)
assert opts.include_header is arg_value
@pytest.mark.parametrize("arg_value", [True, False])
def it_knows_whether_to_include_metadata_on_elements(
self, arg_value: bool, opts_args: dict[str, Any]
):
opts_args["include_metadata"] = arg_value
opts = _XlsxPartitionerOptions(**opts_args)
assert opts.include_metadata is arg_value
@pytest.mark.parametrize("arg_value", [True, False])
def it_knows_whether_to_include_text_as_html_in_Table_metadata(
self, arg_value: bool, opts_args: dict[str, Any]
):
opts_args["infer_table_structure"] = arg_value
opts = _XlsxPartitionerOptions(**opts_args)
assert opts.infer_table_structure is arg_value
@pytest.mark.parametrize(
("arg_value", "expected_value"),
[(None, None), (["eng"], ["eng"]), (["eng", "spa"], ["eng", "spa"])],
)
def it_knows_what_languages_the_caller_expects_to_appear_in_the_text(
self, arg_value: bool, expected_value: int | None, opts_args: dict[str, Any]
):
opts_args["languages"] = arg_value
opts = _XlsxPartitionerOptions(**opts_args)
assert opts.languages == expected_value
def it_gets_the_last_modified_date_of_the_document_from_the_caller_when_provided(
self, opts_args: dict[str, Any]
):
opts_args["metadata_last_modified"] = "2024-03-05T17:02:53"
opts = _XlsxPartitionerOptions(**opts_args)
assert opts.last_modified == "2024-03-05T17:02:53"
def and_it_falls_back_to_the_last_modified_date_of_the_file_when_a_path_is_provided(
self, opts_args: dict[str, Any], get_last_modified_date_: Mock
):
opts_args["file_path"] = "a/b/spreadsheet.xlsx"
get_last_modified_date_.return_value = "2024-04-02T20:32:35"
opts = _XlsxPartitionerOptions(**opts_args)
last_modified = opts.last_modified
get_last_modified_date_.assert_called_once_with("a/b/spreadsheet.xlsx")
assert last_modified == "2024-04-02T20:32:35"
def and_it_falls_back_to_the_last_modified_date_of_the_open_file_when_a_file_is_provided(
self, opts_args: dict[str, Any], get_last_modified_date_from_file_: Mock
):
file = io.BytesIO(b"abcdefg")
opts_args["file"] = file
opts_args["date_from_file_object"] = True
get_last_modified_date_from_file_.return_value = "2024-04-02T20:42:07"
opts = _XlsxPartitionerOptions(**opts_args)
last_modified = opts.last_modified
get_last_modified_date_from_file_.assert_called_once_with(file)
assert last_modified == "2024-04-02T20:42:07"
def but_it_falls_back_to_None_for_the_last_modified_date_when_date_from_file_object_is_False(
self, opts_args: dict[str, Any], get_last_modified_date_from_file_: Mock
):
file = io.BytesIO(b"abcdefg")
opts_args["file"] = file
opts_args["date_from_file_object"] = False
get_last_modified_date_from_file_.return_value = "2024-04-02T20:42:07"
opts = _XlsxPartitionerOptions(**opts_args)
last_modified = opts.last_modified
get_last_modified_date_from_file_.assert_not_called()
assert last_modified is None
def it_uses_the_user_provided_file_path_in_the_metadata_when_provided(
self, opts_args: dict[str, Any]
):
opts_args["file_path"] = "x/y/z.xlsx"
opts_args["metadata_file_path"] = "a/b/c.xlsx"
opts = _XlsxPartitionerOptions(**opts_args)
assert opts.metadata_file_path == "a/b/c.xlsx"
@pytest.mark.parametrize("file_path", ["u/v/w.xlsx", None])
def and_it_falls_back_to_the_document_file_path_otherwise(
self, file_path: str | None, opts_args: dict[str, Any]
):
opts_args["file_path"] = file_path
opts_args["metadata_file_path"] = None
opts = _XlsxPartitionerOptions(**opts_args)
assert opts.metadata_file_path == file_path
# -- fixtures --------------------------------------------------------------------------------
@pytest.fixture()
def get_last_modified_date_(self, request: FixtureRequest):
return function_mock(request, "unstructured.partition.xlsx.get_last_modified_date")
@pytest.fixture()
def get_last_modified_date_from_file_(self, request: FixtureRequest):
return function_mock(
request, "unstructured.partition.xlsx.get_last_modified_date_from_file"
)
@pytest.fixture()
def opts_args(self) -> dict[str, Any]:
"""All default arguments for `_XlsxPartitionerOptions`.
Individual argument values can be changed to suit each test. Makes construction of opts more
compact for testing purposes.
"""
return {
"date_from_file_object": False,
"detect_language_per_element": False,
"file": None,
"file_path": None,
"find_subtable": True,
"include_header": False,
"include_metadata": True,
"infer_table_structure": True,
"languages": ["auto"],
"metadata_file_path": None,
"metadata_last_modified": None,
}
class Describe_ConnectedComponent: class Describe_ConnectedComponent:
"""Unit-test suite for `unstructured.partition.xlsx._ConnectedComponent` objects.""" """Unit-test suite for `unstructured.partition.xlsx._ConnectedComponent` objects."""

View File

@ -1 +1 @@
__version__ = "0.13.1-dev1" # pragma: no cover __version__ = "0.13.1-dev2" # pragma: no cover

View File

@ -88,39 +88,28 @@ def partition_xlsx(
Applies only when providing file via `file` parameter. If this option is True, attempt Applies only when providing file via `file` parameter. If this option is True, attempt
infer last_modified metadata from bytes, otherwise set it to None. infer last_modified metadata from bytes, otherwise set it to None.
""" """
last_modification_date = None opts = _XlsxPartitionerOptions(
header = 0 if include_header else None date_from_file_object=date_from_file_object,
detect_language_per_element=detect_language_per_element,
sheets: dict[str, pd.DataFrame] = {} file=file,
if filename: file_path=filename,
sheets = pd.read_excel( # pyright: ignore[reportUnknownMemberType] find_subtable=find_subtable,
filename, sheet_name=None, header=header include_header=include_header,
include_metadata=include_metadata,
infer_table_structure=infer_table_structure,
languages=languages,
metadata_file_path=metadata_filename,
metadata_last_modified=metadata_last_modified,
) )
last_modification_date = get_last_modified_date(filename)
elif file:
if isinstance(file, SpooledTemporaryFile):
file.seek(0)
f = io.BytesIO(file.read())
else:
f = file
sheets = pd.read_excel( # pyright: ignore[reportUnknownMemberType]
f, sheet_name=None, header=header
)
last_modification_date = (
get_last_modified_date_from_file(file) if date_from_file_object else None
)
else:
raise ValueError("Either 'filename' or 'file' argument must be specified")
elements: list[Element] = [] elements: list[Element] = []
for page_number, (sheet_name, sheet) in enumerate(sheets.items(), start=1): for page_number, (sheet_name, sheet) in enumerate(opts.sheets.items(), start=1):
if not find_subtable: if not opts.find_subtable:
html_text = ( html_text = (
sheet.to_html( # pyright: ignore[reportUnknownMemberType] sheet.to_html( # pyright: ignore[reportUnknownMemberType]
index=False, header=include_header, na_rep="" index=False, header=opts.include_header, na_rep=""
) )
if infer_table_structure if opts.infer_table_structure
else None else None
) )
# XXX: `html_text` can be `None`. What happens on this call in that case? # XXX: `html_text` can be `None`. What happens on this call in that case?
@ -131,13 +120,13 @@ def partition_xlsx(
).text_content(), ).text_content(),
) )
if include_metadata: if opts.include_metadata:
metadata = ElementMetadata( metadata = ElementMetadata(
text_as_html=html_text, text_as_html=html_text,
page_name=sheet_name, page_name=sheet_name,
page_number=page_number, page_number=page_number,
filename=metadata_filename or filename, filename=opts.metadata_file_path,
last_modified=metadata_last_modified or last_modification_date, last_modified=opts.last_modified,
) )
metadata.detection_origin = DETECTION_ORIGIN metadata.detection_origin = DETECTION_ORIGIN
else: else:
@ -149,17 +138,11 @@ def partition_xlsx(
for component in _ConnectedComponents.from_worksheet_df(sheet): for component in _ConnectedComponents.from_worksheet_df(sheet):
subtable_parser = _SubtableParser(component.subtable) subtable_parser = _SubtableParser(component.subtable)
metadata = _get_metadata( metadata = _get_metadata(sheet_name, page_number, opts)
include_metadata,
sheet_name,
page_number,
metadata_filename or filename,
metadata_last_modified or last_modification_date,
)
# -- emit each leading single-cell row as its own `Text`-subtype element -- # -- emit each leading single-cell row as its own `Text`-subtype element --
for content in subtable_parser.iter_leading_single_cell_rows_texts(): for content in subtable_parser.iter_leading_single_cell_rows_texts():
element = _check_content_element_type(str(content)) element = _create_element(str(content))
element.metadata = metadata element.metadata = metadata
elements.append(element) elements.append(element)
@ -167,7 +150,7 @@ def partition_xlsx(
core_table = subtable_parser.core_table core_table = subtable_parser.core_table
if core_table is not None: if core_table is not None:
html_text = core_table.to_html( # pyright: ignore[reportUnknownMemberType] html_text = core_table.to_html( # pyright: ignore[reportUnknownMemberType]
index=False, header=include_header, na_rep="" index=False, header=opts.include_header, na_rep=""
) )
text = cast( text = cast(
str, str,
@ -177,27 +160,144 @@ def partition_xlsx(
) )
element = Table(text=text) element = Table(text=text)
element.metadata = metadata element.metadata = metadata
element.metadata.text_as_html = html_text if infer_table_structure else None element.metadata.text_as_html = (
html_text if opts.infer_table_structure else None
)
elements.append(element) elements.append(element)
# -- no core-table is emitted if it's empty (all rows are single-cell rows) -- # -- no core-table is emitted if it's empty (all rows are single-cell rows) --
# -- emit each trailing single-cell row as its own `Text`-subtype element -- # -- emit each trailing single-cell row as its own `Text`-subtype element --
for content in subtable_parser.iter_trailing_single_cell_rows_texts(): for content in subtable_parser.iter_trailing_single_cell_rows_texts():
element = _check_content_element_type(str(content)) element = _create_element(str(content))
element.metadata = metadata element.metadata = metadata
elements.append(element) elements.append(element)
elements = list( elements = list(
apply_lang_metadata( apply_lang_metadata(
elements=elements, elements=elements,
languages=languages, languages=opts.languages,
detect_language_per_element=detect_language_per_element, detect_language_per_element=opts.detect_language_per_element,
), ),
) )
return elements return elements
class _XlsxPartitionerOptions:
"""Encapsulates partitioning option validation, computation, and application of defaults."""
def __init__(
self,
*,
date_from_file_object: bool,
detect_language_per_element: bool,
file: Optional[IO[bytes]],
file_path: Optional[str],
find_subtable: bool,
include_header: bool,
include_metadata: bool,
infer_table_structure: bool,
languages: Optional[list[str]],
metadata_file_path: Optional[str],
metadata_last_modified: Optional[str],
):
self._date_from_file_object = date_from_file_object
self._detect_language_per_element = detect_language_per_element
self._file = file
self._file_path = file_path
self._find_subtable = find_subtable
self._include_header = include_header
self._include_metadata = include_metadata
self._infer_table_structure = infer_table_structure
self._languages = languages
self._metadata_file_path = metadata_file_path
self._metadata_last_modified = metadata_last_modified
@lazyproperty
def detect_language_per_element(self) -> bool:
"""When True, detect language on element-by-element basis instead of document level."""
return self._detect_language_per_element
@lazyproperty
def find_subtable(self) -> bool:
"""True when partitioner should detect and emit separate `Table` elements for subtables.
A subtable is (roughly) a contiguous rectangle of populated cells bounded by empty rows.
"""
return self._find_subtable
@lazyproperty
def header_row_idx(self) -> int | None:
"""The index of the row Pandas should treat as column-headings. Either 0 or None."""
return 0 if self._include_header else None
@lazyproperty
def include_header(self) -> bool:
"""True when column headers should be included in tables."""
return self._include_header
@lazyproperty
def include_metadata(self) -> bool:
"""True when partitioner should apply metadata to emitted elements."""
return self._include_metadata
@lazyproperty
def infer_table_structure(self) -> bool:
"""True when partitioner should compute and apply `text_as_html` metadata."""
return self._infer_table_structure
@lazyproperty
def languages(self) -> Optional[list[str]]:
"""User-specified language(s) of this document.
When `None`, language is detected using naive Bayesian filter via `langdetect`. Multiple
language codes indicate text could be in any of those languages.
"""
return self._languages
@lazyproperty
def last_modified(self) -> Optional[str]:
"""The best last-modified date available, None if no sources are available."""
# -- value explicitly specified by caller takes precedence --
if self._metadata_last_modified:
return self._metadata_last_modified
if self._file_path:
return get_last_modified_date(self._file_path)
if self._file:
return (
get_last_modified_date_from_file(self._file)
if self._date_from_file_object
else None
)
return None
@lazyproperty
def metadata_file_path(self) -> str | None:
"""The best available file-path for this document or `None` if unavailable."""
return self._metadata_file_path or self._file_path
@lazyproperty
def sheets(self) -> dict[str, pd.DataFrame]:
"""The spreadsheet worksheets, each as a data-frame mapped by sheet-name."""
if file_path := self._file_path:
return pd.read_excel( # pyright: ignore[reportUnknownMemberType]
file_path, sheet_name=None, header=self.header_row_idx
)
if f := self._file:
if isinstance(f, SpooledTemporaryFile):
f.seek(0)
f = io.BytesIO(f.read())
return pd.read_excel( # pyright: ignore[reportUnknownMemberType]
f, sheet_name=None, header=self.header_row_idx
)
raise ValueError("Either 'filename' or 'file' argument must be specified.")
class _ConnectedComponent: class _ConnectedComponent:
"""A collection of cells that are "2d-connected" in a worksheet. """A collection of cells that are "2d-connected" in a worksheet.
@ -423,7 +523,7 @@ class _SubtableParser:
return tuple(reversed(list(iter_trailing_single_cell_row_indices()))) return tuple(reversed(list(iter_trailing_single_cell_row_indices())))
def _check_content_element_type(text: str) -> Element: def _create_element(text: str) -> Element:
"""Create `Text`-subtype document element appropriate to `text`.""" """Create `Text`-subtype document element appropriate to `text`."""
if is_bulleted_text(text): if is_bulleted_text(text):
return ListItem(text=clean_bullets(text)) return ListItem(text=clean_bullets(text))
@ -438,20 +538,16 @@ def _check_content_element_type(text: str) -> Element:
def _get_metadata( def _get_metadata(
include_metadata: bool = True, sheet_name: str, page_number: int, opts: _XlsxPartitionerOptions
sheet_name: Optional[str] = None,
page_number: Optional[int] = -1,
filename: Optional[str] = None,
last_modification_date: Optional[str] = None,
) -> ElementMetadata: ) -> ElementMetadata:
"""Returns metadata depending on `include_metadata` flag""" """Returns metadata depending on `include_metadata` flag"""
if include_metadata: return (
metadata = ElementMetadata( ElementMetadata(
page_name=sheet_name, page_name=sheet_name,
page_number=page_number, page_number=page_number,
filename=filename, filename=opts.metadata_file_path,
last_modified=last_modification_date, last_modified=opts.last_modified,
)
if opts.include_metadata
else ElementMetadata()
) )
else:
metadata = ElementMetadata()
return metadata