From 1ce60f2bba5e01299cef70c68b8b886bb4ff087a Mon Sep 17 00:00:00 2001 From: Steve Canny Date: Wed, 3 Apr 2024 16:27:33 -0700 Subject: [PATCH] rfctr(xlsx): extract _XlsxPartitionerOptions (#2838) **Summary** As an initial step in reducing the complexity of the monolithic `partition_xlsx()` function, extract all argument-handling to a separate `_XlsxPartitionerOptions` object which can be fully covered by isolated unit tests. **Additional Context** This code was from a prior XLSX bug-fix branch that did not get committed because of time constraints. I wanted to revisit it here because I need the benefits of this as part of some new work on PPTX that will require a separate options object that can be passed to delegate objects. This approach was incubated in the chunking context and has produced a lot of opportunities there to decompose the logic into smaller components that are more understandable and isolated-test-able, without having to pass an extended list of option values in ever sub-call. As well as decluttering the code, this removes coupling where the caller needs to know which options a subroutine might need to reference. --- CHANGELOG.md | 2 +- test_unstructured/partition/xlsx/test_xlsx.py | 189 +++++++++++++++- unstructured/__version__.py | 2 +- unstructured/partition/xlsx.py | 210 +++++++++++++----- 4 files changed, 335 insertions(+), 68 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bea4e67c6..2369344cb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.13.1-dev1 +## 0.13.1-dev2 ### Enhancements diff --git a/test_unstructured/partition/xlsx/test_xlsx.py b/test_unstructured/partition/xlsx/test_xlsx.py index b0b837729..7e81c7966 100644 --- a/test_unstructured/partition/xlsx/test_xlsx.py +++ b/test_unstructured/partition/xlsx/test_xlsx.py @@ -7,7 +7,7 @@ from __future__ import annotations import io import sys import tempfile -from typing import cast +from typing import Any, cast import pandas as pd import pandas.testing as pdt @@ -19,13 +19,20 @@ from test_unstructured.partition.test_constants import ( EXPECTED_TEXT_XLSX, EXPECTED_TITLE, ) -from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path +from test_unstructured.unit_utils import ( + FixtureRequest, + Mock, + assert_round_trips_through_JSON, + example_doc_path, + function_mock, +) from unstructured.cleaners.core import clean_extra_whitespace from unstructured.documents.elements import ListItem, Table, Text, Title from unstructured.partition.xlsx import ( _CellCoordinate, _ConnectedComponent, _SubtableParser, + _XlsxPartitionerOptions, partition_xlsx, ) @@ -116,13 +123,7 @@ def test_partition_xlsx_from_filename_with_metadata_filename(): assert elements[0].metadata.filename == "test" -@pytest.mark.parametrize( - "infer_table_structure", - [ - True, - False, - ], -) +@pytest.mark.parametrize("infer_table_structure", [True, False]) def test_partition_xlsx_infer_table_structure(infer_table_structure: bool): elements = partition_xlsx( "example-docs/stanley-cups.xlsx", infer_table_structure=infer_table_structure @@ -395,6 +396,176 @@ def test_partition_xlsx_with_more_than_1k_cells(): # ------------------------------------------------------------------------------------------------ +class Describe_XlsxPartitionerOptions: + """Unit-test suite for `unstructured.partition.xlsx._XlsxPartitionerOptions` objects.""" + + @pytest.mark.parametrize("arg_value", [True, False]) + def it_knows_whether_to_detect_language_for_each_element_individually( + self, arg_value: bool, opts_args: dict[str, Any] + ): + opts_args["detect_language_per_element"] = arg_value + opts = _XlsxPartitionerOptions(**opts_args) + + assert opts.detect_language_per_element is arg_value + + @pytest.mark.parametrize("arg_value", [True, False]) + def it_knows_whether_to_find_subtables_within_each_worksheet_or_return_table_per_worksheet( + self, arg_value: bool, opts_args: dict[str, Any] + ): + opts_args["find_subtable"] = arg_value + opts = _XlsxPartitionerOptions(**opts_args) + + assert opts.find_subtable is arg_value + + @pytest.mark.parametrize(("arg_value", "expected_value"), [(True, 0), (False, None)]) + def it_knows_the_header_row_index_for_Pandas( + self, arg_value: bool, expected_value: int | None, opts_args: dict[str, Any] + ): + opts_args["include_header"] = arg_value + opts = _XlsxPartitionerOptions(**opts_args) + + assert opts.header_row_idx == expected_value + + @pytest.mark.parametrize("arg_value", [True, False]) + def it_knows_whether_to_include_column_headings_in_Table_text_as_html( + self, arg_value: bool, opts_args: dict[str, Any] + ): + opts_args["include_header"] = arg_value + opts = _XlsxPartitionerOptions(**opts_args) + + assert opts.include_header is arg_value + + @pytest.mark.parametrize("arg_value", [True, False]) + def it_knows_whether_to_include_metadata_on_elements( + self, arg_value: bool, opts_args: dict[str, Any] + ): + opts_args["include_metadata"] = arg_value + opts = _XlsxPartitionerOptions(**opts_args) + + assert opts.include_metadata is arg_value + + @pytest.mark.parametrize("arg_value", [True, False]) + def it_knows_whether_to_include_text_as_html_in_Table_metadata( + self, arg_value: bool, opts_args: dict[str, Any] + ): + opts_args["infer_table_structure"] = arg_value + opts = _XlsxPartitionerOptions(**opts_args) + + assert opts.infer_table_structure is arg_value + + @pytest.mark.parametrize( + ("arg_value", "expected_value"), + [(None, None), (["eng"], ["eng"]), (["eng", "spa"], ["eng", "spa"])], + ) + def it_knows_what_languages_the_caller_expects_to_appear_in_the_text( + self, arg_value: bool, expected_value: int | None, opts_args: dict[str, Any] + ): + opts_args["languages"] = arg_value + opts = _XlsxPartitionerOptions(**opts_args) + + assert opts.languages == expected_value + + def it_gets_the_last_modified_date_of_the_document_from_the_caller_when_provided( + self, opts_args: dict[str, Any] + ): + opts_args["metadata_last_modified"] = "2024-03-05T17:02:53" + opts = _XlsxPartitionerOptions(**opts_args) + + assert opts.last_modified == "2024-03-05T17:02:53" + + def and_it_falls_back_to_the_last_modified_date_of_the_file_when_a_path_is_provided( + self, opts_args: dict[str, Any], get_last_modified_date_: Mock + ): + opts_args["file_path"] = "a/b/spreadsheet.xlsx" + get_last_modified_date_.return_value = "2024-04-02T20:32:35" + opts = _XlsxPartitionerOptions(**opts_args) + + last_modified = opts.last_modified + + get_last_modified_date_.assert_called_once_with("a/b/spreadsheet.xlsx") + assert last_modified == "2024-04-02T20:32:35" + + def and_it_falls_back_to_the_last_modified_date_of_the_open_file_when_a_file_is_provided( + self, opts_args: dict[str, Any], get_last_modified_date_from_file_: Mock + ): + file = io.BytesIO(b"abcdefg") + opts_args["file"] = file + opts_args["date_from_file_object"] = True + get_last_modified_date_from_file_.return_value = "2024-04-02T20:42:07" + opts = _XlsxPartitionerOptions(**opts_args) + + last_modified = opts.last_modified + + get_last_modified_date_from_file_.assert_called_once_with(file) + assert last_modified == "2024-04-02T20:42:07" + + def but_it_falls_back_to_None_for_the_last_modified_date_when_date_from_file_object_is_False( + self, opts_args: dict[str, Any], get_last_modified_date_from_file_: Mock + ): + file = io.BytesIO(b"abcdefg") + opts_args["file"] = file + opts_args["date_from_file_object"] = False + get_last_modified_date_from_file_.return_value = "2024-04-02T20:42:07" + opts = _XlsxPartitionerOptions(**opts_args) + + last_modified = opts.last_modified + + get_last_modified_date_from_file_.assert_not_called() + assert last_modified is None + + def it_uses_the_user_provided_file_path_in_the_metadata_when_provided( + self, opts_args: dict[str, Any] + ): + opts_args["file_path"] = "x/y/z.xlsx" + opts_args["metadata_file_path"] = "a/b/c.xlsx" + opts = _XlsxPartitionerOptions(**opts_args) + + assert opts.metadata_file_path == "a/b/c.xlsx" + + @pytest.mark.parametrize("file_path", ["u/v/w.xlsx", None]) + def and_it_falls_back_to_the_document_file_path_otherwise( + self, file_path: str | None, opts_args: dict[str, Any] + ): + opts_args["file_path"] = file_path + opts_args["metadata_file_path"] = None + opts = _XlsxPartitionerOptions(**opts_args) + + assert opts.metadata_file_path == file_path + + # -- fixtures -------------------------------------------------------------------------------- + + @pytest.fixture() + def get_last_modified_date_(self, request: FixtureRequest): + return function_mock(request, "unstructured.partition.xlsx.get_last_modified_date") + + @pytest.fixture() + def get_last_modified_date_from_file_(self, request: FixtureRequest): + return function_mock( + request, "unstructured.partition.xlsx.get_last_modified_date_from_file" + ) + + @pytest.fixture() + def opts_args(self) -> dict[str, Any]: + """All default arguments for `_XlsxPartitionerOptions`. + + Individual argument values can be changed to suit each test. Makes construction of opts more + compact for testing purposes. + """ + return { + "date_from_file_object": False, + "detect_language_per_element": False, + "file": None, + "file_path": None, + "find_subtable": True, + "include_header": False, + "include_metadata": True, + "infer_table_structure": True, + "languages": ["auto"], + "metadata_file_path": None, + "metadata_last_modified": None, + } + + class Describe_ConnectedComponent: """Unit-test suite for `unstructured.partition.xlsx._ConnectedComponent` objects.""" diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 2f456b1a9..99d654ba8 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.13.1-dev1" # pragma: no cover +__version__ = "0.13.1-dev2" # pragma: no cover diff --git a/unstructured/partition/xlsx.py b/unstructured/partition/xlsx.py index abeba0ce7..c022f7ee6 100644 --- a/unstructured/partition/xlsx.py +++ b/unstructured/partition/xlsx.py @@ -88,39 +88,28 @@ def partition_xlsx( Applies only when providing file via `file` parameter. If this option is True, attempt infer last_modified metadata from bytes, otherwise set it to None. """ - last_modification_date = None - header = 0 if include_header else None - - sheets: dict[str, pd.DataFrame] = {} - if filename: - sheets = pd.read_excel( # pyright: ignore[reportUnknownMemberType] - filename, sheet_name=None, header=header - ) - last_modification_date = get_last_modified_date(filename) - - elif file: - if isinstance(file, SpooledTemporaryFile): - file.seek(0) - f = io.BytesIO(file.read()) - else: - f = file - sheets = pd.read_excel( # pyright: ignore[reportUnknownMemberType] - f, sheet_name=None, header=header - ) - last_modification_date = ( - get_last_modified_date_from_file(file) if date_from_file_object else None - ) - else: - raise ValueError("Either 'filename' or 'file' argument must be specified") + opts = _XlsxPartitionerOptions( + date_from_file_object=date_from_file_object, + detect_language_per_element=detect_language_per_element, + file=file, + file_path=filename, + find_subtable=find_subtable, + include_header=include_header, + include_metadata=include_metadata, + infer_table_structure=infer_table_structure, + languages=languages, + metadata_file_path=metadata_filename, + metadata_last_modified=metadata_last_modified, + ) elements: list[Element] = [] - for page_number, (sheet_name, sheet) in enumerate(sheets.items(), start=1): - if not find_subtable: + for page_number, (sheet_name, sheet) in enumerate(opts.sheets.items(), start=1): + if not opts.find_subtable: html_text = ( sheet.to_html( # pyright: ignore[reportUnknownMemberType] - index=False, header=include_header, na_rep="" + index=False, header=opts.include_header, na_rep="" ) - if infer_table_structure + if opts.infer_table_structure else None ) # XXX: `html_text` can be `None`. What happens on this call in that case? @@ -131,13 +120,13 @@ def partition_xlsx( ).text_content(), ) - if include_metadata: + if opts.include_metadata: metadata = ElementMetadata( text_as_html=html_text, page_name=sheet_name, page_number=page_number, - filename=metadata_filename or filename, - last_modified=metadata_last_modified or last_modification_date, + filename=opts.metadata_file_path, + last_modified=opts.last_modified, ) metadata.detection_origin = DETECTION_ORIGIN else: @@ -149,17 +138,11 @@ def partition_xlsx( for component in _ConnectedComponents.from_worksheet_df(sheet): subtable_parser = _SubtableParser(component.subtable) - metadata = _get_metadata( - include_metadata, - sheet_name, - page_number, - metadata_filename or filename, - metadata_last_modified or last_modification_date, - ) + metadata = _get_metadata(sheet_name, page_number, opts) # -- emit each leading single-cell row as its own `Text`-subtype element -- for content in subtable_parser.iter_leading_single_cell_rows_texts(): - element = _check_content_element_type(str(content)) + element = _create_element(str(content)) element.metadata = metadata elements.append(element) @@ -167,7 +150,7 @@ def partition_xlsx( core_table = subtable_parser.core_table if core_table is not None: html_text = core_table.to_html( # pyright: ignore[reportUnknownMemberType] - index=False, header=include_header, na_rep="" + index=False, header=opts.include_header, na_rep="" ) text = cast( str, @@ -177,27 +160,144 @@ def partition_xlsx( ) element = Table(text=text) element.metadata = metadata - element.metadata.text_as_html = html_text if infer_table_structure else None + element.metadata.text_as_html = ( + html_text if opts.infer_table_structure else None + ) elements.append(element) # -- no core-table is emitted if it's empty (all rows are single-cell rows) -- # -- emit each trailing single-cell row as its own `Text`-subtype element -- for content in subtable_parser.iter_trailing_single_cell_rows_texts(): - element = _check_content_element_type(str(content)) + element = _create_element(str(content)) element.metadata = metadata elements.append(element) elements = list( apply_lang_metadata( elements=elements, - languages=languages, - detect_language_per_element=detect_language_per_element, + languages=opts.languages, + detect_language_per_element=opts.detect_language_per_element, ), ) return elements +class _XlsxPartitionerOptions: + """Encapsulates partitioning option validation, computation, and application of defaults.""" + + def __init__( + self, + *, + date_from_file_object: bool, + detect_language_per_element: bool, + file: Optional[IO[bytes]], + file_path: Optional[str], + find_subtable: bool, + include_header: bool, + include_metadata: bool, + infer_table_structure: bool, + languages: Optional[list[str]], + metadata_file_path: Optional[str], + metadata_last_modified: Optional[str], + ): + self._date_from_file_object = date_from_file_object + self._detect_language_per_element = detect_language_per_element + self._file = file + self._file_path = file_path + self._find_subtable = find_subtable + self._include_header = include_header + self._include_metadata = include_metadata + self._infer_table_structure = infer_table_structure + self._languages = languages + self._metadata_file_path = metadata_file_path + self._metadata_last_modified = metadata_last_modified + + @lazyproperty + def detect_language_per_element(self) -> bool: + """When True, detect language on element-by-element basis instead of document level.""" + return self._detect_language_per_element + + @lazyproperty + def find_subtable(self) -> bool: + """True when partitioner should detect and emit separate `Table` elements for subtables. + + A subtable is (roughly) a contiguous rectangle of populated cells bounded by empty rows. + """ + return self._find_subtable + + @lazyproperty + def header_row_idx(self) -> int | None: + """The index of the row Pandas should treat as column-headings. Either 0 or None.""" + return 0 if self._include_header else None + + @lazyproperty + def include_header(self) -> bool: + """True when column headers should be included in tables.""" + return self._include_header + + @lazyproperty + def include_metadata(self) -> bool: + """True when partitioner should apply metadata to emitted elements.""" + return self._include_metadata + + @lazyproperty + def infer_table_structure(self) -> bool: + """True when partitioner should compute and apply `text_as_html` metadata.""" + return self._infer_table_structure + + @lazyproperty + def languages(self) -> Optional[list[str]]: + """User-specified language(s) of this document. + + When `None`, language is detected using naive Bayesian filter via `langdetect`. Multiple + language codes indicate text could be in any of those languages. + """ + return self._languages + + @lazyproperty + def last_modified(self) -> Optional[str]: + """The best last-modified date available, None if no sources are available.""" + # -- value explicitly specified by caller takes precedence -- + if self._metadata_last_modified: + return self._metadata_last_modified + + if self._file_path: + return get_last_modified_date(self._file_path) + + if self._file: + return ( + get_last_modified_date_from_file(self._file) + if self._date_from_file_object + else None + ) + + return None + + @lazyproperty + def metadata_file_path(self) -> str | None: + """The best available file-path for this document or `None` if unavailable.""" + return self._metadata_file_path or self._file_path + + @lazyproperty + def sheets(self) -> dict[str, pd.DataFrame]: + """The spreadsheet worksheets, each as a data-frame mapped by sheet-name.""" + if file_path := self._file_path: + return pd.read_excel( # pyright: ignore[reportUnknownMemberType] + file_path, sheet_name=None, header=self.header_row_idx + ) + + if f := self._file: + if isinstance(f, SpooledTemporaryFile): + f.seek(0) + f = io.BytesIO(f.read()) + return pd.read_excel( # pyright: ignore[reportUnknownMemberType] + f, sheet_name=None, header=self.header_row_idx + ) + + raise ValueError("Either 'filename' or 'file' argument must be specified.") + + class _ConnectedComponent: """A collection of cells that are "2d-connected" in a worksheet. @@ -423,7 +523,7 @@ class _SubtableParser: return tuple(reversed(list(iter_trailing_single_cell_row_indices()))) -def _check_content_element_type(text: str) -> Element: +def _create_element(text: str) -> Element: """Create `Text`-subtype document element appropriate to `text`.""" if is_bulleted_text(text): return ListItem(text=clean_bullets(text)) @@ -438,20 +538,16 @@ def _check_content_element_type(text: str) -> Element: def _get_metadata( - include_metadata: bool = True, - sheet_name: Optional[str] = None, - page_number: Optional[int] = -1, - filename: Optional[str] = None, - last_modification_date: Optional[str] = None, + sheet_name: str, page_number: int, opts: _XlsxPartitionerOptions ) -> ElementMetadata: """Returns metadata depending on `include_metadata` flag""" - if include_metadata: - metadata = ElementMetadata( + return ( + ElementMetadata( page_name=sheet_name, page_number=page_number, - filename=filename, - last_modified=last_modification_date, + filename=opts.metadata_file_path, + last_modified=opts.last_modified, ) - else: - metadata = ElementMetadata() - return metadata + if opts.include_metadata + else ElementMetadata() + )