feat(docx): add strategy parameter to partition_docx() (#3026)

**Summary**
The behavior of an image sub-partitioner can be partially determined by
the partitioning strategy, for example whether it is "hi_res" or "fast".
Add this parameter to `partition_docx()` so it can pass it along to
`DocxPartitionerOptions` which will make it available to any image
sub-partitioners.
This commit is contained in:
Steve Canny 2024-05-15 14:05:32 -07:00 committed by GitHub
parent a164b01c7e
commit 094e3542cb
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 40 additions and 6 deletions

View File

@ -1,8 +1,9 @@
## 0.13.8-dev10 ## 0.13.8-dev11
### Enhancements ### Enhancements
* **Faster evaluation** Support for concurrent processing of documents during evaluation * **Faster evaluation** Support for concurrent processing of documents during evaluation
* **Add strategy parameter to `partition_docx()`.** Behavior of future enhancements may be sensitive to the partitioning strategy. Add this parameter so `partition_docx()` is aware of the requested strategy.
### Features ### Features

View File

@ -40,7 +40,10 @@ from unstructured.documents.elements import (
Title, Title,
) )
from unstructured.partition.docx import DocxPartitionerOptions, _DocxPartitioner, partition_docx from unstructured.partition.docx import DocxPartitionerOptions, _DocxPartitioner, partition_docx
from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA from unstructured.partition.utils.constants import (
UNSTRUCTURED_INCLUDE_DEBUG_METADATA,
PartitionStrategy,
)
# -- docx-file loading behaviors ----------------------------------------------------------------- # -- docx-file loading behaviors -----------------------------------------------------------------
@ -701,6 +704,7 @@ def opts_args() -> dict[str, Any]:
"infer_table_structure": True, "infer_table_structure": True,
"metadata_file_path": None, "metadata_file_path": None,
"metadata_last_modified": None, "metadata_last_modified": None,
"strategy": None,
} }
@ -905,6 +909,20 @@ class DescribeDocxPartitionerOptions:
list(opts.increment_page_number()) list(opts.increment_page_number())
assert opts.page_number == 4 assert opts.page_number == 4
# -- .strategy -------------------------------
@pytest.mark.parametrize(
    ("arg_value", "expected_value"),
    [
        (None, "hi_res"),
        (PartitionStrategy.FAST, "fast"),
        (PartitionStrategy.HI_RES, "hi_res"),
    ],
)
def it_knows_which_partitioning_strategy_to_use(
    self, opts_args: dict[str, Any], arg_value: str, expected_value: str
):
    # -- an explicit strategy is reported as given; `None` is reported as "hi_res" --
    opts_args.update(strategy=arg_value)

    strategy = DocxPartitionerOptions(**opts_args).strategy

    assert strategy == expected_value
# -- ._document_contains_pagebreaks ---------- # -- ._document_contains_pagebreaks ----------
@pytest.mark.parametrize( @pytest.mark.parametrize(

View File

@ -1 +1 @@
__version__ = "0.13.8-dev10" # pragma: no cover __version__ = "0.13.8-dev11" # pragma: no cover

View File

@ -57,6 +57,7 @@ from unstructured.partition.text_type import (
is_possible_title, is_possible_title,
is_us_city_state_zip, is_us_city_state_zip,
) )
from unstructured.partition.utils.constants import PartitionStrategy
from unstructured.utils import ( from unstructured.utils import (
dependency_exists, dependency_exists,
is_temp_file_path, is_temp_file_path,
@ -170,15 +171,17 @@ def convert_and_partition_docx(
@add_chunking_strategy @add_chunking_strategy
def partition_docx( def partition_docx(
filename: Optional[str] = None, filename: Optional[str] = None,
*,
date_from_file_object: bool = False,
detect_language_per_element: bool = False,
file: Optional[IO[bytes]] = None, file: Optional[IO[bytes]] = None,
include_page_breaks: bool = True, include_page_breaks: bool = True,
infer_table_structure: bool = True, infer_table_structure: bool = True,
languages: Optional[list[str]] = ["auto"],
metadata_filename: Optional[str] = None, metadata_filename: Optional[str] = None,
metadata_last_modified: Optional[str] = None, metadata_last_modified: Optional[str] = None,
languages: Optional[list[str]] = ["auto"],
detect_language_per_element: bool = False,
date_from_file_object: bool = False,
starting_page_number: int = 1, starting_page_number: int = 1,
strategy: Optional[str] = None,
**kwargs: Any, **kwargs: Any,
) -> list[Element]: ) -> list[Element]:
"""Partitions Microsoft Word Documents in .docx format into its document elements. """Partitions Microsoft Word Documents in .docx format into its document elements.
@ -226,6 +229,7 @@ def partition_docx(
metadata_file_path=metadata_filename, metadata_file_path=metadata_filename,
metadata_last_modified=metadata_last_modified, metadata_last_modified=metadata_last_modified,
starting_page_number=starting_page_number, starting_page_number=starting_page_number,
strategy=strategy,
) )
elements = _DocxPartitioner.iter_document_elements(opts) elements = _DocxPartitioner.iter_document_elements(opts)
@ -252,6 +256,7 @@ class DocxPartitionerOptions:
metadata_file_path: Optional[str], metadata_file_path: Optional[str],
metadata_last_modified: Optional[str], metadata_last_modified: Optional[str],
starting_page_number: int = 1, starting_page_number: int = 1,
strategy: str | None = None,
): ):
self._date_from_file_object = date_from_file_object self._date_from_file_object = date_from_file_object
self._file = file self._file = file
@ -260,6 +265,7 @@ class DocxPartitionerOptions:
self._infer_table_structure = infer_table_structure self._infer_table_structure = infer_table_structure
self._metadata_file_path = metadata_file_path self._metadata_file_path = metadata_file_path
self._metadata_last_modified = metadata_last_modified self._metadata_last_modified = metadata_last_modified
self._strategy = strategy
# -- options object maintains page-number state -- # -- options object maintains page-number state --
self._page_counter = starting_page_number self._page_counter = starting_page_number
@ -345,6 +351,15 @@ class DocxPartitionerOptions:
""" """
return self._page_counter return self._page_counter
@lazyproperty
def strategy(self) -> str:
    """The partitioning strategy to apply to this document.

    Returns the `strategy` value provided at construction, or `PartitionStrategy.HI_RES` when
    that value is `None`. Strategy names such as "hi_res" and "fast" are available as class
    attributes on `unstructured.partition.utils.constants.PartitionStrategy` and resolve to
    plain `str` values.
    """
    if self._strategy is None:
        # -- caller expressed no preference; fall back to the default strategy --
        return PartitionStrategy.HI_RES
    return self._strategy
@lazyproperty @lazyproperty
def _document_contains_pagebreaks(self) -> bool: def _document_contains_pagebreaks(self) -> bool:
"""True when there is at least one page-break detected in the document. """True when there is at least one page-break detected in the document.