mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-11-02 19:13:13 +00:00
feat(docx): add strategy parameter to partition_docx() (#3026)
**Summary** The behavior of an image sub-partitioner can be partially determined by the partitioning strategy, for example whether it is "hi_res" or "fast". Add this parameter to `partition_docx()` so it can pass it along to `DocxPartitionerOptions` which will make it available to any image sub-partitioners.
This commit is contained in:
parent
a164b01c7e
commit
094e3542cb
@ -1,8 +1,9 @@
|
||||
## 0.13.8-dev10
|
||||
## 0.13.8-dev11
|
||||
|
||||
### Enhancements
|
||||
|
||||
* **Faster evaluation** Support for concurrent processing of documents during evaluation
|
||||
* **Add strategy parameter to `partition_docx()`.** Behavior of future enhancements may be sensitive to the partitioning strategy. Add this parameter so `partition_docx()` is aware of the requested strategy.
|
||||
|
||||
### Features
|
||||
|
||||
|
||||
@ -40,7 +40,10 @@ from unstructured.documents.elements import (
|
||||
Title,
|
||||
)
|
||||
from unstructured.partition.docx import DocxPartitionerOptions, _DocxPartitioner, partition_docx
|
||||
from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA
|
||||
from unstructured.partition.utils.constants import (
|
||||
UNSTRUCTURED_INCLUDE_DEBUG_METADATA,
|
||||
PartitionStrategy,
|
||||
)
|
||||
|
||||
# -- docx-file loading behaviors -----------------------------------------------------------------
|
||||
|
||||
@ -701,6 +704,7 @@ def opts_args() -> dict[str, Any]:
|
||||
"infer_table_structure": True,
|
||||
"metadata_file_path": None,
|
||||
"metadata_last_modified": None,
|
||||
"strategy": None,
|
||||
}
|
||||
|
||||
|
||||
@ -905,6 +909,20 @@ class DescribeDocxPartitionerOptions:
|
||||
list(opts.increment_page_number())
|
||||
assert opts.page_number == 4
|
||||
|
||||
# -- .strategy -------------------------------
|
||||
|
||||
@pytest.mark.parametrize(
    ("arg_value", "expected_value"),
    [
        # -- no strategy specified falls back to "hi_res" --
        (None, "hi_res"),
        # -- an explicit strategy is passed through unchanged --
        (PartitionStrategy.FAST, "fast"),
        (PartitionStrategy.HI_RES, "hi_res"),
    ],
)
def it_knows_which_partitioning_strategy_to_use(
    self, opts_args: dict[str, Any], arg_value: str, expected_value: str
):
    """`.strategy` reports the `strategy` argument, or "hi_res" when that argument is None."""
    opts_args.update(strategy=arg_value)

    assert DocxPartitionerOptions(**opts_args).strategy == expected_value
|
||||
|
||||
# -- ._document_contains_pagebreaks ----------
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.13.8-dev10" # pragma: no cover
|
||||
__version__ = "0.13.8-dev11" # pragma: no cover
|
||||
|
||||
@ -57,6 +57,7 @@ from unstructured.partition.text_type import (
|
||||
is_possible_title,
|
||||
is_us_city_state_zip,
|
||||
)
|
||||
from unstructured.partition.utils.constants import PartitionStrategy
|
||||
from unstructured.utils import (
|
||||
dependency_exists,
|
||||
is_temp_file_path,
|
||||
@ -170,15 +171,17 @@ def convert_and_partition_docx(
|
||||
@add_chunking_strategy
|
||||
def partition_docx(
|
||||
filename: Optional[str] = None,
|
||||
*,
|
||||
date_from_file_object: bool = False,
|
||||
detect_language_per_element: bool = False,
|
||||
file: Optional[IO[bytes]] = None,
|
||||
include_page_breaks: bool = True,
|
||||
infer_table_structure: bool = True,
|
||||
languages: Optional[list[str]] = ["auto"],
|
||||
metadata_filename: Optional[str] = None,
|
||||
metadata_last_modified: Optional[str] = None,
|
||||
languages: Optional[list[str]] = ["auto"],
|
||||
detect_language_per_element: bool = False,
|
||||
date_from_file_object: bool = False,
|
||||
starting_page_number: int = 1,
|
||||
strategy: Optional[str] = None,
|
||||
**kwargs: Any,
|
||||
) -> list[Element]:
|
||||
"""Partitions Microsoft Word Documents in .docx format into its document elements.
|
||||
@ -226,6 +229,7 @@ def partition_docx(
|
||||
metadata_file_path=metadata_filename,
|
||||
metadata_last_modified=metadata_last_modified,
|
||||
starting_page_number=starting_page_number,
|
||||
strategy=strategy,
|
||||
)
|
||||
|
||||
elements = _DocxPartitioner.iter_document_elements(opts)
|
||||
@ -252,6 +256,7 @@ class DocxPartitionerOptions:
|
||||
metadata_file_path: Optional[str],
|
||||
metadata_last_modified: Optional[str],
|
||||
starting_page_number: int = 1,
|
||||
strategy: str | None = None,
|
||||
):
|
||||
self._date_from_file_object = date_from_file_object
|
||||
self._file = file
|
||||
@ -260,6 +265,7 @@ class DocxPartitionerOptions:
|
||||
self._infer_table_structure = infer_table_structure
|
||||
self._metadata_file_path = metadata_file_path
|
||||
self._metadata_last_modified = metadata_last_modified
|
||||
self._strategy = strategy
|
||||
# -- options object maintains page-number state --
|
||||
self._page_counter = starting_page_number
|
||||
|
||||
@ -345,6 +351,15 @@ class DocxPartitionerOptions:
|
||||
"""
|
||||
return self._page_counter
|
||||
|
||||
@lazyproperty
def strategy(self) -> str:
    """Partitioning strategy in effect for this document.

    This is the `strategy` value supplied at construction, or `PartitionStrategy.HI_RES` when
    that value was `None`. The named strategies ("hi_res", "fast", and a few others) are class
    attributes of `unstructured.partition.utils.constants.PartitionStrategy` and resolve to
    plain `str` values.
    """
    requested = self._strategy
    # -- an unspecified strategy defaults to hi-res --
    if requested is None:
        return PartitionStrategy.HI_RES
    return requested
|
||||
|
||||
@lazyproperty
|
||||
def _document_contains_pagebreaks(self) -> bool:
|
||||
"""True when there is at least one page-break detected in the document.
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user