mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-11-06 21:29:42 +00:00
feat(docx): add strategy parameter to partition_docx() (#3026)
**Summary** The behavior of an image sub-partitioner can be partially determined by the partitioning strategy, for example whether it is "hi_res" or "fast". Add a `strategy` parameter to `partition_docx()` so it can pass the strategy along to `DocxPartitionerOptions`, which makes it available to any image sub-partitioners.
This commit is contained in:
parent
a164b01c7e
commit
094e3542cb
@ -1,8 +1,9 @@
|
|||||||
## 0.13.8-dev10
|
## 0.13.8-dev11
|
||||||
|
|
||||||
### Enhancements
|
### Enhancements
|
||||||
|
|
||||||
* **Faster evaluation** Support for concurrent processing of documents during evaluation
|
* **Faster evaluation** Support for concurrent processing of documents during evaluation
|
||||||
|
* **Add strategy parameter to `partition_docx()`.** Behavior of future enhancements may be sensitive to the partitioning strategy. Add this parameter so `partition_docx()` is aware of the requested strategy.
|
||||||
|
|
||||||
### Features
|
### Features
|
||||||
|
|
||||||
|
|||||||
@ -40,7 +40,10 @@ from unstructured.documents.elements import (
|
|||||||
Title,
|
Title,
|
||||||
)
|
)
|
||||||
from unstructured.partition.docx import DocxPartitionerOptions, _DocxPartitioner, partition_docx
|
from unstructured.partition.docx import DocxPartitionerOptions, _DocxPartitioner, partition_docx
|
||||||
from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA
|
from unstructured.partition.utils.constants import (
|
||||||
|
UNSTRUCTURED_INCLUDE_DEBUG_METADATA,
|
||||||
|
PartitionStrategy,
|
||||||
|
)
|
||||||
|
|
||||||
# -- docx-file loading behaviors -----------------------------------------------------------------
|
# -- docx-file loading behaviors -----------------------------------------------------------------
|
||||||
|
|
||||||
@ -701,6 +704,7 @@ def opts_args() -> dict[str, Any]:
|
|||||||
"infer_table_structure": True,
|
"infer_table_structure": True,
|
||||||
"metadata_file_path": None,
|
"metadata_file_path": None,
|
||||||
"metadata_last_modified": None,
|
"metadata_last_modified": None,
|
||||||
|
"strategy": None,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -905,6 +909,20 @@ class DescribeDocxPartitionerOptions:
|
|||||||
list(opts.increment_page_number())
|
list(opts.increment_page_number())
|
||||||
assert opts.page_number == 4
|
assert opts.page_number == 4
|
||||||
|
|
||||||
|
# -- .strategy -------------------------------
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
("arg_value", "expected_value"),
|
||||||
|
[(None, "hi_res"), (PartitionStrategy.FAST, "fast"), (PartitionStrategy.HI_RES, "hi_res")],
|
||||||
|
)
|
||||||
|
def it_knows_which_partitioning_strategy_to_use(
|
||||||
|
self, opts_args: dict[str, Any], arg_value: str, expected_value: str
|
||||||
|
):
|
||||||
|
opts_args["strategy"] = arg_value
|
||||||
|
opts = DocxPartitionerOptions(**opts_args)
|
||||||
|
|
||||||
|
assert opts.strategy == expected_value
|
||||||
|
|
||||||
# -- ._document_contains_pagebreaks ----------
|
# -- ._document_contains_pagebreaks ----------
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
|
|||||||
@ -1 +1 @@
|
|||||||
__version__ = "0.13.8-dev10" # pragma: no cover
|
__version__ = "0.13.8-dev11" # pragma: no cover
|
||||||
|
|||||||
@ -57,6 +57,7 @@ from unstructured.partition.text_type import (
|
|||||||
is_possible_title,
|
is_possible_title,
|
||||||
is_us_city_state_zip,
|
is_us_city_state_zip,
|
||||||
)
|
)
|
||||||
|
from unstructured.partition.utils.constants import PartitionStrategy
|
||||||
from unstructured.utils import (
|
from unstructured.utils import (
|
||||||
dependency_exists,
|
dependency_exists,
|
||||||
is_temp_file_path,
|
is_temp_file_path,
|
||||||
@ -170,15 +171,17 @@ def convert_and_partition_docx(
|
|||||||
@add_chunking_strategy
|
@add_chunking_strategy
|
||||||
def partition_docx(
|
def partition_docx(
|
||||||
filename: Optional[str] = None,
|
filename: Optional[str] = None,
|
||||||
|
*,
|
||||||
|
date_from_file_object: bool = False,
|
||||||
|
detect_language_per_element: bool = False,
|
||||||
file: Optional[IO[bytes]] = None,
|
file: Optional[IO[bytes]] = None,
|
||||||
include_page_breaks: bool = True,
|
include_page_breaks: bool = True,
|
||||||
infer_table_structure: bool = True,
|
infer_table_structure: bool = True,
|
||||||
|
languages: Optional[list[str]] = ["auto"],
|
||||||
metadata_filename: Optional[str] = None,
|
metadata_filename: Optional[str] = None,
|
||||||
metadata_last_modified: Optional[str] = None,
|
metadata_last_modified: Optional[str] = None,
|
||||||
languages: Optional[list[str]] = ["auto"],
|
|
||||||
detect_language_per_element: bool = False,
|
|
||||||
date_from_file_object: bool = False,
|
|
||||||
starting_page_number: int = 1,
|
starting_page_number: int = 1,
|
||||||
|
strategy: Optional[str] = None,
|
||||||
**kwargs: Any,
|
**kwargs: Any,
|
||||||
) -> list[Element]:
|
) -> list[Element]:
|
||||||
"""Partitions Microsoft Word Documents in .docx format into its document elements.
|
"""Partitions Microsoft Word Documents in .docx format into its document elements.
|
||||||
@ -226,6 +229,7 @@ def partition_docx(
|
|||||||
metadata_file_path=metadata_filename,
|
metadata_file_path=metadata_filename,
|
||||||
metadata_last_modified=metadata_last_modified,
|
metadata_last_modified=metadata_last_modified,
|
||||||
starting_page_number=starting_page_number,
|
starting_page_number=starting_page_number,
|
||||||
|
strategy=strategy,
|
||||||
)
|
)
|
||||||
|
|
||||||
elements = _DocxPartitioner.iter_document_elements(opts)
|
elements = _DocxPartitioner.iter_document_elements(opts)
|
||||||
@ -252,6 +256,7 @@ class DocxPartitionerOptions:
|
|||||||
metadata_file_path: Optional[str],
|
metadata_file_path: Optional[str],
|
||||||
metadata_last_modified: Optional[str],
|
metadata_last_modified: Optional[str],
|
||||||
starting_page_number: int = 1,
|
starting_page_number: int = 1,
|
||||||
|
strategy: str | None = None,
|
||||||
):
|
):
|
||||||
self._date_from_file_object = date_from_file_object
|
self._date_from_file_object = date_from_file_object
|
||||||
self._file = file
|
self._file = file
|
||||||
@ -260,6 +265,7 @@ class DocxPartitionerOptions:
|
|||||||
self._infer_table_structure = infer_table_structure
|
self._infer_table_structure = infer_table_structure
|
||||||
self._metadata_file_path = metadata_file_path
|
self._metadata_file_path = metadata_file_path
|
||||||
self._metadata_last_modified = metadata_last_modified
|
self._metadata_last_modified = metadata_last_modified
|
||||||
|
self._strategy = strategy
|
||||||
# -- options object maintains page-number state --
|
# -- options object maintains page-number state --
|
||||||
self._page_counter = starting_page_number
|
self._page_counter = starting_page_number
|
||||||
|
|
||||||
@ -345,6 +351,15 @@ class DocxPartitionerOptions:
|
|||||||
"""
|
"""
|
||||||
return self._page_counter
|
return self._page_counter
|
||||||
|
|
||||||
|
@lazyproperty
|
||||||
|
def strategy(self) -> str:
|
||||||
|
"""The partitioning strategy for this document.
|
||||||
|
|
||||||
|
One of "hi_res", "fast", and a few others. These are available as class attributes on
|
||||||
|
`unstructured.partition.utils.constants.PartitionStrategy` but resolve to str values.
|
||||||
|
"""
|
||||||
|
return PartitionStrategy.HI_RES if self._strategy is None else self._strategy
|
||||||
|
|
||||||
@lazyproperty
|
@lazyproperty
|
||||||
def _document_contains_pagebreaks(self) -> bool:
|
def _document_contains_pagebreaks(self) -> bool:
|
||||||
"""True when there is at least one page-break detected in the document.
|
"""True when there is at least one page-break detected in the document.
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user