fix(auto): partition() passes strategy to PPTX,DOCX (#3273)

**Summary**
Remedy a gap where the `strategy` argument passed to `partition()` was not
forwarded to `partition_pptx()` or `partition_docx()`.
This commit is contained in:
Steve Canny 2024-06-21 17:16:39 -07:00 committed by GitHub
parent 6fe1c9980e
commit 16df6944dd
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 67 additions and 2 deletions

View File

@ -1,4 +1,4 @@
## 0.14.8-dev1
## 0.14.8-dev2
### Enhancements
@ -6,6 +6,8 @@
### Fixes
* **`partition()` now forwards `strategy` arg to `partition_docx()` and `partition_pptx()`.** A `strategy` argument passed to `partition()` (or the default value "auto" assigned by `partition()`) is now forwarded to `partition_docx()` and `partition_pptx()` when those filetypes are detected.
## 0.14.7
### Enhancements

View File

@ -1,3 +1,5 @@
# pyright: reportPrivateUsage=false
from __future__ import annotations
import json
@ -6,6 +8,7 @@ import pathlib
import tempfile
import warnings
from importlib import import_module
from typing import Iterator
from unittest.mock import Mock, patch
import docx
@ -20,10 +23,12 @@ from test_unstructured.partition.test_constants import (
EXPECTED_TEXT_XLSX,
EXPECTED_TITLE,
)
from test_unstructured.unit_utils import ANY, FixtureRequest, example_doc_path, method_mock
from unstructured.chunking.title import chunk_by_title
from unstructured.cleaners.core import clean_extra_whitespace
from unstructured.documents.elements import (
Address,
Element,
ElementMetadata,
ListItem,
NarrativeText,
@ -173,6 +178,34 @@ def test_auto_partition_doc_with_file(mock_docx_document, expected_docx_elements
assert elements == expected_docx_elements
@pytest.mark.parametrize(
    "strategy",
    [
        PartitionStrategy.AUTO,
        PartitionStrategy.FAST,
        PartitionStrategy.HI_RES,
        PartitionStrategy.OCR_ONLY,
    ],
)
def test_partition_forwards_strategy_arg_to_partition_docx(request: FixtureRequest, strategy: str):
    """`partition()` hands its `strategy` argument on to the DOCX partitioner.

    `_DocxPartitioner._iter_document_elements` is substituted (via `method_mock`) with a
    fake that emits a single `Text` element whose text records the strategy found on the
    partitioner's `_opts`. The test then checks that text against the `strategy` value
    given to `partition()`.
    """
    from unstructured.partition.docx import _DocxPartitioner

    def emit_strategy_marker(self: _DocxPartitioner) -> Iterator[Element]:
        # -- surface the strategy the partitioner options actually carry --
        yield Text(f"strategy=={self._opts.strategy}")

    iter_document_elements_mock = method_mock(
        request,
        _DocxPartitioner,
        "_iter_document_elements",
        side_effect=emit_strategy_marker,
    )
    expected_text = f"strategy=={strategy}"

    # -- unpacking doubles as a check that exactly one element came back --
    [element] = partition(example_doc_path("simple.docx"), strategy=strategy)

    # -- the lone positional arg is presumably the partitioner instance (`self`) --
    iter_document_elements_mock.assert_called_once_with(ANY)
    assert element.text == expected_text
@pytest.mark.parametrize(
("pass_metadata_filename", "content_type"),
[(False, None), (False, "text/html"), (True, "text/html"), (True, None)],
@ -556,6 +589,34 @@ def test_auto_partition_pptx_from_filename():
assert elements[0].metadata.file_directory == os.path.split(filename)[0]
@pytest.mark.parametrize(
    "strategy",
    [
        PartitionStrategy.AUTO,
        PartitionStrategy.FAST,
        PartitionStrategy.HI_RES,
        PartitionStrategy.OCR_ONLY,
    ],
)
def test_partition_forwards_strategy_arg_to_partition_pptx(request: FixtureRequest, strategy: str):
    """`partition()` hands its `strategy` argument on to the PPTX partitioner.

    `_PptxPartitioner._iter_presentation_elements` is substituted (via `method_mock`) with
    a fake that emits a single `Text` element whose text records the strategy found on the
    partitioner's `_opts`. The test then checks that text against the `strategy` value
    given to `partition()`.
    """
    from unstructured.partition.pptx import _PptxPartitioner

    def emit_strategy_marker(self: _PptxPartitioner) -> Iterator[Element]:
        # -- surface the strategy the partitioner options actually carry --
        yield Text(f"strategy=={self._opts.strategy}")

    iter_presentation_elements_mock = method_mock(
        request,
        _PptxPartitioner,
        "_iter_presentation_elements",
        side_effect=emit_strategy_marker,
    )
    expected_text = f"strategy=={strategy}"

    # -- unpacking doubles as a check that exactly one element came back --
    [element] = partition(example_doc_path("simple.pptx"), strategy=strategy)

    # -- the lone positional arg is presumably the partitioner instance (`self`) --
    iter_presentation_elements_mock.assert_called_once_with(ANY)
    assert element.text == expected_text
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
def test_auto_partition_ppt_from_filename():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.ppt")

View File

@ -1 +1 @@
__version__ = "0.14.8-dev1" # pragma: no cover
__version__ = "0.14.8-dev2" # pragma: no cover

View File

@ -327,6 +327,7 @@ def partition(
languages=languages,
detect_language_per_element=detect_language_per_element,
starting_page_number=starting_page_number,
strategy=strategy,
**kwargs,
)
elif filetype == FileType.ODT:
@ -499,6 +500,7 @@ def partition(
languages=languages,
detect_language_per_element=detect_language_per_element,
starting_page_number=starting_page_number,
strategy=strategy,
**kwargs,
)
elif filetype == FileType.JSON: