mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-03 18:49:53 +00:00
fix(auto): partition() passes strategy to PPTX,DOCX (#3273)
**Summary** Remedy a gap where the `strategy` argument passed to `partition()` was not forwarded to `partition_pptx()` or `partition_docx()`.
This commit is contained in:
parent
6fe1c9980e
commit
16df6944dd
@ -1,4 +1,4 @@
|
||||
## 0.14.8-dev1
|
||||
## 0.14.8-dev2
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -6,6 +6,8 @@
|
||||
|
||||
### Fixes
|
||||
|
||||
* **`partition()` now forwards `strategy` arg to `partition_docx()` and `partition_pptx()`.** A `strategy` argument passed to `partition()` (or the default value "auto" assigned by `partition()`) is now forwarded to `partition_docx()` and `partition_pptx()` when those filetypes are detected.
|
||||
|
||||
## 0.14.7
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -1,3 +1,5 @@
|
||||
# pyright: reportPrivateUsage=false
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
@ -6,6 +8,7 @@ import pathlib
|
||||
import tempfile
|
||||
import warnings
|
||||
from importlib import import_module
|
||||
from typing import Iterator
|
||||
from unittest.mock import Mock, patch
|
||||
|
||||
import docx
|
||||
@ -20,10 +23,12 @@ from test_unstructured.partition.test_constants import (
|
||||
EXPECTED_TEXT_XLSX,
|
||||
EXPECTED_TITLE,
|
||||
)
|
||||
from test_unstructured.unit_utils import ANY, FixtureRequest, example_doc_path, method_mock
|
||||
from unstructured.chunking.title import chunk_by_title
|
||||
from unstructured.cleaners.core import clean_extra_whitespace
|
||||
from unstructured.documents.elements import (
|
||||
Address,
|
||||
Element,
|
||||
ElementMetadata,
|
||||
ListItem,
|
||||
NarrativeText,
|
||||
@ -173,6 +178,34 @@ def test_auto_partition_doc_with_file(mock_docx_document, expected_docx_elements
|
||||
assert elements == expected_docx_elements
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "strategy",
    [
        PartitionStrategy.AUTO,
        PartitionStrategy.FAST,
        PartitionStrategy.HI_RES,
        PartitionStrategy.OCR_ONLY,
    ],
)
def test_partition_forwards_strategy_arg_to_partition_docx(request: FixtureRequest, strategy: str):
    """The `strategy` given to `partition()` is visible to the DOCX partitioner.

    `_DocxPartitioner._iter_document_elements` is swapped (via `method_mock`) for a stand-in that
    emits a single `Text` element whose text reports `self._opts.strategy`. Partitioning
    "simple.docx" must then produce exactly that one element, carrying the strategy value this
    test passed in.
    """
    from unstructured.partition.docx import _DocxPartitioner

    def report_received_strategy(self: _DocxPartitioner) -> Iterator[Element]:
        # -- echo back the strategy held in the partitioner's options so it can be asserted on --
        yield Text(f"strategy=={self._opts.strategy}")

    iter_document_elements_mock = method_mock(
        request,
        _DocxPartitioner,
        "_iter_document_elements",
        side_effect=report_received_strategy,
    )
    docx_path = example_doc_path("simple.docx")

    # -- unpacking doubles as a check that exactly one element came back --
    [element] = partition(docx_path, strategy=strategy)

    iter_document_elements_mock.assert_called_once_with(ANY)
    assert element.text == f"strategy=={strategy}"
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("pass_metadata_filename", "content_type"),
|
||||
[(False, None), (False, "text/html"), (True, "text/html"), (True, None)],
|
||||
@ -556,6 +589,34 @@ def test_auto_partition_pptx_from_filename():
|
||||
assert elements[0].metadata.file_directory == os.path.split(filename)[0]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "strategy",
    [
        PartitionStrategy.AUTO,
        PartitionStrategy.FAST,
        PartitionStrategy.HI_RES,
        PartitionStrategy.OCR_ONLY,
    ],
)
def test_partition_forwards_strategy_arg_to_partition_pptx(request: FixtureRequest, strategy: str):
    """The `strategy` given to `partition()` is visible to the PPTX partitioner.

    `_PptxPartitioner._iter_presentation_elements` is swapped (via `method_mock`) for a stand-in
    that emits a single `Text` element whose text reports `self._opts.strategy`. Partitioning
    "simple.pptx" must then produce exactly that one element, carrying the strategy value this
    test passed in.
    """
    from unstructured.partition.pptx import _PptxPartitioner

    def report_received_strategy(self: _PptxPartitioner) -> Iterator[Element]:
        # -- echo back the strategy held in the partitioner's options so it can be asserted on --
        yield Text(f"strategy=={self._opts.strategy}")

    iter_presentation_elements_mock = method_mock(
        request,
        _PptxPartitioner,
        "_iter_presentation_elements",
        side_effect=report_received_strategy,
    )
    pptx_path = example_doc_path("simple.pptx")

    # -- unpacking doubles as a check that exactly one element came back --
    [element] = partition(pptx_path, strategy=strategy)

    iter_presentation_elements_mock.assert_called_once_with(ANY)
    assert element.text == f"strategy=={strategy}"
|
||||
|
||||
|
||||
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
|
||||
def test_auto_partition_ppt_from_filename():
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.ppt")
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.14.8-dev1" # pragma: no cover
|
||||
__version__ = "0.14.8-dev2" # pragma: no cover
|
||||
|
||||
@ -327,6 +327,7 @@ def partition(
|
||||
languages=languages,
|
||||
detect_language_per_element=detect_language_per_element,
|
||||
starting_page_number=starting_page_number,
|
||||
strategy=strategy,
|
||||
**kwargs,
|
||||
)
|
||||
elif filetype == FileType.ODT:
|
||||
@ -499,6 +500,7 @@ def partition(
|
||||
languages=languages,
|
||||
detect_language_per_element=detect_language_per_element,
|
||||
starting_page_number=starting_page_number,
|
||||
strategy=strategy,
|
||||
**kwargs,
|
||||
)
|
||||
elif filetype == FileType.JSON:
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user