mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-04 03:00:24 +00:00
fix(auto): partition() passes strategy to PPTX,DOCX (#3273)
**Summary** Remedy a gap where the `strategy` argument passed to `partition()` was not forwarded to `partition_pptx()` or `partition_docx()`.
This commit is contained in:
parent
6fe1c9980e
commit
16df6944dd
@ -1,4 +1,4 @@
|
|||||||
## 0.14.8-dev1
|
## 0.14.8-dev2
|
||||||
|
|
||||||
### Enhancements
|
### Enhancements
|
||||||
|
|
||||||
@ -6,6 +6,8 @@
|
|||||||
|
|
||||||
### Fixes
|
### Fixes
|
||||||
|
|
||||||
|
* **`partition()` now forwards `strategy` arg to `partition_docx()` and `partition_pptx()`.** A `strategy` argument passed to `partition()` (or the default value "auto" assigned by `partition()`) is now forwarded to `partition_docx()` and `partition_pptx()` when those filetypes are detected.
|
||||||
|
|
||||||
## 0.14.7
|
## 0.14.7
|
||||||
|
|
||||||
### Enhancements
|
### Enhancements
|
||||||
|
|||||||
@ -1,3 +1,5 @@
|
|||||||
|
# pyright: reportPrivateUsage=false
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import json
|
import json
|
||||||
@ -6,6 +8,7 @@ import pathlib
|
|||||||
import tempfile
|
import tempfile
|
||||||
import warnings
|
import warnings
|
||||||
from importlib import import_module
|
from importlib import import_module
|
||||||
|
from typing import Iterator
|
||||||
from unittest.mock import Mock, patch
|
from unittest.mock import Mock, patch
|
||||||
|
|
||||||
import docx
|
import docx
|
||||||
@ -20,10 +23,12 @@ from test_unstructured.partition.test_constants import (
|
|||||||
EXPECTED_TEXT_XLSX,
|
EXPECTED_TEXT_XLSX,
|
||||||
EXPECTED_TITLE,
|
EXPECTED_TITLE,
|
||||||
)
|
)
|
||||||
|
from test_unstructured.unit_utils import ANY, FixtureRequest, example_doc_path, method_mock
|
||||||
from unstructured.chunking.title import chunk_by_title
|
from unstructured.chunking.title import chunk_by_title
|
||||||
from unstructured.cleaners.core import clean_extra_whitespace
|
from unstructured.cleaners.core import clean_extra_whitespace
|
||||||
from unstructured.documents.elements import (
|
from unstructured.documents.elements import (
|
||||||
Address,
|
Address,
|
||||||
|
Element,
|
||||||
ElementMetadata,
|
ElementMetadata,
|
||||||
ListItem,
|
ListItem,
|
||||||
NarrativeText,
|
NarrativeText,
|
||||||
@ -173,6 +178,34 @@ def test_auto_partition_doc_with_file(mock_docx_document, expected_docx_elements
|
|||||||
assert elements == expected_docx_elements
|
assert elements == expected_docx_elements
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
    "strategy",
    [
        PartitionStrategy.AUTO,
        PartitionStrategy.FAST,
        PartitionStrategy.HI_RES,
        PartitionStrategy.OCR_ONLY,
    ],
)
def test_partition_forwards_strategy_arg_to_partition_docx(
    request: FixtureRequest, strategy: str
):
    """`partition()` hands its `strategy` argument on to the DOCX partitioner.

    The partitioner's element generator is replaced with a stand-in that emits a
    single `Text` element reporting the strategy recorded in the partitioner's
    options, so the value that arrived can be read back from the result.
    """
    # -- imported locally, as in the other tests that reach into partitioner internals --
    from unstructured.partition.docx import _DocxPartitioner

    def report_strategy(self: _DocxPartitioner) -> Iterator[Element]:
        # -- echo the strategy the partitioner was configured with --
        yield Text(f"strategy=={self._opts.strategy}")

    iter_document_elements_mock = method_mock(
        request, _DocxPartitioner, "_iter_document_elements", side_effect=report_strategy
    )
    docx_path = example_doc_path("simple.docx")

    elements = partition(docx_path, strategy=strategy)

    # -- unpacking fails unless exactly one element came back --
    (only_element,) = elements
    iter_document_elements_mock.assert_called_once_with(ANY)
    assert only_element.text == f"strategy=={strategy}"
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
("pass_metadata_filename", "content_type"),
|
("pass_metadata_filename", "content_type"),
|
||||||
[(False, None), (False, "text/html"), (True, "text/html"), (True, None)],
|
[(False, None), (False, "text/html"), (True, "text/html"), (True, None)],
|
||||||
@ -556,6 +589,34 @@ def test_auto_partition_pptx_from_filename():
|
|||||||
assert elements[0].metadata.file_directory == os.path.split(filename)[0]
|
assert elements[0].metadata.file_directory == os.path.split(filename)[0]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
    "strategy",
    [
        PartitionStrategy.AUTO,
        PartitionStrategy.FAST,
        PartitionStrategy.HI_RES,
        PartitionStrategy.OCR_ONLY,
    ],
)
def test_partition_forwards_strategy_arg_to_partition_pptx(
    request: FixtureRequest, strategy: str
):
    """`partition()` hands its `strategy` argument on to the PPTX partitioner.

    The partitioner's element generator is replaced with a stand-in that emits a
    single `Text` element reporting the strategy recorded in the partitioner's
    options, so the value that arrived can be read back from the result.
    """
    # -- imported locally, as in the other tests that reach into partitioner internals --
    from unstructured.partition.pptx import _PptxPartitioner

    def report_strategy(self: _PptxPartitioner) -> Iterator[Element]:
        # -- echo the strategy the partitioner was configured with --
        yield Text(f"strategy=={self._opts.strategy}")

    iter_presentation_elements_mock = method_mock(
        request, _PptxPartitioner, "_iter_presentation_elements", side_effect=report_strategy
    )
    pptx_path = example_doc_path("simple.pptx")

    elements = partition(pptx_path, strategy=strategy)

    # -- unpacking fails unless exactly one element came back --
    (only_element,) = elements
    iter_presentation_elements_mock.assert_called_once_with(ANY)
    assert only_element.text == f"strategy=={strategy}"
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
|
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
|
||||||
def test_auto_partition_ppt_from_filename():
|
def test_auto_partition_ppt_from_filename():
|
||||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.ppt")
|
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.ppt")
|
||||||
|
|||||||
@ -1 +1 @@
|
|||||||
__version__ = "0.14.8-dev1" # pragma: no cover
|
__version__ = "0.14.8-dev2" # pragma: no cover
|
||||||
|
|||||||
@ -327,6 +327,7 @@ def partition(
|
|||||||
languages=languages,
|
languages=languages,
|
||||||
detect_language_per_element=detect_language_per_element,
|
detect_language_per_element=detect_language_per_element,
|
||||||
starting_page_number=starting_page_number,
|
starting_page_number=starting_page_number,
|
||||||
|
strategy=strategy,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
elif filetype == FileType.ODT:
|
elif filetype == FileType.ODT:
|
||||||
@ -499,6 +500,7 @@ def partition(
|
|||||||
languages=languages,
|
languages=languages,
|
||||||
detect_language_per_element=detect_language_per_element,
|
detect_language_per_element=detect_language_per_element,
|
||||||
starting_page_number=starting_page_number,
|
starting_page_number=starting_page_number,
|
||||||
|
strategy=strategy,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
elif filetype == FileType.JSON:
|
elif filetype == FileType.JSON:
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user