diff --git a/CHANGELOG.md b/CHANGELOG.md index 98c93d0c1..0fe92a028 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.14.8-dev1 +## 0.14.8-dev2 ### Enhancements @@ -6,6 +6,8 @@ ### Fixes +* **`partition()` now forwards `strategy` arg to `partition_docx()` and `partition_pptx()`.** A `strategy` argument passed to `partition()` (or the default value "auto" assigned by `partition()`) is now forwarded to `partition_docx()` and `partition_pptx()` when those filetypes are detected. + ## 0.14.7 ### Enhancements diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py index ce3d442ef..16ee3e9e2 100644 --- a/test_unstructured/partition/test_auto.py +++ b/test_unstructured/partition/test_auto.py @@ -1,3 +1,5 @@ +# pyright: reportPrivateUsage=false + from __future__ import annotations import json @@ -6,6 +8,7 @@ import pathlib import tempfile import warnings from importlib import import_module +from typing import Iterator from unittest.mock import Mock, patch import docx @@ -20,10 +23,12 @@ from test_unstructured.partition.test_constants import ( EXPECTED_TEXT_XLSX, EXPECTED_TITLE, ) +from test_unstructured.unit_utils import ANY, FixtureRequest, example_doc_path, method_mock from unstructured.chunking.title import chunk_by_title from unstructured.cleaners.core import clean_extra_whitespace from unstructured.documents.elements import ( Address, + Element, ElementMetadata, ListItem, NarrativeText, @@ -173,6 +178,34 @@ def test_auto_partition_doc_with_file(mock_docx_document, expected_docx_elements assert elements == expected_docx_elements +@pytest.mark.parametrize( + "strategy", + [ + PartitionStrategy.AUTO, + PartitionStrategy.FAST, + PartitionStrategy.HI_RES, + PartitionStrategy.OCR_ONLY, + ], +) +def test_partition_forwards_strategy_arg_to_partition_docx(request: FixtureRequest, strategy: str): + from unstructured.partition.docx import _DocxPartitioner + + def fake_iter_document_elements(self: _DocxPartitioner) -> Iterator[Element]: + yield Text(f"strategy=={self._opts.strategy}") + + _iter_elements_ = method_mock( + request, + _DocxPartitioner, + "_iter_document_elements", + side_effect=fake_iter_document_elements, + ) + + (element,) = partition(example_doc_path("simple.docx"), strategy=strategy) + + _iter_elements_.assert_called_once_with(ANY) + assert element.text == f"strategy=={strategy}" + + @pytest.mark.parametrize( ("pass_metadata_filename", "content_type"), [(False, None), (False, "text/html"), (True, "text/html"), (True, None)], @@ -556,6 +589,34 @@ def test_auto_partition_pptx_from_filename(): assert elements[0].metadata.file_directory == os.path.split(filename)[0] +@pytest.mark.parametrize( + "strategy", + [ + PartitionStrategy.AUTO, + PartitionStrategy.FAST, + PartitionStrategy.HI_RES, + PartitionStrategy.OCR_ONLY, + ], +) +def test_partition_forwards_strategy_arg_to_partition_pptx(request: FixtureRequest, strategy: str): + from unstructured.partition.pptx import _PptxPartitioner + + def fake_iter_presentation_elements(self: _PptxPartitioner) -> Iterator[Element]: + yield Text(f"strategy=={self._opts.strategy}") + + _iter_elements_ = method_mock( + request, + _PptxPartitioner, + "_iter_presentation_elements", + side_effect=fake_iter_presentation_elements, + ) + + (element,) = partition(example_doc_path("simple.pptx"), strategy=strategy) + + _iter_elements_.assert_called_once_with(ANY) + assert element.text == f"strategy=={strategy}" + + @pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container") def test_auto_partition_ppt_from_filename(): filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.ppt") diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 55906adbc..074150137 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.14.8-dev1" # pragma: no cover +__version__ = "0.14.8-dev2" # pragma: no cover diff --git a/unstructured/partition/auto.py b/unstructured/partition/auto.py index 6a57340f7..64655c80a 100644 --- a/unstructured/partition/auto.py +++ b/unstructured/partition/auto.py @@ -327,6 +327,7 @@ def partition( languages=languages, detect_language_per_element=detect_language_per_element, starting_page_number=starting_page_number, + strategy=strategy, **kwargs, ) elif filetype == FileType.ODT: @@ -499,6 +500,7 @@ def partition( languages=languages, detect_language_per_element=detect_language_per_element, starting_page_number=starting_page_number, + strategy=strategy, **kwargs, ) elif filetype == FileType.JSON: