diff --git a/CHANGELOG.md b/CHANGELOG.md index bcff92178..4d9f1b615 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.13.8-dev15 +## 0.13.8-dev16 ### Enhancements diff --git a/test_unstructured/partition/docx/test_doc.py b/test_unstructured/partition/docx/test_doc.py index da954c493..f7dc9122b 100644 --- a/test_unstructured/partition/docx/test_doc.py +++ b/test_unstructured/partition/docx/test_doc.py @@ -5,14 +5,17 @@ from __future__ import annotations import os import pathlib import tempfile +from typing import Any import pytest from pytest_mock import MockFixture from test_unstructured.unit_utils import ( CaptureFixture, + FixtureRequest, assert_round_trips_through_JSON, example_doc_path, + function_mock, ) from unstructured.chunking.basic import chunk_elements from unstructured.documents.elements import ( @@ -243,6 +246,24 @@ def test_partition_doc_respects_detect_language_per_element_arg(): # -- miscellaneous ------------------------------------------------------------------------------- +@pytest.mark.parametrize( + ("kwargs", "expected_value"), + [({}, None), ({"strategy": None}, None), ({"strategy": "hi_res"}, "hi_res")], +) +def test_partition_doc_forwards_strategy_arg_to_partition_docx( + request: FixtureRequest, kwargs: dict[str, Any], expected_value: str | None +): + partition_docx_ = function_mock(request, "unstructured.partition.doc.partition_docx") + + partition_doc(example_doc_path("simple.doc"), **kwargs) + + call_kwargs = partition_docx_.call_args.kwargs + # -- `strategy` keyword-argument appeared in the call -- + assert "strategy" in call_kwargs + # -- `strategy` argument was passed with the expected value -- + assert call_kwargs["strategy"] == expected_value + + def test_partition_doc_grabs_emphasized_texts(): expected_emphasized_text_contents = ["bold", "italic", "bold-italic", "bold-italic"] expected_emphasized_text_tags = ["b", "i", "b", "i"] diff --git a/test_unstructured/partition/odt/test_odt.py b/test_unstructured/partition/odt/test_odt.py index 5a639aae5..72e2311f7 100644 --- a/test_unstructured/partition/odt/test_odt.py +++ b/test_unstructured/partition/odt/test_odt.py @@ -8,7 +8,12 @@ from typing import Any import pytest from pytest_mock import MockFixture -from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path +from test_unstructured.unit_utils import ( + FixtureRequest, + assert_round_trips_through_JSON, + example_doc_path, + function_mock, +) from unstructured.chunking.basic import chunk_elements from unstructured.documents.elements import CompositeElement, Table, TableChunk, Title from unstructured.partition.docx import partition_docx @@ -220,6 +225,24 @@ def test_partition_odt_respects_detect_language_per_element_arg(): # -- miscellaneous ------------------------------------------------------------------------------- +@pytest.mark.parametrize( + ("kwargs", "expected_value"), + [({}, None), ({"strategy": None}, None), ({"strategy": "hi_res"}, "hi_res")], +) +def test_partition_odt_forwards_strategy_arg_to_partition_docx( + request: FixtureRequest, kwargs: dict[str, Any], expected_value: str | None +): + partition_docx_ = function_mock(request, "unstructured.partition.odt.partition_docx") + + partition_odt(example_doc_path("simple.odt"), **kwargs) + + call_kwargs = partition_docx_.call_args.kwargs + # -- `strategy` keyword-argument appeared in the call -- + assert "strategy" in call_kwargs + # -- `strategy` argument was passed with the expected value -- + assert call_kwargs["strategy"] == expected_value + + def test_partition_odt_round_trips_through_json(): """Elements produced can be serialized then deserialized without loss.""" assert_round_trips_through_JSON(partition_odt(example_doc_path("simple.odt"))) diff --git a/test_unstructured/unit_utils.py b/test_unstructured/unit_utils.py index a4902341c..237443c52 100644 --- a/test_unstructured/unit_utils.py +++ b/test_unstructured/unit_utils.py @@ -134,7 +134,7 @@ def cls_attr_mock( def function_mock( request: FixtureRequest, q_function_name: str, autospec: bool = True, **kwargs: Any -): +) -> Mock: """Return mock patching function with qualified name `q_function_name`. Patch is reversed after calling test returns. diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 10a704ee0..4b6615ad3 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.13.8-dev15" # pragma: no cover +__version__ = "0.13.8-dev16" # pragma: no cover diff --git a/unstructured/partition/doc.py b/unstructured/partition/doc.py index 223d7c104..8b5f4f441 100644 --- a/unstructured/partition/doc.py +++ b/unstructured/partition/doc.py @@ -32,6 +32,7 @@ def partition_doc( detect_language_per_element: bool = False, date_from_file_object: bool = False, starting_page_number: int = 1, + strategy: Optional[str] = None, **kwargs: Any, ) -> list[Element]: """Partitions Microsoft Word Documents in .doc format into its document elements. @@ -112,6 +113,7 @@ def partition_doc( metadata_filename=metadata_filename, metadata_last_modified=metadata_last_modified or last_modified, starting_page_number=starting_page_number, + strategy=strategy, ) # -- Remove temporary document.docx path from metadata when necessary. Note `metadata_filename` diff --git a/unstructured/partition/odt.py b/unstructured/partition/odt.py index 922e5032b..676033472 100644 --- a/unstructured/partition/odt.py +++ b/unstructured/partition/odt.py @@ -30,6 +30,7 @@ def partition_odt( metadata_filename: Optional[str] = None, metadata_last_modified: Optional[str] = None, starting_page_number: int = 1, + strategy: Optional[str] = None, **kwargs: Any, ) -> list[Element]: """Partitions Open Office Documents in .odt format into its document elements. @@ -76,6 +77,7 @@ def partition_odt( metadata_filename=metadata_filename, metadata_last_modified=metadata_last_modified or last_modification_date, starting_page_number=starting_page_number, + strategy=strategy, ) return elements