feat(docx): add strategy parameter to DOC and ODT (#3042)

**Summary**
Because DOCX now supports the `strategy` argument to control aspects of
image extraction, `partition_doc()` and `partition_odt()` will need to
support it too because they delegate partitioning to `partition_docx()`.
This will allow image extraction to work the same way for those two
additional document-types.
This commit is contained in:
Steve Canny 2024-05-16 15:14:02 -07:00 committed by GitHub
parent 8644a3b09a
commit f320889b4f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 52 additions and 4 deletions

View File

@ -1,4 +1,4 @@
## 0.13.8-dev15
## 0.13.8-dev16
### Enhancements

View File

@ -5,14 +5,17 @@ from __future__ import annotations
import os
import pathlib
import tempfile
from typing import Any
import pytest
from pytest_mock import MockFixture
from test_unstructured.unit_utils import (
CaptureFixture,
FixtureRequest,
assert_round_trips_through_JSON,
example_doc_path,
function_mock,
)
from unstructured.chunking.basic import chunk_elements
from unstructured.documents.elements import (
@ -243,6 +246,24 @@ def test_partition_doc_respects_detect_language_per_element_arg():
# -- miscellaneous -------------------------------------------------------------------------------
@pytest.mark.parametrize(
("kwargs", "expected_value"),
[({}, None), ({"strategy": None}, None), ({"strategy": "hi_res"}, "hi_res")],
)
def test_partition_doc_forwards_strategy_arg_to_partition_docx(
request: FixtureRequest, kwargs: dict[str, Any], expected_value: str | None
):
partition_docx_ = function_mock(request, "unstructured.partition.doc.partition_docx")
partition_doc(example_doc_path("simple.doc"), **kwargs)
call_kwargs = partition_docx_.call_args.kwargs
# -- `strategy` keyword-argument appeared in the call --
assert "strategy" in call_kwargs
# -- `strategy` argument was passed with the expected value --
assert call_kwargs["strategy"] == expected_value
def test_partition_doc_grabs_emphasized_texts():
expected_emphasized_text_contents = ["bold", "italic", "bold-italic", "bold-italic"]
expected_emphasized_text_tags = ["b", "i", "b", "i"]

View File

@ -8,7 +8,12 @@ from typing import Any
import pytest
from pytest_mock import MockFixture
from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path
from test_unstructured.unit_utils import (
FixtureRequest,
assert_round_trips_through_JSON,
example_doc_path,
function_mock,
)
from unstructured.chunking.basic import chunk_elements
from unstructured.documents.elements import CompositeElement, Table, TableChunk, Title
from unstructured.partition.docx import partition_docx
@ -220,6 +225,24 @@ def test_partition_odt_respects_detect_language_per_element_arg():
# -- miscellaneous -------------------------------------------------------------------------------
@pytest.mark.parametrize(
("kwargs", "expected_value"),
[({}, None), ({"strategy": None}, None), ({"strategy": "hi_res"}, "hi_res")],
)
def test_partition_odt_forwards_strategy_arg_to_partition_docx(
request: FixtureRequest, kwargs: dict[str, Any], expected_value: str | None
):
partition_docx_ = function_mock(request, "unstructured.partition.odt.partition_docx")
partition_odt(example_doc_path("simple.odt"), **kwargs)
call_kwargs = partition_docx_.call_args.kwargs
# -- `strategy` keyword-argument appeared in the call --
assert "strategy" in call_kwargs
# -- `strategy` argument was passed with the expected value --
assert call_kwargs["strategy"] == expected_value
def test_partition_odt_round_trips_through_json():
"""Elements produced can be serialized then deserialized without loss."""
assert_round_trips_through_JSON(partition_odt(example_doc_path("simple.odt")))

View File

@ -134,7 +134,7 @@ def cls_attr_mock(
def function_mock(
request: FixtureRequest, q_function_name: str, autospec: bool = True, **kwargs: Any
):
) -> Mock:
"""Return mock patching function with qualified name `q_function_name`.
Patch is reversed after calling test returns.

View File

@ -1 +1 @@
__version__ = "0.13.8-dev15" # pragma: no cover
__version__ = "0.13.8-dev16" # pragma: no cover

View File

@ -32,6 +32,7 @@ def partition_doc(
detect_language_per_element: bool = False,
date_from_file_object: bool = False,
starting_page_number: int = 1,
strategy: Optional[str] = None,
**kwargs: Any,
) -> list[Element]:
"""Partitions Microsoft Word Documents in .doc format into its document elements.
@ -112,6 +113,7 @@ def partition_doc(
metadata_filename=metadata_filename,
metadata_last_modified=metadata_last_modified or last_modified,
starting_page_number=starting_page_number,
strategy=strategy,
)
# -- Remove temporary document.docx path from metadata when necessary. Note `metadata_filename`

View File

@ -30,6 +30,7 @@ def partition_odt(
metadata_filename: Optional[str] = None,
metadata_last_modified: Optional[str] = None,
starting_page_number: int = 1,
strategy: Optional[str] = None,
**kwargs: Any,
) -> list[Element]:
"""Partitions Open Office Documents in .odt format into its document elements.
@ -76,6 +77,7 @@ def partition_odt(
metadata_filename=metadata_filename,
metadata_last_modified=metadata_last_modified or last_modification_date,
starting_page_number=starting_page_number,
strategy=strategy,
)
return elements