mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-10-29 09:00:30 +00:00
feat(docx): add strategy parameter to DOC and ODT (#3042)
**Summary** Because DOCX now supports the `strategy` argument to control aspects of image extraction, `partition_doc()` and `partition_odt()` will need to support it too because they delegate partitioning to `partition_docx()`. This will allow image extraction to work the same way for those two additional document-types.
This commit is contained in:
parent
8644a3b09a
commit
f320889b4f
@ -1,4 +1,4 @@
|
||||
## 0.13.8-dev15
|
||||
## 0.13.8-dev16
|
||||
|
||||
### Enhancements
|
||||
|
||||
|
||||
@ -5,14 +5,17 @@ from __future__ import annotations
|
||||
import os
|
||||
import pathlib
|
||||
import tempfile
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
from pytest_mock import MockFixture
|
||||
|
||||
from test_unstructured.unit_utils import (
|
||||
CaptureFixture,
|
||||
FixtureRequest,
|
||||
assert_round_trips_through_JSON,
|
||||
example_doc_path,
|
||||
function_mock,
|
||||
)
|
||||
from unstructured.chunking.basic import chunk_elements
|
||||
from unstructured.documents.elements import (
|
||||
@ -243,6 +246,24 @@ def test_partition_doc_respects_detect_language_per_element_arg():
|
||||
# -- miscellaneous -------------------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    ("kwargs", "expected_value"),
    [
        ({}, None),
        ({"strategy": None}, None),
        ({"strategy": "hi_res"}, "hi_res"),
    ],
)
def test_partition_doc_forwards_strategy_arg_to_partition_docx(
    request: FixtureRequest, kwargs: dict[str, Any], expected_value: str | None
):
    """`partition_doc()` passes `strategy` on to `partition_docx()`.

    The three cases cover: `strategy` omitted, `strategy` given explicitly as `None`, and
    `strategy` given as a concrete value. In every case the mocked `partition_docx()` must
    receive a `strategy` keyword argument carrying the expected value.
    """
    # -- replace the `partition_docx` name as seen from the `doc` module with a mock --
    docx_partitioner_mock = function_mock(request, "unstructured.partition.doc.partition_docx")

    partition_doc(example_doc_path("simple.doc"), **kwargs)

    forwarded_kwargs = docx_partitioner_mock.call_args.kwargs
    # -- the keyword must be present, not merely absent-and-defaulted downstream --
    assert "strategy" in forwarded_kwargs
    # -- and it must carry the value the caller supplied (or `None` when omitted) --
    assert forwarded_kwargs["strategy"] == expected_value
|
||||
|
||||
|
||||
def test_partition_doc_grabs_emphasized_texts():
|
||||
expected_emphasized_text_contents = ["bold", "italic", "bold-italic", "bold-italic"]
|
||||
expected_emphasized_text_tags = ["b", "i", "b", "i"]
|
||||
|
||||
@ -8,7 +8,12 @@ from typing import Any
|
||||
import pytest
|
||||
from pytest_mock import MockFixture
|
||||
|
||||
from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path
|
||||
from test_unstructured.unit_utils import (
|
||||
FixtureRequest,
|
||||
assert_round_trips_through_JSON,
|
||||
example_doc_path,
|
||||
function_mock,
|
||||
)
|
||||
from unstructured.chunking.basic import chunk_elements
|
||||
from unstructured.documents.elements import CompositeElement, Table, TableChunk, Title
|
||||
from unstructured.partition.docx import partition_docx
|
||||
@ -220,6 +225,24 @@ def test_partition_odt_respects_detect_language_per_element_arg():
|
||||
# -- miscellaneous -------------------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    ("kwargs", "expected_value"),
    [
        ({}, None),
        ({"strategy": None}, None),
        ({"strategy": "hi_res"}, "hi_res"),
    ],
)
def test_partition_odt_forwards_strategy_arg_to_partition_docx(
    request: FixtureRequest, kwargs: dict[str, Any], expected_value: str | None
):
    """`partition_odt()` passes `strategy` on to `partition_docx()`.

    The three cases cover: `strategy` omitted, `strategy` given explicitly as `None`, and
    `strategy` given as a concrete value. In every case the mocked `partition_docx()` must
    receive a `strategy` keyword argument carrying the expected value.
    """
    # -- replace the `partition_docx` name as seen from the `odt` module with a mock --
    docx_partitioner_mock = function_mock(request, "unstructured.partition.odt.partition_docx")

    partition_odt(example_doc_path("simple.odt"), **kwargs)

    forwarded_kwargs = docx_partitioner_mock.call_args.kwargs
    # -- the keyword must be present, not merely absent-and-defaulted downstream --
    assert "strategy" in forwarded_kwargs
    # -- and it must carry the value the caller supplied (or `None` when omitted) --
    assert forwarded_kwargs["strategy"] == expected_value
|
||||
|
||||
|
||||
def test_partition_odt_round_trips_through_json():
|
||||
"""Elements produced can be serialized then deserialized without loss."""
|
||||
assert_round_trips_through_JSON(partition_odt(example_doc_path("simple.odt")))
|
||||
|
||||
@ -134,7 +134,7 @@ def cls_attr_mock(
|
||||
|
||||
def function_mock(
|
||||
request: FixtureRequest, q_function_name: str, autospec: bool = True, **kwargs: Any
|
||||
):
|
||||
) -> Mock:
|
||||
"""Return mock patching function with qualified name `q_function_name`.
|
||||
|
||||
Patch is reversed after calling test returns.
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.13.8-dev15" # pragma: no cover
|
||||
__version__ = "0.13.8-dev16" # pragma: no cover
|
||||
|
||||
@ -32,6 +32,7 @@ def partition_doc(
|
||||
detect_language_per_element: bool = False,
|
||||
date_from_file_object: bool = False,
|
||||
starting_page_number: int = 1,
|
||||
strategy: Optional[str] = None,
|
||||
**kwargs: Any,
|
||||
) -> list[Element]:
|
||||
"""Partitions Microsoft Word Documents in .doc format into its document elements.
|
||||
@ -112,6 +113,7 @@ def partition_doc(
|
||||
metadata_filename=metadata_filename,
|
||||
metadata_last_modified=metadata_last_modified or last_modified,
|
||||
starting_page_number=starting_page_number,
|
||||
strategy=strategy,
|
||||
)
|
||||
|
||||
# -- Remove temporary document.docx path from metadata when necessary. Note `metadata_filename`
|
||||
|
||||
@ -30,6 +30,7 @@ def partition_odt(
|
||||
metadata_filename: Optional[str] = None,
|
||||
metadata_last_modified: Optional[str] = None,
|
||||
starting_page_number: int = 1,
|
||||
strategy: Optional[str] = None,
|
||||
**kwargs: Any,
|
||||
) -> list[Element]:
|
||||
"""Partitions Open Office Documents in .odt format into its document elements.
|
||||
@ -76,6 +77,7 @@ def partition_odt(
|
||||
metadata_filename=metadata_filename,
|
||||
metadata_last_modified=metadata_last_modified or last_modification_date,
|
||||
starting_page_number=starting_page_number,
|
||||
strategy=strategy,
|
||||
)
|
||||
|
||||
return elements
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user