mirror of
				https://github.com/Unstructured-IO/unstructured.git
				synced 2025-11-04 03:53:45 +00:00 
			
		
		
		
	fix(auto): partition() passes strategy to DOC,ODT (#3278)
**Summary** Remedy a gap where the `strategy` argument passed to `partition()` was not forwarded to `partition_doc()` or `partition_odt()`, and so never made its way to `partition_docx()`.
This commit is contained in:
		
							parent
							
								
									0665e94b96
								
							
						
					
					
						commit
						f2fee0c32f
					
				@ -1,4 +1,4 @@
 | 
			
		||||
## 0.14.9-dev4
 | 
			
		||||
## 0.14.9-dev5
 | 
			
		||||
 | 
			
		||||
### Enhancements
 | 
			
		||||
 | 
			
		||||
@ -7,6 +7,7 @@
 | 
			
		||||
### Fixes
 | 
			
		||||
 | 
			
		||||
* **Fix a bug where multiple `soffice` processes could be attempted** Add a wait mechanism in `convert_office_doc` so that the function first checks whether another `soffice` process is already running; if so, it waits until that process finishes or the wait timeout elapses before spawning a subprocess to run `soffice`.
 | 
			
		||||
* **`partition()` now forwards `strategy` arg to `partition_docx()`, `partition_pptx()`, and their brokering partitioners for DOC, ODT, and PPT formats.** A `strategy` argument passed to `partition()` (or the default value "auto" assigned by `partition()`) is now forwarded to `partition_docx()`, `partition_pptx()`, and their brokering partitioners when those filetypes are detected.
 | 
			
		||||
 | 
			
		||||
## 0.14.8
 | 
			
		||||
 | 
			
		||||
@ -20,7 +21,6 @@
 | 
			
		||||
 | 
			
		||||
* **Bump unstructured-inference==0.7.36** Fix `ValueError` when converting cells to html.
 | 
			
		||||
* **`partition()` now forwards `strategy` arg to `partition_docx()`, `partition_ppt()`, and `partition_pptx()`.** A `strategy` argument passed to `partition()` (or the default value "auto" assigned by `partition()`) is now forwarded to `partition_docx()`, `partition_ppt()`, and `partition_pptx()` when those filetypes are detected.
 | 
			
		||||
 | 
			
		||||
* **Fix missing sensitive field markers** for embedders
 | 
			
		||||
 | 
			
		||||
## 0.14.7
 | 
			
		||||
 | 
			
		||||
@ -178,6 +178,7 @@ def test_auto_partition_doc_with_file(mock_docx_document, expected_docx_elements
 | 
			
		||||
    assert elements == expected_docx_elements
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@pytest.mark.parametrize("file_name", ["simple.docx", "simple.doc", "simple.odt"])
 | 
			
		||||
@pytest.mark.parametrize(
 | 
			
		||||
    "strategy",
 | 
			
		||||
    [
 | 
			
		||||
@ -187,7 +188,17 @@ def test_auto_partition_doc_with_file(mock_docx_document, expected_docx_elements
 | 
			
		||||
        PartitionStrategy.OCR_ONLY,
 | 
			
		||||
    ],
 | 
			
		||||
)
 | 
			
		||||
def test_partition_forwards_strategy_arg_to_partition_docx(request: FixtureRequest, strategy: str):
 | 
			
		||||
def test_partition_forwards_strategy_arg_to_partition_docx_and_its_brokers(
 | 
			
		||||
    request: FixtureRequest, file_name: str, strategy: str
 | 
			
		||||
):
 | 
			
		||||
    """The `strategy` arg value received by `partition()` is received by `partition_docx().
 | 
			
		||||
 | 
			
		||||
    To do this in the brokering-partitioner cases (DOC, ODT) it must make its way to
 | 
			
		||||
    `partition_doc()` or `partition_odt()` which must then forward it to `partition_docx()`. This
 | 
			
		||||
    test makes sure it made it all the way.
 | 
			
		||||
 | 
			
		||||
    Note this is 3 file-types X 4 strategies = 12 test-cases.
 | 
			
		||||
    """
 | 
			
		||||
    from unstructured.partition.docx import _DocxPartitioner
 | 
			
		||||
 | 
			
		||||
    def fake_iter_document_elements(self: _DocxPartitioner) -> Iterator[Element]:
 | 
			
		||||
@ -200,7 +211,7 @@ def test_partition_forwards_strategy_arg_to_partition_docx(request: FixtureReque
 | 
			
		||||
        side_effect=fake_iter_document_elements,
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
    (element,) = partition(example_doc_path("simple.docx"), strategy=strategy)
 | 
			
		||||
    (element,) = partition(example_doc_path(file_name), strategy=strategy)
 | 
			
		||||
 | 
			
		||||
    _iter_elements_.assert_called_once_with(ANY)
 | 
			
		||||
    assert element.text == f"strategy=={strategy}"
 | 
			
		||||
@ -589,6 +600,7 @@ def test_auto_partition_pptx_from_filename():
 | 
			
		||||
    assert elements[0].metadata.file_directory == os.path.split(filename)[0]
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@pytest.mark.parametrize("file_name", ["simple.pptx", "fake-power-point.ppt"])
 | 
			
		||||
@pytest.mark.parametrize(
 | 
			
		||||
    "strategy",
 | 
			
		||||
    [
 | 
			
		||||
@ -598,7 +610,17 @@ def test_auto_partition_pptx_from_filename():
 | 
			
		||||
        PartitionStrategy.OCR_ONLY,
 | 
			
		||||
    ],
 | 
			
		||||
)
 | 
			
		||||
def test_partition_forwards_strategy_arg_to_partition_pptx(request: FixtureRequest, strategy: str):
 | 
			
		||||
def test_partition_forwards_strategy_arg_to_partition_pptx_and_its_brokers(
 | 
			
		||||
    request: FixtureRequest, file_name: str, strategy: str
 | 
			
		||||
):
 | 
			
		||||
    """The `strategy` arg value received by `partition()` is received by `partition_pptx().
 | 
			
		||||
 | 
			
		||||
    To do this in the brokering-partitioner case (PPT) the strategy argument must make its way to
 | 
			
		||||
    `partition_ppt()` which must then forward it to `partition_pptx()`. This test makes sure it
 | 
			
		||||
    made it all the way.
 | 
			
		||||
 | 
			
		||||
    Note this is 2 file-types X 4 strategies = 8 test-cases.
 | 
			
		||||
    """
 | 
			
		||||
    from unstructured.partition.pptx import _PptxPartitioner
 | 
			
		||||
 | 
			
		||||
    def fake_iter_presentation_elements(self: _PptxPartitioner) -> Iterator[Element]:
 | 
			
		||||
@ -611,35 +633,7 @@ def test_partition_forwards_strategy_arg_to_partition_pptx(request: FixtureReque
 | 
			
		||||
        side_effect=fake_iter_presentation_elements,
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
    (element,) = partition(example_doc_path("simple.pptx"), strategy=strategy)
 | 
			
		||||
 | 
			
		||||
    _iter_elements_.assert_called_once_with(ANY)
 | 
			
		||||
    assert element.text == f"strategy=={strategy}"
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@pytest.mark.parametrize(
 | 
			
		||||
    "strategy",
 | 
			
		||||
    [
 | 
			
		||||
        PartitionStrategy.AUTO,
 | 
			
		||||
        PartitionStrategy.FAST,
 | 
			
		||||
        PartitionStrategy.HI_RES,
 | 
			
		||||
        PartitionStrategy.OCR_ONLY,
 | 
			
		||||
    ],
 | 
			
		||||
)
 | 
			
		||||
def test_partition_forwards_strategy_arg_to_partition_ppt(request: FixtureRequest, strategy: str):
 | 
			
		||||
    from unstructured.partition.pptx import _PptxPartitioner
 | 
			
		||||
 | 
			
		||||
    def fake_iter_presentation_elements(self: _PptxPartitioner) -> Iterator[Element]:
 | 
			
		||||
        yield Text(f"strategy=={self._opts.strategy}")
 | 
			
		||||
 | 
			
		||||
    _iter_elements_ = method_mock(
 | 
			
		||||
        request,
 | 
			
		||||
        _PptxPartitioner,
 | 
			
		||||
        "_iter_presentation_elements",
 | 
			
		||||
        side_effect=fake_iter_presentation_elements,
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
    (element,) = partition(example_doc_path("fake-power-point.ppt"), strategy=strategy)
 | 
			
		||||
    (element,) = partition(example_doc_path(file_name), strategy=strategy)
 | 
			
		||||
 | 
			
		||||
    _iter_elements_.assert_called_once_with(ANY)
 | 
			
		||||
    assert element.text == f"strategy=={strategy}"
 | 
			
		||||
 | 
			
		||||
@ -1 +1 @@
 | 
			
		||||
__version__ = "0.14.9-dev4"  # pragma: no cover
 | 
			
		||||
__version__ = "0.14.9-dev5"  # pragma: no cover
 | 
			
		||||
 | 
			
		||||
@ -316,6 +316,7 @@ def partition(
 | 
			
		||||
            languages=languages,
 | 
			
		||||
            detect_language_per_element=detect_language_per_element,
 | 
			
		||||
            starting_page_number=starting_page_number,
 | 
			
		||||
            strategy=strategy,
 | 
			
		||||
            **kwargs,
 | 
			
		||||
        )
 | 
			
		||||
    elif filetype == FileType.DOCX:
 | 
			
		||||
@ -339,6 +340,7 @@ def partition(
 | 
			
		||||
            languages=languages,
 | 
			
		||||
            detect_language_per_element=detect_language_per_element,
 | 
			
		||||
            starting_page_number=starting_page_number,
 | 
			
		||||
            strategy=strategy,
 | 
			
		||||
            **kwargs,
 | 
			
		||||
        )
 | 
			
		||||
    elif filetype == FileType.EML:
 | 
			
		||||
 | 
			
		||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user