mirror of
				https://github.com/Unstructured-IO/unstructured.git
				synced 2025-10-31 18:14:51 +00:00 
			
		
		
		
	 bba60260b2
			
		
	
	
		bba60260b2
		
			
		
	
	
	
	
		
			
			**Summary** Install new `@apply_metadata()` on CSV and DOCX and remove decoration from DOC and ODT. **Additional Context** - Working in alphabetical order and keeping PR size manageable, replace use of `@process_metadata()` and `@add_metadata_with_filetype()` decorators with `@apply_metadata()` on principal partitioners (those that do not delegate to other partitioners). - Remove all decorators from delegating partitioners (DOC and ODT in this case); this removes the "double-decorating".
		
			
				
	
	
		
			284 lines
		
	
	
		
			9.7 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			284 lines
		
	
	
		
			9.7 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| # pyright: reportPrivateUsage=false
 | |
| 
 | |
| """Test suite for `unstructured.partition.doc` module."""
 | |
| 
 | |
| from __future__ import annotations
 | |
| 
 | |
| import pathlib
 | |
| from typing import Any, Iterator
 | |
| 
 | |
| import pytest
 | |
| from pytest_mock import MockFixture
 | |
| 
 | |
| from test_unstructured.unit_utils import (
 | |
|     ANY,
 | |
|     CaptureFixture,
 | |
|     FixtureRequest,
 | |
|     assert_round_trips_through_JSON,
 | |
|     example_doc_path,
 | |
|     method_mock,
 | |
| )
 | |
| from unstructured.chunking.basic import chunk_elements
 | |
| from unstructured.documents.elements import (
 | |
|     Address,
 | |
|     CompositeElement,
 | |
|     Element,
 | |
|     ListItem,
 | |
|     NarrativeText,
 | |
|     Table,
 | |
|     TableChunk,
 | |
|     Text,
 | |
|     Title,
 | |
| )
 | |
| from unstructured.partition.doc import partition_doc
 | |
| from unstructured.partition.docx import partition_docx
 | |
| 
 | |
| 
 | |
def test_partition_doc_matches_partition_docx():
    """Partitioning `simple.doc` yields the same elements as partitioning `simple.docx`."""
    # -- the `request` fixture was requested but never used, so it is no longer a parameter --
    doc_file_path = example_doc_path("simple.doc")
    docx_file_path = example_doc_path("simple.docx")

    assert partition_doc(doc_file_path) == partition_docx(docx_file_path)
 | |
| 
 | |
| 
 | |
| # -- document-source (file or filename) ----------------------------------------------------------
 | |
| 
 | |
| 
 | |
def test_partition_doc_from_filename(expected_elements: list[Element], capsys: CaptureFixture[str]):
    """Partitioning a file path yields the expected elements and no console output."""
    elements = partition_doc(example_doc_path("simple.doc"))

    assert elements == expected_elements
    assert all(e.metadata.file_directory == example_doc_path("") for e in elements)
    # -- `readouterr()` resets the capture buffers on each call, so take one snapshot and check
    # -- both streams from it; a second call would always report an empty stderr --
    captured = capsys.readouterr()
    assert captured.out == ""
    assert captured.err == ""
 | |
| 
 | |
| 
 | |
def test_partition_doc_from_file_with_libre_office_filter(
    expected_elements: list[Element], capsys: CaptureFixture[str]
):
    """Partitioning an open file with an explicit `libre_office_filter` gives expected elements."""
    with open(example_doc_path("simple.doc"), "rb") as f:
        elements = partition_doc(file=f, libre_office_filter="MS Word 2007 XML")

    assert elements == expected_elements
    # -- `readouterr()` resets the capture buffers on each call, so take one snapshot and check
    # -- both streams from it; a second call would always report an empty stderr --
    captured = capsys.readouterr()
    assert captured.out == ""
    assert captured.err == ""
 | |
| 
 | |
| 
 | |
def test_partition_doc_from_file_with_no_libre_office_filter(
    expected_elements: list[Element], capsys: CaptureFixture[str]
):
    """Partitioning an open file with `libre_office_filter=None` gives the expected elements."""
    with open(example_doc_path("simple.doc"), "rb") as f:
        elements = partition_doc(file=f, libre_office_filter=None)

    assert elements == expected_elements
    # -- `readouterr()` resets the capture buffers on each call, so take one snapshot and check
    # -- both streams from it; a second call would always report an empty stderr --
    captured = capsys.readouterr()
    assert captured.out == ""
    assert captured.err == ""
    assert all(e.metadata.filename is None for e in elements)
 | |
| 
 | |
| 
 | |
def test_partition_doc_raises_when_both_a_filename_and_file_are_specified():
    """Passing both `filename` and `file` is rejected with a `ValueError`."""
    path = example_doc_path("simple.doc")
    expected_message = "Exactly one of filename and file must be specified"

    with open(path, "rb") as doc_file, pytest.raises(ValueError, match=expected_message):
        partition_doc(filename=path, file=doc_file)
 | |
| 
 | |
| 
 | |
def test_partition_doc_raises_when_neither_a_file_path_nor_a_file_like_object_are_provided():
    """Calling `partition_doc()` with no document source is rejected with a `ValueError`."""
    expected_message = "Exactly one of filename and file must be specified"

    with pytest.raises(ValueError, match=expected_message):
        partition_doc()
 | |
| 
 | |
| 
 | |
def test_partition_raises_with_missing_doc(tmp_path: pathlib.Path):
    """A path to a file that does not exist is rejected with a `ValueError`."""
    missing_path = tmp_path / "asdf.doc"

    with pytest.raises(ValueError, match="asdf.doc does not exist"):
        partition_doc(filename=str(missing_path))
 | |
| 
 | |
| 
 | |
| # -- .metadata.filename --------------------------------------------------------------------------
 | |
| 
 | |
| 
 | |
def test_partition_doc_from_filename_gets_filename_from_filename_arg():
    """`.metadata.filename` is the base name of the path given to `partition_doc()`."""
    doc_path = example_doc_path("simple.doc")

    elements = partition_doc(doc_path)

    assert len(elements) > 0
    filenames = [e.metadata.filename for e in elements]
    assert all(name == "simple.doc" for name in filenames)
 | |
| 
 | |
| 
 | |
def test_partition_doc_from_file_gets_filename_None():
    """`.metadata.filename` is None when the document is supplied as a file-like object."""
    with open(example_doc_path("simple.doc"), "rb") as doc_file:
        elements = partition_doc(file=doc_file)

    assert len(elements) > 0
    filenames = [e.metadata.filename for e in elements]
    assert all(name is None for name in filenames)
 | |
| 
 | |
| 
 | |
def test_partition_doc_from_filename_prefers_metadata_filename():
    """A `metadata_filename` argument overrides the name taken from the file path."""
    doc_path = example_doc_path("simple.doc")

    elements = partition_doc(doc_path, metadata_filename="test")

    assert len(elements) > 0
    filenames = [e.metadata.filename for e in elements]
    assert all(name == "test" for name in filenames)
 | |
| 
 | |
| 
 | |
def test_partition_doc_from_file_prefers_metadata_filename():
    """A `metadata_filename` argument sets `.metadata.filename` when partitioning a file object."""
    with open(example_doc_path("simple.doc"), "rb") as f:
        elements = partition_doc(file=f, metadata_filename="test")

    # -- guard against a vacuous pass: `all()` is True for an empty list --
    assert len(elements) > 0
    assert all(e.metadata.filename == "test" for e in elements)
 | |
| 
 | |
| 
 | |
| # -- .metadata.filetype --------------------------------------------------------------------------
 | |
| 
 | |
| 
 | |
def test_partition_doc_gets_the_DOC_MIME_type_in_metadata_filetype():
    """Every element produced by `partition_doc()` reports the DOC MIME-type as its filetype."""
    DOC_MIME_TYPE = "application/msword"

    elements = partition_doc(example_doc_path("simple.doc"))

    # -- guard against a vacuous pass when no elements are produced --
    assert len(elements) > 0
    # -- compare the distinct filetypes so a failure reports every offending value, not just
    # -- that of the first element (which may itself be correct) --
    filetypes = {e.metadata.filetype for e in elements}
    assert filetypes == {DOC_MIME_TYPE}, (
        f"Expected all elements to have '{DOC_MIME_TYPE}' as their filetype, but got:"
        f" {sorted(filetypes, key=repr)}"
    )
 | |
| 
 | |
| 
 | |
| # -- .metadata.last_modified ---------------------------------------------------------------------
 | |
| 
 | |
| 
 | |
def test_partition_doc_pulls_last_modified_from_filesystem(mocker: MockFixture):
    """`.metadata.last_modified` is the value returned by the patched `get_last_modified_date`."""
    filesystem_last_modified = "2029-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.doc.get_last_modified_date", return_value=filesystem_last_modified
    )

    elements = partition_doc(example_doc_path("fake.doc"))

    # -- guard against a vacuous pass: `all()` is True for an empty list --
    assert len(elements) > 0
    assert all(e.metadata.last_modified == filesystem_last_modified for e in elements)
 | |
| 
 | |
| 
 | |
def test_partition_doc_prefers_metadata_last_modified_when_provided(
    mocker: MockFixture,
):
    """A `metadata_last_modified` argument wins over the patched filesystem date."""
    filesystem_last_modified = "2029-07-05T09:24:28"
    metadata_last_modified = "2020-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.doc.get_last_modified_date", return_value=filesystem_last_modified
    )

    elements = partition_doc(
        example_doc_path("simple.doc"), metadata_last_modified=metadata_last_modified
    )

    # -- guard against a vacuous pass: `all()` is True for an empty list --
    assert len(elements) > 0
    assert all(e.metadata.last_modified == metadata_last_modified for e in elements)
 | |
| 
 | |
| 
 | |
| # -- language-recognition metadata ---------------------------------------------------------------
 | |
| 
 | |
| 
 | |
def test_partition_doc_adds_languages_metadata():
    """Every element of `simple.doc` gets `["eng"]` as `.metadata.languages`."""
    elements = partition_doc(example_doc_path("simple.doc"))

    # -- guard against a vacuous pass: `all()` is True for an empty list --
    assert len(elements) > 0
    assert all(e.metadata.languages == ["eng"] for e in elements)
 | |
| 
 | |
| 
 | |
def test_partition_doc_respects_detect_language_per_element_arg():
    """With `detect_language_per_element=True`, each element carries its own languages list."""
    doc_path = example_doc_path("language-docs/eng_spa_mult.doc")
    expected_languages = [
        ["eng"],
        ["spa", "eng"],
        ["eng"],
        ["eng"],
        ["spa"],
    ]

    elements = partition_doc(doc_path, detect_language_per_element=True)

    assert [e.metadata.languages for e in elements] == expected_languages
 | |
| 
 | |
| 
 | |
| # -- miscellaneous -------------------------------------------------------------------------------
 | |
| 
 | |
| 
 | |
@pytest.mark.parametrize(
    ("kwargs", "expected_value"),
    [({}, "hi_res"), ({"strategy": None}, "hi_res"), ({"strategy": "auto"}, "auto")],
)
def test_partition_odt_forwards_strategy_arg_to_partition_docx(
    request: FixtureRequest, kwargs: dict[str, Any], expected_value: str | None
):
    """The strategy seen by the DOCX partitioner matches what `partition_doc()` was given.

    An omitted or `None` strategy is expected to show up as "hi_res".
    """
    # -- NOTE(review): the name says "odt" but this test exercises `partition_doc()`; looks like
    # -- a copy-paste leftover. Name left unchanged here so the test id stays stable.
    from unstructured.partition.docx import _DocxPartitioner

    def report_strategy(self: _DocxPartitioner) -> Iterator[Element]:
        # -- stand-in that emits one element recording the strategy held by the partitioner --
        yield Text(f"strategy == {self._opts.strategy}")

    iter_document_elements_mock = method_mock(
        request,
        _DocxPartitioner,
        "_iter_document_elements",
        side_effect=report_strategy,
    )

    # -- unpacking requires exactly one element to come back --
    [element] = partition_doc(example_doc_path("simple.doc"), **kwargs)

    iter_document_elements_mock.assert_called_once_with(ANY)
    assert element.text == f"strategy == {expected_value}"
 | |
| 
 | |
| 
 | |
def test_partition_doc_grabs_emphasized_texts():
    """Emphasized runs are recorded in metadata; plain text gets None for both fields."""
    emphasized_contents = ["bold", "italic", "bold-italic", "bold-italic"]
    emphasized_tags = ["b", "i", "b", "i"]

    elements = partition_doc(example_doc_path("fake-doc-emphasized-text.doc"))

    # -- first element is a table carrying the emphasis metadata --
    table = elements[0]
    assert isinstance(table, Table)
    assert table.metadata.emphasized_text_contents == emphasized_contents
    assert table.metadata.emphasized_text_tags == emphasized_tags

    # -- second element is a paragraph carrying the same emphasis metadata --
    emphasized_paragraph = elements[1]
    assert emphasized_paragraph == NarrativeText("I am a bold italic bold-italic text.")
    assert emphasized_paragraph.metadata.emphasized_text_contents == emphasized_contents
    assert emphasized_paragraph.metadata.emphasized_text_tags == emphasized_tags

    # -- third element has no emphasis, so both metadata fields are None --
    plain_paragraph = elements[2]
    assert plain_paragraph == NarrativeText("I am a normal text.")
    assert plain_paragraph.metadata.emphasized_text_contents is None
    assert plain_paragraph.metadata.emphasized_text_tags is None
 | |
| 
 | |
| 
 | |
def test_partition_doc_round_trips_through_json():
    """Elements partitioned from `simple.doc` pass the JSON round-trip assertion helper."""
    elements = partition_doc(example_doc_path("simple.doc"))

    assert_round_trips_through_JSON(elements)
 | |
| 
 | |
| 
 | |
def test_partition_doc_chunks_elements_when_chunking_strategy_is_specified():
    """Passing `chunking_strategy="basic"` returns chunks equal to chunking in a separate step."""
    path = example_doc_path("simple.doc")
    chunk_types = (CompositeElement, Table, TableChunk)

    elements = partition_doc(path)
    chunks = partition_doc(path, chunking_strategy="basic")

    # -- every item returned is one of the chunk element-types --
    for chunk in chunks:
        assert isinstance(chunk, chunk_types)
    # -- same result as running `chunk_elements()` on the un-chunked elements --
    assert chunks == chunk_elements(elements)
 | |
| 
 | |
| 
 | |
def test_partition_doc_assigns_deterministic_and_unique_element_ids():
    """Element ids repeat across separate partitioning runs and never collide within a run."""
    path = example_doc_path("duplicate-paragraphs.doc")

    first_run_ids = [e.id for e in partition_doc(path)]
    second_run_ids = [e.id for e in partition_doc(path)]

    # -- two independent runs over the same file give the same ids --
    assert first_run_ids == second_run_ids
    # -- no id appears twice within a run --
    assert len(first_run_ids) == len(set(first_run_ids))
 | |
| 
 | |
| 
 | |
| # == module-level fixtures =======================================================================
 | |
| 
 | |
| 
 | |
@pytest.fixture()
def expected_elements() -> list[Element]:
    """Elements the tests in this module expect from partitioning `simple.doc`."""
    elements: list[Element] = [
        Title("These are a few of my favorite things:"),
        ListItem("Parrots"),
        ListItem("Hockey"),
        Title("Analysis"),
        NarrativeText("This is my first thought. This is my second thought."),
        NarrativeText("This is my third thought."),
        Text("2023"),
        Address("DOYLESTOWN, PA 18901"),
    ]
    return elements
 |