| 
									
										
										
										
											2024-10-01 15:40:58 -07:00
										 |  |  | # pyright: reportPrivateUsage=false | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-05-15 18:04:06 -07:00
										 |  |  | """Test suite for `unstructured.partition.odt` module.""" | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | from __future__ import annotations | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-10-01 15:40:58 -07:00
										 |  |  | from typing import Any, Iterator | 
					
						
							| 
									
										
										
										
											2023-05-04 15:28:08 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-10-23 17:11:53 -07:00
										 |  |  | import pytest | 
					
						
							| 
									
										
										
										
											2024-05-15 18:04:06 -07:00
										 |  |  | from pytest_mock import MockFixture | 
					
						
							| 
									
										
										
										
											2023-10-23 17:11:53 -07:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-05-16 15:14:02 -07:00
										 |  |  | from test_unstructured.unit_utils import ( | 
					
						
							| 
									
										
										
										
											2024-10-01 15:40:58 -07:00
										 |  |  |     ANY, | 
					
						
							| 
									
										
										
										
											2024-05-16 15:14:02 -07:00
										 |  |  |     FixtureRequest, | 
					
						
							|  |  |  |     assert_round_trips_through_JSON, | 
					
						
							|  |  |  |     example_doc_path, | 
					
						
							| 
									
										
										
										
											2024-10-01 15:40:58 -07:00
										 |  |  |     method_mock, | 
					
						
							| 
									
										
										
										
											2024-05-16 15:14:02 -07:00
										 |  |  | ) | 
					
						
							| 
									
										
										
										
											2024-05-15 18:04:06 -07:00
										 |  |  | from unstructured.chunking.basic import chunk_elements | 
					
						
							| 
									
										
										
										
											2024-10-01 15:40:58 -07:00
										 |  |  | from unstructured.documents.elements import ( | 
					
						
							|  |  |  |     CompositeElement, | 
					
						
							|  |  |  |     Element, | 
					
						
							|  |  |  |     Table, | 
					
						
							|  |  |  |     TableChunk, | 
					
						
							|  |  |  |     Text, | 
					
						
							|  |  |  |     Title, | 
					
						
							|  |  |  | ) | 
					
						
							| 
									
										
										
										
											2024-05-15 18:04:06 -07:00
										 |  |  | from unstructured.partition.docx import partition_docx | 
					
						
							| 
									
										
										
										
											2023-05-04 15:28:08 -04:00
										 |  |  | from unstructured.partition.odt import partition_odt | 
					
						
							| 
									
										
										
										
											2023-10-05 15:26:47 -05:00
										 |  |  | from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA | 
					
						
							| 
									
										
										
										
											2023-05-04 15:28:08 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-05-15 18:04:06 -07:00
										 |  |  | 
 | 
					
						
							|  |  |  | def test_partition_odt_matches_partition_docx(): | 
					
						
							|  |  |  |     odt_file_path = example_doc_path("simple.odt") | 
					
						
							|  |  |  |     docx_file_path = example_doc_path("simple.docx") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     assert partition_odt(odt_file_path) == partition_docx(docx_file_path) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # -- document-source (file or filename) ---------------------------------------------------------- | 
					
						
							| 
									
										
										
										
											2023-05-04 15:28:08 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_partition_odt_from_filename(): | 
					
						
							| 
									
										
										
										
											2024-05-15 18:04:06 -07:00
										 |  |  |     elements = partition_odt(example_doc_path("fake.odt")) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-09-16 22:29:44 -07:00
										 |  |  |     assert elements == [ | 
					
						
							|  |  |  |         Title("Lorem ipsum dolor sit amet."), | 
					
						
							|  |  |  |         Table( | 
					
						
							| 
									
										
										
										
											2024-05-15 18:04:06 -07:00
										 |  |  |             "Header row Mon Wed Fri" | 
					
						
							|  |  |  |             " Color Blue Red Green" | 
					
						
							|  |  |  |             " Time 1pm 2pm 3pm" | 
					
						
							|  |  |  |             " Leader Sarah Mark Ryan" | 
					
						
							| 
									
										
										
										
											2023-09-16 22:29:44 -07:00
										 |  |  |         ), | 
					
						
							|  |  |  |     ] | 
					
						
							| 
									
										
										
										
											2024-05-15 18:04:06 -07:00
										 |  |  |     assert all(e.metadata.filename == "fake.odt" for e in elements) | 
					
						
							| 
									
										
										
										
											2023-10-05 15:26:47 -05:00
										 |  |  |     if UNSTRUCTURED_INCLUDE_DEBUG_METADATA: | 
					
						
							| 
									
										
										
										
											2024-05-15 18:04:06 -07:00
										 |  |  |         # -- document is ultimately partitioned by partition_docx() -- | 
					
						
							|  |  |  |         assert {e.metadata.detection_origin for e in elements} == {"docx"} | 
					
						
							| 
									
										
										
										
											2023-05-04 15:28:08 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_partition_odt_from_file(): | 
					
						
							| 
									
										
										
										
											2024-05-15 18:04:06 -07:00
										 |  |  |     with open(example_doc_path("fake.odt"), "rb") as f: | 
					
						
							| 
									
										
										
										
											2023-05-04 15:28:08 -04:00
										 |  |  |         elements = partition_odt(file=f) | 
					
						
							| 
									
										
										
										
											2024-05-15 18:04:06 -07:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-09-16 22:29:44 -07:00
										 |  |  |     assert elements == [ | 
					
						
							|  |  |  |         Title("Lorem ipsum dolor sit amet."), | 
					
						
							|  |  |  |         Table( | 
					
						
							| 
									
										
										
										
											2024-05-15 18:04:06 -07:00
										 |  |  |             "Header row Mon Wed Fri" | 
					
						
							|  |  |  |             " Color Blue Red Green" | 
					
						
							|  |  |  |             " Time 1pm 2pm 3pm" | 
					
						
							|  |  |  |             " Leader Sarah Mark Ryan" | 
					
						
							| 
									
										
										
										
											2023-09-16 22:29:44 -07:00
										 |  |  |         ), | 
					
						
							|  |  |  |     ] | 
					
						
							| 
									
										
										
										
											2023-06-30 09:44:46 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-05-15 18:04:06 -07:00
										 |  |  | # -- .metadata.filename -------------------------------------------------------------------------- | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-10-01 15:40:58 -07:00
										 |  |  | def test_partition_odt_from_filename_gets_the_ODT_filename_in_metadata_not_the_DOCX_filename(): | 
					
						
							|  |  |  |     elements = partition_odt(example_doc_path("simple.odt")) | 
					
						
							|  |  |  |     assert all(e.metadata.filename == "simple.odt" for e in elements), ( | 
					
						
							|  |  |  |         f"Expected all elements to have 'simple.odt' as their filename, but got:" | 
					
						
							|  |  |  |         f" {repr(elements[0].metadata.filename)}" | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-05-15 18:04:06 -07:00
										 |  |  | def test_partition_odt_from_filename_with_metadata_filename(): | 
					
						
							|  |  |  |     elements = partition_odt(example_doc_path("fake.odt"), metadata_filename="test") | 
					
						
							| 
									
										
										
										
											2024-10-01 15:40:58 -07:00
										 |  |  |     assert all(e.metadata.filename == "test" for e in elements) | 
					
						
							| 
									
										
										
										
											2023-10-23 17:11:53 -07:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-07-05 15:02:22 -05:00
										 |  |  | def test_partition_odt_from_file_with_metadata_filename(): | 
					
						
							| 
									
										
										
										
											2024-05-15 18:04:06 -07:00
										 |  |  |     with open(example_doc_path("fake.odt"), "rb") as f: | 
					
						
							| 
									
										
										
										
											2023-07-05 15:02:22 -05:00
										 |  |  |         elements = partition_odt(file=f, metadata_filename="test") | 
					
						
							| 
									
										
										
										
											2024-10-01 15:40:58 -07:00
										 |  |  |     assert all(e.metadata.filename == "test" for e in elements) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # -- .metadata.filetype -------------------------------------------------------------------------- | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_partition_odt_gets_the_ODT_MIME_type_in_metadata_filetype(): | 
					
						
							|  |  |  |     ODT_MIME_TYPE = "application/vnd.oasis.opendocument.text" | 
					
						
							|  |  |  |     elements = partition_odt(example_doc_path("simple.odt")) | 
					
						
							|  |  |  |     assert all(e.metadata.filetype == ODT_MIME_TYPE for e in elements), ( | 
					
						
							|  |  |  |         f"Expected all elements to have '{ODT_MIME_TYPE}' as their filetype, but got:" | 
					
						
							|  |  |  |         f" {repr(elements[0].metadata.filetype)}" | 
					
						
							|  |  |  |     ) | 
					
						
							| 
									
										
										
										
											2023-07-05 15:02:22 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-05-15 18:04:06 -07:00
										 |  |  | # -- .metadata.text_as_html ---------------------------------------------------------------------- | 
					
						
							| 
									
										
										
										
											2023-06-30 09:44:46 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-05-15 18:04:06 -07:00
										 |  |  | @pytest.mark.parametrize("kwargs", [{}, {"infer_table_structure": True}]) | 
					
						
							|  |  |  | def test_partition_odt_adds_text_as_html_when_infer_table_structure_is_omitted_or_True( | 
					
						
							| 
									
										
										
										
											2024-09-25 11:17:48 -07:00
										 |  |  |     kwargs: dict[str, Any], | 
					
						
							| 
									
										
										
										
											2024-05-15 18:04:06 -07:00
										 |  |  | ): | 
					
						
							|  |  |  |     with open(example_doc_path("fake.odt"), "rb") as f: | 
					
						
							|  |  |  |         elements = partition_odt(file=f, **kwargs) | 
					
						
							| 
									
										
										
										
											2023-06-30 09:44:46 -05:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-05-15 18:04:06 -07:00
										 |  |  |     table = elements[1] | 
					
						
							|  |  |  |     assert isinstance(table, Table) | 
					
						
							|  |  |  |     assert table.metadata.text_as_html is not None | 
					
						
							|  |  |  |     assert table.metadata.text_as_html.startswith("<table>") | 
					
						
							| 
									
										
										
										
											2023-06-30 09:44:46 -05:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-07-26 15:10:14 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-05-15 18:04:06 -07:00
										 |  |  | def test_partition_odt_suppresses_text_as_html_when_infer_table_structure_is_False(): | 
					
						
							|  |  |  |     with open(example_doc_path("fake.odt"), "rb") as f: | 
					
						
							|  |  |  |         elements = partition_odt(file=f, infer_table_structure=False) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     table = elements[1] | 
					
						
							|  |  |  |     assert isinstance(table, Table) | 
					
						
							|  |  |  |     assert table.metadata.text_as_html is None | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # -- .metadata.last_modified --------------------------------------------------------------------- | 
					
						
							| 
									
										
										
										
											2023-07-26 15:10:14 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-06-17 12:43:18 -07:00
										 |  |  | def test_partition_odt_pulls_last_modified_from_filesystem(mocker: MockFixture): | 
					
						
							| 
									
										
										
										
											2024-05-15 18:04:06 -07:00
										 |  |  |     filesystem_last_modified = "2029-07-05T09:24:28" | 
					
						
							| 
									
										
										
										
											2023-07-26 15:10:14 -04:00
										 |  |  |     mocker.patch( | 
					
						
							| 
									
										
										
										
											2024-09-23 15:23:10 -07:00
										 |  |  |         "unstructured.partition.odt.get_last_modified_date", return_value=filesystem_last_modified | 
					
						
							| 
									
										
										
										
											2023-07-26 15:10:14 -04:00
										 |  |  |     ) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-05-15 18:04:06 -07:00
										 |  |  |     elements = partition_odt(example_doc_path("fake.odt")) | 
					
						
							| 
									
										
										
										
											2023-07-26 15:10:14 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-05-15 18:04:06 -07:00
										 |  |  |     assert all(e.metadata.last_modified == filesystem_last_modified for e in elements) | 
					
						
							| 
									
										
										
										
											2023-07-26 15:10:14 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-06-17 12:43:18 -07:00
										 |  |  | def test_partition_odt_prefers_metadata_last_modified_when_provided(mocker: MockFixture): | 
					
						
							| 
									
										
										
										
											2024-05-15 18:04:06 -07:00
										 |  |  |     filesystem_last_modified = "2029-07-05T09:24:28" | 
					
						
							|  |  |  |     metadata_last_modified = "2020-07-05T09:24:28" | 
					
						
							| 
									
										
										
										
											2023-07-26 15:10:14 -04:00
										 |  |  |     mocker.patch( | 
					
						
							| 
									
										
										
										
											2024-09-23 15:23:10 -07:00
										 |  |  |         "unstructured.partition.odt.get_last_modified_date", return_value=filesystem_last_modified | 
					
						
							| 
									
										
										
										
											2023-07-26 15:10:14 -04:00
										 |  |  |     ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     elements = partition_odt( | 
					
						
							| 
									
										
										
										
											2024-05-15 18:04:06 -07:00
										 |  |  |         example_doc_path("simple.odt"), metadata_last_modified=metadata_last_modified | 
					
						
							| 
									
										
										
										
											2023-07-26 15:10:14 -04:00
										 |  |  |     ) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-05-15 18:04:06 -07:00
										 |  |  |     assert all(e.metadata.last_modified == metadata_last_modified for e in elements) | 
					
						
							| 
									
										
										
										
											2023-07-26 15:10:14 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-10-01 15:40:58 -07:00
										 |  |  | # -- .metadata.languages ------------------------------------------------------------------------- | 
					
						
							| 
									
										
										
										
											2024-03-18 02:09:44 +01:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-05-15 18:04:06 -07:00
										 |  |  | def test_partition_odt_adds_languages_metadata(): | 
					
						
							|  |  |  |     elements = partition_odt(example_doc_path("simple.odt")) | 
					
						
							|  |  |  |     assert all(e.metadata.languages == ["eng"] for e in elements) | 
					
						
							| 
									
										
										
										
											2023-09-11 16:00:14 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-05-15 18:04:06 -07:00
										 |  |  | def test_partition_odt_respects_detect_language_per_element_arg(): | 
					
						
							|  |  |  |     elements = partition_odt( | 
					
						
							|  |  |  |         example_doc_path("language-docs/eng_spa_mult.odt"), detect_language_per_element=True | 
					
						
							| 
									
										
										
										
											2023-10-03 09:40:34 -07:00
										 |  |  |     ) | 
					
						
							| 
									
										
										
										
											2024-05-15 18:04:06 -07:00
										 |  |  |     assert [e.metadata.languages for e in elements] == [ | 
					
						
							|  |  |  |         ["eng"], | 
					
						
							|  |  |  |         ["spa", "eng"], | 
					
						
							|  |  |  |         ["eng"], | 
					
						
							|  |  |  |         ["eng"], | 
					
						
							|  |  |  |         ["spa"], | 
					
						
							|  |  |  |     ] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # -- miscellaneous ------------------------------------------------------------------------------- | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-10-10 20:47:56 -05:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-05-16 15:14:02 -07:00
										 |  |  | @pytest.mark.parametrize( | 
					
						
							|  |  |  |     ("kwargs", "expected_value"), | 
					
						
							| 
									
										
										
										
											2024-10-01 15:40:58 -07:00
										 |  |  |     [({}, "hi_res"), ({"strategy": None}, "hi_res"), ({"strategy": "auto"}, "auto")], | 
					
						
							| 
									
										
										
										
											2024-05-16 15:14:02 -07:00
										 |  |  | ) | 
					
						
							|  |  |  | def test_partition_odt_forwards_strategy_arg_to_partition_docx( | 
					
						
							|  |  |  |     request: FixtureRequest, kwargs: dict[str, Any], expected_value: str | None | 
					
						
							|  |  |  | ): | 
					
						
							| 
									
										
										
										
											2024-10-01 15:40:58 -07:00
										 |  |  |     from unstructured.partition.docx import _DocxPartitioner | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     def fake_iter_document_elements(self: _DocxPartitioner) -> Iterator[Element]: | 
					
						
							|  |  |  |         yield Text(f"strategy == {self._opts.strategy}") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     _iter_elements_ = method_mock( | 
					
						
							|  |  |  |         request, | 
					
						
							|  |  |  |         _DocxPartitioner, | 
					
						
							|  |  |  |         "_iter_document_elements", | 
					
						
							|  |  |  |         side_effect=fake_iter_document_elements, | 
					
						
							|  |  |  |     ) | 
					
						
							| 
									
										
										
										
											2024-05-16 15:14:02 -07:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-10-01 15:40:58 -07:00
										 |  |  |     (element,) = partition_odt(example_doc_path("simple.odt"), **kwargs) | 
					
						
							| 
									
										
										
										
											2024-05-16 15:14:02 -07:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-10-01 15:40:58 -07:00
										 |  |  |     _iter_elements_.assert_called_once_with(ANY) | 
					
						
							|  |  |  |     assert element.text == f"strategy == {expected_value}" | 
					
						
							| 
									
										
										
										
											2024-05-16 15:14:02 -07:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-05-15 18:04:06 -07:00
										 |  |  | def test_partition_odt_round_trips_through_json(): | 
					
						
							|  |  |  |     """Elements produced can be serialized then deserialized without loss.""" | 
					
						
							|  |  |  |     assert_round_trips_through_JSON(partition_odt(example_doc_path("simple.odt"))) | 
					
						
							| 
									
										
										
										
											2023-10-10 20:47:56 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-05-15 18:04:06 -07:00
										 |  |  | def test_partition_odt_chunks_elements_when_chunking_strategy_is_specified(): | 
					
						
							|  |  |  |     document_path = example_doc_path("simple.odt") | 
					
						
							|  |  |  |     elements = partition_odt(document_path) | 
					
						
							|  |  |  |     chunks = partition_odt(document_path, chunking_strategy="basic") | 
					
						
							| 
									
										
										
										
											2023-10-10 20:47:56 -05:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-05-15 18:04:06 -07:00
										 |  |  |     # -- all chunks are chunk element-types -- | 
					
						
							|  |  |  |     assert all(isinstance(c, (CompositeElement, Table, TableChunk)) for c in chunks) | 
					
						
							|  |  |  |     # -- chunks from partitioning match those produced by chunking elements in separate step -- | 
					
						
							|  |  |  |     assert chunks == chunk_elements(elements) |