| 
									
										
										
										
											2024-09-23 15:23:10 -07:00
										 |  |  | """Test-suite for `unstructured.partition.tsv` module.""" | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | from __future__ import annotations | 
					
						
							| 
									
										
										
										
											2024-03-18 02:09:44 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-08-13 18:22:36 -07:00
										 |  |  | import pytest | 
					
						
							| 
									
										
										
										
											2024-09-23 15:23:10 -07:00
										 |  |  | from pytest_mock import MockFixture | 
					
						
							| 
									
										
										
										
											2023-08-13 18:22:36 -07:00
										 |  |  | 
 | 
					
						
							|  |  |  | from test_unstructured.partition.test_constants import ( | 
					
						
							|  |  |  |     EXPECTED_TABLE, | 
					
						
							|  |  |  |     EXPECTED_TABLE_WITH_EMOJI, | 
					
						
							|  |  |  |     EXPECTED_TEXT, | 
					
						
							|  |  |  |     EXPECTED_TEXT_WITH_EMOJI, | 
					
						
							| 
									
										
										
										
											2023-10-31 01:16:36 -07:00
										 |  |  |     EXPECTED_TEXT_XLSX, | 
					
						
							| 
									
										
										
										
											2023-08-13 18:22:36 -07:00
										 |  |  | ) | 
					
						
							| 
									
										
										
										
											2023-10-12 12:47:55 -07:00
										 |  |  | from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path | 
					
						
							| 
									
										
										
										
											2024-05-07 18:09:27 -05:00
										 |  |  | from unstructured.chunking.title import chunk_by_title | 
					
						
							| 
									
										
										
										
											2023-06-15 13:50:53 -05:00
										 |  |  | from unstructured.cleaners.core import clean_extra_whitespace | 
					
						
							|  |  |  | from unstructured.documents.elements import Table | 
					
						
							|  |  |  | from unstructured.partition.tsv import partition_tsv | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | EXPECTED_FILETYPE = "text/tsv" | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-08-13 18:22:36 -07:00
										 |  |  | @pytest.mark.parametrize( | 
					
						
							|  |  |  |     ("filename", "expected_text", "expected_table"), | 
					
						
							|  |  |  |     [ | 
					
						
							|  |  |  |         ("stanley-cups.tsv", EXPECTED_TEXT, EXPECTED_TABLE), | 
					
						
							|  |  |  |         ("stanley-cups-with-emoji.tsv", EXPECTED_TEXT_WITH_EMOJI, EXPECTED_TABLE_WITH_EMOJI), | 
					
						
							|  |  |  |     ], | 
					
						
							|  |  |  | ) | 
					
						
							| 
									
										
										
										
											2024-09-23 15:23:10 -07:00
										 |  |  | def test_partition_tsv_from_filename(filename: str, expected_text: str, expected_table: str): | 
					
						
							|  |  |  |     elements = partition_tsv(example_doc_path(filename), include_header=False) | 
					
						
							| 
									
										
										
										
											2023-06-15 13:50:53 -05:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-08-13 18:22:36 -07:00
										 |  |  |     assert clean_extra_whitespace(elements[0].text) == expected_text | 
					
						
							|  |  |  |     assert elements[0].metadata.text_as_html == expected_table | 
					
						
							| 
									
										
										
										
											2023-06-15 13:50:53 -05:00
										 |  |  |     assert elements[0].metadata.filetype == EXPECTED_FILETYPE | 
					
						
							| 
									
										
										
										
											2023-07-05 15:02:22 -05:00
										 |  |  |     for element in elements: | 
					
						
							| 
									
										
										
										
											2023-08-13 18:22:36 -07:00
										 |  |  |         assert element.metadata.filename == filename | 
					
						
							| 
									
										
										
										
											2023-07-05 15:02:22 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-09-23 15:23:10 -07:00
										 |  |  | def test_partition_tsv_from_filename_with_metadata_filename(): | 
					
						
							|  |  |  |     elements = partition_tsv( | 
					
						
							|  |  |  |         example_doc_path("stanley-cups.tsv"), metadata_filename="test", include_header=False | 
					
						
							|  |  |  |     ) | 
					
						
							| 
									
										
										
										
											2023-07-05 15:02:22 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  |     assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT | 
					
						
							|  |  |  |     for element in elements: | 
					
						
							|  |  |  |         assert element.metadata.filename == "test" | 
					
						
							| 
									
										
										
										
											2023-06-15 13:50:53 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-08-13 18:22:36 -07:00
										 |  |  | @pytest.mark.parametrize( | 
					
						
							|  |  |  |     ("filename", "expected_text", "expected_table"), | 
					
						
							|  |  |  |     [ | 
					
						
							|  |  |  |         ("stanley-cups.tsv", EXPECTED_TEXT, EXPECTED_TABLE), | 
					
						
							|  |  |  |         ("stanley-cups-with-emoji.tsv", EXPECTED_TEXT_WITH_EMOJI, EXPECTED_TABLE_WITH_EMOJI), | 
					
						
							|  |  |  |     ], | 
					
						
							|  |  |  | ) | 
					
						
							| 
									
										
										
										
											2024-09-23 15:23:10 -07:00
										 |  |  | def test_partition_tsv_from_file(filename: str, expected_text: str, expected_table: str): | 
					
						
							|  |  |  |     with open(example_doc_path(filename), "rb") as f: | 
					
						
							| 
									
										
										
										
											2023-10-31 01:16:36 -07:00
										 |  |  |         elements = partition_tsv(file=f, include_header=False) | 
					
						
							| 
									
										
										
										
											2023-06-15 13:50:53 -05:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-08-13 18:22:36 -07:00
										 |  |  |     assert clean_extra_whitespace(elements[0].text) == expected_text | 
					
						
							| 
									
										
										
										
											2023-06-15 13:50:53 -05:00
										 |  |  |     assert isinstance(elements[0], Table) | 
					
						
							| 
									
										
										
										
											2023-08-13 18:22:36 -07:00
										 |  |  |     assert elements[0].metadata.text_as_html == expected_table | 
					
						
							| 
									
										
										
										
											2023-06-15 13:50:53 -05:00
										 |  |  |     assert elements[0].metadata.filetype == EXPECTED_FILETYPE | 
					
						
							| 
									
										
										
										
											2023-07-05 15:02:22 -05:00
										 |  |  |     for element in elements: | 
					
						
							|  |  |  |         assert element.metadata.filename is None | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-09-23 15:23:10 -07:00
										 |  |  | def test_partition_tsv_from_file_with_metadata_filename(): | 
					
						
							|  |  |  |     with open(example_doc_path("stanley-cups.tsv"), "rb") as f: | 
					
						
							| 
									
										
										
										
											2023-10-31 01:16:36 -07:00
										 |  |  |         elements = partition_tsv(file=f, metadata_filename="test", include_header=False) | 
					
						
							| 
									
										
										
										
											2023-07-05 15:02:22 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  |     assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT | 
					
						
							|  |  |  |     for element in elements: | 
					
						
							|  |  |  |         assert element.metadata.filename == "test" | 
					
						
							| 
									
										
										
										
											2023-06-15 13:50:53 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-09-23 15:23:10 -07:00
										 |  |  | # -- .metadata.last_modified --------------------------------------------------------------------- | 
					
						
							| 
									
										
										
										
											2023-07-26 15:10:14 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-09-23 15:23:10 -07:00
										 |  |  | def test_partition_tsv_from_file_path_gets_last_modified_from_filesystem(mocker: MockFixture): | 
					
						
							|  |  |  |     filesystem_last_modified = "2024-05-01T15:37:28" | 
					
						
							| 
									
										
										
										
											2023-07-26 15:10:14 -04:00
										 |  |  |     mocker.patch( | 
					
						
							| 
									
										
										
										
											2024-09-23 15:23:10 -07:00
										 |  |  |         "unstructured.partition.tsv.get_last_modified_date", return_value=filesystem_last_modified | 
					
						
							| 
									
										
										
										
											2023-07-26 15:10:14 -04:00
										 |  |  |     ) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-09-23 15:23:10 -07:00
										 |  |  |     elements = partition_tsv(example_doc_path("stanley-cups.tsv")) | 
					
						
							| 
									
										
										
										
											2023-07-26 15:10:14 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-09-23 15:23:10 -07:00
										 |  |  |     assert all(e.metadata.last_modified == filesystem_last_modified for e in elements) | 
					
						
							| 
									
										
										
										
											2023-07-26 15:10:14 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-09-23 15:23:10 -07:00
										 |  |  | def test_partition_tsv_from_file_gets_last_modified_None(): | 
					
						
							|  |  |  |     with open(example_doc_path("stanley-cups.tsv"), "rb") as f: | 
					
						
							|  |  |  |         elements = partition_tsv(file=f) | 
					
						
							| 
									
										
										
										
											2024-03-18 02:09:44 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-09-23 15:23:10 -07:00
										 |  |  |     assert all(e.metadata.last_modified is None for e in elements) | 
					
						
							| 
									
										
										
										
											2024-03-18 02:09:44 +01:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-09-23 15:23:10 -07:00
										 |  |  | def test_partition_tsv_from_file_path_prefers_metadata_last_modified(mocker: MockFixture): | 
					
						
							|  |  |  |     filesystem_last_modified = "2024-05-01T15:37:28" | 
					
						
							|  |  |  |     metadata_last_modified = "2020-07-05T09:24:28" | 
					
						
							| 
									
										
										
										
											2024-03-18 02:09:44 +01:00
										 |  |  |     mocker.patch( | 
					
						
							| 
									
										
										
										
											2024-09-23 15:23:10 -07:00
										 |  |  |         "unstructured.partition.tsv.get_last_modified_date", return_value=filesystem_last_modified | 
					
						
							| 
									
										
										
										
											2024-03-18 02:09:44 +01:00
										 |  |  |     ) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-09-23 15:23:10 -07:00
										 |  |  |     elements = partition_tsv( | 
					
						
							|  |  |  |         example_doc_path("stanley-cups.tsv"), metadata_last_modified=metadata_last_modified | 
					
						
							| 
									
										
										
										
											2023-07-26 15:10:14 -04:00
										 |  |  |     ) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-09-23 15:23:10 -07:00
										 |  |  |     assert all(e.metadata.last_modified == metadata_last_modified for e in elements) | 
					
						
							| 
									
										
										
										
											2023-07-26 15:10:14 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-08-29 16:59:26 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-09-23 15:23:10 -07:00
										 |  |  | def test_partition_tsv_from_file_prefers_metadata_last_modified(): | 
					
						
							|  |  |  |     metadata_last_modified = "2020-07-05T09:24:28" | 
					
						
							| 
									
										
										
										
											2023-08-29 16:59:26 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-09-23 15:23:10 -07:00
										 |  |  |     with open(example_doc_path("stanley-cups.tsv"), "rb") as f: | 
					
						
							|  |  |  |         elements = partition_tsv(file=f, metadata_last_modified=metadata_last_modified) | 
					
						
							| 
									
										
										
										
											2024-03-18 02:09:44 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-09-23 15:23:10 -07:00
										 |  |  |     assert elements[0].metadata.last_modified == metadata_last_modified | 
					
						
							| 
									
										
										
										
											2024-03-18 02:09:44 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-09-23 15:23:10 -07:00
										 |  |  | 
 | 
					
						
							|  |  |  | # ------------------------------------------------------------------------------------------------ | 
					
						
							| 
									
										
										
										
											2024-03-18 02:09:44 +01:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-10-12 12:47:55 -07:00
										 |  |  | @pytest.mark.parametrize("filename", ["stanley-cups.tsv", "stanley-cups-with-emoji.tsv"]) | 
					
						
							|  |  |  | def test_partition_tsv_with_json(filename: str): | 
					
						
							| 
									
										
										
										
											2023-10-31 01:16:36 -07:00
										 |  |  |     elements = partition_tsv(example_doc_path(filename), include_header=False) | 
					
						
							| 
									
										
										
										
											2023-10-12 12:47:55 -07:00
										 |  |  |     assert_round_trips_through_JSON(elements) | 
					
						
							| 
									
										
										
										
											2023-10-10 20:47:56 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # NOTE (jennings) partition_tsv returns a single TableElement per sheet, | 
					
						
							|  |  |  | # so no adding tests for multiple languages like the other partitions | 
					
						
							|  |  |  | def test_partition_tsv_element_metadata_has_languages(): | 
					
						
							|  |  |  |     filename = "example-docs/stanley-cups-with-emoji.tsv" | 
					
						
							| 
									
										
										
										
											2023-10-31 01:16:36 -07:00
										 |  |  |     elements = partition_tsv(filename=filename, include_header=False) | 
					
						
							| 
									
										
										
										
											2023-10-10 20:47:56 -05:00
										 |  |  |     assert elements[0].metadata.languages == ["eng"] | 
					
						
							| 
									
										
										
										
											2023-10-31 01:16:36 -07:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-05-07 18:09:27 -05:00
										 |  |  | def test_partition_tsv_header(): | 
					
						
							| 
									
										
										
										
											2024-09-23 15:23:10 -07:00
										 |  |  |     elements = partition_tsv( | 
					
						
							|  |  |  |         example_doc_path("stanley-cups.tsv"), strategy="fast", include_header=True | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     e = elements[0] | 
					
						
							| 
									
										
										
										
											2023-10-31 01:16:36 -07:00
										 |  |  |     assert ( | 
					
						
							| 
									
										
										
										
											2024-09-23 15:23:10 -07:00
										 |  |  |         clean_extra_whitespace(e.text) == "Stanley Cups Unnamed: 1 Unnamed: 2 " + EXPECTED_TEXT_XLSX | 
					
						
							| 
									
										
										
										
											2023-10-31 01:16:36 -07:00
										 |  |  |     ) | 
					
						
							| 
									
										
										
										
											2024-09-23 15:23:10 -07:00
										 |  |  |     assert e.metadata.text_as_html is not None | 
					
						
							|  |  |  |     assert "<thead>" in e.metadata.text_as_html | 
					
						
							| 
									
										
										
										
											2024-05-07 18:09:27 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_partition_tsv_supports_chunking_strategy_while_partitioning(): | 
					
						
							|  |  |  |     elements = partition_tsv(filename=example_doc_path("stanley-cups.tsv")) | 
					
						
							|  |  |  |     chunks = chunk_by_title(elements, max_characters=9, combine_text_under_n_chars=0) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     chunk_elements = partition_tsv( | 
					
						
							|  |  |  |         example_doc_path("stanley-cups.tsv"), | 
					
						
							|  |  |  |         chunking_strategy="by_title", | 
					
						
							|  |  |  |         max_characters=9, | 
					
						
							|  |  |  |         combine_text_under_n_chars=0, | 
					
						
							|  |  |  |         include_header=False, | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # The same chunks are returned if chunking elements or chunking during partitioning. | 
					
						
							|  |  |  |     assert chunk_elements == chunks |