mirror of https://github.com/Unstructured-IO/unstructured.git (synced 2025-10-30 17:38:13 +00:00)

Commit 3bab9d93e6
**Summary**

In preparation for pluggable auto-partitioners, simplify metadata as discussed.

**Additional Context**

- Pluggable auto-partitioners require partitioners to have a consistent call signature. An arbitrary partitioner provided at runtime needs to have a call signature that is known and consistent: basically `partition_x(filename, *, file, **kwargs)`.
- The current `auto.partition()` is highly coupled to each distinct file-type partitioner, deciding which arguments to forward to each.
- This is driven by the existence of "delegating" partitioners: those that convert their file type and then call a second partitioner to do the actual partitioning. Both the delegating and proxy partitioners are decorated with metadata post-processing decorators, and those decorators are not idempotent. We call the situation where those decorators would run twice "double-decorating". For example, EPUB converts to HTML and calls `partition_html()`, and both `partition_epub()` and `partition_html()` are decorated (see the sketch below).
- The way double-decorating has been avoided in the past is to avoid passing the arguments the metadata decorators are sensitive to along to the proxy partitioner. This is obscure, hard to reason about, error-prone, and overall not a viable strategy. The better solution is to not decorate delegating partitioners and let the proxy partitioner handle all the metadata.
- The first step in preparing for that is to simplify metadata processing by removing unused or unwanted legacy parameters. `date_from_file_object` is one such parameter:
  - It is a misnomer, because a file object never contains last-modified data.
  - It can never produce useful results in the API, where last-modified information must be provided by `metadata_last_modified`.
  - It is an undocumented parameter, so it is not in use.
  - Using it can produce incorrect metadata.
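The call-signature and double-decorating points above can be made concrete with a minimal sketch. All names here (`apply_metadata`, the toy `partition_html()`/`partition_epub()` bodies, and the dict-based elements) are hypothetical stand-ins rather than the library's real decorator or partitioners; the sketch only illustrates the consistent `partition_x(filename, *, file, **kwargs)` signature and why a non-idempotent metadata decorator on both a delegating partitioner and its proxy runs twice.

```python
from __future__ import annotations

from typing import IO, Any, Callable


def apply_metadata(func: Callable[..., list[dict[str, Any]]]):
    """Stand-in for a metadata post-processing decorator; note it is NOT idempotent."""

    def wrapper(filename: str | None = None, *, file: IO[bytes] | None = None, **kwargs: Any):
        elements = func(filename, file=file, **kwargs)
        for element in elements:
            # Each pass bumps the counter; in the real library a second pass would
            # overwrite or corrupt metadata such as filename and last_modified.
            element["metadata"]["passes"] = element["metadata"].get("passes", 0) + 1
        return elements

    return wrapper


@apply_metadata
def partition_html(filename: str | None = None, *, file: IO[bytes] | None = None, **kwargs: Any):
    return [{"text": "Hello", "metadata": {}}]


@apply_metadata  # decorating the delegating partitioner as well is what causes double-decorating
def partition_epub(filename: str | None = None, *, file: IO[bytes] | None = None, **kwargs: Any):
    # A delegating partitioner: "convert" EPUB to HTML, then proxy to partition_html().
    return partition_html(filename, file=file, **kwargs)


print(partition_epub("example.epub")[0]["metadata"]["passes"])  # prints 2, not the expected 1
```

With the decorator removed from the delegating `partition_epub()`, the proxy `partition_html()` would handle all metadata exactly once, which is the direction this change prepares for.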
		
			
				
	
	
		
188 lines · 6.8 KiB · Python
| """Test-suite for `unstructured.partition.tsv` module."""
 | |
| 
 | |
| from __future__ import annotations
 | |
| 
 | |
| import pytest
 | |
| from pytest_mock import MockFixture
 | |
| 
 | |
| from test_unstructured.partition.test_constants import (
 | |
|     EXPECTED_TABLE,
 | |
|     EXPECTED_TABLE_WITH_EMOJI,
 | |
|     EXPECTED_TEXT,
 | |
|     EXPECTED_TEXT_WITH_EMOJI,
 | |
|     EXPECTED_TEXT_XLSX,
 | |
| )
 | |
| from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path
 | |
| from unstructured.chunking.title import chunk_by_title
 | |
| from unstructured.cleaners.core import clean_extra_whitespace
 | |
| from unstructured.documents.elements import Table
 | |
| from unstructured.partition.tsv import partition_tsv
 | |
| 
 | |
| EXPECTED_FILETYPE = "text/tsv"
 | |
| 
 | |
| 
 | |
| @pytest.mark.parametrize(
 | |
|     ("filename", "expected_text", "expected_table"),
 | |
|     [
 | |
|         ("stanley-cups.tsv", EXPECTED_TEXT, EXPECTED_TABLE),
 | |
|         ("stanley-cups-with-emoji.tsv", EXPECTED_TEXT_WITH_EMOJI, EXPECTED_TABLE_WITH_EMOJI),
 | |
|     ],
 | |
| )
 | |
| def test_partition_tsv_from_filename(filename: str, expected_text: str, expected_table: str):
 | |
|     elements = partition_tsv(example_doc_path(filename), include_header=False)
 | |
| 
 | |
|     assert clean_extra_whitespace(elements[0].text) == expected_text
 | |
|     assert elements[0].metadata.text_as_html == expected_table
 | |
|     assert elements[0].metadata.filetype == EXPECTED_FILETYPE
 | |
|     for element in elements:
 | |
|         assert element.metadata.filename == filename
 | |
| 
 | |
| 
 | |
| def test_partition_tsv_from_filename_with_metadata_filename():
 | |
|     elements = partition_tsv(
 | |
|         example_doc_path("stanley-cups.tsv"), metadata_filename="test", include_header=False
 | |
|     )
 | |
| 
 | |
|     assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
 | |
|     for element in elements:
 | |
|         assert element.metadata.filename == "test"
 | |
| 
 | |
| 
 | |
| @pytest.mark.parametrize(
 | |
|     ("filename", "expected_text", "expected_table"),
 | |
|     [
 | |
|         ("stanley-cups.tsv", EXPECTED_TEXT, EXPECTED_TABLE),
 | |
|         ("stanley-cups-with-emoji.tsv", EXPECTED_TEXT_WITH_EMOJI, EXPECTED_TABLE_WITH_EMOJI),
 | |
|     ],
 | |
| )
 | |
| def test_partition_tsv_from_file(filename: str, expected_text: str, expected_table: str):
 | |
|     with open(example_doc_path(filename), "rb") as f:
 | |
|         elements = partition_tsv(file=f, include_header=False)
 | |
| 
 | |
|     assert clean_extra_whitespace(elements[0].text) == expected_text
 | |
|     assert isinstance(elements[0], Table)
 | |
|     assert elements[0].metadata.text_as_html == expected_table
 | |
|     assert elements[0].metadata.filetype == EXPECTED_FILETYPE
 | |
|     for element in elements:
 | |
|         assert element.metadata.filename is None
 | |
| 
 | |
| 
 | |
| def test_partition_tsv_from_file_with_metadata_filename():
 | |
|     with open(example_doc_path("stanley-cups.tsv"), "rb") as f:
 | |
|         elements = partition_tsv(file=f, metadata_filename="test", include_header=False)
 | |
| 
 | |
|     assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
 | |
|     for element in elements:
 | |
|         assert element.metadata.filename == "test"
 | |
| 
 | |
| 
 | |
| def test_partition_tsv_filename_exclude_metadata():
 | |
|     elements = partition_tsv(
 | |
|         example_doc_path("stanley-cups.tsv"), include_metadata=False, include_header=False
 | |
|     )
 | |
| 
 | |
|     assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
 | |
|     assert isinstance(elements[0], Table)
 | |
|     assert elements[0].metadata.text_as_html is None
 | |
|     assert elements[0].metadata.filetype is None
 | |
|     for element in elements:
 | |
|         assert element.metadata.filename is None
 | |
| 
 | |
| 
 | |
| def test_partition_tsv_from_file_exclude_metadata():
 | |
|     with open(example_doc_path("stanley-cups.tsv"), "rb") as f:
 | |
|         elements = partition_tsv(file=f, include_metadata=False)
 | |
| 
 | |
|     for i in range(len(elements)):
 | |
|         assert elements[i].metadata.to_dict() == {}
 | |
| 
 | |
| 
 | |
| # -- .metadata.last_modified ---------------------------------------------------------------------
 | |
| 
 | |
| 
 | |
| def test_partition_tsv_from_file_path_gets_last_modified_from_filesystem(mocker: MockFixture):
 | |
|     filesystem_last_modified = "2024-05-01T15:37:28"
 | |
|     mocker.patch(
 | |
|         "unstructured.partition.tsv.get_last_modified_date", return_value=filesystem_last_modified
 | |
|     )
 | |
| 
 | |
|     elements = partition_tsv(example_doc_path("stanley-cups.tsv"))
 | |
| 
 | |
|     assert all(e.metadata.last_modified == filesystem_last_modified for e in elements)
 | |
| 
 | |
| 
 | |
| def test_partition_tsv_from_file_gets_last_modified_None():
 | |
|     with open(example_doc_path("stanley-cups.tsv"), "rb") as f:
 | |
|         elements = partition_tsv(file=f)
 | |
| 
 | |
|     assert all(e.metadata.last_modified is None for e in elements)
 | |
| 
 | |
| 
 | |
| def test_partition_tsv_from_file_path_prefers_metadata_last_modified(mocker: MockFixture):
 | |
|     filesystem_last_modified = "2024-05-01T15:37:28"
 | |
|     metadata_last_modified = "2020-07-05T09:24:28"
 | |
|     mocker.patch(
 | |
|         "unstructured.partition.tsv.get_last_modified_date", return_value=filesystem_last_modified
 | |
|     )
 | |
| 
 | |
|     elements = partition_tsv(
 | |
|         example_doc_path("stanley-cups.tsv"), metadata_last_modified=metadata_last_modified
 | |
|     )
 | |
| 
 | |
|     assert all(e.metadata.last_modified == metadata_last_modified for e in elements)
 | |
| 
 | |
| 
 | |
| def test_partition_tsv_from_file_prefers_metadata_last_modified():
 | |
|     metadata_last_modified = "2020-07-05T09:24:28"
 | |
| 
 | |
|     with open(example_doc_path("stanley-cups.tsv"), "rb") as f:
 | |
|         elements = partition_tsv(file=f, metadata_last_modified=metadata_last_modified)
 | |
| 
 | |
|     assert elements[0].metadata.last_modified == metadata_last_modified
 | |
| 
 | |
| 
 | |
| # ------------------------------------------------------------------------------------------------
 | |
| 
 | |
| 
 | |
| @pytest.mark.parametrize("filename", ["stanley-cups.tsv", "stanley-cups-with-emoji.tsv"])
 | |
| def test_partition_tsv_with_json(filename: str):
 | |
|     elements = partition_tsv(example_doc_path(filename), include_header=False)
 | |
|     assert_round_trips_through_JSON(elements)
 | |
| 
 | |
| 
 | |
| # NOTE (jennings) partition_tsv returns a single TableElement per sheet,
 | |
| # so no adding tests for multiple languages like the other partitions
 | |
| def test_partition_tsv_element_metadata_has_languages():
 | |
|     filename = "example-docs/stanley-cups-with-emoji.tsv"
 | |
|     elements = partition_tsv(filename=filename, include_header=False)
 | |
|     assert elements[0].metadata.languages == ["eng"]
 | |
| 
 | |
| 
 | |
| def test_partition_tsv_header():
 | |
|     elements = partition_tsv(
 | |
|         example_doc_path("stanley-cups.tsv"), strategy="fast", include_header=True
 | |
|     )
 | |
| 
 | |
|     e = elements[0]
 | |
|     assert (
 | |
|         clean_extra_whitespace(e.text) == "Stanley Cups Unnamed: 1 Unnamed: 2 " + EXPECTED_TEXT_XLSX
 | |
|     )
 | |
|     assert e.metadata.text_as_html is not None
 | |
|     assert "<thead>" in e.metadata.text_as_html
 | |
| 
 | |
| 
 | |
| def test_partition_tsv_supports_chunking_strategy_while_partitioning():
 | |
|     elements = partition_tsv(filename=example_doc_path("stanley-cups.tsv"))
 | |
|     chunks = chunk_by_title(elements, max_characters=9, combine_text_under_n_chars=0)
 | |
| 
 | |
|     chunk_elements = partition_tsv(
 | |
|         example_doc_path("stanley-cups.tsv"),
 | |
|         chunking_strategy="by_title",
 | |
|         max_characters=9,
 | |
|         combine_text_under_n_chars=0,
 | |
|         include_header=False,
 | |
|     )
 | |
| 
 | |
|     # The same chunks are returned if chunking elements or chunking during partitioning.
 | |
|     assert chunk_elements == chunks
 |