mirror of
				https://github.com/Unstructured-IO/unstructured.git
				synced 2025-10-31 10:03:07 +00:00 
			
		
		
		
	 d726963e42
			
		
	
	
		d726963e42
		
			
		
	
	
	
	
		
			
			Each partitioner has a test like `test_partition_x_with_json()`. These tests serialize the elements produced by the partitioner to JSON, read them back in from JSON, and compare the before and after elements. Because our element equality (`Element.__eq__()`) is shallow, this doesn't tell us much. Taking it one step further, i.e. `List[Element] -> JSON -> List[Element] -> JSON`, and then comparing the two JSON outputs gives us some confidence that the serialized elements can be "re-hydrated" without losing any information. Doing so surfaced a few problems, all in the serialization/deserialization (serde) code that all elements share.
		
			
				
	
	
		
			162 lines
		
	
	
		
			5.2 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			162 lines
		
	
	
		
			5.2 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path
 | |
| from unstructured.chunking.title import chunk_by_title
 | |
| from unstructured.documents.elements import Title
 | |
| from unstructured.partition.rst import partition_rst
 | |
| 
 | |
| 
 | |
def test_partition_rst_from_filename(filename="example-docs/README.rst"):
    """Partition an RST document by path and check its title and file metadata.

    Verifies that the first element equals `Title("Example Docs")`, that its
    filetype is `text/x-rst`, and that every element reports `README.rst` as
    its `metadata.filename`.
    """
    elements = partition_rst(filename=filename)
    leading = elements[0]

    assert leading == Title("Example Docs")
    assert leading.metadata.filetype == "text/x-rst"
    assert all(el.metadata.filename == "README.rst" for el in elements)
 | |
| 
 | |
| 
 | |
def test_partition_rst_from_filename_returns_uns_elements(filename="example-docs/README.rst"):
    """The first element produced when partitioning by path is a `Title` instance."""
    leading = partition_rst(filename=filename)[0]
    assert isinstance(leading, Title)
 | |
| 
 | |
| 
 | |
def test_partition_rst_from_filename_with_metadata_filename(
    filename="example-docs/README.rst",
):
    """`metadata_filename` overrides the filename recorded on every element."""
    elements = partition_rst(filename=filename, metadata_filename="test")
    for el in elements:
        assert el.metadata.filename == "test"
 | |
| 
 | |
| 
 | |
def test_partition_rst_from_file(filename="example-docs/README.rst"):
    """Partition an RST document from an open binary file object.

    Verifies the leading title and filetype, and that no element carries a
    filename when only a file object is supplied.
    """
    with open(filename, "rb") as rst_file:
        elements = partition_rst(file=rst_file)

    leading = elements[0]
    assert leading == Title("Example Docs")
    assert leading.metadata.filetype == "text/x-rst"
    assert all(el.metadata.filename is None for el in elements)
 | |
| 
 | |
| 
 | |
def test_partition_rst_from_file_with_metadata_filename(
    filename="example-docs/README.rst",
):
    """`metadata_filename` is applied to every element when partitioning a file object."""
    with open(filename, "rb") as rst_file:
        elements = partition_rst(file=rst_file, metadata_filename="test")

    assert elements[0] == Title("Example Docs")
    assert all(el.metadata.filename == "test" for el in elements)
 | |
| 
 | |
| 
 | |
def test_partition_rst_from_filename_exclude_metadata(
    filename="example-docs/README.rst",
):
    """With `include_metadata=False`, every element's metadata serializes to `{}`.

    Partitions by path and checks `metadata.to_dict()` on each element.
    """
    elements = partition_rst(filename=filename, include_metadata=False)

    # Iterate the elements directly; the index was only ever used to look them up.
    for element in elements:
        assert element.metadata.to_dict() == {}
 | |
| 
 | |
| 
 | |
def test_partition_rst_from_file_exclude_metadata(filename="example-docs/README.rst"):
    """With `include_metadata=False`, every element's metadata serializes to `{}`.

    Partitions from an open binary file object and checks `metadata.to_dict()`
    on each element.
    """
    with open(filename, "rb") as f:
        elements = partition_rst(file=f, include_metadata=False)

    # Iterate the elements directly; the index was only ever used to look them up.
    for element in elements:
        assert element.metadata.to_dict() == {}
 | |
| 
 | |
| 
 | |
def test_partition_rst_metadata_date(
    mocker,
    filename="example-docs/README.rst",
):
    """`last_modified` reflects the (patched) filesystem lookup when partitioning by path.

    NOTE(review): the patch targets `unstructured.partition.html`; presumably
    the RST partitioner delegates date lookup to that module — confirm.
    """
    patched_date = "2029-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.html.get_last_modified_date",
        return_value=patched_date,
    )

    elements = partition_rst(filename=filename)

    assert elements[0].metadata.last_modified == patched_date
 | |
| 
 | |
| 
 | |
def test_partition_rst_with_custom_metadata_date(
    mocker,
    filename="example-docs/README.rst",
):
    """An explicit `metadata_last_modified` wins over the (patched) lookup value.

    NOTE(review): the patch targets `unstructured.partition.html`; presumably
    the RST partitioner delegates date lookup to that module — confirm.
    """
    patched_date = "2029-07-05T09:24:28"
    caller_supplied_date = "2020-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.html.get_last_modified_date",
        return_value=patched_date,
    )

    elements = partition_rst(filename=filename, metadata_last_modified=caller_supplied_date)

    assert elements[0].metadata.last_modified == caller_supplied_date
 | |
| 
 | |
| 
 | |
def test_partition_rst_from_file_metadata_date(
    mocker,
    filename="example-docs/README.rst",
):
    """`last_modified` reflects the (patched) file-object lookup when partitioning a file.

    NOTE(review): the patch targets `unstructured.partition.html`; presumably
    the RST partitioner delegates date lookup to that module — confirm.
    """
    patched_date = "2029-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.html.get_last_modified_date_from_file",
        return_value=patched_date,
    )

    with open(filename, "rb") as rst_file:
        elements = partition_rst(file=rst_file)

    assert elements[0].metadata.last_modified == patched_date
 | |
| 
 | |
| 
 | |
def test_partition_rst_from_file_with_custom_metadata_date(
    mocker,
    filename="example-docs/README.rst",
):
    """An explicit `metadata_last_modified` wins over the (patched) file-object lookup.

    NOTE(review): the patch targets `unstructured.partition.html`; presumably
    the RST partitioner delegates date lookup to that module — confirm.
    """
    patched_date = "2029-07-05T09:24:28"
    caller_supplied_date = "2020-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.html.get_last_modified_date_from_file",
        return_value=patched_date,
    )

    with open(filename, "rb") as rst_file:
        elements = partition_rst(
            file=rst_file,
            metadata_last_modified=caller_supplied_date,
        )

    assert elements[0].metadata.last_modified == caller_supplied_date
 | |
| 
 | |
| 
 | |
def test_partition_rst_with_json():
    """Elements partitioned from the example README pass the JSON round-trip assertion."""
    rst_path = example_doc_path("README.rst")
    assert_round_trips_through_JSON(partition_rst(rst_path))
 | |
| 
 | |
| 
 | |
def test_add_chunking_strategy_on_partition_rst(filename="example-docs/README.rst"):
    """`chunking_strategy="by_title"` matches calling `chunk_by_title` on the plain output.

    Also checks that the chunked result differs from the unchunked elements.
    """
    unchunked = partition_rst(filename=filename)
    chunked_by_partitioner = partition_rst(filename, chunking_strategy="by_title")
    expected_chunks = chunk_by_title(unchunked)

    assert chunked_by_partitioner != unchunked
    assert chunked_by_partitioner == expected_chunks
 | |
| 
 | |
| 
 | |
def test_partition_rst_element_metadata_has_languages():
    """The first element of the example README reports `["eng"]` as its languages."""
    elements = partition_rst(filename="example-docs/README.rst")
    leading = elements[0]
    assert leading.metadata.languages == ["eng"]
 | |
| 
 | |
| 
 | |
def test_partition_rst_respects_detect_language_per_element():
    """With `detect_language_per_element=True`, each element carries its own languages.

    Checks the per-element language lists for the mixed English/Spanish
    example document, in element order.
    """
    elements = partition_rst(
        filename="example-docs/language-docs/eng_spa_mult.rst",
        detect_language_per_element=True,
    )

    detected = []
    for el in elements:
        detected.append(el.metadata.languages)

    assert detected == [["eng"], ["spa", "eng"], ["eng"], ["eng"], ["spa"]]
 |