from functools import partial import pytest from unstructured.chunking.basic import chunk_elements from unstructured.chunking.title import chunk_by_title from unstructured.documents.elements import ElementMetadata, NarrativeText, Text, Title @pytest.fixture(params=[chunk_elements, partial(chunk_by_title, combine_text_under_n_chars=0)]) def chunking_fn(request): return request.param def test_combining_html_metadata_when_multiple_elements_in_composite_element(chunking_fn): metadata_1 = '
First
Second
First
' metadata_3 = 'Second
' elements = [ Text(text="", metadata=ElementMetadata(text_as_html=metadata_1)), NarrativeText( text="First", metadata=ElementMetadata(text_as_html=metadata_2, parent_id="1") ), NarrativeText( text="Second", metadata=ElementMetadata(text_as_html=metadata_3, parent_id="1") ), ] chunks = chunking_fn(elements, max_characters=6) assert len(chunks) == 2 assert chunks[0].text == "First" assert chunks[1].text == "Second" assert chunks[0].metadata.text_as_html == metadata_1 + " " + metadata_2 assert chunks[1].metadata.text_as_html == metadata_3 def test_html_metadata_exist_in_both_element_when_text_is_split(chunking_fn): """Mimic behaviour of elements with non-html metadata""" metadata_1 = '