From 85ecdab077a08e7d2fbcc7dfebf5278ccd2ad31c Mon Sep 17 00:00:00 2001 From: Pluto Date: Wed, 20 Nov 2024 14:27:17 +0100 Subject: [PATCH] Add text as html to orig elements chunks (#3779) This is the simplest solution: it doesn't drop HTML from metadata when merging Elements from HTML input. We still need to decide how to handle nested elements and whether we want `LayoutElements` in the metadata of Composite Elements; a unit test shows the current behavior. Note: metadata still contains `orig_elements`, which has all the metadata. --- CHANGELOG.md | 1 + .../chunking/test_html_output.py | 90 +++++++++++++++++++ unstructured/chunking/base.py | 2 + unstructured/documents/elements.py | 5 +- 4 files changed, 97 insertions(+), 1 deletion(-) create mode 100644 test_unstructured/chunking/test_html_output.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 31bea43ac..4d27953a6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ ### Features ### Fixes +- **ElementMetadata consolidation** Now `text_as_html` metadata is combined across all elements in CompositeElement when chunking HTML output ## 0.16.5 diff --git a/test_unstructured/chunking/test_html_output.py b/test_unstructured/chunking/test_html_output.py new file mode 100644 index 000000000..6e3e92d94 --- /dev/null +++ b/test_unstructured/chunking/test_html_output.py @@ -0,0 +1,90 @@ +from functools import partial + +import pytest + +from unstructured.chunking.basic import chunk_elements +from unstructured.chunking.title import chunk_by_title +from unstructured.documents.elements import ElementMetadata, NarrativeText, Text, Title + + +@pytest.fixture(params=[chunk_elements, partial(chunk_by_title, combine_text_under_n_chars=0)]) +def chunking_fn(request): + return request.param + + +def test_combining_html_metadata_when_multiple_elements_in_composite_element(chunking_fn): + metadata_1 = '

Header

' + metadata_2 = '' + metadata_3 = ( + '
' + '' + '' + "
" + ) + combined_metadata = " ".join([metadata_1, metadata_2, metadata_3]) + + elements = [ + Title(text="Header", metadata=ElementMetadata(text_as_html=metadata_1)), + Text(text="Date: October 30, 2023", metadata=ElementMetadata(text_as_html=metadata_2)), + Text( + text="Form field name Example value", metadata=ElementMetadata(text_as_html=metadata_3) + ), + ] + chunks = chunking_fn(elements) + assert len(chunks) == 1 + assert chunks[0].metadata.text_as_html == combined_metadata + + +def test_combining_html_metadata_with_nested_relationship_between_elements(chunking_fn): + """ + Ground truth + + +
+

First

+

Second

+
+
+
+ Elements: Document, Page, Section, Paragraph, Paragraph + Chunk 1: Document, Page, Section, Paragraph + + Chunk 2: + Paragraph + """ + + metadata_1 = '
' + metadata_2 = '

First

' + metadata_3 = '

Second

' + + elements = [ + Text(text="", metadata=ElementMetadata(text_as_html=metadata_1)), + NarrativeText( + text="First", metadata=ElementMetadata(text_as_html=metadata_2, parent_id="1") + ), + NarrativeText( + text="Second", metadata=ElementMetadata(text_as_html=metadata_3, parent_id="1") + ), + ] + chunks = chunking_fn(elements, max_characters=6) + assert len(chunks) == 2 + assert chunks[0].text == "First" + assert chunks[1].text == "Second" + + assert chunks[0].metadata.text_as_html == metadata_1 + " " + metadata_2 + assert chunks[1].metadata.text_as_html == metadata_3 + + +def test_html_metadata_exist_in_both_element_when_text_is_split(chunking_fn): + """Mimic behaviour of elements with non-html metadata""" + metadata_1 = '

Header

' + elements = [ + Title(text="Header", metadata=ElementMetadata(text_as_html=metadata_1)), + ] + chunks = chunking_fn(elements, max_characters=3) + assert len(chunks) == 2 + + assert chunks[0].text == "Hea" + assert chunks[1].text == "der" + assert chunks[0].metadata.text_as_html == '

Header

' + assert chunks[1].metadata.text_as_html == '

Header

' diff --git a/unstructured/chunking/base.py b/unstructured/chunking/base.py index 90057d11d..b91c3982e 100644 --- a/unstructured/chunking/base.py +++ b/unstructured/chunking/base.py @@ -774,6 +774,8 @@ class TextPreChunk: # -- Python 3.7+ maintains dict insertion order -- ordered_unique_keys = {key: None for val_list in values for key in val_list} yield field_name, list(ordered_unique_keys.keys()) + elif strategy is CS.STRING_CONCATENATE: + yield field_name, " ".join(val.strip() for val in values) elif strategy is CS.DROP: continue else: # pragma: no cover diff --git a/unstructured/documents/elements.py b/unstructured/documents/elements.py index d6f4c3fc3..a9636b5d6 100644 --- a/unstructured/documents/elements.py +++ b/unstructured/documents/elements.py @@ -458,6 +458,9 @@ class ConsolidationStrategy(enum.Enum): FIRST = "first" """Use the first value encountered, omit if not present in any elements.""" + STRING_CONCATENATE = "string_concatenate" + """Combine the values of this field across elements. Only suitable for fields of `str` type.""" + LIST_CONCATENATE = "LIST_CONCATENATE" """Concatenate the list values across elements. Only suitable for fields of `List` type.""" @@ -507,7 +510,7 @@ class ConsolidationStrategy(enum.Enum): "sent_to": cls.FIRST, "signature": cls.FIRST, "subject": cls.FIRST, - "text_as_html": cls.FIRST, # -- only occurs in Table -- + "text_as_html": cls.STRING_CONCATENATE, "table_as_cells": cls.FIRST, # -- only occurs in Table -- "url": cls.FIRST, "key_value_pairs": cls.DROP, # -- only occurs in FormKeysValues --