mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-09-07 15:50:07 +00:00
Add text as html to orig elements chunks (#3779)
This is the simplest solution: HTML is no longer dropped from metadata when merging elements that come from HTML input. We still need to decide how to handle nested elements and whether we want `LayoutElements` in the metadata of composite elements; a unit test showing the current behavior is included. Note: metadata still contains `orig_elements`, which holds all of the original metadata.
This commit is contained in:
parent
e1babf0660
commit
85ecdab077
@ -7,6 +7,7 @@
|
|||||||
### Features
|
### Features
|
||||||
|
|
||||||
### Fixes
|
### Fixes
|
||||||
|
- **ElementMetadata consolidation** Now `text_as_html` metadata is combined across all elements in CompositeElement when chunking HTML output
|
||||||
|
|
||||||
## 0.16.5
|
## 0.16.5
|
||||||
|
|
||||||
|
90
test_unstructured/chunking/test_html_output.py
Normal file
90
test_unstructured/chunking/test_html_output.py
Normal file
@ -0,0 +1,90 @@
|
|||||||
|
from functools import partial
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from unstructured.chunking.basic import chunk_elements
|
||||||
|
from unstructured.chunking.title import chunk_by_title
|
||||||
|
from unstructured.documents.elements import ElementMetadata, NarrativeText, Text, Title
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(
    params=[
        chunk_elements,
        partial(chunk_by_title, combine_text_under_n_chars=0),
    ]
)
def chunking_fn(request):
    """Provide each chunker under test, one per parametrized run.

    The parameters are `chunk_elements` and `chunk_by_title` bound with
    `combine_text_under_n_chars=0`.
    """
    chunker = request.param
    return chunker
|
||||||
|
|
||||||
|
|
||||||
|
def test_combining_html_metadata_when_multiple_elements_in_composite_element(chunking_fn):
    """Elements merged into a single chunk have their `text_as_html` joined by spaces."""
    title_html = '<h1 class="Title" id="1">Header </h1>'
    date_html = '<time class="CalendarDate" id="2">Date: October 30, 2023 </time>'
    form_html = (
        '<form class="Form" id="3"> '
        '<label class="FormField" for="company-name" id="4">Form field name </label>'
        '<input class="FormFieldValue" id="5" value="Example value" />'
        "</form>"
    )
    # -- the expected value is the per-element HTML in document order, space-separated --
    expected_html = " ".join((title_html, date_html, form_html))

    title = Title(text="Header", metadata=ElementMetadata(text_as_html=title_html))
    date = Text(text="Date: October 30, 2023", metadata=ElementMetadata(text_as_html=date_html))
    form = Text(
        text="Form field name Example value",
        metadata=ElementMetadata(text_as_html=form_html),
    )

    chunks = chunking_fn([title, date, form])

    assert len(chunks) == 1
    assert chunks[0].metadata.text_as_html == expected_html
|
||||||
|
|
||||||
|
|
||||||
|
def test_combining_html_metadata_with_nested_relationship_between_elements(chunking_fn):
    """Show how `text_as_html` is distributed when nested elements span two chunks.

    Ground truth:

        <Document>
          <Page>
            <Section>
              <p>First</p>
              <p>Second</p>
            </Section>
          </Page>
        </Document>

    Elements: Document, Page, Section, Paragraph, Paragraph

    Chunk 1: Document, Page, Section, Paragraph
    Chunk 2: Paragraph
    """
    section_html = '<div class="Section" id="1" />'
    first_html = '<p class="Paragraph" id="2">First </p>'
    second_html = '<p class="Paragraph" id="3">Second </p>'

    # -- the section carries no text of its own; both paragraphs name it as their parent --
    section = Text(text="", metadata=ElementMetadata(text_as_html=section_html))
    first = NarrativeText(
        text="First",
        metadata=ElementMetadata(text_as_html=first_html, parent_id="1"),
    )
    second = NarrativeText(
        text="Second",
        metadata=ElementMetadata(text_as_html=second_html, parent_id="1"),
    )

    chunks = chunking_fn([section, first, second], max_characters=6)

    assert len(chunks) == 2
    first_chunk, second_chunk = chunks[0], chunks[1]
    assert first_chunk.text == "First"
    assert second_chunk.text == "Second"

    # -- the section's HTML is expected alongside the first paragraph only --
    assert first_chunk.metadata.text_as_html == f"{section_html} {first_html}"
    assert second_chunk.metadata.text_as_html == second_html
|
||||||
|
|
||||||
|
|
||||||
|
def test_html_metadata_exist_in_both_element_when_text_is_split(chunking_fn):
    """Mimic the behaviour of elements with non-HTML metadata.

    When a single element's text is split across chunks, each resulting chunk is expected to
    carry the element's full, unmodified `text_as_html`.
    """
    header_html = '<h1 class="Title" id="1">Header </h1>'
    title = Title(text="Header", metadata=ElementMetadata(text_as_html=header_html))

    chunks = chunking_fn([title], max_characters=3)

    assert len(chunks) == 2
    head, tail = chunks[0], chunks[1]

    assert head.text == "Hea"
    assert tail.text == "der"
    assert head.metadata.text_as_html == header_html
    assert tail.metadata.text_as_html == header_html
|
@ -774,6 +774,8 @@ class TextPreChunk:
|
|||||||
# -- Python 3.7+ maintains dict insertion order --
|
# -- Python 3.7+ maintains dict insertion order --
|
||||||
ordered_unique_keys = {key: None for val_list in values for key in val_list}
|
ordered_unique_keys = {key: None for val_list in values for key in val_list}
|
||||||
yield field_name, list(ordered_unique_keys.keys())
|
yield field_name, list(ordered_unique_keys.keys())
|
||||||
|
elif strategy is CS.STRING_CONCATENATE:
|
||||||
|
yield field_name, " ".join(val.strip() for val in values)
|
||||||
elif strategy is CS.DROP:
|
elif strategy is CS.DROP:
|
||||||
continue
|
continue
|
||||||
else: # pragma: no cover
|
else: # pragma: no cover
|
||||||
|
@ -458,6 +458,9 @@ class ConsolidationStrategy(enum.Enum):
|
|||||||
FIRST = "first"
|
FIRST = "first"
|
||||||
"""Use the first value encountered, omit if not present in any elements."""
|
"""Use the first value encountered, omit if not present in any elements."""
|
||||||
|
|
||||||
|
STRING_CONCATENATE = "string_concatenate"
|
||||||
|
"""Combine the values of this field across elements. Only suitable for fields of `str` type."""
|
||||||
|
|
||||||
LIST_CONCATENATE = "LIST_CONCATENATE"
|
LIST_CONCATENATE = "LIST_CONCATENATE"
|
||||||
"""Concatenate the list values across elements. Only suitable for fields of `List` type."""
|
"""Concatenate the list values across elements. Only suitable for fields of `List` type."""
|
||||||
|
|
||||||
@ -507,7 +510,7 @@ class ConsolidationStrategy(enum.Enum):
|
|||||||
"sent_to": cls.FIRST,
|
"sent_to": cls.FIRST,
|
||||||
"signature": cls.FIRST,
|
"signature": cls.FIRST,
|
||||||
"subject": cls.FIRST,
|
"subject": cls.FIRST,
|
||||||
"text_as_html": cls.FIRST, # -- only occurs in Table --
|
"text_as_html": cls.STRING_CONCATENATE,
|
||||||
"table_as_cells": cls.FIRST, # -- only occurs in Table --
|
"table_as_cells": cls.FIRST, # -- only occurs in Table --
|
||||||
"url": cls.FIRST,
|
"url": cls.FIRST,
|
||||||
"key_value_pairs": cls.DROP, # -- only occurs in FormKeysValues --
|
"key_value_pairs": cls.DROP, # -- only occurs in FormKeysValues --
|
||||||
|
Loading…
x
Reference in New Issue
Block a user