Ml 415/merge inline elements (#3749)

This commit is contained in:
Pluto 2024-10-31 13:17:25 +01:00 committed by GitHub
parent eb1b294b73
commit 1953b8699f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 3248 additions and 18 deletions

View File

@ -1,10 +1,11 @@
## 0.16.4-dev1
## 0.16.4-dev2
### Enhancements
* **`value` attribute in `<input/>` element is parsed to `OntologyElement.text` in ontology**
* **`id` and `class` attributes removed from Table subtags in HTML partitioning**
* **cleaned `to_html` and newly introduced `to_text` in `OntologyElement`**
* **Elements created from V2 HTML are less granular** Added merging of adjacent text elements and inline HTML tags in the HTML partitioner to reduce the number of elements created from V2 HTML.
### Features

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,18 @@
<body class="Document" id="897a8a47377c4ad6aab839a929879537">
<div class="Page" data-page-number="1" id="3a6b156a81764e17be128264241f8136">
<header class="Header" id="45b3d0053468484ba1c7b53998115412">
<p class="NarrativeText" id="6cd3c1ba79654abb9c86162b6d1dae46">
Table of Contents
</p>
<address class="Address" id="7d7541d9943c4ad0b88bc47fd0b29e4a">
68 Prince Street Palmdale, CA 93550
</address>
<a class="Hyperlink" id="fde2621aa3df4b159bf305566110cca4">
www.google.com
</a>
<span class="UncategorizedText" id="cb0d6675109241428778c7b996e0b21c">
More text
</span>
</header>
</div>
</body>

View File

@ -4,7 +4,16 @@ import pytest
from unstructured.chunking.basic import chunk_elements
from unstructured.chunking.title import chunk_by_title
from unstructured.documents.ontology import Column, Document, Page, Paragraph
from unstructured.documents.ontology import (
Column,
Document,
Hyperlink,
Image,
Page,
Paragraph,
Section,
Table,
)
from unstructured.embed.openai import OpenAIEmbeddingConfig, OpenAIEmbeddingEncoder
from unstructured.partition.html import partition_html
from unstructured.partition.html.transformations import (
@ -171,6 +180,11 @@ def test_parsed_ontology_can_be_serialized_from_json(json_file_path):
("html_file_path", "json_file_path"),
[
("html_files/example.html", "unstructured_json_output/example.json"),
("html_files/example_full_doc.html", "unstructured_json_output/example_full_doc.json"),
(
"html_files/example_with_inline_fields.html",
"unstructured_json_output/example_with_inline_fields.json",
),
],
)
def test_parsed_ontology_can_be_serialized_from_html(html_file_path, json_file_path):
@ -180,8 +194,136 @@ def test_parsed_ontology_can_be_serialized_from_html(html_file_path, json_file_p
expected_json_elements = elements_from_json(str(json_file_path))
html_code = html_file_path.read_text()
predicted_elements = partition_html(text=html_code, html_parser_version="v2")
predicted_elements = partition_html(
text=html_code, html_parser_version="v2", unique_element_ids=True
)
assert len(expected_json_elements) == len(predicted_elements)
for i in range(len(expected_json_elements)):
assert expected_json_elements[i] == expected_json_elements[i]
assert expected_json_elements[i] == predicted_elements[i]
def test_inline_elements_are_squeezed():
    """Adjacent hyperlinks on one page end up in a single unstructured element."""
    links = [
        Hyperlink(text="Hyperlink1"),
        Hyperlink(text="Hyperlink2"),
        Hyperlink(text="Hyperlink3"),
    ]
    document = Document(children=[Page(children=links)])

    elements = ontology_to_unstructured_elements(document)

    # One element for the page itself plus one element holding all three links.
    assert len(elements) == 2
    _page, merged = elements
    assert merged.text == "Hyperlink1 Hyperlink2 Hyperlink3"
def test_text_elements_are_squeezed():
    """Two sibling paragraphs end up in a single unstructured element."""
    paragraphs = [
        Paragraph(text="Paragraph1"),
        Paragraph(text="Paragraph2"),
    ]
    document = Document(children=[Page(children=paragraphs)])

    elements = ontology_to_unstructured_elements(document)

    # One element for the page itself plus one element holding both paragraphs.
    assert len(elements) == 2
    _page, merged = elements
    assert merged.text == "Paragraph1 Paragraph2"
def test_inline_elements_are_squeezed_when_image():
    """An image between text/inline siblings separates them into two merged elements."""
    page_children = [
        Paragraph(text="Paragraph1"),
        Hyperlink(text="Hyperlink1"),
        Image(text="Image1"),
        Hyperlink(text="Hyperlink2"),
        Hyperlink(text="Hyperlink3"),
        Paragraph(text="Paragraph2"),
        Paragraph(text="Paragraph3"),
    ]
    document = Document(children=[Page(children=page_children)])

    elements = ontology_to_unstructured_elements(document)

    assert len(elements) == 4
    _page, before_image, _image, after_image = elements
    assert before_image.text == "Paragraph1 Hyperlink1"
    assert after_image.text == "Hyperlink2 Hyperlink3 Paragraph2 Paragraph3"

    # Each merged element must still carry the HTML of both tag kinds it absorbed.
    for merged in (before_image, after_image):
        assert '<a class="Hyperlink"' in merged.metadata.text_as_html
        assert '<p class="Paragraph"' in merged.metadata.text_as_html
def test_inline_elements_are_squeezed_when_table():
    """A table between text/inline siblings separates them into two merged elements."""
    page_children = [
        Hyperlink(text="Hyperlink1"),
        Paragraph(text="Paragraph1"),
        Paragraph(text="Paragraph2"),
        Table(text="Table1"),
        Paragraph(text="Paragraph2"),
        Hyperlink(text="Hyperlink2"),
        Hyperlink(text="Hyperlink3"),
    ]
    document = Document(children=[Page(children=page_children)])

    elements = ontology_to_unstructured_elements(document)

    assert len(elements) == 4
    _page, before_table, table, after_table = elements
    assert before_table.text == "Hyperlink1 Paragraph1 Paragraph2"
    assert table.text == "Table1"
    assert after_table.text == "Paragraph2 Hyperlink2 Hyperlink3"
def test_inline_elements_are_on_many_depths():
    """Siblings are merged separately at each nesting level of the ontology tree."""
    inner_section = Section(
        children=[
            Hyperlink(text="Hyperlink2"),
            Hyperlink(text="Hyperlink3"),
        ]
    )
    outer_section = Section(
        children=[
            inner_section,
            Paragraph(text="Paragraph2"),
            Hyperlink(text="Hyperlink4"),
        ]
    )
    page = Page(
        children=[
            Hyperlink(text="Hyperlink1"),
            Paragraph(text="Paragraph1"),
            outer_section,
        ]
    )
    document = Document(children=[page])

    elements = ontology_to_unstructured_elements(document)

    # Page, top-level merged text, both sections, and one merged text per section level.
    assert len(elements) == 6
    _page, top_level_text, _outer, _inner, inner_text, outer_text = elements
    assert top_level_text.text == "Hyperlink1 Paragraph1"
    assert inner_text.text == "Hyperlink2 Hyperlink3"
    assert outer_text.text == "Paragraph2 Hyperlink4"

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,46 @@
[
{
"element_id": "3a6b156a81764e17be128264241f8136",
"metadata": {
"category_depth": 0,
"page_number": 1,
"parent_id": "897a8a47377c4ad6aab839a929879537",
"text_as_html": "<div class=\"Page\" data-page-number=\"1\" id=\"3a6b156a81764e17be128264241f8136\" />"
},
"text": "",
"type": "UncategorizedText"
},
{
"element_id": "45b3d0053468484ba1c7b53998115412",
"metadata": {
"category_depth": 1,
"page_number": 1,
"parent_id": "3a6b156a81764e17be128264241f8136",
"text_as_html": "<header class=\"Header\" id=\"45b3d0053468484ba1c7b53998115412\" />"
},
"text": "",
"type": "UncategorizedText"
},
{
"element_id": "6cd3c1ba79654abb9c86162b6d1dae46",
"metadata": {
"category_depth": 2,
"page_number": 1,
"parent_id": "45b3d0053468484ba1c7b53998115412",
"text_as_html": "<p class=\"NarrativeText\" id=\"6cd3c1ba79654abb9c86162b6d1dae46\">Table of Contents </p> <address class=\"Address\" id=\"7d7541d9943c4ad0b88bc47fd0b29e4a\">68 Prince Street Palmdale, CA 93550 </address> <a class=\"Hyperlink\" id=\"fde2621aa3df4b159bf305566110cca4\">www.google.com </a>"
},
"text": "Table of Contents 68 Prince Street Palmdale, CA 93550 www.google.com",
"type": "NarrativeText"
},
{
"element_id": "cb0d6675109241428778c7b996e0b21c",
"metadata": {
"category_depth": 2,
"page_number": 1,
"parent_id": "45b3d0053468484ba1c7b53998115412",
"text_as_html": "<span class=\"UncategorizedText\" id=\"cb0d6675109241428778c7b996e0b21c\">More text </span>"
},
"text": "More text",
"type": "UncategorizedText"
}
]

View File

@ -8,9 +8,11 @@ from unstructured.documents.elements import (
Text,
Title,
)
from unstructured.documents.ontology import Address, Paragraph
from unstructured.partition.html.html_utils import indent_html
from unstructured.partition.html.transformations import (
ontology_to_unstructured_elements,
parse_html_to_ontology,
parse_html_to_ontology_element,
unstructured_elements_to_ontology,
)
@ -484,3 +486,72 @@ def test_ordered_list():
)
]
_assert_elements_equal(unstructured_elements, expected_elements)
def test_squeezed_elements_are_parsed_back():
    """A merged element converts back to an ontology whose HTML matches the input."""
    # language=HTML
    source_html = _wrap_in_body_and_page(
        """
<p class="NarrativeText" id="2">
Table of Contents
</p>
<address class="Address" id="3">
68 Prince Street Palmdale, CA 93550
</address>
<a class="Hyperlink" id="4">
www.google.com
</a>
"""
    )

    elements, ontology = _parse_to_unstructured_elements_and_back_to_html(source_html)

    # Compare both HTML documents after running them through the same formatter.
    expected_html = indent_html(source_html, html_parser="html.parser")
    parsed_html = indent_html(ontology.to_html(), html_parser="html.parser")
    assert expected_html == parsed_html

    # The three sibling tags are expected to share one element, joined by single spaces.
    merged_html = (
        '<p class="NarrativeText" id="2">Table of Contents </p> '
        '<address class="Address" id="3">'
        "68 Prince Street Palmdale, CA 93550 "
        "</address> "
        '<a class="Hyperlink" id="4">www.google.com </a>'
    )
    merged_element = NarrativeText(
        text="Table of Contents 68 Prince Street Palmdale, CA 93550 www.google.com",
        element_id="2",
        detection_origin="vlm_partitioner",
        metadata=ElementMetadata(text_as_html=merged_html, parent_id="1"),
    )
    _assert_elements_equal(elements, _page_elements + [merged_element])
def test_inline_elements_are_squeezed_when_text_wrapped_into_paragraphs():
    """Bare text around an address tag is merged with it into one NarrativeText."""
    # language=HTML
    base_html = """
<div class="Page">
About the same
<address class="Address">
1356 Hornor Avenue Oklahoma
</address>
Some text
</div>
"""
    # Such HTML is transformed into Page: [Paragraph, Address, Paragraph].
    # We would like it to be parsed to unstructured elements as [Page, NarrativeText].
    ontology = parse_html_to_ontology(base_html)

    leading_text, address, trailing_text = ontology.children
    assert isinstance(leading_text, Paragraph)
    assert isinstance(address, Address)
    assert isinstance(trailing_text, Paragraph)

    elements = ontology_to_unstructured_elements(ontology)

    assert len(elements) == 2
    page_element, merged_element = elements
    assert isinstance(page_element, Text)
    assert isinstance(merged_element, NarrativeText)

View File

@ -1 +1 @@
__version__ = "0.16.4-dev1" # pragma: no cover
__version__ = "0.16.4-dev2" # pragma: no cover

View File

@ -2,6 +2,7 @@ from __future__ import annotations
import html
from collections import OrderedDict
from itertools import chain
from typing import Sequence, Type
from bs4 import BeautifulSoup, Tag
@ -19,11 +20,19 @@ from unstructured.documents.mappings import (
ONTOLOGY_CLASS_NAME_TO_UNSTRUCTURED_ELEMENT_TYPE_NAME,
)
from unstructured.documents.ontology import (
Bibliography,
Citation,
Document,
ElementTypeEnum,
Footnote,
FootnoteReference,
Glossary,
Hyperlink,
NarrativeText,
OntologyElement,
Page,
Paragraph,
Quote,
UncategorizedText,
)
@ -79,17 +88,19 @@ def ontology_to_unstructured_elements(
),
)
]
childreen = []
children = []
for child in ontology_element.children:
childreen += ontology_to_unstructured_elements(
child = ontology_to_unstructured_elements(
child,
parent_id=ontology_element.id,
page_number=page_number,
depth=0 if isinstance(ontology_element, Document) else depth + 1,
filename=filename,
)
children += child
elements_to_return += childreen
combined_children = combine_inline_elements(children)
elements_to_return += combined_children
else:
unstructured_element_class_name = ONTOLOGY_CLASS_NAME_TO_UNSTRUCTURED_ELEMENT_TYPE_NAME[
ontology_element.__class__.__name__
@ -115,6 +126,117 @@ def ontology_to_unstructured_elements(
return elements_to_return
def combine_inline_elements(elements: list[Element]) -> list[Element]:
    """Merge runs of consecutive mergeable elements into the first element of each run.

    Whether two neighbours may be merged is decided by
    ``can_unstructured_elements_be_merged``. When they can, the text and the
    ``text_as_html`` of the later element are appended (separated by a single
    space) to the earlier one, e.g.:

    {
        'text': "Text from element 1 Text from element 2",
        'metadata': {
            'text_as_html': "<p>Text from element 1</p> <a>Text from element 2</a>"
        }
    }

    Note that the first element of each run is modified in place.

    Args:
        elements (list[Element]): Elements to combine, in document order.

    Returns:
        list[Element]: The elements that remain after merging, in the same order.
    """
    merged = []
    run_head = None

    for candidate in elements:
        if run_head is not None and can_unstructured_elements_be_merged(run_head, candidate):
            # Fold the candidate into the element that started the current run.
            run_head.text += " " + candidate.text
            run_head.metadata.text_as_html += " " + candidate.metadata.text_as_html
            continue

        # The candidate starts a new run; flush the previous one, if there was any.
        if run_head is not None:
            merged.append(run_head)
        run_head = candidate

    if run_head is not None:
        merged.append(run_head)

    return merged
def can_unstructured_elements_be_merged(current_element: Element, next_element: Element) -> bool:
    """Decide whether two neighbouring elements may be merged into one.

    The elements can be merged when all of the following hold:
    - both have the same ``category_depth`` (same level in the HTML tree),
    - none of the top-level tags in their ``text_as_html`` parses to an
      ontology element with children,
    - every one of those tags parses to an inline element or a text element.
    """
    if current_element.metadata.category_depth != next_element.metadata.category_depth:
        return False

    # An already merged element holds several sibling tags, so collect every top-level tag.
    top_level_tags = []
    for element in (current_element, next_element):
        soup = BeautifulSoup(element.metadata.text_as_html, "html.parser")
        top_level_tags.extend(soup.find_all(recursive=False))

    parsed_elements = [parse_html_to_ontology_element(tag) for tag in top_level_tags]

    return all(
        not parsed.children and (is_inline_element(parsed) or is_text_element(parsed))
        for parsed in parsed_elements
    )
def is_text_element(ontology_element: OntologyElement) -> bool:
    """Tell whether the element counts as text that may be combined with inline elements.

    That is the case when it is an instance of one of the listed text classes
    or when its ``elementType`` is one of the listed text categories.
    """
    text_classes = (
        NarrativeText,
        Quote,
        Paragraph,
        Footnote,
        FootnoteReference,
        Citation,
        Bibliography,
        Glossary,
    )
    text_categories = (ElementTypeEnum.metadata,)

    if isinstance(ontology_element, text_classes):
        return True
    return ontology_element.elementType in text_categories
def is_inline_element(ontology_element: OntologyElement) -> bool:
    """Tell whether the element counts as inline and may be combined with text elements.

    That is the case when it is an instance of one of the listed inline classes
    or when its ``elementType`` is one of the listed inline categories.
    """
    inline_classes = (Hyperlink,)
    inline_categories = (ElementTypeEnum.specialized_text, ElementTypeEnum.annotation)

    if isinstance(ontology_element, inline_classes):
        return True
    return ontology_element.elementType in inline_categories
def unstructured_elements_to_ontology(unstructured_elements: Sequence[Element]) -> OntologyElement:
"""
Converts a sequence of unstructured Element objects to an OntologyElement object.
@ -144,18 +266,21 @@ def unstructured_elements_to_ontology(unstructured_elements: Sequence[Element])
)
for element in unstructured_elements:
html_as_tag = BeautifulSoup(element.metadata.text_as_html, "html.parser").find()
ontology_element = parse_html_to_ontology_element(html_as_tag)
# Note: Each HTML of non-terminal Element doesn't have children in HTML
# So we just add Ontology Element with tag and class, later children are appended by
# parent_id.
# For terminal Elements entire HTML is added to text_as_html, thus it allows us to
# recreate the entire HTML structure
html_as_tags = BeautifulSoup(element.metadata.text_as_html, "html.parser").find_all(
recursive=False
)
for html_as_tag in html_as_tags:
ontology_element = parse_html_to_ontology_element(html_as_tag)
# Note: Each HTML of non-terminal Element doesn't have children in HTML
# So we just add Ontology Element with tag and class, later children are appended by
# parent_id.
# For terminal Elements entire HTML is added to text_as_html, thus it allows us to
# recreate the entire HTML structure
id_to_element_mapping[ontology_element.id] = ontology_element
id_to_element_mapping[ontology_element.id] = ontology_element
if element.metadata.parent_id and element.metadata.parent_id in id_to_element_mapping:
id_to_element_mapping[element.metadata.parent_id].children.append(ontology_element)
if element.metadata.parent_id and element.metadata.parent_id in id_to_element_mapping:
id_to_element_mapping[element.metadata.parent_id].children.append(ontology_element)
root_id, root_element = id_to_element_mapping.popitem(last=False)
return root_element