mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-11-02 02:53:31 +00:00
Ml 415/merge inline elements (#3749)
This commit is contained in:
parent
eb1b294b73
commit
1953b8699f
@ -1,10 +1,11 @@
|
||||
## 0.16.4-dev1
|
||||
## 0.16.4-dev2
|
||||
|
||||
### Enhancements
|
||||
|
||||
* **`value` attribute in `<input/>` element is parsed to `OntologyElement.text` in ontology**
|
||||
* **`id` and `class` attributes removed from Table subtags in HTML partitioning**
|
||||
* **cleaned `to_html` and newly introduced `to_text` in `OntologyElement`**
|
||||
* **Elements created from V2 HTML are less granular** Added merging of adjacent text elements and inline html tags in the HTML partitioner to reduce the number of elements created from V2 HTML.
|
||||
|
||||
### Features
|
||||
|
||||
|
||||
1802
test_unstructured/documents/html_files/example_full_doc.html
Normal file
1802
test_unstructured/documents/html_files/example_full_doc.html
Normal file
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,18 @@
|
||||
<body class="Document" id="897a8a47377c4ad6aab839a929879537">
|
||||
<div class="Page" data-page-number="1" id="3a6b156a81764e17be128264241f8136">
|
||||
<header class="Header" id="45b3d0053468484ba1c7b53998115412">
|
||||
<p class="NarrativeText" id="6cd3c1ba79654abb9c86162b6d1dae46">
|
||||
Table of Contents
|
||||
</p>
|
||||
<address class="Address" id="7d7541d9943c4ad0b88bc47fd0b29e4a">
|
||||
68 Prince Street Palmdale, CA 93550
|
||||
</address>
|
||||
<a class="Hyperlink" id="fde2621aa3df4b159bf305566110cca4">
|
||||
www.google.com
|
||||
</a>
|
||||
<span class="UncategorizedText" id="cb0d6675109241428778c7b996e0b21c">
|
||||
More text
|
||||
</span>
|
||||
</header>
|
||||
</div>
|
||||
</body>
|
||||
@ -4,7 +4,16 @@ import pytest
|
||||
|
||||
from unstructured.chunking.basic import chunk_elements
|
||||
from unstructured.chunking.title import chunk_by_title
|
||||
from unstructured.documents.ontology import Column, Document, Page, Paragraph
|
||||
from unstructured.documents.ontology import (
|
||||
Column,
|
||||
Document,
|
||||
Hyperlink,
|
||||
Image,
|
||||
Page,
|
||||
Paragraph,
|
||||
Section,
|
||||
Table,
|
||||
)
|
||||
from unstructured.embed.openai import OpenAIEmbeddingConfig, OpenAIEmbeddingEncoder
|
||||
from unstructured.partition.html import partition_html
|
||||
from unstructured.partition.html.transformations import (
|
||||
@ -171,6 +180,11 @@ def test_parsed_ontology_can_be_serialized_from_json(json_file_path):
|
||||
("html_file_path", "json_file_path"),
|
||||
[
|
||||
("html_files/example.html", "unstructured_json_output/example.json"),
|
||||
("html_files/example_full_doc.html", "unstructured_json_output/example_full_doc.json"),
|
||||
(
|
||||
"html_files/example_with_inline_fields.html",
|
||||
"unstructured_json_output/example_with_inline_fields.json",
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_parsed_ontology_can_be_serialized_from_html(html_file_path, json_file_path):
|
||||
@ -180,8 +194,136 @@ def test_parsed_ontology_can_be_serialized_from_html(html_file_path, json_file_p
|
||||
expected_json_elements = elements_from_json(str(json_file_path))
|
||||
html_code = html_file_path.read_text()
|
||||
|
||||
predicted_elements = partition_html(text=html_code, html_parser_version="v2")
|
||||
predicted_elements = partition_html(
|
||||
text=html_code, html_parser_version="v2", unique_element_ids=True
|
||||
)
|
||||
assert len(expected_json_elements) == len(predicted_elements)
|
||||
|
||||
for i in range(len(expected_json_elements)):
|
||||
assert expected_json_elements[i] == expected_json_elements[i]
|
||||
assert expected_json_elements[i] == predicted_elements[i]
|
||||
|
||||
|
||||
def test_inline_elements_are_squeezed():
    """Sibling hyperlinks on one page are expected to come back as one merged element."""
    hyperlinks = [
        Hyperlink(text="Hyperlink1"),
        Hyperlink(text="Hyperlink2"),
        Hyperlink(text="Hyperlink3"),
    ]
    document = Document(children=[Page(children=hyperlinks)])

    elements = ontology_to_unstructured_elements(document)

    # One element for the page itself plus one holding the three merged hyperlinks.
    assert len(elements) == 2
    _page, merged = elements
    assert merged.text == "Hyperlink1 Hyperlink2 Hyperlink3"
|
||||
|
||||
|
||||
def test_text_elements_are_squeezed():
    """Two sibling paragraphs on one page are expected to come back as one merged element."""
    paragraphs = [Paragraph(text="Paragraph1"), Paragraph(text="Paragraph2")]
    document = Document(children=[Page(children=paragraphs)])

    elements = ontology_to_unstructured_elements(document)

    # One element for the page itself plus one holding both paragraphs.
    assert len(elements) == 2
    _page, merged = elements
    assert merged.text == "Paragraph1 Paragraph2"
|
||||
|
||||
|
||||
def test_inline_elements_are_squeezed_when_image():
    """An image between text/inline siblings is expected to split them into two merged runs."""
    page_children = [
        Paragraph(text="Paragraph1"),
        Hyperlink(text="Hyperlink1"),
        Image(text="Image1"),
        Hyperlink(text="Hyperlink2"),
        Hyperlink(text="Hyperlink3"),
        Paragraph(text="Paragraph2"),
        Paragraph(text="Paragraph3"),
    ]
    document = Document(children=[Page(children=page_children)])

    elements = ontology_to_unstructured_elements(document)

    # Page, merged run before the image, the image, merged run after the image.
    assert len(elements) == 4
    _page, before_image, _image, after_image = elements
    assert before_image.text == "Paragraph1 Hyperlink1"
    assert after_image.text == "Hyperlink2 Hyperlink3 Paragraph2 Paragraph3"

    # Each merged element must still carry the HTML of both tag kinds folded into it.
    for merged in (before_image, after_image):
        assert '<a class="Hyperlink"' in merged.metadata.text_as_html
        assert '<p class="Paragraph"' in merged.metadata.text_as_html
|
||||
|
||||
|
||||
def test_inline_elements_are_squeezed_when_table():
    """A table between text/inline siblings is expected to stay separate and split the runs."""
    page_children = [
        Hyperlink(text="Hyperlink1"),
        Paragraph(text="Paragraph1"),
        Paragraph(text="Paragraph2"),
        Table(text="Table1"),
        Paragraph(text="Paragraph2"),
        Hyperlink(text="Hyperlink2"),
        Hyperlink(text="Hyperlink3"),
    ]
    document = Document(children=[Page(children=page_children)])

    elements = ontology_to_unstructured_elements(document)

    # Page, merged run before the table, the table, merged run after the table.
    assert len(elements) == 4
    _page, before_table, table, after_table = elements
    assert before_table.text == "Hyperlink1 Paragraph1 Paragraph2"
    assert table.text == "Table1"
    assert after_table.text == "Paragraph2 Hyperlink2 Hyperlink3"
|
||||
|
||||
|
||||
def test_inline_elements_are_on_many_depths():
    """Text/inline siblings nested at different depths are expected to merge per level."""
    inner_section = Section(
        children=[
            Hyperlink(text="Hyperlink2"),
            Hyperlink(text="Hyperlink3"),
        ]
    )
    outer_section = Section(
        children=[
            inner_section,
            Paragraph(text="Paragraph2"),
            Hyperlink(text="Hyperlink4"),
        ]
    )
    page = Page(
        children=[
            Hyperlink(text="Hyperlink1"),
            Paragraph(text="Paragraph1"),
            outer_section,
        ],
    )
    document = Document(children=[page])

    elements = ontology_to_unstructured_elements(document)

    # Page, page-level run, outer section, inner section, inner run, outer-section run.
    assert len(elements) == 6
    _page, page_level, _outer, _inner, inner_level, outer_level = elements

    assert page_level.text == "Hyperlink1 Paragraph1"
    assert inner_level.text == "Hyperlink2 Hyperlink3"
    assert outer_level.text == "Paragraph2 Hyperlink4"
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,46 @@
|
||||
[
|
||||
{
|
||||
"element_id": "3a6b156a81764e17be128264241f8136",
|
||||
"metadata": {
|
||||
"category_depth": 0,
|
||||
"page_number": 1,
|
||||
"parent_id": "897a8a47377c4ad6aab839a929879537",
|
||||
"text_as_html": "<div class=\"Page\" data-page-number=\"1\" id=\"3a6b156a81764e17be128264241f8136\" />"
|
||||
},
|
||||
"text": "",
|
||||
"type": "UncategorizedText"
|
||||
},
|
||||
{
|
||||
"element_id": "45b3d0053468484ba1c7b53998115412",
|
||||
"metadata": {
|
||||
"category_depth": 1,
|
||||
"page_number": 1,
|
||||
"parent_id": "3a6b156a81764e17be128264241f8136",
|
||||
"text_as_html": "<header class=\"Header\" id=\"45b3d0053468484ba1c7b53998115412\" />"
|
||||
},
|
||||
"text": "",
|
||||
"type": "UncategorizedText"
|
||||
},
|
||||
{
|
||||
"element_id": "6cd3c1ba79654abb9c86162b6d1dae46",
|
||||
"metadata": {
|
||||
"category_depth": 2,
|
||||
"page_number": 1,
|
||||
"parent_id": "45b3d0053468484ba1c7b53998115412",
|
||||
"text_as_html": "<p class=\"NarrativeText\" id=\"6cd3c1ba79654abb9c86162b6d1dae46\">Table of Contents </p> <address class=\"Address\" id=\"7d7541d9943c4ad0b88bc47fd0b29e4a\">68 Prince Street Palmdale, CA 93550 </address> <a class=\"Hyperlink\" id=\"fde2621aa3df4b159bf305566110cca4\">www.google.com </a>"
|
||||
},
|
||||
"text": "Table of Contents 68 Prince Street Palmdale, CA 93550 www.google.com",
|
||||
"type": "NarrativeText"
|
||||
},
|
||||
{
|
||||
"element_id": "cb0d6675109241428778c7b996e0b21c",
|
||||
"metadata": {
|
||||
"category_depth": 2,
|
||||
"page_number": 1,
|
||||
"parent_id": "45b3d0053468484ba1c7b53998115412",
|
||||
"text_as_html": "<span class=\"UncategorizedText\" id=\"cb0d6675109241428778c7b996e0b21c\">More text </span>"
|
||||
},
|
||||
"text": "More text",
|
||||
"type": "UncategorizedText"
|
||||
}
|
||||
]
|
||||
@ -8,9 +8,11 @@ from unstructured.documents.elements import (
|
||||
Text,
|
||||
Title,
|
||||
)
|
||||
from unstructured.documents.ontology import Address, Paragraph
|
||||
from unstructured.partition.html.html_utils import indent_html
|
||||
from unstructured.partition.html.transformations import (
|
||||
ontology_to_unstructured_elements,
|
||||
parse_html_to_ontology,
|
||||
parse_html_to_ontology_element,
|
||||
unstructured_elements_to_ontology,
|
||||
)
|
||||
@ -484,3 +486,72 @@ def test_ordered_list():
|
||||
)
|
||||
]
|
||||
_assert_elements_equal(unstructured_elements, expected_elements)
|
||||
|
||||
|
||||
def test_squeezed_elements_are_parsed_back():
|
||||
# language=HTML
|
||||
html_as_str = _wrap_in_body_and_page(
|
||||
"""
|
||||
<p class="NarrativeText" id="2">
|
||||
Table of Contents
|
||||
</p>
|
||||
<address class="Address" id="3">
|
||||
68 Prince Street Palmdale, CA 93550
|
||||
</address>
|
||||
<a class="Hyperlink" id="4">
|
||||
www.google.com
|
||||
</a>
|
||||
"""
|
||||
)
|
||||
|
||||
unstructured_elements, parsed_ontology = _parse_to_unstructured_elements_and_back_to_html(
|
||||
html_as_str
|
||||
)
|
||||
expected_html = indent_html(html_as_str, html_parser="html.parser")
|
||||
parsed_html = indent_html(parsed_ontology.to_html(), html_parser="html.parser")
|
||||
|
||||
assert expected_html == parsed_html
|
||||
expected_elements = _page_elements + [
|
||||
NarrativeText(
|
||||
text="Table of Contents 68 Prince Street Palmdale, CA 93550 www.google.com",
|
||||
element_id="2",
|
||||
detection_origin="vlm_partitioner",
|
||||
metadata=ElementMetadata(
|
||||
text_as_html='<p class="NarrativeText" id="2">Table of Contents </p> '
|
||||
'<address class="Address" id="3">'
|
||||
"68 Prince Street Palmdale, CA 93550 "
|
||||
"</address> "
|
||||
'<a class="Hyperlink" id="4">www.google.com </a>',
|
||||
parent_id="1",
|
||||
),
|
||||
)
|
||||
]
|
||||
_assert_elements_equal(unstructured_elements, expected_elements)
|
||||
|
||||
|
||||
def test_inline_elements_are_squeezed_when_text_wrapped_into_paragraphs():
|
||||
# language=HTML
|
||||
base_html = """
|
||||
<div class="Page">
|
||||
About the same
|
||||
<address class="Address">
|
||||
1356 Hornor Avenue Oklahoma
|
||||
</address>
|
||||
Some text
|
||||
</div>
|
||||
"""
|
||||
# Such HTML is transformed into Page: [Paragraph, Address, Paragraph]
|
||||
# We would like it to be parsed to UnstructuredElements as [Page, NarrativeText]
|
||||
|
||||
ontology = parse_html_to_ontology(base_html)
|
||||
|
||||
p1, address, p2 = ontology.children
|
||||
assert isinstance(p1, Paragraph)
|
||||
assert isinstance(address, Address)
|
||||
assert isinstance(p2, Paragraph)
|
||||
|
||||
unstructured_elements = ontology_to_unstructured_elements(ontology)
|
||||
|
||||
assert len(unstructured_elements) == 2
|
||||
assert isinstance(unstructured_elements[0], Text)
|
||||
assert isinstance(unstructured_elements[1], NarrativeText)
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.16.4-dev1" # pragma: no cover
|
||||
__version__ = "0.16.4-dev2" # pragma: no cover
|
||||
|
||||
@ -2,6 +2,7 @@ from __future__ import annotations
|
||||
|
||||
import html
|
||||
from collections import OrderedDict
|
||||
from itertools import chain
|
||||
from typing import Sequence, Type
|
||||
|
||||
from bs4 import BeautifulSoup, Tag
|
||||
@ -19,11 +20,19 @@ from unstructured.documents.mappings import (
|
||||
ONTOLOGY_CLASS_NAME_TO_UNSTRUCTURED_ELEMENT_TYPE_NAME,
|
||||
)
|
||||
from unstructured.documents.ontology import (
|
||||
Bibliography,
|
||||
Citation,
|
||||
Document,
|
||||
ElementTypeEnum,
|
||||
Footnote,
|
||||
FootnoteReference,
|
||||
Glossary,
|
||||
Hyperlink,
|
||||
NarrativeText,
|
||||
OntologyElement,
|
||||
Page,
|
||||
Paragraph,
|
||||
Quote,
|
||||
UncategorizedText,
|
||||
)
|
||||
|
||||
@ -79,17 +88,19 @@ def ontology_to_unstructured_elements(
|
||||
),
|
||||
)
|
||||
]
|
||||
childreen = []
|
||||
children = []
|
||||
for child in ontology_element.children:
|
||||
childreen += ontology_to_unstructured_elements(
|
||||
child = ontology_to_unstructured_elements(
|
||||
child,
|
||||
parent_id=ontology_element.id,
|
||||
page_number=page_number,
|
||||
depth=0 if isinstance(ontology_element, Document) else depth + 1,
|
||||
filename=filename,
|
||||
)
|
||||
children += child
|
||||
|
||||
elements_to_return += childreen
|
||||
combined_children = combine_inline_elements(children)
|
||||
elements_to_return += combined_children
|
||||
else:
|
||||
unstructured_element_class_name = ONTOLOGY_CLASS_NAME_TO_UNSTRUCTURED_ELEMENT_TYPE_NAME[
|
||||
ontology_element.__class__.__name__
|
||||
@ -115,6 +126,117 @@ def ontology_to_unstructured_elements(
|
||||
return elements_to_return
|
||||
|
||||
|
||||
def combine_inline_elements(elements: list[Element]) -> list[Element]:
    """Merge runs of consecutive mergeable elements into single elements.

    Walks ``elements`` in order while keeping one "accumulator" element. When
    ``can_unstructured_elements_be_merged`` accepts the accumulator together
    with the element that follows it, the follower's ``text`` and
    ``metadata.text_as_html`` are appended to the accumulator, each separated
    by a single space. The accumulator object is modified in place. Otherwise
    the accumulator is emitted and the follower becomes the new accumulator.

    A merged element therefore carries several HTML tags, e.g.::

        {
            'text': "Text from element 1 Text from element 2",
            'metadata': {
                'text_as_html': "<p>Text from element 1</p> <a>Text from element 2</a>"
            }
        }

    Args:
        elements (list[Element]): Elements to combine, in document order.

    Returns:
        list[Element]: A new list with the combined elements; empty when
        ``elements`` is empty.
    """
    combined = []
    accumulator = None

    for element in elements:
        if accumulator is not None and can_unstructured_elements_be_merged(accumulator, element):
            # Fold the follower into the accumulator; nothing is emitted yet because
            # the element after this one may be mergeable too.
            accumulator.text = accumulator.text + " " + element.text
            accumulator.metadata.text_as_html = (
                accumulator.metadata.text_as_html + " " + element.metadata.text_as_html
            )
            continue

        # The run (if one was open) ends here: flush it and start over from this element.
        if accumulator is not None:
            combined.append(accumulator)
        accumulator = element

    # Flush the last open run.
    if accumulator is not None:
        combined.append(accumulator)

    return combined
|
||||
|
||||
|
||||
def can_unstructured_elements_be_merged(current_element: Element, next_element: Element) -> bool:
    """Decide whether two adjacent elements may be merged into one.

    Elements can be merged when:
    - They are on the same level in the HTML tree (equal ``category_depth``)
    - Both carry ``metadata.text_as_html``
    - None of the top-level tags in either element's HTML has children
    - Every such tag is an inline element or a text element

    Args:
        current_element: The element accumulated so far. Its HTML may already
            hold several top-level tags from earlier merges.
        next_element: The element that directly follows it.

    Returns:
        True when the two elements may be merged, False otherwise.
    """
    if current_element.metadata.category_depth != next_element.metadata.category_depth:
        return False

    current_html = current_element.metadata.text_as_html
    next_html = next_element.metadata.text_as_html
    if current_html is None or next_html is None:
        # Without HTML there is nothing to classify, and BeautifulSoup cannot parse None.
        # Refusing the merge also keeps callers from concatenating onto a None value.
        return False

    for html in (current_html, next_html):
        # recursive=False: only top-level tags are inspected, one per already-merged element.
        for html_tag in BeautifulSoup(html, "html.parser").find_all(recursive=False):
            ontology_element = parse_html_to_ontology_element(html_tag)

            if ontology_element.children:
                return False

            if not (is_inline_element(ontology_element) or is_text_element(ontology_element)):
                return False

    return True
|
||||
|
||||
|
||||
def is_text_element(ontology_element: OntologyElement) -> bool:
    """Return True for classes or categories that we want to combine with inline text.

    An element qualifies when it is an instance of one of the listed text
    classes, or when its ``elementType`` equals one of the listed categories.
    """
    text_classes = (
        NarrativeText,
        Quote,
        Paragraph,
        Footnote,
        FootnoteReference,
        Citation,
        Bibliography,
        Glossary,
    )
    text_categories = (ElementTypeEnum.metadata,)

    # The class check runs first; elementType is only consulted when it fails.
    return (
        isinstance(ontology_element, text_classes)
        or ontology_element.elementType in text_categories
    )
|
||||
|
||||
|
||||
def is_inline_element(ontology_element: OntologyElement) -> bool:
    """Return True for classes or categories that we want to combine with text elements.

    An element qualifies when it is a ``Hyperlink`` instance, or when its
    ``elementType`` equals one of the listed inline categories.
    """
    inline_classes = (Hyperlink,)
    inline_categories = (ElementTypeEnum.specialized_text, ElementTypeEnum.annotation)

    # The class check runs first; elementType is only consulted when it fails.
    return (
        isinstance(ontology_element, inline_classes)
        or ontology_element.elementType in inline_categories
    )
|
||||
|
||||
|
||||
def unstructured_elements_to_ontology(unstructured_elements: Sequence[Element]) -> OntologyElement:
|
||||
"""
|
||||
Converts a sequence of unstructured Element objects to an OntologyElement object.
|
||||
@ -144,18 +266,21 @@ def unstructured_elements_to_ontology(unstructured_elements: Sequence[Element])
|
||||
)
|
||||
|
||||
for element in unstructured_elements:
|
||||
html_as_tag = BeautifulSoup(element.metadata.text_as_html, "html.parser").find()
|
||||
ontology_element = parse_html_to_ontology_element(html_as_tag)
|
||||
# Note: Each HTML of non-terminal Element doesn't have children in HTML
|
||||
# So we just add Ontology Element with tag and class, later children are appended by
|
||||
# parent_id.
|
||||
# For terminal Elements entire HTML is added to text_as_html, thus it allows us to
|
||||
# recreate the entire HTML structure
|
||||
html_as_tags = BeautifulSoup(element.metadata.text_as_html, "html.parser").find_all(
|
||||
recursive=False
|
||||
)
|
||||
for html_as_tag in html_as_tags:
|
||||
ontology_element = parse_html_to_ontology_element(html_as_tag)
|
||||
# Note: Each HTML of non-terminal Element doesn't have children in HTML
|
||||
# So we just add Ontology Element with tag and class, later children are appended by
|
||||
# parent_id.
|
||||
# For terminal Elements entire HTML is added to text_as_html, thus it allows us to
|
||||
# recreate the entire HTML structure
|
||||
|
||||
id_to_element_mapping[ontology_element.id] = ontology_element
|
||||
id_to_element_mapping[ontology_element.id] = ontology_element
|
||||
|
||||
if element.metadata.parent_id and element.metadata.parent_id in id_to_element_mapping:
|
||||
id_to_element_mapping[element.metadata.parent_id].children.append(ontology_element)
|
||||
if element.metadata.parent_id and element.metadata.parent_id in id_to_element_mapping:
|
||||
id_to_element_mapping[element.metadata.parent_id].children.append(ontology_element)
|
||||
|
||||
root_id, root_element = id_to_element_mapping.popitem(last=False)
|
||||
return root_element
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user