mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-10-31 18:14:51 +00:00
ML-405/ML-427 - OntologyElement improvements (#3758)
- The `value` attribute of an `<input/>` tag is now taken into account and processed as "text" in the ontology. - Tables are now parsed without any ids or classes on their subtags. There are several reasons for this: for example, embeddings that include ids and classes can lose some semantic value, and more tokens mean a more expensive LLM call. - Cleaned up `to_html` and introduced `to_text` for `OntologyElement`.
This commit is contained in:
parent
d0be1151a1
commit
eb1b294b73
@ -1,7 +1,11 @@
|
||||
## 0.16.4-dev0
|
||||
## 0.16.4-dev1
|
||||
|
||||
### Enhancements
|
||||
|
||||
* **`value` attribute in `<input/>` element is parsed to `OntologyElement.text` in ontology**
|
||||
* **`id` and `class` attributes removed from Table subtags in HTML partitioning**
|
||||
* **cleaned `to_html` and newly introduced `to_text` in `OntologyElement`**
|
||||
|
||||
### Features
|
||||
|
||||
### Fixes
|
||||
|
||||
@ -56,7 +56,7 @@
|
||||
"parent_id": "3a6b156a81764e17be128264241f8136",
|
||||
"text_as_html": "<form class=\"Form\" id=\"637c2f6935fb4353a5f73025ce04619d\"> <label class=\"FormField\" for=\"company-name\" id=\"50027cccbe1948c9853ce0de37b635c2\">From field name </label><input class=\"FormFieldValue\" id=\"0032242af75c4b37984ea7fea9aac74c\" value=\"Example value\" /></form>"
|
||||
},
|
||||
"text": "From field name",
|
||||
"text": "From field name Example value",
|
||||
"type": "UncategorizedText"
|
||||
},
|
||||
{
|
||||
@ -78,9 +78,9 @@
|
||||
"filename": "example.pdf",
|
||||
"page_number": 1,
|
||||
"parent_id": "592422373ed741b68a077e2003f8ed81",
|
||||
"text_as_html": "<table class=\"Table\" id=\"dc3792d4422e444f90876b56d0cfb20d\"> <thead class=\"TableHeader\" id=\"50a5548a87e84024af590b3d2830d140\"> <tr class=\"TableRow\" id=\"5e473d7742474412be72dc4e2c45bd4a\"> <th class=\"TableCellHeader\" id=\"01800309aa42411c98ae30f85b23f399\">Description </th><th class=\"TableCellHeader\" id=\"c2765b63d08946a2851955e79e301de4\">Row header </th></tr></thead><tbody class=\"TableBody\" id=\"e0a9a8ffdd7148ad8b4a274b073d340a\"> <tr class=\"TableRow\" id=\"77e829974632455191330b0b8545d1e3\"> <td class=\"TableCell\" id=\"7fee12d4c5554b7da778d6f8fdec8a57\">Value description </td><td class=\"TableCell\" id=\"5a7a33b0c57b4eb881a35bce9f87c831\"> <span class=\"Currency\" id=\"87220f9d62c3482e92e7de72a26869cd\">50 $ </span><span class=\"Measurement\" id=\"0095b9efb90a4cca991e73547c7165f1\">(1.32 %) </span></td></tr></tbody></table>"
|
||||
"text_as_html": "<table class=\"Table\" id=\"dc3792d4422e444f90876b56d0cfb20d\"> <thead> <tr> <th>Description</th><th>Row header</th></tr></thead><tbody> <tr> <td>Value description</td><td>50 $ (1.32 %)</td></tr></tbody></table>"
|
||||
},
|
||||
"text": "Description Row header Value description 50 $ (1.32 %)",
|
||||
"text": "Description Row header Value description 50 $ (1.32 %)",
|
||||
"type": "Table"
|
||||
},
|
||||
{
|
||||
|
||||
@ -356,12 +356,12 @@ def test_broken_cell_is_not_raising_error():
|
||||
"""
|
||||
<div class="Page">
|
||||
<table class="Table">
|
||||
<tbody class="TableBody">
|
||||
<tr class="TableRow">
|
||||
<td class="TableCell" tablecell"="">
|
||||
<tbody>
|
||||
<tr>
|
||||
<td tablecell"="">
|
||||
83.64 GiB
|
||||
</td>
|
||||
<th class="TableCellHeader" rowspan="2">
|
||||
<th rowspan="2">
|
||||
Fair Value
|
||||
</th>
|
||||
</tr>
|
||||
@ -406,12 +406,12 @@ def test_table():
|
||||
"""
|
||||
<div class="Page">
|
||||
<table class="Table">
|
||||
<tbody class="TableBody">
|
||||
<tr class="TableRow">
|
||||
<td class="TableCell">
|
||||
<tbody>
|
||||
<tr>
|
||||
<td>
|
||||
Fair Value1
|
||||
</td>
|
||||
<th class="TableCellHeader" rowspan="2">
|
||||
<th rowspan="2">
|
||||
Fair Value2
|
||||
</th>
|
||||
</tr>
|
||||
@ -467,24 +467,20 @@ def test_table_and_time():
|
||||
"""
|
||||
<div class="Page">
|
||||
<table class="Table">
|
||||
<thead class='TableHeader'>
|
||||
<tr class="TableRow">
|
||||
<th class="TableCellHeader" colspan="6">
|
||||
<thead>
|
||||
<tr>
|
||||
<th colspan="6">
|
||||
Carrying Value
|
||||
</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody class='TableBody'>
|
||||
<tr class="TableRow">
|
||||
<td class="TableCell" colspan="5">
|
||||
<time class="CalendarDate">
|
||||
<tbody>
|
||||
<tr>
|
||||
<td colspan="5">
|
||||
June 30, 2023
|
||||
</time>
|
||||
</td>
|
||||
<td class="TableCell">
|
||||
<span class="Currency">
|
||||
<td>
|
||||
$—
|
||||
</span>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
@ -594,3 +590,18 @@ def test_text_is_wrapped_inside_layout_element():
|
||||
parsed_ontology = indent_html(remove_all_ids(ontology.to_html()))
|
||||
|
||||
assert parsed_ontology == expected_html
|
||||
|
||||
|
||||
def test_text_in_form_field_value():
    """The ``value`` of an ``<input>`` is surfaced by ``to_text()`` while ``text`` stays empty."""
    # language=HTML
    input_html = """
<div class="Page">
<input class="FormFieldValue" value="Random Input Value"/>
</div>
"""
    page = parse_html_to_ontology(input_html)
    page_children = page.children

    # The page holds exactly one element: the parsed <input/>.
    assert len(page_children) == 1

    form_field_value = page_children[0]
    # The element carries no text of its own ...
    assert form_field_value.text == ""
    # ... but its text representation includes the ``value`` attribute.
    assert form_field_value.to_text() == "Random Input Value"
|
||||
|
||||
@ -314,10 +314,7 @@ def test_table():
|
||||
unstructured_elements, parsed_ontology = _parse_to_unstructured_elements_and_back_to_html(
|
||||
html_as_str
|
||||
)
|
||||
expected_html = indent_html(html_as_str, html_parser="html.parser")
|
||||
parsed_html = indent_html(parsed_ontology.to_html(), html_parser="html.parser")
|
||||
|
||||
assert expected_html == parsed_html
|
||||
expected_elements = _page_elements + [
|
||||
Table(
|
||||
text="Fair Value1 Fair Value2",
|
||||
@ -325,13 +322,13 @@ def test_table():
|
||||
element_id="2",
|
||||
metadata=ElementMetadata(
|
||||
text_as_html='<table class="Table" id="2"> '
|
||||
'<tbody class="TableBody" id="3"> '
|
||||
'<tr class="TableRow" id="4"> '
|
||||
'<td class="TableCell" id="5">'
|
||||
"Fair Value1 "
|
||||
"<tbody> "
|
||||
"<tr> "
|
||||
"<td>"
|
||||
"Fair Value1"
|
||||
"</td>"
|
||||
'<th class="TableCellHeader" rowspan="2" id="6">'
|
||||
"Fair Value2 "
|
||||
'<th rowspan="2">'
|
||||
"Fair Value2"
|
||||
"</th></tr></tbody></table>",
|
||||
parent_id="1",
|
||||
),
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.16.4-dev0" # pragma: no cover
|
||||
__version__ = "0.16.4-dev1" # pragma: no cover
|
||||
|
||||
@ -20,6 +20,7 @@ from copy import copy
|
||||
from enum import Enum
|
||||
from typing import List, Optional
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
@ -75,32 +76,39 @@ class OntologyElement(BaseModel):
|
||||
|
||||
def to_html(self, add_children=True) -> str:
|
||||
additional_attrs = copy(self.additional_attributes)
|
||||
if "class" in additional_attrs:
|
||||
del additional_attrs["class"]
|
||||
|
||||
# TODO(Pluto) Add support for multiple classes
|
||||
attrs = " ".join(
|
||||
f'{key}="{value}"' if value else f"{key}" for key, value in additional_attrs.items()
|
||||
)
|
||||
additional_attrs.pop("class", None)
|
||||
|
||||
attr_str = self._construct_attribute_string(additional_attrs)
|
||||
class_attr = f'class="{self.css_class_name}"' if self.css_class_name else ""
|
||||
attr_str = f"{class_attr} {attrs}".strip()
|
||||
|
||||
children_html = (
|
||||
("" if not self.children else "".join(child.to_html() for child in self.children))
|
||||
if add_children
|
||||
else ""
|
||||
combined_attr_str = f"{class_attr} {attr_str}".strip()
|
||||
|
||||
children_html = self._generate_children_html(add_children)
|
||||
|
||||
result_html = self._generate_final_html(combined_attr_str, children_html)
|
||||
|
||||
return result_html
|
||||
|
||||
def to_text(self, add_children=True) -> str:
    """Return the text content of this element's HTML rendering.

    The element is rendered with ``to_html(add_children)``, parsed with the
    ``html.parser`` backend, and every whitespace-stripped string found in the
    markup is joined with a single space.

    Args:
        add_children: Forwarded to ``to_html``.

    Returns:
        The space-joined stripped strings of the rendered HTML.
    """
    rendered_html = self.to_html(add_children)
    parsed = BeautifulSoup(rendered_html, "html.parser")
    return " ".join(parsed.stripped_strings)
|
||||
|
||||
def _construct_attribute_string(self, attributes: dict) -> str:
|
||||
return " ".join(
|
||||
f'{key}="{value}"' if value else f"{key}" for key, value in attributes.items()
|
||||
)
|
||||
text = "" if not self.text else self.text
|
||||
|
||||
def _generate_children_html(self, add_children: bool) -> str:
|
||||
if not add_children or not self.children:
|
||||
return ""
|
||||
return "".join(child.to_html() for child in self.children)
|
||||
|
||||
def _generate_final_html(self, attr_str: str, children_html: str) -> str:
|
||||
text = self.text or ""
|
||||
|
||||
if text or children_html:
|
||||
# This is either one or another, never both
|
||||
result_html = (
|
||||
f"<{self.html_tag_name} {attr_str}>{text} {children_html}</{self.html_tag_name}>"
|
||||
)
|
||||
return f"<{self.html_tag_name} {attr_str}>{text} {children_html}</{self.html_tag_name}>"
|
||||
else:
|
||||
result_html = f"<{self.html_tag_name} {attr_str} />"
|
||||
return result_html
|
||||
return f"<{self.html_tag_name} {attr_str} />"
|
||||
|
||||
@property
|
||||
def id(self) -> str | None:
|
||||
@ -254,6 +262,18 @@ class Table(OntologyElement):
|
||||
elementType: ElementTypeEnum = Field(ElementTypeEnum.table, frozen=True)
|
||||
allowed_tags: List[str] = Field(["table"], frozen=True)
|
||||
|
||||
def to_html(self, add_children=True) -> str:
    """Render the table as HTML with attribute-free, flattened sub-tags.

    The generic ``OntologyElement`` rendering is re-parsed with
    ``html.parser``. Every tag other than ``table`` loses its ``class`` and
    ``id`` attributes, and the content of each ``td``/``th`` is replaced by
    its whitespace-stripped strings joined with single spaces.

    Args:
        add_children: Forwarded to the parent ``to_html``.

    Returns:
        The serialized, cleaned-up markup.
    """
    soup = BeautifulSoup(super().to_html(add_children), "html.parser")

    for tag in soup.find_all(True):
        # ``table`` tags keep their attributes untouched.
        if tag.name == "table":
            continue
        for attr_name in ("class", "id"):
            tag.attrs.pop(attr_name, None)
        if tag.name in ("td", "th"):
            # Assigning ``.string`` replaces the cell's children with plain text.
            tag.string = " ".join(tag.stripped_strings)

    return str(soup)
|
||||
|
||||
|
||||
class TableBody(OntologyElement):
|
||||
description: str = Field("A body of the table", frozen=True)
|
||||
@ -430,6 +450,15 @@ class Form(OntologyElement):
|
||||
elementType: ElementTypeEnum = Field(ElementTypeEnum.form, frozen=True)
|
||||
allowed_tags: List[str] = Field(["form"], frozen=True)
|
||||
|
||||
def to_text(self, add_children=True) -> str:
    """Return the form's own text followed by the text of its children.

    Args:
        add_children: When true, each child's ``to_text(add_children=True)``
            is appended after the form's own text.

    Returns:
        All non-empty pieces joined with single spaces, with surrounding
        whitespace stripped.
    """
    pieces = []
    if self.text:
        pieces.append(self.text)

    if add_children:
        pieces.extend(child.to_text(add_children=True) for child in self.children)

    # Drop empty pieces so they do not produce doubled separators.
    return " ".join(piece for piece in pieces if piece).strip()
|
||||
|
||||
|
||||
class FormField(OntologyElement):
|
||||
description: str = Field("A property value of a form", frozen=True)
|
||||
@ -442,6 +471,9 @@ class FormFieldValue(OntologyElement):
|
||||
elementType: ElementTypeEnum = Field(ElementTypeEnum.form, frozen=True)
|
||||
allowed_tags: List[str] = Field(["input"], frozen=True)
|
||||
|
||||
def to_text(self, add_children=True) -> str:
    """Return the element's text followed by its ``value`` attribute.

    An ``<input>`` has no text node, so the user-visible content lives in the
    ``value`` attribute; it is appended to whatever text the generic
    implementation extracts.

    Args:
        add_children: Forwarded to the parent ``to_text`` (previously this
            argument was accepted but ignored).

    Returns:
        The parent text and the ``value`` attribute joined with a single
        space; empty parts are skipped so no stray separator is produced.
    """
    element_text = super().to_text(add_children)

    # ``.get("value", "")`` would still yield None if the key is present with
    # a None value, which made the original ``str + None`` raise TypeError.
    raw_value = self.additional_attributes.get("value")
    value_text = str(raw_value) if raw_value else ""

    # Join with a space so "text" + "value" does not become "textvalue".
    return " ".join(part for part in (element_text, value_text) if part)
|
||||
|
||||
|
||||
class Checkbox(OntologyElement):
|
||||
description: str = Field("A small box that can be checked or unchecked", frozen=True)
|
||||
|
||||
@ -96,10 +96,8 @@ def ontology_to_unstructured_elements(
|
||||
]
|
||||
element_class = TYPE_TO_TEXT_ELEMENT_MAP[unstructured_element_class_name]
|
||||
html_code_of_ontology_element = ontology_element.to_html()
|
||||
element_text = (
|
||||
BeautifulSoup(html_code_of_ontology_element, "html.parser").get_text().strip()
|
||||
)
|
||||
# TODO value attribute from form input should be added to the text
|
||||
element_text = ontology_element.to_text()
|
||||
|
||||
unstructured_element = element_class(
|
||||
text=element_text,
|
||||
element_id=ontology_element.id,
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user