ML-405/ML-427 - OntologyElement improvements (#3758)

- the "value" attribute from <input/> tag will be taken into account and
processed as "text" in ontology
- tags nested inside a table are now emitted without ids and classes (the
<table> tag itself keeps them) - there are several reasons for that, for
example, embeddings built from HTML with ids and classes can lose some
semantic value. Also, more tokens = a more expensive LLM call
- cleaned up to_html and added to_text for OntologyElement
This commit is contained in:
Maksymilian Operlejn 2024-10-31 02:30:53 +01:00 committed by GitHub
parent d0be1151a1
commit eb1b294b73
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 98 additions and 56 deletions

View File

@ -1,7 +1,11 @@
## 0.16.4-dev0
## 0.16.4-dev1
### Enhancements
* **`value` attribute in `<input/>` element is parsed to `OntologyElement.text` in ontology**
* **`id` and `class` attributes removed from Table subtags in HTML partitioning**
* **cleaned up `to_html` and introduced `to_text` in `OntologyElement`**
### Features
### Fixes

View File

@ -56,7 +56,7 @@
"parent_id": "3a6b156a81764e17be128264241f8136",
"text_as_html": "<form class=\"Form\" id=\"637c2f6935fb4353a5f73025ce04619d\"> <label class=\"FormField\" for=\"company-name\" id=\"50027cccbe1948c9853ce0de37b635c2\">From field name </label><input class=\"FormFieldValue\" id=\"0032242af75c4b37984ea7fea9aac74c\" value=\"Example value\" /></form>"
},
"text": "From field name",
"text": "From field name Example value",
"type": "UncategorizedText"
},
{
@ -78,9 +78,9 @@
"filename": "example.pdf",
"page_number": 1,
"parent_id": "592422373ed741b68a077e2003f8ed81",
"text_as_html": "<table class=\"Table\" id=\"dc3792d4422e444f90876b56d0cfb20d\"> <thead class=\"TableHeader\" id=\"50a5548a87e84024af590b3d2830d140\"> <tr class=\"TableRow\" id=\"5e473d7742474412be72dc4e2c45bd4a\"> <th class=\"TableCellHeader\" id=\"01800309aa42411c98ae30f85b23f399\">Description </th><th class=\"TableCellHeader\" id=\"c2765b63d08946a2851955e79e301de4\">Row header </th></tr></thead><tbody class=\"TableBody\" id=\"e0a9a8ffdd7148ad8b4a274b073d340a\"> <tr class=\"TableRow\" id=\"77e829974632455191330b0b8545d1e3\"> <td class=\"TableCell\" id=\"7fee12d4c5554b7da778d6f8fdec8a57\">Value description </td><td class=\"TableCell\" id=\"5a7a33b0c57b4eb881a35bce9f87c831\"> <span class=\"Currency\" id=\"87220f9d62c3482e92e7de72a26869cd\">50 $ </span><span class=\"Measurement\" id=\"0095b9efb90a4cca991e73547c7165f1\">(1.32 %) </span></td></tr></tbody></table>"
"text_as_html": "<table class=\"Table\" id=\"dc3792d4422e444f90876b56d0cfb20d\"> <thead> <tr> <th>Description</th><th>Row header</th></tr></thead><tbody> <tr> <td>Value description</td><td>50 $ (1.32 %)</td></tr></tbody></table>"
},
"text": "Description Row header Value description 50 $ (1.32 %)",
"text": "Description Row header Value description 50 $ (1.32 %)",
"type": "Table"
},
{

View File

@ -356,12 +356,12 @@ def test_broken_cell_is_not_raising_error():
"""
<div class="Page">
<table class="Table">
<tbody class="TableBody">
<tr class="TableRow">
<td class="TableCell" tablecell&quot;="">
<tbody>
<tr>
<td tablecell&quot;="">
83.64 GiB
</td>
<th class="TableCellHeader" rowspan="2">
<th rowspan="2">
Fair Value
</th>
</tr>
@ -406,12 +406,12 @@ def test_table():
"""
<div class="Page">
<table class="Table">
<tbody class="TableBody">
<tr class="TableRow">
<td class="TableCell">
<tbody>
<tr>
<td>
Fair Value1
</td>
<th class="TableCellHeader" rowspan="2">
<th rowspan="2">
Fair Value2
</th>
</tr>
@ -467,24 +467,20 @@ def test_table_and_time():
"""
<div class="Page">
<table class="Table">
<thead class='TableHeader'>
<tr class="TableRow">
<th class="TableCellHeader" colspan="6">
<thead>
<tr>
<th colspan="6">
Carrying Value
</th>
</tr>
</thead>
<tbody class='TableBody'>
<tr class="TableRow">
<td class="TableCell" colspan="5">
<time class="CalendarDate">
<tbody>
<tr>
<td colspan="5">
June 30, 2023
</time>
</td>
<td class="TableCell">
<span class="Currency">
<td>
$
</span>
</td>
</tr>
</tbody>
@ -594,3 +590,18 @@ def test_text_is_wrapped_inside_layout_element():
parsed_ontology = indent_html(remove_all_ids(ontology.to_html()))
assert parsed_ontology == expected_html
def test_text_in_form_field_value():
    """An ``<input/>`` value is reported by ``to_text()`` while ``text`` stays empty."""
    # language=HTML
    input_html = """
<div class="Page">
<input class="FormFieldValue" value="Random Input Value"/>
</div>
"""

    ontology_page = parse_html_to_ontology(input_html)
    page_children = ontology_page.children

    # The page must contain exactly the single input element.
    assert len(page_children) == 1
    input_element = page_children[0]

    # The value lives in the attribute, not in the element's own text ...
    assert input_element.text == ""
    # ... but it is surfaced when the element is converted to text.
    assert input_element.to_text() == "Random Input Value"

View File

@ -314,10 +314,7 @@ def test_table():
unstructured_elements, parsed_ontology = _parse_to_unstructured_elements_and_back_to_html(
html_as_str
)
expected_html = indent_html(html_as_str, html_parser="html.parser")
parsed_html = indent_html(parsed_ontology.to_html(), html_parser="html.parser")
assert expected_html == parsed_html
expected_elements = _page_elements + [
Table(
text="Fair Value1 Fair Value2",
@ -325,13 +322,13 @@ def test_table():
element_id="2",
metadata=ElementMetadata(
text_as_html='<table class="Table" id="2"> '
'<tbody class="TableBody" id="3"> '
'<tr class="TableRow" id="4"> '
'<td class="TableCell" id="5">'
"Fair Value1 "
"<tbody> "
"<tr> "
"<td>"
"Fair Value1"
"</td>"
'<th class="TableCellHeader" rowspan="2" id="6">'
"Fair Value2 "
'<th rowspan="2">'
"Fair Value2"
"</th></tr></tbody></table>",
parent_id="1",
),

View File

@ -1 +1 @@
__version__ = "0.16.4-dev0" # pragma: no cover
__version__ = "0.16.4-dev1" # pragma: no cover

View File

@ -20,6 +20,7 @@ from copy import copy
from enum import Enum
from typing import List, Optional
from bs4 import BeautifulSoup
from pydantic import BaseModel, Field
@ -75,32 +76,39 @@ class OntologyElement(BaseModel):
def to_html(self, add_children=True) -> str:
additional_attrs = copy(self.additional_attributes)
if "class" in additional_attrs:
del additional_attrs["class"]
# TODO(Pluto) Add support for multiple classes
attrs = " ".join(
f'{key}="{value}"' if value else f"{key}" for key, value in additional_attrs.items()
)
additional_attrs.pop("class", None)
attr_str = self._construct_attribute_string(additional_attrs)
class_attr = f'class="{self.css_class_name}"' if self.css_class_name else ""
attr_str = f"{class_attr} {attrs}".strip()
children_html = (
("" if not self.children else "".join(child.to_html() for child in self.children))
if add_children
else ""
combined_attr_str = f"{class_attr} {attr_str}".strip()
children_html = self._generate_children_html(add_children)
result_html = self._generate_final_html(combined_attr_str, children_html)
return result_html
def to_text(self, add_children=True) -> str:
    """Return the text content of this element as one space-separated string.

    The element is first rendered with ``to_html`` and the resulting markup is
    parsed again, so only text nodes contribute to the result.

    Args:
        add_children: Forwarded unchanged to ``to_html``.

    Returns:
        The whitespace-stripped text fragments joined with single spaces.
    """
    rendered_html = self.to_html(add_children)
    parsed = BeautifulSoup(rendered_html, "html.parser")
    fragments = parsed.stripped_strings
    return " ".join(fragments)
def _construct_attribute_string(self, attributes: dict) -> str:
    """Serialize an attribute mapping into the attribute part of an HTML tag.

    Args:
        attributes: Attribute names mapped to their values.

    Returns:
        Space-separated ``name="value"`` pairs in mapping order. An attribute
        whose value is falsy (``None``, ``""``, ...) is written as a bare name.
        An empty mapping yields an empty string.
    """
    # Function-scope import so this block does not depend on file-level imports.
    import html

    # Values are escaped because the generated markup is parsed again later
    # on; an unescaped `"` or `<` inside a value would end the attribute early
    # and leak the remainder into the document as text.
    return " ".join(
        f'{name}="{html.escape(str(value), quote=True)}"' if value else f"{name}"
        for name, value in attributes.items()
    )
text = "" if not self.text else self.text
def _generate_children_html(self, add_children: bool) -> str:
    """Render all children to HTML and concatenate the results.

    Args:
        add_children: When false, children are not rendered at all.

    Returns:
        The children's ``to_html()`` output joined without a separator, or an
        empty string when rendering is disabled or there are no children.
    """
    if add_children and self.children:
        rendered_children = [child.to_html() for child in self.children]
        return "".join(rendered_children)
    return ""
def _generate_final_html(self, attr_str: str, children_html: str) -> str:
text = self.text or ""
if text or children_html:
# This is either one or another, never both
result_html = (
f"<{self.html_tag_name} {attr_str}>{text} {children_html}</{self.html_tag_name}>"
)
return f"<{self.html_tag_name} {attr_str}>{text} {children_html}</{self.html_tag_name}>"
else:
result_html = f"<{self.html_tag_name} {attr_str} />"
return result_html
return f"<{self.html_tag_name} {attr_str} />"
@property
def id(self) -> str | None:
@ -254,6 +262,18 @@ class Table(OntologyElement):
elementType: ElementTypeEnum = Field(ElementTypeEnum.table, frozen=True)
allowed_tags: List[str] = Field(["table"], frozen=True)
def to_html(self, add_children=True) -> str:
    """Render the table with ``class``/``id`` removed from every non-table tag.

    The markup produced by the parent class is parsed and post-processed:
    every tag whose name is not ``table`` loses its ``class`` and ``id``
    attributes, and the content of each ``td``/``th`` is replaced by its
    stripped text fragments joined with single spaces.

    Args:
        add_children: Forwarded unchanged to the parent ``to_html``.

    Returns:
        The post-processed markup serialized back to a string.
    """
    base_markup = super().to_html(add_children)
    soup = BeautifulSoup(base_markup, "html.parser")

    for element in soup.find_all(True):
        if element.name == "table":
            # The table tag keeps its own class and id.
            continue
        element.attrs.pop("class", None)
        element.attrs.pop("id", None)
        if element.name in ("td", "th"):
            # Collapse the cell's content into plain text.
            element.string = " ".join(element.stripped_strings)

    return str(soup)
class TableBody(OntologyElement):
description: str = Field("A body of the table", frozen=True)
@ -430,6 +450,15 @@ class Form(OntologyElement):
elementType: ElementTypeEnum = Field(ElementTypeEnum.form, frozen=True)
allowed_tags: List[str] = Field(["form"], frozen=True)
def to_text(self, add_children=True) -> str:
    """Join the form's own text with the text of each of its children.

    Args:
        add_children: When false, only the form's own ``text`` is used.

    Returns:
        The non-empty pieces joined with single spaces and stripped of
        leading/trailing whitespace. Children are always asked for their text
        with ``add_children=True``.
    """
    pieces = [self.text]
    if add_children:
        pieces.extend(child.to_text(add_children=True) for child in self.children)
    # Empty or missing pieces are dropped so they do not add extra separators.
    return " ".join(piece for piece in pieces if piece).strip()
class FormField(OntologyElement):
description: str = Field("A property value of a form", frozen=True)
@ -442,6 +471,9 @@ class FormFieldValue(OntologyElement):
elementType: ElementTypeEnum = Field(ElementTypeEnum.form, frozen=True)
allowed_tags: List[str] = Field(["input"], frozen=True)
def to_text(self, add_children=True) -> str:
    """Return the element's text followed by the content of its ``value`` attribute.

    Args:
        add_children: Forwarded to the parent ``to_text``.

    Returns:
        The parent's text and the ``value`` attribute joined with a single
        space; whichever of the two is empty or missing is left out.
    """
    # Honor the caller's add_children instead of always using the default.
    rendered_text = super().to_text(add_children)
    # `or ""` also covers a `value` key that is present but set to None.
    value = self.additional_attributes.get("value") or ""
    # Join with a space so existing text and the value do not run together.
    return " ".join(part for part in (rendered_text, value) if part)
class Checkbox(OntologyElement):
description: str = Field("A small box that can be checked or unchecked", frozen=True)

View File

@ -96,10 +96,8 @@ def ontology_to_unstructured_elements(
]
element_class = TYPE_TO_TEXT_ELEMENT_MAP[unstructured_element_class_name]
html_code_of_ontology_element = ontology_element.to_html()
element_text = (
BeautifulSoup(html_code_of_ontology_element, "html.parser").get_text().strip()
)
# TODO value attribute from form input should be added to the text
element_text = ontology_element.to_text()
unstructured_element = element_class(
text=element_text,
element_id=ontology_element.id,