ML-405/ML-427 - OntologyElement improvements (#3758)

- the "value" attribute of an <input/> tag is now taken into account and
processed as "text" in the ontology
- tables are now output without ids and classes on their subtags (the
<table> tag itself keeps them). There are several reasons for this: for
example, embeddings of HTML with ids and classes can lose some semantic
value, and more tokens mean a more expensive LLM call
- cleaned up to_html and added to_text for OntologyElement
This commit is contained in:
Maksymilian Operlejn 2024-10-31 02:30:53 +01:00 committed by GitHub
parent d0be1151a1
commit eb1b294b73
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 98 additions and 56 deletions

View File

@ -1,7 +1,11 @@
## 0.16.4-dev0 ## 0.16.4-dev1
### Enhancements ### Enhancements
* **`value` attribute in `<input/>` element is now included in the text returned by the ontology element's `to_text()`**
* **`id` and `class` attributes removed from Table subtags in HTML partitioning**
* **Cleaned up `to_html` and introduced `to_text` in `OntologyElement`**
### Features ### Features
### Fixes ### Fixes

View File

@ -56,7 +56,7 @@
"parent_id": "3a6b156a81764e17be128264241f8136", "parent_id": "3a6b156a81764e17be128264241f8136",
"text_as_html": "<form class=\"Form\" id=\"637c2f6935fb4353a5f73025ce04619d\"> <label class=\"FormField\" for=\"company-name\" id=\"50027cccbe1948c9853ce0de37b635c2\">From field name </label><input class=\"FormFieldValue\" id=\"0032242af75c4b37984ea7fea9aac74c\" value=\"Example value\" /></form>" "text_as_html": "<form class=\"Form\" id=\"637c2f6935fb4353a5f73025ce04619d\"> <label class=\"FormField\" for=\"company-name\" id=\"50027cccbe1948c9853ce0de37b635c2\">From field name </label><input class=\"FormFieldValue\" id=\"0032242af75c4b37984ea7fea9aac74c\" value=\"Example value\" /></form>"
}, },
"text": "From field name", "text": "From field name Example value",
"type": "UncategorizedText" "type": "UncategorizedText"
}, },
{ {
@ -78,7 +78,7 @@
"filename": "example.pdf", "filename": "example.pdf",
"page_number": 1, "page_number": 1,
"parent_id": "592422373ed741b68a077e2003f8ed81", "parent_id": "592422373ed741b68a077e2003f8ed81",
"text_as_html": "<table class=\"Table\" id=\"dc3792d4422e444f90876b56d0cfb20d\"> <thead class=\"TableHeader\" id=\"50a5548a87e84024af590b3d2830d140\"> <tr class=\"TableRow\" id=\"5e473d7742474412be72dc4e2c45bd4a\"> <th class=\"TableCellHeader\" id=\"01800309aa42411c98ae30f85b23f399\">Description </th><th class=\"TableCellHeader\" id=\"c2765b63d08946a2851955e79e301de4\">Row header </th></tr></thead><tbody class=\"TableBody\" id=\"e0a9a8ffdd7148ad8b4a274b073d340a\"> <tr class=\"TableRow\" id=\"77e829974632455191330b0b8545d1e3\"> <td class=\"TableCell\" id=\"7fee12d4c5554b7da778d6f8fdec8a57\">Value description </td><td class=\"TableCell\" id=\"5a7a33b0c57b4eb881a35bce9f87c831\"> <span class=\"Currency\" id=\"87220f9d62c3482e92e7de72a26869cd\">50 $ </span><span class=\"Measurement\" id=\"0095b9efb90a4cca991e73547c7165f1\">(1.32 %) </span></td></tr></tbody></table>" "text_as_html": "<table class=\"Table\" id=\"dc3792d4422e444f90876b56d0cfb20d\"> <thead> <tr> <th>Description</th><th>Row header</th></tr></thead><tbody> <tr> <td>Value description</td><td>50 $ (1.32 %)</td></tr></tbody></table>"
}, },
"text": "Description Row header Value description 50 $ (1.32 %)", "text": "Description Row header Value description 50 $ (1.32 %)",
"type": "Table" "type": "Table"

View File

@ -356,12 +356,12 @@ def test_broken_cell_is_not_raising_error():
""" """
<div class="Page"> <div class="Page">
<table class="Table"> <table class="Table">
<tbody class="TableBody"> <tbody>
<tr class="TableRow"> <tr>
<td class="TableCell" tablecell&quot;=""> <td tablecell&quot;="">
83.64 GiB 83.64 GiB
</td> </td>
<th class="TableCellHeader" rowspan="2"> <th rowspan="2">
Fair Value Fair Value
</th> </th>
</tr> </tr>
@ -406,12 +406,12 @@ def test_table():
""" """
<div class="Page"> <div class="Page">
<table class="Table"> <table class="Table">
<tbody class="TableBody"> <tbody>
<tr class="TableRow"> <tr>
<td class="TableCell"> <td>
Fair Value1 Fair Value1
</td> </td>
<th class="TableCellHeader" rowspan="2"> <th rowspan="2">
Fair Value2 Fair Value2
</th> </th>
</tr> </tr>
@ -467,24 +467,20 @@ def test_table_and_time():
""" """
<div class="Page"> <div class="Page">
<table class="Table"> <table class="Table">
<thead class='TableHeader'> <thead>
<tr class="TableRow"> <tr>
<th class="TableCellHeader" colspan="6"> <th colspan="6">
Carrying Value Carrying Value
</th> </th>
</tr> </tr>
</thead> </thead>
<tbody class='TableBody'> <tbody>
<tr class="TableRow"> <tr>
<td class="TableCell" colspan="5"> <td colspan="5">
<time class="CalendarDate">
June 30, 2023 June 30, 2023
</time>
</td> </td>
<td class="TableCell"> <td>
<span class="Currency">
$ $
</span>
</td> </td>
</tr> </tr>
</tbody> </tbody>
@ -594,3 +590,18 @@ def test_text_is_wrapped_inside_layout_element():
parsed_ontology = indent_html(remove_all_ids(ontology.to_html())) parsed_ontology = indent_html(remove_all_ids(ontology.to_html()))
assert parsed_ontology == expected_html assert parsed_ontology == expected_html
def test_text_in_form_field_value():
# language=HTML
input_html = """
<div class="Page">
<input class="FormFieldValue" value="Random Input Value"/>
</div>
"""
page = parse_html_to_ontology(input_html)
assert len(page.children) == 1
form_field_value = page.children[0]
assert form_field_value.text == ""
assert form_field_value.to_text() == "Random Input Value"

View File

@ -314,10 +314,7 @@ def test_table():
unstructured_elements, parsed_ontology = _parse_to_unstructured_elements_and_back_to_html( unstructured_elements, parsed_ontology = _parse_to_unstructured_elements_and_back_to_html(
html_as_str html_as_str
) )
expected_html = indent_html(html_as_str, html_parser="html.parser")
parsed_html = indent_html(parsed_ontology.to_html(), html_parser="html.parser")
assert expected_html == parsed_html
expected_elements = _page_elements + [ expected_elements = _page_elements + [
Table( Table(
text="Fair Value1 Fair Value2", text="Fair Value1 Fair Value2",
@ -325,13 +322,13 @@ def test_table():
element_id="2", element_id="2",
metadata=ElementMetadata( metadata=ElementMetadata(
text_as_html='<table class="Table" id="2"> ' text_as_html='<table class="Table" id="2"> '
'<tbody class="TableBody" id="3"> ' "<tbody> "
'<tr class="TableRow" id="4"> ' "<tr> "
'<td class="TableCell" id="5">' "<td>"
"Fair Value1 " "Fair Value1"
"</td>" "</td>"
'<th class="TableCellHeader" rowspan="2" id="6">' '<th rowspan="2">'
"Fair Value2 " "Fair Value2"
"</th></tr></tbody></table>", "</th></tr></tbody></table>",
parent_id="1", parent_id="1",
), ),

View File

@ -1 +1 @@
__version__ = "0.16.4-dev0" # pragma: no cover __version__ = "0.16.4-dev1" # pragma: no cover

View File

@ -20,6 +20,7 @@ from copy import copy
from enum import Enum from enum import Enum
from typing import List, Optional from typing import List, Optional
from bs4 import BeautifulSoup
from pydantic import BaseModel, Field from pydantic import BaseModel, Field
@ -75,32 +76,39 @@ class OntologyElement(BaseModel):
def to_html(self, add_children=True) -> str: def to_html(self, add_children=True) -> str:
additional_attrs = copy(self.additional_attributes) additional_attrs = copy(self.additional_attributes)
if "class" in additional_attrs: additional_attrs.pop("class", None)
del additional_attrs["class"]
# TODO(Pluto) Add support for multiple classes
attrs = " ".join(
f'{key}="{value}"' if value else f"{key}" for key, value in additional_attrs.items()
)
attr_str = self._construct_attribute_string(additional_attrs)
class_attr = f'class="{self.css_class_name}"' if self.css_class_name else "" class_attr = f'class="{self.css_class_name}"' if self.css_class_name else ""
attr_str = f"{class_attr} {attrs}".strip()
children_html = ( combined_attr_str = f"{class_attr} {attr_str}".strip()
("" if not self.children else "".join(child.to_html() for child in self.children))
if add_children children_html = self._generate_children_html(add_children)
else ""
result_html = self._generate_final_html(combined_attr_str, children_html)
return result_html
def to_text(self, add_children=True) -> str:
return " ".join(BeautifulSoup(self.to_html(add_children), "html.parser").stripped_strings)
def _construct_attribute_string(self, attributes: dict) -> str:
return " ".join(
f'{key}="{value}"' if value else f"{key}" for key, value in attributes.items()
) )
text = "" if not self.text else self.text
def _generate_children_html(self, add_children: bool) -> str:
if not add_children or not self.children:
return ""
return "".join(child.to_html() for child in self.children)
def _generate_final_html(self, attr_str: str, children_html: str) -> str:
text = self.text or ""
if text or children_html: if text or children_html:
# This is either one or another, never both return f"<{self.html_tag_name} {attr_str}>{text} {children_html}</{self.html_tag_name}>"
result_html = (
f"<{self.html_tag_name} {attr_str}>{text} {children_html}</{self.html_tag_name}>"
)
else: else:
result_html = f"<{self.html_tag_name} {attr_str} />" return f"<{self.html_tag_name} {attr_str} />"
return result_html
@property @property
def id(self) -> str | None: def id(self) -> str | None:
@ -254,6 +262,18 @@ class Table(OntologyElement):
elementType: ElementTypeEnum = Field(ElementTypeEnum.table, frozen=True) elementType: ElementTypeEnum = Field(ElementTypeEnum.table, frozen=True)
allowed_tags: List[str] = Field(["table"], frozen=True) allowed_tags: List[str] = Field(["table"], frozen=True)
def to_html(self, add_children=True) -> str:
soup = BeautifulSoup(super().to_html(add_children), "html.parser")
for tag in soup.find_all(True):
if tag.name != "table":
tag.attrs.pop("class", None)
tag.attrs.pop("id", None)
if tag.name in ["td", "th"]:
tag.string = " ".join(tag.stripped_strings)
return str(soup)
class TableBody(OntologyElement): class TableBody(OntologyElement):
description: str = Field("A body of the table", frozen=True) description: str = Field("A body of the table", frozen=True)
@ -430,6 +450,15 @@ class Form(OntologyElement):
elementType: ElementTypeEnum = Field(ElementTypeEnum.form, frozen=True) elementType: ElementTypeEnum = Field(ElementTypeEnum.form, frozen=True)
allowed_tags: List[str] = Field(["form"], frozen=True) allowed_tags: List[str] = Field(["form"], frozen=True)
def to_text(self, add_children=True) -> str:
texts = [self.text] if self.text else []
if add_children:
for child in self.children:
texts.append(child.to_text(add_children=True))
return " ".join(filter(None, texts)).strip()
class FormField(OntologyElement): class FormField(OntologyElement):
description: str = Field("A property value of a form", frozen=True) description: str = Field("A property value of a form", frozen=True)
@ -442,6 +471,9 @@ class FormFieldValue(OntologyElement):
elementType: ElementTypeEnum = Field(ElementTypeEnum.form, frozen=True) elementType: ElementTypeEnum = Field(ElementTypeEnum.form, frozen=True)
allowed_tags: List[str] = Field(["input"], frozen=True) allowed_tags: List[str] = Field(["input"], frozen=True)
def to_text(self, add_children=True) -> str:
return super().to_text() + self.additional_attributes.get("value", "")
class Checkbox(OntologyElement): class Checkbox(OntologyElement):
description: str = Field("A small box that can be checked or unchecked", frozen=True) description: str = Field("A small box that can be checked or unchecked", frozen=True)

View File

@ -96,10 +96,8 @@ def ontology_to_unstructured_elements(
] ]
element_class = TYPE_TO_TEXT_ELEMENT_MAP[unstructured_element_class_name] element_class = TYPE_TO_TEXT_ELEMENT_MAP[unstructured_element_class_name]
html_code_of_ontology_element = ontology_element.to_html() html_code_of_ontology_element = ontology_element.to_html()
element_text = ( element_text = ontology_element.to_text()
BeautifulSoup(html_code_of_ontology_element, "html.parser").get_text().strip()
)
# TODO value attribute from form input should be added to the text
unstructured_element = element_class( unstructured_element = element_class(
text=element_text, text=element_text,
element_id=ontology_element.id, element_id=ontology_element.id,