mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-11-11 16:07:37 +00:00
Add max recursion limit and fix to_text() method (#3773)
This commit is contained in:
parent
df156ebe5a
commit
66d1e5a5cb
10
CHANGELOG.md
10
CHANGELOG.md
@ -1,3 +1,13 @@
|
||||
## 0.16.5-dev0
|
||||
|
||||
### Enhancements
|
||||
|
||||
### Features
|
||||
|
||||
### Fixes
|
||||
- **Fix HTML v2 parser** A maximum recursion limit is now set, and the value is correctly extracted from ontology elements
|
||||
|
||||
|
||||
## 0.16.4
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -1,8 +1,8 @@
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from unstructured.documents.ontology import OntologyElement
|
||||
from unstructured.documents.ontology import Form, FormFieldValue, OntologyElement, Page
|
||||
from unstructured.partition.html.html_utils import indent_html
|
||||
from unstructured.partition.html.transformations import parse_html_to_ontology
|
||||
from unstructured.partition.html.transformations import RECURSION_LIMIT, parse_html_to_ontology
|
||||
|
||||
|
||||
def _wrap_with_body(html: str) -> str:
|
||||
@ -605,3 +605,52 @@ def test_text_in_form_field_value():
|
||||
form_field_value = page.children[0]
|
||||
assert form_field_value.text == ""
|
||||
assert form_field_value.to_text() == "Random Input Value"
|
||||
|
||||
|
||||
def test_to_text_when_form_field():
    """Page-level text equals the ``value`` attribute of the nested form field.

    The hierarchy Page -> Form -> FormFieldValue is assembled by hand; both the
    form and its field value carry the same ``value`` attribute, and the text
    produced for the whole page must be exactly that value.
    """
    field_value = FormFieldValue(
        tag="input",
        additional_attributes={"value": "Random Input Value"},
    )
    form = Form(
        tag="input",
        additional_attributes={"value": "Random Input Value"},
        children=[field_value],
    )
    page = Page(children=[form])

    assert page.to_text(add_children=True) == "Random Input Value"
|
||||
|
||||
|
||||
def test_recursion_limit_is_limiting_parsing():
    """Parsing stops descending into nested tags at ``RECURSION_LIMIT``.

    A paragraph nested 100 levels deep is parsed, then the first-child chain of
    the resulting ontology is followed to its end. The last element reached
    must hold the remaining markup as raw text, and the number of elements on
    the chain must equal ``RECURSION_LIMIT``.
    """
    # language=HTML
    nested_html = "some text"
    for _ in range(100):
        nested_html = f"<p class='Paragraph'>{nested_html}</p>"
    ontology = parse_html_to_ontology(_wrap_with_body(nested_html))

    # Count the elements on the first-child chain, starting at the top-level child.
    depth = 1
    node = ontology.children[0]
    while node.children:
        node = node.children[0]
        depth += 1

    assert node.text.startswith('<p class="Paragraph">')
    assert depth == RECURSION_LIMIT
|
||||
|
||||
|
||||
def test_get_text_when_recursion_limit_activated():
    """``to_text()`` still yields plain text for the deepest parsed element.

    A paragraph nested 100 levels deep is parsed, the first-child chain is
    followed to its last element, and that element's ``to_text()`` must return
    only the innermost text.
    """
    opening_tags = "<p class='Paragraph'>" * 100
    closing_tags = "</p>" * 100
    nested_html = _wrap_with_body(f"{opening_tags}some text{closing_tags}")
    ontology = parse_html_to_ontology(nested_html)

    deepest = ontology.children[0]
    while deepest.children:
        deepest = deepest.children[0]

    assert deepest.to_text() == "some text"
|
||||
|
||||
@ -274,7 +274,7 @@ def test_forms():
|
||||
assert expected_html == parsed_html
|
||||
expected_elements = _page_elements + [
|
||||
Text(
|
||||
text="Option 1 (Checked)",
|
||||
text="2 Option 1 (Checked)",
|
||||
element_id="2",
|
||||
detection_origin="vlm_partitioner",
|
||||
metadata=ElementMetadata(
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.16.4" # pragma: no cover
|
||||
__version__ = "0.16.5-dev0" # pragma: no cover
|
||||
|
||||
@ -42,7 +42,7 @@ class ElementTypeEnum(str, Enum):
|
||||
|
||||
|
||||
class OntologyElement(BaseModel):
|
||||
text: Optional[str] = Field(None, description="Text content of the element")
|
||||
text: Optional[str] = Field("", description="Text content of the element")
|
||||
css_class_name: Optional[str] = Field(
|
||||
default_factory=lambda: "", description="CSS class associated with the element"
|
||||
)
|
||||
@ -90,7 +90,10 @@ class OntologyElement(BaseModel):
|
||||
return result_html
|
||||
|
||||
def to_text(self, add_children=True) -> str:
|
||||
return " ".join(BeautifulSoup(self.to_html(add_children), "html.parser").stripped_strings)
|
||||
if self.children and add_children:
|
||||
children_text = " ".join(child.to_text().strip() for child in self.children)
|
||||
return children_text
|
||||
return BeautifulSoup(self.to_html()).get_text().strip()
|
||||
|
||||
def _construct_attribute_string(self, attributes: dict) -> str:
|
||||
return " ".join(
|
||||
@ -450,15 +453,6 @@ class Form(OntologyElement):
|
||||
elementType: ElementTypeEnum = Field(ElementTypeEnum.form, frozen=True)
|
||||
allowed_tags: List[str] = Field(["form"], frozen=True)
|
||||
|
||||
def to_text(self, add_children=True) -> str:
|
||||
texts = [self.text] if self.text else []
|
||||
|
||||
if add_children:
|
||||
for child in self.children:
|
||||
texts.append(child.to_text(add_children=True))
|
||||
|
||||
return " ".join(filter(None, texts)).strip()
|
||||
|
||||
|
||||
class FormField(OntologyElement):
|
||||
description: str = Field("A property value of a form", frozen=True)
|
||||
@ -472,7 +466,8 @@ class FormFieldValue(OntologyElement):
|
||||
allowed_tags: List[str] = Field(["input"], frozen=True)
|
||||
|
||||
def to_text(self, add_children=True) -> str:
|
||||
return super().to_text() + self.additional_attributes.get("value", "")
|
||||
text = super().to_text() + self.additional_attributes.get("value", "")
|
||||
return text.strip()
|
||||
|
||||
|
||||
class Checkbox(OntologyElement):
|
||||
|
||||
@ -36,6 +36,8 @@ from unstructured.documents.ontology import (
|
||||
UncategorizedText,
|
||||
)
|
||||
|
||||
RECURSION_LIMIT = 50
|
||||
|
||||
|
||||
def ontology_to_unstructured_elements(
|
||||
ontology_element: OntologyElement,
|
||||
@ -68,7 +70,7 @@ def ontology_to_unstructured_elements(
|
||||
list[Element]: A list of unstructured Element objects.
|
||||
"""
|
||||
elements_to_return = []
|
||||
if ontology_element.elementType == ElementTypeEnum.layout:
|
||||
if ontology_element.elementType == ElementTypeEnum.layout and depth <= RECURSION_LIMIT:
|
||||
|
||||
if page_number is None and isinstance(ontology_element, Page):
|
||||
page_number = ontology_element.page_number
|
||||
@ -354,7 +356,7 @@ def remove_empty_tags_from_html_content(html_content: str) -> str:
|
||||
return str(soup)
|
||||
|
||||
|
||||
def parse_html_to_ontology_element(soup: Tag) -> OntologyElement | None:
|
||||
def parse_html_to_ontology_element(soup: Tag, recursion_depth: int = 1) -> OntologyElement | None:
|
||||
"""
|
||||
Converts a BeautifulSoup Tag object into an OntologyElement object. This function is recursive.
|
||||
First tries to recognize a class from Unstructured Ontology, then if class is matched tries
|
||||
@ -364,6 +366,7 @@ def parse_html_to_ontology_element(soup: Tag) -> OntologyElement | None:
|
||||
|
||||
Args:
|
||||
soup (Tag): The BeautifulSoup Tag object to be converted.
|
||||
recursion_depth (int): Current recursion depth; children are only parsed while it does not exceed RECURSION_LIMIT.
|
||||
|
||||
Returns:
|
||||
OntologyElement: The converted OntologyElement object.
|
||||
@ -384,12 +387,13 @@ def parse_html_to_ontology_element(soup: Tag) -> OntologyElement | None:
|
||||
and any(isinstance(content, Tag) for content in soup.contents)
|
||||
or ontology_class().elementType == ElementTypeEnum.layout
|
||||
)
|
||||
should_unwrap_html = has_children and recursion_depth <= RECURSION_LIMIT
|
||||
|
||||
if has_children:
|
||||
if should_unwrap_html:
|
||||
text = ""
|
||||
children = [
|
||||
(
|
||||
parse_html_to_ontology_element(child)
|
||||
parse_html_to_ontology_element(child, recursion_depth=recursion_depth + 1)
|
||||
if isinstance(child, Tag)
|
||||
else Paragraph(text=str(child).strip())
|
||||
)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user