mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-11-11 16:07:37 +00:00
Add max recursion limit and fix to_text() method (#3773)
This commit is contained in:
parent
df156ebe5a
commit
66d1e5a5cb
12
CHANGELOG.md
12
CHANGELOG.md
@ -1,3 +1,13 @@
|
|||||||
|
## 0.16.5-dev0
|
||||||
|
|
||||||
|
### Enhancements
|
||||||
|
|
||||||
|
### Features
|
||||||
|
|
||||||
|
### Fixes
|
||||||
|
- **Fix parsing in the HTML v2 parser.** A maximum recursion limit is now enforced, and the value is now correctly extracted from ontology elements.
|
||||||
|
|
||||||
|
|
||||||
## 0.16.4
|
## 0.16.4
|
||||||
|
|
||||||
### Enhancements
|
### Enhancements
|
||||||
@ -9,7 +19,7 @@
|
|||||||
|
|
||||||
### Features
|
### Features
|
||||||
|
|
||||||
* **Add support for link extraction in pdf hi_res strategy.** The `partition_pdf()` function now supports link extraction when using the `hi_res` strategy, allowing users to extract hyperlinks from PDF documents more effectively.
|
* **Add support for link extraction in pdf hi_res strategy.** The `partition_pdf()` function now supports link extraction when using the `hi_res` strategy, allowing users to extract hyperlinks from PDF documents more effectively.
|
||||||
|
|
||||||
### Fixes
|
### Fixes
|
||||||
|
|
||||||
|
|||||||
@ -1,8 +1,8 @@
|
|||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
from unstructured.documents.ontology import OntologyElement
|
from unstructured.documents.ontology import Form, FormFieldValue, OntologyElement, Page
|
||||||
from unstructured.partition.html.html_utils import indent_html
|
from unstructured.partition.html.html_utils import indent_html
|
||||||
from unstructured.partition.html.transformations import parse_html_to_ontology
|
from unstructured.partition.html.transformations import RECURSION_LIMIT, parse_html_to_ontology
|
||||||
|
|
||||||
|
|
||||||
def _wrap_with_body(html: str) -> str:
|
def _wrap_with_body(html: str) -> str:
|
||||||
@ -605,3 +605,52 @@ def test_text_in_form_field_value():
|
|||||||
form_field_value = page.children[0]
|
form_field_value = page.children[0]
|
||||||
assert form_field_value.text == ""
|
assert form_field_value.text == ""
|
||||||
assert form_field_value.to_text() == "Random Input Value"
|
assert form_field_value.to_text() == "Random Input Value"
|
||||||
|
|
||||||
|
|
||||||
|
def test_to_text_when_form_field():
|
||||||
|
ontology = Page(
|
||||||
|
children=[
|
||||||
|
Form(
|
||||||
|
tag="input",
|
||||||
|
additional_attributes={"value": "Random Input Value"},
|
||||||
|
children=[
|
||||||
|
FormFieldValue(
|
||||||
|
tag="input",
|
||||||
|
additional_attributes={"value": "Random Input Value"},
|
||||||
|
)
|
||||||
|
],
|
||||||
|
)
|
||||||
|
]
|
||||||
|
)
|
||||||
|
assert ontology.to_text(add_children=True) == "Random Input Value"
|
||||||
|
|
||||||
|
|
||||||
|
def test_recursion_limit_is_limiting_parsing():
|
||||||
|
# language=HTML
|
||||||
|
broken_html = "some text"
|
||||||
|
for i in range(100):
|
||||||
|
broken_html = f"<p class='Paragraph'>{broken_html}</p>"
|
||||||
|
broken_html = _wrap_with_body(broken_html)
|
||||||
|
ontology = parse_html_to_ontology(broken_html)
|
||||||
|
|
||||||
|
iterator = 1
|
||||||
|
last_child = ontology.children[0]
|
||||||
|
while last_child.children:
|
||||||
|
last_child = last_child.children[0]
|
||||||
|
iterator += 1
|
||||||
|
assert last_child.text.startswith('<p class="Paragraph">')
|
||||||
|
assert iterator == RECURSION_LIMIT
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_text_when_recursion_limit_activated():
|
||||||
|
broken_html = "some text"
|
||||||
|
for i in range(100):
|
||||||
|
broken_html = f"<p class='Paragraph'>{broken_html}</p>"
|
||||||
|
broken_html = _wrap_with_body(broken_html)
|
||||||
|
ontology = parse_html_to_ontology(broken_html)
|
||||||
|
|
||||||
|
last_child = ontology.children[0]
|
||||||
|
while last_child.children:
|
||||||
|
last_child = last_child.children[0]
|
||||||
|
|
||||||
|
assert last_child.to_text() == "some text"
|
||||||
|
|||||||
@ -274,7 +274,7 @@ def test_forms():
|
|||||||
assert expected_html == parsed_html
|
assert expected_html == parsed_html
|
||||||
expected_elements = _page_elements + [
|
expected_elements = _page_elements + [
|
||||||
Text(
|
Text(
|
||||||
text="Option 1 (Checked)",
|
text="2 Option 1 (Checked)",
|
||||||
element_id="2",
|
element_id="2",
|
||||||
detection_origin="vlm_partitioner",
|
detection_origin="vlm_partitioner",
|
||||||
metadata=ElementMetadata(
|
metadata=ElementMetadata(
|
||||||
|
|||||||
@ -1 +1 @@
|
|||||||
__version__ = "0.16.4" # pragma: no cover
|
__version__ = "0.16.5-dev0" # pragma: no cover
|
||||||
|
|||||||
@ -42,7 +42,7 @@ class ElementTypeEnum(str, Enum):
|
|||||||
|
|
||||||
|
|
||||||
class OntologyElement(BaseModel):
|
class OntologyElement(BaseModel):
|
||||||
text: Optional[str] = Field(None, description="Text content of the element")
|
text: Optional[str] = Field("", description="Text content of the element")
|
||||||
css_class_name: Optional[str] = Field(
|
css_class_name: Optional[str] = Field(
|
||||||
default_factory=lambda: "", description="CSS class associated with the element"
|
default_factory=lambda: "", description="CSS class associated with the element"
|
||||||
)
|
)
|
||||||
@ -90,7 +90,10 @@ class OntologyElement(BaseModel):
|
|||||||
return result_html
|
return result_html
|
||||||
|
|
||||||
def to_text(self, add_children=True) -> str:
|
def to_text(self, add_children=True) -> str:
|
||||||
return " ".join(BeautifulSoup(self.to_html(add_children), "html.parser").stripped_strings)
|
if self.children and add_children:
|
||||||
|
children_text = " ".join(child.to_text().strip() for child in self.children)
|
||||||
|
return children_text
|
||||||
|
return BeautifulSoup(self.to_html()).get_text().strip()
|
||||||
|
|
||||||
def _construct_attribute_string(self, attributes: dict) -> str:
|
def _construct_attribute_string(self, attributes: dict) -> str:
|
||||||
return " ".join(
|
return " ".join(
|
||||||
@ -450,15 +453,6 @@ class Form(OntologyElement):
|
|||||||
elementType: ElementTypeEnum = Field(ElementTypeEnum.form, frozen=True)
|
elementType: ElementTypeEnum = Field(ElementTypeEnum.form, frozen=True)
|
||||||
allowed_tags: List[str] = Field(["form"], frozen=True)
|
allowed_tags: List[str] = Field(["form"], frozen=True)
|
||||||
|
|
||||||
def to_text(self, add_children=True) -> str:
|
|
||||||
texts = [self.text] if self.text else []
|
|
||||||
|
|
||||||
if add_children:
|
|
||||||
for child in self.children:
|
|
||||||
texts.append(child.to_text(add_children=True))
|
|
||||||
|
|
||||||
return " ".join(filter(None, texts)).strip()
|
|
||||||
|
|
||||||
|
|
||||||
class FormField(OntologyElement):
|
class FormField(OntologyElement):
|
||||||
description: str = Field("A property value of a form", frozen=True)
|
description: str = Field("A property value of a form", frozen=True)
|
||||||
@ -472,7 +466,8 @@ class FormFieldValue(OntologyElement):
|
|||||||
allowed_tags: List[str] = Field(["input"], frozen=True)
|
allowed_tags: List[str] = Field(["input"], frozen=True)
|
||||||
|
|
||||||
def to_text(self, add_children=True) -> str:
|
def to_text(self, add_children=True) -> str:
|
||||||
return super().to_text() + self.additional_attributes.get("value", "")
|
text = super().to_text() + self.additional_attributes.get("value", "")
|
||||||
|
return text.strip()
|
||||||
|
|
||||||
|
|
||||||
class Checkbox(OntologyElement):
|
class Checkbox(OntologyElement):
|
||||||
|
|||||||
@ -36,6 +36,8 @@ from unstructured.documents.ontology import (
|
|||||||
UncategorizedText,
|
UncategorizedText,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
RECURSION_LIMIT = 50
|
||||||
|
|
||||||
|
|
||||||
def ontology_to_unstructured_elements(
|
def ontology_to_unstructured_elements(
|
||||||
ontology_element: OntologyElement,
|
ontology_element: OntologyElement,
|
||||||
@ -68,7 +70,7 @@ def ontology_to_unstructured_elements(
|
|||||||
list[Element]: A list of unstructured Element objects.
|
list[Element]: A list of unstructured Element objects.
|
||||||
"""
|
"""
|
||||||
elements_to_return = []
|
elements_to_return = []
|
||||||
if ontology_element.elementType == ElementTypeEnum.layout:
|
if ontology_element.elementType == ElementTypeEnum.layout and depth <= RECURSION_LIMIT:
|
||||||
|
|
||||||
if page_number is None and isinstance(ontology_element, Page):
|
if page_number is None and isinstance(ontology_element, Page):
|
||||||
page_number = ontology_element.page_number
|
page_number = ontology_element.page_number
|
||||||
@ -354,7 +356,7 @@ def remove_empty_tags_from_html_content(html_content: str) -> str:
|
|||||||
return str(soup)
|
return str(soup)
|
||||||
|
|
||||||
|
|
||||||
def parse_html_to_ontology_element(soup: Tag) -> OntologyElement | None:
|
def parse_html_to_ontology_element(soup: Tag, recursion_depth: int = 1) -> OntologyElement | None:
|
||||||
"""
|
"""
|
||||||
Converts a BeautifulSoup Tag object into an OntologyElement object. This function is recursive.
|
Converts a BeautifulSoup Tag object into an OntologyElement object. This function is recursive.
|
||||||
First tries to recognize a class from Unstructured Ontology, then if class is matched tries
|
First tries to recognize a class from Unstructured Ontology, then if class is matched tries
|
||||||
@ -364,6 +366,7 @@ def parse_html_to_ontology_element(soup: Tag) -> OntologyElement | None:
|
|||||||
|
|
||||||
Args:
|
Args:
|
||||||
soup (Tag): The BeautifulSoup Tag object to be converted.
|
soup (Tag): The BeautifulSoup Tag object to be converted.
|
||||||
|
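recursion_depth (int): Current recursion depth (starts at 1 and is incremented on each recursive call); child tags are no longer unwrapped once it exceeds RECURSION_LIMIT.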
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
OntologyElement: The converted OntologyElement object.
|
OntologyElement: The converted OntologyElement object.
|
||||||
@ -384,12 +387,13 @@ def parse_html_to_ontology_element(soup: Tag) -> OntologyElement | None:
|
|||||||
and any(isinstance(content, Tag) for content in soup.contents)
|
and any(isinstance(content, Tag) for content in soup.contents)
|
||||||
or ontology_class().elementType == ElementTypeEnum.layout
|
or ontology_class().elementType == ElementTypeEnum.layout
|
||||||
)
|
)
|
||||||
|
should_unwrap_html = has_children and recursion_depth <= RECURSION_LIMIT
|
||||||
|
|
||||||
if has_children:
|
if should_unwrap_html:
|
||||||
text = ""
|
text = ""
|
||||||
children = [
|
children = [
|
||||||
(
|
(
|
||||||
parse_html_to_ontology_element(child)
|
parse_html_to_ontology_element(child, recursion_depth=recursion_depth + 1)
|
||||||
if isinstance(child, Tag)
|
if isinstance(child, Tag)
|
||||||
else Paragraph(text=str(child).strip())
|
else Paragraph(text=str(child).strip())
|
||||||
)
|
)
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user