mirror of
				https://github.com/Unstructured-IO/unstructured.git
				synced 2025-11-03 19:43:24 +00:00 
			
		
		
		
	Add max recursion limit and fix to_text() method (#3773)
This commit is contained in:
		
							parent
							
								
									df156ebe5a
								
							
						
					
					
						commit
						66d1e5a5cb
					
				
							
								
								
									
										12
									
								
								CHANGELOG.md
									
									
									
									
									
								
							
							
						
						
									
										12
									
								
								CHANGELOG.md
									
									
									
									
									
								
							@ -1,3 +1,13 @@
 | 
			
		||||
## 0.16.5-dev0
 | 
			
		||||
 | 
			
		||||
### Enhancements
 | 
			
		||||
 | 
			
		||||
### Features
 | 
			
		||||
 | 
			
		||||
### Fixes
 | 
			
		||||
- **Fix parsing in the HTML v2 parser** A max recursion limit is now set, and the value is correctly extracted from the ontology element.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
## 0.16.4
 | 
			
		||||
 | 
			
		||||
### Enhancements
 | 
			
		||||
@ -9,7 +19,7 @@
 | 
			
		||||
 | 
			
		||||
### Features
 | 
			
		||||
 | 
			
		||||
* **Add support for link extraction in pdf hi_res strategy.** The `partition_pdf()` function now supports link extraction when using the `hi_res` strategy, allowing users to extract hyperlinks from PDF documents more effectively. 
 | 
			
		||||
* **Add support for link extraction in pdf hi_res strategy.** The `partition_pdf()` function now supports link extraction when using the `hi_res` strategy, allowing users to extract hyperlinks from PDF documents more effectively.
 | 
			
		||||
 | 
			
		||||
### Fixes
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -1,8 +1,8 @@
 | 
			
		||||
from bs4 import BeautifulSoup
 | 
			
		||||
 | 
			
		||||
from unstructured.documents.ontology import OntologyElement
 | 
			
		||||
from unstructured.documents.ontology import Form, FormFieldValue, OntologyElement, Page
 | 
			
		||||
from unstructured.partition.html.html_utils import indent_html
 | 
			
		||||
from unstructured.partition.html.transformations import parse_html_to_ontology
 | 
			
		||||
from unstructured.partition.html.transformations import RECURSION_LIMIT, parse_html_to_ontology
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def _wrap_with_body(html: str) -> str:
 | 
			
		||||
@ -605,3 +605,52 @@ def test_text_in_form_field_value():
 | 
			
		||||
    form_field_value = page.children[0]
 | 
			
		||||
    assert form_field_value.text == ""
 | 
			
		||||
    assert form_field_value.to_text() == "Random Input Value"
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_to_text_when_form_field():
 | 
			
		||||
    ontology = Page(
 | 
			
		||||
        children=[
 | 
			
		||||
            Form(
 | 
			
		||||
                tag="input",
 | 
			
		||||
                additional_attributes={"value": "Random Input Value"},
 | 
			
		||||
                children=[
 | 
			
		||||
                    FormFieldValue(
 | 
			
		||||
                        tag="input",
 | 
			
		||||
                        additional_attributes={"value": "Random Input Value"},
 | 
			
		||||
                    )
 | 
			
		||||
                ],
 | 
			
		||||
            )
 | 
			
		||||
        ]
 | 
			
		||||
    )
 | 
			
		||||
    assert ontology.to_text(add_children=True) == "Random Input Value"
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_recursion_limit_is_limiting_parsing():
 | 
			
		||||
    # language=HTML
 | 
			
		||||
    broken_html = "some text"
 | 
			
		||||
    for i in range(100):
 | 
			
		||||
        broken_html = f"<p class='Paragraph'>{broken_html}</p>"
 | 
			
		||||
    broken_html = _wrap_with_body(broken_html)
 | 
			
		||||
    ontology = parse_html_to_ontology(broken_html)
 | 
			
		||||
 | 
			
		||||
    iterator = 1
 | 
			
		||||
    last_child = ontology.children[0]
 | 
			
		||||
    while last_child.children:
 | 
			
		||||
        last_child = last_child.children[0]
 | 
			
		||||
        iterator += 1
 | 
			
		||||
    assert last_child.text.startswith('<p class="Paragraph">')
 | 
			
		||||
    assert iterator == RECURSION_LIMIT
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_get_text_when_recursion_limit_activated():
 | 
			
		||||
    broken_html = "some text"
 | 
			
		||||
    for i in range(100):
 | 
			
		||||
        broken_html = f"<p class='Paragraph'>{broken_html}</p>"
 | 
			
		||||
    broken_html = _wrap_with_body(broken_html)
 | 
			
		||||
    ontology = parse_html_to_ontology(broken_html)
 | 
			
		||||
 | 
			
		||||
    last_child = ontology.children[0]
 | 
			
		||||
    while last_child.children:
 | 
			
		||||
        last_child = last_child.children[0]
 | 
			
		||||
 | 
			
		||||
    assert last_child.to_text() == "some text"
 | 
			
		||||
 | 
			
		||||
@ -274,7 +274,7 @@ def test_forms():
 | 
			
		||||
    assert expected_html == parsed_html
 | 
			
		||||
    expected_elements = _page_elements + [
 | 
			
		||||
        Text(
 | 
			
		||||
            text="Option 1 (Checked)",
 | 
			
		||||
            text="2 Option 1 (Checked)",
 | 
			
		||||
            element_id="2",
 | 
			
		||||
            detection_origin="vlm_partitioner",
 | 
			
		||||
            metadata=ElementMetadata(
 | 
			
		||||
 | 
			
		||||
@ -1 +1 @@
 | 
			
		||||
__version__ = "0.16.4"  # pragma: no cover
 | 
			
		||||
__version__ = "0.16.5-dev0"  # pragma: no cover
 | 
			
		||||
 | 
			
		||||
@ -42,7 +42,7 @@ class ElementTypeEnum(str, Enum):
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class OntologyElement(BaseModel):
 | 
			
		||||
    text: Optional[str] = Field(None, description="Text content of the element")
 | 
			
		||||
    text: Optional[str] = Field("", description="Text content of the element")
 | 
			
		||||
    css_class_name: Optional[str] = Field(
 | 
			
		||||
        default_factory=lambda: "", description="CSS class associated with the element"
 | 
			
		||||
    )
 | 
			
		||||
@ -90,7 +90,10 @@ class OntologyElement(BaseModel):
 | 
			
		||||
        return result_html
 | 
			
		||||
 | 
			
		||||
    def to_text(self, add_children=True) -> str:
 | 
			
		||||
        return " ".join(BeautifulSoup(self.to_html(add_children), "html.parser").stripped_strings)
 | 
			
		||||
        if self.children and add_children:
 | 
			
		||||
            children_text = " ".join(child.to_text().strip() for child in self.children)
 | 
			
		||||
            return children_text
 | 
			
		||||
        return BeautifulSoup(self.to_html()).get_text().strip()
 | 
			
		||||
 | 
			
		||||
    def _construct_attribute_string(self, attributes: dict) -> str:
 | 
			
		||||
        return " ".join(
 | 
			
		||||
@ -450,15 +453,6 @@ class Form(OntologyElement):
 | 
			
		||||
    elementType: ElementTypeEnum = Field(ElementTypeEnum.form, frozen=True)
 | 
			
		||||
    allowed_tags: List[str] = Field(["form"], frozen=True)
 | 
			
		||||
 | 
			
		||||
    def to_text(self, add_children=True) -> str:
 | 
			
		||||
        texts = [self.text] if self.text else []
 | 
			
		||||
 | 
			
		||||
        if add_children:
 | 
			
		||||
            for child in self.children:
 | 
			
		||||
                texts.append(child.to_text(add_children=True))
 | 
			
		||||
 | 
			
		||||
        return " ".join(filter(None, texts)).strip()
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class FormField(OntologyElement):
 | 
			
		||||
    description: str = Field("A property value of a form", frozen=True)
 | 
			
		||||
@ -472,7 +466,8 @@ class FormFieldValue(OntologyElement):
 | 
			
		||||
    allowed_tags: List[str] = Field(["input"], frozen=True)
 | 
			
		||||
 | 
			
		||||
    def to_text(self, add_children=True) -> str:
 | 
			
		||||
        return super().to_text() + self.additional_attributes.get("value", "")
 | 
			
		||||
        text = super().to_text() + self.additional_attributes.get("value", "")
 | 
			
		||||
        return text.strip()
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class Checkbox(OntologyElement):
 | 
			
		||||
 | 
			
		||||
@ -36,6 +36,8 @@ from unstructured.documents.ontology import (
 | 
			
		||||
    UncategorizedText,
 | 
			
		||||
)
 | 
			
		||||
 | 
			
		||||
RECURSION_LIMIT = 50
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def ontology_to_unstructured_elements(
 | 
			
		||||
    ontology_element: OntologyElement,
 | 
			
		||||
@ -68,7 +70,7 @@ def ontology_to_unstructured_elements(
 | 
			
		||||
        list[Element]: A list of unstructured Element objects.
 | 
			
		||||
    """
 | 
			
		||||
    elements_to_return = []
 | 
			
		||||
    if ontology_element.elementType == ElementTypeEnum.layout:
 | 
			
		||||
    if ontology_element.elementType == ElementTypeEnum.layout and depth <= RECURSION_LIMIT:
 | 
			
		||||
 | 
			
		||||
        if page_number is None and isinstance(ontology_element, Page):
 | 
			
		||||
            page_number = ontology_element.page_number
 | 
			
		||||
@ -354,7 +356,7 @@ def remove_empty_tags_from_html_content(html_content: str) -> str:
 | 
			
		||||
    return str(soup)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def parse_html_to_ontology_element(soup: Tag) -> OntologyElement | None:
 | 
			
		||||
def parse_html_to_ontology_element(soup: Tag, recursion_depth: int = 1) -> OntologyElement | None:
 | 
			
		||||
    """
 | 
			
		||||
    Converts a BeautifulSoup Tag object into an OntologyElement object. This function is recursive.
 | 
			
		||||
    First tries to recognize a class from Unstructured Ontology, then if class is matched tries
 | 
			
		||||
@ -364,6 +366,7 @@ def parse_html_to_ontology_element(soup: Tag) -> OntologyElement | None:
 | 
			
		||||
 | 
			
		||||
    Args:
 | 
			
		||||
        soup (Tag): The BeautifulSoup Tag object to be converted.
 | 
			
		||||
        recursion_depth (int): Current recursion depth; child tags are no longer unwrapped once it exceeds RECURSION_LIMIT.
 | 
			
		||||
 | 
			
		||||
    Returns:
 | 
			
		||||
        OntologyElement: The converted OntologyElement object.
 | 
			
		||||
@ -384,12 +387,13 @@ def parse_html_to_ontology_element(soup: Tag) -> OntologyElement | None:
 | 
			
		||||
        and any(isinstance(content, Tag) for content in soup.contents)
 | 
			
		||||
        or ontology_class().elementType == ElementTypeEnum.layout
 | 
			
		||||
    )
 | 
			
		||||
    should_unwrap_html = has_children and recursion_depth <= RECURSION_LIMIT
 | 
			
		||||
 | 
			
		||||
    if has_children:
 | 
			
		||||
    if should_unwrap_html:
 | 
			
		||||
        text = ""
 | 
			
		||||
        children = [
 | 
			
		||||
            (
 | 
			
		||||
                parse_html_to_ontology_element(child)
 | 
			
		||||
                parse_html_to_ontology_element(child, recursion_depth=recursion_depth + 1)
 | 
			
		||||
                if isinstance(child, Tag)
 | 
			
		||||
                else Paragraph(text=str(child).strip())
 | 
			
		||||
            )
 | 
			
		||||
 | 
			
		||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user