fix: partition_html should process container divs that include text (#110)

* check for containers with text
* added tests for containers with text
* changelog and version bump
2025-12-08 04:55:36 +00:00 · 2022-12-21 16:51:04 -05:00 · 2022-12-21 16:51:04 -05:00 · 4f6fc29b54
commit 4f6fc29b54
parent 6f4d9ad06c
4 changed files with 65 additions and 6 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,7 +1,9 @@
-## 0.3.5-dev0
+## 0.3.5-dev1
 * Add new pattern to recognize plain text dash bullets
 * Add test for bullet patterns
 * Fix for `partition_html` that allows for processing `div` tags that have both text and child
  elements
 ## 0.3.4
--- a/test_unstructured/documents/test_html.py
+++ b/test_unstructured/documents/test_html.py
@@ -475,6 +475,34 @@ def test_nested_text_tags():
    assert len(html_document.pages[0].elements) == 1
 def test_containers_with_text_are_processed():
    """Checks that text sitting directly inside container divs (divs that also have child
    elements) is turned into elements alongside the text of the nested leaf divs."""
    # NOTE(review): `=3D` looks like a quoted-printable escape for `=`, presumably because this
    # markup was copied from a raw email body -- confirm.
    html_str = """<div dir=3D"ltr">Hi All,<div><br></div>
   <div>Get excited for our first annual family day!</div>
   <div>Best.<br clear=3D"all">
      <div><br></div>
      -- <br>
      <div dir=3D"ltr">
         <div dir=3D"ltr">Dino the Datasaur<div>Unstructured Technologies<br><div>Data Scientist
                </div>
               <div><br></div>
            </div>
         </div>
      </div>
   </div>
 </div>"""
    html_document = HTMLDocument.from_string(html_str)
    html_document._read()
    # "Hi All," and "Best." are text held directly by divs that also have children; the "-- "
    # signature separator is not part of the expected elements.
    assert html_document.elements == [
        Title(text="Hi All,"),
        NarrativeText(text="Get excited for our first annual family day!"),
        Title(text="Best."),
        Title(text="Dino the Datasaur"),
        Title(text="Unstructured Technologies"),
        Title(text="Data Scientist"),
    ]
 def test_html_grabs_bulleted_text_in_tags():
    html_str = """<html>
    <body>
--- a/unstructured/version.py
+++ b/unstructured/version.py
@@ -1 +1 @@
-__version__ = "0.3.5-dev0"  # pragma: no cover
+__version__ = "0.3.5-dev1"  # pragma: no cover
--- a/unstructured/documents/html.py
+++ b/unstructured/documents/html.py
@@ -96,6 +96,11 @@ class HTMLDocument(XMLDocument):
                        page.elements.append(element)
                        descendanttag_elems = tuple(tag_elem.iterdescendants())
                elif _is_container_with_text(tag_elem):
                    element = _text_to_element(tag_elem.text, "div", ())
                    if element is not None:
                        page.elements.append(element)
                elif _is_bulleted_table(tag_elem):
                    bulleted_text = _bulleted_text_from_table(tag_elem)
                    page.elements.extend(bulleted_text)
@@ -188,22 +193,46 @@ def _parse_tag(
    text = _construct_text(tag_elem)
    if not text:
        return None
    return _text_to_element(text, tag_elem.tag, ancestortags)
 def _text_to_element(text: str, tag: str, ancestortags: Tuple[str, ...]) -> Optional[Element]:
    """Given the text of an element, the tag type and the ancestor tags, produces the appropriate
    HTML element.

    Bulleted text becomes an HTMLListItem holding the cleaned text, text accepted by
    is_narrative_tag becomes an HTMLNarrativeText, and text accepted by is_possible_title
    becomes an HTMLTitle. Returns None when the bullet-cleaned text is empty, when the text is
    shorter than two characters, or when none of the checks match."""
    if is_bulleted_text(text):
        # clean_bullets returned something falsy (presumably only a bullet was present), so
        # there is no list item to emit.
        if not clean_bullets(text):
            return None
-        return HTMLListItem(text=clean_bullets(text), tag=tag_elem.tag, ancestortags=ancestortags)
+        return HTMLListItem(text=clean_bullets(text), tag=tag, ancestortags=ancestortags)
    if len(text) < 2:
        return None
-    elif is_narrative_tag(text, tag_elem.tag):
+    elif is_narrative_tag(text, tag):
-        return HTMLNarrativeText(text, tag=tag_elem.tag, ancestortags=ancestortags)
+        return HTMLNarrativeText(text, tag=tag, ancestortags=ancestortags)
    elif is_possible_title(text):
-        return HTMLTitle(text, tag=tag_elem.tag, ancestortags=ancestortags)
+        return HTMLTitle(text, tag=tag, ancestortags=ancestortags)
    else:
        # Something that might end up here is text that's just a number.
        return None
 def _is_container_with_text(tag_elem: etree.Element) -> bool:
    """Checks if a tag is a container that also happens to containe text.
    Example
    -------
    <div>Hi there,
        <div>This is my message.</div>
        <div>Please read my message!</div>
    </div>
    """
    if tag_elem.tag != "div" or len(tag_elem) == 0:
        return False
    if tag_elem.text is None or tag_elem.text.strip() == "":
        return False
    return True
 def is_narrative_tag(text: str, tag: str) -> bool:
    """Decides whether text is narrative based on its tag: text under a heading tag never
    qualifies, and any other tag defers to is_possible_narrative_text."""
    if tag in HEADING_TAGS:
        return False
    return is_possible_narrative_text(text)
--- a/unstructured/version.py
+++ b/unstructured/version.py
@@ -1 +1 @@
-__version__ = "0.3.5-dev0"  # pragma: no cover
+__version__ = "0.3.5-dev1"  # pragma: no cover