Image within div or span with no text is annotated as Image (#3962)

Ticket: https://unstructured-ai.atlassian.net/browse/ML-942 — The following HTML document (attached as a zip archive; uncompress it first) can be used to test the transformation using the `partition_html` function from the VLM partitioner: [recalibrating-risk-report.pdf.json.html.zip](https://github.com/user-attachments/files/19330528/recalibrating-risk-report.pdf.json.html.zip)
2025-06-27 02:30:08 +00:00 · 2025-03-19 21:09:02 -07:00 · 2025-03-19 21:09:02 -07:00 · 0fa5174bd7
commit 0fa5174bd7
parent 7de630e45e
4 changed files with 32 additions and 2 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,3 +1,7 @@
+## 0.17.2
+
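+* Fix an image inside a `<div>` or `<span>` tag with no text being classified as "UncategorizedText" with an empty `.text`; such an element is now classified as `Image`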
+
 ## 0.17.1

 ### Enhancements
--- a/test_unstructured/partition/html/test_html_to_ontology_parsing.py
+++ b/test_unstructured/partition/html/test_html_to_ontology_parsing.py
@ -1,6 +1,6 @@
 from bs4 import BeautifulSoup

-from unstructured.documents.ontology import Form, FormFieldValue, OntologyElement, Page
+from unstructured.documents.ontology import Form, FormFieldValue, Image, OntologyElement, Page
 from unstructured.partition.html.html_utils import indent_html
 from unstructured.partition.html.transformations import RECURSION_LIMIT, parse_html_to_ontology

@ -672,3 +672,24 @@ def test_get_text_when_recursion_limit_activated():
        last_child = last_child.children[0]

    assert last_child.to_text() == "some text"
+
+
+def test_uncategorizedtest_has_image_and_no_text():
+    # language=HTML
+    base_html = _wrap_with_body(
+        """
+        <div class="Page">
+    <div class="UncategorizedText">
+        <img src="https://www.example.com/image.jpg"/>
+    </div>
+    </div>
+    """
+    )
+
+    base_html = indent_html(base_html)
+
+    ontology: OntologyElement = parse_html_to_ontology(base_html)
+
+    element = ontology.children[0].children[0]
+    assert type(element) is Image
+    assert element.css_class_name == "Image"
--- a/unstructured/version.py
+++ b/unstructured/version.py
@ -1 +1 @@
-__version__ = "0.17.1"  # pragma: no cover
+__version__ = "0.17.2"  # pragma: no cover
--- a/unstructured/partition/html/transformations.py
+++ b/unstructured/partition/html/transformations.py
@ -437,6 +437,11 @@ def extract_tag_and_ontology_class_from_tag(
        html_tag = "span"
        element_class = ontology.UncategorizedText

+    # Scenario 5: UncategorizedText has image and no text
+    # Typically, this happens with a span or div tag with an image inside
+    if element_class == ontology.UncategorizedText and soup.find("img") and not soup.text.strip():
+        element_class = ontology.Image
+
    return html_tag, element_class