Image within div or span with no text is annotated as Image (#3962)

Ticket: https://unstructured-ai.atlassian.net/browse/ML-942

The HTML document in the attached zip archive (once uncompressed) can be
used to test the transformation with the `partition_html` function from
the VLM partitioner.


[recalibrating-risk-report.pdf.json.html.zip](https://github.com/user-attachments/files/19330528/recalibrating-risk-report.pdf.json.html.zip)
This commit is contained in:
Antonio Jose Jimeno Yepes 2025-03-19 21:09:02 -07:00 committed by GitHub
parent 7de630e45e
commit 0fa5174bd7
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 32 additions and 2 deletions

View File

@ -1,3 +1,7 @@
## 0.17.2
* Fix: a `<div>` or `<span>` classified as "UncategorizedText" that contains an image and no text is now annotated as Image
## 0.17.1
### Enhancements

View File

@ -1,6 +1,6 @@
from bs4 import BeautifulSoup
from unstructured.documents.ontology import Form, FormFieldValue, OntologyElement, Page
from unstructured.documents.ontology import Form, FormFieldValue, Image, OntologyElement, Page
from unstructured.partition.html.html_utils import indent_html
from unstructured.partition.html.transformations import RECURSION_LIMIT, parse_html_to_ontology
@ -672,3 +672,24 @@ def test_get_text_when_recursion_limit_activated():
last_child = last_child.children[0]
assert last_child.to_text() == "some text"
def test_uncategorizedtest_has_image_and_no_text():
    """An ``UncategorizedText`` div holding only an ``<img>`` should parse as ``Image``."""
    # language=HTML
    wrapped_html = _wrap_with_body(
        """
<div class="Page">
<div class="UncategorizedText">
<img src="https://www.example.com/image.jpg"/>
</div>
</div>
"""
    )

    # Normalize the markup the same way the other parsing tests do before parsing.
    parsed_root: OntologyElement = parse_html_to_ontology(indent_html(wrapped_html))

    # First child of the first child of the parsed root is the element under test.
    first_child = parsed_root.children[0]
    image_element = first_child.children[0]

    # The exact type is checked on purpose: the element must be re-classified as Image.
    assert type(image_element) is Image
    assert image_element.css_class_name == "Image"

View File

@ -1 +1 @@
__version__ = "0.17.1" # pragma: no cover
__version__ = "0.17.2" # pragma: no cover

View File

@ -437,6 +437,11 @@ def extract_tag_and_ontology_class_from_tag(
html_tag = "span"
element_class = ontology.UncategorizedText
# Scenario 5: UncategorizedText has image and no text
# Typically, this happens with a span or div tag with an image inside
if element_class == ontology.UncategorizedText and soup.find("img") and not soup.text.strip():
element_class = ontology.Image
return html_tag, element_class