mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-06-27 02:30:08 +00:00
Image within div or span with no text is annotated as Image (#3962)
Ticket: https://unstructured-ai.atlassian.net/browse/ML-942. The following HTML document (unzip the attachment first) can be used to test the transformation using the `partition_html` function from the VLM partitioner: [recalibrating-risk-report.pdf.json.html.zip](https://github.com/user-attachments/files/19330528/recalibrating-risk-report.pdf.json.html.zip)
This commit is contained in:
parent
7de630e45e
commit
0fa5174bd7
@ -1,3 +1,7 @@
|
||||
## 0.17.2
|
||||
|
||||
* Fix an image inside a <div> or <span> tag with no text being classified as "UncategorizedText" (with empty .text) instead of "Image"
|
||||
|
||||
## 0.17.1
|
||||
|
||||
### Enhancements
|
||||
|
@ -1,6 +1,6 @@
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from unstructured.documents.ontology import Form, FormFieldValue, OntologyElement, Page
|
||||
from unstructured.documents.ontology import Form, FormFieldValue, Image, OntologyElement, Page
|
||||
from unstructured.partition.html.html_utils import indent_html
|
||||
from unstructured.partition.html.transformations import RECURSION_LIMIT, parse_html_to_ontology
|
||||
|
||||
@ -672,3 +672,24 @@ def test_get_text_when_recursion_limit_activated():
|
||||
last_child = last_child.children[0]
|
||||
|
||||
assert last_child.to_text() == "some text"
|
||||
|
||||
|
||||
def test_uncategorizedtest_has_image_and_no_text():
|
||||
# language=HTML
|
||||
base_html = _wrap_with_body(
|
||||
"""
|
||||
<div class="Page">
|
||||
<div class="UncategorizedText">
|
||||
<img src="https://www.example.com/image.jpg"/>
|
||||
</div>
|
||||
</div>
|
||||
"""
|
||||
)
|
||||
|
||||
base_html = indent_html(base_html)
|
||||
|
||||
ontology: OntologyElement = parse_html_to_ontology(base_html)
|
||||
|
||||
element = ontology.children[0].children[0]
|
||||
assert type(element) is Image
|
||||
assert element.css_class_name == "Image"
|
||||
|
@ -1 +1 @@
|
||||
__version__ = "0.17.1" # pragma: no cover
|
||||
__version__ = "0.17.2" # pragma: no cover
|
||||
|
@ -437,6 +437,11 @@ def extract_tag_and_ontology_class_from_tag(
|
||||
html_tag = "span"
|
||||
element_class = ontology.UncategorizedText
|
||||
|
||||
# Scenario 5: UncategorizedText has image and no text
|
||||
# Typically, this happens with a span or div tag with an image inside
|
||||
if element_class == ontology.UncategorizedText and soup.find("img") and not soup.text.strip():
|
||||
element_class = ontology.Image
|
||||
|
||||
return html_tag, element_class
|
||||
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user