feat: keep img tag's class attr (#4050)

This change affects partition html.

Previously, when there was a table in the HTML, we stripped the `class`
and `id` attributes from every tag inside the table. However, a table
sometimes contains images (`img` tags) whose `class` attribute carries
important information about the image. This change preserves the
`class` attribute for `img` tags inside a table. The change is
reflected in a table element's `metadata.text_as_html`
attribute.
This commit is contained in:
Yao You 2025-07-10 15:46:28 -05:00 committed by GitHub
parent 7764fb6fd4
commit 73d239fb28
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 50 additions and 5 deletions

View File

@ -1,8 +1,9 @@
## 0.18.5-dev1
## 0.18.5
### Enhancements
- **Bump dependencies and remove lingering Python 3.9 artifacts** Cleaned up some references to 3.9 that were left behind when we dropped Python 3.9 support.
- **`text_as_html` for Table element now keeps `img` tag's `class` attribute** Previously, when partitioning HTML, every tag inside a table was stripped of its `class` attribute. Now this attribute is preserved for `img` tags in the table element's `metadata.text_as_html`.
### Features

View File

@ -1,6 +1,7 @@
from pathlib import Path
import pytest
from bs4 import BeautifulSoup
from unstructured.chunking.basic import chunk_elements
from unstructured.chunking.title import chunk_by_title
@ -13,6 +14,7 @@ from unstructured.documents.ontology import (
Paragraph,
Section,
Table,
remove_ids_and_class_from_table,
)
from unstructured.embed.openai import OpenAIEmbeddingConfig, OpenAIEmbeddingEncoder
from unstructured.partition.html import partition_html
@ -24,6 +26,37 @@ from unstructured.partition.json import partition_json
from unstructured.staging.base import elements_from_json
# Checks remove_ids_and_class_from_table on a small two-row table: the `class`
# on the first <tr> is expected to be dropped, while both image tags (the
# second one written in upper case as <IMG>) are expected to keep
# `class="Signature"`.
def test_remove_ids_and_class_from_table():
html_text = """
<table>
<tr class="TableRow">
<td><img class="Signature" alt="cell 1"/></td>
<td>cell 2</td>
</tr>
<tr>
<td><IMG class="Signature" alt="cell 3"/></td>
<td>cell 4</td>
</tr>
</table>
"""
soup = BeautifulSoup(html_text, "html.parser")
# The comparison below is an exact string equality on the serialized soup, so
# whitespace inside both triple-quoted literals is significant.
# The expected markup spells the tag as lowercase `img` in both rows.
# NOTE(review): the expected markup lists `alt` before `class`, whereas the
# input lists `class` first -- confirm this matches the serializer's output.
assert (
str(remove_ids_and_class_from_table(soup))
== """
<table>
<tr>
<td><img alt="cell 1" class="Signature"/></td>
<td>cell 2</td>
</tr>
<tr>
<td><img alt="cell 3" class="Signature"/></td>
<td>cell 4</td>
</tr>
</table>
"""
)
def test_page_number_is_passed_correctly():
ontology = Document(
children=[

View File

@ -1 +1 @@
# Diff view of a one-line hunk: the first assignment is the line being removed
# and the second is its replacement. Read top to bottom, the value that remains
# bound is "0.18.5".
__version__ = "0.18.5-dev1" # pragma: no cover
__version__ = "0.18.5" # pragma: no cover

View File

@ -147,12 +147,23 @@ class OntologyElement(BaseModel):
return None
def remove_ids_and_class_from_table(soup: BeautifulSoup) -> BeautifulSoup:
    """Strip ``id`` and ``class`` attributes from the tags found in ``soup``.

    Every tag returned by ``soup.find_all(True)`` is visited:

    * ``table`` tags are skipped entirely, so they keep both attributes.
    * ``img`` tags lose ``id`` but keep ``class``.
    * Every other tag loses both ``id`` and ``class``.

    Tag names are compared case-insensitively.

    Args:
        soup: Parsed HTML to clean. It is modified in place.

    Returns:
        The same ``soup`` object that was passed in, after modification.
    """
    for tag in soup.find_all(True):
        # Lower-case once so "IMG"/"TABLE" are treated like "img"/"table".
        tag_name = tag.name.lower()  # type: ignore
        if tag_name == "table":
            continue  # We keep table tag
        tag.attrs.pop("id", None)  # type: ignore
        # The class on an image can carry information about the image, so it
        # is the one exception to the class removal.
        if tag_name != "img":
            tag.attrs.pop("class", None)  # type: ignore
    return soup