feat: keep img tag's class attr (#4050)

This change affects `partition_html`. Previously, when the HTML contained a table, we stripped the `class` and `id` attributes from every tag inside the table. However, tables sometimes contain images (`img` tags) whose `class` attribute carries important information about the image. This change preserves the `class` attribute of `img` tags inside a table; their `id` attribute is still removed. The change is reflected in a table element's `metadata.text_as_html` attribute.
2025-12-01 09:40:31 +00:00 · 2025-07-10 15:46:28 -05:00 · 2025-07-10 15:46:28 -05:00 · 73d239fb28
commit 73d239fb28
parent 7764fb6fd4
4 changed files with 50 additions and 5 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,8 +1,9 @@
-## 0.18.5-dev1
+## 0.18.5

 ### Enhancements

 - **Bump dependencies and remove lingering Python 3.9 artifacts** Cleaned up some references to 3.9 that were left When we dropped Python 3.9 support.
+- **`text_as_html` for Table element now keeps `img` tag's `class` attribute** Previously, when partitioning HTML, every tag inside a table was stripped of its `class` attribute. Now this attribute is preserved for `img` tags in the table element's `metadata.text_as_html`.

 ### Features

--- a/test_unstructured/documents/test_ontology_to_unstructured_parsing.py
+++ b/test_unstructured/documents/test_ontology_to_unstructured_parsing.py
@ -1,6 +1,7 @@
 from pathlib import Path

 import pytest
+from bs4 import BeautifulSoup

 from unstructured.chunking.basic import chunk_elements
 from unstructured.chunking.title import chunk_by_title
@ -13,6 +14,7 @@ from unstructured.documents.ontology import (
    Paragraph,
    Section,
    Table,
+    remove_ids_and_class_from_table,
 )
 from unstructured.embed.openai import OpenAIEmbeddingConfig, OpenAIEmbeddingEncoder
 from unstructured.partition.html import partition_html
@ -24,6 +26,37 @@ from unstructured.partition.json import partition_json
 from unstructured.staging.base import elements_from_json


+def test_remove_ids_and_class_from_table():
+    html_text = """
+    <table>
+        <tr class="TableRow">
+            <td><img class="Signature" alt="cell 1"/></td>
+            <td>cell 2</td>
+        </tr>
+        <tr>
+            <td><IMG class="Signature" alt="cell 3"/></td>
+            <td>cell 4</td>
+        </tr>
+    </table>
+    """
+    soup = BeautifulSoup(html_text, "html.parser")
+    assert (
+        str(remove_ids_and_class_from_table(soup))
+        == """
+<table>
+<tr>
+<td><img alt="cell 1" class="Signature"/></td>
+<td>cell 2</td>
+</tr>
+<tr>
+<td><img alt="cell 3" class="Signature"/></td>
+<td>cell 4</td>
+</tr>
+</table>
+"""
+    )
+
+
 def test_page_number_is_passed_correctly():
    ontology = Document(
        children=[
--- a/unstructured/version.py
+++ b/unstructured/version.py
@ -1 +1 @@
-__version__ = "0.18.5-dev1"  # pragma: no cover
+__version__ = "0.18.5"  # pragma: no cover
--- a/unstructured/documents/ontology.py
+++ b/unstructured/documents/ontology.py
@ -147,12 +147,23 @@ class OntologyElement(BaseModel):
        return None


-def remove_ids_and_class_from_table(soup: BeautifulSoup):
+def remove_ids_and_class_from_table(soup: BeautifulSoup) -> BeautifulSoup:
+    """
+    Remove id and class attributes from tags inside tables,
+    except preserve class attributes for img tags.
+
+    Args:
+        soup: BeautifulSoup object containing the HTML
+
+    Returns:
+        BeautifulSoup: Modified soup with attributes removed
+    """
    for tag in soup.find_all(True):
-        if tag.name == "table":  # type: ignore
+        if tag.name.lower() == "table":  # type: ignore
            continue  # We keep table tag
-        tag.attrs.pop("class", None)  # type: ignore
        tag.attrs.pop("id", None)  # type: ignore
+        if tag.name.lower() != "img":  # type: ignore
+            tag.attrs.pop("class", None)  # type: ignore
    return soup