mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-01 09:40:31 +00:00
feat: keep img tag's class attr (#4050)
This change affects HTML partitioning. Previously, when there was a table in the HTML, we stripped the `class` and `id` attributes from every tag inside the table. However, a table sometimes contains images (`img` tags) whose `class` attribute carries important information about the image. This change preserves the `class` attribute for `img` tags inside a table. The change is reflected in a table element's `metadata.text_as_html` attribute.
This commit is contained in:
parent
7764fb6fd4
commit
73d239fb28
@ -1,8 +1,9 @@
|
||||
## 0.18.5-dev1
|
||||
## 0.18.5
|
||||
|
||||
### Enhancements
|
||||
|
||||
- **Bump dependencies and remove lingering Python 3.9 artifacts** Cleaned up some references to 3.9 that were left behind when we dropped Python 3.9 support.
|
||||
- **`text_as_html` for Table element now keeps `img` tag's `class` attribute** Previously, when partitioning HTML, every tag inside a table was stripped of its `class` attribute. Now this attribute is preserved for `img` tags in the table element's `metadata.text_as_html`.
|
||||
|
||||
### Features
|
||||
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from unstructured.chunking.basic import chunk_elements
|
||||
from unstructured.chunking.title import chunk_by_title
|
||||
@ -13,6 +14,7 @@ from unstructured.documents.ontology import (
|
||||
Paragraph,
|
||||
Section,
|
||||
Table,
|
||||
remove_ids_and_class_from_table,
|
||||
)
|
||||
from unstructured.embed.openai import OpenAIEmbeddingConfig, OpenAIEmbeddingEncoder
|
||||
from unstructured.partition.html import partition_html
|
||||
@ -24,6 +26,37 @@ from unstructured.partition.json import partition_json
|
||||
from unstructured.staging.base import elements_from_json
|
||||
|
||||
|
||||
def test_remove_ids_and_class_from_table():
|
||||
html_text = """
|
||||
<table>
|
||||
<tr class="TableRow">
|
||||
<td><img class="Signature" alt="cell 1"/></td>
|
||||
<td>cell 2</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><IMG class="Signature" alt="cell 3"/></td>
|
||||
<td>cell 4</td>
|
||||
</tr>
|
||||
</table>
|
||||
"""
|
||||
soup = BeautifulSoup(html_text, "html.parser")
|
||||
assert (
|
||||
str(remove_ids_and_class_from_table(soup))
|
||||
== """
|
||||
<table>
|
||||
<tr>
|
||||
<td><img alt="cell 1" class="Signature"/></td>
|
||||
<td>cell 2</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><img alt="cell 3" class="Signature"/></td>
|
||||
<td>cell 4</td>
|
||||
</tr>
|
||||
</table>
|
||||
"""
|
||||
)
|
||||
|
||||
|
||||
def test_page_number_is_passed_correctly():
|
||||
ontology = Document(
|
||||
children=[
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.18.5-dev1" # pragma: no cover
|
||||
__version__ = "0.18.5" # pragma: no cover
|
||||
|
||||
@ -147,12 +147,23 @@ class OntologyElement(BaseModel):
|
||||
return None
|
||||
|
||||
|
||||
def remove_ids_and_class_from_table(soup: BeautifulSoup) -> BeautifulSoup:
    """Strip ``id`` and ``class`` attributes from the tags in ``soup``.

    Every tag returned by ``soup.find_all(True)`` is visited:

    - ``table`` tags are left untouched.
    - All other tags lose their ``id`` attribute.
    - All other tags except ``img`` also lose their ``class`` attribute.

    Tag names are compared case-insensitively. The soup is modified in place
    and the same object is returned.

    Args:
        soup: BeautifulSoup object containing the HTML.

    Returns:
        The same ``soup`` object, with the attributes removed.
    """
    for tag in soup.find_all(True):
        tag_name = tag.name.lower()  # type: ignore
        if tag_name == "table":
            continue  # We keep table tag
        tag.attrs.pop("id", None)  # type: ignore
        if tag_name != "img":
            # An img tag's class can identify important information about the
            # image, so only non-img tags have it removed.
            tag.attrs.pop("class", None)  # type: ignore
    return soup
|
||||
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user