mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-06 20:13:04 +00:00
feat: keep img tag's class attr (#4050)
This change affects HTML partitioning. Previously, when there was a table in the HTML, we stripped the `class` and `id` attributes from every tag inside the table. However, tables sometimes contain images (`img` tags) whose `class` attribute carries important information about the image. This change preserves the `class` attribute for `img` tags inside a table. The change is reflected in a table element's `metadata.text_as_html` attribute.
This commit is contained in:
parent
7764fb6fd4
commit
73d239fb28
@ -1,8 +1,9 @@
|
|||||||
## 0.18.5-dev1
|
## 0.18.5
|
||||||
|
|
||||||
### Enhancements
|
### Enhancements
|
||||||
|
|
||||||
- **Bump dependencies and remove lingering Python 3.9 artifacts** Cleaned up some references to 3.9 that were left over when we dropped Python 3.9 support.
|
- **Bump dependencies and remove lingering Python 3.9 artifacts** Cleaned up some references to 3.9 that were left over when we dropped Python 3.9 support.
|
||||||
|
- **`text_as_html` for Table elements now keeps the `img` tag's `class` attribute** Previously, when partitioning HTML, every tag inside a table was stripped of its `class` attribute. Now this attribute is preserved for `img` tags in the table element's `metadata.text_as_html`.
|
||||||
|
|
||||||
### Features
|
### Features
|
||||||
|
|
||||||
|
|||||||
@ -1,6 +1,7 @@
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
from unstructured.chunking.basic import chunk_elements
|
from unstructured.chunking.basic import chunk_elements
|
||||||
from unstructured.chunking.title import chunk_by_title
|
from unstructured.chunking.title import chunk_by_title
|
||||||
@ -13,6 +14,7 @@ from unstructured.documents.ontology import (
|
|||||||
Paragraph,
|
Paragraph,
|
||||||
Section,
|
Section,
|
||||||
Table,
|
Table,
|
||||||
|
remove_ids_and_class_from_table,
|
||||||
)
|
)
|
||||||
from unstructured.embed.openai import OpenAIEmbeddingConfig, OpenAIEmbeddingEncoder
|
from unstructured.embed.openai import OpenAIEmbeddingConfig, OpenAIEmbeddingEncoder
|
||||||
from unstructured.partition.html import partition_html
|
from unstructured.partition.html import partition_html
|
||||||
@ -24,6 +26,37 @@ from unstructured.partition.json import partition_json
|
|||||||
from unstructured.staging.base import elements_from_json
|
from unstructured.staging.base import elements_from_json
|
||||||
|
|
||||||
|
|
||||||
|
def test_remove_ids_and_class_from_table():
|
||||||
|
html_text = """
|
||||||
|
<table>
|
||||||
|
<tr class="TableRow">
|
||||||
|
<td><img class="Signature" alt="cell 1"/></td>
|
||||||
|
<td>cell 2</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td><IMG class="Signature" alt="cell 3"/></td>
|
||||||
|
<td>cell 4</td>
|
||||||
|
</tr>
|
||||||
|
</table>
|
||||||
|
"""
|
||||||
|
soup = BeautifulSoup(html_text, "html.parser")
|
||||||
|
assert (
|
||||||
|
str(remove_ids_and_class_from_table(soup))
|
||||||
|
== """
|
||||||
|
<table>
|
||||||
|
<tr>
|
||||||
|
<td><img alt="cell 1" class="Signature"/></td>
|
||||||
|
<td>cell 2</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td><img alt="cell 3" class="Signature"/></td>
|
||||||
|
<td>cell 4</td>
|
||||||
|
</tr>
|
||||||
|
</table>
|
||||||
|
"""
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def test_page_number_is_passed_correctly():
|
def test_page_number_is_passed_correctly():
|
||||||
ontology = Document(
|
ontology = Document(
|
||||||
children=[
|
children=[
|
||||||
|
|||||||
@ -1 +1 @@
|
|||||||
__version__ = "0.18.5-dev1" # pragma: no cover
|
__version__ = "0.18.5" # pragma: no cover
|
||||||
|
|||||||
@ -147,12 +147,23 @@ class OntologyElement(BaseModel):
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def remove_ids_and_class_from_table(soup: BeautifulSoup):
|
def remove_ids_and_class_from_table(soup: BeautifulSoup) -> BeautifulSoup:
|
||||||
|
"""
|
||||||
|
Remove id and class attributes from tags inside tables,
|
||||||
|
except preserve class attributes for img tags.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
soup: BeautifulSoup object containing the HTML
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
BeautifulSoup: Modified soup with attributes removed
|
||||||
|
"""
|
||||||
for tag in soup.find_all(True):
|
for tag in soup.find_all(True):
|
||||||
if tag.name == "table": # type: ignore
|
if tag.name.lower() == "table": # type: ignore
|
||||||
continue # We keep table tag
|
continue # We keep table tag
|
||||||
tag.attrs.pop("class", None) # type: ignore
|
|
||||||
tag.attrs.pop("id", None) # type: ignore
|
tag.attrs.pop("id", None) # type: ignore
|
||||||
|
if tag.name.lower() != "img": # type: ignore
|
||||||
|
tag.attrs.pop("class", None) # type: ignore
|
||||||
return soup
|
return soup
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user