mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-06-27 02:30:08 +00:00

This PR:

- Fixes the removal of HTML tags that exist in `<td>` cells.
- The stripping function was, in general, problematic to implement in an easy and straightforward way (you can't modify `descendants` in place). So instead of patching something specific to table cells, I added stripping everywhere in the same consistent way. This is why some tests needed small edits that remove one whitespace character in each tag. I believe this won't cause any problems for downstream tasks.

Tested HTML:

```html
<table class="Table">
    <tbody>
        <tr>
            <td colspan="2">
                Some text
            </td>
            <td>
                <input checked="" class="Checkbox" type="checkbox"/>
            </td>
        </tr>
    </tbody>
</table>
```

Before & After:

```html
'<table class="Table" id="..."> <tbody> <tr> <td colspan="2">Some text</td><td></td></tr></tbody></table>'
'<table class="Table" id="..."><tbody><tr><td colspan="2">Some text</td><td><input checked="" type="checkbox"/></td></tr></tbody></table>'
```
76 lines
2.0 KiB
Python
76 lines
2.0 KiB
Python
from unstructured.partition.html import partition_html
|
|
|
|
|
|
def test_alternative_image_text_can_be_included():
    """Check that ``image_alt_mode`` decides whether image alt text is emitted.

    The same snippet is partitioned twice with ``html_parser_version="v2"``.
    Each result is unpacked into exactly two elements and the ``text`` of the
    second one is inspected: with ``image_alt_mode="to_text"`` it must contain
    the alt text, with ``image_alt_mode=None`` it must not.
    """
    alt_text = "ALT TEXT Logo"
    # language=HTML
    html = """
    <div class="Page">
        <img src="my-logo.png" alt="ALT TEXT Logo"/>
    </div>
    """
    shared_kwargs = {"text": html, "html_parser_version": "v2"}

    # Two-name unpacking doubles as a check that exactly two elements come back.
    _, element_with_alt = partition_html(image_alt_mode="to_text", **shared_kwargs)
    assert alt_text in element_with_alt.text

    _, element_without_alt = partition_html(image_alt_mode=None, **shared_kwargs)
    assert alt_text not in element_without_alt.text
|
|
|
|
|
|
def test_alternative_image_text_can_be_included_when_nested_in_paragraph():
    """Check ``image_alt_mode`` handling for an ``<img>`` wrapped in a ``<p>``.

    The snippet is partitioned twice with ``html_parser_version="v2"``; each
    result is unpacked into exactly two elements and the second one's ``text``
    is inspected. The alt text must be present for ``image_alt_mode="to_text"``
    and absent for ``image_alt_mode=None``.
    """
    alt_text = "ALT TEXT Logo"
    # language=HTML
    html = """
    <div class="Page">
        <p class="Paragraph">
            <img src="my-logo.png" alt="ALT TEXT Logo"/>
        </p>
    </div>
    """
    shared_kwargs = {"text": html, "html_parser_version": "v2"}

    # Two-name unpacking doubles as a check that exactly two elements come back.
    _, paragraph_with_alt = partition_html(image_alt_mode="to_text", **shared_kwargs)
    assert alt_text in paragraph_with_alt.text

    _, paragraph_without_alt = partition_html(image_alt_mode=None, **shared_kwargs)
    assert alt_text not in paragraph_without_alt.text
|
|
|
|
|
|
def test_attr_and_html_inside_table_cell_is_kept():
    """Check that markup and attributes inside table cells survive partitioning.

    A table with a ``colspan`` cell and a cell holding a checkbox ``<input>`` is
    partitioned with ``html_parser_version="v2"``. The result is unpacked into
    exactly two elements, and the second one's ``metadata.text_as_html`` must
    still contain the ``<input>`` tag (without its ``class`` attribute) and the
    ``colspan="2"`` attribute.
    """
    # language=HTML
    html = """
    <div class="Page">
        <table class="Table">
            <tbody>
                <tr>
                    <td colspan="2">
                        Some text
                    </td>
                    <td>
                        <input checked="" class="Checkbox" type="checkbox"/>
                    </td>
                </tr>
            </tbody>
        </table>
    </div>
    """
    # Two-name unpacking doubles as a check that exactly two elements come back;
    # only the second one is inspected here.
    _, table = partition_html(
        text=html,
        image_alt_mode="to_text",
        html_parser_version="v2",
    )
    table_html = table.metadata.text_as_html

    assert '<input checked="" type="checkbox"/>' in table_html  # class is removed
    assert 'colspan="2"' in table_html
|