Fix parsing table cells (#3904)

This PR:
- Fixes a bug where HTML tags inside `<td>` cells were removed.
- Whitespace stripping was hard to implement in a simple, straightforward
way for table cells alone (you can't modify `descendants` in-place). So
instead of patching only the table-cell case, I added stripping
everywhere, in one consistent way. This is why some tests needed small
edits that remove one whitespace character inside each tag. I believe
this won't cause any problems for downstream tasks.

Tested HTML:
```html
<table class="Table">
    <tbody>
        <tr>
            <td colspan="2">
                Some text                                        
            </td>
            <td>
                <input checked="" class="Checkbox" type="checkbox"/>
            </td>
        </tr>
    </tbody>
</table>
```
Before & After
```html
'<table class="Table" id="..."> <tbody> <tr> <td colspan="2">Some text</td><td></td></tr></tbody></table>'
'<table class="Table" id="..."><tbody><tr><td colspan="2">Some text</td><td><input checked="" type="checkbox"/></td></tr></tbody></table>'
```
This commit is contained in:
Pluto 2025-02-05 16:28:49 +01:00 committed by GitHub
parent 451ad97ce2
commit 5bb95b5841
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
11 changed files with 581 additions and 120 deletions

View File

@ -1,13 +1,14 @@
## 0.16.19-dev2
## 0.16.19-dev3
### Enhancements
### Features
### Fixes
- **fix a bug where table extraction is skipped when it shouldn't**. Pages with just one table as its content or starts with a table misses table extraction. The routing logic is now fixed.
- **Fix a bug where table extraction is skipped when it shouldn't**. Pages with just one table as its content or starts with a table misses table extraction. The routing logic is now fixed.
- **Correct deprecated `ruff` invocation in `make tidy`**. This will future-proof it or avoid surprises if someone happens to upgrade Ruff.
- **Remove upper bound constraint on python version** in setup.py. Python3.13 is not yet officially supported, but allow users to try.
- **Fix HTML elements being removed from inside table cells** in html partition v=2.0. The HTML partitioner now correctly preserves HTML elements inside table cells.
## 0.16.17

View File

@ -4,6 +4,10 @@
"metadata": {
"category_depth": 0,
"filename": "example.pdf",
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1,
"parent_id": "897a8a47377c4ad6aab839a929879537",
"text_as_html": "<div class=\"Page\" data-page-number=\"1\" id=\"3a6b156a81764e17be128264241f8136\" />"
@ -16,6 +20,10 @@
"metadata": {
"category_depth": 1,
"filename": "example.pdf",
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1,
"parent_id": "3a6b156a81764e17be128264241f8136",
"text_as_html": "<header class=\"Header\" id=\"45b3d0053468484ba1c7b53998115412\" />"
@ -28,9 +36,13 @@
"metadata": {
"category_depth": 2,
"filename": "example.pdf",
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1,
"parent_id": "45b3d0053468484ba1c7b53998115412",
"text_as_html": "<h1 class=\"Title\" id=\"c95473e8a3704fc2b418697f9fddb27b\">Header </h1>"
"text_as_html": "<h1 class=\"Title\" id=\"c95473e8a3704fc2b418697f9fddb27b\">Header</h1>"
},
"text": "Header",
"type": "Title"
@ -40,9 +52,13 @@
"metadata": {
"category_depth": 2,
"filename": "example.pdf",
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1,
"parent_id": "45b3d0053468484ba1c7b53998115412",
"text_as_html": "<time class=\"CalendarDate\" id=\"379cbfdc16d44bd6a59e6cfabe6438d5\">Date: October 30, 2023 </time>"
"text_as_html": "<time class=\"CalendarDate\" id=\"379cbfdc16d44bd6a59e6cfabe6438d5\">Date: October 30, 2023</time>"
},
"text": "Date: October 30, 2023",
"type": "UncategorizedText"
@ -52,9 +68,13 @@
"metadata": {
"category_depth": 1,
"filename": "example.pdf",
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1,
"parent_id": "3a6b156a81764e17be128264241f8136",
"text_as_html": "<form class=\"Form\" id=\"637c2f6935fb4353a5f73025ce04619d\"> <label class=\"FormField\" for=\"company-name\" id=\"50027cccbe1948c9853ce0de37b635c2\">From field name </label><input class=\"FormFieldValue\" id=\"0032242af75c4b37984ea7fea9aac74c\" value=\"Example value\" /></form>"
"text_as_html": "<form class=\"Form\" id=\"637c2f6935fb4353a5f73025ce04619d\"><label class=\"FormField\" for=\"company-name\" id=\"50027cccbe1948c9853ce0de37b635c2\">From field name</label><input class=\"FormFieldValue\" id=\"0032242af75c4b37984ea7fea9aac74c\" value=\"Example value\" /></form>"
},
"text": "From field name Example value",
"type": "UncategorizedText"
@ -64,6 +84,10 @@
"metadata": {
"category_depth": 1,
"filename": "example.pdf",
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1,
"parent_id": "3a6b156a81764e17be128264241f8136",
"text_as_html": "<section class=\"Section\" id=\"592422373ed741b68a077e2003f8ed81\" />"
@ -76,9 +100,13 @@
"metadata": {
"category_depth": 2,
"filename": "example.pdf",
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1,
"parent_id": "592422373ed741b68a077e2003f8ed81",
"text_as_html": "<table class=\"Table\" id=\"dc3792d4422e444f90876b56d0cfb20d\"> <thead> <tr> <th>Description</th><th>Row header</th></tr></thead><tbody> <tr> <td>Value description</td><td>50 $ (1.32 %)</td></tr></tbody></table>"
"text_as_html": "<table class=\"Table\" id=\"dc3792d4422e444f90876b56d0cfb20d\"><thead><tr><th>Description</th><th>Row header</th></tr></thead><tbody><tr><td>Value description</td><td><span>50 $</span><span>(1.32 %)</span></td></tr></tbody></table>"
},
"text": "Description Row header Value description 50 $ (1.32 %)",
"type": "Table"
@ -88,6 +116,10 @@
"metadata": {
"category_depth": 1,
"filename": "example.pdf",
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1,
"parent_id": "3a6b156a81764e17be128264241f8136",
"text_as_html": "<section class=\"Section\" id=\"1032242af75c4b37984ea7fea9aac74c\" />"
@ -100,9 +132,13 @@
"metadata": {
"category_depth": 2,
"filename": "example.pdf",
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1,
"parent_id": "1032242af75c4b37984ea7fea9aac74c",
"text_as_html": "<h2 class=\"Subtitle\" id=\"2a4e2c4a689f4f9a8c180b6b521e45c3\">2. Subtitle </h2>"
"text_as_html": "<h2 class=\"Subtitle\" id=\"2a4e2c4a689f4f9a8c180b6b521e45c3\">2. Subtitle</h2>"
},
"text": "2. Subtitle",
"type": "Title"
@ -112,9 +148,13 @@
"metadata": {
"category_depth": 2,
"filename": "example.pdf",
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1,
"parent_id": "1032242af75c4b37984ea7fea9aac74c",
"text_as_html": "<p class=\"NarrativeText\" id=\"5591f7a4df01447e82515ce45f686fbe\">Paragraph text </p>"
"text_as_html": "<p class=\"NarrativeText\" id=\"5591f7a4df01447e82515ce45f686fbe\">Paragraph text</p>"
},
"text": "Paragraph text",
"type": "NarrativeText"

View File

@ -3,6 +3,10 @@
"element_id": "3a6b156a81764e17be128264241f8136",
"metadata": {
"category_depth": 0,
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1,
"parent_id": "897a8a47377c4ad6aab839a929879537",
"text_as_html": "<div class=\"Page\" data-page-number=\"1\" id=\"3a6b156a81764e17be128264241f8136\" />"
@ -14,6 +18,10 @@
"element_id": "45b3d0053468484ba1c7b53998115412",
"metadata": {
"category_depth": 1,
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1,
"parent_id": "3a6b156a81764e17be128264241f8136",
"text_as_html": "<header class=\"Header\" id=\"45b3d0053468484ba1c7b53998115412\" />"
@ -25,9 +33,13 @@
"element_id": "6cd3c1ba79654abb9c86162b6d1dae46",
"metadata": {
"category_depth": 2,
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1,
"parent_id": "45b3d0053468484ba1c7b53998115412",
"text_as_html": "<p class=\"NarrativeText\" id=\"6cd3c1ba79654abb9c86162b6d1dae46\">Table of Contents </p> <address class=\"Address\" id=\"7d7541d9943c4ad0b88bc47fd0b29e4a\">68 Prince Street Palmdale, CA 93550 </address> <a class=\"Hyperlink\" id=\"fde2621aa3df4b159bf305566110cca4\">www.google.com </a>"
"text_as_html": "<p class=\"NarrativeText\" id=\"6cd3c1ba79654abb9c86162b6d1dae46\">Table of Contents</p><address class=\"Address\" id=\"7d7541d9943c4ad0b88bc47fd0b29e4a\">68 Prince Street Palmdale, CA 93550</address><a class=\"Hyperlink\" id=\"fde2621aa3df4b159bf305566110cca4\">www.google.com</a>"
},
"text": "Table of Contents 68 Prince Street Palmdale, CA 93550 www.google.com",
"type": "NarrativeText"
@ -36,9 +48,13 @@
"element_id": "cb0d6675109241428778c7b996e0b21c",
"metadata": {
"category_depth": 2,
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1,
"parent_id": "45b3d0053468484ba1c7b53998115412",
"text_as_html": "<span class=\"UncategorizedText\" id=\"cb0d6675109241428778c7b996e0b21c\">More text </span>"
"text_as_html": "<span class=\"UncategorizedText\" id=\"cb0d6675109241428778c7b996e0b21c\">More text</span>"
},
"text": "More text",
"type": "UncategorizedText"

View File

@ -8,7 +8,7 @@
"eng"
],
"parent_id": "517f8559ba594270bdd67e1b02bf19a2",
"text_as_html": "<table class=\"Table\" id=\"2428404551304d4db5925f6afee11ed5\"> <tr> <th>Header 1</th><th>Header 2</th></tr><tr> <td>Row 1, Cell 1</td><td>Row 1, Cell 2</td></tr><tr> <td>Row 2, Cell 1</td><td>Row 2, Cell 2</td></tr></table>"
"text_as_html": "<table class=\"Table\" id=\"2428404551304d4db5925f6afee11ed5\"><tr><th>Header 1</th><th>Header 2</th></tr><tr><td>Row 1, Cell 1</td><td>Row 1, Cell 2</td></tr><tr><td>Row 2, Cell 1</td><td>Row 2, Cell 2</td></tr></table>"
},
"text": "Header 1 Header 2 Row 1, Cell 1 Row 1, Cell 2 Row 2, Cell 1 Row 2, Cell 2",
"type": "Table"
@ -22,7 +22,7 @@
"eng"
],
"parent_id": "517f8559ba594270bdd67e1b02bf19a2",
"text_as_html": "<table class=\"Table\" id=\"9f91cae321c74b31bb1c83ac86cd7afb\"> <tr> <th colspan=\"3\">Big Table Header</th></tr><tr> <td rowspan=\"2\">Merged Cell 1</td><td>Cell 2</td><td>Cell 3</td></tr><tr> <td colspan=\"2\">Merged Cell 4 and 5</td></tr><tr> <td>Cell 6</td><td>Cell 7</td><td>Cell 8</td></tr><tr> <td>Cell 9</td><td colspan=\"2\">A cell with a lot of text. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.</td></tr><tr> <td>Cell 10</td><td>Cell 11</td><td>Cell 12</td></tr></table>"
"text_as_html": "<table class=\"Table\" id=\"9f91cae321c74b31bb1c83ac86cd7afb\"><tr><th colspan=\"3\">Big Table Header</th></tr><tr><td rowspan=\"2\">Merged Cell 1</td><td>Cell 2</td><td>Cell 3</td></tr><tr><td colspan=\"2\">Merged Cell 4 and 5</td></tr><tr><td>Cell 6</td><td>Cell 7</td><td>Cell 8</td></tr><tr><td>Cell 9</td><td colspan=\"2\">A cell with a lot of text. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.</td></tr><tr><td>Cell 10</td><td>Cell 11</td><td>Cell 12</td></tr></table>"
},
"text": "Big Table Header Merged Cell 1 Cell 2 Cell 3 Merged Cell 4 and 5 Cell 6 Cell 7 Cell 8 Cell 9 A cell with a lot of text. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Cell 10 Cell 11 Cell 12",
"type": "Table"
@ -36,7 +36,7 @@
"eng"
],
"parent_id": "517f8559ba594270bdd67e1b02bf19a2",
"text_as_html": "<table class=\"TableOfContents\" id=\"da6c34391e544b3480e45d68f40870fa\"> <tr> <th>Chapter</th><th>Title</th><th>Page</th></tr><tr> <td>1</td><td>Introduction</td><td>1</td></tr><tr> <td>2</td><td>Getting Started</td><td>5</td></tr><tr> <td>3</td><td>Basic Concepts</td><td>12</td></tr><tr> <td>4</td><td>Advanced Topics</td><td>25</td></tr><tr> <td>5</td><td>Conclusion</td><td>40</td></tr></table>"
"text_as_html": "<table class=\"TableOfContents\" id=\"da6c34391e544b3480e45d68f40870fa\"><tr><th>Chapter</th><th>Title</th><th>Page</th></tr><tr><td>1</td><td>Introduction</td><td>1</td></tr><tr><td>2</td><td>Getting Started</td><td>5</td></tr><tr><td>3</td><td>Basic Concepts</td><td>12</td></tr><tr><td>4</td><td>Advanced Topics</td><td>25</td></tr><tr><td>5</td><td>Conclusion</td><td>40</td></tr></table>"
},
"text": "Chapter Title Page 1 Introduction 1 2 Getting Started 5 3 Basic Concepts 12 4 Advanced Topics 25 5 Conclusion 40",
"type": "Table"

View File

@ -476,10 +476,14 @@ def test_table_and_time():
<tbody>
<tr>
<td colspan="5">
<time>
June 30, 2023
</time>
</td>
<td>
<span>
$
</span>
</td>
</tr>
</tbody>

View File

@ -90,7 +90,7 @@ def test_simple_narrative_text_with_id():
detection_origin="vlm_partitioner",
metadata=ElementMetadata(
text_as_html='<p class="NarrativeText" '
'id="73cd7b4a-2444-4910-87a4-138117dfaab9">DEALER ONLY </p>',
'id="73cd7b4a-2444-4910-87a4-138117dfaab9">DEALER ONLY</p>',
parent_id="1",
),
)
@ -158,7 +158,7 @@ def test_multiple_elements():
detection_origin="vlm_partitioner",
element_id="2",
metadata=ElementMetadata(
text_as_html='<p class="Paragraph" id="2">About the same </p>',
text_as_html='<p class="Paragraph" id="2">About the same</p>',
parent_id="1",
),
),
@ -178,7 +178,7 @@ def test_multiple_elements():
text="Some text",
detection_origin="vlm_partitioner",
metadata=ElementMetadata(
text_as_html='<p class="Paragraph" ' 'id="4">Some text </p>',
text_as_html='<p class="Paragraph" ' 'id="4">Some text</p>',
parent_id="1",
),
),
@ -226,7 +226,7 @@ def test_multiple_pages():
detection_origin="vlm_partitioner",
element_id="2",
metadata=ElementMetadata(
text_as_html='<p class="Paragraph" id="2">Some text </p>', parent_id="1"
text_as_html='<p class="Paragraph" id="2">Some text</p>', parent_id="1"
),
),
Text(
@ -242,7 +242,7 @@ def test_multiple_pages():
detection_origin="vlm_partitioner",
element_id="4",
metadata=ElementMetadata(
text_as_html='<p class="Paragraph" id="4">Another text </p>', parent_id="3"
text_as_html='<p class="Paragraph" id="4">Another text</p>', parent_id="3"
),
),
]
@ -279,13 +279,13 @@ def test_forms():
detection_origin="vlm_partitioner",
metadata=ElementMetadata(
text_as_html=""
'<form class="Form" id="2"> '
'<form class="Form" id="2">'
'<label class="FormField" '
'for="option1" id="3"> '
'for="option1" id="3">'
'<input class="FormFieldValue" type="radio" '
'name="options" value="2" id="4" checked />'
'<p class="Paragraph" id="5">'
"Option 1 (Checked) "
"Option 1 (Checked)"
"</p></label></form>",
parent_id="1",
),
@ -323,9 +323,9 @@ def test_table():
detection_origin="vlm_partitioner",
element_id="2",
metadata=ElementMetadata(
text_as_html='<table class="Table" id="2"> '
"<tbody> "
"<tr> "
text_as_html='<table class="Table" id="2">'
"<tbody>"
"<tr>"
"<td>"
"Fair Value1"
"</td>"
@ -402,7 +402,7 @@ def test_very_nested_structure_is_preserved():
element_id="10",
detection_origin="vlm_partitioner",
metadata=ElementMetadata(
text_as_html='<h1 class="Title" id="10">Title </h1>', parent_id="3"
text_as_html='<h1 class="Title" id="10">Title</h1>', parent_id="3"
),
),
Text(
@ -416,9 +416,9 @@ def test_very_nested_structure_is_preserved():
element_id="5",
detection_origin="vlm_partitioner",
metadata=ElementMetadata(
text_as_html='<blockquote class="Quote" id="5"> '
text_as_html='<blockquote class="Quote" id="5">'
'<p class="Paragraph" id="6">'
"Clever Quote "
"Clever Quote"
"</p>"
"</blockquote>",
parent_id="4",
@ -429,9 +429,9 @@ def test_very_nested_structure_is_preserved():
element_id="8",
detection_origin="vlm_partitioner",
metadata=ElementMetadata(
text_as_html='<div class="Footnote" id="8"> '
text_as_html='<div class="Footnote" id="8">'
'<span class="UncategorizedText" id="9">'
"Uncategorized footnote text "
"Uncategorized footnote text"
"</span>"
"</div>",
parent_id="4",
@ -472,14 +472,14 @@ def test_ordered_list():
element_id="2",
detection_origin="vlm_partitioner",
metadata=ElementMetadata(
text_as_html='<ul class="UnorderedList" id="2"> '
text_as_html='<ul class="UnorderedList" id="2">'
'<li class="ListItem" id="3">'
"Item 1 "
"Item 1"
"</li>"
'<li class="ListItem" id="4">'
"Item 2 </li>"
"Item 2</li>"
'<li class="ListItem" id="5">'
"Item 3 "
"Item 3"
"</li></ul>",
parent_id="1",
),
@ -517,11 +517,11 @@ def test_squeezed_elements_are_parsed_back():
element_id="2",
detection_origin="vlm_partitioner",
metadata=ElementMetadata(
text_as_html='<p class="NarrativeText" id="2">Table of Contents </p> '
text_as_html='<p class="NarrativeText" id="2">Table of Contents</p>'
'<address class="Address" id="3">'
"68 Prince Street Palmdale, CA 93550 "
"</address> "
'<a class="Hyperlink" id="4">www.google.com </a>',
"68 Prince Street Palmdale, CA 93550"
"</address>"
'<a class="Hyperlink" id="4">www.google.com</a>',
parent_id="1",
),
)

View File

@ -45,3 +45,31 @@ def test_alternative_image_text_can_be_included_when_nested_in_paragraph():
html_parser_version="v2",
)
assert "ALT TEXT Logo" not in paragraph_none_alt_mode.text
def test_attr_and_html_inside_table_cell_is_kept():
# language=HTML
html = """
<div class="Page">
<table class="Table">
<tbody>
<tr>
<td colspan="2">
Some text
</td>
<td>
<input checked="" class="Checkbox" type="checkbox"/>
</td>
</tr>
</tbody>
</table>
</div>
"""
page, table = partition_html(
text=html,
image_alt_mode="to_text",
html_parser_version="v2",
)
assert '<input checked="" type="checkbox"/>' in table.metadata.text_as_html # class is removed
assert 'colspan="2"' in table.metadata.text_as_html

View File

@ -1 +1 @@
__version__ = "0.16.19-dev2" # pragma: no cover
__version__ = "0.16.19-dev3" # pragma: no cover

View File

@ -125,7 +125,8 @@ class OntologyElement(BaseModel):
text = self.text or ""
if text or children_html:
return f"<{self.html_tag_name} {attr_str}>{text} {children_html}</{self.html_tag_name}>"
inside_tag_text = f"{text} {children_html}".strip()
return f"<{self.html_tag_name} {attr_str}>{inside_tag_text}</{self.html_tag_name}>"
else:
return f"<{self.html_tag_name} {attr_str} />"
@ -145,11 +146,10 @@ class OntologyElement(BaseModel):
def remove_ids_and_class_from_table(soup: Tag):
for tag in soup.find_all(True):
if tag.name != "table":
tag.attrs.pop("class", None)
tag.attrs.pop("id", None)
if tag.name in ["td", "th"]:
tag.string = " ".join(tag.stripped_strings)
if tag.name == "table":
continue # We keep table tag
tag.attrs.pop("class", None)
tag.attrs.pop("id", None)
return soup

View File

@ -136,7 +136,7 @@ def combine_inline_elements(elements: list[elements.Element]) -> list[elements.E
if can_unstructured_elements_be_merged(current_element, next_element):
current_element.text += " " + next_element.text
current_element.metadata.text_as_html += " " + next_element.metadata.text_as_html
current_element.metadata.text_as_html += next_element.metadata.text_as_html
else:
result_elements.append(current_element)
current_element = next_element