mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-11-12 16:37:31 +00:00
Fix parsing table cells (#3904)
This PR:
- Fixes the unintended removal of HTML tags nested inside `<td>` cells
- A stripping function was, in general, problematic to implement in an easy
and straightforward way (you can't modify `descendants` in place). So
instead of patching only the table-cell handling, I added stripping
everywhere in the same consistent way. This is why some tests needed
small edits that remove one whitespace character in each tag. I believe
this won't cause any problems for downstream tasks.
Tested HTML:
```html
<table class="Table">
<tbody>
<tr>
<td colspan="2">
Some text
</td>
<td>
<input checked="" class="Checkbox" type="checkbox"/>
</td>
</tr>
</tbody>
</table>
```
Before & After
```html
'<table class="Table" id="..."> <tbody> <tr> <td colspan="2">Some text</td><td></td></tr></tbody></table>'
'<table class="Table" id="..."><tbody><tr><td colspan="2">Some text</td><td><input checked="" type="checkbox"/></td></tr></tbody></table>'
```
This commit is contained in:
parent
451ad97ce2
commit
5bb95b5841
@ -1,13 +1,14 @@
|
||||
## 0.16.19-dev2
|
||||
## 0.16.19-dev3
|
||||
|
||||
### Enhancements
|
||||
|
||||
### Features
|
||||
|
||||
### Fixes
|
||||
- **fix a bug where table extraction is skipped when it shouldn't**. Pages with just one table as its content or starts with a table misses table extraction. The routing logic is now fixed.
|
||||
- **Fix a bug where table extraction is skipped when it shouldn't**. Pages with just one table as its content or starts with a table misses table extraction. The routing logic is now fixed.
|
||||
- **Correct deprecated `ruff` invocation in `make tidy`**. This will future-proof it or avoid surprises if someone happens to upgrade Ruff.
|
||||
- **Remove upper bound constraint on python version** in setup.py. Python3.13 is not yet officially supported, but allow users to try.
|
||||
- **Fixes removing HTML elements from the inside of table cells** in html partition v=2.0. The HTML partitioner now correctly preserves HTML elements from the inside of table cells.
|
||||
|
||||
## 0.16.17
|
||||
|
||||
|
||||
@ -4,6 +4,10 @@
|
||||
"metadata": {
|
||||
"category_depth": 0,
|
||||
"filename": "example.pdf",
|
||||
"filetype": "text/html",
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"page_number": 1,
|
||||
"parent_id": "897a8a47377c4ad6aab839a929879537",
|
||||
"text_as_html": "<div class=\"Page\" data-page-number=\"1\" id=\"3a6b156a81764e17be128264241f8136\" />"
|
||||
@ -16,6 +20,10 @@
|
||||
"metadata": {
|
||||
"category_depth": 1,
|
||||
"filename": "example.pdf",
|
||||
"filetype": "text/html",
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"page_number": 1,
|
||||
"parent_id": "3a6b156a81764e17be128264241f8136",
|
||||
"text_as_html": "<header class=\"Header\" id=\"45b3d0053468484ba1c7b53998115412\" />"
|
||||
@ -28,9 +36,13 @@
|
||||
"metadata": {
|
||||
"category_depth": 2,
|
||||
"filename": "example.pdf",
|
||||
"filetype": "text/html",
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"page_number": 1,
|
||||
"parent_id": "45b3d0053468484ba1c7b53998115412",
|
||||
"text_as_html": "<h1 class=\"Title\" id=\"c95473e8a3704fc2b418697f9fddb27b\">Header </h1>"
|
||||
"text_as_html": "<h1 class=\"Title\" id=\"c95473e8a3704fc2b418697f9fddb27b\">Header</h1>"
|
||||
},
|
||||
"text": "Header",
|
||||
"type": "Title"
|
||||
@ -40,9 +52,13 @@
|
||||
"metadata": {
|
||||
"category_depth": 2,
|
||||
"filename": "example.pdf",
|
||||
"filetype": "text/html",
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"page_number": 1,
|
||||
"parent_id": "45b3d0053468484ba1c7b53998115412",
|
||||
"text_as_html": "<time class=\"CalendarDate\" id=\"379cbfdc16d44bd6a59e6cfabe6438d5\">Date: October 30, 2023 </time>"
|
||||
"text_as_html": "<time class=\"CalendarDate\" id=\"379cbfdc16d44bd6a59e6cfabe6438d5\">Date: October 30, 2023</time>"
|
||||
},
|
||||
"text": "Date: October 30, 2023",
|
||||
"type": "UncategorizedText"
|
||||
@ -52,9 +68,13 @@
|
||||
"metadata": {
|
||||
"category_depth": 1,
|
||||
"filename": "example.pdf",
|
||||
"filetype": "text/html",
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"page_number": 1,
|
||||
"parent_id": "3a6b156a81764e17be128264241f8136",
|
||||
"text_as_html": "<form class=\"Form\" id=\"637c2f6935fb4353a5f73025ce04619d\"> <label class=\"FormField\" for=\"company-name\" id=\"50027cccbe1948c9853ce0de37b635c2\">From field name </label><input class=\"FormFieldValue\" id=\"0032242af75c4b37984ea7fea9aac74c\" value=\"Example value\" /></form>"
|
||||
"text_as_html": "<form class=\"Form\" id=\"637c2f6935fb4353a5f73025ce04619d\"><label class=\"FormField\" for=\"company-name\" id=\"50027cccbe1948c9853ce0de37b635c2\">From field name</label><input class=\"FormFieldValue\" id=\"0032242af75c4b37984ea7fea9aac74c\" value=\"Example value\" /></form>"
|
||||
},
|
||||
"text": "From field name Example value",
|
||||
"type": "UncategorizedText"
|
||||
@ -64,6 +84,10 @@
|
||||
"metadata": {
|
||||
"category_depth": 1,
|
||||
"filename": "example.pdf",
|
||||
"filetype": "text/html",
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"page_number": 1,
|
||||
"parent_id": "3a6b156a81764e17be128264241f8136",
|
||||
"text_as_html": "<section class=\"Section\" id=\"592422373ed741b68a077e2003f8ed81\" />"
|
||||
@ -76,9 +100,13 @@
|
||||
"metadata": {
|
||||
"category_depth": 2,
|
||||
"filename": "example.pdf",
|
||||
"filetype": "text/html",
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"page_number": 1,
|
||||
"parent_id": "592422373ed741b68a077e2003f8ed81",
|
||||
"text_as_html": "<table class=\"Table\" id=\"dc3792d4422e444f90876b56d0cfb20d\"> <thead> <tr> <th>Description</th><th>Row header</th></tr></thead><tbody> <tr> <td>Value description</td><td>50 $ (1.32 %)</td></tr></tbody></table>"
|
||||
"text_as_html": "<table class=\"Table\" id=\"dc3792d4422e444f90876b56d0cfb20d\"><thead><tr><th>Description</th><th>Row header</th></tr></thead><tbody><tr><td>Value description</td><td><span>50 $</span><span>(1.32 %)</span></td></tr></tbody></table>"
|
||||
},
|
||||
"text": "Description Row header Value description 50 $ (1.32 %)",
|
||||
"type": "Table"
|
||||
@ -88,6 +116,10 @@
|
||||
"metadata": {
|
||||
"category_depth": 1,
|
||||
"filename": "example.pdf",
|
||||
"filetype": "text/html",
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"page_number": 1,
|
||||
"parent_id": "3a6b156a81764e17be128264241f8136",
|
||||
"text_as_html": "<section class=\"Section\" id=\"1032242af75c4b37984ea7fea9aac74c\" />"
|
||||
@ -100,9 +132,13 @@
|
||||
"metadata": {
|
||||
"category_depth": 2,
|
||||
"filename": "example.pdf",
|
||||
"filetype": "text/html",
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"page_number": 1,
|
||||
"parent_id": "1032242af75c4b37984ea7fea9aac74c",
|
||||
"text_as_html": "<h2 class=\"Subtitle\" id=\"2a4e2c4a689f4f9a8c180b6b521e45c3\">2. Subtitle </h2>"
|
||||
"text_as_html": "<h2 class=\"Subtitle\" id=\"2a4e2c4a689f4f9a8c180b6b521e45c3\">2. Subtitle</h2>"
|
||||
},
|
||||
"text": "2. Subtitle",
|
||||
"type": "Title"
|
||||
@ -112,9 +148,13 @@
|
||||
"metadata": {
|
||||
"category_depth": 2,
|
||||
"filename": "example.pdf",
|
||||
"filetype": "text/html",
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"page_number": 1,
|
||||
"parent_id": "1032242af75c4b37984ea7fea9aac74c",
|
||||
"text_as_html": "<p class=\"NarrativeText\" id=\"5591f7a4df01447e82515ce45f686fbe\">Paragraph text </p>"
|
||||
"text_as_html": "<p class=\"NarrativeText\" id=\"5591f7a4df01447e82515ce45f686fbe\">Paragraph text</p>"
|
||||
},
|
||||
"text": "Paragraph text",
|
||||
"type": "NarrativeText"
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -3,6 +3,10 @@
|
||||
"element_id": "3a6b156a81764e17be128264241f8136",
|
||||
"metadata": {
|
||||
"category_depth": 0,
|
||||
"filetype": "text/html",
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"page_number": 1,
|
||||
"parent_id": "897a8a47377c4ad6aab839a929879537",
|
||||
"text_as_html": "<div class=\"Page\" data-page-number=\"1\" id=\"3a6b156a81764e17be128264241f8136\" />"
|
||||
@ -14,6 +18,10 @@
|
||||
"element_id": "45b3d0053468484ba1c7b53998115412",
|
||||
"metadata": {
|
||||
"category_depth": 1,
|
||||
"filetype": "text/html",
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"page_number": 1,
|
||||
"parent_id": "3a6b156a81764e17be128264241f8136",
|
||||
"text_as_html": "<header class=\"Header\" id=\"45b3d0053468484ba1c7b53998115412\" />"
|
||||
@ -25,9 +33,13 @@
|
||||
"element_id": "6cd3c1ba79654abb9c86162b6d1dae46",
|
||||
"metadata": {
|
||||
"category_depth": 2,
|
||||
"filetype": "text/html",
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"page_number": 1,
|
||||
"parent_id": "45b3d0053468484ba1c7b53998115412",
|
||||
"text_as_html": "<p class=\"NarrativeText\" id=\"6cd3c1ba79654abb9c86162b6d1dae46\">Table of Contents </p> <address class=\"Address\" id=\"7d7541d9943c4ad0b88bc47fd0b29e4a\">68 Prince Street Palmdale, CA 93550 </address> <a class=\"Hyperlink\" id=\"fde2621aa3df4b159bf305566110cca4\">www.google.com </a>"
|
||||
"text_as_html": "<p class=\"NarrativeText\" id=\"6cd3c1ba79654abb9c86162b6d1dae46\">Table of Contents</p><address class=\"Address\" id=\"7d7541d9943c4ad0b88bc47fd0b29e4a\">68 Prince Street Palmdale, CA 93550</address><a class=\"Hyperlink\" id=\"fde2621aa3df4b159bf305566110cca4\">www.google.com</a>"
|
||||
},
|
||||
"text": "Table of Contents 68 Prince Street Palmdale, CA 93550 www.google.com",
|
||||
"type": "NarrativeText"
|
||||
@ -36,9 +48,13 @@
|
||||
"element_id": "cb0d6675109241428778c7b996e0b21c",
|
||||
"metadata": {
|
||||
"category_depth": 2,
|
||||
"filetype": "text/html",
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"page_number": 1,
|
||||
"parent_id": "45b3d0053468484ba1c7b53998115412",
|
||||
"text_as_html": "<span class=\"UncategorizedText\" id=\"cb0d6675109241428778c7b996e0b21c\">More text </span>"
|
||||
"text_as_html": "<span class=\"UncategorizedText\" id=\"cb0d6675109241428778c7b996e0b21c\">More text</span>"
|
||||
},
|
||||
"text": "More text",
|
||||
"type": "UncategorizedText"
|
||||
|
||||
@ -8,7 +8,7 @@
|
||||
"eng"
|
||||
],
|
||||
"parent_id": "517f8559ba594270bdd67e1b02bf19a2",
|
||||
"text_as_html": "<table class=\"Table\" id=\"2428404551304d4db5925f6afee11ed5\"> <tr> <th>Header 1</th><th>Header 2</th></tr><tr> <td>Row 1, Cell 1</td><td>Row 1, Cell 2</td></tr><tr> <td>Row 2, Cell 1</td><td>Row 2, Cell 2</td></tr></table>"
|
||||
"text_as_html": "<table class=\"Table\" id=\"2428404551304d4db5925f6afee11ed5\"><tr><th>Header 1</th><th>Header 2</th></tr><tr><td>Row 1, Cell 1</td><td>Row 1, Cell 2</td></tr><tr><td>Row 2, Cell 1</td><td>Row 2, Cell 2</td></tr></table>"
|
||||
},
|
||||
"text": "Header 1 Header 2 Row 1, Cell 1 Row 1, Cell 2 Row 2, Cell 1 Row 2, Cell 2",
|
||||
"type": "Table"
|
||||
@ -22,7 +22,7 @@
|
||||
"eng"
|
||||
],
|
||||
"parent_id": "517f8559ba594270bdd67e1b02bf19a2",
|
||||
"text_as_html": "<table class=\"Table\" id=\"9f91cae321c74b31bb1c83ac86cd7afb\"> <tr> <th colspan=\"3\">Big Table Header</th></tr><tr> <td rowspan=\"2\">Merged Cell 1</td><td>Cell 2</td><td>Cell 3</td></tr><tr> <td colspan=\"2\">Merged Cell 4 and 5</td></tr><tr> <td>Cell 6</td><td>Cell 7</td><td>Cell 8</td></tr><tr> <td>Cell 9</td><td colspan=\"2\">A cell with a lot of text. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.</td></tr><tr> <td>Cell 10</td><td>Cell 11</td><td>Cell 12</td></tr></table>"
|
||||
"text_as_html": "<table class=\"Table\" id=\"9f91cae321c74b31bb1c83ac86cd7afb\"><tr><th colspan=\"3\">Big Table Header</th></tr><tr><td rowspan=\"2\">Merged Cell 1</td><td>Cell 2</td><td>Cell 3</td></tr><tr><td colspan=\"2\">Merged Cell 4 and 5</td></tr><tr><td>Cell 6</td><td>Cell 7</td><td>Cell 8</td></tr><tr><td>Cell 9</td><td colspan=\"2\">A cell with a lot of text. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.</td></tr><tr><td>Cell 10</td><td>Cell 11</td><td>Cell 12</td></tr></table>"
|
||||
},
|
||||
"text": "Big Table Header Merged Cell 1 Cell 2 Cell 3 Merged Cell 4 and 5 Cell 6 Cell 7 Cell 8 Cell 9 A cell with a lot of text. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Cell 10 Cell 11 Cell 12",
|
||||
"type": "Table"
|
||||
@ -36,7 +36,7 @@
|
||||
"eng"
|
||||
],
|
||||
"parent_id": "517f8559ba594270bdd67e1b02bf19a2",
|
||||
"text_as_html": "<table class=\"TableOfContents\" id=\"da6c34391e544b3480e45d68f40870fa\"> <tr> <th>Chapter</th><th>Title</th><th>Page</th></tr><tr> <td>1</td><td>Introduction</td><td>1</td></tr><tr> <td>2</td><td>Getting Started</td><td>5</td></tr><tr> <td>3</td><td>Basic Concepts</td><td>12</td></tr><tr> <td>4</td><td>Advanced Topics</td><td>25</td></tr><tr> <td>5</td><td>Conclusion</td><td>40</td></tr></table>"
|
||||
"text_as_html": "<table class=\"TableOfContents\" id=\"da6c34391e544b3480e45d68f40870fa\"><tr><th>Chapter</th><th>Title</th><th>Page</th></tr><tr><td>1</td><td>Introduction</td><td>1</td></tr><tr><td>2</td><td>Getting Started</td><td>5</td></tr><tr><td>3</td><td>Basic Concepts</td><td>12</td></tr><tr><td>4</td><td>Advanced Topics</td><td>25</td></tr><tr><td>5</td><td>Conclusion</td><td>40</td></tr></table>"
|
||||
},
|
||||
"text": "Chapter Title Page 1 Introduction 1 2 Getting Started 5 3 Basic Concepts 12 4 Advanced Topics 25 5 Conclusion 40",
|
||||
"type": "Table"
|
||||
|
||||
@ -476,10 +476,14 @@ def test_table_and_time():
|
||||
<tbody>
|
||||
<tr>
|
||||
<td colspan="5">
|
||||
<time>
|
||||
June 30, 2023
|
||||
</time>
|
||||
</td>
|
||||
<td>
|
||||
<span>
|
||||
$—
|
||||
</span>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
|
||||
@ -90,7 +90,7 @@ def test_simple_narrative_text_with_id():
|
||||
detection_origin="vlm_partitioner",
|
||||
metadata=ElementMetadata(
|
||||
text_as_html='<p class="NarrativeText" '
|
||||
'id="73cd7b4a-2444-4910-87a4-138117dfaab9">DEALER ONLY </p>',
|
||||
'id="73cd7b4a-2444-4910-87a4-138117dfaab9">DEALER ONLY</p>',
|
||||
parent_id="1",
|
||||
),
|
||||
)
|
||||
@ -158,7 +158,7 @@ def test_multiple_elements():
|
||||
detection_origin="vlm_partitioner",
|
||||
element_id="2",
|
||||
metadata=ElementMetadata(
|
||||
text_as_html='<p class="Paragraph" id="2">About the same </p>',
|
||||
text_as_html='<p class="Paragraph" id="2">About the same</p>',
|
||||
parent_id="1",
|
||||
),
|
||||
),
|
||||
@ -178,7 +178,7 @@ def test_multiple_elements():
|
||||
text="Some text",
|
||||
detection_origin="vlm_partitioner",
|
||||
metadata=ElementMetadata(
|
||||
text_as_html='<p class="Paragraph" ' 'id="4">Some text </p>',
|
||||
text_as_html='<p class="Paragraph" ' 'id="4">Some text</p>',
|
||||
parent_id="1",
|
||||
),
|
||||
),
|
||||
@ -226,7 +226,7 @@ def test_multiple_pages():
|
||||
detection_origin="vlm_partitioner",
|
||||
element_id="2",
|
||||
metadata=ElementMetadata(
|
||||
text_as_html='<p class="Paragraph" id="2">Some text </p>', parent_id="1"
|
||||
text_as_html='<p class="Paragraph" id="2">Some text</p>', parent_id="1"
|
||||
),
|
||||
),
|
||||
Text(
|
||||
@ -242,7 +242,7 @@ def test_multiple_pages():
|
||||
detection_origin="vlm_partitioner",
|
||||
element_id="4",
|
||||
metadata=ElementMetadata(
|
||||
text_as_html='<p class="Paragraph" id="4">Another text </p>', parent_id="3"
|
||||
text_as_html='<p class="Paragraph" id="4">Another text</p>', parent_id="3"
|
||||
),
|
||||
),
|
||||
]
|
||||
@ -279,13 +279,13 @@ def test_forms():
|
||||
detection_origin="vlm_partitioner",
|
||||
metadata=ElementMetadata(
|
||||
text_as_html=""
|
||||
'<form class="Form" id="2"> '
|
||||
'<form class="Form" id="2">'
|
||||
'<label class="FormField" '
|
||||
'for="option1" id="3"> '
|
||||
'for="option1" id="3">'
|
||||
'<input class="FormFieldValue" type="radio" '
|
||||
'name="options" value="2" id="4" checked />'
|
||||
'<p class="Paragraph" id="5">'
|
||||
"Option 1 (Checked) "
|
||||
"Option 1 (Checked)"
|
||||
"</p></label></form>",
|
||||
parent_id="1",
|
||||
),
|
||||
@ -323,9 +323,9 @@ def test_table():
|
||||
detection_origin="vlm_partitioner",
|
||||
element_id="2",
|
||||
metadata=ElementMetadata(
|
||||
text_as_html='<table class="Table" id="2"> '
|
||||
"<tbody> "
|
||||
"<tr> "
|
||||
text_as_html='<table class="Table" id="2">'
|
||||
"<tbody>"
|
||||
"<tr>"
|
||||
"<td>"
|
||||
"Fair Value1"
|
||||
"</td>"
|
||||
@ -402,7 +402,7 @@ def test_very_nested_structure_is_preserved():
|
||||
element_id="10",
|
||||
detection_origin="vlm_partitioner",
|
||||
metadata=ElementMetadata(
|
||||
text_as_html='<h1 class="Title" id="10">Title </h1>', parent_id="3"
|
||||
text_as_html='<h1 class="Title" id="10">Title</h1>', parent_id="3"
|
||||
),
|
||||
),
|
||||
Text(
|
||||
@ -416,9 +416,9 @@ def test_very_nested_structure_is_preserved():
|
||||
element_id="5",
|
||||
detection_origin="vlm_partitioner",
|
||||
metadata=ElementMetadata(
|
||||
text_as_html='<blockquote class="Quote" id="5"> '
|
||||
text_as_html='<blockquote class="Quote" id="5">'
|
||||
'<p class="Paragraph" id="6">'
|
||||
"Clever Quote "
|
||||
"Clever Quote"
|
||||
"</p>"
|
||||
"</blockquote>",
|
||||
parent_id="4",
|
||||
@ -429,9 +429,9 @@ def test_very_nested_structure_is_preserved():
|
||||
element_id="8",
|
||||
detection_origin="vlm_partitioner",
|
||||
metadata=ElementMetadata(
|
||||
text_as_html='<div class="Footnote" id="8"> '
|
||||
text_as_html='<div class="Footnote" id="8">'
|
||||
'<span class="UncategorizedText" id="9">'
|
||||
"Uncategorized footnote text "
|
||||
"Uncategorized footnote text"
|
||||
"</span>"
|
||||
"</div>",
|
||||
parent_id="4",
|
||||
@ -472,14 +472,14 @@ def test_ordered_list():
|
||||
element_id="2",
|
||||
detection_origin="vlm_partitioner",
|
||||
metadata=ElementMetadata(
|
||||
text_as_html='<ul class="UnorderedList" id="2"> '
|
||||
text_as_html='<ul class="UnorderedList" id="2">'
|
||||
'<li class="ListItem" id="3">'
|
||||
"Item 1 "
|
||||
"Item 1"
|
||||
"</li>"
|
||||
'<li class="ListItem" id="4">'
|
||||
"Item 2 </li>"
|
||||
"Item 2</li>"
|
||||
'<li class="ListItem" id="5">'
|
||||
"Item 3 "
|
||||
"Item 3"
|
||||
"</li></ul>",
|
||||
parent_id="1",
|
||||
),
|
||||
@ -517,11 +517,11 @@ def test_squeezed_elements_are_parsed_back():
|
||||
element_id="2",
|
||||
detection_origin="vlm_partitioner",
|
||||
metadata=ElementMetadata(
|
||||
text_as_html='<p class="NarrativeText" id="2">Table of Contents </p> '
|
||||
text_as_html='<p class="NarrativeText" id="2">Table of Contents</p>'
|
||||
'<address class="Address" id="3">'
|
||||
"68 Prince Street Palmdale, CA 93550 "
|
||||
"</address> "
|
||||
'<a class="Hyperlink" id="4">www.google.com </a>',
|
||||
"68 Prince Street Palmdale, CA 93550"
|
||||
"</address>"
|
||||
'<a class="Hyperlink" id="4">www.google.com</a>',
|
||||
parent_id="1",
|
||||
),
|
||||
)
|
||||
|
||||
@ -45,3 +45,31 @@ def test_alternative_image_text_can_be_included_when_nested_in_paragraph():
|
||||
html_parser_version="v2",
|
||||
)
|
||||
assert "ALT TEXT Logo" not in paragraph_none_alt_mode.text
|
||||
|
||||
|
||||
def test_attr_and_html_inside_table_cell_is_kept():
|
||||
# language=HTML
|
||||
html = """
|
||||
<div class="Page">
|
||||
<table class="Table">
|
||||
<tbody>
|
||||
<tr>
|
||||
<td colspan="2">
|
||||
Some text
|
||||
</td>
|
||||
<td>
|
||||
<input checked="" class="Checkbox" type="checkbox"/>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
"""
|
||||
page, table = partition_html(
|
||||
text=html,
|
||||
image_alt_mode="to_text",
|
||||
html_parser_version="v2",
|
||||
)
|
||||
|
||||
assert '<input checked="" type="checkbox"/>' in table.metadata.text_as_html # class is removed
|
||||
assert 'colspan="2"' in table.metadata.text_as_html
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.16.19-dev2" # pragma: no cover
|
||||
__version__ = "0.16.19-dev3" # pragma: no cover
|
||||
|
||||
@ -125,7 +125,8 @@ class OntologyElement(BaseModel):
|
||||
text = self.text or ""
|
||||
|
||||
if text or children_html:
|
||||
return f"<{self.html_tag_name} {attr_str}>{text} {children_html}</{self.html_tag_name}>"
|
||||
inside_tag_text = f"{text} {children_html}".strip()
|
||||
return f"<{self.html_tag_name} {attr_str}>{inside_tag_text}</{self.html_tag_name}>"
|
||||
else:
|
||||
return f"<{self.html_tag_name} {attr_str} />"
|
||||
|
||||
@ -145,11 +146,10 @@ class OntologyElement(BaseModel):
|
||||
|
||||
def remove_ids_and_class_from_table(soup: Tag):
|
||||
for tag in soup.find_all(True):
|
||||
if tag.name != "table":
|
||||
tag.attrs.pop("class", None)
|
||||
tag.attrs.pop("id", None)
|
||||
if tag.name in ["td", "th"]:
|
||||
tag.string = " ".join(tag.stripped_strings)
|
||||
if tag.name == "table":
|
||||
continue # We keep table tag
|
||||
tag.attrs.pop("class", None)
|
||||
tag.attrs.pop("id", None)
|
||||
return soup
|
||||
|
||||
|
||||
|
||||
@ -136,7 +136,7 @@ def combine_inline_elements(elements: list[elements.Element]) -> list[elements.E
|
||||
|
||||
if can_unstructured_elements_be_merged(current_element, next_element):
|
||||
current_element.text += " " + next_element.text
|
||||
current_element.metadata.text_as_html += " " + next_element.metadata.text_as_html
|
||||
current_element.metadata.text_as_html += next_element.metadata.text_as_html
|
||||
else:
|
||||
result_elements.append(current_element)
|
||||
current_element = next_element
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user