fix: partition html fail with table without tbody (#1817)

This PR resolves #1807 
- fix a bug where the code fails when a table's content has no `tbody`
tag and its rows are under a `thead` tag instead
- now, when there is no `tbody` in a table section, we try to look for
`thead` instead
- when neither is found, return an empty table
This commit is contained in:
Yao You 2023-10-20 18:21:59 -05:00 committed by GitHub
parent de685fbc18
commit 3437a23c91
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 21 additions and 1 deletions

View File

@ -20,6 +20,7 @@
* **Fix metrics folder not discoverable** Fixes issue where unstructured/metrics folder is not discoverable on PyPI by adding
an `__init__.py` file under the folder.
* **Fix a bug when `partition_pdf` gets `model_name=None`** In API usage the `model_name` value is `None`, so the `cast` function in `partition_pdf` would return `None` and lead to an attribute error. Now we use the `str` function to explicitly convert the content to a string, so it is guaranteed to have `startswith` and other string functions as attributes
* **Fix HTML partition failure on tables without a `tbody` tag** HTML tables may sometimes contain only headers without a body (`tbody` tag)
## 0.10.24

View File

@ -672,3 +672,22 @@ def test_partition_html_respects_detect_language_per_element():
elements = partition_html(filename=filename, detect_language_per_element=True)
langs = [element.metadata.languages for element in elements]
assert langs == [["eng"], ["spa", "eng"], ["eng"], ["eng"], ["spa"]]
@pytest.mark.parametrize(
    ("tag", "expected"),
    [
        ("thead", ""),
        ("foo", ""),
    ],
)
def test_partition_html_with_table_without_tbody(tag, expected):
    """Check that a table whose only row is wrapped in `tag` (not `tbody`) partitions.

    The markup contains a single header row nested under `<{tag}>`. The first
    element returned by `partition_html` must carry `expected` as its
    `metadata.text_as_html`; both parametrized cases expect an empty string.
    """
    # The HTML lines are kept flush-left so the string passed to the
    # partitioner is exactly the one shown in the original test.
    markup = f"""
<table>
<{tag}>
<tr><th>Header 1</th><th>Header 2</th></tr>
</{tag}>
</table>
"""
    first_element = partition_html(text=markup)[0]
    assert first_element.metadata.text_as_html == expected

View File

@ -495,7 +495,7 @@ def _process_leaf_table_item(
rows = tag_elem.findall("tr")
if not rows:
body = tag_elem.find("tbody")
rows = body.findall("tr")
rows = body.findall("tr") if body else []
if len(rows) > 0:
table_data = [list(row.itertext()) for row in rows]
html_table = tabulate(table_data, tablefmt="html")