From 3437a23c91f12e80d17d380065e3a35f0aa9fa48 Mon Sep 17 00:00:00 2001 From: Yao You Date: Fri, 20 Oct 2023 18:21:59 -0500 Subject: [PATCH] fix: partition html fail with table without tbody (#1817) This PR resolves #1807 - fix a bug where when a table tagged content does not contain `tbody` tag but `thead` tag for the rows the code fails - now when there is no `tbody` in a table section we try to look for `thead` instead - when both are not found return empty table --- CHANGELOG.md | 1 + .../partition/test_html_partition.py | 19 +++++++++++++++++++ unstructured/documents/html.py | 2 +- 3 files changed, 21 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 358498843..9936f5987 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,6 +20,7 @@ * **Fix metrics folder not discoverable** Fixes issue where unstructured/metrics folder is not discoverable on PyPI by adding an `__init__.py` file under the folder. * **Fix a bug when `parition_pdf` get `model_name=None`** In API usage the `model_name` value is `None` and the `cast` function in `partition_pdf` would return `None` and lead to attribution error. 
Now we use `str` function to explicit convert the content to string so it is garanteed to have `starts_with` and other string functions as attributes +* **Fix html partition fail on tables without `tbody` tag** HTML tables may sometimes just contain headers without body (`tbody` tag) ## 0.10.24 diff --git a/test_unstructured/partition/test_html_partition.py b/test_unstructured/partition/test_html_partition.py index 0d783a82f..dac05c70b 100644 --- a/test_unstructured/partition/test_html_partition.py +++ b/test_unstructured/partition/test_html_partition.py @@ -672,3 +672,22 @@ def test_partition_html_respects_detect_language_per_element(): elements = partition_html(filename=filename, detect_language_per_element=True) langs = [element.metadata.languages for element in elements] assert langs == [["eng"], ["spa", "eng"], ["eng"], ["eng"], ["spa"]] + + +@pytest.mark.parametrize( + ("tag", "expected"), + [ + ("thead", ""), + ("foo", ""), + ], +) +def test_partition_html_with_table_without_tbody(tag, expected): + table_html = f""" + + <{tag}> + + +
Header 1Header 2
+ """ + partitions = partition_html(text=table_html) + assert partitions[0].metadata.text_as_html == expected diff --git a/unstructured/documents/html.py b/unstructured/documents/html.py index 5d89110af..1a033f0b4 100644 --- a/unstructured/documents/html.py +++ b/unstructured/documents/html.py @@ -495,7 +495,7 @@ def _process_leaf_table_item( rows = tag_elem.findall("tr") if not rows: body = tag_elem.find("tbody") - rows = body.findall("tr") + rows = body.findall("tr") if body else [] if len(rows) > 0: table_data = [list(row.itertext()) for row in rows] html_table = tabulate(table_data, tablefmt="html")