diff --git a/CHANGELOG.md b/CHANGELOG.md index 358498843..9936f5987 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,6 +20,7 @@ * **Fix metrics folder not discoverable** Fixes issue where unstructured/metrics folder is not discoverable on PyPI by adding an `__init__.py` file under the folder. * **Fix a bug when `partition_pdf` gets `model_name=None`** In API usage the `model_name` value is `None` and the `cast` function in `partition_pdf` would return `None` and lead to an attribute error. Now we use the `str` function to explicitly convert the content to a string so it is guaranteed to have `startswith` and other string methods as attributes. +* **Fix HTML partition failure on tables without a `tbody` tag** HTML tables may sometimes contain only headers without a body (`tbody` tag). ## 0.10.24 diff --git a/test_unstructured/partition/test_html_partition.py b/test_unstructured/partition/test_html_partition.py index 0d783a82f..dac05c70b 100644 --- a/test_unstructured/partition/test_html_partition.py +++ b/test_unstructured/partition/test_html_partition.py @@ -672,3 +672,22 @@ def test_partition_html_respects_detect_language_per_element(): elements = partition_html(filename=filename, detect_language_per_element=True) langs = [element.metadata.languages for element in elements] assert langs == [["eng"], ["spa", "eng"], ["eng"], ["eng"], ["spa"]] + + +@pytest.mark.parametrize( + ("tag", "expected"), + [ + ("thead", ""), + ("foo", ""), + ], +) +def test_partition_html_with_table_without_tbody(tag, expected): + table_html = f""" +
Header 1 | Header 2 |
---|