mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-08-18 13:45:45 +00:00
fix: partition html fail with table without tbody (#1817)
This PR resolves #1807 - fix a bug where when a table tagged content does not contain `tbody` tag but `thead` tag for the rows the code fails - now when there is no `tbody` in a table section we try to look for `thead` isntead - when both are not found return empty table
This commit is contained in:
parent
de685fbc18
commit
3437a23c91
@ -20,6 +20,7 @@
|
|||||||
* **Fix metrics folder not discoverable** Fixes issue where unstructured/metrics folder is not discoverable on PyPI by adding
|
* **Fix metrics folder not discoverable** Fixes issue where unstructured/metrics folder is not discoverable on PyPI by adding
|
||||||
an `__init__.py` file under the folder.
|
an `__init__.py` file under the folder.
|
||||||
* **Fix a bug when `parition_pdf` get `model_name=None`** In API usage the `model_name` value is `None` and the `cast` function in `partition_pdf` would return `None` and lead to attribution error. Now we use `str` function to explicit convert the content to string so it is garanteed to have `starts_with` and other string functions as attributes
|
* **Fix a bug when `parition_pdf` get `model_name=None`** In API usage the `model_name` value is `None` and the `cast` function in `partition_pdf` would return `None` and lead to attribution error. Now we use `str` function to explicit convert the content to string so it is garanteed to have `starts_with` and other string functions as attributes
|
||||||
|
* **Fix html partition fail on tables without `tbody` tag** HTML tables may sometimes just contain headers without body (`tbody` tag)
|
||||||
|
|
||||||
## 0.10.24
|
## 0.10.24
|
||||||
|
|
||||||
|
@ -672,3 +672,22 @@ def test_partition_html_respects_detect_language_per_element():
|
|||||||
elements = partition_html(filename=filename, detect_language_per_element=True)
|
elements = partition_html(filename=filename, detect_language_per_element=True)
|
||||||
langs = [element.metadata.languages for element in elements]
|
langs = [element.metadata.languages for element in elements]
|
||||||
assert langs == [["eng"], ["spa", "eng"], ["eng"], ["eng"], ["spa"]]
|
assert langs == [["eng"], ["spa", "eng"], ["eng"], ["eng"], ["spa"]]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
("tag", "expected"),
|
||||||
|
[
|
||||||
|
("thead", ""),
|
||||||
|
("foo", ""),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_partition_html_with_table_without_tbody(tag, expected):
|
||||||
|
table_html = f"""
|
||||||
|
<table>
|
||||||
|
<{tag}>
|
||||||
|
<tr><th>Header 1</th><th>Header 2</th></tr>
|
||||||
|
</{tag}>
|
||||||
|
</table>
|
||||||
|
"""
|
||||||
|
partitions = partition_html(text=table_html)
|
||||||
|
assert partitions[0].metadata.text_as_html == expected
|
||||||
|
@ -495,7 +495,7 @@ def _process_leaf_table_item(
|
|||||||
rows = tag_elem.findall("tr")
|
rows = tag_elem.findall("tr")
|
||||||
if not rows:
|
if not rows:
|
||||||
body = tag_elem.find("tbody")
|
body = tag_elem.find("tbody")
|
||||||
rows = body.findall("tr")
|
rows = body.findall("tr") if body else []
|
||||||
if len(rows) > 0:
|
if len(rows) > 0:
|
||||||
table_data = [list(row.itertext()) for row in rows]
|
table_data = [list(row.itertext()) for row in rows]
|
||||||
html_table = tabulate(table_data, tablefmt="html")
|
html_table = tabulate(table_data, tablefmt="html")
|
||||||
|
Loading…
x
Reference in New Issue
Block a user