mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-24 21:55:33 +00:00
fix: handling for empty tables in word docs and powerpoints (#982)
* fix table index error * changelog and version
This commit is contained in:
parent
df1ba39905
commit
15618e8346
@ -9,6 +9,8 @@
|
||||
|
||||
### Fixes
|
||||
|
||||
* Fix an index error when converting empty tables in Word documents and PowerPoint presentations; empty tables now convert to an empty string.
|
||||
|
||||
## 0.8.4
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -219,3 +219,14 @@ def test_convert_office_doc_captures_errors(monkeypatch, caplog):
|
||||
monkeypatch.setattr(subprocess, "Popen", MockPopenWithError)
|
||||
common.convert_office_doc("no-real.docx", "fake-directory", target_format="docx")
|
||||
assert "an error occurred" in caplog.text
|
||||
|
||||
|
||||
class MockDocxEmptyTable:
    """Minimal stand-in for a table object that contains no rows."""

    def __init__(self):
        # A fresh empty list per instance models a table with zero rows.
        self.rows = []
|
||||
|
||||
|
||||
def test_convert_ms_office_table_to_text_works_with_empty_tables():
    """An empty table converts to an empty string in both HTML and plain-text modes."""
    table = MockDocxEmptyTable()
    assert common.convert_ms_office_table_to_text(table, as_html=True) == ""
    assert common.convert_ms_office_table_to_text(table, as_html=False) == ""
|
||||
|
||||
@ -318,6 +318,10 @@ def convert_ms_office_table_to_text(table: docxtable.Table, as_html: bool = True
|
||||
"""
|
||||
fmt = "html" if as_html else "plain"
|
||||
rows = list(table.rows)
|
||||
headers = [cell.text for cell in rows[0].cells]
|
||||
data = [[cell.text for cell in row.cells] for row in rows[1:]]
|
||||
return tabulate(data, headers=headers, tablefmt=fmt)
|
||||
if len(rows) > 0:
|
||||
headers = [cell.text for cell in rows[0].cells]
|
||||
data = [[cell.text for cell in row.cells] for row in rows[1:]]
|
||||
table_text = tabulate(data, headers=headers, tablefmt=fmt)
|
||||
else:
|
||||
table_text = ""
|
||||
return table_text
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user