mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-09-06 07:12:37 +00:00

**Summary.** The `python-docx` table API is designed for _uniform_ tables (no merged cells, no nested tables). Naive processing of DOCX tables using this API produces duplicate text when the table has merged cells. Add a more sophisticated parsing method that reads only "root" cells (those with an actual `<tc>` element) and skips cells spanned by a merge. In the process, abandon use of the `tabulate` package for this job (which is also designed for uniform tables) and remove the whitespace padding it adds for visual alignment of columns. Separate the text for each cell with a single newline ("\n"). Since it's little extra trouble, add support for nested tables such that their text also contributes to the `Table.text` string. The new `._iter_table_texts()` method will also be used for parsing tables in headers and footers (where they are frequently used for layout purposes) in a closely following PR. Fixes #2106.
28 lines
672 B
Python
28 lines
672 B
Python
"""Table-related docx proxy-objects."""
|
|
|
|
from __future__ import annotations
|
|
|
|
from typing import Sequence
|
|
|
|
from docx.blkcntnr import BlockItemContainer
|
|
from docx.oxml.table import CT_Row, CT_Tbl, CT_Tc
|
|
from docx.shared import Parented
|
|
|
|
class _Cell(BlockItemContainer):
    """Type stub for the table-cell proxy object.

    Declares only what is listed here: the `_tc` attribute (a `CT_Tc` element), the
    constructor signature, and a read-only `text` property. No behavior is implemented in
    this stub.
    """

    # Underlying cell element, typed as the oxml `CT_Tc` class.
    _tc: CT_Tc

    def __init__(self, tc: CT_Tc, parent: Parented) -> None: ...

    @property
    def text(self) -> str: ...
|
|
|
|
class _Row(Parented):
    """Type stub for the table-row proxy object.

    Declares the `_tr` attribute (a `CT_Row` element) and a read-only `cells` property that
    is typed as a sequence of `_Cell`. No behavior is implemented in this stub.
    """

    # Underlying row element, typed as the oxml `CT_Row` class.
    _tr: CT_Row

    @property
    def cells(self) -> Sequence[_Cell]: ...
|
|
|
|
class _Rows(Sequence[_Row]):
    """Type stub for the row collection; typed as a `Sequence` of `_Row` with no extra members."""
|
|
|
|
class Table(Parented):
    """Type stub for the table proxy object.

    Declares the constructor signature (a `CT_Tbl` element plus a `BlockItemContainer`
    parent) and a read-only `rows` property typed as `_Rows`. No behavior is implemented in
    this stub.
    """

    def __init__(self, tbl: CT_Tbl, parent: BlockItemContainer) -> None: ...

    @property
    def rows(self) -> _Rows: ...
|