mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-06-27 02:30:08 +00:00
fix(docx): fix short-row DOCX table (#2943)
**Summary** The DOCX format allows a table row to start late and/or end early, meaning cells at the beginning or end of a row can be omitted. Although this capability has legitimate uses, it is rarely used deliberately; more often a short row is created unintentionally when cell borders are adjusted with the mouse. Accommodate short rows and generate accurate `.text` and `.metadata.text_as_html` for these tables.
This commit is contained in:
parent
eff84afe24
commit
601594d373
@ -1,4 +1,4 @@
|
||||
## 0.13.7-dev1
|
||||
## 0.13.7-dev2
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -8,6 +8,8 @@
|
||||
|
||||
### Fixes
|
||||
|
||||
* **`partition_docx()` handles short table rows.** The DOCX format allows a table row to start late and/or end early, meaning cells at the beginning or end of a row can be omitted. Although this capability has legitimate uses, it is rarely used deliberately; more often a short row is created unintentionally when cell borders are adjusted with the mouse. Accommodate short rows and generate accurate `.text` and `.metadata.text_as_html` for these tables.
|
||||
|
||||
## 0.13.6
|
||||
|
||||
### Enhancements
|
||||
|
BIN
example-docs/tables-with-incomplete-rows.docx
Normal file
BIN
example-docs/tables-with-incomplete-rows.docx
Normal file
Binary file not shown.
@ -16,7 +16,6 @@ from unstructured.documents.elements import (
|
||||
Address,
|
||||
CompositeElement,
|
||||
Element,
|
||||
ElementType,
|
||||
Footer,
|
||||
Header,
|
||||
ListItem,
|
||||
@ -132,6 +131,133 @@ class Describe_DocxPartitioner:
|
||||
table = docx.Document(example_doc_path("docx-tables.docx")).tables[2]
|
||||
assert " ".join(_DocxPartitioner()._iter_table_texts(table)) == "a b c d e"
|
||||
|
||||
def it_can_partition_tables_with_incomplete_rows(self):
|
||||
"""DOCX permits table rows to start late and end early.
|
||||
|
||||
It is relatively rare in the wild, but DOCX tables are unique (as far as I know) in that
|
||||
they allow a row to start late, e.g. in column 3, and end early, e.g. with its last cell in
|
||||
column 5 of a 7-column table.
|
||||
|
||||
A practical example might look like this:
|
||||
|
||||
+------+------+
|
||||
| East | West |
|
||||
+----------+------+------+
|
||||
| Started | 25 | 32 |
|
||||
+----------+------+------+
|
||||
| Finished | 17 | 21 |
|
||||
+----------+------+------+
|
||||
"""
|
||||
elements = iter(partition_docx(example_doc_path("tables-with-incomplete-rows.docx")))
|
||||
|
||||
e = next(elements)
|
||||
assert e.text.startswith("Example of DOCX table ")
|
||||
# --
|
||||
# ┌───┬───┐
|
||||
# │ a │ b │
|
||||
# ├───┼───┤
|
||||
# │ c │ d │
|
||||
# └───┴───┘
|
||||
e = next(elements)
|
||||
assert type(e).__name__ == "Table"
|
||||
assert e.text == "a b c d"
|
||||
assert e.metadata.text_as_html == (
|
||||
"<table>\n"
|
||||
"<thead>\n<tr><th>a </th><th>b </th></tr>\n</thead>\n"
|
||||
"<tbody>\n<tr><td>c </td><td>d </td></tr>\n</tbody>\n"
|
||||
"</table>"
|
||||
)
|
||||
# --
|
||||
# ┌───┐
|
||||
# │ a │
|
||||
# ├───┼───┐
|
||||
# │ b │ c │
|
||||
# └───┴───┘
|
||||
e = next(elements)
|
||||
assert type(e).__name__ == "Table"
|
||||
assert e.text == "a b c", f"actual {e.text=}"
|
||||
assert e.metadata.text_as_html == (
|
||||
"<table>\n"
|
||||
"<thead>\n<tr><th>a </th><th> </th></tr>\n</thead>\n"
|
||||
"<tbody>\n<tr><td>b </td><td>c </td></tr>\n</tbody>\n"
|
||||
"</table>"
|
||||
), f"actual {e.metadata.text_as_html=}"
|
||||
# --
|
||||
# ┌───────┐
|
||||
# │ a │
|
||||
# ├───┬───┼───┐
|
||||
# │ b │ c │ d │
|
||||
# └───┴───┴───┘
|
||||
e = next(elements)
|
||||
assert type(e).__name__ == "Table"
|
||||
assert e.text == "a b c d", f"actual {e.text=}"
|
||||
assert e.metadata.text_as_html == (
|
||||
"<table>\n"
|
||||
"<thead>\n<tr><th>a </th><th>a </th><th> </th></tr>\n</thead>\n"
|
||||
"<tbody>\n<tr><td>b </td><td>c </td><td>d </td></tr>\n</tbody>\n"
|
||||
"</table>"
|
||||
), f"actual {e.metadata.text_as_html=}"
|
||||
# --
|
||||
# ┌───┬───┐
|
||||
# │ │ b │
|
||||
# │ a ├───┼───┐
|
||||
# │ │ c │ d │
|
||||
# └───┴───┴───┘
|
||||
e = next(elements)
|
||||
assert type(e).__name__ == "Table"
|
||||
assert e.text == "a b c d", f"actual {e.text=}"
|
||||
assert e.metadata.text_as_html == (
|
||||
"<table>\n"
|
||||
"<thead>\n<tr><th>a </th><th>b </th><th> </th></tr>\n</thead>\n"
|
||||
"<tbody>\n<tr><td>a </td><td>c </td><td>d </td></tr>\n</tbody>\n"
|
||||
"</table>"
|
||||
), f"actual {e.metadata.text_as_html=}"
|
||||
# -- late-start, early-end, and >2 rows vertical span --
|
||||
# ┌───────┬───┬───┐
|
||||
# │ a │ b │ c │
|
||||
# └───┬───┴───┼───┘
|
||||
# │ d │
|
||||
# ┌───┤ ├───┐
|
||||
# │ e │ │ f │
|
||||
# └───┤ ├───┘
|
||||
# │ │
|
||||
# └───────┘
|
||||
e = next(elements)
|
||||
assert type(e).__name__ == "Table"
|
||||
assert e.text == "a b c d e f", f"actual {e.text=}"
|
||||
assert e.metadata.text_as_html == (
|
||||
"<table>\n"
|
||||
"<thead>\n"
|
||||
"<tr><th>a </th><th>a </th><th>b </th><th>c </th></tr>\n"
|
||||
"</thead>\n<tbody>\n"
|
||||
"<tr><td> </td><td>d </td><td>d </td><td> </td></tr>\n"
|
||||
"<tr><td>e </td><td>d </td><td>d </td><td>f </td></tr>\n"
|
||||
"<tr><td> </td><td>d </td><td>d </td><td> </td></tr>\n"
|
||||
"</tbody>\n"
|
||||
"</table>"
|
||||
), f"actual {e.metadata.text_as_html=}"
|
||||
# --
|
||||
# -- The table from the specimen file we received with the bug report. --
|
||||
e = next(elements)
|
||||
assert type(e).__name__ == "Table"
|
||||
assert e.text == "Data More Dato WTF? Strange Format", f"actual {e.text=}"
|
||||
assert e.metadata.text_as_html == (
|
||||
"<table>\n"
|
||||
"<thead>\n"
|
||||
"<tr><th>Data </th><th>Data </th><th> </th></tr>\n"
|
||||
"</thead>\n"
|
||||
"<tbody>\n"
|
||||
"<tr><td>Data </td><td>Data </td><td> </td></tr>\n"
|
||||
"<tr><td>Data </td><td>Data </td><td> </td></tr>\n"
|
||||
"<tr><td> </td><td>More </td><td> </td></tr>\n"
|
||||
"<tr><td>Dato </td><td> </td><td> </td></tr>\n"
|
||||
"<tr><td>WTF? </td><td>WTF? </td><td> </td></tr>\n"
|
||||
"<tr><td>Strange</td><td>Strange</td><td> </td></tr>\n"
|
||||
"<tr><td> </td><td>Format </td><td>Format</td></tr>\n"
|
||||
"</tbody>\n"
|
||||
"</table>"
|
||||
), f"actual {e.metadata.text_as_html=}"
|
||||
|
||||
# -- page-break behaviors --------------------------------------------------------------------
|
||||
|
||||
def it_places_page_breaks_precisely_where_they_occur(self):
|
||||
@ -299,11 +425,7 @@ def test_parition_docx_from_team_chat():
|
||||
"0:0:3.270 --> 0:0:4.250\nJames Bond\nUmm.",
|
||||
"saved-by Dennis Forsythe",
|
||||
]
|
||||
assert [e.category for e in elements] == [
|
||||
ElementType.UNCATEGORIZED_TEXT,
|
||||
ElementType.UNCATEGORIZED_TEXT,
|
||||
ElementType.TABLE,
|
||||
]
|
||||
assert [type(e) for e in elements] == [Text, Text, Table]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("infer_table_structure", [True, False])
|
||||
@ -687,7 +809,7 @@ def test_partition_docx_raises_TypeError_for_invalid_languages():
|
||||
filename = "example-docs/handbook-1p.docx"
|
||||
partition_docx(
|
||||
filename=filename,
|
||||
languages="eng", # pyright: ignore[reportGeneralTypeIssues]
|
||||
languages="eng", # pyright: ignore[reportArgumentType]
|
||||
)
|
||||
|
||||
|
||||
|
@ -1,3 +0,0 @@
|
||||
from docx.api import Document
|
||||
|
||||
__all__ = ["Document"]
|
@ -1,5 +0,0 @@
|
||||
from typing import IO, Optional, Union
|
||||
|
||||
import docx.document
|
||||
|
||||
def Document(docx: Optional[Union[str, IO[bytes]]] = None) -> docx.document.Document: ...
|
@ -1,13 +0,0 @@
|
||||
from typing import Iterator, Sequence
|
||||
|
||||
from docx.oxml.xmlchemy import BaseOxmlElement
|
||||
from docx.table import Table
|
||||
from docx.text.paragraph import Paragraph
|
||||
|
||||
class BlockItemContainer:
|
||||
_element: BaseOxmlElement
|
||||
def iter_inner_content(self) -> Iterator[Paragraph | Table]: ...
|
||||
@property
|
||||
def paragraphs(self) -> Sequence[Paragraph]: ...
|
||||
@property
|
||||
def tables(self) -> Sequence[Table]: ...
|
@ -1,28 +0,0 @@
|
||||
from typing import IO, Iterator, List
|
||||
|
||||
from docx.oxml.document import CT_Document
|
||||
from docx.section import Sections
|
||||
from docx.settings import Settings
|
||||
from docx.shared import ElementProxy
|
||||
from docx.styles.style import ParagraphStyle
|
||||
from docx.table import Table
|
||||
from docx.text.paragraph import Paragraph
|
||||
|
||||
class Document(ElementProxy):
|
||||
def add_paragraph(
|
||||
self,
|
||||
text: str = "",
|
||||
style: ParagraphStyle | str | None = None,
|
||||
) -> Paragraph: ...
|
||||
@property
|
||||
def element(self) -> CT_Document: ...
|
||||
def iter_inner_content(self) -> Iterator[Paragraph | Table]: ...
|
||||
@property
|
||||
def paragraphs(self) -> List[Paragraph]: ...
|
||||
@property
|
||||
def tables(self) -> List[Table]: ...
|
||||
def save(self, path_or_stream: str | IO[bytes]) -> None: ...
|
||||
@property
|
||||
def sections(self) -> Sections: ...
|
||||
@property
|
||||
def settings(self) -> Settings: ...
|
@ -1 +0,0 @@
|
||||
class Drawing: ...
|
@ -1,11 +0,0 @@
|
||||
import enum
|
||||
|
||||
class WD_SECTION_START(enum.Enum):
|
||||
CONTINUOUS: enum.Enum
|
||||
EVEN_PAGE: enum.Enum
|
||||
NEW_COLUMN: enum.Enum
|
||||
NEW_PAGE: enum.Enum
|
||||
ODD_PAGE: enum.Enum
|
||||
|
||||
# -- alias --
|
||||
WD_SECTION = WD_SECTION_START
|
@ -1,7 +0,0 @@
|
||||
# pyright: reportPrivateUsage=false
|
||||
|
||||
from typing import Union
|
||||
|
||||
from lxml import etree
|
||||
|
||||
def parse_xml(xml: Union[str, bytes]) -> etree._Element: ...
|
@ -1,10 +0,0 @@
|
||||
from typing import Iterator
|
||||
|
||||
from docx.oxml.xmlchemy import BaseOxmlElement
|
||||
|
||||
class CT_Body(BaseOxmlElement):
|
||||
def __iter__(self) -> Iterator[BaseOxmlElement]: ...
|
||||
|
||||
class CT_Document(BaseOxmlElement):
|
||||
@property
|
||||
def body(self) -> CT_Body: ...
|
@ -1,5 +0,0 @@
|
||||
from typing import Dict
|
||||
|
||||
nsmap: Dict[str, str]
|
||||
|
||||
def qn(tag: str) -> str: ...
|
@ -1,7 +0,0 @@
|
||||
from typing import Optional
|
||||
|
||||
from docx.oxml.xmlchemy import BaseOxmlElement
|
||||
|
||||
class CT_SectPr(BaseOxmlElement):
|
||||
@property
|
||||
def preceding_sectPr(self) -> Optional[CT_SectPr]: ...
|
@ -1,16 +0,0 @@
|
||||
"""Table-related XML element-types."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import List
|
||||
|
||||
from docx.oxml.xmlchemy import BaseOxmlElement
|
||||
|
||||
class CT_Row(BaseOxmlElement):
|
||||
tc_lst: List[CT_Tc]
|
||||
|
||||
class CT_Tc(BaseOxmlElement):
|
||||
@property
|
||||
def vMerge(self) -> str | None: ...
|
||||
|
||||
class CT_Tbl(BaseOxmlElement): ...
|
@ -1,9 +0,0 @@
|
||||
from typing import List
|
||||
|
||||
from docx.oxml.text.run import CT_R
|
||||
from docx.oxml.xmlchemy import BaseOxmlElement
|
||||
|
||||
class CT_Hyperlink(BaseOxmlElement):
|
||||
address: str
|
||||
@property
|
||||
def r_lst(self) -> List[CT_R]: ...
|
@ -1,3 +0,0 @@
|
||||
from docx.oxml.xmlchemy import BaseOxmlElement
|
||||
|
||||
class CT_LastRenderedPageBreak(BaseOxmlElement): ...
|
@ -1,3 +0,0 @@
|
||||
from docx.oxml.xmlchemy import BaseOxmlElement
|
||||
|
||||
class CT_P(BaseOxmlElement): ...
|
@ -1,4 +0,0 @@
|
||||
from docx.oxml.xmlchemy import BaseOxmlElement
|
||||
|
||||
class CT_PPr(BaseOxmlElement): ...
|
||||
class CT_TabStop(BaseOxmlElement): ...
|
@ -1,30 +0,0 @@
|
||||
from docx.oxml.xmlchemy import BaseOxmlElement
|
||||
|
||||
class CT_Br(BaseOxmlElement):
|
||||
type: str | None
|
||||
clear: str | None
|
||||
@property
|
||||
def text(self) -> str: ...
|
||||
|
||||
class CT_Cr(BaseOxmlElement):
|
||||
@property
|
||||
def text(self) -> str: ...
|
||||
|
||||
class CT_NoBreakHyphen(BaseOxmlElement):
|
||||
@property
|
||||
def text(self) -> str: ...
|
||||
|
||||
class CT_PTab(BaseOxmlElement):
|
||||
@property
|
||||
def text(self) -> str: ...
|
||||
|
||||
class CT_R(BaseOxmlElement):
|
||||
text: str
|
||||
|
||||
class CT_Tab(BaseOxmlElement):
|
||||
@property
|
||||
def text(self) -> str: ...
|
||||
|
||||
class CT_Text(BaseOxmlElement):
|
||||
@property
|
||||
def text(self) -> str: ...
|
@ -1,17 +0,0 @@
|
||||
from typing import Any, Iterator
|
||||
|
||||
from lxml import etree
|
||||
|
||||
class BaseOxmlElement(etree.ElementBase):
|
||||
def __iter__(self) -> Iterator[BaseOxmlElement]: ...
|
||||
@property
|
||||
def xml(self) -> str: ...
|
||||
def xpath(self, xpath_str: str) -> Any:
|
||||
"""Return type is typically Sequence[ElementBase], but ...
|
||||
|
||||
lxml.etree.XPath has many possible return types including bool, (a "smart") str,
|
||||
float. The return type can also be a list containing ElementBase, comments,
|
||||
processing instructions, str, and tuple. So you need to cast the result based on
|
||||
the XPath expression you use.
|
||||
"""
|
||||
...
|
@ -1,36 +0,0 @@
|
||||
from typing import Iterator, Sequence
|
||||
|
||||
from docx.blkcntnr import BlockItemContainer
|
||||
from docx.enum.section import WD_SECTION
|
||||
from docx.oxml.section import CT_SectPr
|
||||
from docx.table import Table
|
||||
from docx.text.paragraph import Paragraph
|
||||
|
||||
class Section:
|
||||
_sectPr: CT_SectPr
|
||||
@property
|
||||
def different_first_page_header_footer(self) -> bool: ...
|
||||
@property
|
||||
def even_page_footer(self) -> _Footer: ...
|
||||
@property
|
||||
def even_page_header(self) -> _Header: ...
|
||||
@property
|
||||
def first_page_footer(self) -> _Footer: ...
|
||||
@property
|
||||
def first_page_header(self) -> _Header: ...
|
||||
@property
|
||||
def footer(self) -> _Footer: ...
|
||||
@property
|
||||
def header(self) -> _Header: ...
|
||||
def iter_inner_content(self) -> Iterator[Paragraph | Table]: ...
|
||||
@property
|
||||
def start_type(self) -> WD_SECTION: ...
|
||||
|
||||
class Sections(Sequence[Section]): ...
|
||||
|
||||
class _BaseHeaderFooter(BlockItemContainer):
|
||||
@property
|
||||
def is_linked_to_previous(self) -> bool: ...
|
||||
|
||||
class _Footer(_BaseHeaderFooter): ...
|
||||
class _Header(_BaseHeaderFooter): ...
|
@ -1,5 +0,0 @@
|
||||
from docx.shared import ElementProxy
|
||||
|
||||
class Settings(ElementProxy):
|
||||
@property
|
||||
def odd_and_even_pages_header_footer(self) -> bool: ...
|
@ -1,16 +0,0 @@
|
||||
from typing import Any, Callable, Generic, TypeVar
|
||||
|
||||
from docx.oxml.xmlchemy import BaseOxmlElement
|
||||
|
||||
_T = TypeVar("_T")
|
||||
|
||||
class lazyproperty(Generic[_T]):
|
||||
def __init__(self, fget: Callable[..., _T]) -> None: ...
|
||||
def __get__(self, obj: Any, type: Any = None) -> _T: ...
|
||||
def __set__(self, obj: Any, value: Any) -> None: ...
|
||||
|
||||
class ElementProxy:
|
||||
@property
|
||||
def element(self) -> BaseOxmlElement: ...
|
||||
|
||||
class Parented: ...
|
@ -1,8 +0,0 @@
|
||||
from typing import Optional
|
||||
|
||||
class BaseStyle:
|
||||
@property
|
||||
def name(self) -> Optional[str]: ...
|
||||
|
||||
class CharacterStyle(BaseStyle): ...
|
||||
class ParagraphStyle(CharacterStyle): ...
|
@ -1,27 +0,0 @@
|
||||
"""Table-related docx proxy-objects."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Sequence
|
||||
|
||||
from docx.blkcntnr import BlockItemContainer
|
||||
from docx.oxml.table import CT_Row, CT_Tbl, CT_Tc
|
||||
from docx.shared import Parented
|
||||
|
||||
class _Cell(BlockItemContainer):
|
||||
_tc: CT_Tc
|
||||
def __init__(self, tc: CT_Tc, parent: Parented) -> None: ...
|
||||
@property
|
||||
def text(self) -> str: ...
|
||||
|
||||
class _Row(Parented):
|
||||
_tr: CT_Row
|
||||
@property
|
||||
def cells(self) -> Sequence[_Cell]: ...
|
||||
|
||||
class _Rows(Sequence[_Row]): ...
|
||||
|
||||
class Table(Parented):
|
||||
def __init__(self, tbl: CT_Tbl, parent: BlockItemContainer) -> None: ...
|
||||
@property
|
||||
def rows(self) -> _Rows: ...
|
@ -1,10 +0,0 @@
|
||||
from docx.oxml.text.hyperlink import CT_Hyperlink
|
||||
from docx.shared import Parented
|
||||
from docx.text.paragraph import Paragraph
|
||||
|
||||
class Hyperlink(Parented):
|
||||
_element: CT_Hyperlink
|
||||
_r: CT_Hyperlink
|
||||
text: str
|
||||
url: str
|
||||
def __init__(self, hyperlink: CT_Hyperlink, parent: Paragraph) -> None: ...
|
@ -1,12 +0,0 @@
|
||||
from typing import Any
|
||||
|
||||
from docx.oxml.text.pagebreak import CT_LastRenderedPageBreak
|
||||
from docx.shared import Parented
|
||||
from docx.text.paragraph import Paragraph
|
||||
|
||||
class RenderedPageBreak(Parented):
|
||||
def __init__(self, lastRenderedPageBreak: CT_LastRenderedPageBreak, parent: Any) -> None: ...
|
||||
@property
|
||||
def preceding_paragraph_fragment(self) -> Paragraph | None: ...
|
||||
@property
|
||||
def following_paragraph_fragment(self) -> Paragraph | None: ...
|
@ -1,26 +0,0 @@
|
||||
from typing import Iterator, List, Sequence
|
||||
|
||||
from docx.blkcntnr import BlockItemContainer
|
||||
from docx.oxml.text.paragraph import CT_P
|
||||
from docx.oxml.xmlchemy import BaseOxmlElement
|
||||
from docx.styles.style import ParagraphStyle
|
||||
from docx.text.hyperlink import Hyperlink
|
||||
from docx.text.pagebreak import RenderedPageBreak
|
||||
from docx.text.run import Run
|
||||
|
||||
class Paragraph(BlockItemContainer):
|
||||
_p: CT_P
|
||||
_parent: BlockItemContainer
|
||||
text: str
|
||||
def __init__(self, p: BaseOxmlElement, parent: BlockItemContainer) -> None: ...
|
||||
@property
|
||||
def contains_page_break(self) -> bool: ...
|
||||
@property
|
||||
def hyperlinks(self) -> List[Hyperlink]: ...
|
||||
def iter_inner_content(self) -> Iterator[Run | Hyperlink]: ...
|
||||
@property
|
||||
def rendered_page_breaks(self) -> List[RenderedPageBreak]: ...
|
||||
@property
|
||||
def runs(self) -> Sequence[Run]: ...
|
||||
@property
|
||||
def style(self) -> ParagraphStyle | None: ...
|
@ -1,14 +0,0 @@
|
||||
from docx.oxml.text.run import CT_R
|
||||
from docx.shared import Parented
|
||||
from docx.text.paragraph import Paragraph
|
||||
|
||||
class Run(Parented):
|
||||
_element: CT_R
|
||||
_r: CT_R
|
||||
def __init__(self, r: CT_R, parent: Paragraph) -> None: ...
|
||||
@property
|
||||
def bold(self) -> bool: ...
|
||||
@property
|
||||
def italic(self) -> bool: ...
|
||||
@property
|
||||
def text(self) -> str: ...
|
@ -1 +1 @@
|
||||
__version__ = "0.13.7-dev1" # pragma: no cover
|
||||
__version__ = "0.13.7-dev2" # pragma: no cover
|
||||
|
@ -415,6 +415,10 @@ class _DocxPartitioner:
|
||||
"""
|
||||
|
||||
def iter_cell_block_items(cell: _Cell) -> Iterator[str]:
|
||||
"""Generate the text of each paragraph or table in `cell` as a separate string.
|
||||
|
||||
A table nested in `cell` is converted to HTML and emitted as that string.
|
||||
"""
|
||||
for block_item in cell.iter_inner_content():
|
||||
if isinstance(block_item, Paragraph):
|
||||
# -- all docx content is ultimately in a paragraph; a nested table contributes
|
||||
@ -425,11 +429,26 @@ class _DocxPartitioner:
|
||||
):
|
||||
yield self._convert_table_to_html(block_item, is_nested=True)
|
||||
|
||||
def iter_cells(row: _Row) -> Iterator[str]:
|
||||
return ("\n".join(iter_cell_block_items(cell)) for cell in row.cells)
|
||||
def iter_row_cells_as_text(row: _Row) -> Iterator[str]:
|
||||
"""Generate the text of each cell in `row` as a separate string.
|
||||
|
||||
The text of each paragraph within a cell is separated from the next by a newline
|
||||
(`"\n"`). A table nested in a cell is first converted to HTML and then included as a
|
||||
string, also separated by a newline.
|
||||
"""
|
||||
# -- each omitted cell at the start of the row (pretty rare) gets the empty string --
|
||||
for _ in range(row.grid_cols_before):
|
||||
yield ""
|
||||
|
||||
for cell in row.cells:
|
||||
yield "\n".join(iter_cell_block_items(cell))
|
||||
|
||||
# -- each omitted cell at the end of the row (also rare) gets the empty string --
|
||||
for _ in range(row.grid_cols_after):
|
||||
yield ""
|
||||
|
||||
return tabulate(
|
||||
[list(iter_cells(row)) for row in table.rows],
|
||||
[list(iter_row_cells_as_text(row)) for row in table.rows],
|
||||
headers=[] if is_nested else "firstrow",
|
||||
# -- tabulate isn't really designed for recursive tables so we have to do any
|
||||
# -- HTML-escaping for ourselves. `unsafehtml` disables tabulate html-escaping of cell
|
||||
@ -739,7 +758,7 @@ class _DocxPartitioner:
|
||||
if tc.vMerge == "continue":
|
||||
continue
|
||||
# -- do not generate empty strings --
|
||||
yield from (text for text in iter_cell_texts(_Cell(tc, row)) if text)
|
||||
yield from (text for text in iter_cell_texts(_Cell(tc, table)) if text)
|
||||
|
||||
@lazyproperty
|
||||
def _last_modified(self) -> Optional[str]:
|
||||
@ -846,7 +865,7 @@ class _DocxPartitioner:
|
||||
# Determine category depth from paragraph ilvl xpath
|
||||
xpath = paragraph._element.xpath("./w:pPr/w:numPr/w:ilvl/@w:val")
|
||||
if xpath:
|
||||
return int(xpath[0])
|
||||
return round(float(xpath[0]))
|
||||
|
||||
# Determine category depth from style name
|
||||
style_name = (paragraph.style and paragraph.style.name) or "Normal"
|
||||
|
Loading…
x
Reference in New Issue
Block a user