fix(docx): fix short-row DOCX table (#2943)

**Summary**
The DOCX format allows a table row to start late and/or end early,
meaning cells at the beginning or end of a row can be omitted. While
there are legitimate uses for this capability, using it in practice is
relatively rare. However, it can happen unintentionally when adjusting
cell borders with the mouse. Accommodate this case and generate accurate
`.text` and `.metadata.text_as_html` for these tables.
This commit is contained in:
Steve Canny 2024-05-01 17:45:52 -07:00 committed by GitHub
parent eff84afe24
commit 601594d373
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
32 changed files with 157 additions and 340 deletions

View File

@ -1,4 +1,4 @@
## 0.13.7-dev1
## 0.13.7-dev2
### Enhancements
@ -8,6 +8,8 @@
### Fixes
* **`partition_docx()` handles short table rows.** The DOCX format allows a table row to start late and/or end early, meaning cells at the beginning or end of a row can be omitted. While there are legitimate uses for this capability, using it in practice is relatively rare. However, it can happen unintentionally when adjusting cell borders with the mouse. Accommodate this case and generate accurate `.text` and `.metadata.text_as_html` for these tables.
## 0.13.6
### Enhancements

Binary file not shown.

View File

@ -16,7 +16,6 @@ from unstructured.documents.elements import (
Address,
CompositeElement,
Element,
ElementType,
Footer,
Header,
ListItem,
@ -132,6 +131,133 @@ class Describe_DocxPartitioner:
table = docx.Document(example_doc_path("docx-tables.docx")).tables[2]
assert " ".join(_DocxPartitioner()._iter_table_texts(table)) == "a b c d e"
def it_can_partition_tables_with_incomplete_rows(self):
    """Short table rows (late start and/or early end) partition accurately.

    DOCX lets a table row begin after the first grid column and/or stop before the last
    one, e.g. a row that starts in column 3, or whose last cell sits in column 5 of a
    7-column table. This is uncommon in the wild and (as far as I know) unique to DOCX.

    A practical example might look like this:

                   +------+------+
                   | East | West |
        +----------+------+------+
        | Started  |  25  |  32  |
        +----------+------+------+
        | Finished |  17  |  21  |
        +----------+------+------+

    Each table in the specimen document is checked for both its `.text` and its
    `.metadata.text_as_html`.
    """
    docx_path = example_doc_path("tables-with-incomplete-rows.docx")
    elements = iter(partition_docx(docx_path))

    def next_table():
        """Pull the next element from the stream; it must be a `Table`."""
        e = next(elements)
        assert type(e).__name__ == "Table"
        return e

    # -- the document opens with a paragraph that introduces the tables --
    intro = next(elements)
    assert intro.text.startswith("Example of DOCX table ")

    # -- baseline: a uniform 2 x 2 table --
    # ┌───┬───┐
    # │ a │ b │
    # ├───┼───┤
    # │ c │ d │
    # └───┴───┘
    e = next_table()
    assert e.text == "a b c d"
    assert e.metadata.text_as_html == (
        "<table>\n"
        "<thead>\n"
        "<tr><th>a </th><th>b </th></tr>\n"
        "</thead>\n"
        "<tbody>\n"
        "<tr><td>c </td><td>d </td></tr>\n"
        "</tbody>\n"
        "</table>"
    )

    # -- first row ends early --
    # ┌───┐
    # │ a │
    # ├───┼───┐
    # │ b │ c │
    # └───┴───┘
    e = next_table()
    assert e.text == "a b c", f"actual {e.text=}"
    assert e.metadata.text_as_html == (
        "<table>\n"
        "<thead>\n"
        "<tr><th>a </th><th> </th></tr>\n"
        "</thead>\n"
        "<tbody>\n"
        "<tr><td>b </td><td>c </td></tr>\n"
        "</tbody>\n"
        "</table>"
    ), f"actual {e.metadata.text_as_html=}"

    # -- first row is a horizontal span that ends early --
    # ┌───────┐
    # │   a   │
    # ├───┬───┼───┐
    # │ b │ c │ d │
    # └───┴───┴───┘
    e = next_table()
    assert e.text == "a b c d", f"actual {e.text=}"
    assert e.metadata.text_as_html == (
        "<table>\n"
        "<thead>\n"
        "<tr><th>a </th><th>a </th><th> </th></tr>\n"
        "</thead>\n"
        "<tbody>\n"
        "<tr><td>b </td><td>c </td><td>d </td></tr>\n"
        "</tbody>\n"
        "</table>"
    ), f"actual {e.metadata.text_as_html=}"

    # -- vertical span in first column, first row ends early --
    # ┌───┬───┐
    # │   │ b │
    # │ a ├───┼───┐
    # │   │ c │ d │
    # └───┴───┴───┘
    e = next_table()
    assert e.text == "a b c d", f"actual {e.text=}"
    assert e.metadata.text_as_html == (
        "<table>\n"
        "<thead>\n"
        "<tr><th>a </th><th>b </th><th> </th></tr>\n"
        "</thead>\n"
        "<tbody>\n"
        "<tr><td>a </td><td>c </td><td>d </td></tr>\n"
        "</tbody>\n"
        "</table>"
    ), f"actual {e.metadata.text_as_html=}"

    # -- late-start, early-end, and >2 rows vertical span --
    # ┌───────┬───┬───┐
    # │   a   │ b │ c │
    # └───┬───┴───┼───┘
    #     │   d   │
    # ┌───┤       ├───┐
    # │ e │       │ f │
    # └───┤       ├───┘
    #     │       │
    #     └───────┘
    e = next_table()
    assert e.text == "a b c d e f", f"actual {e.text=}"
    assert e.metadata.text_as_html == (
        "<table>\n"
        "<thead>\n"
        "<tr><th>a </th><th>a </th><th>b </th><th>c </th></tr>\n"
        "</thead>\n"
        "<tbody>\n"
        "<tr><td> </td><td>d </td><td>d </td><td> </td></tr>\n"
        "<tr><td>e </td><td>d </td><td>d </td><td>f </td></tr>\n"
        "<tr><td> </td><td>d </td><td>d </td><td> </td></tr>\n"
        "</tbody>\n"
        "</table>"
    ), f"actual {e.metadata.text_as_html=}"

    # -- The table from the specimen file we received with the bug report. --
    e = next_table()
    assert e.text == "Data More Dato WTF? Strange Format", f"actual {e.text=}"
    assert e.metadata.text_as_html == (
        "<table>\n"
        "<thead>\n"
        "<tr><th>Data </th><th>Data </th><th> </th></tr>\n"
        "</thead>\n"
        "<tbody>\n"
        "<tr><td>Data </td><td>Data </td><td> </td></tr>\n"
        "<tr><td>Data </td><td>Data </td><td> </td></tr>\n"
        "<tr><td> </td><td>More </td><td> </td></tr>\n"
        "<tr><td>Dato </td><td> </td><td> </td></tr>\n"
        "<tr><td>WTF? </td><td>WTF? </td><td> </td></tr>\n"
        "<tr><td>Strange</td><td>Strange</td><td> </td></tr>\n"
        "<tr><td> </td><td>Format </td><td>Format</td></tr>\n"
        "</tbody>\n"
        "</table>"
    ), f"actual {e.metadata.text_as_html=}"
# -- page-break behaviors --------------------------------------------------------------------
def it_places_page_breaks_precisely_where_they_occur(self):
@ -299,11 +425,7 @@ def test_parition_docx_from_team_chat():
"0:0:3.270 --> 0:0:4.250\nJames Bond\nUmm.",
"saved-by Dennis Forsythe",
]
assert [e.category for e in elements] == [
ElementType.UNCATEGORIZED_TEXT,
ElementType.UNCATEGORIZED_TEXT,
ElementType.TABLE,
]
assert [type(e) for e in elements] == [Text, Text, Table]
@pytest.mark.parametrize("infer_table_structure", [True, False])
@ -687,7 +809,7 @@ def test_partition_docx_raises_TypeError_for_invalid_languages():
filename = "example-docs/handbook-1p.docx"
partition_docx(
filename=filename,
languages="eng", # pyright: ignore[reportGeneralTypeIssues]
languages="eng", # pyright: ignore[reportArgumentType]
)

View File

@ -1,3 +0,0 @@
from docx.api import Document
__all__ = ["Document"]

View File

@ -1,5 +0,0 @@
from typing import IO, Optional, Union
import docx.document
def Document(docx: Optional[Union[str, IO[bytes]]] = None) -> docx.document.Document: ...

View File

@ -1,13 +0,0 @@
from typing import Iterator, Sequence
from docx.oxml.xmlchemy import BaseOxmlElement
from docx.table import Table
from docx.text.paragraph import Paragraph
class BlockItemContainer:
_element: BaseOxmlElement
def iter_inner_content(self) -> Iterator[Paragraph | Table]: ...
@property
def paragraphs(self) -> Sequence[Paragraph]: ...
@property
def tables(self) -> Sequence[Table]: ...

View File

@ -1,28 +0,0 @@
from typing import IO, Iterator, List
from docx.oxml.document import CT_Document
from docx.section import Sections
from docx.settings import Settings
from docx.shared import ElementProxy
from docx.styles.style import ParagraphStyle
from docx.table import Table
from docx.text.paragraph import Paragraph
class Document(ElementProxy):
def add_paragraph(
self,
text: str = "",
style: ParagraphStyle | str | None = None,
) -> Paragraph: ...
@property
def element(self) -> CT_Document: ...
def iter_inner_content(self) -> Iterator[Paragraph | Table]: ...
@property
def paragraphs(self) -> List[Paragraph]: ...
@property
def tables(self) -> List[Table]: ...
def save(self, path_or_stream: str | IO[bytes]) -> None: ...
@property
def sections(self) -> Sections: ...
@property
def settings(self) -> Settings: ...

View File

@ -1 +0,0 @@
class Drawing: ...

View File

@ -1,11 +0,0 @@
import enum
class WD_SECTION_START(enum.Enum):
CONTINUOUS: enum.Enum
EVEN_PAGE: enum.Enum
NEW_COLUMN: enum.Enum
NEW_PAGE: enum.Enum
ODD_PAGE: enum.Enum
# -- alias --
WD_SECTION = WD_SECTION_START

View File

@ -1,7 +0,0 @@
# pyright: reportPrivateUsage=false
from typing import Union
from lxml import etree
def parse_xml(xml: Union[str, bytes]) -> etree._Element: ...

View File

@ -1,10 +0,0 @@
from typing import Iterator
from docx.oxml.xmlchemy import BaseOxmlElement
class CT_Body(BaseOxmlElement):
def __iter__(self) -> Iterator[BaseOxmlElement]: ...
class CT_Document(BaseOxmlElement):
@property
def body(self) -> CT_Body: ...

View File

@ -1,5 +0,0 @@
from typing import Dict
nsmap: Dict[str, str]
def qn(tag: str) -> str: ...

View File

@ -1,7 +0,0 @@
from typing import Optional
from docx.oxml.xmlchemy import BaseOxmlElement
class CT_SectPr(BaseOxmlElement):
@property
def preceding_sectPr(self) -> Optional[CT_SectPr]: ...

View File

@ -1,16 +0,0 @@
"""Table-related XML element-types."""
from __future__ import annotations
from typing import List
from docx.oxml.xmlchemy import BaseOxmlElement
class CT_Row(BaseOxmlElement):
tc_lst: List[CT_Tc]
class CT_Tc(BaseOxmlElement):
@property
def vMerge(self) -> str | None: ...
class CT_Tbl(BaseOxmlElement): ...

View File

@ -1,9 +0,0 @@
from typing import List
from docx.oxml.text.run import CT_R
from docx.oxml.xmlchemy import BaseOxmlElement
class CT_Hyperlink(BaseOxmlElement):
address: str
@property
def r_lst(self) -> List[CT_R]: ...

View File

@ -1,3 +0,0 @@
from docx.oxml.xmlchemy import BaseOxmlElement
class CT_LastRenderedPageBreak(BaseOxmlElement): ...

View File

@ -1,3 +0,0 @@
from docx.oxml.xmlchemy import BaseOxmlElement
class CT_P(BaseOxmlElement): ...

View File

@ -1,4 +0,0 @@
from docx.oxml.xmlchemy import BaseOxmlElement
class CT_PPr(BaseOxmlElement): ...
class CT_TabStop(BaseOxmlElement): ...

View File

@ -1,30 +0,0 @@
from docx.oxml.xmlchemy import BaseOxmlElement
class CT_Br(BaseOxmlElement):
type: str | None
clear: str | None
@property
def text(self) -> str: ...
class CT_Cr(BaseOxmlElement):
@property
def text(self) -> str: ...
class CT_NoBreakHyphen(BaseOxmlElement):
@property
def text(self) -> str: ...
class CT_PTab(BaseOxmlElement):
@property
def text(self) -> str: ...
class CT_R(BaseOxmlElement):
text: str
class CT_Tab(BaseOxmlElement):
@property
def text(self) -> str: ...
class CT_Text(BaseOxmlElement):
@property
def text(self) -> str: ...

View File

@ -1,17 +0,0 @@
from typing import Any, Iterator
from lxml import etree
class BaseOxmlElement(etree.ElementBase):
def __iter__(self) -> Iterator[BaseOxmlElement]: ...
@property
def xml(self) -> str: ...
def xpath(self, xpath_str: str) -> Any:
"""Return type is typically Sequence[ElementBase], but ...
lxml.etree.XPath has many possible return types including bool, (a "smart") str,
float. The return type can also be a list containing ElementBase, comments,
processing instructions, str, and tuple. So you need to cast the result based on
the XPath expression you use.
"""
...

View File

@ -1,36 +0,0 @@
from typing import Iterator, Sequence
from docx.blkcntnr import BlockItemContainer
from docx.enum.section import WD_SECTION
from docx.oxml.section import CT_SectPr
from docx.table import Table
from docx.text.paragraph import Paragraph
class Section:
_sectPr: CT_SectPr
@property
def different_first_page_header_footer(self) -> bool: ...
@property
def even_page_footer(self) -> _Footer: ...
@property
def even_page_header(self) -> _Header: ...
@property
def first_page_footer(self) -> _Footer: ...
@property
def first_page_header(self) -> _Header: ...
@property
def footer(self) -> _Footer: ...
@property
def header(self) -> _Header: ...
def iter_inner_content(self) -> Iterator[Paragraph | Table]: ...
@property
def start_type(self) -> WD_SECTION: ...
class Sections(Sequence[Section]): ...
class _BaseHeaderFooter(BlockItemContainer):
@property
def is_linked_to_previous(self) -> bool: ...
class _Footer(_BaseHeaderFooter): ...
class _Header(_BaseHeaderFooter): ...

View File

@ -1,5 +0,0 @@
from docx.shared import ElementProxy
class Settings(ElementProxy):
@property
def odd_and_even_pages_header_footer(self) -> bool: ...

View File

@ -1,16 +0,0 @@
from typing import Any, Callable, Generic, TypeVar
from docx.oxml.xmlchemy import BaseOxmlElement
_T = TypeVar("_T")
class lazyproperty(Generic[_T]):
def __init__(self, fget: Callable[..., _T]) -> None: ...
def __get__(self, obj: Any, type: Any = None) -> _T: ...
def __set__(self, obj: Any, value: Any) -> None: ...
class ElementProxy:
@property
def element(self) -> BaseOxmlElement: ...
class Parented: ...

View File

@ -1,8 +0,0 @@
from typing import Optional
class BaseStyle:
@property
def name(self) -> Optional[str]: ...
class CharacterStyle(BaseStyle): ...
class ParagraphStyle(CharacterStyle): ...

View File

@ -1,27 +0,0 @@
"""Table-related docx proxy-objects."""
from __future__ import annotations
from typing import Sequence
from docx.blkcntnr import BlockItemContainer
from docx.oxml.table import CT_Row, CT_Tbl, CT_Tc
from docx.shared import Parented
class _Cell(BlockItemContainer):
_tc: CT_Tc
def __init__(self, tc: CT_Tc, parent: Parented) -> None: ...
@property
def text(self) -> str: ...
class _Row(Parented):
_tr: CT_Row
@property
def cells(self) -> Sequence[_Cell]: ...
class _Rows(Sequence[_Row]): ...
class Table(Parented):
def __init__(self, tbl: CT_Tbl, parent: BlockItemContainer) -> None: ...
@property
def rows(self) -> _Rows: ...

View File

@ -1,10 +0,0 @@
from docx.oxml.text.hyperlink import CT_Hyperlink
from docx.shared import Parented
from docx.text.paragraph import Paragraph
class Hyperlink(Parented):
_element: CT_Hyperlink
_r: CT_Hyperlink
text: str
url: str
def __init__(self, hyperlink: CT_Hyperlink, parent: Paragraph) -> None: ...

View File

@ -1,12 +0,0 @@
from typing import Any
from docx.oxml.text.pagebreak import CT_LastRenderedPageBreak
from docx.shared import Parented
from docx.text.paragraph import Paragraph
class RenderedPageBreak(Parented):
def __init__(self, lastRenderedPageBreak: CT_LastRenderedPageBreak, parent: Any) -> None: ...
@property
def preceding_paragraph_fragment(self) -> Paragraph | None: ...
@property
def following_paragraph_fragment(self) -> Paragraph | None: ...

View File

@ -1,26 +0,0 @@
from typing import Iterator, List, Sequence
from docx.blkcntnr import BlockItemContainer
from docx.oxml.text.paragraph import CT_P
from docx.oxml.xmlchemy import BaseOxmlElement
from docx.styles.style import ParagraphStyle
from docx.text.hyperlink import Hyperlink
from docx.text.pagebreak import RenderedPageBreak
from docx.text.run import Run
class Paragraph(BlockItemContainer):
_p: CT_P
_parent: BlockItemContainer
text: str
def __init__(self, p: BaseOxmlElement, parent: BlockItemContainer) -> None: ...
@property
def contains_page_break(self) -> bool: ...
@property
def hyperlinks(self) -> List[Hyperlink]: ...
def iter_inner_content(self) -> Iterator[Run | Hyperlink]: ...
@property
def rendered_page_breaks(self) -> List[RenderedPageBreak]: ...
@property
def runs(self) -> Sequence[Run]: ...
@property
def style(self) -> ParagraphStyle | None: ...

View File

@ -1,14 +0,0 @@
from docx.oxml.text.run import CT_R
from docx.shared import Parented
from docx.text.paragraph import Paragraph
class Run(Parented):
_element: CT_R
_r: CT_R
def __init__(self, r: CT_R, parent: Paragraph) -> None: ...
@property
def bold(self) -> bool: ...
@property
def italic(self) -> bool: ...
@property
def text(self) -> str: ...

View File

@ -1 +1 @@
__version__ = "0.13.7-dev1" # pragma: no cover
__version__ = "0.13.7-dev2" # pragma: no cover

View File

@ -415,6 +415,10 @@ class _DocxPartitioner:
"""
def iter_cell_block_items(cell: _Cell) -> Iterator[str]:
"""Generate the text of each paragraph or table in `cell` as a separate string.
A table nested in `cell` is converted to HTML and emitted as that string.
"""
for block_item in cell.iter_inner_content():
if isinstance(block_item, Paragraph):
# -- all docx content is ultimately in a paragraph; a nested table contributes
@ -425,11 +429,26 @@ class _DocxPartitioner:
):
yield self._convert_table_to_html(block_item, is_nested=True)
def iter_cells(row: _Row) -> Iterator[str]:
return ("\n".join(iter_cell_block_items(cell)) for cell in row.cells)
def iter_row_cells_as_text(row: _Row) -> Iterator[str]:
    """Generate one string per grid column of `row`, left to right.

    Every cell actually present in the row contributes the text of its block items
    (paragraphs, plus any nested table already rendered as HTML) joined by newlines.
    Grid columns the row skips, either before its first cell or after its last one,
    each contribute an empty string so that every row lines up on the same columns.
    """
    # -- placeholders for cells omitted at the start of the row (pretty rare) --
    yield from ("" for _ in range(row.grid_cols_before))
    # -- the cells that are actually present --
    yield from ("\n".join(iter_cell_block_items(cell)) for cell in row.cells)
    # -- placeholders for cells omitted at the end of the row (also rare) --
    yield from ("" for _ in range(row.grid_cols_after))
return tabulate(
[list(iter_cells(row)) for row in table.rows],
[list(iter_row_cells_as_text(row)) for row in table.rows],
headers=[] if is_nested else "firstrow",
# -- tabulate isn't really designed for recursive tables so we have to do any
# -- HTML-escaping for ourselves. `unsafehtml` disables tabulate html-escaping of cell
@ -739,7 +758,7 @@ class _DocxPartitioner:
if tc.vMerge == "continue":
continue
# -- do not generate empty strings --
yield from (text for text in iter_cell_texts(_Cell(tc, row)) if text)
yield from (text for text in iter_cell_texts(_Cell(tc, table)) if text)
@lazyproperty
def _last_modified(self) -> Optional[str]:
@ -846,7 +865,7 @@ class _DocxPartitioner:
# Determine category depth from paragraph ilvl xpath
xpath = paragraph._element.xpath("./w:pPr/w:numPr/w:ilvl/@w:val")
if xpath:
return int(xpath[0])
return round(float(xpath[0]))
# Determine category depth from style name
style_name = (paragraph.style and paragraph.style.name) or "Normal"