rfctr(docx): DOCX emits std minified .text_as_html (#3545)

**Summary** Eliminate historical "idiosyncrasies" of `table.metadata.text_as_html` HTML introduced by `partition_docx()`. Produce minified `.text_as_html` consistent with that formed by chunking. **Additional Context** - nested tables appear as their extracted text in the parent cell (no nested `<table>` elements in `.text_as_html`). - DOCX `.text_as_html` is minified (no extra whitespace or thead, tbody, tfoot elements).
2025-11-01 10:33:09 +00:00 · 2024-08-21 11:54:21 -07:00 · 2024-08-21 11:54:21 -07:00 · 03e0ed3519
commit 03e0ed3519
parent f135344738
4 changed files with 56 additions and 95 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,4 +1,4 @@
-## 0.15.8-dev1
+## 0.15.8-dev2

 ### Enhancements

@ -7,6 +7,7 @@
 ### Fixes

 * **Fix NLTK data download path to prevent nested directories**. Resolved an issue where a nested "nltk_data" directory was created within the parent "nltk_data" directory when it already existed. This fix prevents errors in checking for existing downloads and loading models from NLTK data.
+* **Minify text_as_html from DOCX.** Previously `.metadata.text_as_html` for DOCX tables was "bloated" with whitespace and noise elements introduced by `tabulate` that produced over-chunking and lower "semantic density" of elements. Reduce HTML to minimum character count while preserving all text.

 ## 0.15.6

--- a/test_unstructured/partition/test_docx.py
+++ b/test_unstructured/partition/test_docx.py
@ -140,13 +140,9 @@ def test_partition_docx_processes_table():
    assert isinstance(elements[0], Table)
    assert elements[0].text == ("Header Col 1 Header Col 2 Lorem ipsum A Link example")
    assert elements[0].metadata.text_as_html == (
-        "<table>\n"
-        "<thead>\n"
-        "<tr><th>Header Col 1   </th><th>Header Col 2  </th></tr>\n"
-        "</thead>\n"
-        "<tbody>\n"
-        "<tr><td>Lorem ipsum    </td><td>A Link example</td></tr>\n"
-        "</tbody>\n"
+        "<table>"
+        "<tr><td>Header Col 1</td><td>Header Col 2</td></tr>"
+        "<tr><td>Lorem ipsum</td><td>A Link example</td></tr>"
        "</table>"
    )
    assert elements[0].metadata.filename == "fake_table.docx"
@ -1086,13 +1082,9 @@ class Describe_DocxPartitioner:
        table = docx.Document(example_doc_path("docx-tables.docx")).tables[0]

        assert _DocxPartitioner(opts)._convert_table_to_html(table) == (
-            "<table>\n"
-            "<thead>\n"
-            "<tr><th>Header Col 1  </th><th>Header Col 2  </th></tr>\n"
-            "</thead>\n"
-            "<tbody>\n"
-            "<tr><td>Lorem ipsum   </td><td>A link example</td></tr>\n"
-            "</tbody>\n"
+            "<table>"
+            "<tr><td>Header Col 1</td><td>Header Col 2</td></tr>"
+            "<tr><td>Lorem ipsum</td><td>A link example</td></tr>"
            "</table>"
        )

@ -1118,25 +1110,13 @@ class Describe_DocxPartitioner:
        # -- re.sub() strips out the extra padding inserted by tabulate --
        html = re.sub(r" +<", "<", _DocxPartitioner(opts)._convert_table_to_html(table))

-        expected_lines = [
-            "<table>",
-            "<thead>",
-            "<tr><th>a</th><th>&gt;b&lt;</th><th>c</th></tr>",
-            "</thead>",
-            "<tbody>",
-            "<tr><td>d</td><td><table>",
-            "<tbody>",
-            "<tr><td>e</td><td>f</td></tr>",
-            "<tr><td>g&amp;t</td><td>h</td></tr>",
-            "</tbody>",
-            "</table></td><td>i</td></tr>",
-            "<tr><td>j</td><td>k</td><td>l</td></tr>",
-            "</tbody>",
-            "</table>",
-        ]
-        actual_lines = html.splitlines()
-        for expected, actual in zip(expected_lines, actual_lines):
-            assert actual == expected, f"\nexpected: {repr(expected)}\nactual:   {repr(actual)}"
+        assert html == (
+            "<table>"
+            "<tr><td>a</td><td>&gt;b&lt;</td><td>c</td></tr>"
+            "<tr><td>d</td><td>e f g&amp;t h</td><td>i</td></tr>"
+            "<tr><td>j</td><td>k</td><td>l</td></tr>"
+            "</table>"
+        )

    def it_can_convert_a_table_to_plain_text(self, opts_args: dict[str, Any]):
        opts = DocxPartitionerOptions(**opts_args)
@ -1216,10 +1196,7 @@ class Describe_DocxPartitioner:
        assert type(e).__name__ == "Table"
        assert e.text == "a b c d"
        assert e.metadata.text_as_html == (
-            "<table>\n"
-            "<thead>\n<tr><th>a  </th><th>b  </th></tr>\n</thead>\n"
-            "<tbody>\n<tr><td>c  </td><td>d  </td></tr>\n</tbody>\n"
-            "</table>"
+            "<table><tr><td>a</td><td>b</td></tr><tr><td>c</td><td>d</td></tr></table>"
        )
        # --
        # ┌───┐
@ -1231,10 +1208,7 @@ class Describe_DocxPartitioner:
        assert type(e).__name__ == "Table"
        assert e.text == "a b c", f"actual {e.text=}"
        assert e.metadata.text_as_html == (
-            "<table>\n"
-            "<thead>\n<tr><th>a  </th><th>  </th></tr>\n</thead>\n"
-            "<tbody>\n<tr><td>b  </td><td>c </td></tr>\n</tbody>\n"
-            "</table>"
+            "<table><tr><td>a</td><td/></tr><tr><td>b</td><td>c</td></tr></table>"
        ), f"actual {e.metadata.text_as_html=}"
        # --
        # ┌───────┐
@ -1246,9 +1220,9 @@ class Describe_DocxPartitioner:
        assert type(e).__name__ == "Table"
        assert e.text == "a b c d", f"actual {e.text=}"
        assert e.metadata.text_as_html == (
-            "<table>\n"
-            "<thead>\n<tr><th>a  </th><th>a  </th><th>  </th></tr>\n</thead>\n"
-            "<tbody>\n<tr><td>b  </td><td>c  </td><td>d </td></tr>\n</tbody>\n"
+            "<table>"
+            "<tr><td>a</td><td>a</td><td/></tr>"
+            "<tr><td>b</td><td>c</td><td>d</td></tr>"
            "</table>"
        ), f"actual {e.metadata.text_as_html=}"
        # --
@ -1261,9 +1235,9 @@ class Describe_DocxPartitioner:
        assert type(e).__name__ == "Table"
        assert e.text == "a b c d", f"actual {e.text=}"
        assert e.metadata.text_as_html == (
-            "<table>\n"
-            "<thead>\n<tr><th>a  </th><th>b  </th><th>  </th></tr>\n</thead>\n"
-            "<tbody>\n<tr><td>a  </td><td>c  </td><td>d </td></tr>\n</tbody>\n"
+            "<table>"
+            "<tr><td>a</td><td>b</td><td/></tr>"
+            "<tr><td>a</td><td>c</td><td>d</td></tr>"
            "</table>"
        ), f"actual {e.metadata.text_as_html=}"
        # -- late-start, early-end, and >2 rows vertical span --
@ -1280,14 +1254,11 @@ class Describe_DocxPartitioner:
        assert type(e).__name__ == "Table"
        assert e.text == "a b c d e f", f"actual {e.text=}"
        assert e.metadata.text_as_html == (
-            "<table>\n"
-            "<thead>\n"
-            "<tr><th>a  </th><th>a  </th><th>b  </th><th>c  </th></tr>\n"
-            "</thead>\n<tbody>\n"
-            "<tr><td>   </td><td>d  </td><td>d  </td><td>   </td></tr>\n"
-            "<tr><td>e  </td><td>d  </td><td>d  </td><td>f  </td></tr>\n"
-            "<tr><td>   </td><td>d  </td><td>d  </td><td>   </td></tr>\n"
-            "</tbody>\n"
+            "<table>"
+            "<tr><td>a</td><td>a</td><td>b</td><td>c</td></tr>"
+            "<tr><td/><td>d</td><td>d</td><td/></tr>"
+            "<tr><td>e</td><td>d</td><td>d</td><td>f</td></tr>"
+            "<tr><td/><td>d</td><td>d</td><td/></tr>"
            "</table>"
        ), f"actual {e.metadata.text_as_html=}"
        # --
@ -1296,19 +1267,15 @@ class Describe_DocxPartitioner:
        assert type(e).__name__ == "Table"
        assert e.text == "Data More Dato WTF? Strange Format", f"actual {e.text=}"
        assert e.metadata.text_as_html == (
-            "<table>\n"
-            "<thead>\n"
-            "<tr><th>Data   </th><th>Data   </th><th>      </th></tr>\n"
-            "</thead>\n"
-            "<tbody>\n"
-            "<tr><td>Data   </td><td>Data   </td><td>      </td></tr>\n"
-            "<tr><td>Data   </td><td>Data   </td><td>      </td></tr>\n"
-            "<tr><td>       </td><td>More   </td><td>      </td></tr>\n"
-            "<tr><td>Dato   </td><td>       </td><td>      </td></tr>\n"
-            "<tr><td>WTF?   </td><td>WTF?   </td><td>      </td></tr>\n"
-            "<tr><td>Strange</td><td>Strange</td><td>      </td></tr>\n"
-            "<tr><td>       </td><td>Format </td><td>Format</td></tr>\n"
-            "</tbody>\n"
+            "<table>"
+            "<tr><td>Data</td><td>Data</td><td/></tr>"
+            "<tr><td>Data</td><td>Data</td><td/></tr>"
+            "<tr><td>Data</td><td>Data</td><td/></tr>"
+            "<tr><td/><td>More</td><td/></tr>"
+            "<tr><td>Dato</td><td/></tr>"
+            "<tr><td>WTF?</td><td>WTF?</td><td/></tr>"
+            "<tr><td>Strange</td><td>Strange</td><td/></tr>"
+            "<tr><td/><td>Format</td><td>Format</td></tr>"
            "</table>"
        ), f"actual {e.metadata.text_as_html=}"

--- a/unstructured/version.py
+++ b/unstructured/version.py
@ -1 +1 @@
-__version__ = "0.15.8-dev1"  # pragma: no cover
+__version__ = "0.15.8-dev2"  # pragma: no cover
--- a/unstructured/partition/docx.py
+++ b/unstructured/partition/docx.py
@ -2,7 +2,6 @@

 from __future__ import annotations

-import html
 import io
 import itertools
 import os
@ -23,11 +22,11 @@ from docx.text.hyperlink import Hyperlink
 from docx.text.pagebreak import RenderedPageBreak
 from docx.text.paragraph import Paragraph
 from docx.text.run import Run
-from tabulate import tabulate
 from typing_extensions import TypeAlias

 from unstructured.chunking import add_chunking_strategy
 from unstructured.cleaners.core import clean_bullets
+from unstructured.common.html_table import htmlify_matrix_of_cell_texts
 from unstructured.documents.elements import (
    Address,
    Element,
@ -498,7 +497,7 @@ class _DocxPartitioner:
        # NOTE(scanny) - if all that fails we give it the default `Text` element-type
        yield Text(text, metadata=metadata, detection_origin=DETECTION_ORIGIN)

-    def _convert_table_to_html(self, table: DocxTable, is_nested: bool = False) -> str:
+    def _convert_table_to_html(self, table: DocxTable) -> str:
        """HTML string version of `table`.

        Example:
@ -520,44 +519,38 @@ class _DocxPartitioner:
        def iter_cell_block_items(cell: _Cell) -> Iterator[str]:
            """Generate the text of each paragraph or table in `cell` as a separate string.

-            A table nested in `cell` is converted to HTML and emitted as that string.
+            A table nested in `cell` is converted to the normalized text it contains.
            """
            for block_item in cell.iter_inner_content():
-                if isinstance(block_item, Paragraph):
+                if isinstance(paragraph := block_item, Paragraph):
                    # -- all docx content is ultimately in a paragraph; a nested table contributes
                    # -- structure only
-                    yield f"{html.escape(block_item.text)}"
-                elif isinstance(  # pyright: ignore[reportUnnecessaryIsInstance]
-                    block_item, DocxTable
-                ):
-                    yield self._convert_table_to_html(block_item, is_nested=True)
+                    yield paragraph.text
+                elif isinstance(table := block_item, DocxTable):
+                    for row in table.rows:
+                        yield from iter_row_cells_as_text(row)

        def iter_row_cells_as_text(row: _Row) -> Iterator[str]:
-            """Generate the text of each cell in `row` as a separate string.
+            """Generate the normalized text of each cell in `row` as a separate string.

-            The text of each paragraph within a cell is separated from the next by a newline
-            (`"\n"`). A table nested in a cell is first converted to HTML and then included as a
-            string, also separated by a newline.
+            The text of each paragraph within a cell is not separated. A table nested in a cell is
+            converted to a normalized string of its contents and combined with the text of the
+            cell that contains the table.
            """
-            # -- each omitted cell at the start of the row (pretty rare) gets the empty string --
+            # -- Each omitted cell at the start of the row (pretty rare) gets the empty string.
+            # -- This preserves column alignment when one or more initial cells are omitted.
            for _ in range(row.grid_cols_before):
                yield ""

            for cell in row.cells:
-                yield "\n".join(iter_cell_block_items(cell))
+                cell_text = " ".join(iter_cell_block_items(cell))
+                yield " ".join(cell_text.split())

-            # -- each omitted cell at the end of the row (also rare) gets the empty string --
+            # -- Each omitted cell at the end of the row (also rare) gets the empty string. --
            for _ in range(row.grid_cols_after):
                yield ""

-        return tabulate(
-            [list(iter_row_cells_as_text(row)) for row in table.rows],
-            headers=[] if is_nested else "firstrow",
-            # -- tabulate isn't really designed for recursive tables so we have to do any
-            # -- HTML-escaping for ourselves. `unsafehtml` disables tabulate html-escaping of cell
-            # -- contents.
-            tablefmt="unsafehtml",
-        )
+        return htmlify_matrix_of_cell_texts([list(iter_row_cells_as_text(r)) for r in table.rows])

    @lazyproperty
    def _document(self) -> Document: