rfctr(docx): DOCX emits std minified .text_as_html (#3545)

**Summary**
Eliminate historical "idiosyncrasies" of `table.metadata.text_as_html`
HTML introduced by `partition_docx()`. Produce minified `.text_as_html`
consistent with that formed by chunking.

**Additional Context**
- nested tables appear as their extracted text in the parent cell (no
nested `<table>` elements in `.text_as_html`).
- DOCX `.text_as_html` is minified (no extra whitespace or thead, tbody,
tfoot elements).
This commit is contained in:
Steve Canny 2024-08-21 11:54:21 -07:00 committed by GitHub
parent f135344738
commit 03e0ed3519
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 56 additions and 95 deletions

View File

@ -1,4 +1,4 @@
## 0.15.8-dev1 ## 0.15.8-dev2
### Enhancements ### Enhancements
@ -7,6 +7,7 @@
### Fixes ### Fixes
* **Fix NLTK data download path to prevent nested directories**. Resolved an issue where a nested "nltk_data" directory was created within the parent "nltk_data" directory when it already existed. This fix prevents errors in checking for existing downloads and loading models from NLTK data. * **Fix NLTK data download path to prevent nested directories**. Resolved an issue where a nested "nltk_data" directory was created within the parent "nltk_data" directory when it already existed. This fix prevents errors in checking for existing downloads and loading models from NLTK data.
* **Minify text_as_html from DOCX.** Previously `.metadata.text_as_html` for DOCX tables was "bloated" with whitespace and noise elements introduced by `tabulate` that produced over-chunking and lower "semantic density" of elements. Reduce HTML to minimum character count while preserving all text.
## 0.15.6 ## 0.15.6

View File

@ -140,13 +140,9 @@ def test_partition_docx_processes_table():
assert isinstance(elements[0], Table) assert isinstance(elements[0], Table)
assert elements[0].text == ("Header Col 1 Header Col 2 Lorem ipsum A Link example") assert elements[0].text == ("Header Col 1 Header Col 2 Lorem ipsum A Link example")
assert elements[0].metadata.text_as_html == ( assert elements[0].metadata.text_as_html == (
"<table>\n" "<table>"
"<thead>\n" "<tr><td>Header Col 1</td><td>Header Col 2</td></tr>"
"<tr><th>Header Col 1 </th><th>Header Col 2 </th></tr>\n" "<tr><td>Lorem ipsum</td><td>A Link example</td></tr>"
"</thead>\n"
"<tbody>\n"
"<tr><td>Lorem ipsum </td><td>A Link example</td></tr>\n"
"</tbody>\n"
"</table>" "</table>"
) )
assert elements[0].metadata.filename == "fake_table.docx" assert elements[0].metadata.filename == "fake_table.docx"
@ -1086,13 +1082,9 @@ class Describe_DocxPartitioner:
table = docx.Document(example_doc_path("docx-tables.docx")).tables[0] table = docx.Document(example_doc_path("docx-tables.docx")).tables[0]
assert _DocxPartitioner(opts)._convert_table_to_html(table) == ( assert _DocxPartitioner(opts)._convert_table_to_html(table) == (
"<table>\n" "<table>"
"<thead>\n" "<tr><td>Header Col 1</td><td>Header Col 2</td></tr>"
"<tr><th>Header Col 1 </th><th>Header Col 2 </th></tr>\n" "<tr><td>Lorem ipsum</td><td>A link example</td></tr>"
"</thead>\n"
"<tbody>\n"
"<tr><td>Lorem ipsum </td><td>A link example</td></tr>\n"
"</tbody>\n"
"</table>" "</table>"
) )
@ -1118,25 +1110,13 @@ class Describe_DocxPartitioner:
# -- re.sub() strips out the extra padding inserted by tabulate -- # -- re.sub() strips out the extra padding inserted by tabulate --
html = re.sub(r" +<", "<", _DocxPartitioner(opts)._convert_table_to_html(table)) html = re.sub(r" +<", "<", _DocxPartitioner(opts)._convert_table_to_html(table))
expected_lines = [ assert html == (
"<table>", "<table>"
"<thead>", "<tr><td>a</td><td>&gt;b&lt;</td><td>c</td></tr>"
"<tr><th>a</th><th>&gt;b&lt;</th><th>c</th></tr>", "<tr><td>d</td><td>e f g&amp;t h</td><td>i</td></tr>"
"</thead>", "<tr><td>j</td><td>k</td><td>l</td></tr>"
"<tbody>", "</table>"
"<tr><td>d</td><td><table>", )
"<tbody>",
"<tr><td>e</td><td>f</td></tr>",
"<tr><td>g&amp;t</td><td>h</td></tr>",
"</tbody>",
"</table></td><td>i</td></tr>",
"<tr><td>j</td><td>k</td><td>l</td></tr>",
"</tbody>",
"</table>",
]
actual_lines = html.splitlines()
for expected, actual in zip(expected_lines, actual_lines):
assert actual == expected, f"\nexpected: {repr(expected)}\nactual: {repr(actual)}"
def it_can_convert_a_table_to_plain_text(self, opts_args: dict[str, Any]): def it_can_convert_a_table_to_plain_text(self, opts_args: dict[str, Any]):
opts = DocxPartitionerOptions(**opts_args) opts = DocxPartitionerOptions(**opts_args)
@ -1216,10 +1196,7 @@ class Describe_DocxPartitioner:
assert type(e).__name__ == "Table" assert type(e).__name__ == "Table"
assert e.text == "a b c d" assert e.text == "a b c d"
assert e.metadata.text_as_html == ( assert e.metadata.text_as_html == (
"<table>\n" "<table><tr><td>a</td><td>b</td></tr><tr><td>c</td><td>d</td></tr></table>"
"<thead>\n<tr><th>a </th><th>b </th></tr>\n</thead>\n"
"<tbody>\n<tr><td>c </td><td>d </td></tr>\n</tbody>\n"
"</table>"
) )
# -- # --
# ┌───┐ # ┌───┐
@ -1231,10 +1208,7 @@ class Describe_DocxPartitioner:
assert type(e).__name__ == "Table" assert type(e).__name__ == "Table"
assert e.text == "a b c", f"actual {e.text=}" assert e.text == "a b c", f"actual {e.text=}"
assert e.metadata.text_as_html == ( assert e.metadata.text_as_html == (
"<table>\n" "<table><tr><td>a</td><td/></tr><tr><td>b</td><td>c</td></tr></table>"
"<thead>\n<tr><th>a </th><th> </th></tr>\n</thead>\n"
"<tbody>\n<tr><td>b </td><td>c </td></tr>\n</tbody>\n"
"</table>"
), f"actual {e.metadata.text_as_html=}" ), f"actual {e.metadata.text_as_html=}"
# -- # --
# ┌───────┐ # ┌───────┐
@ -1246,9 +1220,9 @@ class Describe_DocxPartitioner:
assert type(e).__name__ == "Table" assert type(e).__name__ == "Table"
assert e.text == "a b c d", f"actual {e.text=}" assert e.text == "a b c d", f"actual {e.text=}"
assert e.metadata.text_as_html == ( assert e.metadata.text_as_html == (
"<table>\n" "<table>"
"<thead>\n<tr><th>a </th><th>a </th><th> </th></tr>\n</thead>\n" "<tr><td>a</td><td>a</td><td/></tr>"
"<tbody>\n<tr><td>b </td><td>c </td><td>d </td></tr>\n</tbody>\n" "<tr><td>b</td><td>c</td><td>d</td></tr>"
"</table>" "</table>"
), f"actual {e.metadata.text_as_html=}" ), f"actual {e.metadata.text_as_html=}"
# -- # --
@ -1261,9 +1235,9 @@ class Describe_DocxPartitioner:
assert type(e).__name__ == "Table" assert type(e).__name__ == "Table"
assert e.text == "a b c d", f"actual {e.text=}" assert e.text == "a b c d", f"actual {e.text=}"
assert e.metadata.text_as_html == ( assert e.metadata.text_as_html == (
"<table>\n" "<table>"
"<thead>\n<tr><th>a </th><th>b </th><th> </th></tr>\n</thead>\n" "<tr><td>a</td><td>b</td><td/></tr>"
"<tbody>\n<tr><td>a </td><td>c </td><td>d </td></tr>\n</tbody>\n" "<tr><td>a</td><td>c</td><td>d</td></tr>"
"</table>" "</table>"
), f"actual {e.metadata.text_as_html=}" ), f"actual {e.metadata.text_as_html=}"
# -- late-start, early-end, and >2 rows vertical span -- # -- late-start, early-end, and >2 rows vertical span --
@ -1280,14 +1254,11 @@ class Describe_DocxPartitioner:
assert type(e).__name__ == "Table" assert type(e).__name__ == "Table"
assert e.text == "a b c d e f", f"actual {e.text=}" assert e.text == "a b c d e f", f"actual {e.text=}"
assert e.metadata.text_as_html == ( assert e.metadata.text_as_html == (
"<table>\n" "<table>"
"<thead>\n" "<tr><td>a</td><td>a</td><td>b</td><td>c</td></tr>"
"<tr><th>a </th><th>a </th><th>b </th><th>c </th></tr>\n" "<tr><td/><td>d</td><td>d</td><td/></tr>"
"</thead>\n<tbody>\n" "<tr><td>e</td><td>d</td><td>d</td><td>f</td></tr>"
"<tr><td> </td><td>d </td><td>d </td><td> </td></tr>\n" "<tr><td/><td>d</td><td>d</td><td/></tr>"
"<tr><td>e </td><td>d </td><td>d </td><td>f </td></tr>\n"
"<tr><td> </td><td>d </td><td>d </td><td> </td></tr>\n"
"</tbody>\n"
"</table>" "</table>"
), f"actual {e.metadata.text_as_html=}" ), f"actual {e.metadata.text_as_html=}"
# -- # --
@ -1296,19 +1267,15 @@ class Describe_DocxPartitioner:
assert type(e).__name__ == "Table" assert type(e).__name__ == "Table"
assert e.text == "Data More Dato WTF? Strange Format", f"actual {e.text=}" assert e.text == "Data More Dato WTF? Strange Format", f"actual {e.text=}"
assert e.metadata.text_as_html == ( assert e.metadata.text_as_html == (
"<table>\n" "<table>"
"<thead>\n" "<tr><td>Data</td><td>Data</td><td/></tr>"
"<tr><th>Data </th><th>Data </th><th> </th></tr>\n" "<tr><td>Data</td><td>Data</td><td/></tr>"
"</thead>\n" "<tr><td>Data</td><td>Data</td><td/></tr>"
"<tbody>\n" "<tr><td/><td>More</td><td/></tr>"
"<tr><td>Data </td><td>Data </td><td> </td></tr>\n" "<tr><td>Dato</td><td/></tr>"
"<tr><td>Data </td><td>Data </td><td> </td></tr>\n" "<tr><td>WTF?</td><td>WTF?</td><td/></tr>"
"<tr><td> </td><td>More </td><td> </td></tr>\n" "<tr><td>Strange</td><td>Strange</td><td/></tr>"
"<tr><td>Dato </td><td> </td><td> </td></tr>\n" "<tr><td/><td>Format</td><td>Format</td></tr>"
"<tr><td>WTF? </td><td>WTF? </td><td> </td></tr>\n"
"<tr><td>Strange</td><td>Strange</td><td> </td></tr>\n"
"<tr><td> </td><td>Format </td><td>Format</td></tr>\n"
"</tbody>\n"
"</table>" "</table>"
), f"actual {e.metadata.text_as_html=}" ), f"actual {e.metadata.text_as_html=}"

View File

@ -1 +1 @@
__version__ = "0.15.8-dev1" # pragma: no cover __version__ = "0.15.8-dev2" # pragma: no cover

View File

@ -2,7 +2,6 @@
from __future__ import annotations from __future__ import annotations
import html
import io import io
import itertools import itertools
import os import os
@ -23,11 +22,11 @@ from docx.text.hyperlink import Hyperlink
from docx.text.pagebreak import RenderedPageBreak from docx.text.pagebreak import RenderedPageBreak
from docx.text.paragraph import Paragraph from docx.text.paragraph import Paragraph
from docx.text.run import Run from docx.text.run import Run
from tabulate import tabulate
from typing_extensions import TypeAlias from typing_extensions import TypeAlias
from unstructured.chunking import add_chunking_strategy from unstructured.chunking import add_chunking_strategy
from unstructured.cleaners.core import clean_bullets from unstructured.cleaners.core import clean_bullets
from unstructured.common.html_table import htmlify_matrix_of_cell_texts
from unstructured.documents.elements import ( from unstructured.documents.elements import (
Address, Address,
Element, Element,
@ -498,7 +497,7 @@ class _DocxPartitioner:
# NOTE(scanny) - if all that fails we give it the default `Text` element-type # NOTE(scanny) - if all that fails we give it the default `Text` element-type
yield Text(text, metadata=metadata, detection_origin=DETECTION_ORIGIN) yield Text(text, metadata=metadata, detection_origin=DETECTION_ORIGIN)
def _convert_table_to_html(self, table: DocxTable, is_nested: bool = False) -> str: def _convert_table_to_html(self, table: DocxTable) -> str:
"""HTML string version of `table`. """HTML string version of `table`.
Example: Example:
@ -520,44 +519,38 @@ class _DocxPartitioner:
def iter_cell_block_items(cell: _Cell) -> Iterator[str]: def iter_cell_block_items(cell: _Cell) -> Iterator[str]:
"""Generate the text of each paragraph or table in `cell` as a separate string. """Generate the text of each paragraph or table in `cell` as a separate string.
A table nested in `cell` is converted to HTML and emitted as that string. A table nested in `cell` is converted to the normalized text it contains.
""" """
for block_item in cell.iter_inner_content(): for block_item in cell.iter_inner_content():
if isinstance(block_item, Paragraph): if isinstance(paragraph := block_item, Paragraph):
# -- all docx content is ultimately in a paragraph; a nested table contributes # -- all docx content is ultimately in a paragraph; a nested table contributes
# -- structure only # -- structure only
yield f"{html.escape(block_item.text)}" yield paragraph.text
elif isinstance( # pyright: ignore[reportUnnecessaryIsInstance] elif isinstance(table := block_item, DocxTable):
block_item, DocxTable for row in table.rows:
): yield from iter_row_cells_as_text(row)
yield self._convert_table_to_html(block_item, is_nested=True)
def iter_row_cells_as_text(row: _Row) -> Iterator[str]: def iter_row_cells_as_text(row: _Row) -> Iterator[str]:
"""Generate the text of each cell in `row` as a separate string. """Generate the normalized text of each cell in `row` as a separate string.
The text of each paragraph within a cell is separated from the next by a newline The text of each paragraph within a cell is not separated. A table nested in a cell is
(`"\n"`). A table nested in a cell is first converted to HTML and then included as a converted to a normalized string of its contents and combined with the text of the
string, also separated by a newline. cell that contains the table.
""" """
# -- each omitted cell at the start of the row (pretty rare) gets the empty string -- # -- Each omitted cell at the start of the row (pretty rare) gets the empty string.
# -- This preserves column alignment when one or more initial cells are omitted.
for _ in range(row.grid_cols_before): for _ in range(row.grid_cols_before):
yield "" yield ""
for cell in row.cells: for cell in row.cells:
yield "\n".join(iter_cell_block_items(cell)) cell_text = " ".join(iter_cell_block_items(cell))
yield " ".join(cell_text.split())
# -- each omitted cell at the end of the row (also rare) gets the empty string -- # -- Each omitted cell at the end of the row (also rare) gets the empty string. --
for _ in range(row.grid_cols_after): for _ in range(row.grid_cols_after):
yield "" yield ""
return tabulate( return htmlify_matrix_of_cell_texts([list(iter_row_cells_as_text(r)) for r in table.rows])
[list(iter_row_cells_as_text(row)) for row in table.rows],
headers=[] if is_nested else "firstrow",
# -- tabulate isn't really designed for recursive tables so we have to do any
# -- HTML-escaping for ourselves. `unsafehtml` disables tabulate html-escaping of cell
# -- contents.
tablefmt="unsafehtml",
)
@lazyproperty @lazyproperty
def _document(self) -> Document: def _document(self) -> Document: