mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-11-01 10:33:09 +00:00
rfctr(docx): DOCX emits std minified .text_as_html (#3545)
**Summary** Eliminate historical "idiosyncrasies" of `table.metadata.text_as_html` HTML introduced by `partition_docx()`. Produce minified `.text_as_html` consistent with that formed by chunking. **Additional Context** - nested tables appear as their extracted text in the parent cell (no nested `<table>` elements in `.text_as_html`). - DOCX `.text_as_html` is minified (no extra whitespace or thead, tbody, tfoot elements).
This commit is contained in:
parent
f135344738
commit
03e0ed3519
@ -1,4 +1,4 @@
|
||||
## 0.15.8-dev1
|
||||
## 0.15.8-dev2
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -7,6 +7,7 @@
|
||||
### Fixes
|
||||
|
||||
* **Fix NLTK data download path to prevent nested directories**. Resolved an issue where a nested "nltk_data" directory was created within the parent "nltk_data" directory when it already existed. This fix prevents errors in checking for existing downloads and loading models from NLTK data.
|
||||
* **Minify text_as_html from DOCX.** Previously `.metadata.text_as_html` for DOCX tables was "bloated" with whitespace and noise elements introduced by `tabulate` that produced over-chunking and lower "semantic density" of elements. Reduce HTML to minimum character count while preserving all text.
|
||||
|
||||
## 0.15.6
|
||||
|
||||
|
||||
@ -140,13 +140,9 @@ def test_partition_docx_processes_table():
|
||||
assert isinstance(elements[0], Table)
|
||||
assert elements[0].text == ("Header Col 1 Header Col 2 Lorem ipsum A Link example")
|
||||
assert elements[0].metadata.text_as_html == (
|
||||
"<table>\n"
|
||||
"<thead>\n"
|
||||
"<tr><th>Header Col 1 </th><th>Header Col 2 </th></tr>\n"
|
||||
"</thead>\n"
|
||||
"<tbody>\n"
|
||||
"<tr><td>Lorem ipsum </td><td>A Link example</td></tr>\n"
|
||||
"</tbody>\n"
|
||||
"<table>"
|
||||
"<tr><td>Header Col 1</td><td>Header Col 2</td></tr>"
|
||||
"<tr><td>Lorem ipsum</td><td>A Link example</td></tr>"
|
||||
"</table>"
|
||||
)
|
||||
assert elements[0].metadata.filename == "fake_table.docx"
|
||||
@ -1086,13 +1082,9 @@ class Describe_DocxPartitioner:
|
||||
table = docx.Document(example_doc_path("docx-tables.docx")).tables[0]
|
||||
|
||||
assert _DocxPartitioner(opts)._convert_table_to_html(table) == (
|
||||
"<table>\n"
|
||||
"<thead>\n"
|
||||
"<tr><th>Header Col 1 </th><th>Header Col 2 </th></tr>\n"
|
||||
"</thead>\n"
|
||||
"<tbody>\n"
|
||||
"<tr><td>Lorem ipsum </td><td>A link example</td></tr>\n"
|
||||
"</tbody>\n"
|
||||
"<table>"
|
||||
"<tr><td>Header Col 1</td><td>Header Col 2</td></tr>"
|
||||
"<tr><td>Lorem ipsum</td><td>A link example</td></tr>"
|
||||
"</table>"
|
||||
)
|
||||
|
||||
@ -1118,25 +1110,13 @@ class Describe_DocxPartitioner:
|
||||
# -- re.sub() strips out the extra padding inserted by tabulate --
|
||||
html = re.sub(r" +<", "<", _DocxPartitioner(opts)._convert_table_to_html(table))
|
||||
|
||||
expected_lines = [
|
||||
"<table>",
|
||||
"<thead>",
|
||||
"<tr><th>a</th><th>>b<</th><th>c</th></tr>",
|
||||
"</thead>",
|
||||
"<tbody>",
|
||||
"<tr><td>d</td><td><table>",
|
||||
"<tbody>",
|
||||
"<tr><td>e</td><td>f</td></tr>",
|
||||
"<tr><td>g&t</td><td>h</td></tr>",
|
||||
"</tbody>",
|
||||
"</table></td><td>i</td></tr>",
|
||||
"<tr><td>j</td><td>k</td><td>l</td></tr>",
|
||||
"</tbody>",
|
||||
"</table>",
|
||||
]
|
||||
actual_lines = html.splitlines()
|
||||
for expected, actual in zip(expected_lines, actual_lines):
|
||||
assert actual == expected, f"\nexpected: {repr(expected)}\nactual: {repr(actual)}"
|
||||
assert html == (
|
||||
"<table>"
|
||||
"<tr><td>a</td><td>>b<</td><td>c</td></tr>"
|
||||
"<tr><td>d</td><td>e f g&t h</td><td>i</td></tr>"
|
||||
"<tr><td>j</td><td>k</td><td>l</td></tr>"
|
||||
"</table>"
|
||||
)
|
||||
|
||||
def it_can_convert_a_table_to_plain_text(self, opts_args: dict[str, Any]):
|
||||
opts = DocxPartitionerOptions(**opts_args)
|
||||
@ -1216,10 +1196,7 @@ class Describe_DocxPartitioner:
|
||||
assert type(e).__name__ == "Table"
|
||||
assert e.text == "a b c d"
|
||||
assert e.metadata.text_as_html == (
|
||||
"<table>\n"
|
||||
"<thead>\n<tr><th>a </th><th>b </th></tr>\n</thead>\n"
|
||||
"<tbody>\n<tr><td>c </td><td>d </td></tr>\n</tbody>\n"
|
||||
"</table>"
|
||||
"<table><tr><td>a</td><td>b</td></tr><tr><td>c</td><td>d</td></tr></table>"
|
||||
)
|
||||
# --
|
||||
# ┌───┐
|
||||
@ -1231,10 +1208,7 @@ class Describe_DocxPartitioner:
|
||||
assert type(e).__name__ == "Table"
|
||||
assert e.text == "a b c", f"actual {e.text=}"
|
||||
assert e.metadata.text_as_html == (
|
||||
"<table>\n"
|
||||
"<thead>\n<tr><th>a </th><th> </th></tr>\n</thead>\n"
|
||||
"<tbody>\n<tr><td>b </td><td>c </td></tr>\n</tbody>\n"
|
||||
"</table>"
|
||||
"<table><tr><td>a</td><td/></tr><tr><td>b</td><td>c</td></tr></table>"
|
||||
), f"actual {e.metadata.text_as_html=}"
|
||||
# --
|
||||
# ┌───────┐
|
||||
@ -1246,9 +1220,9 @@ class Describe_DocxPartitioner:
|
||||
assert type(e).__name__ == "Table"
|
||||
assert e.text == "a b c d", f"actual {e.text=}"
|
||||
assert e.metadata.text_as_html == (
|
||||
"<table>\n"
|
||||
"<thead>\n<tr><th>a </th><th>a </th><th> </th></tr>\n</thead>\n"
|
||||
"<tbody>\n<tr><td>b </td><td>c </td><td>d </td></tr>\n</tbody>\n"
|
||||
"<table>"
|
||||
"<tr><td>a</td><td>a</td><td/></tr>"
|
||||
"<tr><td>b</td><td>c</td><td>d</td></tr>"
|
||||
"</table>"
|
||||
), f"actual {e.metadata.text_as_html=}"
|
||||
# --
|
||||
@ -1261,9 +1235,9 @@ class Describe_DocxPartitioner:
|
||||
assert type(e).__name__ == "Table"
|
||||
assert e.text == "a b c d", f"actual {e.text=}"
|
||||
assert e.metadata.text_as_html == (
|
||||
"<table>\n"
|
||||
"<thead>\n<tr><th>a </th><th>b </th><th> </th></tr>\n</thead>\n"
|
||||
"<tbody>\n<tr><td>a </td><td>c </td><td>d </td></tr>\n</tbody>\n"
|
||||
"<table>"
|
||||
"<tr><td>a</td><td>b</td><td/></tr>"
|
||||
"<tr><td>a</td><td>c</td><td>d</td></tr>"
|
||||
"</table>"
|
||||
), f"actual {e.metadata.text_as_html=}"
|
||||
# -- late-start, early-end, and >2 rows vertical span --
|
||||
@ -1280,14 +1254,11 @@ class Describe_DocxPartitioner:
|
||||
assert type(e).__name__ == "Table"
|
||||
assert e.text == "a b c d e f", f"actual {e.text=}"
|
||||
assert e.metadata.text_as_html == (
|
||||
"<table>\n"
|
||||
"<thead>\n"
|
||||
"<tr><th>a </th><th>a </th><th>b </th><th>c </th></tr>\n"
|
||||
"</thead>\n<tbody>\n"
|
||||
"<tr><td> </td><td>d </td><td>d </td><td> </td></tr>\n"
|
||||
"<tr><td>e </td><td>d </td><td>d </td><td>f </td></tr>\n"
|
||||
"<tr><td> </td><td>d </td><td>d </td><td> </td></tr>\n"
|
||||
"</tbody>\n"
|
||||
"<table>"
|
||||
"<tr><td>a</td><td>a</td><td>b</td><td>c</td></tr>"
|
||||
"<tr><td/><td>d</td><td>d</td><td/></tr>"
|
||||
"<tr><td>e</td><td>d</td><td>d</td><td>f</td></tr>"
|
||||
"<tr><td/><td>d</td><td>d</td><td/></tr>"
|
||||
"</table>"
|
||||
), f"actual {e.metadata.text_as_html=}"
|
||||
# --
|
||||
@ -1296,19 +1267,15 @@ class Describe_DocxPartitioner:
|
||||
assert type(e).__name__ == "Table"
|
||||
assert e.text == "Data More Dato WTF? Strange Format", f"actual {e.text=}"
|
||||
assert e.metadata.text_as_html == (
|
||||
"<table>\n"
|
||||
"<thead>\n"
|
||||
"<tr><th>Data </th><th>Data </th><th> </th></tr>\n"
|
||||
"</thead>\n"
|
||||
"<tbody>\n"
|
||||
"<tr><td>Data </td><td>Data </td><td> </td></tr>\n"
|
||||
"<tr><td>Data </td><td>Data </td><td> </td></tr>\n"
|
||||
"<tr><td> </td><td>More </td><td> </td></tr>\n"
|
||||
"<tr><td>Dato </td><td> </td><td> </td></tr>\n"
|
||||
"<tr><td>WTF? </td><td>WTF? </td><td> </td></tr>\n"
|
||||
"<tr><td>Strange</td><td>Strange</td><td> </td></tr>\n"
|
||||
"<tr><td> </td><td>Format </td><td>Format</td></tr>\n"
|
||||
"</tbody>\n"
|
||||
"<table>"
|
||||
"<tr><td>Data</td><td>Data</td><td/></tr>"
|
||||
"<tr><td>Data</td><td>Data</td><td/></tr>"
|
||||
"<tr><td>Data</td><td>Data</td><td/></tr>"
|
||||
"<tr><td/><td>More</td><td/></tr>"
|
||||
"<tr><td>Dato</td><td/></tr>"
|
||||
"<tr><td>WTF?</td><td>WTF?</td><td/></tr>"
|
||||
"<tr><td>Strange</td><td>Strange</td><td/></tr>"
|
||||
"<tr><td/><td>Format</td><td>Format</td></tr>"
|
||||
"</table>"
|
||||
), f"actual {e.metadata.text_as_html=}"
|
||||
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.15.8-dev1" # pragma: no cover
|
||||
__version__ = "0.15.8-dev2" # pragma: no cover
|
||||
|
||||
@ -2,7 +2,6 @@
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import html
|
||||
import io
|
||||
import itertools
|
||||
import os
|
||||
@ -23,11 +22,11 @@ from docx.text.hyperlink import Hyperlink
|
||||
from docx.text.pagebreak import RenderedPageBreak
|
||||
from docx.text.paragraph import Paragraph
|
||||
from docx.text.run import Run
|
||||
from tabulate import tabulate
|
||||
from typing_extensions import TypeAlias
|
||||
|
||||
from unstructured.chunking import add_chunking_strategy
|
||||
from unstructured.cleaners.core import clean_bullets
|
||||
from unstructured.common.html_table import htmlify_matrix_of_cell_texts
|
||||
from unstructured.documents.elements import (
|
||||
Address,
|
||||
Element,
|
||||
@ -498,7 +497,7 @@ class _DocxPartitioner:
|
||||
# NOTE(scanny) - if all that fails we give it the default `Text` element-type
|
||||
yield Text(text, metadata=metadata, detection_origin=DETECTION_ORIGIN)
|
||||
|
||||
def _convert_table_to_html(self, table: DocxTable, is_nested: bool = False) -> str:
|
||||
def _convert_table_to_html(self, table: DocxTable) -> str:
|
||||
"""HTML string version of `table`.
|
||||
|
||||
Example:
|
||||
@ -520,44 +519,38 @@ class _DocxPartitioner:
|
||||
def iter_cell_block_items(cell: _Cell) -> Iterator[str]:
|
||||
"""Generate the text of each paragraph or table in `cell` as a separate string.
|
||||
|
||||
A table nested in `cell` is converted to HTML and emitted as that string.
|
||||
A table nested in `cell` is converted to the normalized text it contains.
|
||||
"""
|
||||
for block_item in cell.iter_inner_content():
|
||||
if isinstance(block_item, Paragraph):
|
||||
if isinstance(paragraph := block_item, Paragraph):
|
||||
# -- all docx content is ultimately in a paragraph; a nested table contributes
|
||||
# -- structure only
|
||||
yield f"{html.escape(block_item.text)}"
|
||||
elif isinstance( # pyright: ignore[reportUnnecessaryIsInstance]
|
||||
block_item, DocxTable
|
||||
):
|
||||
yield self._convert_table_to_html(block_item, is_nested=True)
|
||||
yield paragraph.text
|
||||
elif isinstance(table := block_item, DocxTable):
|
||||
for row in table.rows:
|
||||
yield from iter_row_cells_as_text(row)
|
||||
|
||||
def iter_row_cells_as_text(row: _Row) -> Iterator[str]:
|
||||
"""Generate the text of each cell in `row` as a separate string.
|
||||
"""Generate the normalized text of each cell in `row` as a separate string.
|
||||
|
||||
The text of each paragraph within a cell is separated from the next by a newline
|
||||
(`"\n"`). A table nested in a cell is first converted to HTML and then included as a
|
||||
string, also separated by a newline.
|
||||
The text of each paragraph within a cell is joined to the next by a single space (whitespace is normalized). A table nested in a cell is
|
||||
converted to a normalized string of its contents and combined with the text of the
|
||||
cell that contains the table.
|
||||
"""
|
||||
# -- each omitted cell at the start of the row (pretty rare) gets the empty string --
|
||||
# -- Each omitted cell at the start of the row (pretty rare) gets the empty string.
|
||||
# -- This preserves column alignment when one or more initial cells are omitted.
|
||||
for _ in range(row.grid_cols_before):
|
||||
yield ""
|
||||
|
||||
for cell in row.cells:
|
||||
yield "\n".join(iter_cell_block_items(cell))
|
||||
cell_text = " ".join(iter_cell_block_items(cell))
|
||||
yield " ".join(cell_text.split())
|
||||
|
||||
# -- each omitted cell at the end of the row (also rare) gets the empty string --
|
||||
# -- Each omitted cell at the end of the row (also rare) gets the empty string. --
|
||||
for _ in range(row.grid_cols_after):
|
||||
yield ""
|
||||
|
||||
return tabulate(
|
||||
[list(iter_row_cells_as_text(row)) for row in table.rows],
|
||||
headers=[] if is_nested else "firstrow",
|
||||
# -- tabulate isn't really designed for recursive tables so we have to do any
|
||||
# -- HTML-escaping for ourselves. `unsafehtml` disables tabulate html-escaping of cell
|
||||
# -- contents.
|
||||
tablefmt="unsafehtml",
|
||||
)
|
||||
return htmlify_matrix_of_cell_texts([list(iter_row_cells_as_text(r)) for r in table.rows])
|
||||
|
||||
@lazyproperty
|
||||
def _document(self) -> Document:
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user