mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-10-27 08:03:42 +00:00
fix(docx): tabulate output is non-deterministic (#2090)
The test for nested tables added a few PRs ago indirectly relies on the padding added to table-HTML by `tabulate`. The length of that padding turns out to be non-deterministic, perhaps related to M1 vs. Intel hardware. Remove padding from tabulate output in the test so only actual content is compared.
This commit is contained in:
parent
5fa40850f4
commit
41fc55bc12
@ -1,6 +1,7 @@
|
||||
# pyright: reportPrivateUsage=false
|
||||
|
||||
import pathlib
|
||||
import re
|
||||
from tempfile import SpooledTemporaryFile
|
||||
from typing import Dict, List, cast
|
||||
|
||||
@ -63,27 +64,28 @@ class Describe_DocxPartitioner:
|
||||
"""
|
||||
table = docx.Document(example_doc_path("docx-tables.docx")).tables[1]
|
||||
|
||||
html = _DocxPartitioner()._convert_table_to_html(table)
|
||||
# -- re.sub() strips out the extra padding inserted by tabulate --
|
||||
html = re.sub(r" +<", "<", _DocxPartitioner()._convert_table_to_html(table))
|
||||
|
||||
expected_lines = [
|
||||
"<table>",
|
||||
"<thead>",
|
||||
f"<tr><th>a </th><th>>b<{' ' * 96}</th><th>c </th></tr>",
|
||||
"<tr><th>a</th><th>>b<</th><th>c</th></tr>",
|
||||
"</thead>",
|
||||
"<tbody>",
|
||||
"<tr><td>d </td><td><table>",
|
||||
"<tr><td>d</td><td><table>",
|
||||
"<tbody>",
|
||||
"<tr><td>e </td><td>f</td></tr>",
|
||||
"<tr><td>e</td><td>f</td></tr>",
|
||||
"<tr><td>g&t</td><td>h</td></tr>",
|
||||
"</tbody>",
|
||||
"</table></td><td>i </td></tr>",
|
||||
f"<tr><td>j </td><td>k{' ' * 104}</td><td>l </td></tr>",
|
||||
"</table></td><td>i</td></tr>",
|
||||
"<tr><td>j</td><td>k</td><td>l</td></tr>",
|
||||
"</tbody>",
|
||||
"</table>",
|
||||
]
|
||||
actual_lines = html.splitlines()
|
||||
for expected, actual in zip(expected_lines, actual_lines):
|
||||
assert actual == expected
|
||||
assert actual == expected, f"\nexpected: {repr(expected)}\nactual: {repr(actual)}"
|
||||
|
||||
def it_can_convert_a_table_to_plain_text(self):
|
||||
table = docx.Document(example_doc_path("docx-tables.docx")).tables[0]
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user