From 41fc55bc121872ad7a58d46cd82c7f566ea0956e Mon Sep 17 00:00:00 2001 From: Steve Canny Date: Wed, 15 Nov 2023 23:52:16 -0800 Subject: [PATCH] fix(docx): tabulate output is non-deterministic (#2090) The test for nested tables added a few PRs ago indirectly relies on the padding added to table-HTML by `tabulate`. The length of that padding turns out to be non-deterministic, perhaps related to M1 vs. Intel hardware. Remove padding from tabulate output in the test so only actual content is compared. --- test_unstructured/partition/docx/test_docx.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/test_unstructured/partition/docx/test_docx.py b/test_unstructured/partition/docx/test_docx.py index 6d6636f23..ca915d500 100644 --- a/test_unstructured/partition/docx/test_docx.py +++ b/test_unstructured/partition/docx/test_docx.py @@ -1,6 +1,7 @@ # pyright: reportPrivateUsage=false import pathlib +import re from tempfile import SpooledTemporaryFile from typing import Dict, List, cast @@ -63,27 +64,28 @@ class Describe_DocxPartitioner: """ table = docx.Document(example_doc_path("docx-tables.docx")).tables[1] - html = _DocxPartitioner()._convert_table_to_html(table) + # -- re.sub() strips out the extra padding inserted by tabulate -- + html = re.sub(r" +<", "<", _DocxPartitioner()._convert_table_to_html(table)) expected_lines = [ "", "", - f"", + "", "", "", - "", + "", "", "
a >b<{' ' * 96}c
a>b<c
d ", + "", - f"", + "
d", "", - "", + "", "", "", - "
e f
ef
g&th
i
j k{' ' * 104}l
i
jkl
", ] actual_lines = html.splitlines() for expected, actual in zip(expected_lines, actual_lines): - assert actual == expected + assert actual == expected, f"\nexpected: {repr(expected)}\nactual: {repr(actual)}" def it_can_convert_a_table_to_plain_text(self): table = docx.Document(example_doc_path("docx-tables.docx")).tables[0]