fix(docx): tabulate output is non-deterministic (#2090)

The test for nested tables added a few PRs ago indirectly relies on the padding added to table-HTML by `tabulate`. The length of that padding turns out to be non-deterministic, perhaps related to M1 vs. Intel hardware. Remove padding from tabulate output in the test so only actual content is compared.
2025-10-29 17:04:37 +00:00 · 2023-11-15 23:52:16 -08:00 · 2023-11-15 23:52:16 -08:00 · 41fc55bc12
commit 41fc55bc12
parent 5fa40850f4
1 changed files with 9 additions and 7 deletions
--- a/test_unstructured/partition/docx/test_docx.py
+++ b/test_unstructured/partition/docx/test_docx.py
@@ -1,6 +1,7 @@
 # pyright: reportPrivateUsage=false
 import pathlib
+import re
 from tempfile import SpooledTemporaryFile
 from typing import Dict, List, cast
@@ -63,27 +64,28 @@ class Describe_DocxPartitioner:
        """
        table = docx.Document(example_doc_path("docx-tables.docx")).tables[1]
-        html = _DocxPartitioner()._convert_table_to_html(table)
+        # -- re.sub() strips out the extra padding inserted by tabulate --
+        html = re.sub(r" +<", "<", _DocxPartitioner()._convert_table_to_html(table))
        expected_lines = [
            "<table>",
            "<thead>",
-            f"<tr><th>a  </th><th>&gt;b&lt;{' ' * 96}</th><th>c  </th></tr>",
+            "<tr><th>a</th><th>&gt;b&lt;</th><th>c</th></tr>",
            "</thead>",
            "<tbody>",
-            "<tr><td>d  </td><td><table>",
+            "<tr><td>d</td><td><table>",
            "<tbody>",
-            "<tr><td>e      </td><td>f</td></tr>",
+            "<tr><td>e</td><td>f</td></tr>",
            "<tr><td>g&amp;t</td><td>h</td></tr>",
            "</tbody>",
-            "</table></td><td>i  </td></tr>",
+            "</table></td><td>i</td></tr>",
-            f"<tr><td>j  </td><td>k{' ' * 104}</td><td>l  </td></tr>",
+            "<tr><td>j</td><td>k</td><td>l</td></tr>",
            "</tbody>",
            "</table>",
        ]
        actual_lines = html.splitlines()
        for expected, actual in zip(expected_lines, actual_lines):
-            assert actual == expected
+            assert actual == expected, f"\nexpected: {repr(expected)}\nactual:   {repr(actual)}"
    def it_can_convert_a_table_to_plain_text(self):
        table = docx.Document(example_doc_path("docx-tables.docx")).tables[0]