From 41fc55bc121872ad7a58d46cd82c7f566ea0956e Mon Sep 17 00:00:00 2001
From: Steve Canny <stcanny@gmail.com>
Date: Wed, 15 Nov 2023 23:52:16 -0800
Subject: [PATCH] fix(docx): tabulate output is non-deterministic (#2090)

The nested-table test added a few PRs ago indirectly relies on the
padding that `tabulate` adds to the table HTML. The length of that
padding turns out to be non-deterministic, perhaps related to M1 vs.
Intel hardware.

Remove padding from tabulate output in the test so only actual content
is compared.
---
 test_unstructured/partition/docx/test_docx.py | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)
diff --git a/test_unstructured/partition/docx/test_docx.py b/test_unstructured/partition/docx/test_docx.py
index 6d6636f23..ca915d500 100644
--- a/test_unstructured/partition/docx/test_docx.py
+++ b/test_unstructured/partition/docx/test_docx.py
@@ -1,6 +1,7 @@
 # pyright: reportPrivateUsage=false
 
 import pathlib
+import re
 from tempfile import SpooledTemporaryFile
 from typing import Dict, List, cast
 
@@ -63,27 +64,28 @@ class Describe_DocxPartitioner:
         """
         table = docx.Document(example_doc_path("docx-tables.docx")).tables[1]
 
-        html = _DocxPartitioner()._convert_table_to_html(table)
+        # -- re.sub() strips out the extra padding inserted by tabulate --
+        html = re.sub(r" +<", "<", _DocxPartitioner()._convert_table_to_html(table))
 
         expected_lines = [
             "<table>",
             "<thead>",
-            f"<tr><th>a  </th><th>&gt;b&lt;{' ' * 96}</th><th>c  </th></tr>",
+            "<tr><th>a</th><th>&gt;b&lt;</th><th>c</th></tr>",
             "</thead>",
             "<tbody>",
-            "<tr><td>d  </td><td><table>",
+            "<tr><td>d</td><td><table>",
             "<tbody>",
-            "<tr><td>e      </td><td>f</td></tr>",
+            "<tr><td>e</td><td>f</td></tr>",
             "<tr><td>g&amp;t</td><td>h</td></tr>",
             "</tbody>",
-            "</table></td><td>i  </td></tr>",
-            f"<tr><td>j  </td><td>k{' ' * 104}</td><td>l  </td></tr>",
+            "</table></td><td>i</td></tr>",
+            "<tr><td>j</td><td>k</td><td>l</td></tr>",
             "</tbody>",
             "</table>",
         ]
         actual_lines = html.splitlines()
         for expected, actual in zip(expected_lines, actual_lines):
-            assert actual == expected
+            assert actual == expected, f"\nexpected: {repr(expected)}\nactual:   {repr(actual)}"
 
     def it_can_convert_a_table_to_plain_text(self):
         table = docx.Document(example_doc_path("docx-tables.docx")).tables[0]