diff --git a/test_unstructured/partition/docx/test_docx.py b/test_unstructured/partition/docx/test_docx.py index 6d6636f23..ca915d500 100644 --- a/test_unstructured/partition/docx/test_docx.py +++ b/test_unstructured/partition/docx/test_docx.py @@ -1,6 +1,7 @@ # pyright: reportPrivateUsage=false import pathlib +import re from tempfile import SpooledTemporaryFile from typing import Dict, List, cast @@ -63,27 +64,28 @@ class Describe_DocxPartitioner: """ table = docx.Document(example_doc_path("docx-tables.docx")).tables[1] - html = _DocxPartitioner()._convert_table_to_html(table) + # -- re.sub() strips out the extra padding inserted by tabulate -- + html = re.sub(r" +<", "<", _DocxPartitioner()._convert_table_to_html(table)) expected_lines = [ "", "", - f"", + "", "", "", - "", + "", "", "
a >b<{' ' * 96}c
a>b<c
d ", + "", - f"", + "
d", "", - "", + "", "", "", - "
e f
ef
g&th
i
j k{' ' * 104}l
i
jkl
", ] actual_lines = html.splitlines() for expected, actual in zip(expected_lines, actual_lines): - assert actual == expected + assert actual == expected, f"\nexpected: {repr(expected)}\nactual: {repr(actual)}" def it_can_convert_a_table_to_plain_text(self): table = docx.Document(example_doc_path("docx-tables.docx")).tables[0]