mirror of
				https://github.com/Unstructured-IO/unstructured.git
				synced 2025-10-29 17:04:37 +00:00 
			
		
		
		
	fix(docx): tabulate output is non-deterministic (#2090)
The nested-table test added a few PRs ago indirectly depends on the padding that `tabulate` adds to table HTML. The length of that padding turns out to be non-deterministic, perhaps related to M1 vs. Intel hardware. Strip the padding from the `tabulate` output in the test (using `re.sub()` to remove runs of spaces that precede a `<`) so that only actual content is compared.
This commit is contained in:
		
							parent
							
								
									5fa40850f4
								
							
						
					
					
						commit
						41fc55bc12
					
				| @ -1,6 +1,7 @@ | |||||||
| # pyright: reportPrivateUsage=false | # pyright: reportPrivateUsage=false | ||||||
| 
 | 
 | ||||||
| import pathlib | import pathlib | ||||||
|  | import re | ||||||
| from tempfile import SpooledTemporaryFile | from tempfile import SpooledTemporaryFile | ||||||
| from typing import Dict, List, cast | from typing import Dict, List, cast | ||||||
| 
 | 
 | ||||||
| @ -63,27 +64,28 @@ class Describe_DocxPartitioner: | |||||||
|         """ |         """ | ||||||
|         table = docx.Document(example_doc_path("docx-tables.docx")).tables[1] |         table = docx.Document(example_doc_path("docx-tables.docx")).tables[1] | ||||||
| 
 | 
 | ||||||
|         html = _DocxPartitioner()._convert_table_to_html(table) |         # -- re.sub() strips out the extra padding inserted by tabulate -- | ||||||
|  |         html = re.sub(r" +<", "<", _DocxPartitioner()._convert_table_to_html(table)) | ||||||
| 
 | 
 | ||||||
|         expected_lines = [ |         expected_lines = [ | ||||||
|             "<table>", |             "<table>", | ||||||
|             "<thead>", |             "<thead>", | ||||||
|             f"<tr><th>a  </th><th>>b<{' ' * 96}</th><th>c  </th></tr>", |             "<tr><th>a</th><th>>b<</th><th>c</th></tr>", | ||||||
|             "</thead>", |             "</thead>", | ||||||
|             "<tbody>", |             "<tbody>", | ||||||
|             "<tr><td>d  </td><td><table>", |             "<tr><td>d</td><td><table>", | ||||||
|             "<tbody>", |             "<tbody>", | ||||||
|             "<tr><td>e      </td><td>f</td></tr>", |             "<tr><td>e</td><td>f</td></tr>", | ||||||
|             "<tr><td>g&t</td><td>h</td></tr>", |             "<tr><td>g&t</td><td>h</td></tr>", | ||||||
|             "</tbody>", |             "</tbody>", | ||||||
|             "</table></td><td>i  </td></tr>", |             "</table></td><td>i</td></tr>", | ||||||
|             f"<tr><td>j  </td><td>k{' ' * 104}</td><td>l  </td></tr>", |             "<tr><td>j</td><td>k</td><td>l</td></tr>", | ||||||
|             "</tbody>", |             "</tbody>", | ||||||
|             "</table>", |             "</table>", | ||||||
|         ] |         ] | ||||||
|         actual_lines = html.splitlines() |         actual_lines = html.splitlines() | ||||||
|         for expected, actual in zip(expected_lines, actual_lines): |         for expected, actual in zip(expected_lines, actual_lines): | ||||||
|             assert actual == expected |             assert actual == expected, f"\nexpected: {repr(expected)}\nactual:   {repr(actual)}" | ||||||
| 
 | 
 | ||||||
|     def it_can_convert_a_table_to_plain_text(self): |     def it_can_convert_a_table_to_plain_text(self): | ||||||
|         table = docx.Document(example_doc_path("docx-tables.docx")).tables[0] |         table = docx.Document(example_doc_path("docx-tables.docx")).tables[0] | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user
	 Steve Canny
						Steve Canny