olmocr/tests/test_table_parsing.py

181 lines
7.2 KiB
Python
Raw Normal View History

2025-10-23 21:19:27 +00:00
import unittest
from olmocr.bench.tests import (
parse_html_tables,
parse_markdown_tables,
)
class TestParseHtmlTables(unittest.TestCase):
def test_basic_table(self):
    """Parse a table with one header row and one body row.

    Checks the parsed cell text, the immediate left/up relations of a few
    cells, the set of heading cells, and the top/left heading lookups for
    a body cell.
    """
    html = """
        <table border="1">
        <thead>
        <tr>
        <th></th>
        <th>ArXiv</th>
        <th>Old<br>scans<br>math</th>
        <th>Tables</th>
        <th>Old<br>scans</th>
        <th>Headers<br>&<br>footers</th>
        <th>Multi<br>column</th>
        <th>Long<br>tiny<br>text</th>
        <th>Base</th>
        <th>Overall</th>
        </tr>
        </thead>
        <tbody>
        <tr>
        <td>Mistral OCR API</td>
        <td>77.2</td>
        <td>67.5</td>
        <td>60.6</td>
        <td>29.3</td>
        <td>93.6</td>
        <td>71.3</td>
        <td>77.1</td>
        <td>99.4</td>
        <td>72.0±1.1</td>
        </tr>
        </tbody></table>"""
    # The fixture contains exactly one <table>; only the first parsed table is inspected.
    data = parse_html_tables(html)[0]

    # The top-left header cell is empty and has empty left/up relations.
    self.assertEqual(data.cell_text[0, 0], "")
    self.assertEqual(data.cell_text[0, 1], "ArXiv")
    self.assertEqual(data.left_relations[0, 0], set())
    self.assertEqual(data.up_relations[0, 0], set())

    # Direct neighbours: (0, 1) has (0, 0) to its left, (1, 0) has (0, 0) above it.
    self.assertEqual(data.left_relations[0, 1], {(0, 0)})
    self.assertEqual(data.up_relations[1, 0], {(0, 0)})

    # All ten <th> cells of the header row are heading cells.
    self.assertEqual(data.heading_cells, {(0, col) for col in range(10)})
    self.assertEqual(data.top_heading_relations(1, 3), {(0, 3)})

    # If there are no left headings defined, then the left most column is considered the left heading
    self.assertEqual(data.left_heading_relations(1, 3), {(1, 0)})
def test_multiple_top_headings(self):
    """Parse a table whose header has two rows, the first spanning both columns.

    Checks the parsed cell text, the up relations between rows, and that
    body cells report the headings of both header rows as top headings.
    """
    html = """
        <table border="1">
        <thead>
        <tr>
        <th colspan="2">Fruit Costs in Unittest land</th>
        </tr>
        <tr>
        <th>Fruit Type</th>
        <th>Cost</th>
        </tr>
        </thead>
        <tbody>
        <tr>
        <td>Apples</td>
        <td>$1.00</td>
        </tr>
        <tr>
        <td>Oranges</td>
        <td>$2.00</td>
        </tr>
        </tbody></table>"""
    # The fixture contains exactly one <table>; only the first parsed table is inspected.
    data = parse_html_tables(html)[0]

    # Cell text, row by row.
    self.assertEqual(data.cell_text[0, 0], "Fruit Costs in Unittest land")
    self.assertEqual(data.cell_text[1, 0], "Fruit Type")
    self.assertEqual(data.cell_text[1, 1], "Cost")
    self.assertEqual(data.cell_text[2, 0], "Apples")
    self.assertEqual(data.cell_text[2, 1], "$1.00")
    self.assertEqual(data.cell_text[3, 0], "Oranges")
    self.assertEqual(data.cell_text[3, 1], "$2.00")

    # Both second-row headers sit under the colspan=2 title cell at (0, 0).
    self.assertEqual(data.up_relations[1, 0], {(0, 0)})
    self.assertEqual(data.up_relations[1, 1], {(0, 0)})

    # First body row sits directly under the second header row.
    self.assertEqual(data.up_relations[2, 0], {(1, 0)})
    self.assertEqual(data.up_relations[2, 1], {(1, 1)})

    # Second-row headers have the spanning title as their top heading.
    self.assertEqual(data.top_heading_relations(1, 0), {(0, 0)})
    self.assertEqual(data.top_heading_relations(1, 1), {(0, 0)})

    # Body cells report the headings from both header rows.
    self.assertEqual(data.top_heading_relations(2, 0), {(0, 0), (1, 0)})
    self.assertEqual(data.top_heading_relations(2, 1), {(0, 0), (1, 1)})
def test_4x4_table_with_spans(self):
    """Test a 4x4 table with various row spans and column spans"""
    html = """
        <table border="1">
        <thead>
        <tr>
        <th>Header 1</th>
        <th colspan="2">Header 2-3</th>
        <th>Header 4</th>
        </tr>
        </thead>
        <tbody>
        <tr>
        <td rowspan="2">Cell A (spans 2 rows)</td>
        <td>Cell B</td>
        <td>Cell C</td>
        <td rowspan="2">Cell D (spans 2 rows)</td>
        </tr>
        <tr>
        <td colspan="2">Cell E-F (spans 2 cols)</td>
        </tr>
        <tr>
        <td>Cell G</td>
        <td colspan="3">Cell H-I-J (spans 3 cols)</td>
        </tr>
        </tbody>
        </table>"""
    # The fixture contains exactly one <table>; only the first parsed table is inspected.
    data = parse_html_tables(html)[0]

    # Test header row
    self.assertEqual(data.cell_text[0, 0], "Header 1")
    self.assertEqual(data.cell_text[0, 1], "Header 2-3")
    # (0, 2) is covered by the colspan=2 header at (0, 1), so it has no text entry of its own
    self.assertNotIn((0, 2), data.cell_text)
    self.assertEqual(data.cell_text[0, 3], "Header 4")

    # Test first body row
    self.assertEqual(data.cell_text[1, 0], "Cell A (spans 2 rows)")
    self.assertEqual(data.cell_text[1, 1], "Cell B")
    self.assertEqual(data.cell_text[1, 2], "Cell C")
    self.assertEqual(data.cell_text[1, 3], "Cell D (spans 2 rows)")

    # Test second body row
    # (2, 0) is covered by the rowspan=2 "Cell A", so it has no text entry of its own
    self.assertNotIn((2, 0), data.cell_text)
    self.assertEqual(data.cell_text[2, 1], "Cell E-F (spans 2 cols)")

    # Test third body row
    self.assertEqual(data.cell_text[3, 0], "Cell G")
    self.assertEqual(data.cell_text[3, 1], "Cell H-I-J (spans 3 cols)")

    # Test heading cells: only the three <th> origins count; the spanned slot (0, 2) does not
    self.assertEqual(data.heading_cells, {(0, 0), (0, 1), (0, 3)})

    # Positions in column 0 have no left headings.
    for row in range(4):
        with self.subTest(kind="left_heading", row=row, col=0):
            self.assertEqual(data.left_heading_relations(row, 0), set())

    # Positions in the header row have no top headings.
    for col in range(4):
        with self.subTest(kind="top_heading", row=0, col=col):
            self.assertEqual(data.top_heading_relations(0, col), set())

    # With no explicit left headings, the first column acts as the left heading for row 1.
    for col in (1, 2, 3):
        with self.subTest(kind="left_heading", row=1, col=col):
            self.assertEqual(data.left_heading_relations(1, col), {(1, 0)})

    # "Cell H-I-J" spans columns 1-3, so it falls under both "Header 2-3" and "Header 4".
    self.assertEqual(data.top_heading_relations(3, 1), {(0, 1), (0, 3)})