import unittest
from olmocr.bench.tests import (
parse_html_tables,
parse_markdown_tables,
)
class TestParseHtmlTables(unittest.TestCase):
def test_basic_table(self):
    """Parse a plain header-row + data-row table and check its cell graph.

    The fixture has one ``<thead>`` row of ten ``<th>`` cells (the first
    one empty) and one ``<tbody>`` row of ten ``<td>`` cells, with no
    row or column spans.
    """
    data = parse_html_tables("""
        <table border="1">
          <thead>
            <tr>
              <th></th>
              <th>ArXiv</th>
              <th>Old scans math</th>
              <th>Tables</th>
              <th>Old scans</th>
              <th>Headers &amp; footers</th>
              <th>Multi column</th>
              <th>Long tiny text</th>
              <th>Base</th>
              <th>Overall</th>
            </tr>
          </thead>
          <tbody>
            <tr>
              <td>Mistral OCR API</td>
              <td>77.2</td>
              <td>67.5</td>
              <td>60.6</td>
              <td>29.3</td>
              <td>93.6</td>
              <td>71.3</td>
              <td>77.1</td>
              <td>99.4</td>
              <td>72.0±1.1</td>
            </tr>
          </tbody>
        </table>
    """)[0]
    # Dump the parsed table so a failing assertion below is easier to debug.
    print(data)

    # Cell text is keyed by (row, col); the top-left header cell is empty.
    self.assertEqual(data.cell_text[0, 0], "")
    self.assertEqual(data.cell_text[0, 1], "ArXiv")

    # The top-left cell has no neighbours to its left or above it.
    self.assertEqual(data.left_relations[0, 0], set())
    self.assertEqual(data.up_relations[0, 0], set())

    # Direct neighbours: one cell to the left / one cell above.
    self.assertEqual(data.left_relations[0, 1], {(0, 0)})
    self.assertEqual(data.up_relations[1, 0], {(0, 0)})

    # Every cell of the first row (all ten columns) is a heading cell.
    self.assertEqual(data.heading_cells, {(0, col) for col in range(10)})

    self.assertEqual(data.top_heading_relations[1, 3], {(0, 3)})
    # If there are no left headings defined, then the left most column is
    # considered the left heading.
    self.assertEqual(data.left_heading_relations[1, 3], {(1, 0)})
def test_multiple_top_headings(self):
    """Check that stacked heading rows are both reported as top headings.

    The fixture has a title heading spanning both columns, a second
    heading row naming each column, and two data rows.
    """
    data = parse_html_tables("""
        <table border="1">
          <thead>
            <tr>
              <th colspan="2">Fruit Costs in Unittest land</th>
            </tr>
            <tr>
              <th>Fruit Type</th>
              <th>Cost</th>
            </tr>
          </thead>
          <tbody>
            <tr>
              <td>Apples</td>
              <td>$1.00</td>
            </tr>
            <tr>
              <td>Oranges</td>
              <td>$2.00</td>
            </tr>
          </tbody>
        </table>
    """)[0]
    # Dump the parsed table so a failing assertion below is easier to debug.
    print(data)

    # Cell text, row by row.
    self.assertEqual(data.cell_text[0, 0], "Fruit Costs in Unittest land")

    self.assertEqual(data.cell_text[1, 0], "Fruit Type")
    self.assertEqual(data.cell_text[1, 1], "Cost")

    self.assertEqual(data.cell_text[2, 0], "Apples")
    self.assertEqual(data.cell_text[2, 1], "$1.00")

    self.assertEqual(data.cell_text[3, 0], "Oranges")
    self.assertEqual(data.cell_text[3, 1], "$2.00")

    # The title cell spans both columns, so it sits directly above both
    # cells of the second heading row.
    self.assertEqual(data.up_relations[1, 0], {(0, 0)})
    self.assertEqual(data.up_relations[1, 1], {(0, 0)})

    # Data cells sit directly below their own column heading.
    self.assertEqual(data.up_relations[2, 0], {(1, 0)})
    self.assertEqual(data.up_relations[2, 1], {(1, 1)})

    # Second-row headings are themselves headed by the title cell.
    self.assertEqual(data.top_heading_relations[1, 0], {(0, 0)})
    self.assertEqual(data.top_heading_relations[1, 1], {(0, 0)})

    # Data cells are headed by the title and by their column heading.
    self.assertEqual(data.top_heading_relations[2, 0], {(0, 0), (1, 0)})
    self.assertEqual(data.top_heading_relations[2, 1], {(0, 0), (1, 1)})
def test_4x4_table_with_spans(self):
    """Test a 4x4 table with various row spans and column spans.

    Grid layout of the fixture (rows x columns):

        row 0: Header 1 | Header 2-3 (colspan 2)      | Header 4
        row 1: Cell A   | Cell B  | Cell C            | Cell D
        row 2: (Cell A) | Cell E-F (colspan 2)        | (Cell D)
        row 3: Cell G   | Cell H-I-J (colspan 3)

    Cell A and Cell D each span rows 1-2.
    """
    data = parse_html_tables("""
        <table border="1">
          <thead>
            <tr>
              <th>Header 1</th>
              <th colspan="2">Header 2-3</th>
              <th>Header 4</th>
            </tr>
          </thead>
          <tbody>
            <tr>
              <td rowspan="2">Cell A (spans 2 rows)</td>
              <td>Cell B</td>
              <td>Cell C</td>
              <td rowspan="2">Cell D (spans 2 rows)</td>
            </tr>
            <tr>
              <td colspan="2">Cell E-F (spans 2 cols)</td>
            </tr>
            <tr>
              <td>Cell G</td>
              <td colspan="3">Cell H-I-J (spans 3 cols)</td>
            </tr>
          </tbody>
        </table>
    """)[0]
    # Dump the parsed table so a failing assertion below is easier to debug.
    print(data)

    # Test header row
    self.assertEqual(data.cell_text[0, 0], "Header 1")
    self.assertEqual(data.cell_text[0, 1], "Header 2-3")
    # "Header 2-3" has colspan=2, so position (0, 2) is covered by it and
    # has no text entry of its own.
    self.assertNotIn((0, 2), data.cell_text)
    self.assertEqual(data.cell_text[0, 3], "Header 4")

    # Test first body row
    self.assertEqual(data.cell_text[1, 0], "Cell A (spans 2 rows)")
    self.assertEqual(data.cell_text[1, 1], "Cell B")
    self.assertEqual(data.cell_text[1, 2], "Cell C")
    self.assertEqual(data.cell_text[1, 3], "Cell D (spans 2 rows)")

    # Test second body row: column 0 is covered by Cell A's rowspan, so
    # the first cell written in this row lands in column 1.
    self.assertNotIn((2, 0), data.cell_text)
    self.assertEqual(data.cell_text[2, 1], "Cell E-F (spans 2 cols)")

    # Test third body row
    self.assertEqual(data.cell_text[3, 0], "Cell G")
    self.assertEqual(data.cell_text[3, 1], "Cell H-I-J (spans 3 cols)")

    # Test heading cells: only the three cells that start in the header
    # row are listed; the covered position (0, 2) is not.
    self.assertEqual(data.heading_cells, {(0, 0), (0, 1), (0, 3)})