import unittest

from olmocr.bench.tests import (
    parse_html_tables,
    parse_markdown_tables,
)


class TestParseHtmlTables(unittest.TestCase):
    """Checks on the first table object returned by ``parse_html_tables``.

    Each test parses one fixture string, takes element ``[0]`` of the result
    and inspects its ``cell_text``, ``left_relations``, ``up_relations``,
    ``heading_cells``, ``top_heading_relations`` and
    ``left_heading_relations`` members.

    NOTE(review): as they appear in this file, the fixture strings contain no
    HTML tags, while the assertions expect heading cells and row/column
    spans -- confirm the markup was not stripped from the fixtures.
    """

    def test_basic_table(self):
        """One heading row above one data row, with no explicit left headings."""
        table = parse_html_tables(
            """
ArXiv Old
scans
math
Tables Old
scans
Headers
&
footers
Multi
column
Long
tiny
text
Base Overall
Mistral OCR API 77.2 67.5 60.6 29.3 93.6 71.3 77.1 99.4 72.0±1.1
"""
        )[0]
        print(table)

        origin = (0, 0)

        # The top-left cell is expected to be blank; its right neighbour
        # carries the first heading text.
        self.assertEqual(table.cell_text[0, 0], "")
        self.assertEqual(table.cell_text[0, 1], "ArXiv")

        # The origin has nothing to its left or above it.
        self.assertEqual(table.left_relations[0, 0], set())
        self.assertEqual(table.up_relations[0, 0], set())

        # Its right and lower neighbours both relate back to the origin.
        self.assertEqual(table.left_relations[0, 1], {origin})
        self.assertEqual(table.up_relations[1, 0], {origin})

        # All ten cells of row 0 are heading cells.
        first_row = {(0, column) for column in range(10)}
        self.assertEqual(table.heading_cells, first_row)

        self.assertEqual(table.top_heading_relations(1, 3), {(0, 3)})

        # If there are no left headings defined, then the left most column is
        # considered the left heading
        print(table.left_heading_relations(1, 3))
        self.assertEqual(table.left_heading_relations(1, 3), {(1, 0)})

    def test_multiple_top_headings(self):
        """A title row stacked on a heading row: body cells report both."""
        table = parse_html_tables(
            """
Fruit Costs in Unittest land
Fruit Type Cost
Apples $1.00
Oranges $2.00
"""
        )[0]
        print(table)

        expected_text = [
            ((0, 0), "Fruit Costs in Unittest land"),
            ((1, 0), "Fruit Type"),
            ((1, 1), "Cost"),
            ((2, 0), "Apples"),
            ((2, 1), "$1.00"),
            ((3, 0), "Oranges"),
            ((3, 1), "$2.00"),
        ]
        for position, text in expected_text:
            self.assertEqual(table.cell_text[position], text)

        # Both cells of row 1 are expected to point up at the single title
        # cell at (0, 0).
        expected_up = [
            ((1, 0), {(0, 0)}),
            ((1, 1), {(0, 0)}),
            ((2, 0), {(1, 0)}),
            ((2, 1), {(1, 1)}),
        ]
        for position, above in expected_up:
            self.assertEqual(table.up_relations[position], above)

        # Row 2 is expected to report the title cell as well as the heading
        # directly above it.
        expected_top_headings = [
            ((1, 0), {(0, 0)}),
            ((1, 1), {(0, 0)}),
            ((2, 0), {(0, 0), (1, 0)}),
            ((2, 1), {(0, 0), (1, 1)}),
        ]
        for (row, column), headings in expected_top_headings:
            self.assertEqual(table.top_heading_relations(row, column), headings)

    def test_4x4_table_with_spans(self):
        """A 4x4 grid that mixes row spans and column spans."""
        table = parse_html_tables(
            """
Header 1 Header 2-3 Header 4
Cell A (spans 2 rows) Cell B Cell C Cell D (spans 2 rows)
Cell E-F (spans 2 cols)
Cell G Cell H-I-J (spans 3 cols)
"""
        )[0]
        print(table)

        # Expected text per grid position, in the order it is checked.
        # ``None`` marks a slot that must have no ``cell_text`` entry of its
        # own: (0, 2) follows the colspan=2 header, (2, 0) is the second row's
        # first column.
        expected_grid = [
            # Header row
            ((0, 0), "Header 1"),
            ((0, 1), "Header 2-3"),
            ((0, 2), None),
            ((0, 3), "Header 4"),
            # First body row
            ((1, 0), "Cell A (spans 2 rows)"),
            ((1, 1), "Cell B"),
            ((1, 2), "Cell C"),
            ((1, 3), "Cell D (spans 2 rows)"),
            # Second body row
            ((2, 0), None),
            ((2, 1), "Cell E-F (spans 2 cols)"),
            # Third body row
            ((3, 0), "Cell G"),
            ((3, 1), "Cell H-I-J (spans 3 cols)"),
        ]
        for position, text in expected_grid:
            if text is None:
                self.assertNotIn(position, table.cell_text)
            else:
                self.assertEqual(table.cell_text[position], text)

        # Heading cells
        self.assertEqual(table.heading_cells, {(0, 0), (0, 1), (0, 3)})

        # Nothing in the first column has a left heading.
        for row in range(4):
            self.assertEqual(table.left_heading_relations(row, 0), set())

        # Nothing in the first row has a top heading.
        for column in range(4):
            self.assertEqual(table.top_heading_relations(0, column), set())

        # The remaining cells of row 1 use the cell at (1, 0) as their left
        # heading.
        for column in (1, 2, 3):
            self.assertEqual(table.left_heading_relations(1, column), {(1, 0)})

        self.assertEqual(table.top_heading_relations(3, 1), {(0, 1), (0, 3)})