From cce7a6c4de73cd64dac6925ceae3002bcb0435e7 Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Fri, 24 Oct 2025 21:50:32 +0000 Subject: [PATCH] Adding more row span col span tests --- tests/test_table_parsing.py | 367 ++++++++++++++++++++++++++++++++++++ 1 file changed, 367 insertions(+) diff --git a/tests/test_table_parsing.py b/tests/test_table_parsing.py index 0b0b2f9..96e0bd8 100644 --- a/tests/test_table_parsing.py +++ b/tests/test_table_parsing.py @@ -178,3 +178,370 @@ class TestParseHtmlTables(unittest.TestCase): self.assertEqual(data.left_heading_relations(1, 3), {(1, 0)}) self.assertEqual(data.top_heading_relations(3, 1), {(0, 1), (0, 3)}) + + def test_complex_multi_level_headers(self): + """Test a table with multiple levels of headers and complex spanning""" + data = parse_html_tables( + """ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Main Category
Sub Category ASub Category B
A1A2A3B1B2B3
Row 1102030405060
Row 2152535455565
""" + )[0] + + print("\n=== Complex Multi-Level Headers Test ===") + print(data) + + # Test the three-level header structure + self.assertEqual(data.cell_text[0, 0], "") # Empty corner cell + self.assertEqual(data.cell_text[0, 1], "Main Category") + + self.assertEqual(data.cell_text[1, 1], "Sub Category A") + self.assertEqual(data.cell_text[1, 4], "Sub Category B") + + self.assertEqual(data.cell_text[2, 1], "A1") + self.assertEqual(data.cell_text[2, 2], "A2") + self.assertEqual(data.cell_text[2, 3], "A3") + self.assertEqual(data.cell_text[2, 4], "B1") + self.assertEqual(data.cell_text[2, 5], "B2") + self.assertEqual(data.cell_text[2, 6], "B3") + + # Test data rows + self.assertEqual(data.cell_text[3, 0], "Row 1") + self.assertEqual(data.cell_text[3, 1], "10") + self.assertEqual(data.cell_text[4, 0], "Row 2") + self.assertEqual(data.cell_text[4, 1], "15") + + # Test heading cells - all header rows should be marked as heading cells + expected_heading_cells = { + (0, 0), (0, 1), # First header row + (1, 1), (1, 4), # Second header row + (2, 1), (2, 2), (2, 3), (2, 4), (2, 5), (2, 6) # Third header row + } + self.assertEqual(data.heading_cells, expected_heading_cells) + + # Test top heading relations for data cells + # Cell (3, 1) should have relations to all three levels of headers above it + top_relations_3_1 = data.top_heading_relations(3, 1) + self.assertIn((0, 1), top_relations_3_1) # Main Category + self.assertIn((1, 1), top_relations_3_1) # Sub Category A + self.assertIn((2, 1), top_relations_3_1) # A1 + + # Cell (3, 4) should relate to headers in the B column + top_relations_3_4 = data.top_heading_relations(3, 4) + self.assertIn((0, 1), top_relations_3_4) # Main Category + self.assertIn((1, 4), top_relations_3_4) # Sub Category B + self.assertIn((2, 4), top_relations_3_4) # B1 + + # Test left heading relations + self.assertEqual(data.left_heading_relations(3, 1), {(3, 0)}) # Row 1 + self.assertEqual(data.left_heading_relations(4, 1), {(4, 0)}) # Row 2 + + def test_left_headers_with_row_spans(self): + """Test a table with left-side headers that have row spans""" + data = parse_html_tables( + """ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Quarter 1Quarter 2
JanFebMarAprMayJun
NorthSales100110120130140150
Cost505560657075
Profit505560657075
SouthSales200210220230240250
Cost100105110115120125
Profit100105110115120125
""" + )[0] + + print("\n=== Left Headers with Row Spans Test ===") + print(data) + + # Test top headers + self.assertEqual(data.cell_text[0, 2], "Quarter 1") + self.assertEqual(data.cell_text[0, 5], "Quarter 2") + self.assertEqual(data.cell_text[1, 2], "Jan") + self.assertEqual(data.cell_text[1, 3], "Feb") + self.assertEqual(data.cell_text[1, 4], "Mar") + + # Test left headers with row spans + self.assertEqual(data.cell_text[2, 0], "North") + self.assertEqual(data.cell_text[2, 1], "Sales") + self.assertEqual(data.cell_text[3, 1], "Cost") + self.assertEqual(data.cell_text[4, 1], "Profit") + + self.assertEqual(data.cell_text[5, 0], "South") + self.assertEqual(data.cell_text[5, 1], "Sales") + + # Test data values + self.assertEqual(data.cell_text[2, 2], "100") + self.assertEqual(data.cell_text[3, 2], "50") + self.assertEqual(data.cell_text[5, 2], "200") + + # Test heading cells - should include both top headers and left headers + # Top headers: rows 0-1, Left headers: column 0-1 in data rows + self.assertIn((0, 2), data.heading_cells) # Quarter 1 + self.assertIn((1, 2), data.heading_cells) # Jan + self.assertIn((2, 0), data.heading_cells) # North (left header) + self.assertIn((2, 1), data.heading_cells) # Sales (left header) + self.assertIn((5, 0), data.heading_cells) # South (left header) + + # Test left heading relations with multiple levels + # Data cell (2, 2) should have both North and Sales as left headers + left_relations_2_2 = data.left_heading_relations(2, 2) + self.assertIn((2, 0), left_relations_2_2) # North + self.assertIn((2, 1), left_relations_2_2) # Sales + + # Data cell (3, 2) should have North (spans from row 2) and Cost as left headers + left_relations_3_2 = data.left_heading_relations(3, 2) + self.assertIn((2, 0), left_relations_3_2) # North (row span) + self.assertIn((3, 1), left_relations_3_2) # Cost + + # Data cell (5, 3) should have South and Sales as left headers + left_relations_5_3 = data.left_heading_relations(5, 3) + self.assertIn((5, 0), left_relations_5_3) # South + self.assertIn((5, 1), left_relations_5_3) # Sales + + # Test top heading relations + top_relations_2_2 = data.top_heading_relations(2, 2) + self.assertIn((0, 2), top_relations_2_2) # Quarter 1 + self.assertIn((1, 2), top_relations_2_2) # Jan + + def test_nested_header_groups_with_col_spans(self): + """Test a complex table with nested header groups and various column spans""" + data = parse_html_tables( + """ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
RegionStore2024 Sales Data
First HalfSecond Half
Q1Q2Q3Q4
JanFebMarAprMayJunJulAugSepOctNovDec
WestStore A101112131415161718192021
Store B222324252627282930313233
""" + )[0] + + print("\n=== Nested Header Groups with Col Spans Test ===") + print(data) + + # Test nested header structure + self.assertEqual(data.cell_text[0, 0], "Region") + self.assertEqual(data.cell_text[0, 1], "Store") + self.assertEqual(data.cell_text[0, 2], "2024 Sales Data") + + self.assertEqual(data.cell_text[1, 2], "First Half") + self.assertEqual(data.cell_text[1, 8], "Second Half") + + self.assertEqual(data.cell_text[2, 2], "Q1") + self.assertEqual(data.cell_text[2, 5], "Q2") + self.assertEqual(data.cell_text[2, 8], "Q3") + self.assertEqual(data.cell_text[2, 11], "Q4") + + self.assertEqual(data.cell_text[3, 2], "Jan") + self.assertEqual(data.cell_text[3, 7], "Jun") + self.assertEqual(data.cell_text[3, 13], "Dec") + + # Test data rows + self.assertEqual(data.cell_text[4, 0], "West") + self.assertEqual(data.cell_text[4, 1], "Store A") + self.assertEqual(data.cell_text[4, 2], "10") + self.assertEqual(data.cell_text[5, 1], "Store B") + self.assertEqual(data.cell_text[5, 2], "22") + + # Test all header cells are marked + self.assertIn((0, 0), data.heading_cells) # Region + self.assertIn((0, 1), data.heading_cells) # Store + self.assertIn((0, 2), data.heading_cells) # 2024 Sales Data + self.assertIn((1, 2), data.heading_cells) # First Half + self.assertIn((2, 2), data.heading_cells) # Q1 + self.assertIn((3, 2), data.heading_cells) # Jan + + # Test multiple top heading relations for a data cell + # Cell (4, 2) - January data for Store A should have all 4 levels of headers + top_relations_4_2 = data.top_heading_relations(4, 2) + self.assertIn((0, 2), top_relations_4_2) # 2024 Sales Data + self.assertIn((1, 2), top_relations_4_2) # First Half + self.assertIn((2, 2), top_relations_4_2) # Q1 + self.assertIn((3, 2), top_relations_4_2) # Jan + + # Cell (4, 7) - June data should relate to Q2 and First Half + top_relations_4_7 = data.top_heading_relations(4, 7) + self.assertIn((0, 2), top_relations_4_7) # 2024 Sales Data + self.assertIn((1, 2), top_relations_4_7) # First Half + self.assertIn((2, 5), top_relations_4_7) # Q2 + self.assertIn((3, 7), top_relations_4_7) # Jun + + # Cell (4, 13) - December data should relate to Q4 and Second Half + top_relations_4_13 = data.top_heading_relations(4, 13) + self.assertIn((0, 2), top_relations_4_13) # 2024 Sales Data + self.assertIn((1, 8), top_relations_4_13) # Second Half + self.assertIn((2, 11), top_relations_4_13) # Q4 + self.assertIn((3, 13), top_relations_4_13) # Dec + + # Test left heading relations + # Store B row that says "22" should relate to just West + left_relations_5_2 = data.left_heading_relations(5, 2) + self.assertEqual(data.cell_text[5,2], "22") + self.assertIn((4, 0), left_relations_5_2) + self.assertEqual(len(left_relations_5_2), 1) + + # But of the left headings themselves at the top, January, should have both Region and Store + self.assertEqual(data.left_heading_relations(3,2), {(0, 0), (0, 1)}) \ No newline at end of file