mirror of
				https://github.com/allenai/olmocr.git
				synced 2025-11-03 19:45:41 +00:00 
			
		
		
		
	Adding more row span col span tests
This commit is contained in:
		
							parent
							
								
									c9f0b2c709
								
							
						
					
					
						commit
						cce7a6c4de
					
				@ -178,3 +178,370 @@ class TestParseHtmlTables(unittest.TestCase):
 | 
			
		||||
        self.assertEqual(data.left_heading_relations(1, 3), {(1, 0)})
 | 
			
		||||
 | 
			
		||||
        self.assertEqual(data.top_heading_relations(3, 1), {(0, 1), (0, 3)})
 | 
			
		||||
 | 
			
		||||
    def test_complex_multi_level_headers(self):
        """Check a table whose <thead> stacks three header rows.

        The fixture has an empty corner <th> with rowspan=3, one colspan=6
        header, two colspan=3 sub-headers, six leaf headers and two body rows.
        """
        html = """
                    <table border="1">
                        <thead>
                            <tr>
                                <th rowspan="3"></th>
                                <th colspan="6">Main Category</th>
                            </tr>
                            <tr>
                                <th colspan="3">Sub Category A</th>
                                <th colspan="3">Sub Category B</th>
                            </tr>
                            <tr>
                                <th>A1</th>
                                <th>A2</th>
                                <th>A3</th>
                                <th>B1</th>
                                <th>B2</th>
                                <th>B3</th>
                            </tr>
                        </thead>
                        <tbody>
                            <tr>
                                <td>Row 1</td>
                                <td>10</td>
                                <td>20</td>
                                <td>30</td>
                                <td>40</td>
                                <td>50</td>
                                <td>60</td>
                            </tr>
                            <tr>
                                <td>Row 2</td>
                                <td>15</td>
                                <td>25</td>
                                <td>35</td>
                                <td>45</td>
                                <td>55</td>
                                <td>65</td>
                            </tr>
                        </tbody>
                    </table>"""
        data = parse_html_tables(html)[0]

        print("\n=== Complex Multi-Level Headers Test ===")
        print(data)

        # Expected text keyed by (row, col). Spanning headers are looked up at
        # the top-left grid position they occupy.
        expected_text = {
            # Level 1: empty corner cell and the full-width header
            (0, 0): "",
            (0, 1): "Main Category",
            # Level 2: the two sub-categories
            (1, 1): "Sub Category A",
            (1, 4): "Sub Category B",
            # Level 3: leaf headers
            (2, 1): "A1",
            (2, 2): "A2",
            (2, 3): "A3",
            (2, 4): "B1",
            (2, 5): "B2",
            (2, 6): "B3",
            # Body rows
            (3, 0): "Row 1",
            (3, 1): "10",
            (4, 0): "Row 2",
            (4, 1): "15",
        }
        for (row, col), text in expected_text.items():
            self.assertEqual(data.cell_text[row, col], text)

        # Exactly the <th> cells of the three header rows must be headings.
        first_two_rows = {(0, 0), (0, 1), (1, 1), (1, 4)}
        leaf_row = {(2, col) for col in range(1, 7)}
        self.assertEqual(data.heading_cells, first_two_rows | leaf_row)

        # Each data cell must see one header from every level above it:
        # (3, 1) sits under Main Category / Sub Category A / A1,
        # (3, 4) sits under Main Category / Sub Category B / B1.
        expected_top = (
            ((3, 1), ((0, 1), (1, 1), (2, 1))),
            ((3, 4), ((0, 1), (1, 4), (2, 4))),
        )
        for (row, col), headers in expected_top:
            related = data.top_heading_relations(row, col)
            for header in headers:
                self.assertIn(header, related)

        # The first column of each body row acts as that row's left heading.
        for row in (3, 4):
            self.assertEqual(data.left_heading_relations(row, 1), {(row, 0)})
 | 
			
		||||
 | 
			
		||||
    def test_left_headers_with_row_spans(self):
        """Check row-spanning <th> cells on the left side of the body.

        The fixture has a two-row column header (quarters over months) and two
        body groups, "North" and "South", each a rowspan=3 <th> followed by a
        per-row <th> ("Sales", "Cost", "Profit").
        """
        html = """
                    <table border="1">
                        <thead>
                            <tr>
                                <th rowspan="2" colspan="2"></th>
                                <th colspan="3">Quarter 1</th>
                                <th colspan="3">Quarter 2</th>
                            </tr>
                            <tr>
                                <th>Jan</th>
                                <th>Feb</th>
                                <th>Mar</th>
                                <th>Apr</th>
                                <th>May</th>
                                <th>Jun</th>
                            </tr>
                        </thead>
                        <tbody>
                            <tr>
                                <th rowspan="3">North</th>
                                <th>Sales</th>
                                <td>100</td>
                                <td>110</td>
                                <td>120</td>
                                <td>130</td>
                                <td>140</td>
                                <td>150</td>
                            </tr>
                            <tr>
                                <th>Cost</th>
                                <td>50</td>
                                <td>55</td>
                                <td>60</td>
                                <td>65</td>
                                <td>70</td>
                                <td>75</td>
                            </tr>
                            <tr>
                                <th>Profit</th>
                                <td>50</td>
                                <td>55</td>
                                <td>60</td>
                                <td>65</td>
                                <td>70</td>
                                <td>75</td>
                            </tr>
                            <tr>
                                <th rowspan="3">South</th>
                                <th>Sales</th>
                                <td>200</td>
                                <td>210</td>
                                <td>220</td>
                                <td>230</td>
                                <td>240</td>
                                <td>250</td>
                            </tr>
                            <tr>
                                <th>Cost</th>
                                <td>100</td>
                                <td>105</td>
                                <td>110</td>
                                <td>115</td>
                                <td>120</td>
                                <td>125</td>
                            </tr>
                            <tr>
                                <th>Profit</th>
                                <td>100</td>
                                <td>105</td>
                                <td>110</td>
                                <td>115</td>
                                <td>120</td>
                                <td>125</td>
                            </tr>
                        </tbody>
                    </table>"""
        data = parse_html_tables(html)[0]

        print("\n=== Left Headers with Row Spans Test ===")
        print(data)

        # Expected text keyed by (row, col); columns 0-1 hold the left headers.
        expected_text = {
            # Column headers
            (0, 2): "Quarter 1",
            (0, 5): "Quarter 2",
            (1, 2): "Jan",
            (1, 3): "Feb",
            (1, 4): "Mar",
            # Left headers of the "North" group
            (2, 0): "North",
            (2, 1): "Sales",
            (3, 1): "Cost",
            (4, 1): "Profit",
            # Left headers of the "South" group
            (5, 0): "South",
            (5, 1): "Sales",
            # A few data values
            (2, 2): "100",
            (3, 2): "50",
            (5, 2): "200",
        }
        for (row, col), text in expected_text.items():
            self.assertEqual(data.cell_text[row, col], text)

        # Both the column headers and the body <th> cells must be headings:
        # Quarter 1, Jan, North, Sales, South.
        for heading in ((0, 2), (1, 2), (2, 0), (2, 1), (5, 0)):
            self.assertIn(heading, data.heading_cells)

        # Every data cell should see its group header plus its own row header.
        # For row 3 the group header "North" is reached through the row span
        # that starts at row 2.
        expected_left = (
            ((2, 2), ((2, 0), (2, 1))),  # North, Sales
            ((3, 2), ((2, 0), (3, 1))),  # North (row span), Cost
            ((5, 3), ((5, 0), (5, 1))),  # South, Sales
        )
        for (row, col), headers in expected_left:
            related = data.left_heading_relations(row, col)
            for header in headers:
                self.assertIn(header, related)

        # Column headers above the first data cell: Quarter 1 and Jan.
        above_first_cell = data.top_heading_relations(2, 2)
        for header in ((0, 2), (1, 2)):
            self.assertIn(header, above_first_cell)
 | 
			
		||||
 | 
			
		||||
    def test_nested_header_groups_with_col_spans(self):
        """Check a four-level column header built from nested colspans.

        The fixture nests year -> half -> quarter -> month headers next to two
        rowspan=4 headers ("Region", "Store"); the body uses plain <td> cells,
        the first of which spans two rows.
        """
        html = """
                    <table border="1">
                        <thead>
                            <tr>
                                <th rowspan="4">Region</th>
                                <th rowspan="4">Store</th>
                                <th colspan="12">2024 Sales Data</th>
                            </tr>
                            <tr>
                                <th colspan="6">First Half</th>
                                <th colspan="6">Second Half</th>
                            </tr>
                            <tr>
                                <th colspan="3">Q1</th>
                                <th colspan="3">Q2</th>
                                <th colspan="3">Q3</th>
                                <th colspan="3">Q4</th>
                            </tr>
                            <tr>
                                <th>Jan</th>
                                <th>Feb</th>
                                <th>Mar</th>
                                <th>Apr</th>
                                <th>May</th>
                                <th>Jun</th>
                                <th>Jul</th>
                                <th>Aug</th>
                                <th>Sep</th>
                                <th>Oct</th>
                                <th>Nov</th>
                                <th>Dec</th>
                            </tr>
                        </thead>
                        <tbody>
                            <tr>
                                <td rowspan="2">West</td>
                                <td>Store A</td>
                                <td>10</td>
                                <td>11</td>
                                <td>12</td>
                                <td>13</td>
                                <td>14</td>
                                <td>15</td>
                                <td>16</td>
                                <td>17</td>
                                <td>18</td>
                                <td>19</td>
                                <td>20</td>
                                <td>21</td>
                            </tr>
                            <tr>
                                <td>Store B</td>
                                <td>22</td>
                                <td>23</td>
                                <td>24</td>
                                <td>25</td>
                                <td>26</td>
                                <td>27</td>
                                <td>28</td>
                                <td>29</td>
                                <td>30</td>
                                <td>31</td>
                                <td>32</td>
                                <td>33</td>
                            </tr>
                        </tbody>
                    </table>"""
        data = parse_html_tables(html)[0]

        print("\n=== Nested Header Groups with Col Spans Test ===")
        print(data)

        # Expected text keyed by (row, col). Spanning headers are looked up at
        # the top-left grid position they occupy.
        expected_text = {
            # Level 1
            (0, 0): "Region",
            (0, 1): "Store",
            (0, 2): "2024 Sales Data",
            # Level 2: halves
            (1, 2): "First Half",
            (1, 8): "Second Half",
            # Level 3: quarters
            (2, 2): "Q1",
            (2, 5): "Q2",
            (2, 8): "Q3",
            (2, 11): "Q4",
            # Level 4: first, middle and last month
            (3, 2): "Jan",
            (3, 7): "Jun",
            (3, 13): "Dec",
            # Body rows
            (4, 0): "West",
            (4, 1): "Store A",
            (4, 2): "10",
            (5, 1): "Store B",
            (5, 2): "22",
        }
        for (row, col), text in expected_text.items():
            self.assertEqual(data.cell_text[row, col], text)

        # One representative header per level (plus Region/Store) must be
        # flagged as a heading cell.
        for heading in ((0, 0), (0, 1), (0, 2), (1, 2), (2, 2), (3, 2)):
            self.assertIn(heading, data.heading_cells)

        # Data cells in row 4 must relate to all four header levels above them.
        expected_top = (
            # January: 2024 Sales Data / First Half / Q1 / Jan
            ((4, 2), ((0, 2), (1, 2), (2, 2), (3, 2))),
            # June: 2024 Sales Data / First Half / Q2 / Jun
            ((4, 7), ((0, 2), (1, 2), (2, 5), (3, 7))),
            # December: 2024 Sales Data / Second Half / Q4 / Dec
            ((4, 13), ((0, 2), (1, 8), (2, 11), (3, 13))),
        )
        for (row, col), headers in expected_top:
            related = data.top_heading_relations(row, col)
            for header in headers:
                self.assertIn(header, related)

        # The "22" cell in the Store B row is expected to relate to exactly one
        # left heading: the row-spanning "West" cell anchored at (4, 0).
        west_only = data.left_heading_relations(5, 2)
        self.assertEqual(data.cell_text[5, 2], "22")
        self.assertIn((4, 0), west_only)
        self.assertEqual(len(west_only), 1)

        # Within the header block, "Jan" at (3, 2) is expected to report both
        # Region and Store as the headings to its left.
        self.assertEqual(data.left_heading_relations(3, 2), {(0, 0), (0, 1)})
 | 
			
		||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user