mirror of
				https://github.com/Unstructured-IO/unstructured.git
				synced 2025-11-03 19:43:24 +00:00 
			
		
		
		
	feat: table evaluations for fixed html table generation (#3196)
Update to the evaluation script to handle correct HTML syntax for tables. See https://github.com/Unstructured-IO/unstructured-inference/pull/355 for details. This change: - modifies transforming HTML tables to evaluation internal `cells` format - fixes the indexing of the output (internal format cells) when HTML cells use spans
This commit is contained in:
		
							parent
							
								
									dadc9c6d0b
								
							
						
					
					
						commit
						29e64eb281
					
				@ -1,4 +1,4 @@
 | 
			
		||||
## 0.14.6-dev5
 | 
			
		||||
## 0.14.6-dev6
 | 
			
		||||
 | 
			
		||||
### Enhancements
 | 
			
		||||
 | 
			
		||||
@ -13,6 +13,7 @@
 | 
			
		||||
* **table metric bug fix** `get_element_level_alignment()` now finds all the matched indices in predicted table data instead of only returning the first match in the case of multiple matches for the same gt string.
 | 
			
		||||
* **fsspec connector path/permissions bug** V2 fsspec connectors were failing when defined relative filepaths had a leading slash. This strips that slash to guarantee the relative path never has it.
 | 
			
		||||
* **Dropbox connector internal file path bugs** Dropbox source connector currently raises exceptions when indexing files due to two issues: a path formatting idiosyncrasy of the Dropbox library and a divergence in the definition of the Dropbox library's `fs.info` method, which expects a 'url' parameter rather than 'path'.
 | 
			
		||||
* **update table metric evaluation to handle corrected HTML syntax for tables** This change is connected to the [unstructured-inference update](https://github.com/Unstructured-IO/unstructured-inference/pull/355) - it fixes transforming HTML tables to the Deckerd format and to the internal cells format.
 | 
			
		||||
 | 
			
		||||
## 0.14.5
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -33,7 +33,7 @@ def test_table_eval_processor_simple():
 | 
			
		||||
        {
 | 
			
		||||
            "type": "Table",
 | 
			
		||||
            "metadata": {
 | 
			
		||||
                "text_as_html": """<table><thead><th>r1c1</th><th>r1c2</th></thead>
 | 
			
		||||
                "text_as_html": """<table><thead><tr><th>r1c1</th><th>r1c2</th></tr></thead>
 | 
			
		||||
                    <tbody><tr><td>r2c1</td><td>r2c2</td></tr></tbody></table>"""
 | 
			
		||||
            },
 | 
			
		||||
        }
 | 
			
		||||
@ -201,14 +201,62 @@ def test_table_eval_processor_when_wrong_source_type():
 | 
			
		||||
@pytest.mark.parametrize(
 | 
			
		||||
    "text_as_html",
 | 
			
		||||
    [
 | 
			
		||||
        """<table><thead><th>r1c1</th><th>r1c2</th></thead>
 | 
			
		||||
            <tbody><tr><td>r2c1</td><td>r2c2</td></tr><tr><td>r3c1</td>
 | 
			
		||||
            <td>r3c2</td></tr></tbody></table>""",
 | 
			
		||||
        """<table><tr><th>r1c1</th><th>r1c2</th></tr>
 | 
			
		||||
            <tbody><tr><td>r2c1</td><td>r2c2</td></tr><tr><td>r3c1</td>
 | 
			
		||||
            <td>r3c2</td></tr></tbody></table>""",
 | 
			
		||||
        """<table><tr><td>r1c1</td><td>r1c2</td></tr><tr><td>r2c1</td>
 | 
			
		||||
            <td>r2c2</td></tr><tr><td>r3c1</td><td>r3c2</td></tr></tbody></table>""",
 | 
			
		||||
        """
 | 
			
		||||
<table>
 | 
			
		||||
    <thead>
 | 
			
		||||
        <tr>
 | 
			
		||||
            <th>r1c1</th>
 | 
			
		||||
            <th>r1c2</th>
 | 
			
		||||
        </tr>
 | 
			
		||||
    </thead>
 | 
			
		||||
    <tbody>
 | 
			
		||||
        <tr>
 | 
			
		||||
            <td>r2c1</td>
 | 
			
		||||
            <td>r2c2</td>
 | 
			
		||||
        </tr>
 | 
			
		||||
        <tr>
 | 
			
		||||
            <td>r3c1</td>
 | 
			
		||||
            <td>r3c2</td>
 | 
			
		||||
        </tr>
 | 
			
		||||
    </tbody>
 | 
			
		||||
</table>
 | 
			
		||||
""",
 | 
			
		||||
        """
 | 
			
		||||
<table>
 | 
			
		||||
    <tr>
 | 
			
		||||
        <th>r1c1</th>
 | 
			
		||||
        <th>r1c2</th>
 | 
			
		||||
    </tr>
 | 
			
		||||
    <tbody>
 | 
			
		||||
        <tr>
 | 
			
		||||
            <td>r2c1</td>
 | 
			
		||||
            <td>r2c2</td>
 | 
			
		||||
        </tr>
 | 
			
		||||
        <tr>
 | 
			
		||||
            <td>r3c1</td>
 | 
			
		||||
            <td>r3c2</td>
 | 
			
		||||
        </tr>
 | 
			
		||||
    </tbody>
 | 
			
		||||
</table>
 | 
			
		||||
""",
 | 
			
		||||
        """
 | 
			
		||||
<table>
 | 
			
		||||
    </tbody>
 | 
			
		||||
        <tr>
 | 
			
		||||
            <td>r1c1</td>
 | 
			
		||||
            <td>r1c2</td>
 | 
			
		||||
        </tr>
 | 
			
		||||
        <tr>
 | 
			
		||||
            <td>r2c1</td>
 | 
			
		||||
            <td>r2c2</td>
 | 
			
		||||
        </tr>
 | 
			
		||||
        <tr>
 | 
			
		||||
            <td>r3c1</td>
 | 
			
		||||
            <td>r3c2</td>
 | 
			
		||||
        </tr>
 | 
			
		||||
    </tbody>
 | 
			
		||||
</table>
 | 
			
		||||
""",
 | 
			
		||||
    ],
 | 
			
		||||
)
 | 
			
		||||
def test_table_eval_processor_various_table_html_structures(text_as_html):
 | 
			
		||||
@ -285,8 +333,21 @@ def test_table_eval_processor_non_str_values_in_table():
 | 
			
		||||
        {
 | 
			
		||||
            "type": "Table",
 | 
			
		||||
            "metadata": {
 | 
			
		||||
                "text_as_html": """<table><thead><th>11</th><th>12</th></thead>
 | 
			
		||||
                    <tbody><tr><td>21</td><td>22</td></tr></tbody></table>"""
 | 
			
		||||
                "text_as_html": """
 | 
			
		||||
<table>
 | 
			
		||||
    <thead>
 | 
			
		||||
        <tr>
 | 
			
		||||
            <th>11</th>
 | 
			
		||||
            <th>12</th>
 | 
			
		||||
        </tr>
 | 
			
		||||
    </thead>
 | 
			
		||||
    <tbody>
 | 
			
		||||
        <tr>
 | 
			
		||||
            <td>21</td>
 | 
			
		||||
            <td>22</td>
 | 
			
		||||
        </tr>
 | 
			
		||||
    </tbody>
 | 
			
		||||
</table>"""
 | 
			
		||||
            },
 | 
			
		||||
        }
 | 
			
		||||
    ]
 | 
			
		||||
@ -341,19 +402,38 @@ def test_table_eval_processor_non_str_values_in_table():
 | 
			
		||||
    assert result.element_col_level_content_acc == 1.0
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@pytest.mark.xfail(
 | 
			
		||||
    reason="This is expected to fail as table eval metrics does not cover merged cells"
 | 
			
		||||
)
 | 
			
		||||
def test_table_eval_processor_merged_cells():
 | 
			
		||||
    prediction = [
 | 
			
		||||
        {
 | 
			
		||||
            "type": "Table",
 | 
			
		||||
            "metadata": {
 | 
			
		||||
                "text_as_html": """
 | 
			
		||||
                <table><thead><th rowspan="2">r1c1</th><th>r1c2</th><th colspan="2">r1c3</th></tr>
 | 
			
		||||
                <tr><th>r2c2</th><th>r2c3</th><th>r2c4</th><</thead>
 | 
			
		||||
                <tbody><tr><td>r3c1</td><td>r3c2</td><td colspan="2" rowspan="2">r3c3</td></tr>
 | 
			
		||||
                <tr><td>r4c1</td><td>r4c2</td></tr></tbody></table>"""
 | 
			
		||||
<table>
 | 
			
		||||
    <thead>
 | 
			
		||||
        <tr>
 | 
			
		||||
            <th rowspan="2">r1c1</th>
 | 
			
		||||
            <th>r1c2</th>
 | 
			
		||||
            <th colspan="2">r1c3</th>
 | 
			
		||||
        </tr>
 | 
			
		||||
        <tr>
 | 
			
		||||
            <th>r2c2</th>
 | 
			
		||||
            <th>r2c3</th>
 | 
			
		||||
            <th>r2c4</th>
 | 
			
		||||
        </tr>
 | 
			
		||||
    </thead>
 | 
			
		||||
    <tbody>
 | 
			
		||||
        <tr>
 | 
			
		||||
            <td>r3c1</td>
 | 
			
		||||
            <td>r3c2</td>
 | 
			
		||||
            <td colspan="2" rowspan="2">r3c3</td>
 | 
			
		||||
        </tr>
 | 
			
		||||
        <tr>
 | 
			
		||||
            <td>r4c1</td>
 | 
			
		||||
            <td>r4c2</td>
 | 
			
		||||
        </tr>
 | 
			
		||||
    </tbody>
 | 
			
		||||
</table>
 | 
			
		||||
"""
 | 
			
		||||
            },
 | 
			
		||||
        }
 | 
			
		||||
    ]
 | 
			
		||||
 | 
			
		||||
@ -159,7 +159,7 @@ def test_calculate_edit_distance_with_filename(filename, expected_score, expecte
 | 
			
		||||
        ),
 | 
			
		||||
        (
 | 
			
		||||
            """Sometimes sentences have a dash - like this one!
 | 
			
		||||
                A hyphen connects 2 words with no gap: easy-peasy.""",
 | 
			
		||||
                    A hyphen connects 2 words with no gap: easy-peasy.""",
 | 
			
		||||
            {
 | 
			
		||||
                "sometimes": 1,
 | 
			
		||||
                "sentences": 1,
 | 
			
		||||
@ -222,24 +222,334 @@ def test_calculate_percent_missing_text(output_text, source_text, expected_perce
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_cells_extraction_from_prediction_when_simple_example():
 | 
			
		||||
    example_element = {
 | 
			
		||||
        "type": "Table",
 | 
			
		||||
        "metadata": {
 | 
			
		||||
            "text_as_html": "<table><thead><th>Month A.</th></thead><tr><td>22</td></tr></table>",
 | 
			
		||||
            "table_as_cells": [
 | 
			
		||||
@pytest.mark.parametrize(
 | 
			
		||||
    ("table_as_cells", "expected_extraction"),
 | 
			
		||||
    [
 | 
			
		||||
        pytest.param(
 | 
			
		||||
            [
 | 
			
		||||
                {"x": 0, "y": 0, "w": 1, "h": 1, "content": "Month A."},
 | 
			
		||||
                {"x": 0, "y": 1, "w": 1, "h": 1, "content": "22"},
 | 
			
		||||
            ],
 | 
			
		||||
            [
 | 
			
		||||
                {"row_index": 0, "col_index": 0, "content": "Month A."},
 | 
			
		||||
                {"row_index": 1, "col_index": 0, "content": "22"},
 | 
			
		||||
            ],
 | 
			
		||||
            id="Simple table, 1 head cell, 1 body cell, no spans",
 | 
			
		||||
        ),
 | 
			
		||||
        pytest.param(
 | 
			
		||||
            [
 | 
			
		||||
                {"x": 0, "y": 0, "w": 1, "h": 1, "content": "Month A."},
 | 
			
		||||
                {"x": 1, "y": 0, "w": 1, "h": 1, "content": "Month B."},
 | 
			
		||||
                {"x": 2, "y": 0, "w": 1, "h": 1, "content": "Month C."},
 | 
			
		||||
                {"x": 0, "y": 1, "w": 1, "h": 1, "content": "11"},
 | 
			
		||||
                {"x": 1, "y": 1, "w": 1, "h": 1, "content": "12"},
 | 
			
		||||
                {"x": 2, "y": 1, "w": 1, "h": 1, "content": "13"},
 | 
			
		||||
                {"x": 0, "y": 2, "w": 1, "h": 1, "content": "21"},
 | 
			
		||||
                {"x": 1, "y": 2, "w": 1, "h": 1, "content": "22"},
 | 
			
		||||
                {"x": 2, "y": 2, "w": 1, "h": 1, "content": "23"},
 | 
			
		||||
            ],
 | 
			
		||||
            [
 | 
			
		||||
                {"row_index": 0, "col_index": 0, "content": "Month A."},
 | 
			
		||||
                {"row_index": 0, "col_index": 1, "content": "Month B."},
 | 
			
		||||
                {"row_index": 0, "col_index": 2, "content": "Month C."},
 | 
			
		||||
                {"row_index": 1, "col_index": 0, "content": "11"},
 | 
			
		||||
                {"row_index": 1, "col_index": 1, "content": "12"},
 | 
			
		||||
                {"row_index": 1, "col_index": 2, "content": "13"},
 | 
			
		||||
                {"row_index": 2, "col_index": 0, "content": "21"},
 | 
			
		||||
                {"row_index": 2, "col_index": 1, "content": "22"},
 | 
			
		||||
                {"row_index": 2, "col_index": 2, "content": "23"},
 | 
			
		||||
            ],
 | 
			
		||||
            id="Simple table, 3 head cell, 5 body cell, no spans",
 | 
			
		||||
        ),
 | 
			
		||||
        # +----------+---------------------+----------+
 | 
			
		||||
        # |          |       h1col23       |  h1col4  |
 | 
			
		||||
        # | h12col1  |----------+----------+----------|
 | 
			
		||||
        # |          |  h2col2  |       h2col34       |
 | 
			
		||||
        # |----------|----------+----------+----------+
 | 
			
		||||
        # |  r3col1  |  r3col2  |                     |
 | 
			
		||||
        # |----------+----------|      r34col34       |
 | 
			
		||||
        # |       r4col12       |                     |
 | 
			
		||||
        # +----------+----------+----------+----------+
 | 
			
		||||
        pytest.param(
 | 
			
		||||
            [
 | 
			
		||||
                {
 | 
			
		||||
                    "y": 0,
 | 
			
		||||
                    "x": 0,
 | 
			
		||||
                    "w": 2,
 | 
			
		||||
                    "h": 1,
 | 
			
		||||
                    "content": "h12col1",
 | 
			
		||||
                },
 | 
			
		||||
                {
 | 
			
		||||
                    "y": 0,
 | 
			
		||||
                    "x": 1,
 | 
			
		||||
                    "w": 1,
 | 
			
		||||
                    "h": 2,
 | 
			
		||||
                    "content": "h1col23",
 | 
			
		||||
                },
 | 
			
		||||
                {
 | 
			
		||||
                    "y": 0,
 | 
			
		||||
                    "x": 3,
 | 
			
		||||
                    "w": 1,
 | 
			
		||||
                    "h": 1,
 | 
			
		||||
                    "content": "h1col4",
 | 
			
		||||
                },
 | 
			
		||||
                {
 | 
			
		||||
                    "y": 1,
 | 
			
		||||
                    "x": 1,
 | 
			
		||||
                    "w": 1,
 | 
			
		||||
                    "h": 1,
 | 
			
		||||
                    "content": "h2col2",
 | 
			
		||||
                },
 | 
			
		||||
                {
 | 
			
		||||
                    "y": 1,
 | 
			
		||||
                    "x": 2,
 | 
			
		||||
                    "w": 1,
 | 
			
		||||
                    "h": 2,
 | 
			
		||||
                    "content": "h2col34",
 | 
			
		||||
                },
 | 
			
		||||
                {
 | 
			
		||||
                    "y": 2,
 | 
			
		||||
                    "x": 0,
 | 
			
		||||
                    "w": 1,
 | 
			
		||||
                    "h": 1,
 | 
			
		||||
                    "content": "r3col1",
 | 
			
		||||
                },
 | 
			
		||||
                {
 | 
			
		||||
                    "y": 2,
 | 
			
		||||
                    "x": 1,
 | 
			
		||||
                    "w": 1,
 | 
			
		||||
                    "h": 1,
 | 
			
		||||
                    "content": "r3col2",
 | 
			
		||||
                },
 | 
			
		||||
                {
 | 
			
		||||
                    "y": 2,
 | 
			
		||||
                    "x": 2,
 | 
			
		||||
                    "w": 2,
 | 
			
		||||
                    "h": 2,
 | 
			
		||||
                    "content": "r34col34",
 | 
			
		||||
                },
 | 
			
		||||
                {
 | 
			
		||||
                    "y": 3,
 | 
			
		||||
                    "x": 0,
 | 
			
		||||
                    "w": 1,
 | 
			
		||||
                    "h": 2,
 | 
			
		||||
                    "content": "r4col12",
 | 
			
		||||
                },
 | 
			
		||||
            ],
 | 
			
		||||
            [
 | 
			
		||||
                {
 | 
			
		||||
                    "row_index": 0,
 | 
			
		||||
                    "col_index": 0,
 | 
			
		||||
                    "content": "h12col1",
 | 
			
		||||
                },
 | 
			
		||||
                {
 | 
			
		||||
                    "row_index": 0,
 | 
			
		||||
                    "col_index": 1,
 | 
			
		||||
                    "content": "h1col23",
 | 
			
		||||
                },
 | 
			
		||||
                {
 | 
			
		||||
                    "row_index": 0,
 | 
			
		||||
                    "col_index": 3,
 | 
			
		||||
                    "content": "h1col4",
 | 
			
		||||
                },
 | 
			
		||||
                {
 | 
			
		||||
                    "row_index": 1,
 | 
			
		||||
                    "col_index": 1,
 | 
			
		||||
                    "content": "h2col2",
 | 
			
		||||
                },
 | 
			
		||||
                {
 | 
			
		||||
                    "row_index": 1,
 | 
			
		||||
                    "col_index": 2,
 | 
			
		||||
                    "content": "h2col34",
 | 
			
		||||
                },
 | 
			
		||||
                {
 | 
			
		||||
                    "row_index": 2,
 | 
			
		||||
                    "col_index": 0,
 | 
			
		||||
                    "content": "r3col1",
 | 
			
		||||
                },
 | 
			
		||||
                {
 | 
			
		||||
                    "row_index": 2,
 | 
			
		||||
                    "col_index": 1,
 | 
			
		||||
                    "content": "r3col2",
 | 
			
		||||
                },
 | 
			
		||||
                {
 | 
			
		||||
                    "row_index": 2,
 | 
			
		||||
                    "col_index": 2,
 | 
			
		||||
                    "content": "r34col34",
 | 
			
		||||
                },
 | 
			
		||||
                {
 | 
			
		||||
                    "row_index": 3,
 | 
			
		||||
                    "col_index": 0,
 | 
			
		||||
                    "content": "r4col12",
 | 
			
		||||
                },
 | 
			
		||||
            ],
 | 
			
		||||
            id="various spans, with 2 row header",
 | 
			
		||||
        ),
 | 
			
		||||
    ],
 | 
			
		||||
)
 | 
			
		||||
def test_cells_table_extraction_from_prediction(table_as_cells, expected_extraction):
 | 
			
		||||
    example_element = {
 | 
			
		||||
        "type": "Table",
 | 
			
		||||
        "metadata": {"table_as_cells": table_as_cells},
 | 
			
		||||
    }
 | 
			
		||||
    assert extract_cells_from_table_as_cells(example_element) == expected_extraction
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@pytest.mark.parametrize(
 | 
			
		||||
    ("text_as_html", "expected_extraction"),
 | 
			
		||||
    [
 | 
			
		||||
        pytest.param(
 | 
			
		||||
            """
 | 
			
		||||
<table>
 | 
			
		||||
    <thead>
 | 
			
		||||
        <tr>
 | 
			
		||||
            <th>Month A.</th>
 | 
			
		||||
        </tr>
 | 
			
		||||
    </thead>
 | 
			
		||||
    <tbody>
 | 
			
		||||
        <tr>
 | 
			
		||||
            <td>22</td>
 | 
			
		||||
        </tr>
 | 
			
		||||
    </tbody>
 | 
			
		||||
</table>"
 | 
			
		||||
            """,
 | 
			
		||||
            [
 | 
			
		||||
                {"row_index": 0, "col_index": 0, "content": "Month A."},
 | 
			
		||||
                {"row_index": 1, "col_index": 0, "content": "22"},
 | 
			
		||||
            ],
 | 
			
		||||
            id="Simple table, 1 head cell, 1 body cell, no spans",
 | 
			
		||||
        ),
 | 
			
		||||
        pytest.param(
 | 
			
		||||
            """
 | 
			
		||||
<table>
 | 
			
		||||
    <thead>
 | 
			
		||||
        <tr>
 | 
			
		||||
            <th>Month A.</th>
 | 
			
		||||
            <th>Month B.</th>
 | 
			
		||||
            <th>Month C.</th>
 | 
			
		||||
        </tr>
 | 
			
		||||
    </thead>
 | 
			
		||||
    <tbody>
 | 
			
		||||
        <tr>
 | 
			
		||||
            <td>11</td>
 | 
			
		||||
            <td>12</td>
 | 
			
		||||
            <td>13</td>
 | 
			
		||||
        </tr>
 | 
			
		||||
        <tr>
 | 
			
		||||
            <td>21</td>
 | 
			
		||||
            <td>22</td>
 | 
			
		||||
            <td>23</td>
 | 
			
		||||
        </tr>
 | 
			
		||||
    </tbody>
 | 
			
		||||
</table>"
 | 
			
		||||
""",
 | 
			
		||||
            [
 | 
			
		||||
                {"row_index": 0, "col_index": 0, "content": "Month A."},
 | 
			
		||||
                {"row_index": 0, "col_index": 1, "content": "Month B."},
 | 
			
		||||
                {"row_index": 0, "col_index": 2, "content": "Month C."},
 | 
			
		||||
                {"row_index": 1, "col_index": 0, "content": "11"},
 | 
			
		||||
                {"row_index": 1, "col_index": 1, "content": "12"},
 | 
			
		||||
                {"row_index": 1, "col_index": 2, "content": "13"},
 | 
			
		||||
                {"row_index": 2, "col_index": 0, "content": "21"},
 | 
			
		||||
                {"row_index": 2, "col_index": 1, "content": "22"},
 | 
			
		||||
                {"row_index": 2, "col_index": 2, "content": "23"},
 | 
			
		||||
            ],
 | 
			
		||||
            id="Simple table, 3 head cell, 5 body cell, no spans",
 | 
			
		||||
        ),
 | 
			
		||||
        # +----------+---------------------+----------+
 | 
			
		||||
        # |          |       h1col23       |  h1col4  |
 | 
			
		||||
        # | h12col1  |----------+----------+----------|
 | 
			
		||||
        # |          |  h2col2  |       h2col34       |
 | 
			
		||||
        # |----------|----------+----------+----------+
 | 
			
		||||
        # |  r3col1  |  r3col2  |                     |
 | 
			
		||||
        # |----------+----------|      r34col34       |
 | 
			
		||||
        # |       r4col12       |                     |
 | 
			
		||||
        # +----------+----------+----------+----------+
 | 
			
		||||
        pytest.param(
 | 
			
		||||
            """
 | 
			
		||||
<table>
 | 
			
		||||
    <thead>
 | 
			
		||||
        <tr>
 | 
			
		||||
            <th rowspan="2">h12col1</th>
 | 
			
		||||
            <th colspan="2">h1col23</th>
 | 
			
		||||
            <th>h1col4</th>
 | 
			
		||||
        </tr>
 | 
			
		||||
        <tr>
 | 
			
		||||
            <th>h2col2</th>
 | 
			
		||||
            <th colspan="2">h2col34</th>
 | 
			
		||||
        </tr>
 | 
			
		||||
    </thead>
 | 
			
		||||
    <tbody>
 | 
			
		||||
        <tr>
 | 
			
		||||
            <td>r3col1</td>
 | 
			
		||||
            <td>r3col2</td>
 | 
			
		||||
            <td colspan="2" rowspan="2">r34col34</td>
 | 
			
		||||
        </tr>
 | 
			
		||||
        <tr>
 | 
			
		||||
            <td colspan="2">r4col12</td>
 | 
			
		||||
        </tr>
 | 
			
		||||
    </tbody>
 | 
			
		||||
</table>
 | 
			
		||||
""",
 | 
			
		||||
            [
 | 
			
		||||
                {
 | 
			
		||||
                    "row_index": 0,
 | 
			
		||||
                    "col_index": 0,
 | 
			
		||||
                    "content": "h12col1",
 | 
			
		||||
                },
 | 
			
		||||
                {
 | 
			
		||||
                    "row_index": 0,
 | 
			
		||||
                    "col_index": 1,
 | 
			
		||||
                    "content": "h1col23",
 | 
			
		||||
                },
 | 
			
		||||
                {
 | 
			
		||||
                    "row_index": 0,
 | 
			
		||||
                    "col_index": 3,
 | 
			
		||||
                    "content": "h1col4",
 | 
			
		||||
                },
 | 
			
		||||
                {
 | 
			
		||||
                    "row_index": 1,
 | 
			
		||||
                    "col_index": 1,
 | 
			
		||||
                    "content": "h2col2",
 | 
			
		||||
                },
 | 
			
		||||
                {
 | 
			
		||||
                    "row_index": 1,
 | 
			
		||||
                    "col_index": 2,
 | 
			
		||||
                    "content": "h2col34",
 | 
			
		||||
                },
 | 
			
		||||
                {
 | 
			
		||||
                    "row_index": 2,
 | 
			
		||||
                    "col_index": 0,
 | 
			
		||||
                    "content": "r3col1",
 | 
			
		||||
                },
 | 
			
		||||
                {
 | 
			
		||||
                    "row_index": 2,
 | 
			
		||||
                    "col_index": 1,
 | 
			
		||||
                    "content": "r3col2",
 | 
			
		||||
                },
 | 
			
		||||
                {
 | 
			
		||||
                    "row_index": 2,
 | 
			
		||||
                    "col_index": 2,
 | 
			
		||||
                    "content": "r34col34",
 | 
			
		||||
                },
 | 
			
		||||
                {
 | 
			
		||||
                    "row_index": 3,
 | 
			
		||||
                    "col_index": 0,
 | 
			
		||||
                    "content": "r4col12",
 | 
			
		||||
                },
 | 
			
		||||
            ],
 | 
			
		||||
            id="various spans, with 2 row header",
 | 
			
		||||
        ),
 | 
			
		||||
    ],
 | 
			
		||||
)
 | 
			
		||||
def test_html_table_extraction_from_prediction(text_as_html, expected_extraction):
 | 
			
		||||
    example_element = {
 | 
			
		||||
        "type": "Table",
 | 
			
		||||
        "metadata": {
 | 
			
		||||
            "text_as_html": text_as_html,
 | 
			
		||||
        },
 | 
			
		||||
    }
 | 
			
		||||
    expected_extraction = [
 | 
			
		||||
        {"row_index": 0, "col_index": 0, "content": "Month A."},
 | 
			
		||||
        {"row_index": 1, "col_index": 0, "content": "22"},
 | 
			
		||||
    ]
 | 
			
		||||
 | 
			
		||||
    assert extract_cells_from_text_as_html(example_element) == expected_extraction
 | 
			
		||||
    assert extract_cells_from_table_as_cells(example_element) == expected_extraction
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_cells_extraction_from_prediction_when_missing_prediction():
 | 
			
		||||
 | 
			
		||||
@ -1 +1 @@
 | 
			
		||||
__version__ = "0.14.6-dev5"  # pragma: no cover
 | 
			
		||||
__version__ = "0.14.6-dev6"  # pragma: no cover
 | 
			
		||||
 | 
			
		||||
@ -11,8 +11,34 @@ EMPTY_CELL = {
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def _convert_table_from_html(content: str) -> List[Dict[str, Any]]:
 | 
			
		||||
    """Convert html format to table structure.
 | 
			
		||||
def _move_cells_for_spanned_cells(cells: List[Dict[str, Any]]):
    """Move cells to the right if spanned cells have an influence on the rendering.

    Cells are visited in reading order (top to bottom, left to right). Every
    cell with ``w > 1`` or ``h > 1`` reserves the extra grid slots it covers;
    any cell that starts on a reserved slot is pushed right, together with all
    cells to its right in the same row, until it lands on a free slot.

    The cell dictionaries are updated in place (their ``"x"`` value may grow);
    the returned list holds the same dictionaries, sorted by ``(y, x)`` as they
    were before any shifting.

    Args:
        cells: List of cells in the table in Deckerd format.

    Returns:
        List of cells in the table in Deckerd format with cells moved to the right if spanned.
    """
    sorted_cells = sorted(cells, key=lambda x: (x["y"], x["x"]))
    cells_occupied_by_spanned = set()
    for cell in sorted_cells:
        # Settle the final column of this cell first. The footprint of a
        # spanning cell must be recorded at its final position; recording it
        # before the shift makes the cell collide with its own footprint and
        # reserves the wrong slots in the rows below.
        while (cell["y"], cell["x"]) in cells_occupied_by_spanned:
            cell_y, cell_x = cell["y"], cell["x"]
            cells_to_the_right = [c for c in sorted_cells if c["y"] == cell_y and c["x"] >= cell_x]
            for cell_to_move in cells_to_the_right:
                cell_to_move["x"] += 1
            # The slot has been skipped over by the whole remainder of the row.
            cells_occupied_by_spanned.remove((cell_y, cell_x))

        if cell["w"] > 1 or cell["h"] > 1:
            # Reserve every slot covered by the span except the cell's own origin.
            for i in range(cell["y"], cell["y"] + cell["h"]):
                for j in range(cell["x"], cell["x"] + cell["w"]):
                    if (i, j) != (cell["y"], cell["x"]):
                        cells_occupied_by_spanned.add((i, j))
    return sorted_cells
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def _html_table_to_deckerd(content: str) -> List[Dict[str, Any]]:
 | 
			
		||||
    """Convert html format to Deckerd table structure.
 | 
			
		||||
 | 
			
		||||
    Args:
 | 
			
		||||
        content: The html content with a table to extract.
 | 
			
		||||
@ -20,33 +46,38 @@ def _convert_table_from_html(content: str) -> List[Dict[str, Any]]:
 | 
			
		||||
    Returns:
 | 
			
		||||
        A list of dictionaries where each dictionary represents a cell in the table.
 | 
			
		||||
    """
 | 
			
		||||
 | 
			
		||||
    soup = BeautifulSoup(content, "html.parser")
 | 
			
		||||
    table = soup.find("table")
 | 
			
		||||
    rows = table.findAll(["tr", "thead"])
 | 
			
		||||
    rows = table.findAll(["tr"])
 | 
			
		||||
    table_data = []
 | 
			
		||||
 | 
			
		||||
    for i, row in enumerate(rows):
 | 
			
		||||
        headers = row.findAll("th")
 | 
			
		||||
        data_row = row.findAll("td")
 | 
			
		||||
        cells = row.findAll(["th", "td"])
 | 
			
		||||
        for j, cell_data in enumerate(cells):
 | 
			
		||||
            cell = {
 | 
			
		||||
                "y": i,
 | 
			
		||||
                "x": j,
 | 
			
		||||
                "w": int(cell_data.attrs.get("colspan", 1)),
 | 
			
		||||
                "h": int(cell_data.attrs.get("rowspan", 1)),
 | 
			
		||||
                "content": cell_data.text,
 | 
			
		||||
            }
 | 
			
		||||
            table_data.append(cell)
 | 
			
		||||
    return _move_cells_for_spanned_cells(table_data)
 | 
			
		||||
 | 
			
		||||
        if headers:
 | 
			
		||||
            for j, header in enumerate(headers):
 | 
			
		||||
                cell = {
 | 
			
		||||
                    "row_index": i,
 | 
			
		||||
                    "col_index": j,
 | 
			
		||||
                    "content": header.text,
 | 
			
		||||
                }
 | 
			
		||||
                table_data.append(cell)
 | 
			
		||||
 | 
			
		||||
        if data_row:
 | 
			
		||||
            for k, data in enumerate(data_row):
 | 
			
		||||
                cell = {
 | 
			
		||||
                    "row_index": i,
 | 
			
		||||
                    "col_index": k,
 | 
			
		||||
                    "content": data.text,
 | 
			
		||||
                }
 | 
			
		||||
                table_data.append(cell)
 | 
			
		||||
    return table_data
 | 
			
		||||
def _convert_table_from_html(content: str) -> List[Dict[str, Any]]:
    """Turn an html table into the internal table structure.

    The html is first parsed into the intermediate Deckerd cell format, which
    is more convenient to work with, and that intermediate result is then
    converted to the final structure.

    Args:
        content: The html content with a table to extract.

    Returns:
        A list of dictionaries where each dictionary represents a cell in the table.
    """
    return _convert_table_from_deckerd(_html_table_to_deckerd(content))
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def _convert_table_from_deckerd(content: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
 | 
			
		||||
@ -151,11 +182,15 @@ def extract_cells_from_text_as_html(element: Dict[str, Any]) -> List[Dict[str, A
 | 
			
		||||
            "metadata": {
 | 
			
		||||
                "text_as_html": "<table>
 | 
			
		||||
                                    <thead>
 | 
			
		||||
                                        <th>Month A.</th>
 | 
			
		||||
                                        <tr>
 | 
			
		||||
                                            <th>Month A.</th>
 | 
			
		||||
                                        </tr>
 | 
			
		||||
                                    </thead>
 | 
			
		||||
                                    <tr>
 | 
			
		||||
                                        <td>22</td><
 | 
			
		||||
                                    /tr>
 | 
			
		||||
                                    </tbody>
 | 
			
		||||
                                        <tr>
 | 
			
		||||
                                            <td>22</td><
 | 
			
		||||
                                        </tr>
 | 
			
		||||
                                    </tbody>
 | 
			
		||||
                                </table>"
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user