mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-07-03 07:05:20 +00:00

This PR redefines the `table_level_acc` metric as follows: - for each predicted table, use the sequence matching ratio as its accuracy - as a prerequisite for the sequence matching, we sort the table cells by row, then by column, for both the prediction and the ground truth to ensure they are ordered the same way - average the accuracy over all predicted tables - any prediction without a matching ground truth (a false positive) decreases the score - a prediction that splits a ground truth table into smaller tables also gets a low score, with perfectly equal splits getting the lowest score. This new definition makes the metric a value between 0 and 1 per file. It replaces the existing definition, where the metric is the ratio of (the number of predicted tables that have a match in the ground truth) to (the number of ground truth tables). The existing metric actually gives higher values for predictions that split tables, and it can be higher than 1. The new definition prefers predictions that do not split ground truth tables.
356 lines
11 KiB
Python
356 lines
11 KiB
Python
import pytest
|
|
|
|
from unstructured.metrics.table.table_eval import TableEvalProcessor
|
|
from unstructured.metrics.table_structure import (
|
|
eval_table_transformer_for_file,
|
|
image_or_pdf_to_dataframe,
|
|
)
|
|
|
|
|
|
@pytest.mark.parametrize(
    "filename",
    [
        "example-docs/table-multi-row-column-cells.png",
        "example-docs/table-multi-row-column-cells.pdf",
    ],
)
def test_image_or_pdf_to_dataframe(filename):
    """Check that the table extracted from `filename` contains the expected "Blind" row.

    Runs once for each parametrized example document (a PNG and a PDF).
    """
    expected_row = ["Blind", "5", "1", "4", "34.5%, n=1", "1199 sec, n=1"]

    df = image_or_pdf_to_dataframe(filename)

    # NOTE: `expected_row in df.values` is evaluated by NumPy as
    # `(df.values == expected_row).any()`, which is already true when a single
    # cell matches. Comparing against plain Python lists requires the complete
    # row to be present.
    assert expected_row in df.values.tolist()
|
|
|
|
|
|
def test_eval_table_transformer_for_file():
    """Guard against a severe drop of the table transformer score on an example image.

    The prediction for the PNG is scored against the companion CSV file.
    """
    score = eval_table_transformer_for_file(
        "example-docs/table-multi-row-column-cells.png",
        "example-docs/table-multi-row-column-cells-actual.csv",
    )
    # avoid severe degradation of performance; the upper bound is inclusive so
    # that a perfect score does not fail the test
    assert 0.8 < score <= 1
|
|
|
|
|
|
def test_table_eval_processor_simple():
    """A 2x2 predicted table matching the ground truth cell for cell scores 1.0 everywhere."""
    html = """<table><thead><th>r1c1</th><th>r1c2</th></thead>
<tbody><tr><td>r2c1</td><td>r2c2</td></tr></tbody></table>"""
    prediction = [{"type": "Table", "metadata": {"text_as_html": html}}]

    # (id, x, y, content) of each ground-truth cell; every cell has w == h == 1.
    cell_specs = (
        ("ee862c7a-d27e-4484-92de-4faa42a63f3b", 0, 0, "r1c1"),
        ("6237ac7b-bfc8-40d2-92f2-d138277205e2", 0, 1, "r2c1"),
        ("9d0933a9-5984-4cad-80d9-6752bf9bc4df", 1, 0, "r1c2"),
        ("1152d043-5ead-4ab8-8b88-888d48831ac2", 1, 1, "r2c2"),
    )
    cells = [
        {"id": cell_id, "x": x, "y": y, "w": 1, "h": 1, "content": content}
        for cell_id, x, y, content in cell_specs
    ]
    ground_truth = [{"type": "Table", "text": cells}]

    result = TableEvalProcessor(prediction, ground_truth).process_file()

    assert result.total_tables == 1
    for metric in (
        "table_level_acc",
        "element_row_level_index_acc",
        "element_col_level_index_acc",
        "element_row_level_content_acc",
        "element_col_level_content_acc",
    ):
        assert getattr(result, metric) == 1.0
|
|
|
|
|
|
@pytest.mark.parametrize(
    "text_as_html",
    [
        """<table><thead><th>r1c1</th><th>r1c2</th></thead>
<tbody><tr><td>r2c1</td><td>r2c2</td></tr><tr><td>r3c1</td>
<td>r3c2</td></tr></tbody></table>""",
        """<table><tr><th>r1c1</th><th>r1c2</th></tr>
<tbody><tr><td>r2c1</td><td>r2c2</td></tr><tr><td>r3c1</td>
<td>r3c2</td></tr></tbody></table>""",
        """<table><tr><td>r1c1</td><td>r1c2</td></tr><tr><td>r2c1</td>
<td>r2c2</td></tr><tr><td>r3c1</td><td>r3c2</td></tr></tbody></table>""",
    ],
)
def test_table_eval_processor_various_table_html_structures(text_as_html):
    """Each parametrized HTML layout of the same 3x2 table must score 1.0 on every metric."""
    prediction = [{"type": "Table", "metadata": {"text_as_html": text_as_html}}]

    # (id, x, y, content) of each ground-truth cell; every cell has w == h == 1.
    cell_specs = (
        ("ee862c7a-d27e-4484-92de-4faa42a63f3b", 0, 0, "r1c1"),
        ("6237ac7b-bfc8-40d2-92f2-d138277205e2", 0, 1, "r2c1"),
        ("9d0933a9-5984-4cad-80d9-6752bf9bc4df", 1, 0, "r1c2"),
        ("1152d043-5ead-4ab8-8b88-888d48831ac2", 1, 1, "r2c2"),
        ("364f4a17-2979-4506-ae77-e8adf8e3f554", 0, 2, "r3c1"),
        ("30f87503-ac1f-4db1-b924-b316af585702", 1, 2, "r3c2"),
    )
    cells = [
        {"id": cell_id, "x": x, "y": y, "w": 1, "h": 1, "content": content}
        for cell_id, x, y, content in cell_specs
    ]
    ground_truth = [{"type": "Table", "text": cells}]

    result = TableEvalProcessor(prediction, ground_truth).process_file()

    assert result.total_tables == 1
    for metric in (
        "table_level_acc",
        "element_row_level_index_acc",
        "element_col_level_index_acc",
        "element_row_level_content_acc",
        "element_col_level_content_acc",
    ):
        assert getattr(result, metric) == 1.0
|
|
|
|
|
|
def test_table_eval_processor_non_str_values_in_table():
    """A table whose cell contents are all digits must still score 1.0 on every metric."""
    html = """<table><thead><th>11</th><th>12</th></thead>
<tbody><tr><td>21</td><td>22</td></tr></tbody></table>"""
    prediction = [{"type": "Table", "metadata": {"text_as_html": html}}]

    # (id, x, y, content) of each ground-truth cell; every cell has w == h == 1.
    cell_specs = (
        ("ee862c7a-d27e-4484-92de-4faa42a63f3b", 0, 0, "11"),
        ("6237ac7b-bfc8-40d2-92f2-d138277205e2", 0, 1, "21"),
        ("9d0933a9-5984-4cad-80d9-6752bf9bc4df", 1, 0, "12"),
        ("1152d043-5ead-4ab8-8b88-888d48831ac2", 1, 1, "22"),
    )
    cells = [
        {"id": cell_id, "x": x, "y": y, "w": 1, "h": 1, "content": content}
        for cell_id, x, y, content in cell_specs
    ]
    ground_truth = [{"type": "Table", "text": cells}]

    result = TableEvalProcessor(prediction, ground_truth).process_file()

    assert result.total_tables == 1
    for metric in (
        "table_level_acc",
        "element_row_level_index_acc",
        "element_col_level_index_acc",
        "element_row_level_content_acc",
        "element_col_level_content_acc",
    ):
        assert getattr(result, metric) == 1.0
|
|
|
|
|
|
@pytest.mark.xfail(
    reason="This is expected to fail as table eval metrics does not cover merged cells"
)
def test_table_eval_processor_merged_cells():
    """Evaluate a 4x4 table containing cells that span multiple rows and/or columns.

    Marked xfail (see the reason above). The assertions describe the target
    behavior: a prediction equal to the ground truth scores 1.0 on every metric.
    """
    # Both header rows are wrapped in explicit <tr>...</tr> pairs so that the
    # markup is well-formed and the only difficulty for the evaluator is the
    # rowspan/colspan handling itself.
    prediction = [
        {
            "type": "Table",
            "metadata": {
                "text_as_html": """
<table><thead><tr><th rowspan="2">r1c1</th><th>r1c2</th><th colspan="2">r1c3</th></tr>
<tr><th>r2c2</th><th>r2c3</th><th>r2c4</th></tr></thead>
<tbody><tr><td>r3c1</td><td>r3c2</td><td colspan="2" rowspan="2">r3c3</td></tr>
<tr><td>r4c1</td><td>r4c2</td></tr></tbody></table>"""
            },
        }
    ]

    # (id, x, y, w, h, content) of each ground-truth cell; w/h > 1 marks a merged cell.
    cell_specs = (
        ("f399ef57-5b88-4509-8971-9cb63246866e", 0, 0, 1, 2, "r1c1"),
        ("2dfdec2f-e8f3-4be7-a6ac-8ff21c4e8556", 0, 2, 1, 1, "r3c1"),
        ("9c771c58-88c7-49d8-9c12-85d0e44b920e", 0, 3, 1, 1, "r4c1"),
        ("5bd6f3f0-34c5-495b-8a28-c4ac96989ef8", 1, 0, 1, 1, "r1c2"),
        ("7b8e6bc2-a310-4dd6-997c-313f951e7f96", 1, 1, 1, 1, "r2c2"),
        ("1c152ad4-12fa-4a7b-90de-a992aa6410a4", 1, 2, 1, 1, "r3c2"),
        ("55063f64-0003-4217-b6ca-aff5914793ff", 1, 3, 1, 1, "r4c2"),
        ("22852e86-0e22-4d32-b63a-9ba7dd4118a2", 2, 0, 2, 1, "r1c3"),
        ("eae013c5-5597-4a8b-9771-82e28c5c5cba", 2, 1, 1, 1, "r2c3"),
        ("0dea3a42-8523-4d6e-9e70-d65cc2314678", 2, 2, 2, 2, "r3c3"),
        ("60093e2c-d3e2-4146-92b5-97a2fc16c061", 3, 1, 1, 1, "r2c4"),
    )
    ground_truth = [
        {
            "type": "Table",
            "text": [
                {"id": cell_id, "x": x, "y": y, "w": w, "h": h, "content": content}
                for cell_id, x, y, w, h, content in cell_specs
            ],
        }
    ]

    te_processor = TableEvalProcessor(prediction, ground_truth)
    result = te_processor.process_file()

    assert result.total_tables == 1
    assert result.table_level_acc == 1.0
    assert result.element_row_level_index_acc == 1.0
    assert result.element_col_level_index_acc == 1.0
    assert result.element_row_level_content_acc == 1.0
    assert result.element_col_level_content_acc == 1.0
|