feat: table evaluations for fixed html table generation (#3196)

Update to the evaluation script to handle correct HTML syntax for
tables.
See https://github.com/Unstructured-IO/unstructured-inference/pull/355
for details.

This change:
- modifies transforming HTML tables to evaluation internal `cells`
format
- fixes the indexing of the output (internal format cells) when HTML
cells use spans
This commit is contained in:
Pawel Kmiecik 2024-06-14 11:03:27 +02:00 committed by GitHub
parent dadc9c6d0b
commit 29e64eb281
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 485 additions and 59 deletions

View File

@ -1,4 +1,4 @@
## 0.14.6-dev5
## 0.14.6-dev6
### Enhancements
@ -13,6 +13,7 @@
* **table metric bug fix** get_element_level_alignment() now will find all the matched indices in predicted table data instead of only returning the first match in the case of multiple matches for the same gt string.
* **fsspec connector path/permissions bug** V2 fsspec connectors were failing when defined relative filepaths had leading slash. This strips that slash to guarantee the relative path never has it.
* **Dropbox connector internal file path bugs** Dropbox source connector currently raises exceptions when indexing files due to two issues: a path formatting idiosyncrasy of the Dropbox library and a divergence in the definition of the Dropbox library's fs.info method, expecting a 'url' parameter rather than 'path'.
* **update table metric evaluation to handle corrected HTML syntax for tables** This change is connected to the [unstructured-inference update](https://github.com/Unstructured-IO/unstructured-inference/pull/355) and fixes the transformation of HTML tables into the Deckerd and internal cells formats.
## 0.14.5

View File

@ -33,7 +33,7 @@ def test_table_eval_processor_simple():
{
"type": "Table",
"metadata": {
"text_as_html": """<table><thead><th>r1c1</th><th>r1c2</th></thead>
"text_as_html": """<table><thead><tr><th>r1c1</th><th>r1c2</th></tr></thead>
<tbody><tr><td>r2c1</td><td>r2c2</td></tr></tbody></table>"""
},
}
@ -201,14 +201,62 @@ def test_table_eval_processor_when_wrong_source_type():
@pytest.mark.parametrize(
"text_as_html",
[
"""<table><thead><th>r1c1</th><th>r1c2</th></thead>
<tbody><tr><td>r2c1</td><td>r2c2</td></tr><tr><td>r3c1</td>
<td>r3c2</td></tr></tbody></table>""",
"""<table><tr><th>r1c1</th><th>r1c2</th></tr>
<tbody><tr><td>r2c1</td><td>r2c2</td></tr><tr><td>r3c1</td>
<td>r3c2</td></tr></tbody></table>""",
"""<table><tr><td>r1c1</td><td>r1c2</td></tr><tr><td>r2c1</td>
<td>r2c2</td></tr><tr><td>r3c1</td><td>r3c2</td></tr></tbody></table>""",
"""
<table>
<thead>
<tr>
<th>r1c1</th>
<th>r1c2</th>
</tr>
</thead>
<tbody>
<tr>
<td>r2c1</td>
<td>r2c2</td>
</tr>
<tr>
<td>r3c1</td>
<td>r3c2</td>
</tr>
</tbody>
</table>
""",
"""
<table>
<tr>
<th>r1c1</th>
<th>r1c2</th>
</tr>
<tbody>
<tr>
<td>r2c1</td>
<td>r2c2</td>
</tr>
<tr>
<td>r3c1</td>
<td>r3c2</td>
</tr>
</tbody>
</table>
""",
"""
<table>
</tbody>
<tr>
<td>r1c1</td>
<td>r1c2</td>
</tr>
<tr>
<td>r2c1</td>
<td>r2c2</td>
</tr>
<tr>
<td>r3c1</td>
<td>r3c2</td>
</tr>
</tbody>
</table>
""",
],
)
def test_table_eval_processor_various_table_html_structures(text_as_html):
@ -285,8 +333,21 @@ def test_table_eval_processor_non_str_values_in_table():
{
"type": "Table",
"metadata": {
"text_as_html": """<table><thead><th>11</th><th>12</th></thead>
<tbody><tr><td>21</td><td>22</td></tr></tbody></table>"""
"text_as_html": """
<table>
<thead>
<tr>
<th>11</th>
<th>12</th>
</tr>
</thead>
<tbody>
<tr>
<td>21</td>
<td>22</td>
</tr>
</tbody>
</table>"""
},
}
]
@ -341,19 +402,38 @@ def test_table_eval_processor_non_str_values_in_table():
assert result.element_col_level_content_acc == 1.0
@pytest.mark.xfail(
reason="This is expected to fail as table eval metrics does not cover merged cells"
)
def test_table_eval_processor_merged_cells():
prediction = [
{
"type": "Table",
"metadata": {
"text_as_html": """
<table><thead><th rowspan="2">r1c1</th><th>r1c2</th><th colspan="2">r1c3</th></tr>
<tr><th>r2c2</th><th>r2c3</th><th>r2c4</th><</thead>
<tbody><tr><td>r3c1</td><td>r3c2</td><td colspan="2" rowspan="2">r3c3</td></tr>
<tr><td>r4c1</td><td>r4c2</td></tr></tbody></table>"""
<table>
<thead>
<tr>
<th rowspan="2">r1c1</th>
<th>r1c2</th>
<th colspan="2">r1c3</th>
</tr>
<tr>
<th>r2c2</th>
<th>r2c3</th>
<th>r2c4</th>
</tr>
</thead>
<tbody>
<tr>
<td>r3c1</td>
<td>r3c2</td>
<td colspan="2" rowspan="2">r3c3</td>
</tr>
<tr>
<td>r4c1</td>
<td>r4c2</td>
</tr>
</tbody>
</table>
"""
},
}
]

View File

@ -159,7 +159,7 @@ def test_calculate_edit_distance_with_filename(filename, expected_score, expecte
),
(
"""Sometimes sentences have a dash - like this one!
A hyphen connects 2 words with no gap: easy-peasy.""",
A hyphen connects 2 words with no gap: easy-peasy.""",
{
"sometimes": 1,
"sentences": 1,
@ -222,24 +222,334 @@ def test_calculate_percent_missing_text(output_text, source_text, expected_perce
)
def test_cells_extraction_from_prediction_when_simple_example():
example_element = {
"type": "Table",
"metadata": {
"text_as_html": "<table><thead><th>Month A.</th></thead><tr><td>22</td></tr></table>",
"table_as_cells": [
@pytest.mark.parametrize(
("table_as_cells", "expected_extraction"),
[
pytest.param(
[
{"x": 0, "y": 0, "w": 1, "h": 1, "content": "Month A."},
{"x": 0, "y": 1, "w": 1, "h": 1, "content": "22"},
],
[
{"row_index": 0, "col_index": 0, "content": "Month A."},
{"row_index": 1, "col_index": 0, "content": "22"},
],
id="Simple table, 1 head cell, 1 body cell, no spans",
),
pytest.param(
[
{"x": 0, "y": 0, "w": 1, "h": 1, "content": "Month A."},
{"x": 1, "y": 0, "w": 1, "h": 1, "content": "Month B."},
{"x": 2, "y": 0, "w": 1, "h": 1, "content": "Month C."},
{"x": 0, "y": 1, "w": 1, "h": 1, "content": "11"},
{"x": 1, "y": 1, "w": 1, "h": 1, "content": "12"},
{"x": 2, "y": 1, "w": 1, "h": 1, "content": "13"},
{"x": 0, "y": 2, "w": 1, "h": 1, "content": "21"},
{"x": 1, "y": 2, "w": 1, "h": 1, "content": "22"},
{"x": 2, "y": 2, "w": 1, "h": 1, "content": "23"},
],
[
{"row_index": 0, "col_index": 0, "content": "Month A."},
{"row_index": 0, "col_index": 1, "content": "Month B."},
{"row_index": 0, "col_index": 2, "content": "Month C."},
{"row_index": 1, "col_index": 0, "content": "11"},
{"row_index": 1, "col_index": 1, "content": "12"},
{"row_index": 1, "col_index": 2, "content": "13"},
{"row_index": 2, "col_index": 0, "content": "21"},
{"row_index": 2, "col_index": 1, "content": "22"},
{"row_index": 2, "col_index": 2, "content": "23"},
],
id="Simple table, 3 head cell, 5 body cell, no spans",
),
# +----------+---------------------+----------+
# | | h1col23 | h1col4 |
# | h12col1 |----------+----------+----------|
# | | h2col2 | h2col34 |
# |----------|----------+----------+----------+
# | r3col1 | r3col2 | |
# |----------+----------| r34col34 |
# | r4col12 | |
# +----------+----------+----------+----------+
pytest.param(
[
{
"y": 0,
"x": 0,
"w": 2,
"h": 1,
"content": "h12col1",
},
{
"y": 0,
"x": 1,
"w": 1,
"h": 2,
"content": "h1col23",
},
{
"y": 0,
"x": 3,
"w": 1,
"h": 1,
"content": "h1col4",
},
{
"y": 1,
"x": 1,
"w": 1,
"h": 1,
"content": "h2col2",
},
{
"y": 1,
"x": 2,
"w": 1,
"h": 2,
"content": "h2col34",
},
{
"y": 2,
"x": 0,
"w": 1,
"h": 1,
"content": "r3col1",
},
{
"y": 2,
"x": 1,
"w": 1,
"h": 1,
"content": "r3col2",
},
{
"y": 2,
"x": 2,
"w": 2,
"h": 2,
"content": "r34col34",
},
{
"y": 3,
"x": 0,
"w": 1,
"h": 2,
"content": "r4col12",
},
],
[
{
"row_index": 0,
"col_index": 0,
"content": "h12col1",
},
{
"row_index": 0,
"col_index": 1,
"content": "h1col23",
},
{
"row_index": 0,
"col_index": 3,
"content": "h1col4",
},
{
"row_index": 1,
"col_index": 1,
"content": "h2col2",
},
{
"row_index": 1,
"col_index": 2,
"content": "h2col34",
},
{
"row_index": 2,
"col_index": 0,
"content": "r3col1",
},
{
"row_index": 2,
"col_index": 1,
"content": "r3col2",
},
{
"row_index": 2,
"col_index": 2,
"content": "r34col34",
},
{
"row_index": 3,
"col_index": 0,
"content": "r4col12",
},
],
id="various spans, with 2 row header",
),
],
)
def test_cells_table_extraction_from_prediction(table_as_cells, expected_extraction):
example_element = {
"type": "Table",
"metadata": {"table_as_cells": table_as_cells},
}
assert extract_cells_from_table_as_cells(example_element) == expected_extraction
@pytest.mark.parametrize(
("text_as_html", "expected_extraction"),
[
pytest.param(
"""
<table>
<thead>
<tr>
<th>Month A.</th>
</tr>
</thead>
<tbody>
<tr>
<td>22</td>
</tr>
</tbody>
</table>"
""",
[
{"row_index": 0, "col_index": 0, "content": "Month A."},
{"row_index": 1, "col_index": 0, "content": "22"},
],
id="Simple table, 1 head cell, 1 body cell, no spans",
),
pytest.param(
"""
<table>
<thead>
<tr>
<th>Month A.</th>
<th>Month B.</th>
<th>Month C.</th>
</tr>
</thead>
<tbody>
<tr>
<td>11</td>
<td>12</td>
<td>13</td>
</tr>
<tr>
<td>21</td>
<td>22</td>
<td>23</td>
</tr>
</tbody>
</table>"
""",
[
{"row_index": 0, "col_index": 0, "content": "Month A."},
{"row_index": 0, "col_index": 1, "content": "Month B."},
{"row_index": 0, "col_index": 2, "content": "Month C."},
{"row_index": 1, "col_index": 0, "content": "11"},
{"row_index": 1, "col_index": 1, "content": "12"},
{"row_index": 1, "col_index": 2, "content": "13"},
{"row_index": 2, "col_index": 0, "content": "21"},
{"row_index": 2, "col_index": 1, "content": "22"},
{"row_index": 2, "col_index": 2, "content": "23"},
],
id="Simple table, 3 head cell, 5 body cell, no spans",
),
# +----------+---------------------+----------+
# | | h1col23 | h1col4 |
# | h12col1 |----------+----------+----------|
# | | h2col2 | h2col34 |
# |----------|----------+----------+----------+
# | r3col1 | r3col2 | |
# |----------+----------| r34col34 |
# | r4col12 | |
# +----------+----------+----------+----------+
pytest.param(
"""
<table>
<thead>
<tr>
<th rowspan="2">h12col1</th>
<th colspan="2">h1col23</th>
<th>h1col4</th>
</tr>
<tr>
<th>h2col2</th>
<th colspan="2">h2col34</th>
</tr>
</thead>
<tbody>
<tr>
<td>r3col1</td>
<td>r3col2</td>
<td colspan="2" rowspan="2">r34col34</td>
</tr>
<tr>
<td colspan="2">r4col12</td>
</tr>
</tbody>
</table>
""",
[
{
"row_index": 0,
"col_index": 0,
"content": "h12col1",
},
{
"row_index": 0,
"col_index": 1,
"content": "h1col23",
},
{
"row_index": 0,
"col_index": 3,
"content": "h1col4",
},
{
"row_index": 1,
"col_index": 1,
"content": "h2col2",
},
{
"row_index": 1,
"col_index": 2,
"content": "h2col34",
},
{
"row_index": 2,
"col_index": 0,
"content": "r3col1",
},
{
"row_index": 2,
"col_index": 1,
"content": "r3col2",
},
{
"row_index": 2,
"col_index": 2,
"content": "r34col34",
},
{
"row_index": 3,
"col_index": 0,
"content": "r4col12",
},
],
id="various spans, with 2 row header",
),
],
)
def test_html_table_extraction_from_prediction(text_as_html, expected_extraction):
example_element = {
"type": "Table",
"metadata": {
"text_as_html": text_as_html,
},
}
expected_extraction = [
{"row_index": 0, "col_index": 0, "content": "Month A."},
{"row_index": 1, "col_index": 0, "content": "22"},
]
assert extract_cells_from_text_as_html(example_element) == expected_extraction
assert extract_cells_from_table_as_cells(example_element) == expected_extraction
def test_cells_extraction_from_prediction_when_missing_prediction():

View File

@ -1 +1 @@
__version__ = "0.14.6-dev5" # pragma: no cover
__version__ = "0.14.6-dev6" # pragma: no cover

View File

@ -11,8 +11,34 @@ EMPTY_CELL = {
}
def _convert_table_from_html(content: str) -> List[Dict[str, Any]]:
"""Convert html format to table structure.
def _move_cells_for_spanned_cells(cells: List[Dict[str, Any]]):
"""Move cells to the right if spanned cells have an influence on the rendering.
Args:
cells: List of cells in the table in Deckerd format.
Returns:
List of cells in the table in Deckerd format with cells moved to the right if spanned.
"""
sorted_cells = sorted(cells, key=lambda x: (x["y"], x["x"]))
cells_occupied_by_spanned = set()
for cell in sorted_cells:
if cell["w"] > 1 or cell["h"] > 1:
for i in range(cell["y"], cell["y"] + cell["h"]):
for j in range(cell["x"], cell["x"] + cell["w"]):
if (i, j) != (cell["y"], cell["x"]):
cells_occupied_by_spanned.add((i, j))
while (cell["y"], cell["x"]) in cells_occupied_by_spanned:
cell_y, cell_x = cell["y"], cell["x"]
cells_to_the_right = [c for c in sorted_cells if c["y"] == cell_y and c["x"] >= cell_x]
for cell_to_move in cells_to_the_right:
cell_to_move["x"] += 1
cells_occupied_by_spanned.remove((cell_y, cell_x))
return sorted_cells
def _html_table_to_deckerd(content: str) -> List[Dict[str, Any]]:
    """Convert html format to Deckerd table structure.

    Each ``<tr>`` under the first ``<table>`` in ``content`` is treated as one row, and each
    ``<th>``/``<td>`` inside it becomes one cell. A cell's ``"x"`` starts as its position
    within its row; ``_move_cells_for_spanned_cells`` then shifts cells to account for
    ``colspan``/``rowspan``.

    NOTE(review): when ``content`` has no ``<table>``, ``soup.find`` yields ``None`` and the
    row lookup raises ``AttributeError`` - confirm callers always pass a table.

    Args:
        content: The html content with a table to extract.

    Returns:
        A list of dictionaries where each dictionary represents a cell in the table.
    """
    soup = BeautifulSoup(content, "html.parser")
    table = soup.find("table")
    rows = table.find_all("tr")
    table_data = []

    for i, row in enumerate(rows):
        # Header and data cells are handled uniformly; only position and span matter here.
        for j, cell_data in enumerate(row.find_all(["th", "td"])):
            table_data.append(
                {
                    "y": i,
                    "x": j,
                    # Missing span attributes mean the cell covers a single slot.
                    "w": int(cell_data.attrs.get("colspan", 1)),
                    "h": int(cell_data.attrs.get("rowspan", 1)),
                    "content": cell_data.text,
                }
            )
    return _move_cells_for_spanned_cells(table_data)
if headers:
for j, header in enumerate(headers):
cell = {
"row_index": i,
"col_index": j,
"content": header.text,
}
table_data.append(cell)
if data_row:
for k, data in enumerate(data_row):
cell = {
"row_index": i,
"col_index": k,
"content": data.text,
}
table_data.append(cell)
return table_data
def _convert_table_from_html(content: str) -> List[Dict[str, Any]]:
    """Convert an html table into the table structure used for evaluation.

    The html is not converted directly: it is first turned into Deckerd-format cells, and
    that intermediate result is then passed through the Deckerd converter.

    Args:
        content: The html content with a table to extract.

    Returns:
        A list of dictionaries where each dictionary represents a cell in the table.
    """
    return _convert_table_from_deckerd(_html_table_to_deckerd(content))
def _convert_table_from_deckerd(content: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
@ -151,11 +182,15 @@ def extract_cells_from_text_as_html(element: Dict[str, Any]) -> List[Dict[str, A
"metadata": {
"text_as_html": "<table>
<thead>
<th>Month A.</th>
<tr>
<th>Month A.</th>
</tr>
</thead>
<tr>
<td>22</td><
/tr>
<tbody>
<tr>
<td>22</td>
</tr>
</tbody>
</table>"
}
}