mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-07-08 01:22:43 +00:00

This PR exposes functions in the evaluation module for easy conversion between tables in Deckerd and HTML formats, which are useful in evaluation experiments.
639 lines
18 KiB
Python
639 lines
18 KiB
Python
import re
|
|
|
|
import pytest
|
|
|
|
from unstructured.metrics import text_extraction
|
|
from unstructured.metrics.table.table_extraction import (
|
|
deckerd_table_to_html,
|
|
extract_cells_from_table_as_cells,
|
|
extract_cells_from_text_as_html,
|
|
html_table_to_deckerd,
|
|
)
|
|
from unstructured.partition.auto import partition
|
|
|
|
|
|
def test_calculate_edit_distance():
    """Check the rounded edit-distance score for several perturbations of one reference text.

    Every variant is passed as the first argument and the unmodified reference as the
    second argument of ``text_extraction.calculate_edit_distance`` with
    ``return_as="score"``; the result is rounded to two decimals before comparison.
    """
    reference = "I like pizza. I like bagels."

    # (perturbed text, expected score rounded to 2 decimals)
    scored_variants = [
        # Identical text.
        (reference, 1.0),
        # One word spelled out letter by letter (and a misspelled last word).
        ("I like p i z z a . I like bagles.", 0.75),
        # A space inserted between every character, whitespace runs collapsed to one space.
        (re.sub(r"\s+", " ", " ".join(reference)), 0.39),
        # All spaces removed.
        (reference.replace(" ", ""), 0.64),
        # Only the first sentence.
        ("I like pizza.", 0.0),
        # Last word missing.
        ("I like pizza. I like .", 0.57),
        # Extra character in the last word.
        ("I like pizza. I like beagles.", 0.89),
        # Duplicated word.
        ("I like pizza pizza. I like bagels.", 0.79),
    ]

    for variant, expected_score in scored_variants:
        score = text_extraction.calculate_edit_distance(variant, reference, return_as="score")
        assert round(score, 2) == expected_score
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("filename", "expected_score", "expected_distance"),
    [
        ("fake-text.txt", 0.78, 38),
    ],
)
def test_calculate_edit_distance_with_filename(filename, expected_score, expected_distance):
    """Compare the partitioned text of an example document against the raw file content.

    Args:
        filename: Name of a file inside ``example-docs/``; it is both read verbatim as the
            reference text and passed to ``partition``.
        expected_score: Expected edit-distance score, rounded to two decimals.
        expected_distance: Expected value returned with ``return_as="distance"``.
    """
    # Build the path once from the parameter so the reference text always comes from the
    # same file that is partitioned (it used to be hard-coded to fake-text.txt).
    document_path = f"example-docs/{filename}"

    with open(document_path) as f:
        source_cct = f.read()

    elements = partition(filename=document_path)
    output_cct = "\n".join(str(el) for el in elements)

    score = text_extraction.calculate_edit_distance(output_cct, source_cct, return_as="score")
    distance = text_extraction.calculate_edit_distance(output_cct, source_cct, return_as="distance")

    # Sanity bounds first, then the exact expectations for this document.
    assert score >= 0
    assert score <= 1.0
    assert distance >= 0
    assert round(score, 2) == expected_score
    assert distance == expected_distance
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        (
            "The dog loved the cat, but the cat loved the cow",
            {"the": 4, "cat": 2, "loved": 2, "dog": 1, "but": 1, "cow": 1},
        ),
        (
            # The letter-by-letter "H a r p e r" is absent from the expected counts.
            "Hello my name is H a r p e r, what's your name?",
            {"hello": 1, "my": 1, "name": 2, "is": 1, "what's": 1, "your": 1},
        ),
        (
            # Here the single-letter words "I" and "a" are expected to be counted.
            "I have a dog and a cat, I love my dog.",
            {"i": 2, "have": 1, "a": 2, "dog": 2, "and": 1, "cat": 1, "love": 1, "my": 1},
        ),
        (
            # Apostrophes are expected to stay part of the word.
            "My dog's hair is red, but the dogs' houses are blue.",
            {
                "my": 1, "dog's": 1, "hair": 1, "is": 1, "red": 1, "but": 1,
                "the": 1, "dogs'": 1, "houses": 1, "are": 1, "blue": 1,
            },
        ),
        (
            # A free-standing dash is not counted; a hyphenated word stays one word.
            """Sometimes sentences have a dash - like this one!
A hyphen connects 2 words with no gap: easy-peasy.""",
            {
                "sometimes": 1, "sentences": 1, "have": 1, "a": 2, "dash": 1, "like": 1,
                "this": 1, "one": 1, "hyphen": 1, "connects": 1, "2": 1, "words": 1,
                "with": 1, "no": 1, "gap": 1, "easy-peasy": 1,
            },
        ),
    ],
)
def test_bag_of_words(text, expected):
    """Check that ``text_extraction.bag_of_words`` yields the expected word-count mapping."""
    word_counts = text_extraction.bag_of_words(text)
    assert word_counts == expected
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("output_text", "source_text", "expected_percentage"),
    [
        # Empty source: nothing is expected to be reported as missing.
        ("extra", "", 0),
        # Empty output: everything is expected to be reported as missing.
        ("", "Source text has a sentence.", 1),
        ("The original s e n t e n c e is normal.", "The original sentence is normal...", 0.2),
        (
            "We saw 23% improvement in this quarter.",
            "We saw 23% improvement in sales this quarter.",
            0.125,
        ),
        # Output shares no word with the source; the expected value is still 1.
        ("no", "Is it possible to have more than everything missing?", 1),
    ],
)
def test_calculate_percent_missing_text(output_text, source_text, expected_percentage):
    """Check ``calculate_percent_missing_text(output_text, source_text)`` against a fixed value."""
    missing_share = text_extraction.calculate_percent_missing_text(output_text, source_text)
    assert missing_share == expected_percentage
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("table_as_cells", "expected_extraction"),
    [
        pytest.param(
            [
                {"x": 0, "y": 0, "w": 1, "h": 1, "content": "Month A."},
                {"x": 0, "y": 1, "w": 1, "h": 1, "content": "22"},
            ],
            [
                {"row_index": 0, "col_index": 0, "content": "Month A."},
                {"row_index": 1, "col_index": 0, "content": "22"},
            ],
            id="Simple table, 1 head cell, 1 body cell, no spans",
        ),
        pytest.param(
            [
                {"x": 0, "y": 0, "w": 1, "h": 1, "content": "Month A."},
                {"x": 1, "y": 0, "w": 1, "h": 1, "content": "Month B."},
                {"x": 2, "y": 0, "w": 1, "h": 1, "content": "Month C."},
                {"x": 0, "y": 1, "w": 1, "h": 1, "content": "11"},
                {"x": 1, "y": 1, "w": 1, "h": 1, "content": "12"},
                {"x": 2, "y": 1, "w": 1, "h": 1, "content": "13"},
                {"x": 0, "y": 2, "w": 1, "h": 1, "content": "21"},
                {"x": 1, "y": 2, "w": 1, "h": 1, "content": "22"},
                {"x": 2, "y": 2, "w": 1, "h": 1, "content": "23"},
            ],
            [
                {"row_index": 0, "col_index": 0, "content": "Month A."},
                {"row_index": 0, "col_index": 1, "content": "Month B."},
                {"row_index": 0, "col_index": 2, "content": "Month C."},
                {"row_index": 1, "col_index": 0, "content": "11"},
                {"row_index": 1, "col_index": 1, "content": "12"},
                {"row_index": 1, "col_index": 2, "content": "13"},
                {"row_index": 2, "col_index": 0, "content": "21"},
                {"row_index": 2, "col_index": 1, "content": "22"},
                {"row_index": 2, "col_index": 2, "content": "23"},
            ],
            id="Simple table, 3 head cell, 5 body cell, no spans",
        ),
        # +----------+---------------------+----------+
        # |          |       h1col23       |  h1col4  |
        # | h12col1  |----------+----------+----------|
        # |          |  h2col2  |       h2col34       |
        # |----------|----------+----------+----------+
        # |  r3col1  |  r3col2  |                     |
        # |----------+----------|      r34col34       |
        # |       r4col12       |                     |
        # +----------+----------+----------+----------+
        #
        # NOTE(review): "w"/"h" below look transposed relative to the spans drawn above
        # (e.g. h12col1 covers two rows but carries w=2, h=1) -- confirm the intended
        # convention; the expected extraction only lists x/y-derived indices and content.
        pytest.param(
            [
                {"y": 0, "x": 0, "w": 2, "h": 1, "content": "h12col1"},
                {"y": 0, "x": 1, "w": 1, "h": 2, "content": "h1col23"},
                {"y": 0, "x": 3, "w": 1, "h": 1, "content": "h1col4"},
                {"y": 1, "x": 1, "w": 1, "h": 1, "content": "h2col2"},
                {"y": 1, "x": 2, "w": 1, "h": 2, "content": "h2col34"},
                {"y": 2, "x": 0, "w": 1, "h": 1, "content": "r3col1"},
                {"y": 2, "x": 1, "w": 1, "h": 1, "content": "r3col2"},
                {"y": 2, "x": 2, "w": 2, "h": 2, "content": "r34col34"},
                {"y": 3, "x": 0, "w": 1, "h": 2, "content": "r4col12"},
            ],
            [
                {"row_index": 0, "col_index": 0, "content": "h12col1"},
                {"row_index": 0, "col_index": 1, "content": "h1col23"},
                {"row_index": 0, "col_index": 3, "content": "h1col4"},
                {"row_index": 1, "col_index": 1, "content": "h2col2"},
                {"row_index": 1, "col_index": 2, "content": "h2col34"},
                {"row_index": 2, "col_index": 0, "content": "r3col1"},
                {"row_index": 2, "col_index": 1, "content": "r3col2"},
                {"row_index": 2, "col_index": 2, "content": "r34col34"},
                {"row_index": 3, "col_index": 0, "content": "r4col12"},
            ],
            id="various spans, with 2 row header",
        ),
    ],
)
def test_cells_table_extraction_from_prediction(table_as_cells, expected_extraction):
    """Check cell extraction from an element whose metadata carries ``table_as_cells``.

    The cells are wrapped in a minimal ``Table`` element dict and the result of
    ``extract_cells_from_table_as_cells`` is compared with the expected list of
    ``row_index`` / ``col_index`` / ``content`` dicts.
    """
    metadata = {"table_as_cells": table_as_cells}
    example_element = {"type": "Table", "metadata": metadata}

    extracted_cells = extract_cells_from_table_as_cells(example_element)

    assert extracted_cells == expected_extraction
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("text_as_html", "expected_extraction"),
    [
        # The first two fixtures used to end with a stray '"' after </table>; it was a typo
        # that left junk text after the table and has been removed.
        pytest.param(
            """
            <table>
                <thead>
                    <tr>
                        <th>Month A.</th>
                    </tr>
                </thead>
                <tbody>
                    <tr>
                        <td>22</td>
                    </tr>
                </tbody>
            </table>
            """,
            [
                {"row_index": 0, "col_index": 0, "content": "Month A."},
                {"row_index": 1, "col_index": 0, "content": "22"},
            ],
            id="Simple table, 1 head cell, 1 body cell, no spans",
        ),
        pytest.param(
            """
            <table>
                <thead>
                    <tr>
                        <th>Month A.</th>
                        <th>Month B.</th>
                        <th>Month C.</th>
                    </tr>
                </thead>
                <tbody>
                    <tr>
                        <td>11</td>
                        <td>12</td>
                        <td>13</td>
                    </tr>
                    <tr>
                        <td>21</td>
                        <td>22</td>
                        <td>23</td>
                    </tr>
                </tbody>
            </table>
            """,
            [
                {"row_index": 0, "col_index": 0, "content": "Month A."},
                {"row_index": 0, "col_index": 1, "content": "Month B."},
                {"row_index": 0, "col_index": 2, "content": "Month C."},
                {"row_index": 1, "col_index": 0, "content": "11"},
                {"row_index": 1, "col_index": 1, "content": "12"},
                {"row_index": 1, "col_index": 2, "content": "13"},
                {"row_index": 2, "col_index": 0, "content": "21"},
                {"row_index": 2, "col_index": 1, "content": "22"},
                {"row_index": 2, "col_index": 2, "content": "23"},
            ],
            id="Simple table, 3 head cell, 5 body cell, no spans",
        ),
        # +----------+---------------------+----------+
        # |          |       h1col23       |  h1col4  |
        # | h12col1  |----------+----------+----------|
        # |          |  h2col2  |       h2col34       |
        # |----------|----------+----------+----------+
        # |  r3col1  |  r3col2  |                     |
        # |----------+----------|      r34col34       |
        # |       r4col12       |                     |
        # +----------+----------+----------+----------+
        pytest.param(
            """
            <table>
                <thead>
                    <tr>
                        <th rowspan="2">h12col1</th>
                        <th colspan="2">h1col23</th>
                        <th>h1col4</th>
                    </tr>
                    <tr>
                        <th>h2col2</th>
                        <th colspan="2">h2col34</th>
                    </tr>
                </thead>
                <tbody>
                    <tr>
                        <td>r3col1</td>
                        <td>r3col2</td>
                        <td colspan="2" rowspan="2">r34col34</td>
                    </tr>
                    <tr>
                        <td colspan="2">r4col12</td>
                    </tr>
                </tbody>
            </table>
            """,
            [
                {"row_index": 0, "col_index": 0, "content": "h12col1"},
                {"row_index": 0, "col_index": 1, "content": "h1col23"},
                {"row_index": 0, "col_index": 3, "content": "h1col4"},
                {"row_index": 1, "col_index": 1, "content": "h2col2"},
                {"row_index": 1, "col_index": 2, "content": "h2col34"},
                {"row_index": 2, "col_index": 0, "content": "r3col1"},
                {"row_index": 2, "col_index": 1, "content": "r3col2"},
                {"row_index": 2, "col_index": 2, "content": "r34col34"},
                {"row_index": 3, "col_index": 0, "content": "r4col12"},
            ],
            id="various spans, with 2 row header",
        ),
    ],
)
def test_html_table_extraction_from_prediction(text_as_html, expected_extraction):
    """Check cell extraction from an element whose metadata carries ``text_as_html``.

    Args:
        text_as_html: HTML markup of one table, placed under ``metadata["text_as_html"]``
            of a minimal ``Table`` element dict.
        expected_extraction: Expected list of ``row_index`` / ``col_index`` / ``content``
            dicts returned by ``extract_cells_from_text_as_html``.
    """
    example_element = {
        "type": "Table",
        "metadata": {
            "text_as_html": text_as_html,
        },
    }

    extracted_cells = extract_cells_from_text_as_html(example_element)

    assert extracted_cells == expected_extraction
|
|
|
|
|
|
def test_cells_extraction_from_prediction_when_missing_prediction():
    """Both extractors are expected to return ``None`` when the table metadata is empty."""
    empty_metadata = {"text_as_html": "", "table_as_cells": []}
    example_element = {"type": "Table", "metadata": empty_metadata}

    cells_from_html = extract_cells_from_text_as_html(example_element)
    assert cells_from_html is None

    cells_from_table = extract_cells_from_table_as_cells(example_element)
    assert cells_from_table is None
|
|
|
|
|
|
def _trim_html(html: str) -> str:
|
|
html_lines = [line.strip() for line in html.split("\n") if line]
|
|
return "".join(html_lines)
|
|
|
|
|
|
@pytest.mark.parametrize(
    "html_to_test",
    [
        # Single header cell, single body cell.
        """
        <table>
            <thead>
                <tr>
                    <th>Month A.</th>
                </tr>
            </thead>
            <tbody>
                <tr>
                    <td>22</td>
                </tr>
            </tbody>
        </table>
        """,
        # Three columns, one header row, two body rows, no spans.
        """
        <table>
            <thead>
                <tr>
                    <th>Month A.</th>
                    <th>Month B.</th>
                    <th>Month C.</th>
                </tr>
            </thead>
            <tbody>
                <tr>
                    <td>11</td>
                    <td>12</td>
                    <td>13</td>
                </tr>
                <tr>
                    <td>21</td>
                    <td>22</td>
                    <td>23</td>
                </tr>
            </tbody>
        </table>
        """,
        # Two header rows with a mix of rowspan and colspan cells.
        """
        <table>
            <thead>
                <tr>
                    <th rowspan="2">h12col1</th>
                    <th colspan="2">h1col23</th>
                    <th>h1col4</th>
                </tr>
                <tr>
                    <th>h2col2</th>
                    <th colspan="2">h2col34</th>
                </tr>
            </thead>
            <tbody>
                <tr>
                    <td>r3col1</td>
                    <td>r3col2</td>
                    <td colspan="2" rowspan="2">r34col34</td>
                </tr>
                <tr>
                    <td colspan="2">r4col12</td>
                </tr>
            </tbody>
        </table>
        """,
    ],
)
def test_deckerd_html_converter(html_to_test):
    """Round-trip HTML -> Deckerd -> HTML and compare with the whitespace-trimmed input."""
    round_tripped_html = deckerd_table_to_html(html_table_to_deckerd(html_to_test))
    expected_html = _trim_html(html_to_test)
    assert expected_html == round_tripped_html
|