Mirror of https://github.com/Unstructured-IO/unstructured.git (synced 2025-07-19 07:02:38 +00:00)

This pull request adds metrics that are calculated based on table_as_cells instead of text_as_html. This change is required for comprehensive metrics calculation: previously, every predicted colspan or rowspan was counted as an incorrect prediction, even when the prediction was correct. This change has to be merged after https://github.com/Unstructured-IO/unstructured/pull/2892, which introduces the table_as_cells field.
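For illustration only, here is a hypothetical Table element (not part of this change set) showing how the same column span appears in both fields; the x/y/w/h/content keys mirror the ones used in the tests below. In text_as_html the span is only visible as a colspan attribute, while table_as_cells carries it as an explicit cell width, so a correctly predicted span can be scored cell-for-cell instead of being counted as an error:

# Hypothetical element used only to illustrate the two representations.
element_with_span = {
    "type": "Table",
    "metadata": {
        "text_as_html": (
            '<table><thead><th colspan="2">Month</th></thead>'
            "<tr><td>A</td><td>B</td></tr></table>"
        ),
        "table_as_cells": [
            # The header cell spans two columns, hence w=2.
            {"x": 0, "y": 0, "w": 2, "h": 1, "content": "Month"},
            {"x": 0, "y": 1, "w": 1, "h": 1, "content": "A"},
            {"x": 1, "y": 1, "w": 1, "h": 1, "content": "B"},
        ],
    },
}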
249 lines · 6.9 KiB · Python
import re

import pytest

from unstructured.metrics import text_extraction
from unstructured.metrics.table.table_extraction import (
    extract_cells_from_table_as_cells,
    extract_cells_from_text_as_html,
)
from unstructured.partition.auto import partition


def test_calculate_edit_distance():
    # "cct" = cleaned concatenated text, the plain-text representation compared
    # by the edit-distance metric.
    source_cct = "I like pizza. I like bagels."
    source_cct_word_space = "I like p i z z a . I like bagles."
    source_cct_spaces = re.sub(r"\s+", " ", " ".join(source_cct))
    source_cct_no_space = source_cct.replace(" ", "")
    source_cct_one_sentence = "I like pizza."
    source_cct_missing_word = "I like pizza. I like ."
    source_cct_addn_char = "I like pizza. I like beagles."
    source_cct_dup_word = "I like pizza pizza. I like bagels."

    assert (
        round(text_extraction.calculate_edit_distance(source_cct, source_cct, return_as="score"), 2)
        == 1.0
    )
    assert (
        round(
            text_extraction.calculate_edit_distance(
                source_cct_word_space,
                source_cct,
                return_as="score",
            ),
            2,
        )
        == 0.75
    )
    assert (
        round(
            text_extraction.calculate_edit_distance(
                source_cct_spaces,
                source_cct,
                return_as="score",
            ),
            2,
        )
        == 0.39
    )
    assert (
        round(
            text_extraction.calculate_edit_distance(
                source_cct_no_space,
                source_cct,
                return_as="score",
            ),
            2,
        )
        == 0.64
    )
    assert (
        round(
            text_extraction.calculate_edit_distance(
                source_cct_one_sentence,
                source_cct,
                return_as="score",
            ),
            2,
        )
        == 0.0
    )
    assert (
        round(
            text_extraction.calculate_edit_distance(
                source_cct_missing_word,
                source_cct,
                return_as="score",
            ),
            2,
        )
        == 0.57
    )
    assert (
        round(
            text_extraction.calculate_edit_distance(
                source_cct_addn_char,
                source_cct,
                return_as="score",
            ),
            2,
        )
        == 0.89
    )
    assert (
        round(
            text_extraction.calculate_edit_distance(
                source_cct_dup_word,
                source_cct,
                return_as="score",
            ),
            2,
        )
        == 0.79
    )


@pytest.mark.parametrize(
    ("filename", "expected_score", "expected_distance"),
    [
        ("fake-text.txt", 0.78, 38),
    ],
)
def test_calculate_edit_distance_with_filename(filename, expected_score, expected_distance):
    with open("example-docs/fake-text.txt") as f:
        source_cct = f.read()

    elements = partition(filename=f"example-docs/{filename}")
    output_cct = "\n".join([str(el) for el in elements])

    score = text_extraction.calculate_edit_distance(output_cct, source_cct, return_as="score")
    distance = text_extraction.calculate_edit_distance(output_cct, source_cct, return_as="distance")

    assert score >= 0
    assert score <= 1.0
    assert distance >= 0
    assert round(score, 2) == expected_score
    assert distance == expected_distance


@pytest.mark.parametrize(
    ("text", "expected"),
    [
        (
            "The dog loved the cat, but the cat loved the cow",
            {"the": 4, "cat": 2, "loved": 2, "dog": 1, "but": 1, "cow": 1},
        ),
        (
            "Hello my name is H a r p e r, what's your name?",
            {"hello": 1, "my": 1, "name": 2, "is": 1, "what's": 1, "your": 1},
        ),
        (
            "I have a dog and a cat, I love my dog.",
            {"i": 2, "have": 1, "a": 2, "dog": 2, "and": 1, "cat": 1, "love": 1, "my": 1},
        ),
        (
            "My dog's hair is red, but the dogs' houses are blue.",
            {
                "my": 1,
                "dog's": 1,
                "hair": 1,
                "is": 1,
                "red": 1,
                "but": 1,
                "the": 1,
                "dogs'": 1,
                "houses": 1,
                "are": 1,
                "blue": 1,
            },
        ),
        (
            """Sometimes sentences have a dash - like this one!
            A hyphen connects 2 words with no gap: easy-peasy.""",
            {
                "sometimes": 1,
                "sentences": 1,
                "have": 1,
                "a": 2,
                "dash": 1,
                "like": 1,
                "this": 1,
                "one": 1,
                "hyphen": 1,
                "connects": 1,
                "2": 1,
                "words": 1,
                "with": 1,
                "no": 1,
                "gap": 1,
                "easy-peasy": 1,
            },
        ),
    ],
)
def test_bag_of_words(text, expected):
    assert text_extraction.bag_of_words(text) == expected


@pytest.mark.parametrize(
    ("output_text", "source_text", "expected_percentage"),
    [
        (
            "extra",
            "",
            0,
        ),
        (
            "",
            "Source text has a sentence.",
            1,
        ),
        (
            "The original s e n t e n c e is normal.",
            "The original sentence is normal...",
            0.2,
        ),
        (
            "We saw 23% improvement in this quarter.",
            "We saw 23% improvement in sales this quarter.",
            0.125,
        ),
        (
            "no",
            "Is it possible to have more than everything missing?",
            1,
        ),
    ],
)
def test_calculate_percent_missing_text(output_text, source_text, expected_percentage):
    assert (
        text_extraction.calculate_percent_missing_text(output_text, source_text)
        == expected_percentage
    )


def test_cells_extraction_from_prediction_when_simple_example():
    example_element = {
        "type": "Table",
        "metadata": {
            "text_as_html": "<table><thead><th>Month A.</th></thead><tr><td>22</td></tr></table>",
            "table_as_cells": [
                {"x": 0, "y": 0, "w": 1, "h": 1, "content": "Month A."},
                {"x": 0, "y": 1, "w": 1, "h": 1, "content": "22"},
            ],
        },
    }
    expected_extraction = [
        {"row_index": 0, "col_index": 0, "content": "Month A."},
        {"row_index": 1, "col_index": 0, "content": "22"},
    ]

    assert extract_cells_from_text_as_html(example_element) == expected_extraction
    assert extract_cells_from_table_as_cells(example_element) == expected_extraction


def test_cells_extraction_from_prediction_when_missing_prediction():
    example_element = {"type": "Table", "metadata": {"text_as_html": "", "table_as_cells": []}}
    assert extract_cells_from_text_as_html(example_element) is None
    assert extract_cells_from_table_as_cells(example_element) is None