unstructured/test_unstructured/metrics/test_text_extraction.py
shreyanid 9d228c7ecb
feat: calculate metric for percent of text missing (#1701)
### Summary
Missing text is a particularly important metric of quality for the
Unstructured library because information from the document is not being
captured and therefore not usable by downstream applications.

Add function to calculate the percent of text missing relative to the
source transcription. Function takes 2 text strings (output and source)
as input, and returns the percentage of text missing as a decimal.

### Technical Details
- The 2 input strings are both assumed to already contain clean and
concatenated text (CCT)
- Implementation compares the bags of words (frequency counts for each
word present in the text) of each input text
- Duplicated/extra text is not penalized
- Value is limited to the range [0, 1]

### Test
- Several edge cases are covered in the test function (missing text,
duplicated text, spaced-out words, etc.).
- Can test other cases or text inputs by calling the function with 2 CCT
strings as "output" and "source"
2023-10-10 20:54:49 +00:00

219 lines
5.7 KiB
Python

import re
import pytest
from unstructured.metrics import text_extraction
from unstructured.partition.auto import partition
def test_calculate_edit_distance():
    """Score several altered texts against one reference with ``calculate_edit_distance``.

    Every variant is passed as the first argument and ``source_cct`` as the second,
    with ``return_as="score"``; the result is rounded to two decimals before comparison.
    """
    source_cct = "I like pizza. I like bagels."

    # Variants of the reference text, each altered in one specific way.
    source_cct_word_space = "I like p i z z a . I like bagles."
    # Put a space between every character, then collapse whitespace runs to one space.
    source_cct_spaces = re.sub(r"\s+", " ", " ".join(source_cct))
    source_cct_no_space = source_cct.replace(" ", "")
    source_cct_one_sentence = "I like pizza."
    source_cct_missing_word = "I like pizza. I like ."
    source_cct_addn_char = "I like pizza. I like beagles."
    source_cct_dup_word = "I like pizza pizza. I like bagels."

    # (variant, expected rounded score), checked in this order.
    cases = (
        (source_cct, 1.0),
        (source_cct_word_space, 0.75),
        (source_cct_spaces, 0.39),
        (source_cct_no_space, 0.64),
        (source_cct_one_sentence, 0.0),
        (source_cct_missing_word, 0.57),
        (source_cct_addn_char, 0.89),
        (source_cct_dup_word, 0.79),
    )
    for variant, expected_score in cases:
        score = text_extraction.calculate_edit_distance(variant, source_cct, return_as="score")
        assert round(score, 2) == expected_score
@pytest.mark.parametrize(
    ("filename", "expected_score", "expected_distance"),
    [
        ("fake-text.txt", 0.78, 38),
    ],
)
def test_calculate_edit_distance_with_filename(filename, expected_score, expected_distance):
    """Compare partitioned output of an example document with a reference text file.

    The reference text is always read from ``example-docs/fake-text.txt``; only the
    partitioned document is selected by ``filename``.
    """
    with open("example-docs/fake-text.txt") as source_file:
        source_cct = source_file.read()

    # Join the string form of every partitioned element, one per line.
    partitioned = partition(filename=f"example-docs/{filename}")
    output_cct = "\n".join(str(element) for element in partitioned)

    score = text_extraction.calculate_edit_distance(output_cct, source_cct, return_as="score")
    distance = text_extraction.calculate_edit_distance(output_cct, source_cct, return_as="distance")

    # Sanity bounds first, then the pinned values for this document.
    assert 0 <= score <= 1.0
    assert distance >= 0
    assert round(score, 2) == expected_score
    assert distance == expected_distance
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        # Repeated words; the expected keys are all lowercase.
        pytest.param(
            "The dog loved the cat, but the cat loved the cow",
            {"the": 4, "cat": 2, "loved": 2, "dog": 1, "but": 1, "cow": 1},
        ),
        # The spaced-out letters "H a r p e r" do not appear in the expected bag.
        pytest.param(
            "Hello my name is H a r p e r, what's your name?",
            {"hello": 1, "my": 1, "name": 2, "is": 1, "what's": 1, "your": 1},
        ),
        # Isolated one-letter words ("I", "a") are expected to be counted.
        pytest.param(
            "I have a dog and a cat, I love my dog.",
            {"i": 2, "have": 1, "a": 2, "dog": 2, "and": 1, "cat": 1, "love": 1, "my": 1},
        ),
        # Apostrophes stay attached to their word in the expected keys.
        pytest.param(
            "My dog's hair is red, but the dogs' houses are blue.",
            {
                "my": 1,
                "dog's": 1,
                "hair": 1,
                "is": 1,
                "red": 1,
                "but": 1,
                "the": 1,
                "dogs'": 1,
                "houses": 1,
                "are": 1,
                "blue": 1,
            },
        ),
        # A free-standing dash is absent from the expected bag; "easy-peasy" is one key.
        pytest.param(
            """Sometimes sentences have a dash - like this one!
A hyphen connects 2 words with no gap: easy-peasy.""",
            {
                "sometimes": 1,
                "sentences": 1,
                "have": 1,
                "a": 2,
                "dash": 1,
                "like": 1,
                "this": 1,
                "one": 1,
                "hyphen": 1,
                "connects": 1,
                "2": 1,
                "words": 1,
                "with": 1,
                "no": 1,
                "gap": 1,
                "easy-peasy": 1,
            },
        ),
    ],
)
def test_bag_of_words(text, expected):
    """Check that ``bag_of_words`` returns the expected word-to-count mapping for ``text``."""
    word_counts = text_extraction.bag_of_words(text)
    assert word_counts == expected
@pytest.mark.parametrize(
    ("output_text", "source_text", "expected_percentage"),
    [
        # Empty source with non-empty output.
        ("extra", "", 0),
        # Empty output with non-empty source.
        ("", "Source text has a sentence.", 1),
        # One word is spaced out letter by letter in the output.
        (
            "The original s e n t e n c e is normal.",
            "The original sentence is normal...",
            0.2,
        ),
        # The word "sales" is absent from the output.
        (
            "We saw 23% improvement in this quarter.",
            "We saw 23% improvement in sales this quarter.",
            0.12,
        ),
        # The output shares no words with the source.
        ("no", "Is it possible to have more than everything missing?", 1),
    ],
)
def test_calculate_percent_missing_text(output_text, source_text, expected_percentage):
    """Check ``calculate_percent_missing_text(output_text, source_text)`` against a pinned value."""
    percent_missing = text_extraction.calculate_percent_missing_text(output_text, source_text)
    assert percent_missing == expected_percentage