Feat: Bag of words for testing metric (#1650)

This PR adds the `bag_of_words` function to count the frequency of words
for evaluation.

**Testing**
```Python
from unstructured.metrics.text_extraction import bag_of_words
string = "The dog loved the cat, but the cat loved the cow."

print(bag_of_words(string))
```

---------

Co-authored-by: Mallori Harrell <mallori@Malloris-MacBook-Pro.local>
Co-authored-by: Klaijan <klaijan@unstructured.io>
Co-authored-by: Shreya Nidadavolu <shreyanid9@gmail.com>
Co-authored-by: shreyanid <42684285+shreyanid@users.noreply.github.com>
This commit is contained in:
Mallori Harrell 2023-10-10 13:46:01 -05:00 committed by GitHub
parent b38a6b3022
commit a5d7ae4611
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 167 additions and 13 deletions

View File

@ -14,6 +14,7 @@
*
### Features
* **Adds `bag_of_words` function** In order to count the word frequency to evaluate extraction accuracy.
* **Adds `edit_distance` calculation metrics** In order to benchmark the cleaned, extracted text with unstructured, `edit_distance` (`Levenshtein distance`) is included.
* **Adds detection_origin field to metadata** Problem: Currently there isn't an easy way to find out how an element was created. With this change that information is added. Importance: With this information the developers and users are now able to know how an element was created to make decisions on how to use it. In order to use this feature,
setting UNSTRUCTURED_INCLUDE_DEBUG_METADATA=true is needed.

View File

@ -2,7 +2,7 @@ import re
import pytest
from unstructured.metrics.text_extraction import calculate_edit_distance
from unstructured.metrics import text_extraction
from unstructured.partition.auto import partition
@ -16,32 +16,85 @@ def test_calculate_edit_distance():
source_cct_addn_char = "I like pizza. I like beagles."
source_cct_dup_word = "I like pizza pizza. I like bagels."
assert round(calculate_edit_distance(source_cct, source_cct, return_as="score"), 2) == 1.0
assert (
round(calculate_edit_distance(source_cct_word_space, source_cct, return_as="score"), 2)
round(text_extraction.calculate_edit_distance(source_cct, source_cct, return_as="score"), 2)
== 1.0
)
assert (
round(
text_extraction.calculate_edit_distance(
source_cct_word_space,
source_cct,
return_as="score",
),
2,
)
== 0.75
)
assert (
round(calculate_edit_distance(source_cct_spaces, source_cct, return_as="score"), 2) == 0.39
round(
text_extraction.calculate_edit_distance(
source_cct_spaces,
source_cct,
return_as="score",
),
2,
)
== 0.39
)
assert (
round(calculate_edit_distance(source_cct_no_space, source_cct, return_as="score"), 2)
round(
text_extraction.calculate_edit_distance(
source_cct_no_space,
source_cct,
return_as="score",
),
2,
)
== 0.64
)
assert (
round(calculate_edit_distance(source_cct_one_sentence, source_cct, return_as="score"), 2)
round(
text_extraction.calculate_edit_distance(
source_cct_one_sentence,
source_cct,
return_as="score",
),
2,
)
== 0.0
)
assert (
round(calculate_edit_distance(source_cct_missing_word, source_cct, return_as="score"), 2)
round(
text_extraction.calculate_edit_distance(
source_cct_missing_word,
source_cct,
return_as="score",
),
2,
)
== 0.57
)
assert (
round(calculate_edit_distance(source_cct_addn_char, source_cct, return_as="score"), 2)
round(
text_extraction.calculate_edit_distance(
source_cct_addn_char,
source_cct,
return_as="score",
),
2,
)
== 0.89
)
assert (
round(calculate_edit_distance(source_cct_dup_word, source_cct, return_as="score"), 2)
round(
text_extraction.calculate_edit_distance(
source_cct_dup_word,
source_cct,
return_as="score",
),
2,
)
== 0.79
)
@ -59,11 +112,70 @@ def test_calculate_edit_distance_with_filename(filename, expected_score, expecte
elements = partition(filename=f"example-docs/{filename}")
output_cct = "\n".join([str(el) for el in elements])
score = calculate_edit_distance(output_cct, source_cct, return_as="score")
distance = calculate_edit_distance(output_cct, source_cct, return_as="distance")
score = text_extraction.calculate_edit_distance(output_cct, source_cct, return_as="score")
distance = text_extraction.calculate_edit_distance(output_cct, source_cct, return_as="distance")
assert score >= 0
assert score <= 1.0
assert distance >= 0
assert round(score, 2) == expected_score
assert distance == expected_distance
# Each case pairs an input text with the word counts bag_of_words is expected to return.
_BAG_OF_WORDS_CASES = [
    # Repeated words share one lower-cased entry ("The" and "the" both count towards "the").
    (
        "The dog loved the cat, but the cat loved the cow",
        {"the": 4, "cat": 2, "loved": 2, "dog": 1, "but": 1, "cow": 1},
    ),
    # The spaced-out letters "H a r p e r" contribute no entries at all.
    (
        "Hello my name is H a r p e r, what's your name?",
        {"hello": 1, "my": 1, "name": 2, "is": 1, "what's": 1, "your": 1},
    ),
    # Stand-alone one-letter words ("I", "a") are still counted.
    (
        "I have a dog and a cat, I love my dog.",
        {"i": 2, "have": 1, "a": 2, "dog": 2, "and": 1, "cat": 1, "love": 1, "my": 1},
    ),
    # Apostrophes are kept, so the possessives stay distinct entries.
    (
        "My dog's hair is red, but the dogs' houses are blue.",
        {
            "my": 1, "dog's": 1, "hair": 1, "is": 1, "red": 1, "but": 1,
            "the": 1, "dogs'": 1, "houses": 1, "are": 1, "blue": 1,
        },
    ),
    # A free-standing dash is dropped, while a hyphenated word and a lone digit are kept.
    (
        """Sometimes sentences have a dash - like this one!
A hyphen connects 2 words with no gap: easy-peasy.""",
        {
            "sometimes": 1, "sentences": 1, "have": 1, "a": 2,
            "dash": 1, "like": 1, "this": 1, "one": 1,
            "hyphen": 1, "connects": 1, "2": 1, "words": 1,
            "with": 1, "no": 1, "gap": 1, "easy-peasy": 1,
        },
    ),
]


@pytest.mark.parametrize(("text", "expected"), _BAG_OF_WORDS_CASES)
def test_bag_of_words(text, expected):
    """Check that bag_of_words returns exactly the expected word-count mapping for each case."""
    actual = text_extraction.bag_of_words(text)
    assert actual == expected

View File

@ -2,7 +2,7 @@ import quopri
import re
import sys
import unicodedata
from typing import Tuple
from typing import Optional, Tuple
import numpy as np
@ -302,6 +302,15 @@ def remove_punctuation(s: str) -> str:
return s
def remove_sentence_punctuation(s: str, exclude_punctuation: Optional[list] = None) -> str:
    """Translate ``s`` through the punctuation table, keeping the excluded characters.

    Works on a copy of the module-level translation table ``tbl`` so the shared table is never
    modified. (``tbl`` is defined outside this function; presumably it maps punctuation code
    points to ``None`` -- confirm against its definition.)

    Args:
        s: The text to clean.
        exclude_punctuation: Single-character strings that must survive the cleaning, e.g.
            ``["-", "'"]``. ``None`` or an empty list applies the full table.

    Returns:
        ``s`` translated through the table, with the excluded characters left in place.
    """
    if not exclude_punctuation:
        # Nothing to keep: translate with the shared table directly and skip the copy.
        return s.translate(tbl)

    tbl_new = tbl.copy()
    for punct in exclude_punctuation:
        # pop() with a default instead of ``del``: an excluded character that is not in the
        # table (or is listed twice) is already "kept", so it must not raise KeyError.
        tbl_new.pop(ord(punct), None)
    return s.translate(tbl_new)
def clean_extra_whitespace(text: str) -> str:
"""Cleans extra whitespace characters that appear between words.

View File

@ -1,7 +1,9 @@
from typing import Tuple
from typing import Dict, Tuple
from rapidfuzz.distance import Levenshtein
from unstructured.cleaners.core import clean_bullets, remove_sentence_punctuation
def calculate_edit_distance(
output: str,
@ -50,3 +52,33 @@ def calculate_edit_distance(
elif return_as == "distance":
return distance
return 0.0
def bag_of_words(text: str) -> Dict[str, int]:
    """Count how often each word occurs in ``text``.

    The text is lower-cased, passed through ``remove_sentence_punctuation`` with ``-`` and ``'``
    excluded from removal, cleaned with ``clean_bullets`` and split on whitespace.

    Tokens longer than one character are counted as-is. A run of two or more consecutive
    one-character tokens (e.g. ``"h a r p e r"``) is treated as a word that was broken apart and
    is discarded. A one-character token standing alone is counted only if it is alphanumeric, so
    an isolated dash is ignored.

    Args:
        text: The text to tally.

    Returns:
        A mapping from each counted word to its number of occurrences, in first-seen order.
    """
    counts: Dict[str, int] = {}

    def _tally(word):
        counts[word] = counts.get(word, 0) + 1

    cleaned = remove_sentence_punctuation(text.lower(), ["-", "'"])
    tokens = clean_bullets(cleaned).split()

    # Concatenation of the current run of one-character tokens; its length is the run length.
    pending = ""
    for token in tokens:
        if len(token) == 1:
            pending += token
            continue

        # A longer token closes the run: keep it only if it was a lone alphanumeric character.
        if len(pending) == 1 and pending.isalnum():
            _tally(pending)
        pending = ""
        _tally(token)

    # The text may end in the middle of a run; apply the same rule to what is left.
    if len(pending) == 1 and pending.isalnum():
        _tally(pending)

    return counts