diff --git a/CHANGELOG.md b/CHANGELOG.md index 4d2333df0..e1b3574d4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,7 @@ * ### Features +* **Adds `bag_of_words` function** Counts the frequency of each word in a text in order to evaluate extraction accuracy. * **Adds `edit_distance` calculation metrics** In order to benchmark the cleaned, extracted text with unstructured, `edit_distance` (`Levenshtein distance`) is included. * **Adds detection_origin field to metadata** Problem: Currently there isn't an easy way to find out how an element was created. With this change that information is added. Importance: With this information the developers and users are now able to know how an element was created to make decisions on how to use it. In order to use this feature, setting UNSTRUCTURED_INCLUDE_DEBUG_METADATA=true is needed. diff --git a/test_unstructured/metrics/test_text_extraction.py b/test_unstructured/metrics/test_text_extraction.py index 73bce5bd6..5be975342 100644 --- a/test_unstructured/metrics/test_text_extraction.py +++ b/test_unstructured/metrics/test_text_extraction.py @@ -2,7 +2,7 @@ import re import pytest -from unstructured.metrics.text_extraction import calculate_edit_distance +from unstructured.metrics import text_extraction from unstructured.partition.auto import partition @@ -16,32 +16,85 @@ def test_calculate_edit_distance(): source_cct_addn_char = "I like pizza. I like beagles." source_cct_dup_word = "I like pizza pizza. I like bagels." 
- assert round(calculate_edit_distance(source_cct, source_cct, return_as="score"), 2) == 1.0 assert ( - round(calculate_edit_distance(source_cct_word_space, source_cct, return_as="score"), 2) + round(text_extraction.calculate_edit_distance(source_cct, source_cct, return_as="score"), 2) + == 1.0 + ) + assert ( + round( + text_extraction.calculate_edit_distance( + source_cct_word_space, + source_cct, + return_as="score", + ), + 2, + ) == 0.75 ) assert ( - round(calculate_edit_distance(source_cct_spaces, source_cct, return_as="score"), 2) == 0.39 + round( + text_extraction.calculate_edit_distance( + source_cct_spaces, + source_cct, + return_as="score", + ), + 2, + ) + == 0.39 ) assert ( - round(calculate_edit_distance(source_cct_no_space, source_cct, return_as="score"), 2) + round( + text_extraction.calculate_edit_distance( + source_cct_no_space, + source_cct, + return_as="score", + ), + 2, + ) == 0.64 ) assert ( - round(calculate_edit_distance(source_cct_one_sentence, source_cct, return_as="score"), 2) + round( + text_extraction.calculate_edit_distance( + source_cct_one_sentence, + source_cct, + return_as="score", + ), + 2, + ) == 0.0 ) assert ( - round(calculate_edit_distance(source_cct_missing_word, source_cct, return_as="score"), 2) + round( + text_extraction.calculate_edit_distance( + source_cct_missing_word, + source_cct, + return_as="score", + ), + 2, + ) == 0.57 ) assert ( - round(calculate_edit_distance(source_cct_addn_char, source_cct, return_as="score"), 2) + round( + text_extraction.calculate_edit_distance( + source_cct_addn_char, + source_cct, + return_as="score", + ), + 2, + ) == 0.89 ) assert ( - round(calculate_edit_distance(source_cct_dup_word, source_cct, return_as="score"), 2) + round( + text_extraction.calculate_edit_distance( + source_cct_dup_word, + source_cct, + return_as="score", + ), + 2, + ) == 0.79 ) @@ -59,11 +112,70 @@ def test_calculate_edit_distance_with_filename(filename, expected_score, expecte elements = 
partition(filename=f"example-docs/{filename}") output_cct = "\n".join([str(el) for el in elements]) - score = calculate_edit_distance(output_cct, source_cct, return_as="score") - distance = calculate_edit_distance(output_cct, source_cct, return_as="distance") + score = text_extraction.calculate_edit_distance(output_cct, source_cct, return_as="score") + distance = text_extraction.calculate_edit_distance(output_cct, source_cct, return_as="distance") assert score >= 0 assert score <= 1.0 assert distance >= 0 assert round(score, 2) == expected_score assert distance == expected_distance + + +@pytest.mark.parametrize( + ("text", "expected"), + [ + ( + "The dog loved the cat, but the cat loved the cow", + {"the": 4, "cat": 2, "loved": 2, "dog": 1, "but": 1, "cow": 1}, + ), + ( + "Hello my name is H a r p e r, what's your name?", + {"hello": 1, "my": 1, "name": 2, "is": 1, "what's": 1, "your": 1}, + ), + ( + "I have a dog and a cat, I love my dog.", + {"i": 2, "have": 1, "a": 2, "dog": 2, "and": 1, "cat": 1, "love": 1, "my": 1}, + ), + ( + "My dog's hair is red, but the dogs' houses are blue.", + { + "my": 1, + "dog's": 1, + "hair": 1, + "is": 1, + "red": 1, + "but": 1, + "the": 1, + "dogs'": 1, + "houses": 1, + "are": 1, + "blue": 1, + }, + ), + ( + """Sometimes sentences have a dash - like this one! 
def remove_sentence_punctuation(s: str, exclude_punctuation: Optional[list] = None) -> str:
    """Translate `s` through the punctuation table, sparing selected characters.

    The translation table is the module-level `tbl`, defined elsewhere in this
    module (presumably the table `remove_punctuation` uses to strip punctuation
    -- confirm against its definition).

    Args:
        s: The text to process.
        exclude_punctuation: Single-character strings that must be left in the
            text, e.g. ``["-", "'"]``. ``None`` or an empty list means the
            table is applied unchanged.

    Returns:
        The translated string.
    """
    if not exclude_punctuation:
        # Nothing to spare: use the shared table directly instead of copying it.
        return s.translate(tbl)

    # Work on a copy so the shared module-level table is never mutated.
    tbl_new = tbl.copy()
    for punct in exclude_punctuation:
        # pop() with a default instead of `del`: a character that is listed
        # twice, or that has no entry in the table, must not raise KeyError.
        tbl_new.pop(ord(punct), None)
    return s.translate(tbl_new)
def bag_of_words(text: str) -> Dict[str, int]:
    """Count how often each word occurs in `text`.

    The text is lower-cased, passed through `remove_sentence_punctuation`
    (keeping "-" and "'" so tokens such as "easy-peasy" and "dog's" stay
    intact), then through `clean_bullets`, and finally split on whitespace.

    Single-character tokens are handled specially:

    * A run of two or more consecutive single-character tokens (for example
      "h a r p e r") is dropped entirely -- presumably it is a word that was
      broken apart during extraction.
    * An isolated single-character token is counted only when it is
      alphanumeric, so a stand-alone "-" is ignored while "a" or "2" counts.

    Args:
        text: The text to tokenize and count.

    Returns:
        A dict mapping each counted word to its number of occurrences, in
        order of first appearance.
    """
    words = clean_bullets(remove_sentence_punctuation(text.lower(), ["-", "'"])).split()
    bow: Dict[str, int] = {}

    i = 0
    while i < len(words):
        word = words[i]
        if len(word) > 1:
            bow[word] = bow.get(word, 0) + 1
            i += 1
            continue

        # Locate the end of the run of single-character tokens starting at i.
        # Starting the scan at i + 1 guarantees the outer loop always advances.
        run_end = i + 1
        while run_end < len(words) and len(words[run_end]) == 1:
            run_end += 1

        # Only a lone, alphanumeric character is a real word; longer runs and
        # stray punctuation are skipped.
        if run_end - i == 1 and word.isalnum():
            bow[word] = bow.get(word, 0) + 1
        i = run_end
    return bow