Feat: Bag of words for testing metric (#1650)

This PR adds the `bag_of_words` function to count the frequency of words for evaluation. **Testing** ```Python from unstructured.metrics.text_extraction import bag_of_words string = "The dog loved the cat, but the cat loved the cow." print(bag_of_words(string)) ``` --------- Co-authored-by: Mallori Harrell <mallori@Malloris-MacBook-Pro.local> Co-authored-by: Klaijan <klaijan@unstructured.io> Co-authored-by: Shreya Nidadavolu <shreyanid9@gmail.com> Co-authored-by: shreyanid <42684285+shreyanid@users.noreply.github.com>
2025-12-15 01:02:50 +00:00 · 2023-10-10 13:46:01 -05:00 · 2023-10-10 13:46:01 -05:00 · a5d7ae4611
commit a5d7ae4611
parent b38a6b3022
4 changed files with 167 additions and 13 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -14,6 +14,7 @@
 *
 ### Features

+* **Adds `bag_of_words` function** In order to count the word frequency to evaluate extraction accuracy.
 * **Adds `edit_distance` calculation metrics** In order to benchmark the cleaned, extracted text with unstructured, `edit_distance` (`Levenshtein distance`) is included.
 * **Adds detection_origin field to metadata** Problem: Currently there isn't an easy way to find out how an element was created. With this change that information is added. Importance: With this information the developers and users are now able to know how an element was created to make decisions on how to use it. In order to use this feature
 setting UNSTRUCTURED_INCLUDE_DEBUG_METADATA=true is needed.
--- a/test_unstructured/metrics/test_text_extraction.py
+++ b/test_unstructured/metrics/test_text_extraction.py
@ -2,7 +2,7 @@ import re

 import pytest

-from unstructured.metrics.text_extraction import calculate_edit_distance
+from unstructured.metrics import text_extraction
 from unstructured.partition.auto import partition


@ -16,32 +16,85 @@ def test_calculate_edit_distance():
    source_cct_addn_char = "I like pizza. I like beagles."
    source_cct_dup_word = "I like pizza pizza. I like bagels."

-    assert round(calculate_edit_distance(source_cct, source_cct, return_as="score"), 2) == 1.0
    assert (
-        round(calculate_edit_distance(source_cct_word_space, source_cct, return_as="score"), 2)
+        round(text_extraction.calculate_edit_distance(source_cct, source_cct, return_as="score"), 2)
+        == 1.0
+    )
+    assert (
+        round(
+            text_extraction.calculate_edit_distance(
+                source_cct_word_space,
+                source_cct,
+                return_as="score",
+            ),
+            2,
+        )
        == 0.75
    )
    assert (
-        round(calculate_edit_distance(source_cct_spaces, source_cct, return_as="score"), 2) == 0.39
+        round(
+            text_extraction.calculate_edit_distance(
+                source_cct_spaces,
+                source_cct,
+                return_as="score",
+            ),
+            2,
+        )
+        == 0.39
    )
    assert (
-        round(calculate_edit_distance(source_cct_no_space, source_cct, return_as="score"), 2)
+        round(
+            text_extraction.calculate_edit_distance(
+                source_cct_no_space,
+                source_cct,
+                return_as="score",
+            ),
+            2,
+        )
        == 0.64
    )
    assert (
-        round(calculate_edit_distance(source_cct_one_sentence, source_cct, return_as="score"), 2)
+        round(
+            text_extraction.calculate_edit_distance(
+                source_cct_one_sentence,
+                source_cct,
+                return_as="score",
+            ),
+            2,
+        )
        == 0.0
    )
    assert (
-        round(calculate_edit_distance(source_cct_missing_word, source_cct, return_as="score"), 2)
+        round(
+            text_extraction.calculate_edit_distance(
+                source_cct_missing_word,
+                source_cct,
+                return_as="score",
+            ),
+            2,
+        )
        == 0.57
    )
    assert (
-        round(calculate_edit_distance(source_cct_addn_char, source_cct, return_as="score"), 2)
+        round(
+            text_extraction.calculate_edit_distance(
+                source_cct_addn_char,
+                source_cct,
+                return_as="score",
+            ),
+            2,
+        )
        == 0.89
    )
    assert (
-        round(calculate_edit_distance(source_cct_dup_word, source_cct, return_as="score"), 2)
+        round(
+            text_extraction.calculate_edit_distance(
+                source_cct_dup_word,
+                source_cct,
+                return_as="score",
+            ),
+            2,
+        )
        == 0.79
    )

@ -59,11 +112,70 @@ def test_calculate_edit_distance_with_filename(filename, expected_score, expecte
    elements = partition(filename=f"example-docs/{filename}")
    output_cct = "\n".join([str(el) for el in elements])

-    score = calculate_edit_distance(output_cct, source_cct, return_as="score")
-    distance = calculate_edit_distance(output_cct, source_cct, return_as="distance")
+    score = text_extraction.calculate_edit_distance(output_cct, source_cct, return_as="score")
+    distance = text_extraction.calculate_edit_distance(output_cct, source_cct, return_as="distance")

    assert score >= 0
    assert score <= 1.0
    assert distance >= 0
    assert round(score, 2) == expected_score
    assert distance == expected_distance
+
+
+@pytest.mark.parametrize(
+    ("text", "expected"),
+    [
+        # Mixed case: "The" and "the" are expected to be counted under the same key.
+        (
+            "The dog loved the cat, but the cat loved the cow",
+            {"the": 4, "cat": 2, "loved": 2, "dog": 1, "but": 1, "cow": 1},
+        ),
+        # The spaced-out letters "H a r p e r" are expected to be absent from the counts.
+        (
+            "Hello my name is H a r p e r, what's your name?",
+            {"hello": 1, "my": 1, "name": 2, "is": 1, "what's": 1, "your": 1},
+        ),
+        # Isolated single-character words ("i", "a") are expected to be counted.
+        (
+            "I have a dog and a cat, I love my dog.",
+            {"i": 2, "have": 1, "a": 2, "dog": 2, "and": 1, "cat": 1, "love": 1, "my": 1},
+        ),
+        # Apostrophes are expected to stay attached to their words.
+        (
+            "My dog's hair is red, but the dogs' houses are blue.",
+            {
+                "my": 1,
+                "dog's": 1,
+                "hair": 1,
+                "is": 1,
+                "red": 1,
+                "but": 1,
+                "the": 1,
+                "dogs'": 1,
+                "houses": 1,
+                "are": 1,
+                "blue": 1,
+            },
+        ),
+        # A free-standing dash is expected to be dropped; a hyphenated word and the digit "2"
+        # are expected to be kept.
+        (
+            """Sometimes sentences have a dash - like this one!
+            A hyphen connects 2 words with no gap: easy-peasy.""",
+            {
+                "sometimes": 1,
+                "sentences": 1,
+                "have": 1,
+                "a": 2,
+                "dash": 1,
+                "like": 1,
+                "this": 1,
+                "one": 1,
+                "hyphen": 1,
+                "connects": 1,
+                "2": 1,
+                "words": 1,
+                "with": 1,
+                "no": 1,
+                "gap": 1,
+                "easy-peasy": 1,
+            },
+        ),
+    ],
+)
+def test_bag_of_words(text, expected):
+    """Assert that ``bag_of_words(text)`` equals the hand-written ``expected`` counts."""
+    assert text_extraction.bag_of_words(text) == expected
--- a/unstructured/cleaners/core.py
+++ b/unstructured/cleaners/core.py
@ -2,7 +2,7 @@ import quopri
 import re
 import sys
 import unicodedata
-from typing import Tuple
+from typing import Optional, Tuple

 import numpy as np

@ -302,6 +302,15 @@ def remove_punctuation(s: str) -> str:
    return s


+def remove_sentence_punctuation(s: str, exclude_punctuation: Optional[list] = None) -> str:
+    """Translate ``s`` through the punctuation table, sparing ``exclude_punctuation``.
+
+    A copy of the module-level table ``tbl`` is used so the shared table is never
+    mutated. NOTE(review): ``tbl`` is defined outside this hunk; presumably it maps
+    punctuation code points to ``None`` so ``str.translate`` deletes them - confirm.
+
+    Args:
+        s: The text to clean.
+        exclude_punctuation: Single-character strings to keep in the text. ``None``
+            or an empty list applies the full table.
+
+    Returns:
+        ``s`` after translation with the reduced table.
+    """
+    tbl_new = tbl.copy()
+    if exclude_punctuation:
+        for punct in exclude_punctuation:
+            # pop() with a default: a character that is not a key of the table is
+            # simply ignored instead of raising KeyError as `del` would.
+            tbl_new.pop(ord(punct), None)
+    return s.translate(tbl_new)
+
+
 def clean_extra_whitespace(text: str) -> str:
    """Cleans extra whitespace characters that appear between words.

--- a/unstructured/metrics/text_extraction.py
+++ b/unstructured/metrics/text_extraction.py
@ -1,7 +1,9 @@
-from typing import Tuple
+from typing import Dict, Tuple

 from rapidfuzz.distance import Levenshtein

+from unstructured.cleaners.core import clean_bullets, remove_sentence_punctuation
+

 def calculate_edit_distance(
    output: str,
@ -50,3 +52,33 @@ def calculate_edit_distance(
    elif return_as == "distance":
        return distance
    return 0.0
+
+
+def bag_of_words(text: str) -> Dict[str, int]:
+    """Count how often each word occurs in ``text``.
+
+    The text is lowercased, passed through ``remove_sentence_punctuation`` (keeping
+    ``-`` and ``'``) and ``clean_bullets``, then split on whitespace.
+
+    Tokens longer than one character are always counted. A single-character token
+    is counted only when it is alphanumeric and not adjacent to another
+    single-character token; any longer run of consecutive single-character tokens
+    (e.g. spaced-out letters such as "H a r p e r") is skipped entirely.
+
+    Args:
+        text: The text to tokenize and count.
+
+    Returns:
+        A dict mapping each counted token to its number of occurrences.
+    """
+    tokens = clean_bullets(remove_sentence_punctuation(text.lower(), ["-", "'"])).split()
+    counts: Dict[str, int] = {}
+    total = len(tokens)
+
+    pos = 0
+    while pos < total:
+        token = tokens[pos]
+        if len(token) > 1:
+            counts[token] = counts.get(token, 0) + 1
+            pos += 1
+            continue
+
+        # Find the end of the run of consecutive single-character tokens.
+        run_end = pos
+        while run_end < total and len(tokens[run_end]) == 1:
+            run_end += 1
+
+        # Only a lone alphanumeric character counts as a word; longer runs are dropped.
+        if run_end - pos == 1 and token.isalnum():
+            counts[token] = counts.get(token, 0) + 1
+        pos = run_end
+    return counts