feat: add accuracy as wrapper for edit distance score (#1828)

Add a `calculate_accuracy` function that wraps `calculate_edit_distance` and returns the result as a "score". --------- Co-authored-by: shreyanid <42684285+shreyanid@users.noreply.github.com>
2026-01-08 13:20:31 +00:00 · 2023-10-20 19:04:13 -04:00 · 2023-10-20 19:04:13 -04:00 · de685fbc18
commit de685fbc18
parent aa7b7c87d6
2 changed files with 36 additions and 11 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -6,6 +6,7 @@

 ### Features

+* **Adds accuracy function** Accuracy scoring was originally an option under `calculate_edit_distance`. For easier calling, `calculate_accuracy` now wraps that function, calling it with `return_as="score"`.
 * **Adds HuggingFaceEmbeddingEncoder** The HuggingFace Embedding Encoder uses a local embedding model as opposed to using an API.
 * **Add AWS bedrock embedding connector** `unstructured.embed.bedrock` now provides a connector to use AWS bedrock's `titan-embed-text` model to generate embeddings for elements. This feature requires a valid AWS bedrock setup and an internet connection to run.

--- a/unstructured/metrics/text_extraction.py
+++ b/unstructured/metrics/text_extraction.py
@ -1,15 +1,27 @@
-from typing import Dict, Tuple
+from typing import Dict, Optional, Tuple

 from rapidfuzz.distance import Levenshtein

 from unstructured.cleaners.core import clean_bullets, remove_sentence_punctuation


-def calculate_edit_distance(
-    output: str,
-    source: str,
+def calculate_accuracy(
+    output: Optional[str],
+    source: Optional[str],
    weights: Tuple[int, int, int] = (2, 1, 1),
-    return_as: str = "score",
+) -> float:
+    """
+    Calculates accuracy by calling `calculate_edit_distance` with `return_as="score"`.
+    The result is the complement of the bounded edit distance (1 - distance), not the distance.
+    """
+    return calculate_edit_distance(output, source, weights, return_as="score")
+
+
+def calculate_edit_distance(
+    output: Optional[str],
+    source: Optional[str],
+    weights: Tuple[int, int, int] = (2, 1, 1),
+    return_as: str = "distance",
 ) -> float:
    """
    Calculates edit distance using Levenshtein distance between two strings.
@ -22,7 +34,7 @@ def calculate_edit_distance(
            distance calculation. Default is (2, 1, 1).
        return_as (str, optional): The type of result to return, one of
            ["score", "distance"].
-            Default is "score".
+            Default is "distance".

    Returns:
        float: The calculated edit distance or similarity score between
@ -44,9 +56,13 @@ def calculate_edit_distance(
    return_types = ["score", "distance"]
    if return_as not in return_types:
        raise ValueError("Invalid return value type. Expected one of: %s" % return_types)
-    distance = Levenshtein.distance(output, source, weights=weights)
-    char_len = len(source)
-    bounded_percentage_distance = min(max(distance / char_len, 0.0), 1.0)
+    output = _prepare_str(output)
+    source = _prepare_str(source)
+    distance = Levenshtein.distance(output, source, weights=weights)  # type: ignore
+    # Lower-bound the source string's char length at 1.0 to avoid division by zero when the
+    # source string is empty; in that case a non-empty output should yield a distance of 100%.
+    source_char_len = max(len(source), 1.0)  # type: ignore
+    bounded_percentage_distance = min(max(distance / source_char_len, 0.0), 1.0)
    if return_as == "score":
        return 1 - bounded_percentage_distance
    elif return_as == "distance":
@ -92,8 +108,8 @@ def bag_of_words(text: str) -> Dict[str, int]:


 def calculate_percent_missing_text(
-    output: str,
-    source: str,
+    output: Optional[str],
+    source: Optional[str],
 ) -> float:
    """
    Creates the bag of words (BOW) found in each input text and their frequencies, then compares the
@ -111,6 +127,8 @@ def calculate_percent_missing_text(

    Returns the percentage of missing text represented as a decimal between 0 and 1.
    """
+    output = _prepare_str(output)
+    source = _prepare_str(source)
    output_bow = bag_of_words(output)
    source_bow = bag_of_words(source)

@ -133,3 +151,9 @@ def calculate_percent_missing_text(

    fraction_missing = round(total_missing_word_count / total_source_word_count, 2)
    return min(fraction_missing, 1)  # limit to 100%
+
+
+def _prepare_str(string: Optional[str]) -> str:
+    if not string:
+        return ""
+    return str(string)  # type: ignore