From de685fbc185c7988a99c28fa26cc6df63c2abefa Mon Sep 17 00:00:00 2001
From: Klaijan <klaijan@unstructured.io>
Date: Fri, 20 Oct 2023 19:04:13 -0400
Subject: [PATCH] feat: add accuracy as wrapper for edit distance score (#1828)

Add `calculate_accuracy` function that is a wrapper of
`calculate_edit_distance` that returns the "score".

---------

Co-authored-by: shreyanid <42684285+shreyanid@users.noreply.github.com>
---
 CHANGELOG.md                            |  1 +
 unstructured/metrics/text_extraction.py | 46 +++++++++++++++++++------
 2 files changed, 36 insertions(+), 11 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index b6a5e0df4..358498843 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,6 +6,7 @@
 
 ### Features
 
+* **Adds accuracy function** The accuracy scoring was originally an option under `calculate_edit_distance`. For easy function call, it is now a wrapper around the original function that calls edit_distance and return as "score".
 * **Adds HuggingFaceEmbeddingEncoder** The HuggingFace Embedding Encoder uses a local embedding model as opposed to using an API.
 * **Add AWS bedrock embedding connector** `unstructured.embed.bedrock` now provides a connector to use AWS bedrock's `titan-embed-text` model to generate embeddings for elements. This features requires valid AWS bedrock setup and an internet connectionto run.
 
diff --git a/unstructured/metrics/text_extraction.py b/unstructured/metrics/text_extraction.py
index 62f414107..f8e8df6db 100644
--- a/unstructured/metrics/text_extraction.py
+++ b/unstructured/metrics/text_extraction.py
@@ -1,15 +1,27 @@
-from typing import Dict, Tuple
+from typing import Dict, Optional, Tuple
 
 from rapidfuzz.distance import Levenshtein
 
 from unstructured.cleaners.core import clean_bullets, remove_sentence_punctuation
 
 
-def calculate_edit_distance(
-    output: str,
-    source: str,
+def calculate_accuracy(
+    output: Optional[str],
+    source: Optional[str],
     weights: Tuple[int, int, int] = (2, 1, 1),
-    return_as: str = "score",
+) -> float:
+    """
+    Calculates accuracy by calling calculate_edit_distance function using `return_as=score`.
+    The function will return complement of the edit distance instead.
+    """
+    return calculate_edit_distance(output, source, weights, return_as="score")
+
+
+def calculate_edit_distance(
+    output: Optional[str],
+    source: Optional[str],
+    weights: Tuple[int, int, int] = (2, 1, 1),
+    return_as: str = "distance",
 ) -> float:
     """
     Calculates edit distance using Levenshtein distance between two strings.
@@ -22,7 +34,7 @@ def calculate_edit_distance(
             distance calculation. Default is (2, 1, 1).
         return_as (str, optional): The type of result to return, one of
             ["score",, "distance"].
-            Default is "score".
+            Default is "distance".
 
     Returns:
         float: The calculated edit distance or similarity score between
@@ -44,9 +56,13 @@ def calculate_edit_distance(
     return_types = ["score", "distance"]
     if return_as not in return_types:
         raise ValueError("Invalid return value type. Expected one of: %s" % return_types)
-    distance = Levenshtein.distance(output, source, weights=weights)
-    char_len = len(source)
-    bounded_percentage_distance = min(max(distance / char_len, 0.0), 1.0)
+    output = _prepare_str(output)
+    source = _prepare_str(source)
+    distance = Levenshtein.distance(output, source, weights=weights)  # type: ignore
+    # lower bounded the char length for source string at 1.0 because to avoid division by zero
+    # in the case where source string is empty, the distance should be at 100%
+    source_char_len = max(len(source), 1.0)  # type: ignore
+    bounded_percentage_distance = min(max(distance / source_char_len, 0.0), 1.0)
     if return_as == "score":
         return 1 - bounded_percentage_distance
     elif return_as == "distance":
@@ -92,8 +108,8 @@ def bag_of_words(text: str) -> Dict[str, int]:
 
 
 def calculate_percent_missing_text(
-    output: str,
-    source: str,
+    output: Optional[str],
+    source: Optional[str],
 ) -> float:
     """
     Creates the bag of words (BOW) found in each input text and their frequencies, then compares the
@@ -111,6 +127,8 @@ def calculate_percent_missing_text(
 
     Returns the percentage of missing text represented as a decimal between 0 and 1.
     """
+    output = _prepare_str(output)
+    source = _prepare_str(source)
     output_bow = bag_of_words(output)
     source_bow = bag_of_words(source)
 
@@ -133,3 +151,9 @@ def calculate_percent_missing_text(
 
     fraction_missing = round(total_missing_word_count / total_source_word_count, 2)
     return min(fraction_missing, 1)  # limit to 100%
+
+
+def _prepare_str(string: Optional[str]) -> str:
+    if not string:
+        return ""
+    return str(string)  # type: ignore