feat: add accuracy as wrapper for edit distance score (#1828)

Add `calculate_accuracy` function that is a wrapper of
`calculate_edit_distance` that returns the "score".

---------

Co-authored-by: shreyanid <42684285+shreyanid@users.noreply.github.com>
This commit is contained in:
Klaijan 2023-10-20 19:04:13 -04:00 committed by GitHub
parent aa7b7c87d6
commit de685fbc18
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 36 additions and 11 deletions

View File

@ -6,6 +6,7 @@
### Features
* **Adds accuracy function** The accuracy scoring was originally an option under `calculate_edit_distance`. For ease of use, it is now a separate wrapper function that calls `calculate_edit_distance` and returns the result as "score".
* **Adds HuggingFaceEmbeddingEncoder** The HuggingFace Embedding Encoder uses a local embedding model as opposed to using an API.
* **Add AWS bedrock embedding connector** `unstructured.embed.bedrock` now provides a connector to use AWS bedrock's `titan-embed-text` model to generate embeddings for elements. This feature requires a valid AWS bedrock setup and an internet connection to run.

View File

@ -1,15 +1,27 @@
from typing import Dict, Tuple
from typing import Dict, Optional, Tuple
from rapidfuzz.distance import Levenshtein
from unstructured.cleaners.core import clean_bullets, remove_sentence_punctuation
def calculate_edit_distance(
output: str,
source: str,
def calculate_accuracy(
output: Optional[str],
source: Optional[str],
weights: Tuple[int, int, int] = (2, 1, 1),
return_as: str = "score",
) -> float:
"""
Calculates accuracy by calling calculate_edit_distance function using `return_as=score`.
The function will return complement of the edit distance instead.
"""
return calculate_edit_distance(output, source, weights, return_as="score")
def calculate_edit_distance(
output: Optional[str],
source: Optional[str],
weights: Tuple[int, int, int] = (2, 1, 1),
return_as: str = "distance",
) -> float:
"""
Calculates edit distance using Levenshtein distance between two strings.
@ -22,7 +34,7 @@ def calculate_edit_distance(
distance calculation. Default is (2, 1, 1).
return_as (str, optional): The type of result to return, one of
["score", "distance"].
Default is "score".
Default is "distance".
Returns:
float: The calculated edit distance or similarity score between
@ -44,9 +56,13 @@ def calculate_edit_distance(
return_types = ["score", "distance"]
if return_as not in return_types:
raise ValueError("Invalid return value type. Expected one of: %s" % return_types)
distance = Levenshtein.distance(output, source, weights=weights)
char_len = len(source)
bounded_percentage_distance = min(max(distance / char_len, 0.0), 1.0)
output = _prepare_str(output)
source = _prepare_str(source)
distance = Levenshtein.distance(output, source, weights=weights) # type: ignore
# lower-bound the char length of the source string at 1.0 to avoid division by zero;
# in the case where the source string is empty, the distance should be at 100%
source_char_len = max(len(source), 1.0) # type: ignore
bounded_percentage_distance = min(max(distance / source_char_len, 0.0), 1.0)
if return_as == "score":
return 1 - bounded_percentage_distance
elif return_as == "distance":
@ -92,8 +108,8 @@ def bag_of_words(text: str) -> Dict[str, int]:
def calculate_percent_missing_text(
output: str,
source: str,
output: Optional[str],
source: Optional[str],
) -> float:
"""
Creates the bag of words (BOW) found in each input text and their frequencies, then compares the
@ -111,6 +127,8 @@ def calculate_percent_missing_text(
Returns the percentage of missing text represented as a decimal between 0 and 1.
"""
output = _prepare_str(output)
source = _prepare_str(source)
output_bow = bag_of_words(output)
source_bow = bag_of_words(source)
@ -133,3 +151,9 @@ def calculate_percent_missing_text(
fraction_missing = round(total_missing_word_count / total_source_word_count, 2)
return min(fraction_missing, 1) # limit to 100%
def _prepare_str(string: Optional[str]) -> str:
if not string:
return ""
return str(string) # type: ignore