mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2026-01-08 13:20:31 +00:00
feat: add accuracy as wrapper for edit distance score (#1828)
Add `calculate_accuracy` function that is a wrapper of `calculate_edit_distance` that returns the "score". --------- Co-authored-by: shreyanid <42684285+shreyanid@users.noreply.github.com>
This commit is contained in:
parent
aa7b7c87d6
commit
de685fbc18
@ -6,6 +6,7 @@
|
||||
|
||||
### Features
|
||||
|
||||
* **Adds accuracy function** The accuracy scoring was originally an option under `calculate_edit_distance`. For easy function call, it is now a wrapper around the original function that calls edit_distance and return as "score".
|
||||
* **Adds HuggingFaceEmbeddingEncoder** The HuggingFace Embedding Encoder uses a local embedding model as opposed to using an API.
|
||||
* **Add AWS bedrock embedding connector** `unstructured.embed.bedrock` now provides a connector to use AWS bedrock's `titan-embed-text` model to generate embeddings for elements. This features requires valid AWS bedrock setup and an internet connectionto run.
|
||||
|
||||
|
||||
@ -1,15 +1,27 @@
|
||||
from typing import Dict, Tuple
|
||||
from typing import Dict, Optional, Tuple
|
||||
|
||||
from rapidfuzz.distance import Levenshtein
|
||||
|
||||
from unstructured.cleaners.core import clean_bullets, remove_sentence_punctuation
|
||||
|
||||
|
||||
def calculate_edit_distance(
|
||||
output: str,
|
||||
source: str,
|
||||
def calculate_accuracy(
|
||||
output: Optional[str],
|
||||
source: Optional[str],
|
||||
weights: Tuple[int, int, int] = (2, 1, 1),
|
||||
return_as: str = "score",
|
||||
) -> float:
|
||||
"""
|
||||
Calculates accuracy by calling calculate_edit_distance function using `return_as=score`.
|
||||
The function will return complement of the edit distance instead.
|
||||
"""
|
||||
return calculate_edit_distance(output, source, weights, return_as="score")
|
||||
|
||||
|
||||
def calculate_edit_distance(
|
||||
output: Optional[str],
|
||||
source: Optional[str],
|
||||
weights: Tuple[int, int, int] = (2, 1, 1),
|
||||
return_as: str = "distance",
|
||||
) -> float:
|
||||
"""
|
||||
Calculates edit distance using Levenshtein distance between two strings.
|
||||
@ -22,7 +34,7 @@ def calculate_edit_distance(
|
||||
distance calculation. Default is (2, 1, 1).
|
||||
return_as (str, optional): The type of result to return, one of
|
||||
["score",, "distance"].
|
||||
Default is "score".
|
||||
Default is "distance".
|
||||
|
||||
Returns:
|
||||
float: The calculated edit distance or similarity score between
|
||||
@ -44,9 +56,13 @@ def calculate_edit_distance(
|
||||
return_types = ["score", "distance"]
|
||||
if return_as not in return_types:
|
||||
raise ValueError("Invalid return value type. Expected one of: %s" % return_types)
|
||||
distance = Levenshtein.distance(output, source, weights=weights)
|
||||
char_len = len(source)
|
||||
bounded_percentage_distance = min(max(distance / char_len, 0.0), 1.0)
|
||||
output = _prepare_str(output)
|
||||
source = _prepare_str(source)
|
||||
distance = Levenshtein.distance(output, source, weights=weights) # type: ignore
|
||||
# lower bounded the char length for source string at 1.0 because to avoid division by zero
|
||||
# in the case where source string is empty, the distance should be at 100%
|
||||
source_char_len = max(len(source), 1.0) # type: ignore
|
||||
bounded_percentage_distance = min(max(distance / source_char_len, 0.0), 1.0)
|
||||
if return_as == "score":
|
||||
return 1 - bounded_percentage_distance
|
||||
elif return_as == "distance":
|
||||
@ -92,8 +108,8 @@ def bag_of_words(text: str) -> Dict[str, int]:
|
||||
|
||||
|
||||
def calculate_percent_missing_text(
|
||||
output: str,
|
||||
source: str,
|
||||
output: Optional[str],
|
||||
source: Optional[str],
|
||||
) -> float:
|
||||
"""
|
||||
Creates the bag of words (BOW) found in each input text and their frequencies, then compares the
|
||||
@ -111,6 +127,8 @@ def calculate_percent_missing_text(
|
||||
|
||||
Returns the percentage of missing text represented as a decimal between 0 and 1.
|
||||
"""
|
||||
output = _prepare_str(output)
|
||||
source = _prepare_str(source)
|
||||
output_bow = bag_of_words(output)
|
||||
source_bow = bag_of_words(source)
|
||||
|
||||
@ -133,3 +151,9 @@ def calculate_percent_missing_text(
|
||||
|
||||
fraction_missing = round(total_missing_word_count / total_source_word_count, 2)
|
||||
return min(fraction_missing, 1) # limit to 100%
|
||||
|
||||
|
||||
def _prepare_str(string: Optional[str]) -> str:
|
||||
if not string:
|
||||
return ""
|
||||
return str(string) # type: ignore
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user