From de685fbc185c7988a99c28fa26cc6df63c2abefa Mon Sep 17 00:00:00 2001 From: Klaijan Date: Fri, 20 Oct 2023 19:04:13 -0400 Subject: [PATCH] feat: add accuracy as wrapper for edit distance score (#1828) Add `calculate_accuracy` function that is a wrapper of `calculate_edit_distance` that returns the "score". --------- Co-authored-by: shreyanid <42684285+shreyanid@users.noreply.github.com> --- CHANGELOG.md | 1 + unstructured/metrics/text_extraction.py | 46 +++++++++++++++++++------ 2 files changed, 36 insertions(+), 11 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b6a5e0df4..358498843 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ ### Features +* **Adds accuracy function** The accuracy scoring was originally an option under `calculate_edit_distance`. For easier calling, it is now a separate `calculate_accuracy` function that wraps `calculate_edit_distance` and returns the result as a "score". * **Adds HuggingFaceEmbeddingEncoder** The HuggingFace Embedding Encoder uses a local embedding model as opposed to using an API. * **Add AWS bedrock embedding connector** `unstructured.embed.bedrock` now provides a connector to use AWS bedrock's `titan-embed-text` model to generate embeddings for elements. This features requires valid AWS bedrock setup and an internet connectionto run. 
diff --git a/unstructured/metrics/text_extraction.py b/unstructured/metrics/text_extraction.py index 62f414107..f8e8df6db 100644 --- a/unstructured/metrics/text_extraction.py +++ b/unstructured/metrics/text_extraction.py @@ -1,15 +1,27 @@ -from typing import Dict, Tuple +from typing import Dict, Optional, Tuple from rapidfuzz.distance import Levenshtein from unstructured.cleaners.core import clean_bullets, remove_sentence_punctuation -def calculate_edit_distance( - output: str, - source: str, +def calculate_accuracy( + output: Optional[str], + source: Optional[str], weights: Tuple[int, int, int] = (2, 1, 1), - return_as: str = "score", +) -> float: + """ + Calculates accuracy by calling calculate_edit_distance function using `return_as=score`. + The function will return complement of the edit distance instead. + """ + return calculate_edit_distance(output, source, weights, return_as="score") + + +def calculate_edit_distance( + output: Optional[str], + source: Optional[str], + weights: Tuple[int, int, int] = (2, 1, 1), + return_as: str = "distance", ) -> float: """ Calculates edit distance using Levenshtein distance between two strings. @@ -22,7 +34,7 @@ def calculate_edit_distance( distance calculation. Default is (2, 1, 1). return_as (str, optional): The type of result to return, one of ["score",, "distance"]. - Default is "score". + Default is "distance". Returns: float: The calculated edit distance or similarity score between @@ -44,9 +56,13 @@ def calculate_edit_distance( return_types = ["score", "distance"] if return_as not in return_types: raise ValueError("Invalid return value type. 
Expected one of: %s" % return_types) - distance = Levenshtein.distance(output, source, weights=weights) - char_len = len(source) - bounded_percentage_distance = min(max(distance / char_len, 0.0), 1.0) + output = _prepare_str(output) + source = _prepare_str(source) + distance = Levenshtein.distance(output, source, weights=weights) # type: ignore + # lower-bound the source string's char length at 1.0 to avoid division by zero; + # when the source string is empty, the distance should be 100% + source_char_len = max(len(source), 1.0) # type: ignore + bounded_percentage_distance = min(max(distance / source_char_len, 0.0), 1.0) if return_as == "score": return 1 - bounded_percentage_distance elif return_as == "distance": @@ -92,8 +108,8 @@ def bag_of_words(text: str) -> Dict[str, int]: def calculate_percent_missing_text( - output: str, - source: str, + output: Optional[str], + source: Optional[str], ) -> float: """ Creates the bag of words (BOW) found in each input text and their frequencies, then compares the @@ -111,6 +127,8 @@ def calculate_percent_missing_text( Returns the percentage of missing text represented as a decimal between 0 and 1. """ + output = _prepare_str(output) + source = _prepare_str(source) output_bow = bag_of_words(output) source_bow = bag_of_words(source) @@ -133,3 +151,9 @@ def calculate_percent_missing_text( fraction_missing = round(total_missing_word_count / total_source_word_count, 2) return min(fraction_missing, 1) # limit to 100% + + +def _prepare_str(string: Optional[str]) -> str: + if not string: + return "" + return str(string) # type: ignore