unstructured/test_unstructured/metrics/test_text_extraction.py
Klaijan 33edbf84f5
feat: add calculate edit distance feature (#1656)
**Executive Summary**

Adds a function to calculate the edit distance (Levenshtein distance) between
two strings. The function can return the result as: 1. score (similarity = 1 -
distance/source_len) 2. distance (raw Levenshtein distance)

**Technical details**
- The `weights` param defaults to (2, 1, 1) for (insertion,
deletion, substitution), meaning that we penalize more heavily the insertions
needed to turn the output (target) into the source
(reference). In other words, missing extraction will be penalized
more heavily.
- The function takes two strings, on the assumption that both strings are
already clean and concatenated (CCT).

**Important Note!**
The test case needs to be updated to use CCT once the function is ready. It
currently tests only the "functionality" of edit distance, not the edit
distance with CCT as it is intended to be used.

---------

Co-authored-by: cragwolfe <crag@unstructured.io>
2023-10-07 01:21:14 +00:00

70 lines
2.3 KiB
Python

import re
import pytest
from unstructured.metrics.text_extraction import calculate_edit_distance
from unstructured.partition.auto import partition
def test_calculate_edit_distance():
    """Check ``calculate_edit_distance`` scores for hand-made variants of one text.

    Each variant is passed as the first argument and compared against the same
    reference string with ``return_as="score"``; the result is rounded to two
    decimals before being compared with the expected value.
    """
    reference = "I like pizza. I like bagels."

    # (variant text, expected rounded score), checked in this order.
    cases = [
        # Identical text.
        (reference, 1.0),
        # One word split into characters; the "bagles" spelling is part of the fixture.
        ("I like p i z z a . I like bagles.", 0.75),
        # Every character joined with a space, whitespace runs collapsed to one space.
        (re.sub(r"\s+", " ", " ".join(reference)), 0.39),
        # All spaces removed.
        (reference.replace(" ", ""), 0.64),
        # Only the first sentence.
        ("I like pizza.", 0.0),
        # Last word dropped.
        ("I like pizza. I like .", 0.57),
        # One extra character in the last word.
        ("I like pizza. I like beagles.", 0.89),
        # One word duplicated.
        ("I like pizza pizza. I like bagels.", 0.79),
    ]

    for variant, expected in cases:
        score = calculate_edit_distance(variant, reference, return_as="score")
        assert round(score, 2) == expected
@pytest.mark.parametrize(
    ("filename", "expected_score", "expected_distance"),
    [
        ("fake-text.txt", 0.78, 38),
    ],
)
def test_calculate_edit_distance_with_filename(filename, expected_score, expected_distance):
    """Compare the partitioned text of an example document with the file's raw text.

    Args:
        filename: Name of a file under ``example-docs/``; it is both read
            directly (reference text) and passed to ``partition`` (output text).
        expected_score: Expected score, compared after rounding to two decimals.
        expected_distance: Expected value for ``return_as="distance"``.
    """
    doc_path = f"example-docs/{filename}"

    # Read the reference from the parametrized file, not a hard-coded path, so
    # every parametrized case is compared against its own source document.
    with open(doc_path) as f:
        source_cct = f.read()

    elements = partition(filename=doc_path)
    output_cct = "\n".join(str(el) for el in elements)

    score = calculate_edit_distance(output_cct, source_cct, return_as="score")
    distance = calculate_edit_distance(output_cct, source_cct, return_as="distance")

    # Sanity bounds first, then the exact expected values.
    assert 0 <= score <= 1.0
    assert distance >= 0
    assert round(score, 2) == expected_score
    assert distance == expected_distance