Feat: add quote standardization and update edit distance calculation

This commit is contained in:
Christine Straub 2024-12-03 21:21:39 -08:00
parent 9445a2dd01
commit 371cb7528d
2 changed files with 34 additions and 2 deletions

View File

@ -339,6 +339,21 @@ def test_prepare_string(text, expected):
assert text_extraction.prepare_str(text, standardize_whitespaces=True) == expected
assert text_extraction.prepare_str(text) == text
@pytest.mark.parametrize(
    ("input_text", "expected_output"),
    [
        # ASCII quotes are already in the standard form and must pass through untouched.
        ('"Hello"', '"Hello"'),  # Basic double quotes
        ("'Hello'", "'Hello'"),  # Basic single quotes
        # Typographic quotes are written as escapes so the cases cannot be
        # silently corrupted by editors or encoding round-trips.
        ("\u201eHello\u201c", '"Hello"'),  # German-style quotes to standard
        ("\u201cHello\u201d", '"Hello"'),  # Fancy double quotes to standard
        ("\u201aHello\u2018", "'Hello'"),  # German-style single quotes to standard
        ("Hello\u2019s", "Hello's"),  # Apostrophe standardization
        ("Mixed \"quotes\" and 'test'", "Mixed \"quotes\" and 'test'"),  # Mixed quote types
        ("No quotes here", "No quotes here"),  # No quotes to change
    ],
)
def test_standardize_quotes(input_text, expected_output):
    """Check that standardize_quotes maps typographic quotes to ASCII quotes."""
    assert text_extraction.standardize_quotes(input_text) == expected_output
@pytest.mark.parametrize(
("output_text", "source_text", "expected_percentage"),

View File

@ -57,8 +57,8 @@ def calculate_edit_distance(
return_types = ["score", "distance"]
if return_as not in return_types:
raise ValueError("Invalid return value type. Expected one of: %s" % return_types)
output = prepare_str(output, standardize_whitespaces)
source = prepare_str(source, standardize_whitespaces)
output = standardize_quotes(prepare_str(output, standardize_whitespaces))
source = standardize_quotes(prepare_str(source, standardize_whitespaces))
distance = Levenshtein.distance(output, source, weights=weights) # type: ignore
# Lower-bound the character length of the source string at 1.0 to avoid division by zero;
# when the source string is empty, the distance should be 100%.
@ -160,3 +160,20 @@ def prepare_str(string: Optional[str], standardize_whitespaces: bool = False) ->
if standardize_whitespaces:
return " ".join(string.split())
return str(string) # type: ignore
# Translation table from typographic quote code points to their ASCII equivalents.
# Escapes are used instead of literal characters so the mapping survives any
# encoding round-trip of this source file.
_QUOTE_TRANSLATION = str.maketrans(
    {
        "\u201c": '"',  # left double quotation mark
        "\u201d": '"',  # right double quotation mark
        "\u201e": '"',  # double low-9 quotation mark (German-style opening)
        "\u2018": "'",  # left single quotation mark
        "\u2019": "'",  # right single quotation mark (also the typographic apostrophe)
        "\u201a": "'",  # single low-9 quotation mark (German-style opening)
    }
)


def standardize_quotes(text: str) -> str:
    """Replace typographic quotation marks with their plain ASCII equivalents.

    Opening and closing double quotes (U+201C, U+201D, U+201E) become ``"``, and
    opening and closing single quotes (U+2018, U+2019, U+201A) become ``'``.
    ASCII quotes and all other characters are left unchanged.

    Args:
        text (str): The input text to be standardized.

    Returns:
        str: The text with standardized quotes.
    """
    # A single translate pass avoids chained replace calls and can never match an
    # empty search string (which would insert a quote between every character).
    return text.translate(_QUOTE_TRANSLATION)