mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-09-26 08:53:15 +00:00
Feat: add quote standardization and update edit distance calculation
This commit is contained in:
parent
9445a2dd01
commit
371cb7528d
@ -339,6 +339,21 @@ def test_prepare_string(text, expected):
|
||||
assert text_extraction.prepare_str(text, standardize_whitespaces=True) == expected
|
||||
assert text_extraction.prepare_str(text) == text
|
||||
|
||||
@pytest.mark.parametrize(
    ("input_text", "expected_output"),
    [
        # ASCII quotes must pass through untouched.
        ('"Hello"', '"Hello"'),  # Basic double quotes
        ("'Hello'", "'Hello'"),  # Basic single quotes
        ("Hello's", "Hello's"),  # ASCII apostrophe
        # Typographic quotes must be converted to their ASCII equivalents.
        ('„Hello"', '"Hello"'),  # German-style opening double quote to standard
        ("“Hello”", '"Hello"'),  # Fancy double quotes to standard
        ("‚Hello‚", "'Hello'"),  # German-style single quotes to standard
        ("‘Hello’", "'Hello'"),  # Fancy single quotes to standard
        ("Hello’s", "Hello's"),  # Apostrophe standardization
        ("Mixed “quotes” and ‘test’", "Mixed \"quotes\" and 'test'"),  # Mixed quote types
        ("No quotes here", "No quotes here"),  # No quotes to change
    ],
)
def test_standardize_quotes(input_text, expected_output):
    """Check that standardize_quotes maps typographic quotes to ASCII and leaves ASCII alone."""
    assert text_extraction.standardize_quotes(input_text) == expected_output
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("output_text", "source_text", "expected_percentage"),
|
||||
|
@ -57,8 +57,8 @@ def calculate_edit_distance(
|
||||
return_types = ["score", "distance"]
|
||||
if return_as not in return_types:
|
||||
raise ValueError("Invalid return value type. Expected one of: %s" % return_types)
|
||||
output = prepare_str(output, standardize_whitespaces)
|
||||
source = prepare_str(source, standardize_whitespaces)
|
||||
output = standardize_quotes(prepare_str(output, standardize_whitespaces))
|
||||
source = standardize_quotes(prepare_str(source, standardize_whitespaces))
|
||||
distance = Levenshtein.distance(output, source, weights=weights) # type: ignore
|
||||
# lower-bound the char length of the source string at 1.0 to avoid division by zero
|
||||
# in the case where source string is empty, the distance should be at 100%
|
||||
@ -160,3 +160,20 @@ def prepare_str(string: Optional[str], standardize_whitespaces: bool = False) ->
|
||||
if standardize_whitespaces:
|
||||
return " ".join(string.split())
|
||||
return str(string) # type: ignore
|
||||
|
||||
def standardize_quotes(text: str) -> str:
    """
    Replace typographic quotation marks with their plain ASCII counterparts.

    The double-quote characters “, ” and „ are mapped to ", and the single-quote
    characters ‘, ’ and ‚ are mapped to '. All other characters are left as they are.

    Args:
        text (str): The input text to be standardized.

    Returns:
        str: The text with standardized quotes.
    """
    # One translation table covers both quote families, so the text is scanned once.
    quote_table = str.maketrans(
        {
            # Double-quote variants
            '“': '"',
            '”': '"',
            '„': '"',
            # Single-quote variants
            '‘': "'",
            '’': "'",
            '‚': "'",
        }
    )
    return text.translate(quote_table)
|
Loading…
x
Reference in New Issue
Block a user