mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-09-26 08:53:15 +00:00
Feat: add quote standardization and update edit distance calculation
This commit is contained in:
parent
9445a2dd01
commit
371cb7528d
@ -339,6 +339,21 @@ def test_prepare_string(text, expected):
|
|||||||
assert text_extraction.prepare_str(text, standardize_whitespaces=True) == expected
|
assert text_extraction.prepare_str(text, standardize_whitespaces=True) == expected
|
||||||
assert text_extraction.prepare_str(text) == text
|
assert text_extraction.prepare_str(text) == text
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
    ("input_text", "expected_output"),
    [
        # ASCII quotes are already the standard form and must pass through untouched.
        ('"Hello"', '"Hello"'),
        ("'Hello'", "'Hello'"),
        ("Hello's", "Hello's"),
        ("Mixed \"quotes\" and 'test'", "Mixed \"quotes\" and 'test'"),
        # Typographic quotes are written as \u escapes so the test input cannot be
        # silently normalised to ASCII by an editor or a copy/paste round-trip.
        ("\u201eHello\"", '"Hello"'),  # German-style opening quote, ASCII closing quote
        ("\u201eHello\u201c", '"Hello"'),  # German-style double quotes
        ("\u201cHello\u201d", '"Hello"'),  # curly double quotes
        ("\u201aHello\u201a", "'Hello'"),  # single low-9 quotes on both sides
        ("\u201aHello\u2018", "'Hello'"),  # German-style single quotes
        ("\u2018Hello\u2019", "'Hello'"),  # curly single quotes
        ("Hello\u2019s", "Hello's"),  # typographic apostrophe
        (
            "Mixed \u201cquotes\u201d and \u2018test\u2019",
            "Mixed \"quotes\" and 'test'",
        ),  # mixed typographic quote types
        ("No quotes here", "No quotes here"),  # nothing to change
        ("", ""),  # empty input stays empty
    ],
)
def test_standardize_quotes(input_text, expected_output):
    """Check that typographic quotes become ASCII quotes and other text is unchanged."""
    assert text_extraction.standardize_quotes(input_text) == expected_output
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
("output_text", "source_text", "expected_percentage"),
|
("output_text", "source_text", "expected_percentage"),
|
||||||
|
@ -57,8 +57,8 @@ def calculate_edit_distance(
|
|||||||
return_types = ["score", "distance"]
|
return_types = ["score", "distance"]
|
||||||
if return_as not in return_types:
|
if return_as not in return_types:
|
||||||
raise ValueError("Invalid return value type. Expected one of: %s" % return_types)
|
raise ValueError("Invalid return value type. Expected one of: %s" % return_types)
|
||||||
output = prepare_str(output, standardize_whitespaces)
|
output = standardize_quotes(prepare_str(output, standardize_whitespaces))
|
||||||
source = prepare_str(source, standardize_whitespaces)
|
source = standardize_quotes(prepare_str(source, standardize_whitespaces))
|
||||||
distance = Levenshtein.distance(output, source, weights=weights) # type: ignore
|
distance = Levenshtein.distance(output, source, weights=weights) # type: ignore
|
||||||
# lower-bound the character length of the source string at 1.0 to avoid division by zero
|
# lower-bound the character length of the source string at 1.0 to avoid division by zero
|
||||||
# in the case where source string is empty, the distance should be at 100%
|
# in the case where source string is empty, the distance should be at 100%
|
||||||
@ -160,3 +160,20 @@ def prepare_str(string: Optional[str], standardize_whitespaces: bool = False) ->
|
|||||||
if standardize_whitespaces:
|
if standardize_whitespaces:
|
||||||
return " ".join(string.split())
|
return " ".join(string.split())
|
||||||
return str(string) # type: ignore
|
return str(string) # type: ignore
|
||||||
|
|
||||||
|
def standardize_quotes(text: str) -> str:
    """Replace typographic quotation marks in ``text`` with plain ASCII quotes.

    Double-quote variants (U+201C-U+201F) become ``"`` and single-quote variants
    (U+2018-U+201B, which includes the typographic apostrophe) become ``'``.
    All other characters, including quotes that are already ASCII, are left
    unchanged.

    Args:
        text (str): The input text to be standardized.

    Returns:
        str: The text with standardized quotes.
    """
    # Written as escapes so the mapping survives editors/tools that would
    # otherwise normalise the typographic characters to ASCII.
    quote_table = str.maketrans(
        {
            "\u201c": '"',  # left double quotation mark
            "\u201d": '"',  # right double quotation mark
            "\u201e": '"',  # double low-9 quotation mark (German opening quote)
            "\u201f": '"',  # double high-reversed-9 quotation mark
            "\u2018": "'",  # left single quotation mark
            "\u2019": "'",  # right single quotation mark / typographic apostrophe
            "\u201a": "'",  # single low-9 quotation mark
            "\u201b": "'",  # single high-reversed-9 quotation mark
        }
    )
    # One translate pass instead of a chain of str.replace calls.
    return text.translate(quote_table)
|
Loading…
x
Reference in New Issue
Block a user