@pytest.mark.parametrize(
    ("input_text", "expected_output"),
    [
        # ASCII quotes are already the canonical form and must pass through untouched.
        ('"Hello"', '"Hello"'),
        ("'Hello'", "'Hello'"),
        ("Hello's", "Hello's"),
        ('Mixed "quotes" and \'test\'', 'Mixed "quotes" and \'test\''),
        # Typographic quotes are written as escapes so the cases survive any
        # re-encoding or "smart quote" normalisation of this source file.
        ("\u201eHello\"", '"Hello"'),  # low-9 opening + ASCII closing
        ("\u201eHello\u201c", '"Hello"'),  # German-style pair: U+201E ... U+201C
        ("\u201cHello\u201d", '"Hello"'),  # curly double pair: U+201C ... U+201D
        ("\u2018Hello\u2019", "'Hello'"),  # curly single pair: U+2018 ... U+2019
        ("\u201aHello\u201a", "'Hello'"),  # low-9 single quotes: U+201A
        ("Hello\u2019s", "Hello's"),  # typographic apostrophe: U+2019
        (
            "\u201cMixed\u201d and \u2018test\u2019",
            '"Mixed" and \'test\'',
        ),  # double and single typographic quotes in one string
        # Degenerate inputs.
        ("No quotes here", "No quotes here"),
        ("", ""),
    ],
)
def test_standardize_quotes(input_text, expected_output):
    """Check that typographic quotes collapse to ASCII and ASCII text is unchanged."""
    assert text_extraction.standardize_quotes(input_text) == expected_output
# Translation table built once at import time: every code point in the Unicode
# quotation-mark run U+2018..U+201F is mapped to its ASCII counterpart.
_QUOTE_TRANSLATION = str.maketrans(
    {
        # Single-quote variants -> ASCII apostrophe (U+0027)
        "\u2018": "'",  # LEFT SINGLE QUOTATION MARK
        "\u2019": "'",  # RIGHT SINGLE QUOTATION MARK (also the typographic apostrophe)
        "\u201a": "'",  # SINGLE LOW-9 QUOTATION MARK
        "\u201b": "'",  # SINGLE HIGH-REVERSED-9 QUOTATION MARK
        # Double-quote variants -> ASCII quotation mark (U+0022)
        "\u201c": '"',  # LEFT DOUBLE QUOTATION MARK
        "\u201d": '"',  # RIGHT DOUBLE QUOTATION MARK
        "\u201e": '"',  # DOUBLE LOW-9 QUOTATION MARK
        "\u201f": '"',  # DOUBLE HIGH-REVERSED-9 QUOTATION MARK
    }
)


def standardize_quotes(text: str) -> str:
    """Replace typographic quotation marks with their ASCII equivalents.

    Every double-quote variant in the range U+201C..U+201F becomes ``"`` and every
    single-quote variant in the range U+2018..U+201B becomes ``'``. All other
    characters, including quotes that are already ASCII, are left unchanged, so the
    result always has the same length as the input.

    Args:
        text (str): The input text to be standardized.

    Returns:
        str: The text with standardized quotes.
    """
    # One pass over the string instead of one pass per replaced character.
    return text.translate(_QUOTE_TRANSLATION)