From c0c3fd673f47a0bc2840de614227f54c9f660f6b Mon Sep 17 00:00:00 2001 From: Christine Straub Date: Wed, 4 Dec 2024 13:02:07 -0800 Subject: [PATCH] test: enhance quote standardization tests with additional Unicode scenarios --- .../metrics/test_text_extraction.py | 90 ++++++++++--------- unstructured/metrics/text_extraction.py | 89 +++++++++--------- 2 files changed, 92 insertions(+), 87 deletions(-) diff --git a/test_unstructured/metrics/test_text_extraction.py b/test_unstructured/metrics/test_text_extraction.py index 0ca8ac345..18cf7f97c 100644 --- a/test_unstructured/metrics/test_text_extraction.py +++ b/test_unstructured/metrics/test_text_extraction.py @@ -339,69 +339,75 @@ def test_prepare_string(text, expected): assert text_extraction.prepare_str(text, standardize_whitespaces=True) == expected assert text_extraction.prepare_str(text) == text + @pytest.mark.parametrize( ("input_text", "expected_output"), [ - # Complex sentences with standard quotes - ("\'The journey wasn\'t easy,\' she remarked.", - "\'The journey wasn\'t easy,\' she remarked."), - # Mixed quotes in longer sentences - ('She said "Hello" and then whispered \'Goodbye\' before leaving.', - 'She said "Hello" and then whispered \'Goodbye\' before leaving.'), - + ( + "She said \"Hello\" and then whispered 'Goodbye' before leaving.", + "She said \"Hello\" and then whispered 'Goodbye' before leaving.", + ), # Double low-9 quotes with complex content - ('„To be, or not to be, that is the question" - Shakespeare\'s famous quote.', - '"To be, or not to be, that is the question" - Shakespeare\'s famous quote.'), - + ( + "„To be, or not to be, that is the question\" - Shakespeare's famous quote.", + '"To be, or not to be, that is the question" - Shakespeare\'s famous quote.', + ), # Angle quotes with nested quotes - ('«When he said "life is beautiful," I believed him» wrote Maria.', - '"When he said "life is beautiful," I believed him" wrote Maria.'), - + ( + '«When he said "life is beautiful," I believed him» wrote Maria.', + '"When he said "life is beautiful," I believed him" wrote Maria.', + ), # Heavy ornament quotes in dialogue - ('❝Do you remember when we first met?❞ she asked with a smile.', - '"Do you remember when we first met?" she asked with a smile.'), - + ( + "❝Do you remember when we first met?❞ she asked with a smile.", + '"Do you remember when we first met?" she asked with a smile.', + ), # Double prime quotes with punctuation - ('〝The meeting starts at 10:00, don\'t be late!〟 announced the manager.', - '"The meeting starts at 10:00, don\'t be late!" announced the manager.'), - + ( + "〝The meeting starts at 10:00, don't be late!〟 announced the manager.", + '"The meeting starts at 10:00, don\'t be late!" announced the manager.', + ), # Corner brackets with nested quotes - ('「He told me "This is important" yesterday」, she explained.', - '\'He told me "This is important" yesterday\', she explained.'), - + ( + '「He told me "This is important" yesterday」, she explained.', + "'He told me \"This is important\" yesterday', she explained.", + ), # White corner brackets with multiple sentences - ('『The sun was setting. The birds were singing. It was peaceful.』', - '\'The sun was setting. The birds were singing. It was peaceful.\''), - + ( + "『The sun was setting. The birds were singing. It was peaceful.』", + "'The sun was setting. The birds were singing. It was peaceful.'", + ), # Vertical corner brackets with numbers and special characters - ('﹂Meeting #123 @ 15:00 - Don\'t forget!﹁', - '\'Meeting #123 @ 15:00 - Don\'t forget!\''), - + ("﹂Meeting #123 @ 15:00 - Don't forget!﹁", "'Meeting #123 @ 15:00 - Don't forget!'"), # Complex mixed quote types - ('「Hello」, ❝World❞, "Test", \'Example\', „Quote", «Final»', - '\'Hello\', "World", "Test", \'Example\', "Quote", "Final"'), - + ( + '「Hello」, ❝World❞, "Test", \'Example\', „Quote", «Final»', + '\'Hello\', "World", "Test", \'Example\', "Quote", "Final"', + ), # Quotes with multiple apostrophes - ('It\'s John\'s book, isn\'t it?', - "It's John's book, isn't it?"), - + ("It's John's book, isn't it?", "It's John's book, isn't it?"), # Single angle quotes with nested content - ('‹Testing the system\'s capability for "quoted" text›', - '\'Testing the system\'s capability for "quoted" text\''), - + ( + '‹Testing the system\'s capability for "quoted" text›', + "'Testing the system's capability for \"quoted\" text'", + ), # Heavy single ornament quotes with multiple sentences - ('❛First sentence. Second sentence. Third sentence.❜', - '\'First sentence. Second sentence. Third sentence.\''), - + ( + "❛First sentence. Second sentence. Third sentence.❜", + "'First sentence. Second sentence. Third sentence.'", + ), # Mix of various quote types in complex text - ('「Chapter 1」: ❝The Beginning❞ - „A new story" begins «today».', - '\'Chapter 1\': "The Beginning" - "A new story" begins "today".') + ( + '「Chapter 1」: ❝The Beginning❞ - „A new story" begins «today».', + '\'Chapter 1\': "The Beginning" - "A new story" begins "today".', + ), ], ) def test_standardize_quotes(input_text, expected_output): assert text_extraction.standardize_quotes(input_text) == expected_output + @pytest.mark.parametrize( ("output_text", "source_text", "expected_percentage"), [ diff --git a/unstructured/metrics/text_extraction.py b/unstructured/metrics/text_extraction.py index a31edcabb..b8902d86a 100644 --- a/unstructured/metrics/text_extraction.py +++ b/unstructured/metrics/text_extraction.py @@ -161,6 +161,7 @@ def prepare_str(string: Optional[str], standardize_whitespaces: bool = False) -> return " ".join(string.split()) return str(string) # type: ignore + def standardize_quotes(text: str) -> str: """ Converts all unicode quotes to standard ASCII quotes with comprehensive coverage. @@ -173,66 +174,64 @@ def standardize_quotes(text: str) -> str: """ # Double Quotes Dictionary double_quotes = { - '"': 'U+0022', # Standard typewriter/programmer's quote - '"': 'U+201C', # Left double quotation mark - '"': 'U+201D', # Right double quotation mark - '„': 'U+201E', # Double low-9 quotation mark - '‟': 'U+201F', # Double high-reversed-9 quotation mark - '«': 'U+00AB', # Left-pointing double angle quotation mark - '»': 'U+00BB', # Right-pointing double angle quotation mark - '❝': 'U+275D', # Heavy double turned comma quotation mark ornament - '❞': 'U+275E', # Heavy double comma quotation mark ornament - '⹂': 'U+2E42', # Double low-reversed-9 quotation mark - '🙶': 'U+1F676', # SANS-SERIF HEAVY DOUBLE TURNED COMMA QUOTATION MARK ORNAMENT - '🙷': 'U+1F677', # SANS-SERIF HEAVY DOUBLE COMMA QUOTATION MARK ORNAMENT - '🙸': 'U+1F678', # SANS-SERIF HEAVY LOW DOUBLE COMMA QUOTATION MARK ORNAMENT - '⠦': 'U+2826', # Braille double closing quotation mark - '⠴': 'U+2834', # Braille double opening quotation mark - '〝': 'U+301D', # REVERSED DOUBLE PRIME QUOTATION MARK - '〞': 'U+301E', # DOUBLE PRIME QUOTATION MARK - '〟': 'U+301F', # LOW DOUBLE PRIME QUOTATION MARK + '"': "U+0022", # Standard typewriter/programmer's quote + '"': "U+201C", # Left double quotation mark + '"': "U+201D", # Right double quotation mark + "„": "U+201E", # Double low-9 quotation mark + "‟": "U+201F", # Double high-reversed-9 quotation mark + "«": "U+00AB", # Left-pointing double angle quotation mark + "»": "U+00BB", # Right-pointing double angle quotation mark + "❝": "U+275D", # Heavy double turned comma quotation mark ornament + "❞": "U+275E", # Heavy double comma quotation mark ornament + "⹂": "U+2E42", # Double low-reversed-9 quotation mark + "🙶": "U+1F676", # SANS-SERIF HEAVY DOUBLE TURNED COMMA QUOTATION MARK ORNAMENT + "🙷": "U+1F677", # SANS-SERIF HEAVY DOUBLE COMMA QUOTATION MARK ORNAMENT + "🙸": "U+1F678", # SANS-SERIF HEAVY LOW DOUBLE COMMA QUOTATION MARK ORNAMENT + "⠦": "U+2826", # Braille double closing quotation mark + "⠴": "U+2834", # Braille double opening quotation mark + "〝": "U+301D", # REVERSED DOUBLE PRIME QUOTATION MARK + "〞": "U+301E", # DOUBLE PRIME QUOTATION MARK + "〟": "U+301F", # LOW DOUBLE PRIME QUOTATION MARK } - - # Single Quotes Dictionary - single_quotes = { - "'": 'U+0027', # Standard typewriter/programmer's quote - '\'': 'U+2018', # Left single quotation mark - '\'': 'U+2019', # Right single quotation mark - '‚': 'U+201A', # Single low-9 quotation mark - '‛': 'U+201B', # Single high-reversed-9 quotation mark - '‹': 'U+2039', # Single left-pointing angle quotation mark - '›': 'U+203A', # Single right-pointing angle quotation mark - '❛': 'U+275B', # Heavy single turned comma quotation mark ornament - '❜': 'U+275C', # Heavy single comma quotation mark ornament - '「': 'U+300C', # Left corner bracket - '」': 'U+300D', # Right corner bracket - '『': 'U+300E', # Left white corner bracket - '』': 'U+300F', # Right white corner bracket - '﹁': 'U+FE41', # PRESENTATION FORM FOR VERTICAL LEFT CORNER BRACKET - '﹂': 'U+FE42', # PRESENTATION FORM FOR VERTICAL RIGHT CORNER BRACKET - '﹃': 'U+FE43', # PRESENTATION FORM FOR VERTICAL LEFT WHITE CORNER BRACKET - '﹄': 'U+FE44', # PRESENTATION FORM FOR VERTICAL RIGHT WHITE CORNER BRACKET - '"': 'U+FF02', # FULLWIDTH QUOTATION MARK - ''': 'U+FF07', # FULLWIDTH APOSTROPHE - '「': 'U+FF62', # HALFWIDTH LEFT CORNER BRACKET - '」': 'U+FF63' # HALFWIDTH RIGHT CORNER BRACKET + single_quotes = { + "'": "U+0027", # Standard typewriter/programmer's quote + "'": "U+2018", # Left single quotation mark + "'": "U+2019", # Right single quotation mark + "‚": "U+201A", # Single low-9 quotation mark + "‛": "U+201B", # Single high-reversed-9 quotation mark + "‹": "U+2039", # Single left-pointing angle quotation mark + "›": "U+203A", # Single right-pointing angle quotation mark + "❛": "U+275B", # Heavy single turned comma quotation mark ornament + "❜": "U+275C", # Heavy single comma quotation mark ornament + "「": "U+300C", # Left corner bracket + "」": "U+300D", # Right corner bracket + "『": "U+300E", # Left white corner bracket + "』": "U+300F", # Right white corner bracket + "﹁": "U+FE41", # PRESENTATION FORM FOR VERTICAL LEFT CORNER BRACKET + "﹂": "U+FE42", # PRESENTATION FORM FOR VERTICAL RIGHT CORNER BRACKET + "﹃": "U+FE43", # PRESENTATION FORM FOR VERTICAL LEFT WHITE CORNER BRACKET + "﹄": "U+FE44", # PRESENTATION FORM FOR VERTICAL RIGHT WHITE CORNER BRACKET + """: "U+FF02", # FULLWIDTH QUOTATION MARK + "'": "U+FF07", # FULLWIDTH APOSTROPHE + "「": "U+FF62", # HALFWIDTH LEFT CORNER BRACKET + "」": "U+FF63", # HALFWIDTH RIGHT CORNER BRACKET } double_quote_standard = '"' single_quote_standard = "'" - + # Apply double quote replacements # Apply double quote replacements for unicode_val in double_quotes.values(): - unicode_char = chr(int(unicode_val.replace('U+', ''), 16)) + unicode_char = chr(int(unicode_val.replace("U+", ""), 16)) if unicode_char in text: text = text.replace(unicode_char, double_quote_standard) # Apply single quote replacements for unicode_val in single_quotes.values(): - unicode_char = chr(int(unicode_val.replace('U+', ''), 16)) + unicode_char = chr(int(unicode_val.replace("U+", ""), 16)) if unicode_char in text: text = text.replace(unicode_char, single_quote_standard)