test: enhance quote standardization tests with additional Unicode scenarios

This commit is contained in:
Christine Straub 2024-12-04 13:02:07 -08:00
parent 9038b88b8e
commit c0c3fd673f
2 changed files with 92 additions and 87 deletions

View File

@ -339,69 +339,75 @@ def test_prepare_string(text, expected):
assert text_extraction.prepare_str(text, standardize_whitespaces=True) == expected assert text_extraction.prepare_str(text, standardize_whitespaces=True) == expected
assert text_extraction.prepare_str(text) == text assert text_extraction.prepare_str(text) == text
@pytest.mark.parametrize( @pytest.mark.parametrize(
("input_text", "expected_output"), ("input_text", "expected_output"),
[ [
# Complex sentences with standard quotes
("\'The journey wasn\'t easy,\' she remarked.",
"\'The journey wasn\'t easy,\' she remarked."),
# Mixed quotes in longer sentences # Mixed quotes in longer sentences
('She said "Hello" and then whispered \'Goodbye\' before leaving.', (
'She said "Hello" and then whispered \'Goodbye\' before leaving.'), "She said \"Hello\" and then whispered 'Goodbye' before leaving.",
"She said \"Hello\" and then whispered 'Goodbye' before leaving.",
),
# Double low-9 quotes with complex content # Double low-9 quotes with complex content
('„To be, or not to be, that is the question" - Shakespeare\'s famous quote.', (
'"To be, or not to be, that is the question" - Shakespeare\'s famous quote.'), "„To be, or not to be, that is the question\" - Shakespeare's famous quote.",
'"To be, or not to be, that is the question" - Shakespeare\'s famous quote.',
),
# Angle quotes with nested quotes # Angle quotes with nested quotes
('«When he said "life is beautiful," I believed him» wrote Maria.', (
'"When he said "life is beautiful," I believed him" wrote Maria.'), '«When he said "life is beautiful," I believed him» wrote Maria.',
'"When he said "life is beautiful," I believed him" wrote Maria.',
),
# Heavy ornament quotes in dialogue # Heavy ornament quotes in dialogue
('❝Do you remember when we first met?❞ she asked with a smile.', (
'"Do you remember when we first met?" she asked with a smile.'), "❝Do you remember when we first met?❞ she asked with a smile.",
'"Do you remember when we first met?" she asked with a smile.',
),
# Double prime quotes with punctuation # Double prime quotes with punctuation
('〝The meeting starts at 10:00, don\'t be late!〟 announced the manager.', (
'"The meeting starts at 10:00, don\'t be late!" announced the manager.'), "〝The meeting starts at 10:00, don't be late!〟 announced the manager.",
'"The meeting starts at 10:00, don\'t be late!" announced the manager.',
),
# Corner brackets with nested quotes # Corner brackets with nested quotes
('「He told me "This is important" yesterday」, she explained.', (
'\'He told me "This is important" yesterday\', she explained.'), '「He told me "This is important" yesterday」, she explained.',
"'He told me \"This is important\" yesterday', she explained.",
),
# White corner brackets with multiple sentences # White corner brackets with multiple sentences
('『The sun was setting. The birds were singing. It was peaceful.』', (
'\'The sun was setting. The birds were singing. It was peaceful.\''), "『The sun was setting. The birds were singing. It was peaceful.』",
"'The sun was setting. The birds were singing. It was peaceful.'",
),
# Vertical corner brackets with numbers and special characters # Vertical corner brackets with numbers and special characters
('﹂Meeting #123 @ 15:00 - Don\'t forget!﹁', ("﹂Meeting #123 @ 15:00 - Don't forget!﹁", "'Meeting #123 @ 15:00 - Don't forget!'"),
'\'Meeting #123 @ 15:00 - Don\'t forget!\''),
# Complex mixed quote types # Complex mixed quote types
('「Hello」, ❝World❞, "Test", \'Example\', „Quote", «Final»', (
'\'Hello\', "World", "Test", \'Example\', "Quote", "Final"'), '「Hello」, ❝World❞, "Test", \'Example\', „Quote", «Final»',
'\'Hello\', "World", "Test", \'Example\', "Quote", "Final"',
),
# Quotes with multiple apostrophes # Quotes with multiple apostrophes
('It\'s John\'s book, isn\'t it?', ("It's John's book, isn't it?", "It's John's book, isn't it?"),
"It's John's book, isn't it?"),
# Single angle quotes with nested content # Single angle quotes with nested content
('‹Testing the system\'s capability for "quoted" text›', (
'\'Testing the system\'s capability for "quoted" text\''), '‹Testing the system\'s capability for "quoted" text›',
"'Testing the system's capability for \"quoted\" text'",
),
# Heavy single ornament quotes with multiple sentences # Heavy single ornament quotes with multiple sentences
('❛First sentence. Second sentence. Third sentence.❜', (
'\'First sentence. Second sentence. Third sentence.\''), "❛First sentence. Second sentence. Third sentence.❜",
"'First sentence. Second sentence. Third sentence.'",
),
# Mix of various quote types in complex text # Mix of various quote types in complex text
('「Chapter 1」: ❝The Beginning❞ - „A new story" begins «today».', (
'\'Chapter 1\': "The Beginning" - "A new story" begins "today".') '「Chapter 1」: ❝The Beginning❞ - „A new story" begins «today».',
'\'Chapter 1\': "The Beginning" - "A new story" begins "today".',
),
], ],
) )
def test_standardize_quotes(input_text, expected_output): def test_standardize_quotes(input_text, expected_output):
assert text_extraction.standardize_quotes(input_text) == expected_output assert text_extraction.standardize_quotes(input_text) == expected_output
@pytest.mark.parametrize( @pytest.mark.parametrize(
("output_text", "source_text", "expected_percentage"), ("output_text", "source_text", "expected_percentage"),
[ [

View File

@ -161,6 +161,7 @@ def prepare_str(string: Optional[str], standardize_whitespaces: bool = False) ->
return " ".join(string.split()) return " ".join(string.split())
return str(string) # type: ignore return str(string) # type: ignore
def standardize_quotes(text: str) -> str: def standardize_quotes(text: str) -> str:
""" """
Converts all unicode quotes to standard ASCII quotes with comprehensive coverage. Converts all unicode quotes to standard ASCII quotes with comprehensive coverage.
@ -173,66 +174,64 @@ def standardize_quotes(text: str) -> str:
""" """
# Double Quotes Dictionary # Double Quotes Dictionary
double_quotes = { double_quotes = {
'"': 'U+0022', # Standard typewriter/programmer's quote '"': "U+0022", # Standard typewriter/programmer's quote
'"': 'U+201C', # Left double quotation mark '"': "U+201C", # Left double quotation mark
'"': 'U+201D', # Right double quotation mark '"': "U+201D", # Right double quotation mark
'„': 'U+201E', # Double low-9 quotation mark "„": "U+201E", # Double low-9 quotation mark
'‟': 'U+201F', # Double high-reversed-9 quotation mark "‟": "U+201F", # Double high-reversed-9 quotation mark
'«': 'U+00AB', # Left-pointing double angle quotation mark "«": "U+00AB", # Left-pointing double angle quotation mark
'»': 'U+00BB', # Right-pointing double angle quotation mark "»": "U+00BB", # Right-pointing double angle quotation mark
'❝': 'U+275D', # Heavy double turned comma quotation mark ornament "❝": "U+275D", # Heavy double turned comma quotation mark ornament
'❞': 'U+275E', # Heavy double comma quotation mark ornament "❞": "U+275E", # Heavy double comma quotation mark ornament
'⹂': 'U+2E42', # Double low-reversed-9 quotation mark "⹂": "U+2E42", # Double low-reversed-9 quotation mark
'🙶': 'U+1F676', # SANS-SERIF HEAVY DOUBLE TURNED COMMA QUOTATION MARK ORNAMENT "🙶": "U+1F676", # SANS-SERIF HEAVY DOUBLE TURNED COMMA QUOTATION MARK ORNAMENT
'🙷': 'U+1F677', # SANS-SERIF HEAVY DOUBLE COMMA QUOTATION MARK ORNAMENT "🙷": "U+1F677", # SANS-SERIF HEAVY DOUBLE COMMA QUOTATION MARK ORNAMENT
'🙸': 'U+1F678', # SANS-SERIF HEAVY LOW DOUBLE COMMA QUOTATION MARK ORNAMENT "🙸": "U+1F678", # SANS-SERIF HEAVY LOW DOUBLE COMMA QUOTATION MARK ORNAMENT
'⠦': 'U+2826', # Braille double closing quotation mark "⠦": "U+2826", # Braille double closing quotation mark
'⠴': 'U+2834', # Braille double opening quotation mark "⠴": "U+2834", # Braille double opening quotation mark
'〝': 'U+301D', # REVERSED DOUBLE PRIME QUOTATION MARK "〝": "U+301D", # REVERSED DOUBLE PRIME QUOTATION MARK
'〞': 'U+301E', # DOUBLE PRIME QUOTATION MARK "〞": "U+301E", # DOUBLE PRIME QUOTATION MARK
'〟': 'U+301F', # LOW DOUBLE PRIME QUOTATION MARK "〟": "U+301F", # LOW DOUBLE PRIME QUOTATION MARK
} }
# Single Quotes Dictionary # Single Quotes Dictionary
single_quotes = { single_quotes = {
"'": 'U+0027', # Standard typewriter/programmer's quote "'": "U+0027", # Standard typewriter/programmer's quote
'\'': 'U+2018', # Left single quotation mark "'": "U+2018", # Left single quotation mark
'\'': 'U+2019', # Right single quotation mark "'": "U+2019", # Right single quotation mark
'‚': 'U+201A', # Single low-9 quotation mark "‚": "U+201A", # Single low-9 quotation mark
'‛': 'U+201B', # Single high-reversed-9 quotation mark "‛": "U+201B", # Single high-reversed-9 quotation mark
'‹': 'U+2039', # Single left-pointing angle quotation mark "‹": "U+2039", # Single left-pointing angle quotation mark
'›': 'U+203A', # Single right-pointing angle quotation mark "›": "U+203A", # Single right-pointing angle quotation mark
'❛': 'U+275B', # Heavy single turned comma quotation mark ornament "❛": "U+275B", # Heavy single turned comma quotation mark ornament
'❜': 'U+275C', # Heavy single comma quotation mark ornament "❜": "U+275C", # Heavy single comma quotation mark ornament
'「': 'U+300C', # Left corner bracket "「": "U+300C", # Left corner bracket
'」': 'U+300D', # Right corner bracket "」": "U+300D", # Right corner bracket
'『': 'U+300E', # Left white corner bracket "『": "U+300E", # Left white corner bracket
'』': 'U+300F', # Right white corner bracket "』": "U+300F", # Right white corner bracket
'﹁': 'U+FE41', # PRESENTATION FORM FOR VERTICAL LEFT CORNER BRACKET "﹁": "U+FE41", # PRESENTATION FORM FOR VERTICAL LEFT CORNER BRACKET
'﹂': 'U+FE42', # PRESENTATION FORM FOR VERTICAL RIGHT CORNER BRACKET "﹂": "U+FE42", # PRESENTATION FORM FOR VERTICAL RIGHT CORNER BRACKET
'﹃': 'U+FE43', # PRESENTATION FORM FOR VERTICAL LEFT WHITE CORNER BRACKET "﹃": "U+FE43", # PRESENTATION FORM FOR VERTICAL LEFT WHITE CORNER BRACKET
'﹄': 'U+FE44', # PRESENTATION FORM FOR VERTICAL RIGHT WHITE CORNER BRACKET "﹄": "U+FE44", # PRESENTATION FORM FOR VERTICAL RIGHT WHITE CORNER BRACKET
'＂': 'U+FF02', # FULLWIDTH QUOTATION MARK "＂": "U+FF02", # FULLWIDTH QUOTATION MARK
'＇': 'U+FF07', # FULLWIDTH APOSTROPHE "＇": "U+FF07", # FULLWIDTH APOSTROPHE
'｢': 'U+FF62', # HALFWIDTH LEFT CORNER BRACKET "｢": "U+FF62", # HALFWIDTH LEFT CORNER BRACKET
'｣': 'U+FF63' # HALFWIDTH RIGHT CORNER BRACKET "｣": "U+FF63", # HALFWIDTH RIGHT CORNER BRACKET
} }
double_quote_standard = '"' double_quote_standard = '"'
single_quote_standard = "'" single_quote_standard = "'"
# Apply double quote replacements # Apply double quote replacements
for unicode_val in double_quotes.values(): for unicode_val in double_quotes.values():
unicode_char = chr(int(unicode_val.replace('U+', ''), 16)) unicode_char = chr(int(unicode_val.replace("U+", ""), 16))
if unicode_char in text: if unicode_char in text:
text = text.replace(unicode_char, double_quote_standard) text = text.replace(unicode_char, double_quote_standard)
# Apply single quote replacements # Apply single quote replacements
for unicode_val in single_quotes.values(): for unicode_val in single_quotes.values():
unicode_char = chr(int(unicode_val.replace('U+', ''), 16)) unicode_char = chr(int(unicode_val.replace("U+", ""), 16))
if unicode_char in text: if unicode_char in text:
text = text.replace(unicode_char, single_quote_standard) text = text.replace(unicode_char, single_quote_standard)