test: enhance quote standardization tests with additional Unicode scenarios

This commit is contained in:
Christine Straub 2024-12-04 13:02:07 -08:00
parent 9038b88b8e
commit c0c3fd673f
2 changed files with 92 additions and 87 deletions

View File

@ -339,69 +339,75 @@ def test_prepare_string(text, expected):
assert text_extraction.prepare_str(text, standardize_whitespaces=True) == expected
assert text_extraction.prepare_str(text) == text
@pytest.mark.parametrize(
    ("input_text", "expected_output"),
    [
        # Complex sentences with standard quotes
        (
            "'The journey wasn't easy,' she remarked.",
            "'The journey wasn't easy,' she remarked.",
        ),
        # Mixed quotes in longer sentences
        (
            "She said \"Hello\" and then whispered 'Goodbye' before leaving.",
            "She said \"Hello\" and then whispered 'Goodbye' before leaving.",
        ),
        # Double low-9 quotes with complex content
        (
            "„To be, or not to be, that is the question\" - Shakespeare's famous quote.",
            '"To be, or not to be, that is the question" - Shakespeare\'s famous quote.',
        ),
        # Angle quotes with nested quotes
        (
            '«When he said "life is beautiful," I believed him» wrote Maria.',
            '"When he said "life is beautiful," I believed him" wrote Maria.',
        ),
        # Heavy ornament quotes in dialogue
        (
            "❝Do you remember when we first met?❞ she asked with a smile.",
            '"Do you remember when we first met?" she asked with a smile.',
        ),
        # Double prime quotes with punctuation
        (
            "〝The meeting starts at 10:00, don't be late!〟 announced the manager.",
            '"The meeting starts at 10:00, don\'t be late!" announced the manager.',
        ),
        # Corner brackets with nested quotes
        (
            '「He told me "This is important" yesterday」, she explained.',
            "'He told me \"This is important\" yesterday', she explained.",
        ),
        # White corner brackets with multiple sentences
        (
            "『The sun was setting. The birds were singing. It was peaceful.』",
            "'The sun was setting. The birds were singing. It was peaceful.'",
        ),
        # Vertical corner brackets with numbers and special characters
        ("﹂Meeting #123 @ 15:00 - Don't forget!﹁", "'Meeting #123 @ 15:00 - Don't forget!'"),
        # Complex mixed quote types
        (
            '「Hello」, ❝World❞, "Test", \'Example\', „Quote", «Final»',
            '\'Hello\', "World", "Test", \'Example\', "Quote", "Final"',
        ),
        # Quotes with multiple apostrophes
        ("It's John's book, isn't it?", "It's John's book, isn't it?"),
        # Single angle quotes with nested content.
        # The U+2039 / U+203A delimiters are written as escapes so they survive
        # tooling that strips or normalizes non-ASCII punctuation; without them
        # the input could never produce the quoted expected output.
        (
            "\u2039Testing the system's capability for \"quoted\" text\u203a",
            "'Testing the system's capability for \"quoted\" text'",
        ),
        # Heavy single ornament quotes with multiple sentences
        (
            "❛First sentence. Second sentence. Third sentence.❜",
            "'First sentence. Second sentence. Third sentence.'",
        ),
        # Mix of various quote types in complex text
        (
            '「Chapter 1」: ❝The Beginning❞ - „A new story" begins «today».',
            '\'Chapter 1\': "The Beginning" - "A new story" begins "today".',
        ),
    ],
)
def test_standardize_quotes(input_text, expected_output):
    """Check that ``standardize_quotes`` maps each input to its ASCII-quoted form.

    Each parametrized case pairs a string containing one or more Unicode quote
    or bracket characters with the string expected once every such character
    has been replaced by a plain ``"`` or ``'``.
    """
    assert text_extraction.standardize_quotes(input_text) == expected_output
@pytest.mark.parametrize(
("output_text", "source_text", "expected_percentage"),
[

View File

@ -161,6 +161,7 @@ def prepare_str(string: Optional[str], standardize_whitespaces: bool = False) ->
return " ".join(string.split())
return str(string) # type: ignore
def standardize_quotes(text: str) -> str:
"""
Converts all unicode quotes to standard ASCII quotes with comprehensive coverage.
@ -173,66 +174,64 @@ def standardize_quotes(text: str) -> str:
"""
# Double Quotes Dictionary
double_quotes = {
'"': 'U+0022', # Standard typewriter/programmer's quote
'"': 'U+201C', # Left double quotation mark
'"': 'U+201D', # Right double quotation mark
'': 'U+201E', # Double low-9 quotation mark
'': 'U+201F', # Double high-reversed-9 quotation mark
'«': 'U+00AB', # Left-pointing double angle quotation mark
'»': 'U+00BB', # Right-pointing double angle quotation mark
'': 'U+275D', # Heavy double turned comma quotation mark ornament
'': 'U+275E', # Heavy double comma quotation mark ornament
'': 'U+2E42', # Double low-reversed-9 quotation mark
'🙶': 'U+1F676', # SANS-SERIF HEAVY DOUBLE TURNED COMMA QUOTATION MARK ORNAMENT
'🙷': 'U+1F677', # SANS-SERIF HEAVY DOUBLE COMMA QUOTATION MARK ORNAMENT
'🙸': 'U+1F678', # SANS-SERIF HEAVY LOW DOUBLE COMMA QUOTATION MARK ORNAMENT
'': 'U+2826', # Braille double closing quotation mark
'': 'U+2834', # Braille double opening quotation mark
'': 'U+301D', # REVERSED DOUBLE PRIME QUOTATION MARK
'': 'U+301E', # DOUBLE PRIME QUOTATION MARK
'': 'U+301F', # LOW DOUBLE PRIME QUOTATION MARK
'"': "U+0022", # Standard typewriter/programmer's quote
'"': "U+201C", # Left double quotation mark
'"': "U+201D", # Right double quotation mark
"": "U+201E", # Double low-9 quotation mark
"": "U+201F", # Double high-reversed-9 quotation mark
"«": "U+00AB", # Left-pointing double angle quotation mark
"»": "U+00BB", # Right-pointing double angle quotation mark
"": "U+275D", # Heavy double turned comma quotation mark ornament
"": "U+275E", # Heavy double comma quotation mark ornament
"": "U+2E42", # Double low-reversed-9 quotation mark
"🙶": "U+1F676", # SANS-SERIF HEAVY DOUBLE TURNED COMMA QUOTATION MARK ORNAMENT
"🙷": "U+1F677", # SANS-SERIF HEAVY DOUBLE COMMA QUOTATION MARK ORNAMENT
"🙸": "U+1F678", # SANS-SERIF HEAVY LOW DOUBLE COMMA QUOTATION MARK ORNAMENT
"": "U+2826", # Braille double closing quotation mark
"": "U+2834", # Braille double opening quotation mark
"": "U+301D", # REVERSED DOUBLE PRIME QUOTATION MARK
"": "U+301E", # DOUBLE PRIME QUOTATION MARK
"": "U+301F", # LOW DOUBLE PRIME QUOTATION MARK
}
# Single Quotes Dictionary
single_quotes = {
"'": 'U+0027', # Standard typewriter/programmer's quote
'\'': 'U+2018', # Left single quotation mark
'\'': 'U+2019', # Right single quotation mark
'': 'U+201A', # Single low-9 quotation mark
'': 'U+201B', # Single high-reversed-9 quotation mark
'': 'U+2039', # Single left-pointing angle quotation mark
'': 'U+203A', # Single right-pointing angle quotation mark
'': 'U+275B', # Heavy single turned comma quotation mark ornament
'': 'U+275C', # Heavy single comma quotation mark ornament
'': 'U+300C', # Left corner bracket
'': 'U+300D', # Right corner bracket
'': 'U+300E', # Left white corner bracket
'': 'U+300F', # Right white corner bracket
'': 'U+FE41', # PRESENTATION FORM FOR VERTICAL LEFT CORNER BRACKET
'': 'U+FE42', # PRESENTATION FORM FOR VERTICAL RIGHT CORNER BRACKET
'': 'U+FE43', # PRESENTATION FORM FOR VERTICAL LEFT WHITE CORNER BRACKET
'': 'U+FE44', # PRESENTATION FORM FOR VERTICAL RIGHT WHITE CORNER BRACKET
'': 'U+FF02', # FULLWIDTH QUOTATION MARK
'': 'U+FF07', # FULLWIDTH APOSTROPHE
'': 'U+FF62', # HALFWIDTH LEFT CORNER BRACKET
'': 'U+FF63' # HALFWIDTH RIGHT CORNER BRACKET
single_quotes = {
"'": "U+0027", # Standard typewriter/programmer's quote
"'": "U+2018", # Left single quotation mark
"'": "U+2019", # Right single quotation mark
"": "U+201A", # Single low-9 quotation mark
"": "U+201B", # Single high-reversed-9 quotation mark
"": "U+2039", # Single left-pointing angle quotation mark
"": "U+203A", # Single right-pointing angle quotation mark
"": "U+275B", # Heavy single turned comma quotation mark ornament
"": "U+275C", # Heavy single comma quotation mark ornament
"": "U+300C", # Left corner bracket
"": "U+300D", # Right corner bracket
"": "U+300E", # Left white corner bracket
"": "U+300F", # Right white corner bracket
"": "U+FE41", # PRESENTATION FORM FOR VERTICAL LEFT CORNER BRACKET
"": "U+FE42", # PRESENTATION FORM FOR VERTICAL RIGHT CORNER BRACKET
"": "U+FE43", # PRESENTATION FORM FOR VERTICAL LEFT WHITE CORNER BRACKET
"": "U+FE44", # PRESENTATION FORM FOR VERTICAL RIGHT WHITE CORNER BRACKET
"": "U+FF02", # FULLWIDTH QUOTATION MARK
"": "U+FF07", # FULLWIDTH APOSTROPHE
"": "U+FF62", # HALFWIDTH LEFT CORNER BRACKET
"": "U+FF63", # HALFWIDTH RIGHT CORNER BRACKET
}
double_quote_standard = '"'
single_quote_standard = "'"
# Apply double quote replacements
# Apply double quote replacements
for unicode_val in double_quotes.values():
unicode_char = chr(int(unicode_val.replace('U+', ''), 16))
unicode_char = chr(int(unicode_val.replace("U+", ""), 16))
if unicode_char in text:
text = text.replace(unicode_char, double_quote_standard)
# Apply single quote replacements
for unicode_val in single_quotes.values():
unicode_char = chr(int(unicode_val.replace('U+', ''), 16))
unicode_char = chr(int(unicode_val.replace("U+", ""), 16))
if unicode_char in text:
text = text.replace(unicode_char, single_quote_standard)