mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-09-25 16:29:53 +00:00
test: enhance quote standardization tests with additional Unicode scenarios
This commit is contained in:
parent
9038b88b8e
commit
c0c3fd673f
@ -339,69 +339,75 @@ def test_prepare_string(text, expected):
|
||||
assert text_extraction.prepare_str(text, standardize_whitespaces=True) == expected
|
||||
assert text_extraction.prepare_str(text) == text
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    ("input_text", "expected_output"),
    [
        # Each case pairs a text containing some Unicode quote variant with the
        # ASCII-quoted text that standardize_quotes is expected to return.
        # Complex sentences with standard quotes
        (
            "\'The journey wasn\'t easy,\' she remarked.",
            "\'The journey wasn\'t easy,\' she remarked.",
        ),
        # Mixed quotes in longer sentences
        (
            "She said \"Hello\" and then whispered 'Goodbye' before leaving.",
            "She said \"Hello\" and then whispered 'Goodbye' before leaving.",
        ),
        # Double low-9 quotes with complex content
        (
            "„To be, or not to be, that is the question\" - Shakespeare's famous quote.",
            '"To be, or not to be, that is the question" - Shakespeare\'s famous quote.',
        ),
        # Angle quotes with nested quotes
        (
            '«When he said "life is beautiful," I believed him» wrote Maria.',
            '"When he said "life is beautiful," I believed him" wrote Maria.',
        ),
        # Heavy ornament quotes in dialogue
        (
            "❝Do you remember when we first met?❞ she asked with a smile.",
            '"Do you remember when we first met?" she asked with a smile.',
        ),
        # Double prime quotes with punctuation
        (
            "〝The meeting starts at 10:00, don't be late!〟 announced the manager.",
            '"The meeting starts at 10:00, don\'t be late!" announced the manager.',
        ),
        # Corner brackets with nested quotes
        (
            '「He told me "This is important" yesterday」, she explained.',
            "'He told me \"This is important\" yesterday', she explained.",
        ),
        # White corner brackets with multiple sentences
        (
            "『The sun was setting. The birds were singing. It was peaceful.』",
            "'The sun was setting. The birds were singing. It was peaceful.'",
        ),
        # Vertical corner brackets with numbers and special characters
        ("﹂Meeting #123 @ 15:00 - Don't forget!﹁", "'Meeting #123 @ 15:00 - Don't forget!'"),
        # Complex mixed quote types
        (
            '「Hello」, ❝World❞, "Test", \'Example\', „Quote", «Final»',
            '\'Hello\', "World", "Test", \'Example\', "Quote", "Final"',
        ),
        # Quotes with multiple apostrophes
        ("It's John's book, isn't it?", "It's John's book, isn't it?"),
        # Single angle quotes with nested content
        (
            '‹Testing the system\'s capability for "quoted" text›',
            "'Testing the system's capability for \"quoted\" text'",
        ),
        # Heavy single ornament quotes with multiple sentences
        (
            "❛First sentence. Second sentence. Third sentence.❜",
            "'First sentence. Second sentence. Third sentence.'",
        ),
        # Mix of various quote types in complex text
        (
            '「Chapter 1」: ❝The Beginning❞ - „A new story" begins «today».',
            '\'Chapter 1\': "The Beginning" - "A new story" begins "today".',
        ),
    ],
)
def test_standardize_quotes(input_text, expected_output):
    """Check that standardize_quotes turns ``input_text`` into ``expected_output``."""
    assert text_extraction.standardize_quotes(input_text) == expected_output
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("output_text", "source_text", "expected_percentage"),
|
||||
[
|
||||
|
@ -161,6 +161,7 @@ def prepare_str(string: Optional[str], standardize_whitespaces: bool = False) ->
|
||||
return " ".join(string.split())
|
||||
return str(string) # type: ignore
|
||||
|
||||
|
||||
def standardize_quotes(text: str) -> str:
|
||||
"""
|
||||
Converts all unicode quotes to standard ASCII quotes with comprehensive coverage.
|
||||
@ -173,66 +174,64 @@ def standardize_quotes(text: str) -> str:
|
||||
"""
|
||||
# Double Quotes Dictionary
|
||||
double_quotes = {
|
||||
'"': 'U+0022', # Standard typewriter/programmer's quote
|
||||
'"': 'U+201C', # Left double quotation mark
|
||||
'"': 'U+201D', # Right double quotation mark
|
||||
'„': 'U+201E', # Double low-9 quotation mark
|
||||
'‟': 'U+201F', # Double high-reversed-9 quotation mark
|
||||
'«': 'U+00AB', # Left-pointing double angle quotation mark
|
||||
'»': 'U+00BB', # Right-pointing double angle quotation mark
|
||||
'❝': 'U+275D', # Heavy double turned comma quotation mark ornament
|
||||
'❞': 'U+275E', # Heavy double comma quotation mark ornament
|
||||
'⹂': 'U+2E42', # Double low-reversed-9 quotation mark
|
||||
'🙶': 'U+1F676', # SANS-SERIF HEAVY DOUBLE TURNED COMMA QUOTATION MARK ORNAMENT
|
||||
'🙷': 'U+1F677', # SANS-SERIF HEAVY DOUBLE COMMA QUOTATION MARK ORNAMENT
|
||||
'🙸': 'U+1F678', # SANS-SERIF HEAVY LOW DOUBLE COMMA QUOTATION MARK ORNAMENT
|
||||
'⠦': 'U+2826', # Braille double closing quotation mark
|
||||
'⠴': 'U+2834', # Braille double opening quotation mark
|
||||
'〝': 'U+301D', # REVERSED DOUBLE PRIME QUOTATION MARK
|
||||
'〞': 'U+301E', # DOUBLE PRIME QUOTATION MARK
|
||||
'〟': 'U+301F', # LOW DOUBLE PRIME QUOTATION MARK
|
||||
'"': "U+0022", # Standard typewriter/programmer's quote
|
||||
'"': "U+201C", # Left double quotation mark
|
||||
'"': "U+201D", # Right double quotation mark
|
||||
"„": "U+201E", # Double low-9 quotation mark
|
||||
"‟": "U+201F", # Double high-reversed-9 quotation mark
|
||||
"«": "U+00AB", # Left-pointing double angle quotation mark
|
||||
"»": "U+00BB", # Right-pointing double angle quotation mark
|
||||
"❝": "U+275D", # Heavy double turned comma quotation mark ornament
|
||||
"❞": "U+275E", # Heavy double comma quotation mark ornament
|
||||
"⹂": "U+2E42", # Double low-reversed-9 quotation mark
|
||||
"🙶": "U+1F676", # SANS-SERIF HEAVY DOUBLE TURNED COMMA QUOTATION MARK ORNAMENT
|
||||
"🙷": "U+1F677", # SANS-SERIF HEAVY DOUBLE COMMA QUOTATION MARK ORNAMENT
|
||||
"🙸": "U+1F678", # SANS-SERIF HEAVY LOW DOUBLE COMMA QUOTATION MARK ORNAMENT
|
||||
"⠦": "U+2826", # Braille double closing quotation mark
|
||||
"⠴": "U+2834", # Braille double opening quotation mark
|
||||
"〝": "U+301D", # REVERSED DOUBLE PRIME QUOTATION MARK
|
||||
"〞": "U+301E", # DOUBLE PRIME QUOTATION MARK
|
||||
"〟": "U+301F", # LOW DOUBLE PRIME QUOTATION MARK
|
||||
}
|
||||
|
||||
|
||||
|
||||
# Single Quotes Dictionary
|
||||
single_quotes = {
|
||||
"'": 'U+0027', # Standard typewriter/programmer's quote
|
||||
'\'': 'U+2018', # Left single quotation mark
|
||||
'\'': 'U+2019', # Right single quotation mark
|
||||
'‚': 'U+201A', # Single low-9 quotation mark
|
||||
'‛': 'U+201B', # Single high-reversed-9 quotation mark
|
||||
'‹': 'U+2039', # Single left-pointing angle quotation mark
|
||||
'›': 'U+203A', # Single right-pointing angle quotation mark
|
||||
'❛': 'U+275B', # Heavy single turned comma quotation mark ornament
|
||||
'❜': 'U+275C', # Heavy single comma quotation mark ornament
|
||||
'「': 'U+300C', # Left corner bracket
|
||||
'」': 'U+300D', # Right corner bracket
|
||||
'『': 'U+300E', # Left white corner bracket
|
||||
'』': 'U+300F', # Right white corner bracket
|
||||
'﹁': 'U+FE41', # PRESENTATION FORM FOR VERTICAL LEFT CORNER BRACKET
|
||||
'﹂': 'U+FE42', # PRESENTATION FORM FOR VERTICAL RIGHT CORNER BRACKET
|
||||
'﹃': 'U+FE43', # PRESENTATION FORM FOR VERTICAL LEFT WHITE CORNER BRACKET
|
||||
'﹄': 'U+FE44', # PRESENTATION FORM FOR VERTICAL RIGHT WHITE CORNER BRACKET
|
||||
'"': 'U+FF02', # FULLWIDTH QUOTATION MARK
|
||||
''': 'U+FF07', # FULLWIDTH APOSTROPHE
|
||||
'「': 'U+FF62', # HALFWIDTH LEFT CORNER BRACKET
|
||||
'」': 'U+FF63' # HALFWIDTH RIGHT CORNER BRACKET
|
||||
single_quotes = {
|
||||
"'": "U+0027", # Standard typewriter/programmer's quote
|
||||
"'": "U+2018", # Left single quotation mark
|
||||
"'": "U+2019", # Right single quotation mark
|
||||
"‚": "U+201A", # Single low-9 quotation mark
|
||||
"‛": "U+201B", # Single high-reversed-9 quotation mark
|
||||
"‹": "U+2039", # Single left-pointing angle quotation mark
|
||||
"›": "U+203A", # Single right-pointing angle quotation mark
|
||||
"❛": "U+275B", # Heavy single turned comma quotation mark ornament
|
||||
"❜": "U+275C", # Heavy single comma quotation mark ornament
|
||||
"「": "U+300C", # Left corner bracket
|
||||
"」": "U+300D", # Right corner bracket
|
||||
"『": "U+300E", # Left white corner bracket
|
||||
"』": "U+300F", # Right white corner bracket
|
||||
"﹁": "U+FE41", # PRESENTATION FORM FOR VERTICAL LEFT CORNER BRACKET
|
||||
"﹂": "U+FE42", # PRESENTATION FORM FOR VERTICAL RIGHT CORNER BRACKET
|
||||
"﹃": "U+FE43", # PRESENTATION FORM FOR VERTICAL LEFT WHITE CORNER BRACKET
|
||||
"﹄": "U+FE44", # PRESENTATION FORM FOR VERTICAL RIGHT WHITE CORNER BRACKET
|
||||
""": "U+FF02", # FULLWIDTH QUOTATION MARK
|
||||
"'": "U+FF07", # FULLWIDTH APOSTROPHE
|
||||
"「": "U+FF62", # HALFWIDTH LEFT CORNER BRACKET
|
||||
"」": "U+FF63", # HALFWIDTH RIGHT CORNER BRACKET
|
||||
}
|
||||
|
||||
double_quote_standard = '"'
|
||||
single_quote_standard = "'"
|
||||
|
||||
|
||||
# Apply double quote replacements
|
||||
# Apply double quote replacements
|
||||
for unicode_val in double_quotes.values():
|
||||
unicode_char = chr(int(unicode_val.replace('U+', ''), 16))
|
||||
unicode_char = chr(int(unicode_val.replace("U+", ""), 16))
|
||||
if unicode_char in text:
|
||||
text = text.replace(unicode_char, double_quote_standard)
|
||||
|
||||
# Apply single quote replacements
|
||||
for unicode_val in single_quotes.values():
|
||||
unicode_char = chr(int(unicode_val.replace('U+', ''), 16))
|
||||
unicode_char = chr(int(unicode_val.replace("U+", ""), 16))
|
||||
if unicode_char in text:
|
||||
text = text.replace(unicode_char, single_quote_standard)
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user