mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-09-26 17:02:31 +00:00
test: enhance quote standardization tests with additional Unicode scenarios
This commit is contained in:
parent
9038b88b8e
commit
c0c3fd673f
@ -339,69 +339,75 @@ def test_prepare_string(text, expected):
|
|||||||
assert text_extraction.prepare_str(text, standardize_whitespaces=True) == expected
|
assert text_extraction.prepare_str(text, standardize_whitespaces=True) == expected
|
||||||
assert text_extraction.prepare_str(text) == text
|
assert text_extraction.prepare_str(text) == text
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
("input_text", "expected_output"),
|
("input_text", "expected_output"),
|
||||||
[
|
[
|
||||||
# Complex sentences with standard quotes
|
|
||||||
("\'The journey wasn\'t easy,\' she remarked.",
|
|
||||||
"\'The journey wasn\'t easy,\' she remarked."),
|
|
||||||
|
|
||||||
# Mixed quotes in longer sentences
|
# Mixed quotes in longer sentences
|
||||||
('She said "Hello" and then whispered \'Goodbye\' before leaving.',
|
(
|
||||||
'She said "Hello" and then whispered \'Goodbye\' before leaving.'),
|
"She said \"Hello\" and then whispered 'Goodbye' before leaving.",
|
||||||
|
"She said \"Hello\" and then whispered 'Goodbye' before leaving.",
|
||||||
|
),
|
||||||
# Double low-9 quotes with complex content
|
# Double low-9 quotes with complex content
|
||||||
('„To be, or not to be, that is the question" - Shakespeare\'s famous quote.',
|
(
|
||||||
'"To be, or not to be, that is the question" - Shakespeare\'s famous quote.'),
|
"„To be, or not to be, that is the question\" - Shakespeare's famous quote.",
|
||||||
|
'"To be, or not to be, that is the question" - Shakespeare\'s famous quote.',
|
||||||
|
),
|
||||||
# Angle quotes with nested quotes
|
# Angle quotes with nested quotes
|
||||||
('«When he said "life is beautiful," I believed him» wrote Maria.',
|
(
|
||||||
'"When he said "life is beautiful," I believed him" wrote Maria.'),
|
'«When he said "life is beautiful," I believed him» wrote Maria.',
|
||||||
|
'"When he said "life is beautiful," I believed him" wrote Maria.',
|
||||||
|
),
|
||||||
# Heavy ornament quotes in dialogue
|
# Heavy ornament quotes in dialogue
|
||||||
('❝Do you remember when we first met?❞ she asked with a smile.',
|
(
|
||||||
'"Do you remember when we first met?" she asked with a smile.'),
|
"❝Do you remember when we first met?❞ she asked with a smile.",
|
||||||
|
'"Do you remember when we first met?" she asked with a smile.',
|
||||||
|
),
|
||||||
# Double prime quotes with punctuation
|
# Double prime quotes with punctuation
|
||||||
('〝The meeting starts at 10:00, don\'t be late!〟 announced the manager.',
|
(
|
||||||
'"The meeting starts at 10:00, don\'t be late!" announced the manager.'),
|
"〝The meeting starts at 10:00, don't be late!〟 announced the manager.",
|
||||||
|
'"The meeting starts at 10:00, don\'t be late!" announced the manager.',
|
||||||
|
),
|
||||||
# Corner brackets with nested quotes
|
# Corner brackets with nested quotes
|
||||||
('「He told me "This is important" yesterday」, she explained.',
|
(
|
||||||
'\'He told me "This is important" yesterday\', she explained.'),
|
'「He told me "This is important" yesterday」, she explained.',
|
||||||
|
"'He told me \"This is important\" yesterday', she explained.",
|
||||||
|
),
|
||||||
# White corner brackets with multiple sentences
|
# White corner brackets with multiple sentences
|
||||||
('『The sun was setting. The birds were singing. It was peaceful.』',
|
(
|
||||||
'\'The sun was setting. The birds were singing. It was peaceful.\''),
|
"『The sun was setting. The birds were singing. It was peaceful.』",
|
||||||
|
"'The sun was setting. The birds were singing. It was peaceful.'",
|
||||||
|
),
|
||||||
# Vertical corner brackets with numbers and special characters
|
# Vertical corner brackets with numbers and special characters
|
||||||
('﹂Meeting #123 @ 15:00 - Don\'t forget!﹁',
|
("﹂Meeting #123 @ 15:00 - Don't forget!﹁", "'Meeting #123 @ 15:00 - Don't forget!'"),
|
||||||
'\'Meeting #123 @ 15:00 - Don\'t forget!\''),
|
|
||||||
|
|
||||||
# Complex mixed quote types
|
# Complex mixed quote types
|
||||||
('「Hello」, ❝World❞, "Test", \'Example\', „Quote", «Final»',
|
(
|
||||||
'\'Hello\', "World", "Test", \'Example\', "Quote", "Final"'),
|
'「Hello」, ❝World❞, "Test", \'Example\', „Quote", «Final»',
|
||||||
|
'\'Hello\', "World", "Test", \'Example\', "Quote", "Final"',
|
||||||
|
),
|
||||||
# Quotes with multiple apostrophes
|
# Quotes with multiple apostrophes
|
||||||
('It\'s John\'s book, isn\'t it?',
|
("It's John's book, isn't it?", "It's John's book, isn't it?"),
|
||||||
"It's John's book, isn't it?"),
|
|
||||||
|
|
||||||
# Single angle quotes with nested content
|
# Single angle quotes with nested content
|
||||||
('‹Testing the system\'s capability for "quoted" text›',
|
(
|
||||||
'\'Testing the system\'s capability for "quoted" text\''),
|
'‹Testing the system\'s capability for "quoted" text›',
|
||||||
|
"'Testing the system's capability for \"quoted\" text'",
|
||||||
|
),
|
||||||
# Heavy single ornament quotes with multiple sentences
|
# Heavy single ornament quotes with multiple sentences
|
||||||
('❛First sentence. Second sentence. Third sentence.❜',
|
(
|
||||||
'\'First sentence. Second sentence. Third sentence.\''),
|
"❛First sentence. Second sentence. Third sentence.❜",
|
||||||
|
"'First sentence. Second sentence. Third sentence.'",
|
||||||
|
),
|
||||||
# Mix of various quote types in complex text
|
# Mix of various quote types in complex text
|
||||||
('「Chapter 1」: ❝The Beginning❞ - „A new story" begins «today».',
|
(
|
||||||
'\'Chapter 1\': "The Beginning" - "A new story" begins "today".')
|
'「Chapter 1」: ❝The Beginning❞ - „A new story" begins «today».',
|
||||||
|
'\'Chapter 1\': "The Beginning" - "A new story" begins "today".',
|
||||||
|
),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
def test_standardize_quotes(input_text, expected_output):
|
def test_standardize_quotes(input_text, expected_output):
|
||||||
assert text_extraction.standardize_quotes(input_text) == expected_output
|
assert text_extraction.standardize_quotes(input_text) == expected_output
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
("output_text", "source_text", "expected_percentage"),
|
("output_text", "source_text", "expected_percentage"),
|
||||||
[
|
[
|
||||||
|
@ -161,6 +161,7 @@ def prepare_str(string: Optional[str], standardize_whitespaces: bool = False) ->
|
|||||||
return " ".join(string.split())
|
return " ".join(string.split())
|
||||||
return str(string) # type: ignore
|
return str(string) # type: ignore
|
||||||
|
|
||||||
|
|
||||||
def standardize_quotes(text: str) -> str:
|
def standardize_quotes(text: str) -> str:
|
||||||
"""
|
"""
|
||||||
Converts all unicode quotes to standard ASCII quotes with comprehensive coverage.
|
Converts all unicode quotes to standard ASCII quotes with comprehensive coverage.
|
||||||
@ -173,66 +174,64 @@ def standardize_quotes(text: str) -> str:
|
|||||||
"""
|
"""
|
||||||
# Double Quotes Dictionary
|
# Double Quotes Dictionary
|
||||||
double_quotes = {
|
double_quotes = {
|
||||||
'"': 'U+0022', # Standard typewriter/programmer's quote
|
'"': "U+0022", # Standard typewriter/programmer's quote
|
||||||
'"': 'U+201C', # Left double quotation mark
|
'"': "U+201C", # Left double quotation mark
|
||||||
'"': 'U+201D', # Right double quotation mark
|
'"': "U+201D", # Right double quotation mark
|
||||||
'„': 'U+201E', # Double low-9 quotation mark
|
"„": "U+201E", # Double low-9 quotation mark
|
||||||
'‟': 'U+201F', # Double high-reversed-9 quotation mark
|
"‟": "U+201F", # Double high-reversed-9 quotation mark
|
||||||
'«': 'U+00AB', # Left-pointing double angle quotation mark
|
"«": "U+00AB", # Left-pointing double angle quotation mark
|
||||||
'»': 'U+00BB', # Right-pointing double angle quotation mark
|
"»": "U+00BB", # Right-pointing double angle quotation mark
|
||||||
'❝': 'U+275D', # Heavy double turned comma quotation mark ornament
|
"❝": "U+275D", # Heavy double turned comma quotation mark ornament
|
||||||
'❞': 'U+275E', # Heavy double comma quotation mark ornament
|
"❞": "U+275E", # Heavy double comma quotation mark ornament
|
||||||
'⹂': 'U+2E42', # Double low-reversed-9 quotation mark
|
"⹂": "U+2E42", # Double low-reversed-9 quotation mark
|
||||||
'🙶': 'U+1F676', # SANS-SERIF HEAVY DOUBLE TURNED COMMA QUOTATION MARK ORNAMENT
|
"🙶": "U+1F676", # SANS-SERIF HEAVY DOUBLE TURNED COMMA QUOTATION MARK ORNAMENT
|
||||||
'🙷': 'U+1F677', # SANS-SERIF HEAVY DOUBLE COMMA QUOTATION MARK ORNAMENT
|
"🙷": "U+1F677", # SANS-SERIF HEAVY DOUBLE COMMA QUOTATION MARK ORNAMENT
|
||||||
'🙸': 'U+1F678', # SANS-SERIF HEAVY LOW DOUBLE COMMA QUOTATION MARK ORNAMENT
|
"🙸": "U+1F678", # SANS-SERIF HEAVY LOW DOUBLE COMMA QUOTATION MARK ORNAMENT
|
||||||
'⠦': 'U+2826', # Braille double closing quotation mark
|
"⠦": "U+2826", # Braille double closing quotation mark
|
||||||
'⠴': 'U+2834', # Braille double opening quotation mark
|
"⠴": "U+2834", # Braille double opening quotation mark
|
||||||
'〝': 'U+301D', # REVERSED DOUBLE PRIME QUOTATION MARK
|
"〝": "U+301D", # REVERSED DOUBLE PRIME QUOTATION MARK
|
||||||
'〞': 'U+301E', # DOUBLE PRIME QUOTATION MARK
|
"〞": "U+301E", # DOUBLE PRIME QUOTATION MARK
|
||||||
'〟': 'U+301F', # LOW DOUBLE PRIME QUOTATION MARK
|
"〟": "U+301F", # LOW DOUBLE PRIME QUOTATION MARK
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# Single Quotes Dictionary
|
# Single Quotes Dictionary
|
||||||
single_quotes = {
|
single_quotes = {
|
||||||
"'": 'U+0027', # Standard typewriter/programmer's quote
|
"'": "U+0027", # Standard typewriter/programmer's quote
|
||||||
'\'': 'U+2018', # Left single quotation mark
|
"'": "U+2018", # Left single quotation mark
|
||||||
'\'': 'U+2019', # Right single quotation mark
|
"'": "U+2019", # Right single quotation mark
|
||||||
'‚': 'U+201A', # Single low-9 quotation mark
|
"‚": "U+201A", # Single low-9 quotation mark
|
||||||
'‛': 'U+201B', # Single high-reversed-9 quotation mark
|
"‛": "U+201B", # Single high-reversed-9 quotation mark
|
||||||
'‹': 'U+2039', # Single left-pointing angle quotation mark
|
"‹": "U+2039", # Single left-pointing angle quotation mark
|
||||||
'›': 'U+203A', # Single right-pointing angle quotation mark
|
"›": "U+203A", # Single right-pointing angle quotation mark
|
||||||
'❛': 'U+275B', # Heavy single turned comma quotation mark ornament
|
"❛": "U+275B", # Heavy single turned comma quotation mark ornament
|
||||||
'❜': 'U+275C', # Heavy single comma quotation mark ornament
|
"❜": "U+275C", # Heavy single comma quotation mark ornament
|
||||||
'「': 'U+300C', # Left corner bracket
|
"「": "U+300C", # Left corner bracket
|
||||||
'」': 'U+300D', # Right corner bracket
|
"」": "U+300D", # Right corner bracket
|
||||||
'『': 'U+300E', # Left white corner bracket
|
"『": "U+300E", # Left white corner bracket
|
||||||
'』': 'U+300F', # Right white corner bracket
|
"』": "U+300F", # Right white corner bracket
|
||||||
'﹁': 'U+FE41', # PRESENTATION FORM FOR VERTICAL LEFT CORNER BRACKET
|
"﹁": "U+FE41", # PRESENTATION FORM FOR VERTICAL LEFT CORNER BRACKET
|
||||||
'﹂': 'U+FE42', # PRESENTATION FORM FOR VERTICAL RIGHT CORNER BRACKET
|
"﹂": "U+FE42", # PRESENTATION FORM FOR VERTICAL RIGHT CORNER BRACKET
|
||||||
'﹃': 'U+FE43', # PRESENTATION FORM FOR VERTICAL LEFT WHITE CORNER BRACKET
|
"﹃": "U+FE43", # PRESENTATION FORM FOR VERTICAL LEFT WHITE CORNER BRACKET
|
||||||
'﹄': 'U+FE44', # PRESENTATION FORM FOR VERTICAL RIGHT WHITE CORNER BRACKET
|
"﹄": "U+FE44", # PRESENTATION FORM FOR VERTICAL RIGHT WHITE CORNER BRACKET
|
||||||
'＂': 'U+FF02', # FULLWIDTH QUOTATION MARK
|
"＂": "U+FF02", # FULLWIDTH QUOTATION MARK
|
||||||
'＇': 'U+FF07', # FULLWIDTH APOSTROPHE
|
"＇": "U+FF07", # FULLWIDTH APOSTROPHE
|
||||||
'｢': 'U+FF62', # HALFWIDTH LEFT CORNER BRACKET
|
"｢": "U+FF62", # HALFWIDTH LEFT CORNER BRACKET
|
||||||
'｣': 'U+FF63' # HALFWIDTH RIGHT CORNER BRACKET
|
"｣": "U+FF63", # HALFWIDTH RIGHT CORNER BRACKET
|
||||||
}
|
}
|
||||||
|
|
||||||
double_quote_standard = '"'
|
double_quote_standard = '"'
|
||||||
single_quote_standard = "'"
|
single_quote_standard = "'"
|
||||||
|
|
||||||
# Apply double quote replacements
|
# Apply double quote replacements
|
||||||
# Apply double quote replacements
|
# Apply double quote replacements
|
||||||
for unicode_val in double_quotes.values():
|
for unicode_val in double_quotes.values():
|
||||||
unicode_char = chr(int(unicode_val.replace('U+', ''), 16))
|
unicode_char = chr(int(unicode_val.replace("U+", ""), 16))
|
||||||
if unicode_char in text:
|
if unicode_char in text:
|
||||||
text = text.replace(unicode_char, double_quote_standard)
|
text = text.replace(unicode_char, double_quote_standard)
|
||||||
|
|
||||||
# Apply single quote replacements
|
# Apply single quote replacements
|
||||||
for unicode_val in single_quotes.values():
|
for unicode_val in single_quotes.values():
|
||||||
unicode_char = chr(int(unicode_val.replace('U+', ''), 16))
|
unicode_char = chr(int(unicode_val.replace("U+", ""), 16))
|
||||||
if unicode_char in text:
|
if unicode_char in text:
|
||||||
text = text.replace(unicode_char, single_quote_standard)
|
text = text.replace(unicode_char, single_quote_standard)
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user