From 3bca724624f4d733619de6104abccb9762ccebae Mon Sep 17 00:00:00 2001 From: Christine Straub Date: Thu, 5 Dec 2024 13:24:57 -0800 Subject: [PATCH] feat: update standardize_quote() --- unstructured/metrics/text_extraction.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/unstructured/metrics/text_extraction.py b/unstructured/metrics/text_extraction.py index b070ec79f..91c9dfdf2 100644 --- a/unstructured/metrics/text_extraction.py +++ b/unstructured/metrics/text_extraction.py @@ -192,6 +192,8 @@ def standardize_quotes(text: str) -> str: "〝": "U+301D", # REVERSED DOUBLE PRIME QUOTATION MARK "〞": "U+301E", # DOUBLE PRIME QUOTATION MARK "〟": "U+301F", # LOW DOUBLE PRIME QUOTATION MARK + """: "U+FF02", # FULLWIDTH QUOTATION MARK + ",,": "U+275E", # LOW HEAVY DOUBLE COMMA ORNAMENT } # Single Quotes Dictionary @@ -213,7 +215,6 @@ def standardize_quotes(text: str) -> str: "﹂": "U+FE42", # PRESENTATION FORM FOR VERTICAL RIGHT CORNER BRACKET "﹃": "U+FE43", # PRESENTATION FORM FOR VERTICAL LEFT WHITE CORNER BRACKET "﹄": "U+FE44", # PRESENTATION FORM FOR VERTICAL RIGHT WHITE CORNER BRACKET - """: "U+FF02", # FULLWIDTH QUOTATION MARK "'": "U+FF07", # FULLWIDTH APOSTROPHE "「": "U+FF62", # HALFWIDTH LEFT CORNER BRACKET "」": "U+FF63", # HALFWIDTH RIGHT CORNER BRACKET @@ -225,14 +226,27 @@ def standardize_quotes(text: str) -> str: # Apply double quote replacements # Apply double quote replacements for unicode_val in double_quotes.values(): - unicode_char = chr(int(unicode_val.replace("U+", ""), 16)) + unicode_char = unicode_to_char(unicode_val) if unicode_char in text: text = text.replace(unicode_char, double_quote_standard) # Apply single quote replacements for unicode_val in single_quotes.values(): - unicode_char = chr(int(unicode_val.replace("U+", ""), 16)) + unicode_char = unicode_to_char(unicode_val) if unicode_char in text: text = text.replace(unicode_char, single_quote_standard) return text + + +def unicode_to_char(unicode_val: str) 
-> str: + """ + Converts a Unicode code point written in U+XXXX hexadecimal notation to a character. + + Args: + unicode_val (str): The code point as a hexadecimal string such as 'U+201C'; the 'U+' prefix is stripped before parsing. + + Returns: + str: The character corresponding to that code point. + """ + return chr(int(unicode_val.replace("U+", ""), 16))