mirror of
https://github.com/datahub-project/datahub.git
synced 2025-08-19 22:58:05 +00:00
fix(ingest/bigquery): changes helper function to decode unicode escape sequences (#10845)
This commit is contained in:
parent
1565fb0102
commit
4b83adfa9f
@ -10,14 +10,27 @@ def unquote_and_decode_unicode_escape_seq(
|
|||||||
"""
|
"""
|
||||||
If string starts and ends with a quote, unquote it and decode Unicode escape sequences
|
If string starts and ends with a quote, unquote it and decode Unicode escape sequences
|
||||||
"""
|
"""
|
||||||
|
unicode_seq_pattern = re.compile(r"\\(u|U)[0-9a-fA-F]{4}")
|
||||||
trailing_quote = trailing_quote if trailing_quote else leading_quote
|
trailing_quote = trailing_quote if trailing_quote else leading_quote
|
||||||
|
|
||||||
if string.startswith(leading_quote) and string.endswith(trailing_quote):
|
if string.startswith(leading_quote) and string.endswith(trailing_quote):
|
||||||
string = string[1:-1]
|
string = string[1:-1]
|
||||||
|
|
||||||
cleaned_string = string.encode().decode("unicode-escape")
|
# Decode Unicode escape sequences. This avoids issues with encoding
|
||||||
|
# This process does not handle unicode from "\U00010000" to "\U0010FFFF"
|
||||||
return cleaned_string
|
while unicode_seq_pattern.search(string):
|
||||||
|
# Get the first Unicode escape sequence.
|
||||||
|
# mypy: unicode_seq_pattern.search(string) is not None because of the while loop
|
||||||
|
unicode_seq = unicode_seq_pattern.search(string).group(0) # type: ignore
|
||||||
|
# Replace the Unicode escape sequence with the decoded character
|
||||||
|
try:
|
||||||
|
string = string.replace(
|
||||||
|
unicode_seq, unicode_seq.encode("utf-8").decode("unicode-escape")
|
||||||
|
)
|
||||||
|
except UnicodeDecodeError:
|
||||||
|
# Skip decoding if it is not possible to decode the Unicode escape sequence
|
||||||
|
break # avoid infinite loop
|
||||||
|
return string
|
||||||
|
|
||||||
|
|
||||||
def parse_labels(labels_str: str) -> Dict[str, str]:
|
def parse_labels(labels_str: str) -> Dict[str, str]:
|
||||||
|
@ -212,3 +212,29 @@ def test_unquote_and_decode_unicode_escape_seq():
|
|||||||
expected_output = "No escape sequences here"
|
expected_output = "No escape sequences here"
|
||||||
result = unquote_and_decode_unicode_escape_seq(input_string)
|
result = unquote_and_decode_unicode_escape_seq(input_string)
|
||||||
assert result == expected_output
|
assert result == expected_output
|
||||||
|
|
||||||
|
# Test with invalid Unicode escape sequences
|
||||||
|
input_string = '"No escape \\u123 sequences here"'
|
||||||
|
expected_output = "No escape \\u123 sequences here"
|
||||||
|
result = unquote_and_decode_unicode_escape_seq(input_string)
|
||||||
|
assert result == expected_output
|
||||||
|
|
||||||
|
# Test with a string that has multiple Unicode escape sequences
|
||||||
|
input_string = '"Hello \\u003cWorld\\u003e \\u003cAgain\\u003e \\u003cAgain\\u003e \\u003cAgain\\u003e"'
|
||||||
|
expected_output = "Hello <World> <Again> <Again> <Again>"
|
||||||
|
result = unquote_and_decode_unicode_escape_seq(input_string)
|
||||||
|
assert result == expected_output
|
||||||
|
|
||||||
|
# Test with a string that has a "\u" prefix not followed by hexadecimal digits
|
||||||
|
input_string = '"Hello \\utest"'
|
||||||
|
expected_output = "Hello \\utest"
|
||||||
|
result = unquote_and_decode_unicode_escape_seq(input_string)
|
||||||
|
assert result == expected_output
|
||||||
|
|
||||||
|
# Test with special characters
|
||||||
|
input_string = (
|
||||||
|
'"Hello \\u003cWorld\\u003e \\u003cçãâÁÁà|{}()[].,/;\\+=--_*&%$#@!?\\u003e"'
|
||||||
|
)
|
||||||
|
expected_output = "Hello <World> <çãâÁÁà|{}()[].,/;\\+=--_*&%$#@!?>"
|
||||||
|
result = unquote_and_decode_unicode_escape_seq(input_string)
|
||||||
|
assert result == expected_output
|
||||||
|
Loading…
x
Reference in New Issue
Block a user