Fixed clean_query method to handle line breaks (\n) (#11389)

* Fixed clean query method

* Fixed regex and tests

* Updated regex
This commit is contained in:
Onkar Ravgan 2023-05-03 18:08:54 +05:30 committed by GitHub
parent 4ba4bd76c8
commit 7e9c02fe6f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 7 additions and 9 deletions

View File

@ -345,15 +345,13 @@ class LineageParser:
replace_by=" ", # remove it as it does not add any value to lineage
)
query_no_linebreaks = insensitive_replace(
raw_str=clean_query.strip(),
to_replace="\n", # remove line breaks
replace_by=" ",
)
clean_query = clean_query.replace("\\n", "\n")
if insensitive_match(query_no_linebreaks, ".*merge into .*when matched.*"):
if insensitive_match(
clean_query, r"\s*/\*.*?\*/\s*merge.*into.*?when matched.*?"
):
clean_query = insensitive_replace(
raw_str=query_no_linebreaks,
raw_str=clean_query,
to_replace="when matched.*", # merge into queries specific
replace_by="", # remove it as LineageRunner is not able to perform the lineage
)

View File

@ -292,7 +292,7 @@ def insensitive_replace(raw_str: str, to_replace: str, replace_by: str) -> str:
A string where the given to_replace is replaced by replace_by in raw_str, ignoring case
"""
return re.sub(to_replace, replace_by, raw_str, flags=re.IGNORECASE)
return re.sub(to_replace, replace_by, raw_str, flags=re.IGNORECASE | re.DOTALL)
def insensitive_match(raw_str: str, to_match: str) -> bool:
@ -306,7 +306,7 @@ def insensitive_match(raw_str: str, to_match: str) -> bool:
True if `to_match` matches in `raw_str`, ignoring case. Otherwise, false.
"""
return re.match(to_match, raw_str, flags=re.IGNORECASE) is not None
return re.match(to_match, raw_str, flags=re.IGNORECASE | re.DOTALL) is not None
def get_entity_tier_from_tags(tags: list[TagLabel]) -> Optional[str]: