mirror of
https://github.com/datahub-project/datahub.git
synced 2025-12-12 18:47:45 +00:00
feat(ingest): improve query fingerprinting (#12104)
This commit is contained in:
parent
4683bc73a3
commit
e730afdb68
@ -1383,8 +1383,7 @@ class SqlParsingAggregator(Closeable):
|
||||
return QueryUrn(query_id).urn()
|
||||
|
||||
@classmethod
|
||||
def _composite_query_id(cls, composed_of_queries: Iterable[QueryId]) -> str:
|
||||
composed_of_queries = list(composed_of_queries)
|
||||
def _composite_query_id(cls, composed_of_queries: List[QueryId]) -> str:
|
||||
combined = json.dumps(composed_of_queries)
|
||||
return f"composite_{generate_hash(combined)}"
|
||||
|
||||
|
||||
@ -121,7 +121,7 @@ _BASIC_NORMALIZATION_RULES = {
|
||||
# Remove /* */ comments.
|
||||
re.compile(r"/\*.*?\*/", re.DOTALL): "",
|
||||
# Remove -- comments.
|
||||
re.compile(r"--.*$"): "",
|
||||
re.compile(r"--.*$", re.MULTILINE): "",
|
||||
# Replace all runs of whitespace with a single space.
|
||||
re.compile(r"\s+"): " ",
|
||||
# Remove leading and trailing whitespace and trailing semicolons.
|
||||
@ -131,10 +131,16 @@ _BASIC_NORMALIZATION_RULES = {
|
||||
# Replace anything that looks like a string with a placeholder.
|
||||
re.compile(r"'[^']*'"): "?",
|
||||
# Replace sequences of IN/VALUES with a single placeholder.
|
||||
re.compile(r"\b(IN|VALUES)\s*\(\?(?:, \?)*\)", re.IGNORECASE): r"\1 (?)",
|
||||
# The r" ?" makes it more robust to uneven spacing.
|
||||
re.compile(r"\b(IN|VALUES)\s*\( ?\?(?:, ?\?)* ?\)", re.IGNORECASE): r"\1 (?)",
|
||||
# Normalize parenthesis spacing.
|
||||
re.compile(r"\( "): "(",
|
||||
re.compile(r" \)"): ")",
|
||||
# Fix up spaces before commas in column lists.
|
||||
# e.g. "col1 , col2" -> "col1, col2"
|
||||
# e.g. "col1,col2" -> "col1, col2"
|
||||
re.compile(r"\b ,"): ",",
|
||||
re.compile(r"\b,\b"): ", ",
|
||||
}
|
||||
_TABLE_NAME_NORMALIZATION_RULES = {
|
||||
# Replace UUID-like strings with a placeholder (both - and _ variants).
|
||||
|
||||
@ -133,7 +133,7 @@
|
||||
},
|
||||
"dataset": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo_staging,PROD)",
|
||||
"type": "TRANSFORMED",
|
||||
"query": "urn:li:query:a30d42497a737321ece461fa17344c3ba3588fdee736016acb59a00cec955a0c"
|
||||
"query": "urn:li:query:88d742bcc0216d6ccb50c7430d1d97494d5dfcfa90160ffa123108844ad261e4"
|
||||
}
|
||||
],
|
||||
"fineGrainedLineages": [
|
||||
@ -147,7 +147,7 @@
|
||||
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD),a)"
|
||||
],
|
||||
"confidenceScore": 1.0,
|
||||
"query": "urn:li:query:a30d42497a737321ece461fa17344c3ba3588fdee736016acb59a00cec955a0c"
|
||||
"query": "urn:li:query:88d742bcc0216d6ccb50c7430d1d97494d5dfcfa90160ffa123108844ad261e4"
|
||||
},
|
||||
{
|
||||
"upstreamType": "FIELD_SET",
|
||||
@ -159,7 +159,7 @@
|
||||
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD),b)"
|
||||
],
|
||||
"confidenceScore": 1.0,
|
||||
"query": "urn:li:query:a30d42497a737321ece461fa17344c3ba3588fdee736016acb59a00cec955a0c"
|
||||
"query": "urn:li:query:88d742bcc0216d6ccb50c7430d1d97494d5dfcfa90160ffa123108844ad261e4"
|
||||
},
|
||||
{
|
||||
"upstreamType": "FIELD_SET",
|
||||
@ -171,7 +171,7 @@
|
||||
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD),c)"
|
||||
],
|
||||
"confidenceScore": 1.0,
|
||||
"query": "urn:li:query:a30d42497a737321ece461fa17344c3ba3588fdee736016acb59a00cec955a0c"
|
||||
"query": "urn:li:query:88d742bcc0216d6ccb50c7430d1d97494d5dfcfa90160ffa123108844ad261e4"
|
||||
}
|
||||
]
|
||||
}
|
||||
@ -179,7 +179,7 @@
|
||||
},
|
||||
{
|
||||
"entityType": "query",
|
||||
"entityUrn": "urn:li:query:a30d42497a737321ece461fa17344c3ba3588fdee736016acb59a00cec955a0c",
|
||||
"entityUrn": "urn:li:query:88d742bcc0216d6ccb50c7430d1d97494d5dfcfa90160ffa123108844ad261e4",
|
||||
"changeType": "UPSERT",
|
||||
"aspectName": "queryProperties",
|
||||
"aspect": {
|
||||
@ -202,7 +202,7 @@
|
||||
},
|
||||
{
|
||||
"entityType": "query",
|
||||
"entityUrn": "urn:li:query:a30d42497a737321ece461fa17344c3ba3588fdee736016acb59a00cec955a0c",
|
||||
"entityUrn": "urn:li:query:88d742bcc0216d6ccb50c7430d1d97494d5dfcfa90160ffa123108844ad261e4",
|
||||
"changeType": "UPSERT",
|
||||
"aspectName": "querySubjects",
|
||||
"aspect": {
|
||||
@ -229,7 +229,7 @@
|
||||
},
|
||||
{
|
||||
"entityType": "query",
|
||||
"entityUrn": "urn:li:query:a30d42497a737321ece461fa17344c3ba3588fdee736016acb59a00cec955a0c",
|
||||
"entityUrn": "urn:li:query:88d742bcc0216d6ccb50c7430d1d97494d5dfcfa90160ffa123108844ad261e4",
|
||||
"changeType": "UPSERT",
|
||||
"aspectName": "dataPlatformInstance",
|
||||
"aspect": {
|
||||
|
||||
@ -133,7 +133,7 @@
|
||||
},
|
||||
"dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info_swap,PROD)",
|
||||
"type": "TRANSFORMED",
|
||||
"query": "urn:li:query:3865108263e5f0670e6506f5747392f8315a72039cbfde1c4be4dd9a71bdd500"
|
||||
"query": "urn:li:query:b256c8cc8f386b209ef8da55485d46c3fbd471b942f804d370e24350b3087405"
|
||||
}
|
||||
],
|
||||
"fineGrainedLineages": [
|
||||
@ -147,7 +147,7 @@
|
||||
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info,PROD),a)"
|
||||
],
|
||||
"confidenceScore": 1.0,
|
||||
"query": "urn:li:query:3865108263e5f0670e6506f5747392f8315a72039cbfde1c4be4dd9a71bdd500"
|
||||
"query": "urn:li:query:b256c8cc8f386b209ef8da55485d46c3fbd471b942f804d370e24350b3087405"
|
||||
},
|
||||
{
|
||||
"upstreamType": "FIELD_SET",
|
||||
@ -159,7 +159,7 @@
|
||||
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info,PROD),b)"
|
||||
],
|
||||
"confidenceScore": 1.0,
|
||||
"query": "urn:li:query:3865108263e5f0670e6506f5747392f8315a72039cbfde1c4be4dd9a71bdd500"
|
||||
"query": "urn:li:query:b256c8cc8f386b209ef8da55485d46c3fbd471b942f804d370e24350b3087405"
|
||||
},
|
||||
{
|
||||
"upstreamType": "FIELD_SET",
|
||||
@ -171,7 +171,7 @@
|
||||
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info,PROD),c)"
|
||||
],
|
||||
"confidenceScore": 1.0,
|
||||
"query": "urn:li:query:3865108263e5f0670e6506f5747392f8315a72039cbfde1c4be4dd9a71bdd500"
|
||||
"query": "urn:li:query:b256c8cc8f386b209ef8da55485d46c3fbd471b942f804d370e24350b3087405"
|
||||
}
|
||||
]
|
||||
}
|
||||
@ -179,7 +179,7 @@
|
||||
},
|
||||
{
|
||||
"entityType": "query",
|
||||
"entityUrn": "urn:li:query:3865108263e5f0670e6506f5747392f8315a72039cbfde1c4be4dd9a71bdd500",
|
||||
"entityUrn": "urn:li:query:b256c8cc8f386b209ef8da55485d46c3fbd471b942f804d370e24350b3087405",
|
||||
"changeType": "UPSERT",
|
||||
"aspectName": "queryProperties",
|
||||
"aspect": {
|
||||
@ -202,7 +202,7 @@
|
||||
},
|
||||
{
|
||||
"entityType": "query",
|
||||
"entityUrn": "urn:li:query:3865108263e5f0670e6506f5747392f8315a72039cbfde1c4be4dd9a71bdd500",
|
||||
"entityUrn": "urn:li:query:b256c8cc8f386b209ef8da55485d46c3fbd471b942f804d370e24350b3087405",
|
||||
"changeType": "UPSERT",
|
||||
"aspectName": "querySubjects",
|
||||
"aspect": {
|
||||
@ -229,7 +229,7 @@
|
||||
},
|
||||
{
|
||||
"entityType": "query",
|
||||
"entityUrn": "urn:li:query:3865108263e5f0670e6506f5747392f8315a72039cbfde1c4be4dd9a71bdd500",
|
||||
"entityUrn": "urn:li:query:b256c8cc8f386b209ef8da55485d46c3fbd471b942f804d370e24350b3087405",
|
||||
"changeType": "UPSERT",
|
||||
"aspectName": "dataPlatformInstance",
|
||||
"aspect": {
|
||||
@ -411,7 +411,7 @@
|
||||
},
|
||||
"dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info,PROD)",
|
||||
"type": "TRANSFORMED",
|
||||
"query": "urn:li:query:d29a1c8ed6d4d77efb290260234e5eee56f98311a5631d0a12213798077d1a68"
|
||||
"query": "urn:li:query:3886d427c84692923797048da6d3991693e89ce44e10d1917c12e8b6fd493904"
|
||||
},
|
||||
{
|
||||
"auditStamp": {
|
||||
@ -432,7 +432,7 @@
|
||||
},
|
||||
{
|
||||
"entityType": "query",
|
||||
"entityUrn": "urn:li:query:d29a1c8ed6d4d77efb290260234e5eee56f98311a5631d0a12213798077d1a68",
|
||||
"entityUrn": "urn:li:query:3886d427c84692923797048da6d3991693e89ce44e10d1917c12e8b6fd493904",
|
||||
"changeType": "UPSERT",
|
||||
"aspectName": "queryProperties",
|
||||
"aspect": {
|
||||
@ -455,7 +455,7 @@
|
||||
},
|
||||
{
|
||||
"entityType": "query",
|
||||
"entityUrn": "urn:li:query:d29a1c8ed6d4d77efb290260234e5eee56f98311a5631d0a12213798077d1a68",
|
||||
"entityUrn": "urn:li:query:3886d427c84692923797048da6d3991693e89ce44e10d1917c12e8b6fd493904",
|
||||
"changeType": "UPSERT",
|
||||
"aspectName": "querySubjects",
|
||||
"aspect": {
|
||||
@ -473,7 +473,7 @@
|
||||
},
|
||||
{
|
||||
"entityType": "query",
|
||||
"entityUrn": "urn:li:query:d29a1c8ed6d4d77efb290260234e5eee56f98311a5631d0a12213798077d1a68",
|
||||
"entityUrn": "urn:li:query:3886d427c84692923797048da6d3991693e89ce44e10d1917c12e8b6fd493904",
|
||||
"changeType": "UPSERT",
|
||||
"aspectName": "dataPlatformInstance",
|
||||
"aspect": {
|
||||
|
||||
@ -133,7 +133,7 @@
|
||||
},
|
||||
"dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info,PROD)",
|
||||
"type": "TRANSFORMED",
|
||||
"query": "urn:li:query:composite_5f9c1232994672c5fb7621f8384f6600b6d4ed5acfccc4eb396fb446b3fb1bce"
|
||||
"query": "urn:li:query:composite_9e36ef19163461d35b618fd1eea2a3f6a5d10a23a979a6d5ef688b31f277abb3"
|
||||
},
|
||||
{
|
||||
"auditStamp": {
|
||||
@ -146,7 +146,7 @@
|
||||
},
|
||||
"dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info_dep,PROD)",
|
||||
"type": "TRANSFORMED",
|
||||
"query": "urn:li:query:composite_5f9c1232994672c5fb7621f8384f6600b6d4ed5acfccc4eb396fb446b3fb1bce"
|
||||
"query": "urn:li:query:composite_9e36ef19163461d35b618fd1eea2a3f6a5d10a23a979a6d5ef688b31f277abb3"
|
||||
}
|
||||
],
|
||||
"fineGrainedLineages": [
|
||||
@ -161,7 +161,7 @@
|
||||
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info,PROD),a)"
|
||||
],
|
||||
"confidenceScore": 1.0,
|
||||
"query": "urn:li:query:composite_5f9c1232994672c5fb7621f8384f6600b6d4ed5acfccc4eb396fb446b3fb1bce"
|
||||
"query": "urn:li:query:composite_9e36ef19163461d35b618fd1eea2a3f6a5d10a23a979a6d5ef688b31f277abb3"
|
||||
}
|
||||
]
|
||||
}
|
||||
@ -169,7 +169,7 @@
|
||||
},
|
||||
{
|
||||
"entityType": "query",
|
||||
"entityUrn": "urn:li:query:composite_5f9c1232994672c5fb7621f8384f6600b6d4ed5acfccc4eb396fb446b3fb1bce",
|
||||
"entityUrn": "urn:li:query:composite_9e36ef19163461d35b618fd1eea2a3f6a5d10a23a979a6d5ef688b31f277abb3",
|
||||
"changeType": "UPSERT",
|
||||
"aspectName": "queryProperties",
|
||||
"aspect": {
|
||||
@ -192,7 +192,7 @@
|
||||
},
|
||||
{
|
||||
"entityType": "query",
|
||||
"entityUrn": "urn:li:query:composite_5f9c1232994672c5fb7621f8384f6600b6d4ed5acfccc4eb396fb446b3fb1bce",
|
||||
"entityUrn": "urn:li:query:composite_9e36ef19163461d35b618fd1eea2a3f6a5d10a23a979a6d5ef688b31f277abb3",
|
||||
"changeType": "UPSERT",
|
||||
"aspectName": "querySubjects",
|
||||
"aspect": {
|
||||
@ -219,7 +219,7 @@
|
||||
},
|
||||
{
|
||||
"entityType": "query",
|
||||
"entityUrn": "urn:li:query:composite_5f9c1232994672c5fb7621f8384f6600b6d4ed5acfccc4eb396fb446b3fb1bce",
|
||||
"entityUrn": "urn:li:query:composite_9e36ef19163461d35b618fd1eea2a3f6a5d10a23a979a6d5ef688b31f277abb3",
|
||||
"changeType": "UPSERT",
|
||||
"aspectName": "dataPlatformInstance",
|
||||
"aspect": {
|
||||
|
||||
@ -73,6 +73,12 @@ class QueryGeneralizationTestMode(Enum):
|
||||
"SELECT * FROM foo",
|
||||
QueryGeneralizationTestMode.BOTH,
|
||||
),
|
||||
(
|
||||
"SELECT a\n -- comment--\n,b --another comment\n FROM books",
|
||||
"redshift",
|
||||
"SELECT a, b FROM books",
|
||||
QueryGeneralizationTestMode.BOTH,
|
||||
),
|
||||
# Parameter normalization.
|
||||
(
|
||||
"UPDATE \"books\" SET page_count = page_count + 1, author_count = author_count + 1 WHERE book_title = 'My New Book'",
|
||||
@ -105,6 +111,21 @@ class QueryGeneralizationTestMode(Enum):
|
||||
"INSERT INTO MyTable (Column1, Column2, Column3) VALUES (?)",
|
||||
QueryGeneralizationTestMode.BOTH,
|
||||
),
|
||||
(
|
||||
# Uneven spacing within the IN clause.
|
||||
"SELECT * FROM books WHERE zip_code IN (123,345, 423 )",
|
||||
"redshift",
|
||||
"SELECT * FROM books WHERE zip_code IN (?)",
|
||||
QueryGeneralizationTestMode.BOTH,
|
||||
),
|
||||
# Uneven spacing in the column list.
|
||||
# This isn't perfect e.g. we still have issues with function calls inside selects.
|
||||
(
|
||||
"SELECT a\n ,b FROM books",
|
||||
"redshift",
|
||||
"SELECT a, b FROM books",
|
||||
QueryGeneralizationTestMode.BOTH,
|
||||
),
|
||||
(
|
||||
textwrap.dedent(
|
||||
"""\
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user