feat(ingest): improve query fingerprinting (#12104)

This commit is contained in:
Harshal Sheth 2024-12-12 16:51:18 -05:00 committed by GitHub
parent 4683bc73a3
commit e730afdb68
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 54 additions and 28 deletions

View File

@ -1383,8 +1383,7 @@ class SqlParsingAggregator(Closeable):
return QueryUrn(query_id).urn() return QueryUrn(query_id).urn()
@classmethod @classmethod
def _composite_query_id(cls, composed_of_queries: Iterable[QueryId]) -> str: def _composite_query_id(cls, composed_of_queries: List[QueryId]) -> str:
composed_of_queries = list(composed_of_queries)
combined = json.dumps(composed_of_queries) combined = json.dumps(composed_of_queries)
return f"composite_{generate_hash(combined)}" return f"composite_{generate_hash(combined)}"

View File

@ -121,7 +121,7 @@ _BASIC_NORMALIZATION_RULES = {
# Remove /* */ comments. # Remove /* */ comments.
re.compile(r"/\*.*?\*/", re.DOTALL): "", re.compile(r"/\*.*?\*/", re.DOTALL): "",
# Remove -- comments. # Remove -- comments.
re.compile(r"--.*$"): "", re.compile(r"--.*$", re.MULTILINE): "",
# Replace all runs of whitespace with a single space. # Replace all runs of whitespace with a single space.
re.compile(r"\s+"): " ", re.compile(r"\s+"): " ",
# Remove leading and trailing whitespace and trailing semicolons. # Remove leading and trailing whitespace and trailing semicolons.
@ -131,10 +131,16 @@ _BASIC_NORMALIZATION_RULES = {
# Replace anything that looks like a string with a placeholder. # Replace anything that looks like a string with a placeholder.
re.compile(r"'[^']*'"): "?", re.compile(r"'[^']*'"): "?",
# Replace sequences of IN/VALUES with a single placeholder. # Replace sequences of IN/VALUES with a single placeholder.
re.compile(r"\b(IN|VALUES)\s*\(\?(?:, \?)*\)", re.IGNORECASE): r"\1 (?)", # The r" ?" makes it more robust to uneven spacing.
re.compile(r"\b(IN|VALUES)\s*\( ?\?(?:, ?\?)* ?\)", re.IGNORECASE): r"\1 (?)",
# Normalize parenthesis spacing. # Normalize parenthesis spacing.
re.compile(r"\( "): "(", re.compile(r"\( "): "(",
re.compile(r" \)"): ")", re.compile(r" \)"): ")",
# Normalize comma spacing in column lists: drop a space before a comma and add a missing space after it.
# e.g. "col1 , col2" -> "col1, col2"
# e.g. "col1,col2" -> "col1, col2"
re.compile(r"\b ,"): ",",
re.compile(r"\b,\b"): ", ",
} }
_TABLE_NAME_NORMALIZATION_RULES = { _TABLE_NAME_NORMALIZATION_RULES = {
# Replace UUID-like strings with a placeholder (both - and _ variants). # Replace UUID-like strings with a placeholder (both - and _ variants).

View File

@ -133,7 +133,7 @@
}, },
"dataset": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo_staging,PROD)", "dataset": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo_staging,PROD)",
"type": "TRANSFORMED", "type": "TRANSFORMED",
"query": "urn:li:query:a30d42497a737321ece461fa17344c3ba3588fdee736016acb59a00cec955a0c" "query": "urn:li:query:88d742bcc0216d6ccb50c7430d1d97494d5dfcfa90160ffa123108844ad261e4"
} }
], ],
"fineGrainedLineages": [ "fineGrainedLineages": [
@ -147,7 +147,7 @@
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD),a)" "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD),a)"
], ],
"confidenceScore": 1.0, "confidenceScore": 1.0,
"query": "urn:li:query:a30d42497a737321ece461fa17344c3ba3588fdee736016acb59a00cec955a0c" "query": "urn:li:query:88d742bcc0216d6ccb50c7430d1d97494d5dfcfa90160ffa123108844ad261e4"
}, },
{ {
"upstreamType": "FIELD_SET", "upstreamType": "FIELD_SET",
@ -159,7 +159,7 @@
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD),b)" "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD),b)"
], ],
"confidenceScore": 1.0, "confidenceScore": 1.0,
"query": "urn:li:query:a30d42497a737321ece461fa17344c3ba3588fdee736016acb59a00cec955a0c" "query": "urn:li:query:88d742bcc0216d6ccb50c7430d1d97494d5dfcfa90160ffa123108844ad261e4"
}, },
{ {
"upstreamType": "FIELD_SET", "upstreamType": "FIELD_SET",
@ -171,7 +171,7 @@
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD),c)" "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD),c)"
], ],
"confidenceScore": 1.0, "confidenceScore": 1.0,
"query": "urn:li:query:a30d42497a737321ece461fa17344c3ba3588fdee736016acb59a00cec955a0c" "query": "urn:li:query:88d742bcc0216d6ccb50c7430d1d97494d5dfcfa90160ffa123108844ad261e4"
} }
] ]
} }
@ -179,7 +179,7 @@
}, },
{ {
"entityType": "query", "entityType": "query",
"entityUrn": "urn:li:query:a30d42497a737321ece461fa17344c3ba3588fdee736016acb59a00cec955a0c", "entityUrn": "urn:li:query:88d742bcc0216d6ccb50c7430d1d97494d5dfcfa90160ffa123108844ad261e4",
"changeType": "UPSERT", "changeType": "UPSERT",
"aspectName": "queryProperties", "aspectName": "queryProperties",
"aspect": { "aspect": {
@ -202,7 +202,7 @@
}, },
{ {
"entityType": "query", "entityType": "query",
"entityUrn": "urn:li:query:a30d42497a737321ece461fa17344c3ba3588fdee736016acb59a00cec955a0c", "entityUrn": "urn:li:query:88d742bcc0216d6ccb50c7430d1d97494d5dfcfa90160ffa123108844ad261e4",
"changeType": "UPSERT", "changeType": "UPSERT",
"aspectName": "querySubjects", "aspectName": "querySubjects",
"aspect": { "aspect": {
@ -229,7 +229,7 @@
}, },
{ {
"entityType": "query", "entityType": "query",
"entityUrn": "urn:li:query:a30d42497a737321ece461fa17344c3ba3588fdee736016acb59a00cec955a0c", "entityUrn": "urn:li:query:88d742bcc0216d6ccb50c7430d1d97494d5dfcfa90160ffa123108844ad261e4",
"changeType": "UPSERT", "changeType": "UPSERT",
"aspectName": "dataPlatformInstance", "aspectName": "dataPlatformInstance",
"aspect": { "aspect": {

View File

@ -133,7 +133,7 @@
}, },
"dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info_swap,PROD)", "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info_swap,PROD)",
"type": "TRANSFORMED", "type": "TRANSFORMED",
"query": "urn:li:query:3865108263e5f0670e6506f5747392f8315a72039cbfde1c4be4dd9a71bdd500" "query": "urn:li:query:b256c8cc8f386b209ef8da55485d46c3fbd471b942f804d370e24350b3087405"
} }
], ],
"fineGrainedLineages": [ "fineGrainedLineages": [
@ -147,7 +147,7 @@
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info,PROD),a)" "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info,PROD),a)"
], ],
"confidenceScore": 1.0, "confidenceScore": 1.0,
"query": "urn:li:query:3865108263e5f0670e6506f5747392f8315a72039cbfde1c4be4dd9a71bdd500" "query": "urn:li:query:b256c8cc8f386b209ef8da55485d46c3fbd471b942f804d370e24350b3087405"
}, },
{ {
"upstreamType": "FIELD_SET", "upstreamType": "FIELD_SET",
@ -159,7 +159,7 @@
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info,PROD),b)" "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info,PROD),b)"
], ],
"confidenceScore": 1.0, "confidenceScore": 1.0,
"query": "urn:li:query:3865108263e5f0670e6506f5747392f8315a72039cbfde1c4be4dd9a71bdd500" "query": "urn:li:query:b256c8cc8f386b209ef8da55485d46c3fbd471b942f804d370e24350b3087405"
}, },
{ {
"upstreamType": "FIELD_SET", "upstreamType": "FIELD_SET",
@ -171,7 +171,7 @@
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info,PROD),c)" "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info,PROD),c)"
], ],
"confidenceScore": 1.0, "confidenceScore": 1.0,
"query": "urn:li:query:3865108263e5f0670e6506f5747392f8315a72039cbfde1c4be4dd9a71bdd500" "query": "urn:li:query:b256c8cc8f386b209ef8da55485d46c3fbd471b942f804d370e24350b3087405"
} }
] ]
} }
@ -179,7 +179,7 @@
}, },
{ {
"entityType": "query", "entityType": "query",
"entityUrn": "urn:li:query:3865108263e5f0670e6506f5747392f8315a72039cbfde1c4be4dd9a71bdd500", "entityUrn": "urn:li:query:b256c8cc8f386b209ef8da55485d46c3fbd471b942f804d370e24350b3087405",
"changeType": "UPSERT", "changeType": "UPSERT",
"aspectName": "queryProperties", "aspectName": "queryProperties",
"aspect": { "aspect": {
@ -202,7 +202,7 @@
}, },
{ {
"entityType": "query", "entityType": "query",
"entityUrn": "urn:li:query:3865108263e5f0670e6506f5747392f8315a72039cbfde1c4be4dd9a71bdd500", "entityUrn": "urn:li:query:b256c8cc8f386b209ef8da55485d46c3fbd471b942f804d370e24350b3087405",
"changeType": "UPSERT", "changeType": "UPSERT",
"aspectName": "querySubjects", "aspectName": "querySubjects",
"aspect": { "aspect": {
@ -229,7 +229,7 @@
}, },
{ {
"entityType": "query", "entityType": "query",
"entityUrn": "urn:li:query:3865108263e5f0670e6506f5747392f8315a72039cbfde1c4be4dd9a71bdd500", "entityUrn": "urn:li:query:b256c8cc8f386b209ef8da55485d46c3fbd471b942f804d370e24350b3087405",
"changeType": "UPSERT", "changeType": "UPSERT",
"aspectName": "dataPlatformInstance", "aspectName": "dataPlatformInstance",
"aspect": { "aspect": {
@ -411,7 +411,7 @@
}, },
"dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info,PROD)", "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info,PROD)",
"type": "TRANSFORMED", "type": "TRANSFORMED",
"query": "urn:li:query:d29a1c8ed6d4d77efb290260234e5eee56f98311a5631d0a12213798077d1a68" "query": "urn:li:query:3886d427c84692923797048da6d3991693e89ce44e10d1917c12e8b6fd493904"
}, },
{ {
"auditStamp": { "auditStamp": {
@ -432,7 +432,7 @@
}, },
{ {
"entityType": "query", "entityType": "query",
"entityUrn": "urn:li:query:d29a1c8ed6d4d77efb290260234e5eee56f98311a5631d0a12213798077d1a68", "entityUrn": "urn:li:query:3886d427c84692923797048da6d3991693e89ce44e10d1917c12e8b6fd493904",
"changeType": "UPSERT", "changeType": "UPSERT",
"aspectName": "queryProperties", "aspectName": "queryProperties",
"aspect": { "aspect": {
@ -455,7 +455,7 @@
}, },
{ {
"entityType": "query", "entityType": "query",
"entityUrn": "urn:li:query:d29a1c8ed6d4d77efb290260234e5eee56f98311a5631d0a12213798077d1a68", "entityUrn": "urn:li:query:3886d427c84692923797048da6d3991693e89ce44e10d1917c12e8b6fd493904",
"changeType": "UPSERT", "changeType": "UPSERT",
"aspectName": "querySubjects", "aspectName": "querySubjects",
"aspect": { "aspect": {
@ -473,7 +473,7 @@
}, },
{ {
"entityType": "query", "entityType": "query",
"entityUrn": "urn:li:query:d29a1c8ed6d4d77efb290260234e5eee56f98311a5631d0a12213798077d1a68", "entityUrn": "urn:li:query:3886d427c84692923797048da6d3991693e89ce44e10d1917c12e8b6fd493904",
"changeType": "UPSERT", "changeType": "UPSERT",
"aspectName": "dataPlatformInstance", "aspectName": "dataPlatformInstance",
"aspect": { "aspect": {

View File

@ -133,7 +133,7 @@
}, },
"dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info,PROD)", "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info,PROD)",
"type": "TRANSFORMED", "type": "TRANSFORMED",
"query": "urn:li:query:composite_5f9c1232994672c5fb7621f8384f6600b6d4ed5acfccc4eb396fb446b3fb1bce" "query": "urn:li:query:composite_9e36ef19163461d35b618fd1eea2a3f6a5d10a23a979a6d5ef688b31f277abb3"
}, },
{ {
"auditStamp": { "auditStamp": {
@ -146,7 +146,7 @@
}, },
"dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info_dep,PROD)", "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info_dep,PROD)",
"type": "TRANSFORMED", "type": "TRANSFORMED",
"query": "urn:li:query:composite_5f9c1232994672c5fb7621f8384f6600b6d4ed5acfccc4eb396fb446b3fb1bce" "query": "urn:li:query:composite_9e36ef19163461d35b618fd1eea2a3f6a5d10a23a979a6d5ef688b31f277abb3"
} }
], ],
"fineGrainedLineages": [ "fineGrainedLineages": [
@ -161,7 +161,7 @@
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info,PROD),a)" "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,dev.public.person_info,PROD),a)"
], ],
"confidenceScore": 1.0, "confidenceScore": 1.0,
"query": "urn:li:query:composite_5f9c1232994672c5fb7621f8384f6600b6d4ed5acfccc4eb396fb446b3fb1bce" "query": "urn:li:query:composite_9e36ef19163461d35b618fd1eea2a3f6a5d10a23a979a6d5ef688b31f277abb3"
} }
] ]
} }
@ -169,7 +169,7 @@
}, },
{ {
"entityType": "query", "entityType": "query",
"entityUrn": "urn:li:query:composite_5f9c1232994672c5fb7621f8384f6600b6d4ed5acfccc4eb396fb446b3fb1bce", "entityUrn": "urn:li:query:composite_9e36ef19163461d35b618fd1eea2a3f6a5d10a23a979a6d5ef688b31f277abb3",
"changeType": "UPSERT", "changeType": "UPSERT",
"aspectName": "queryProperties", "aspectName": "queryProperties",
"aspect": { "aspect": {
@ -192,7 +192,7 @@
}, },
{ {
"entityType": "query", "entityType": "query",
"entityUrn": "urn:li:query:composite_5f9c1232994672c5fb7621f8384f6600b6d4ed5acfccc4eb396fb446b3fb1bce", "entityUrn": "urn:li:query:composite_9e36ef19163461d35b618fd1eea2a3f6a5d10a23a979a6d5ef688b31f277abb3",
"changeType": "UPSERT", "changeType": "UPSERT",
"aspectName": "querySubjects", "aspectName": "querySubjects",
"aspect": { "aspect": {
@ -219,7 +219,7 @@
}, },
{ {
"entityType": "query", "entityType": "query",
"entityUrn": "urn:li:query:composite_5f9c1232994672c5fb7621f8384f6600b6d4ed5acfccc4eb396fb446b3fb1bce", "entityUrn": "urn:li:query:composite_9e36ef19163461d35b618fd1eea2a3f6a5d10a23a979a6d5ef688b31f277abb3",
"changeType": "UPSERT", "changeType": "UPSERT",
"aspectName": "dataPlatformInstance", "aspectName": "dataPlatformInstance",
"aspect": { "aspect": {

View File

@ -73,6 +73,12 @@ class QueryGeneralizationTestMode(Enum):
"SELECT * FROM foo", "SELECT * FROM foo",
QueryGeneralizationTestMode.BOTH, QueryGeneralizationTestMode.BOTH,
), ),
(
"SELECT a\n -- comment--\n,b --another comment\n FROM books",
"redshift",
"SELECT a, b FROM books",
QueryGeneralizationTestMode.BOTH,
),
# Parameter normalization. # Parameter normalization.
( (
"UPDATE \"books\" SET page_count = page_count + 1, author_count = author_count + 1 WHERE book_title = 'My New Book'", "UPDATE \"books\" SET page_count = page_count + 1, author_count = author_count + 1 WHERE book_title = 'My New Book'",
@ -105,6 +111,21 @@ class QueryGeneralizationTestMode(Enum):
"INSERT INTO MyTable (Column1, Column2, Column3) VALUES (?)", "INSERT INTO MyTable (Column1, Column2, Column3) VALUES (?)",
QueryGeneralizationTestMode.BOTH, QueryGeneralizationTestMode.BOTH,
), ),
(
# Uneven spacing within the IN clause.
"SELECT * FROM books WHERE zip_code IN (123,345, 423 )",
"redshift",
"SELECT * FROM books WHERE zip_code IN (?)",
QueryGeneralizationTestMode.BOTH,
),
# Uneven spacing in the column list.
# This isn't perfect, e.g. we still have issues with function calls inside selects.
(
"SELECT a\n ,b FROM books",
"redshift",
"SELECT a, b FROM books",
QueryGeneralizationTestMode.BOTH,
),
( (
textwrap.dedent( textwrap.dedent(
"""\ """\