mirror of
https://github.com/datahub-project/datahub.git
synced 2025-09-25 09:00:50 +00:00
fix(tool_meta_extractor): relax hex query detection to search entire query text (#14582)
This commit is contained in:
parent
fe8f108746
commit
67a441f312
@ -208,9 +208,7 @@ class ToolMetaExtractor:
|
||||
Returns:
|
||||
bool: whether QueryLog entry is that of hex.
|
||||
"""
|
||||
last_line = _get_last_line(entry.query_text)
|
||||
|
||||
if not last_line.startswith("-- Hex query metadata:"):
|
||||
if "-- Hex query metadata:" not in entry.query_text:
|
||||
return False
|
||||
|
||||
entry.origin = HEX_PLATFORM_URN
|
||||
|
@ -100,6 +100,114 @@ limit 100
|
||||
assert extractor.report.num_queries_meta_extracted["hex"] == 1
|
||||
|
||||
|
||||
def test_extract_hex_metadata_single_line_at_end() -> None:
    """Hex metadata trailing a one-line query is recognised.

    The marker comment shares its line with the SQL statement, so the
    query's only line does not start with the marker.
    """
    tool_extractor = ToolMetaExtractor(report=ToolMetaExtractorReport())
    sql_text = 'SELECT * FROM "LONG_TAIL_COMPANIONS"."ANALYTICS"."PET_DETAILS" LIMIT 100 -- Hex query metadata: {"user": "alice@mail.com"}'

    query_entry = PreparsedQuery(
        query_id=None,
        query_text=sql_text,
        upstreams=[],
        downstream=None,
        column_lineage=None,
        column_usage=None,
        inferred_schema=None,
        user=CorpUserUrn("hexuser"),
        timestamp=parse_absolute_time("2021-08-01T01:02:03Z"),
    )

    was_extracted = tool_extractor.extract_bi_metadata(query_entry)
    assert was_extracted

    # The origin must be a data platform URN, and specifically the Hex one.
    assert isinstance(query_entry.origin, DataPlatformUrn)
    assert query_entry.origin == Urn.from_string("urn:li:dataPlatform:hex")

    hex_count = tool_extractor.report.num_queries_meta_extracted["hex"]
    assert hex_count == 1
|
||||
|
||||
|
||||
def test_extract_hex_metadata_in_middle() -> None:
    """Hex metadata placed between SQL lines is recognised.

    The marker comment sits before the final ``limit`` clause, so it is
    not on the last line of the query text.
    """
    tool_extractor = ToolMetaExtractor(report=ToolMetaExtractorReport())
    sql_text = """\
select *
from "LONG_TAIL_COMPANIONS"."ANALYTICS"."PET_DETAILS"
-- Hex query metadata: {"user": "alice@mail.com"}
limit 100"""

    query_entry = PreparsedQuery(
        query_id=None,
        query_text=sql_text,
        upstreams=[],
        downstream=None,
        column_lineage=None,
        column_usage=None,
        inferred_schema=None,
        user=CorpUserUrn("hexuser"),
        timestamp=parse_absolute_time("2021-08-01T01:02:03Z"),
    )

    was_extracted = tool_extractor.extract_bi_metadata(query_entry)
    assert was_extracted

    # The origin must be a data platform URN, and specifically the Hex one.
    assert isinstance(query_entry.origin, DataPlatformUrn)
    assert query_entry.origin == Urn.from_string("urn:li:dataPlatform:hex")

    hex_count = tool_extractor.report.num_queries_meta_extracted["hex"]
    assert hex_count == 1
|
||||
|
||||
|
||||
def test_extract_hex_metadata_multiline() -> None:
    """Hex metadata inside a multi-line CTE statement is recognised.

    The marker comment is embedded in the body of the CTE, several lines
    before the end of the query text.
    """
    tool_extractor = ToolMetaExtractor(report=ToolMetaExtractorReport())
    sql_text = """\
CREATE TABLE test AS
WITH cte AS (
SELECT col1, col2
FROM source_table
-- Hex query metadata: {"user": "alice@mail.com"}
WHERE date_col > '2023-01-01'
)
SELECT * FROM cte"""

    query_entry = PreparsedQuery(
        query_id=None,
        query_text=sql_text,
        upstreams=[],
        downstream=None,
        column_lineage=None,
        column_usage=None,
        inferred_schema=None,
        user=CorpUserUrn("hexuser"),
        timestamp=parse_absolute_time("2021-08-01T01:02:03Z"),
    )

    was_extracted = tool_extractor.extract_bi_metadata(query_entry)
    assert was_extracted

    # The origin must be a data platform URN, and specifically the Hex one.
    assert isinstance(query_entry.origin, DataPlatformUrn)
    assert query_entry.origin == Urn.from_string("urn:li:dataPlatform:hex")

    hex_count = tool_extractor.report.num_queries_meta_extracted["hex"]
    assert hex_count == 1
|
||||
|
||||
|
||||
def test_extract_hex_metadata_without_dashes_not_detected() -> None:
    """A 'Hex query metadata:' line lacking the leading dashes is ignored.

    Without the ``-- `` comment prefix the text must not be treated as a
    Hex marker: extraction reports no match, the origin stays unset and
    the hex counter stays at zero.
    """
    tool_extractor = ToolMetaExtractor(report=ToolMetaExtractorReport())
    sql_text = """\
select * from table
Hex query metadata: {"user": "alice@mail.com"}"""

    query_entry = PreparsedQuery(
        query_id=None,
        query_text=sql_text,
        upstreams=[],
        downstream=None,
        column_lineage=None,
        column_usage=None,
        inferred_schema=None,
        user=CorpUserUrn("hexuser"),
        timestamp=parse_absolute_time("2021-08-01T01:02:03Z"),
    )

    was_extracted = tool_extractor.extract_bi_metadata(query_entry)
    assert not was_extracted

    # Nothing was detected, so no origin may have been assigned.
    assert not query_entry.origin

    hex_count = tool_extractor.report.num_queries_meta_extracted["hex"]
    assert hex_count == 0
|
||||
|
||||
|
||||
def test_extract_no_metadata() -> None:
|
||||
extractor = ToolMetaExtractor(report=ToolMetaExtractorReport())
|
||||
query = """\
|
||||
|
Loading…
x
Reference in New Issue
Block a user