fix(hex): filter out queries from non-scheduled runs (#13126)

This commit is contained in:
Sergio Gómez Villamor 2025-04-08 20:55:28 +02:00 committed by GitHub
parent 967db2a136
commit 5c7b8e10ce
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 33 additions and 8 deletions

View File

@ -18,7 +18,8 @@ from datahub.utilities.time import datetime_to_ts_millis
logger = logging.getLogger(__name__)
# Pattern to extract both project_id and workspace_name from Hex metadata in SQL comments
HEX_METADATA_PATTERN = r'-- Hex query metadata: \{.*?"project_id": "([^"]+)".*?"project_url": "https?://[^/]+/([^/]+)/hex/.*?\}'
# Only match metadata with "context": "SCHEDULED_RUN" to filter out non-scheduled runs
HEX_METADATA_PATTERN = r'-- Hex query metadata: \{.*?"context": "SCHEDULED_RUN".*?"project_id": "([^"]+)".*?"project_url": "https?://[^/]+/([^/]+)/hex/.*?\}'
@dataclass
@ -39,6 +40,7 @@ class HexQueryFetcherReport(SourceReport):
fetched_query_objects: int = 0
filtered_out_queries_missing_metadata: int = 0
filtered_out_queries_different_workspace: int = 0
filtered_out_queries_no_match: int = 0
filtered_out_queries_no_subjects: int = 0
total_queries: int = 0
total_dataset_subjects: int = 0
@ -210,6 +212,7 @@ class HexQueryFetcher:
match = re.search(HEX_METADATA_PATTERN, sql_statement)
if not match:
self.report.filtered_out_queries_no_match += 1
return None
try:

View File

@ -87,6 +87,28 @@ class TestHexQueryFetcherExtractHexMetadata(unittest.TestCase):
result = self.fetcher._extract_hex_metadata(sql)
assert result is None
# The Hex metadata comment below carries "context": "LOGICAL_VIEW" instead of
# "SCHEDULED_RUN"; this test expects _extract_hex_metadata to return None for it.
def test_extract_hex_metadata_with_non_scheduled_run(self):
sql = """
select *
from "LONG_TAIL_COMPANIONS"."ANALYTICS"."PET_DETAILS"
limit 100
-- Hex query metadata: {"categories": ["Scratchpad"], "cell_type": "SQL", "connection": "Long Tail Companions", "context": "LOGICAL_VIEW", "project_id": "d73da67d-c87b-4dd8-9e7f-b79cb7f822cf", "project_name": "PlayNotebook", "project_url": "https://app.hex.tech/some-hex-workspace/hex/d73da67d-c87b-4dd8-9e7f-b79cb7f822cf/draft/logic?selectedCellId=67c38da0-e631-4005-9750-5bdae2a2ef3f", "status": "In development", "trace_id": "f316f99947454a7e8aff2947f848f73d", "user_email": "alice@mail.com"}
"""
# project_id and project_url are both present, so the only thing that
# differs from an accepted query is the "context" value.
result = self.fetcher._extract_hex_metadata(sql)
assert result is None
# The Hex metadata comment below has no "context" key at all; this test
# expects _extract_hex_metadata to return None for it.
def test_extract_hex_metadata_with_missing_context(self):
sql = """
select *
from "LONG_TAIL_COMPANIONS"."ANALYTICS"."PET_DETAILS"
limit 100
-- Hex query metadata: {"categories": ["Scratchpad"], "cell_type": "SQL", "connection": "Long Tail Companions", "project_id": "d73da67d-c87b-4dd8-9e7f-b79cb7f822cf", "project_name": "PlayNotebook", "project_url": "https://app.hex.tech/some-hex-workspace/hex/d73da67d-c87b-4dd8-9e7f-b79cb7f822cf/draft/logic?selectedCellId=67c38da0-e631-4005-9750-5bdae2a2ef3f", "status": "In development", "trace_id": "f316f99947454a7e8aff2947f848f73d", "user_email": "alice@mail.com"}
"""
# project_id and project_url are still present; only "context" is absent.
result = self.fetcher._extract_hex_metadata(sql)
assert result is None
def test_extract_hex_metadata_with_invalid_json(self):
# invalid JSON in Hex metadata
sql = """
@ -157,13 +179,13 @@ class TestHexQueryFetcherExtractHexMetadata(unittest.TestCase):
# complex workspace names and paths
urls_to_test = [
# URL with hyphens in workspace name
"""{"project_id": "123", "project_url": "https://app.hex.tech/my-complex-workspace-name/hex/project-id"}""",
"""{"context": "SCHEDULED_RUN", "project_id": "123", "project_url": "https://app.hex.tech/my-complex-workspace-name/hex/project-id"}""",
# URL with underscores
"""{"project_id": "123", "project_url": "https://app.hex.tech/workspace_with_underscores/hex/project-id"}""",
"""{"context": "SCHEDULED_RUN", "project_id": "123", "project_url": "https://app.hex.tech/workspace_with_underscores/hex/project-id"}""",
# URL with special chars in domain
"""{"project_id": "123", "project_url": "https://my-custom-subdomain.hex.tech/some-hex-workspace/hex/project-id"}""",
"""{"context": "SCHEDULED_RUN", "project_id": "123", "project_url": "https://my-custom-subdomain.hex.tech/some-hex-workspace/hex/project-id"}""",
# URL with long path after /hex/
"""{"project_id": "123", "project_url": "https://app.hex.tech/some-hex-workspace/hex/project-id/draft/logic?selectedCellId=67c38da0-e631"}""",
"""{"context": "SCHEDULED_RUN", "project_id": "123", "project_url": "https://app.hex.tech/some-hex-workspace/hex/project-id/draft/logic?selectedCellId=67c38da0-e631"}""",
]
expected_workspaces = [
@ -226,7 +248,7 @@ class TestHexQueryFetcherFetch(unittest.TestCase):
created=AuditStampClass._construct_with_defaults(),
lastModified=AuditStampClass._construct_with_defaults(),
statement=QueryStatementClass(
value="""SELECT * FROM table -- Hex query metadata: {"project_id": "project1", "project_url": "https://app.hex.tech/workspace1/hex/project1"}"""
value="""SELECT * FROM table -- Hex query metadata: {"context": "SCHEDULED_RUN", "project_id": "project1", "project_url": "https://app.hex.tech/workspace1/hex/project1"}"""
),
source=HEX_PLATFORM_URN.urn(),
),
@ -242,7 +264,7 @@ class TestHexQueryFetcherFetch(unittest.TestCase):
created=AuditStampClass._construct_with_defaults(),
lastModified=AuditStampClass._construct_with_defaults(),
statement=QueryStatementClass(
value="""SELECT * FROM table -- Hex query metadata: {"project_id": "project2", "project_url": "https://app.hex.tech/workspace1/hex/project2"}"""
value="""SELECT * FROM table -- Hex query metadata: {"context": "SCHEDULED_RUN", "project_id": "project2", "project_url": "https://app.hex.tech/workspace1/hex/project2"}"""
),
source=HEX_PLATFORM_URN.urn(),
),
@ -310,7 +332,7 @@ class TestHexQueryFetcherFetch(unittest.TestCase):
):
# force a non-match in query_urn_2
self.entities_data[self.query_urn_2][0].statement.value = ( # type: ignore
"""SELECT * FROM table -- Hex query metadata: {"project_id": "project1", "project_url": "https://app.hex.tech/YET_ANOTHER_WORKSPACE/hex/project1"}"""
"""SELECT * FROM table -- Hex query metadata: {"context": "SCHEDULED_RUN", "project_id": "project1", "project_url": "https://app.hex.tech/YET_ANOTHER_WORKSPACE/hex/project1"}"""
)
mock_fetch_query_urns.return_value = [self.query_urn_1]
mock_fetch_query_entities.return_value = self.entities_data