mirror of
https://github.com/datahub-project/datahub.git
synced 2025-11-02 19:58:59 +00:00
fix(hex): filter out queries if non scheduled runs (#13126)
This commit is contained in:
parent
967db2a136
commit
5c7b8e10ce
@ -18,7 +18,8 @@ from datahub.utilities.time import datetime_to_ts_millis
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Pattern to extract both project_id and workspace_name from Hex metadata in SQL comments
|
||||
HEX_METADATA_PATTERN = r'-- Hex query metadata: \{.*?"project_id": "([^"]+)".*?"project_url": "https?://[^/]+/([^/]+)/hex/.*?\}'
|
||||
# Only match metadata with "context": "SCHEDULED_RUN" to filter out non-scheduled runs; the pattern expects the "context" key to appear before "project_id"
|
||||
HEX_METADATA_PATTERN = r'-- Hex query metadata: \{.*?"context": "SCHEDULED_RUN".*?"project_id": "([^"]+)".*?"project_url": "https?://[^/]+/([^/]+)/hex/.*?\}'
|
||||
|
||||
|
||||
@dataclass
|
||||
@ -39,6 +40,7 @@ class HexQueryFetcherReport(SourceReport):
|
||||
fetched_query_objects: int = 0
|
||||
filtered_out_queries_missing_metadata: int = 0
|
||||
filtered_out_queries_different_workspace: int = 0
|
||||
filtered_out_queries_no_match: int = 0
|
||||
filtered_out_queries_no_subjects: int = 0
|
||||
total_queries: int = 0
|
||||
total_dataset_subjects: int = 0
|
||||
@ -210,6 +212,7 @@ class HexQueryFetcher:
|
||||
match = re.search(HEX_METADATA_PATTERN, sql_statement)
|
||||
|
||||
if not match:
|
||||
self.report.filtered_out_queries_no_match += 1
|
||||
return None
|
||||
|
||||
try:
|
||||
|
||||
@ -87,6 +87,28 @@ class TestHexQueryFetcherExtractHexMetadata(unittest.TestCase):
|
||||
result = self.fetcher._extract_hex_metadata(sql)
|
||||
assert result is None
|
||||
|
||||
def test_extract_hex_metadata_with_non_scheduled_run(self):
|
||||
sql = """
|
||||
select *
|
||||
from "LONG_TAIL_COMPANIONS"."ANALYTICS"."PET_DETAILS"
|
||||
limit 100
|
||||
-- Hex query metadata: {"categories": ["Scratchpad"], "cell_type": "SQL", "connection": "Long Tail Companions", "context": "LOGICAL_VIEW", "project_id": "d73da67d-c87b-4dd8-9e7f-b79cb7f822cf", "project_name": "PlayNotebook", "project_url": "https://app.hex.tech/some-hex-workspace/hex/d73da67d-c87b-4dd8-9e7f-b79cb7f822cf/draft/logic?selectedCellId=67c38da0-e631-4005-9750-5bdae2a2ef3f", "status": "In development", "trace_id": "f316f99947454a7e8aff2947f848f73d", "user_email": "alice@mail.com"}
|
||||
"""
|
||||
|
||||
result = self.fetcher._extract_hex_metadata(sql)
|
||||
assert result is None
|
||||
|
||||
def test_extract_hex_metadata_with_missing_context(self):
|
||||
sql = """
|
||||
select *
|
||||
from "LONG_TAIL_COMPANIONS"."ANALYTICS"."PET_DETAILS"
|
||||
limit 100
|
||||
-- Hex query metadata: {"categories": ["Scratchpad"], "cell_type": "SQL", "connection": "Long Tail Companions", "project_id": "d73da67d-c87b-4dd8-9e7f-b79cb7f822cf", "project_name": "PlayNotebook", "project_url": "https://app.hex.tech/some-hex-workspace/hex/d73da67d-c87b-4dd8-9e7f-b79cb7f822cf/draft/logic?selectedCellId=67c38da0-e631-4005-9750-5bdae2a2ef3f", "status": "In development", "trace_id": "f316f99947454a7e8aff2947f848f73d", "user_email": "alice@mail.com"}
|
||||
"""
|
||||
|
||||
result = self.fetcher._extract_hex_metadata(sql)
|
||||
assert result is None
|
||||
|
||||
def test_extract_hex_metadata_with_invalid_json(self):
|
||||
# invalid JSON in Hex metadata
|
||||
sql = """
|
||||
@ -157,13 +179,13 @@ class TestHexQueryFetcherExtractHexMetadata(unittest.TestCase):
|
||||
# complex workspace names and paths
|
||||
urls_to_test = [
|
||||
# URL with hyphens in workspace name
|
||||
"""{"project_id": "123", "project_url": "https://app.hex.tech/my-complex-workspace-name/hex/project-id"}""",
|
||||
"""{"context": "SCHEDULED_RUN", "project_id": "123", "project_url": "https://app.hex.tech/my-complex-workspace-name/hex/project-id"}""",
|
||||
# URL with underscores
|
||||
"""{"project_id": "123", "project_url": "https://app.hex.tech/workspace_with_underscores/hex/project-id"}""",
|
||||
"""{"context": "SCHEDULED_RUN", "project_id": "123", "project_url": "https://app.hex.tech/workspace_with_underscores/hex/project-id"}""",
|
||||
# URL with special chars in domain
|
||||
"""{"project_id": "123", "project_url": "https://my-custom-subdomain.hex.tech/some-hex-workspace/hex/project-id"}""",
|
||||
"""{"context": "SCHEDULED_RUN", "project_id": "123", "project_url": "https://my-custom-subdomain.hex.tech/some-hex-workspace/hex/project-id"}""",
|
||||
# URL with long path after /hex/
|
||||
"""{"project_id": "123", "project_url": "https://app.hex.tech/some-hex-workspace/hex/project-id/draft/logic?selectedCellId=67c38da0-e631"}""",
|
||||
"""{"context": "SCHEDULED_RUN", "project_id": "123", "project_url": "https://app.hex.tech/some-hex-workspace/hex/project-id/draft/logic?selectedCellId=67c38da0-e631"}""",
|
||||
]
|
||||
|
||||
expected_workspaces = [
|
||||
@ -226,7 +248,7 @@ class TestHexQueryFetcherFetch(unittest.TestCase):
|
||||
created=AuditStampClass._construct_with_defaults(),
|
||||
lastModified=AuditStampClass._construct_with_defaults(),
|
||||
statement=QueryStatementClass(
|
||||
value="""SELECT * FROM table -- Hex query metadata: {"project_id": "project1", "project_url": "https://app.hex.tech/workspace1/hex/project1"}"""
|
||||
value="""SELECT * FROM table -- Hex query metadata: {"context": "SCHEDULED_RUN", "project_id": "project1", "project_url": "https://app.hex.tech/workspace1/hex/project1"}"""
|
||||
),
|
||||
source=HEX_PLATFORM_URN.urn(),
|
||||
),
|
||||
@ -242,7 +264,7 @@ class TestHexQueryFetcherFetch(unittest.TestCase):
|
||||
created=AuditStampClass._construct_with_defaults(),
|
||||
lastModified=AuditStampClass._construct_with_defaults(),
|
||||
statement=QueryStatementClass(
|
||||
value="""SELECT * FROM table -- Hex query metadata: {"project_id": "project2", "project_url": "https://app.hex.tech/workspace1/hex/project2"}"""
|
||||
value="""SELECT * FROM table -- Hex query metadata: {"context": "SCHEDULED_RUN", "project_id": "project2", "project_url": "https://app.hex.tech/workspace1/hex/project2"}"""
|
||||
),
|
||||
source=HEX_PLATFORM_URN.urn(),
|
||||
),
|
||||
@ -310,7 +332,7 @@ class TestHexQueryFetcherFetch(unittest.TestCase):
|
||||
):
|
||||
# force a non-match for query_urn_2 (different workspace)
|
||||
self.entities_data[self.query_urn_2][0].statement.value = ( # type: ignore
|
||||
"""SELECT * FROM table -- Hex query metadata: {"project_id": "project1", "project_url": "https://app.hex.tech/YET_ANOTHER_WORKSPACE/hex/project1"}"""
|
||||
"""SELECT * FROM table -- Hex query metadata: {"context": "SCHEDULED_RUN", "project_id": "project1", "project_url": "https://app.hex.tech/YET_ANOTHER_WORKSPACE/hex/project1"}"""
|
||||
)
|
||||
mock_fetch_query_urns.return_value = [self.query_urn_1]
|
||||
mock_fetch_query_entities.return_value = self.entities_data
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user