mirror of
https://github.com/datahub-project/datahub.git
synced 2025-11-08 15:30:55 +00:00
fix(hex): filter out queries from non-scheduled runs (#13126)
This commit is contained in:
parent
967db2a136
commit
5c7b8e10ce
@ -18,7 +18,8 @@ from datahub.utilities.time import datetime_to_ts_millis
|
|||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
# Pattern to extract both project_id and workspace_name from Hex metadata in SQL comments
|
# Pattern to extract both project_id and workspace_name from Hex metadata in SQL comments
|
||||||
HEX_METADATA_PATTERN = r'-- Hex query metadata: \{.*?"project_id": "([^"]+)".*?"project_url": "https?://[^/]+/([^/]+)/hex/.*?\}'
|
# Only match metadata whose "context" is "SCHEDULED_RUN" (filters out non-scheduled runs); "context" must appear before "project_id" in the metadata for the pattern to match
|
||||||
|
HEX_METADATA_PATTERN = r'-- Hex query metadata: \{.*?"context": "SCHEDULED_RUN".*?"project_id": "([^"]+)".*?"project_url": "https?://[^/]+/([^/]+)/hex/.*?\}'
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
@ -39,6 +40,7 @@ class HexQueryFetcherReport(SourceReport):
|
|||||||
fetched_query_objects: int = 0
|
fetched_query_objects: int = 0
|
||||||
filtered_out_queries_missing_metadata: int = 0
|
filtered_out_queries_missing_metadata: int = 0
|
||||||
filtered_out_queries_different_workspace: int = 0
|
filtered_out_queries_different_workspace: int = 0
|
||||||
|
filtered_out_queries_no_match: int = 0
|
||||||
filtered_out_queries_no_subjects: int = 0
|
filtered_out_queries_no_subjects: int = 0
|
||||||
total_queries: int = 0
|
total_queries: int = 0
|
||||||
total_dataset_subjects: int = 0
|
total_dataset_subjects: int = 0
|
||||||
@ -210,6 +212,7 @@ class HexQueryFetcher:
|
|||||||
match = re.search(HEX_METADATA_PATTERN, sql_statement)
|
match = re.search(HEX_METADATA_PATTERN, sql_statement)
|
||||||
|
|
||||||
if not match:
|
if not match:
|
||||||
|
self.report.filtered_out_queries_no_match += 1
|
||||||
return None
|
return None
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
|||||||
@ -87,6 +87,28 @@ class TestHexQueryFetcherExtractHexMetadata(unittest.TestCase):
|
|||||||
result = self.fetcher._extract_hex_metadata(sql)
|
result = self.fetcher._extract_hex_metadata(sql)
|
||||||
assert result is None
|
assert result is None
|
||||||
|
|
||||||
|
def test_extract_hex_metadata_with_non_scheduled_run(self):
|
||||||
|
sql = """
|
||||||
|
select *
|
||||||
|
from "LONG_TAIL_COMPANIONS"."ANALYTICS"."PET_DETAILS"
|
||||||
|
limit 100
|
||||||
|
-- Hex query metadata: {"categories": ["Scratchpad"], "cell_type": "SQL", "connection": "Long Tail Companions", "context": "LOGICAL_VIEW", "project_id": "d73da67d-c87b-4dd8-9e7f-b79cb7f822cf", "project_name": "PlayNotebook", "project_url": "https://app.hex.tech/some-hex-workspace/hex/d73da67d-c87b-4dd8-9e7f-b79cb7f822cf/draft/logic?selectedCellId=67c38da0-e631-4005-9750-5bdae2a2ef3f", "status": "In development", "trace_id": "f316f99947454a7e8aff2947f848f73d", "user_email": "alice@mail.com"}
|
||||||
|
"""
|
||||||
|
|
||||||
|
result = self.fetcher._extract_hex_metadata(sql)
|
||||||
|
assert result is None
|
||||||
|
|
||||||
|
def test_extract_hex_metadata_with_missing_context(self):
|
||||||
|
sql = """
|
||||||
|
select *
|
||||||
|
from "LONG_TAIL_COMPANIONS"."ANALYTICS"."PET_DETAILS"
|
||||||
|
limit 100
|
||||||
|
-- Hex query metadata: {"categories": ["Scratchpad"], "cell_type": "SQL", "connection": "Long Tail Companions", "project_id": "d73da67d-c87b-4dd8-9e7f-b79cb7f822cf", "project_name": "PlayNotebook", "project_url": "https://app.hex.tech/some-hex-workspace/hex/d73da67d-c87b-4dd8-9e7f-b79cb7f822cf/draft/logic?selectedCellId=67c38da0-e631-4005-9750-5bdae2a2ef3f", "status": "In development", "trace_id": "f316f99947454a7e8aff2947f848f73d", "user_email": "alice@mail.com"}
|
||||||
|
"""
|
||||||
|
|
||||||
|
result = self.fetcher._extract_hex_metadata(sql)
|
||||||
|
assert result is None
|
||||||
|
|
||||||
def test_extract_hex_metadata_with_invalid_json(self):
|
def test_extract_hex_metadata_with_invalid_json(self):
|
||||||
# invalid JSON in Hex metadata
|
# invalid JSON in Hex metadata
|
||||||
sql = """
|
sql = """
|
||||||
@ -157,13 +179,13 @@ class TestHexQueryFetcherExtractHexMetadata(unittest.TestCase):
|
|||||||
# complex workspace names and paths
|
# complex workspace names and paths
|
||||||
urls_to_test = [
|
urls_to_test = [
|
||||||
# URL with hyphens in workspace name
|
# URL with hyphens in workspace name
|
||||||
"""{"project_id": "123", "project_url": "https://app.hex.tech/my-complex-workspace-name/hex/project-id"}""",
|
"""{"context": "SCHEDULED_RUN", "project_id": "123", "project_url": "https://app.hex.tech/my-complex-workspace-name/hex/project-id"}""",
|
||||||
# URL with underscores
|
# URL with underscores
|
||||||
"""{"project_id": "123", "project_url": "https://app.hex.tech/workspace_with_underscores/hex/project-id"}""",
|
"""{"context": "SCHEDULED_RUN", "project_id": "123", "project_url": "https://app.hex.tech/workspace_with_underscores/hex/project-id"}""",
|
||||||
# URL with special chars in domain
|
# URL with special chars in domain
|
||||||
"""{"project_id": "123", "project_url": "https://my-custom-subdomain.hex.tech/some-hex-workspace/hex/project-id"}""",
|
"""{"context": "SCHEDULED_RUN", "project_id": "123", "project_url": "https://my-custom-subdomain.hex.tech/some-hex-workspace/hex/project-id"}""",
|
||||||
# URL with long path after /hex/
|
# URL with long path after /hex/
|
||||||
"""{"project_id": "123", "project_url": "https://app.hex.tech/some-hex-workspace/hex/project-id/draft/logic?selectedCellId=67c38da0-e631"}""",
|
"""{"context": "SCHEDULED_RUN", "project_id": "123", "project_url": "https://app.hex.tech/some-hex-workspace/hex/project-id/draft/logic?selectedCellId=67c38da0-e631"}""",
|
||||||
]
|
]
|
||||||
|
|
||||||
expected_workspaces = [
|
expected_workspaces = [
|
||||||
@ -226,7 +248,7 @@ class TestHexQueryFetcherFetch(unittest.TestCase):
|
|||||||
created=AuditStampClass._construct_with_defaults(),
|
created=AuditStampClass._construct_with_defaults(),
|
||||||
lastModified=AuditStampClass._construct_with_defaults(),
|
lastModified=AuditStampClass._construct_with_defaults(),
|
||||||
statement=QueryStatementClass(
|
statement=QueryStatementClass(
|
||||||
value="""SELECT * FROM table -- Hex query metadata: {"project_id": "project1", "project_url": "https://app.hex.tech/workspace1/hex/project1"}"""
|
value="""SELECT * FROM table -- Hex query metadata: {"context": "SCHEDULED_RUN", "project_id": "project1", "project_url": "https://app.hex.tech/workspace1/hex/project1"}"""
|
||||||
),
|
),
|
||||||
source=HEX_PLATFORM_URN.urn(),
|
source=HEX_PLATFORM_URN.urn(),
|
||||||
),
|
),
|
||||||
@ -242,7 +264,7 @@ class TestHexQueryFetcherFetch(unittest.TestCase):
|
|||||||
created=AuditStampClass._construct_with_defaults(),
|
created=AuditStampClass._construct_with_defaults(),
|
||||||
lastModified=AuditStampClass._construct_with_defaults(),
|
lastModified=AuditStampClass._construct_with_defaults(),
|
||||||
statement=QueryStatementClass(
|
statement=QueryStatementClass(
|
||||||
value="""SELECT * FROM table -- Hex query metadata: {"project_id": "project2", "project_url": "https://app.hex.tech/workspace1/hex/project2"}"""
|
value="""SELECT * FROM table -- Hex query metadata: {"context": "SCHEDULED_RUN", "project_id": "project2", "project_url": "https://app.hex.tech/workspace1/hex/project2"}"""
|
||||||
),
|
),
|
||||||
source=HEX_PLATFORM_URN.urn(),
|
source=HEX_PLATFORM_URN.urn(),
|
||||||
),
|
),
|
||||||
@ -310,7 +332,7 @@ class TestHexQueryFetcherFetch(unittest.TestCase):
|
|||||||
):
|
):
|
||||||
# force not match in query_urn_2
|
# force not match in query_urn_2
|
||||||
self.entities_data[self.query_urn_2][0].statement.value = ( # type: ignore
|
self.entities_data[self.query_urn_2][0].statement.value = ( # type: ignore
|
||||||
"""SELECT * FROM table -- Hex query metadata: {"project_id": "project1", "project_url": "https://app.hex.tech/YET_ANOTHER_WORKSPACE/hex/project1"}"""
|
"""SELECT * FROM table -- Hex query metadata: {"context": "SCHEDULED_RUN", "project_id": "project1", "project_url": "https://app.hex.tech/YET_ANOTHER_WORKSPACE/hex/project1"}"""
|
||||||
)
|
)
|
||||||
mock_fetch_query_urns.return_value = [self.query_urn_1]
|
mock_fetch_query_urns.return_value = [self.query_urn_1]
|
||||||
mock_fetch_query_entities.return_value = self.entities_data
|
mock_fetch_query_entities.return_value = self.entities_data
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user