mirror of https://github.com/datahub-project/datahub.git
synced 2025-10-11 08:54:00 +00:00

feat(ingest): new hex connector - part 2 (#12985)

This commit is contained in:
parent 7618af549c
commit d2bb33f7c5
@@ -20,4 +20,12 @@ Currently, the [Hex API](https://learn.hex.tech/docs/api/api-reference) has some

2. **Metadata Access**: There is no direct method to retrieve metadata for Collections, Status, or Categories. This information is only available indirectly through references within Projects and Components.

Please keep these limitations in mind when working with the Hex connector.

For the Dataset - Hex Project lineage, the connector relies on the [_Hex query metadata_](https://learn.hex.tech/docs/explore-data/cells/sql-cells/sql-cells-introduction#query-metadata) feature. Therefore, in order to extract lineage information, the required setup must include the following (see the example recipe after this list):

- A separate warehouse ingestion source (e.g. BigQuery, Snowflake, Redshift, ...) with `use_queries_v2` enabled so that Queries are fetched. This ingests the queries into DataHub as `Query` entities, and those triggered by Hex include the corresponding _Hex query metadata_.
- A DataHub server with version >= SaaS `0.3.10` or > OSS `1.0.0`, so that `Query` entities are properly indexed by source (Hex in this case) and can be fetched and processed by the Hex ingestor to emit the Dataset - Project lineage.
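For illustration only, a programmatic recipe with lineage enabled might look like the sketch below; the workspace name, token, and server values are placeholders, and the exact sink setup depends on your deployment:

```python
# Illustrative sketch: running the Hex source with lineage enabled.
# All values below are placeholders, not part of the connector docs.
from datahub.ingestion.run.pipeline import Pipeline

pipeline = Pipeline.create(
    {
        "pipeline_name": "hex-with-lineage",
        "datahub_api": {"server": "http://localhost:8080"},  # used to fetch Query entities
        "source": {
            "type": "hex",
            "config": {
                "workspace_name": "my-workspace",
                "token": "<hex-api-token>",
                "include_lineage": True,
                "lineage_start_time": "-7d",  # optional window for the Query search
            },
        },
        "sink": {"type": "datahub-rest", "config": {"server": "http://localhost:8080"}},
    }
)
pipeline.run()
pipeline.raise_from_status()
```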
@@ -1,3 +1,8 @@
from datahub.metadata.urns import DataPlatformUrn

HEX_PLATFORM_NAME = "hex"
HEX_PLATFORM_URN = DataPlatformUrn(platform_name=HEX_PLATFORM_NAME)
HEX_API_BASE_URL_DEFAULT = "https://app.hex.tech/api/v1"
HEX_API_PAGE_SIZE_DEFAULT = 100

DATAHUB_API_PAGE_SIZE_DEFAULT = 100
@@ -1,9 +1,12 @@
from dataclasses import dataclass
from datetime import datetime, timedelta, timezone
from typing import Any, Dict, Iterable, List, Optional

from pydantic import Field, SecretStr, root_validator
from typing_extensions import assert_never

from datahub.configuration.common import AllowDenyPattern
from datahub.configuration.datetimes import parse_user_datetime
from datahub.configuration.source_common import (
    EnvConfigMixin,
    PlatformInstanceConfigMixin,
@@ -21,12 +24,17 @@ from datahub.ingestion.api.source import MetadataWorkUnitProcessor
from datahub.ingestion.api.workunit import MetadataWorkUnit
from datahub.ingestion.source.hex.api import HexApi, HexApiReport
from datahub.ingestion.source.hex.constants import (
    DATAHUB_API_PAGE_SIZE_DEFAULT,
    HEX_API_BASE_URL_DEFAULT,
    HEX_API_PAGE_SIZE_DEFAULT,
    HEX_PLATFORM_NAME,
)
from datahub.ingestion.source.hex.mapper import Mapper
from datahub.ingestion.source.hex.model import Component, Project
from datahub.ingestion.source.hex.query_fetcher import (
    HexQueryFetcher,
    HexQueryFetcherReport,
)
from datahub.ingestion.source.state.stale_entity_removal_handler import (
    StaleEntityRemovalHandler,
    StaleEntityRemovalSourceReport,
@@ -34,9 +42,10 @@ from datahub.ingestion.source.state.stale_entity_removal_handler import (
)
from datahub.ingestion.source.state.stateful_ingestion_base import (
    StatefulIngestionConfigBase,
    StatefulIngestionSourceBase,
)
from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
from datahub.sdk.main_client import DataHubClient


class HexSourceConfig(
@@ -93,9 +102,73 @@ class HexSourceConfig(
        default=True,
        description="Set ownership identity from owner/creator email",
    )
    include_lineage: bool = Field(
        default=True,
        description='Include Hex lineage, being fetched from DataHub. See "Limitations" section in the docs for more details about the limitations of this feature.',
    )
    lineage_start_time: Optional[datetime] = Field(
        default=None,
        description="Earliest date of lineage to consider. Default: 1 day before lineage end time. You can specify absolute time like '2023-01-01' or relative time like '-7 days' or '-7d'.",
    )
    lineage_end_time: Optional[datetime] = Field(
        default=None,
        description="Latest date of lineage to consider. Default: Current time in UTC. You can specify absolute time like '2023-01-01' or relative time like '-1 day' or '-1d'.",
    )
    datahub_page_size: int = Field(
        default=DATAHUB_API_PAGE_SIZE_DEFAULT,
        description="Number of items to fetch per DataHub API call.",
    )

    @root_validator(pre=True)
    def validate_lineage_times(cls, data: Dict[str, Any]) -> Dict[str, Any]:
        # lineage_end_time default = now
        if "lineage_end_time" not in data or data["lineage_end_time"] is None:
            data["lineage_end_time"] = datetime.now(tz=timezone.utc)
        # if string is given, parse it
        if isinstance(data["lineage_end_time"], str):
            data["lineage_end_time"] = parse_user_datetime(data["lineage_end_time"])
        # if no timezone is given, assume UTC
        if data["lineage_end_time"].tzinfo is None:
            data["lineage_end_time"] = data["lineage_end_time"].replace(
                tzinfo=timezone.utc
            )
        # at this point, we ensure there is a non null datetime with UTC timezone for lineage_end_time
        assert (
            data["lineage_end_time"]
            and isinstance(data["lineage_end_time"], datetime)
            and data["lineage_end_time"].tzinfo is not None
            and data["lineage_end_time"].tzinfo == timezone.utc
        )

        # lineage_start_time default = lineage_end_time - 1 day
        if "lineage_start_time" not in data or data["lineage_start_time"] is None:
            data["lineage_start_time"] = data["lineage_end_time"] - timedelta(days=1)
        # if string is given, parse it
        if isinstance(data["lineage_start_time"], str):
            data["lineage_start_time"] = parse_user_datetime(data["lineage_start_time"])
        # if no timezone is given, assume UTC
        if data["lineage_start_time"].tzinfo is None:
            data["lineage_start_time"] = data["lineage_start_time"].replace(
                tzinfo=timezone.utc
            )
        # at this point, we ensure there is a non null datetime with UTC timezone for lineage_start_time
        assert (
            data["lineage_start_time"]
            and isinstance(data["lineage_start_time"], datetime)
            and data["lineage_start_time"].tzinfo is not None
            and data["lineage_start_time"].tzinfo == timezone.utc
        )

        return data

@dataclass
class HexReport(
    StaleEntityRemovalSourceReport,
    HexApiReport,
    IngestionStageReport,
    HexQueryFetcherReport,
):
    pass

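# Illustrative usage sketch, not part of this commit: the root validator above
# fills the lineage window defaults and normalizes everything to UTC. The
# values below are placeholders; behavior mirrors the unit tests further down.
_example = HexSourceConfig.parse_obj(
    {
        "workspace_name": "example-workspace",
        "token": "example-token",
        "lineage_start_time": "-7d",  # relative times are parsed by parse_user_datetime
    }
)
assert _example.lineage_end_time is not None  # defaults to "now" in UTC
assert _example.lineage_start_time is not None  # here, 7 days before the end time
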
@@ -110,7 +183,7 @@ class HexSource(StatefulIngestionSourceBase):
    def __init__(self, config: HexSourceConfig, ctx: PipelineContext):
        super().__init__(config, ctx)
        self.source_config = config
        self.report: HexReport = HexReport()
        self.platform = HEX_PLATFORM_NAME
        self.hex_api = HexApi(
            report=self.report,
@@ -129,6 +202,28 @@ class HexSource(StatefulIngestionSourceBase):
            categories_as_tags=self.source_config.categories_as_tags,
            set_ownership_from_email=self.source_config.set_ownership_from_email,
        )
        self.project_registry: Dict[str, Project] = {}
        self.component_registry: Dict[str, Component] = {}

        self.datahub_client: Optional[DataHubClient] = None
        self.query_fetcher: Optional[HexQueryFetcher] = None
        if self.source_config.include_lineage:
            graph = ctx.require_graph("Lineage")
            assert self.source_config.lineage_start_time and isinstance(
                self.source_config.lineage_start_time, datetime
            )
            assert self.source_config.lineage_end_time and isinstance(
                self.source_config.lineage_end_time, datetime
            )
            self.datahub_client = DataHubClient(graph=graph)
            self.query_fetcher = HexQueryFetcher(
                datahub_client=self.datahub_client,
                workspace_name=self.source_config.workspace_name,
                start_datetime=self.source_config.lineage_start_time,
                end_datetime=self.source_config.lineage_end_time,
                report=self.report,
                page_size=self.source_config.datahub_page_size,
            )

    @classmethod
    def create(cls, config_dict: Dict[str, Any], ctx: PipelineContext) -> "HexSource":
@@ -143,25 +238,58 @@ class HexSource(StatefulIngestionSourceBase):
            ).workunit_processor,
        ]

    def get_report(self) -> HexReport:
        return self.report

    def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
        with self.report.new_stage("Fetch Hex assets from Hex API"):
            for project_or_component in self.hex_api.fetch_projects():
                if isinstance(project_or_component, Project):
                    if self.source_config.project_title_pattern.allowed(
                        project_or_component.title
                    ):
                        self.project_registry[project_or_component.id] = (
                            project_or_component
                        )
                elif isinstance(project_or_component, Component):
                    if (
                        self.source_config.include_components
                        and self.source_config.component_title_pattern.allowed(
                            project_or_component.title
                        )
                    ):
                        self.component_registry[project_or_component.id] = (
                            project_or_component
                        )
                else:
                    assert_never(project_or_component)

        if self.source_config.include_lineage:
            assert self.datahub_client and self.query_fetcher

            with self.report.new_stage(
                "Fetch Hex lineage from existing Queries in DataHub"
            ):
                for query_metadata in self.query_fetcher.fetch():
                    project = self.project_registry.get(query_metadata.hex_project_id)
                    if project:
                        project.upstream_datasets.extend(
                            query_metadata.dataset_subjects
                        )
                        project.upstream_schema_fields.extend(
                            query_metadata.schema_field_subjects
                        )
                    else:
                        self.report.report_warning(
                            title="Missing project for lineage",
                            message="Lineage missed because missed project, likely due to filter patterns or deleted project.",
                            context=str(query_metadata),
                        )

        with self.report.new_stage("Emit"):
            yield from self.mapper.map_workspace()

            for project in self.project_registry.values():
                yield from self.mapper.map_project(project=project)
            for component in self.component_registry.values():
                yield from self.mapper.map_component(component=component)
@@ -1,6 +1,6 @@
import logging
from datetime import datetime
from typing import Iterable, List, Optional, Tuple, Union

from datahub._codegen.aspect import (
    _Aspect,  # TODO: is there a better import than this one?
@@ -46,6 +46,7 @@ from datahub.metadata.schema_classes import (
    DashboardInfoClass,
    DashboardUsageStatisticsClass,
    DataPlatformInstanceClass,
    EdgeClass,
    GlobalTagsClass,
    OwnerClass,
    OwnershipClass,
@@ -53,7 +54,14 @@ from datahub.metadata.schema_classes import (
    TagAssociationClass,
    TimeWindowSizeClass,
)
from datahub.metadata.urns import (
    ContainerUrn,
    CorpUserUrn,
    DashboardUrn,
    DatasetUrn,
    SchemaFieldUrn,
    Urn,
)

logger = logging.getLogger(__name__)

@@ -116,6 +124,8 @@ class Mapper:
            ),
            externalUrl=f"{self._base_url}/{self._workspace_name}/hex/{project.id}",
            customProperties=dict(id=project.id),
            datasetEdges=self._dataset_edges(project.upstream_datasets),
            # TODO: support schema field upstream, maybe InputFields?
        )

        subtypes = SubTypesClass(
@@ -343,6 +353,22 @@ class Mapper:
            else None,
        )

    def _dataset_edges(
        self, upstream: List[Union[DatasetUrn, SchemaFieldUrn]]
    ) -> Optional[List[EdgeClass]]:
        # TBC: is there support for CLL in Dashboards? for the moment, skip SchemaFieldUrns
        return (
            [
                EdgeClass(
                    destinationUrn=upstream_urn.urn(),
                )
                for upstream_urn in upstream
                if isinstance(upstream_urn, DatasetUrn)
            ]
            if upstream
            else None
        )

    def _yield_mcps(
        self, entity_urn: Urn, aspects: List[Optional[_Aspect]]
    ) -> Iterable[MetadataWorkUnit]:
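# Illustrative sketch, not part of this commit: _dataset_edges above keeps only
# DatasetUrn upstreams as DashboardInfo datasetEdges and drops SchemaFieldUrn
# entries (no column-level lineage on dashboards yet). Names are placeholders;
# see test_dataset_edges in the unit tests further below.
_ds = DatasetUrn(platform="snowflake", name="db.schema.table")
_sf = SchemaFieldUrn(parent=_ds, field_path="some_column")
_edges = Mapper(workspace_name="example-workspace")._dataset_edges([_ds, _sf])
assert _edges is not None and len(_edges) == 1
assert _edges[0].destinationUrn == _ds.urn()
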
@@ -1,6 +1,8 @@
from dataclasses import dataclass, field
from datetime import datetime
from typing import List, Optional, Union

from datahub.metadata.urns import DatasetUrn, SchemaFieldUrn


@dataclass
@@ -51,6 +53,12 @@ class Project:
    creator: Optional[Owner] = None
    owner: Optional[Owner] = None
    analytics: Optional[Analytics] = None
    upstream_datasets: List[Union[DatasetUrn, SchemaFieldUrn]] = field(
        default_factory=list
    )
    upstream_schema_fields: List[Union[DatasetUrn, SchemaFieldUrn]] = field(
        default_factory=list
    )


@dataclass
@@ -0,0 +1,297 @@
import logging
import re
from dataclasses import dataclass, field
from datetime import datetime
from typing import Dict, Iterable, List, Optional, Tuple

from datahub.ingestion.api.source import SourceReport
from datahub.ingestion.source.hex.constants import (
    DATAHUB_API_PAGE_SIZE_DEFAULT,
    HEX_PLATFORM_URN,
)
from datahub.metadata.schema_classes import QueryPropertiesClass, QuerySubjectsClass
from datahub.metadata.urns import DatasetUrn, QueryUrn, SchemaFieldUrn
from datahub.sdk.main_client import DataHubClient
from datahub.sdk.search_filters import FilterDsl as F
from datahub.utilities.time import datetime_to_ts_millis

logger = logging.getLogger(__name__)

# Pattern to extract both project_id and workspace_name from Hex metadata in SQL comments
HEX_METADATA_PATTERN = r'-- Hex query metadata: \{.*?"project_id": "([^"]+)".*?"project_url": "https?://[^/]+/([^/]+)/hex/.*?\}'
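# Illustrative sketch, not part of this commit: what the pattern captures from a
# single-line Hex metadata comment (sample values taken from the docstring and
# test fixtures below).
_sample = (
    '-- Hex query metadata: {"project_id": "d73da67d-c87b-4dd8-9e7f-b79cb7f822cf", '
    '"project_url": "https://app.hex.tech/acryl-partnership/hex/d73da67d-c87b-4dd8-9e7f-b79cb7f822cf/draft/logic"}'
)
_m = re.search(HEX_METADATA_PATTERN, _sample)
assert _m is not None
assert _m.group(1) == "d73da67d-c87b-4dd8-9e7f-b79cb7f822cf"  # project_id
assert _m.group(2) == "acryl-partnership"  # workspace name parsed from project_url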

@dataclass
class QueryResponse:
    """This is the public response model for the HexQueryFetcher."""

    urn: QueryUrn
    hex_project_id: str
    dataset_subjects: List[DatasetUrn] = field(default_factory=list)
    schema_field_subjects: List[SchemaFieldUrn] = field(default_factory=list)


@dataclass
class HexQueryFetcherReport(SourceReport):
    start_datetime: Optional[datetime] = None
    end_datetime: Optional[datetime] = None
    fetched_query_urns: int = 0
    fetched_query_objects: int = 0
    filtered_out_queries_missing_metadata: int = 0
    filtered_out_queries_different_workspace: int = 0
    filtered_out_queries_no_subjects: int = 0
    total_queries: int = 0
    total_dataset_subjects: int = 0
    total_schema_field_subjects: int = 0
    num_calls_fetch_query_entities: int = 0


class HexQueryFetcher:
    def __init__(
        self,
        datahub_client: DataHubClient,
        workspace_name: str,
        start_datetime: datetime,
        end_datetime: datetime,
        report: HexQueryFetcherReport,
        page_size: int = DATAHUB_API_PAGE_SIZE_DEFAULT,
    ):
        self.datahub_client = datahub_client
        self.workspace_name = workspace_name
        self.start_datetime = start_datetime
        self.end_datetime = end_datetime
        self.report = report
        self.page_size = page_size

        self.report.start_datetime = start_datetime
        self.report.end_datetime = end_datetime

    def fetch(self) -> Iterable[QueryResponse]:
        try:
            query_urns = self._fetch_query_urns_filter_hex_and_last_modified()
            assert all(isinstance(urn, QueryUrn) for urn in query_urns)
            self.report.fetched_query_urns = len(query_urns)

            entities_by_urn = self._fetch_query_entities(query_urns)
            self.report.fetched_query_objects = len(entities_by_urn)
        except Exception as e:
            self.report.failure(
                title="Error fetching Queries for lineage",
                message="Error fetching Queries will result on missing lineage",
                context=str(
                    dict(
                        workspace_name=self.workspace_name,
                        start_datetime=self.start_datetime,
                        end_datetime=self.end_datetime,
                    )
                ),
                exc=e,
            )
        else:
            if not query_urns or not entities_by_urn:
                self.report.warning(
                    title="No Queries found with Hex as origin",
                    message="No lineage because of no Queries found with Hex as origin in the given time range; you may consider extending the time range to fetch more queries.",
                    context=str(
                        dict(
                            workspace_name=self.workspace_name,
                            start_datetime=self.start_datetime,
                            end_datetime=self.end_datetime,
                        )
                    ),
                )
                return

            for query_urn, (
                query_properties,
                query_subjects,
            ) in entities_by_urn.items():
                maybe_query_response = self._build_query_response(
                    query_urn=query_urn,
                    query_properties=query_properties,
                    query_subjects=query_subjects,
                )
                if maybe_query_response:
                    yield maybe_query_response

    def _fetch_query_entities(
        self, query_urns: List[QueryUrn]
    ) -> Dict[
        QueryUrn, Tuple[Optional[QueryPropertiesClass], Optional[QuerySubjectsClass]]
    ]:
        entities_by_urn: Dict[
            QueryUrn,
            Tuple[Optional[QueryPropertiesClass], Optional[QuerySubjectsClass]],
        ] = {}
        for i in range(0, len(query_urns), self.page_size):
            batch = query_urns[i : i + self.page_size]

            logger.debug(f"Fetching query entities for {len(batch)} queries: {batch}")
            entities = self.datahub_client._graph.get_entities(
                entity_name=QueryUrn.ENTITY_TYPE,
                urns=[urn.urn() for urn in batch],
                aspects=[
                    QueryPropertiesClass.ASPECT_NAME,
                    QuerySubjectsClass.ASPECT_NAME,
                ],
                with_system_metadata=False,
            )
            self.report.num_calls_fetch_query_entities += 1
            logger.debug(f"Get entities response: {entities}")

            for urn, entity in entities.items():
                query_urn = QueryUrn.from_string(urn)

                properties_tuple = entity.get(
                    QueryPropertiesClass.ASPECT_NAME, (None, None)
                )
                query_properties: Optional[QueryPropertiesClass] = None
                if properties_tuple and properties_tuple[0]:
                    assert isinstance(properties_tuple[0], QueryPropertiesClass)
                    query_properties = properties_tuple[0]

                subjects_tuple = entity.get(
                    QuerySubjectsClass.ASPECT_NAME, (None, None)
                )
                query_subjects: Optional[QuerySubjectsClass] = None
                if subjects_tuple and subjects_tuple[0]:
                    assert isinstance(subjects_tuple[0], QuerySubjectsClass)
                    query_subjects = subjects_tuple[0]

                entities_by_urn[query_urn] = (query_properties, query_subjects)

        return entities_by_urn

    def _fetch_query_urns_filter_hex_and_last_modified(self) -> List[QueryUrn]:
        last_modified_start_at_millis = datetime_to_ts_millis(self.start_datetime)
        last_modified_end_at_millis = datetime_to_ts_millis(self.end_datetime)

        urns = self.datahub_client.search.get_urns(
            filter=F.and_(
                F.entity_type(QueryUrn.ENTITY_TYPE),
                F.custom_filter("origin", "EQUAL", [HEX_PLATFORM_URN.urn()]),
                F.custom_filter(
                    "lastModifiedAt",
                    "GREATER_THAN_OR_EQUAL_TO",
                    [str(last_modified_start_at_millis)],
                ),
                F.custom_filter(
                    "lastModifiedAt",
                    "LESS_THAN_OR_EQUAL_TO",
                    [str(last_modified_end_at_millis)],
                ),
            ),
        )
        logger.debug(f"Get URNS by filter: {urns}")
        return [QueryUrn.from_string(urn.urn()) for urn in urns]

    def _extract_hex_metadata(self, sql_statement: str) -> Optional[Tuple[str, str]]:
        """
        Extract project ID and workspace name from SQL statement.

        Looks for Hex metadata in SQL comments in the format:
        -- Hex query metadata: {"project_id": "...", "project_url": "https://app.hex.tech/{workspace_name}/hex/..."}

        Example:
        -- Hex query metadata: {"categories": ["Scratchpad"], "cell_type": "SQL", "connection": "Long Tail Companions", "context": "SCHEDULED_RUN", "project_id": "d73da67d-c87b-4dd8-9e7f-b79cb7f822cf", "project_url": "https://app.hex.tech/acryl-partnership/hex/d73da67d-c87b-4dd8-9e7f-b79cb7f822cf/draft/logic?selectedCellId=67c38da0-e631-4005-9750-5bdae2a2ef3f"}

        # TODO: Consider supporting multiline metadata format in the future:
        # -- Hex query metadata: {
        # --   "categories": ["Scratchpad"],
        # --   "project_id": "d73da67d-c87b-4dd8-9e7f-b79cb7f822cf",
        # --   ...
        # -- }

        Returns:
            A tuple of (project_id, workspace_name) if both are successfully extracted
            None if extraction fails for any reason
        """
        # Extract both project_id and workspace name in a single regex operation
        match = re.search(HEX_METADATA_PATTERN, sql_statement)

        if not match:
            return None

        try:
            project_id = match.group(1)
            workspace_name = match.group(2)
            return project_id, workspace_name
        except (IndexError, AttributeError) as e:
            self.report.warning(
                title="Failed to extract information from Hex query metadata",
                message="Failed to extract information from Hex query metadata will result on missing lineage",
                context=sql_statement,
                exc=e,
            )

        return None

    def _build_query_response(
        self,
        query_urn: QueryUrn,
        query_properties: Optional[QueryPropertiesClass],
        query_subjects: Optional[QuerySubjectsClass],
    ) -> Optional[QueryResponse]:
        # Skip if missing required aspects
        if (
            not query_properties
            or not query_properties.statement
            or not query_properties.statement.value
            or not query_subjects
            or query_subjects.subjects is None  # empty list is allowed
        ):
            logger.debug(
                f"Skipping query {query_urn} - missing required fields: {(query_properties, query_subjects)}"
            )
            self.report.filtered_out_queries_missing_metadata += 1
            return None

        # Extract hex metadata (project_id and workspace_name)
        metadata_result = self._extract_hex_metadata(query_properties.statement.value)
        if not metadata_result:
            logger.debug(f"Skipping query {query_urn} - failed to extract Hex metadata")
            self.report.filtered_out_queries_missing_metadata += 1
            return None

        hex_project_id, workspace_from_url = metadata_result

        # Validate workspace
        if workspace_from_url != self.workspace_name:
            logger.debug(
                f"Skipping query {query_urn} - workspace '{workspace_from_url}' doesn't match '{self.workspace_name}'"
            )
            self.report.filtered_out_queries_different_workspace += 1
            return None

        # Extract subjects
        dataset_subjects: List[DatasetUrn] = []
        schema_field_subjects: List[SchemaFieldUrn] = []
        for subject in query_subjects.subjects:
            if subject.entity and subject.entity.startswith("urn:li:dataset:"):
                dataset_subjects.append(DatasetUrn.from_string(subject.entity))
            elif subject.entity and subject.entity.startswith("urn:li:schemaField:"):
                schema_field_subjects.append(SchemaFieldUrn.from_string(subject.entity))

        if not dataset_subjects and not schema_field_subjects:
            self.report.filtered_out_queries_no_subjects += 1
            return None

        # Create response
        response = QueryResponse(
            urn=query_urn,
            hex_project_id=hex_project_id,
            dataset_subjects=dataset_subjects,
            schema_field_subjects=schema_field_subjects,
        )
        logger.debug(
            f"Succesfully extracted {len(dataset_subjects)} dataset subjects and {len(schema_field_subjects)} schema field subjects for query {query_urn}: {dataset_subjects} {schema_field_subjects}"
        )
        self.report.total_queries += 1
        self.report.total_dataset_subjects += len(dataset_subjects)
        self.report.total_schema_field_subjects += len(schema_field_subjects)

        logger.debug(
            f"Processed query {query_urn} with Hex project ID {hex_project_id}"
        )

        return response
@@ -0,0 +1,47 @@
[
  {
    "urn": "urn:li:query:307d80903ebbc165944c52ae79efaeb1736f9bb37d9f8fa48fc0af69d725413f",
    "queryKey": {
      "value": {
        "id": "307d80903ebbc165944c52ae79efaeb1736f9bb37d9f8fa48fc0af69d725413f"
      }
    },
    "querySubjects": {
      "value": {
        "subjects": [
          {
            "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,long_tail_companions.analytics.pet_details,PROD)"
          },
          {
            "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,long_tail_companions.analytics.pet_orders,PROD)"
          },
          {
            "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,long_tail_companions.analytics.pet_details,PROD),age_m)"
          },
          {
            "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,long_tail_companions.analytics.pet_details,PROD),age_y)"
          }
        ]
      }
    },
    "queryProperties": {
      "value": {
        "statement": {
          "value": "select * from \"LONG_TAIL_COMPANIONS\".\"ANALYTICS\".\"PET_DETAILS\" left outer join \"LONG_TAIL_COMPANIONS\".\"ANALYTICS\".\"PET_ORDERS\" limit 100\n-- Hex query metadata: {\"categories\": [\"Scratchpad\"], \"cell_type\": \"SQL\", \"connection\": \"Long Tail Companions\", \"context\": \"SCHEDULED_RUN\", \"project_id\": \"d73da67d-c87b-4dd8-9e7f-b79cb7f822cf\", \"project_name\": \"PlayNotebook\", \"project_url\": \"https://app.hex.tech/some-hex-workspace/hex/d73da67d-c87b-4dd8-9e7f-b79cb7f822cf/draft/logic?selectedCellId=2520f866-8bb6-43dc-8d52-8f522a938b9c\", \"status\": \"In development\", \"trace_id\": \"0195cd3713e6700088f1535fa4c874a5\", \"user_email\": \"alice@email.com\"}",
          "language": "SQL"
        },
        "customProperties": {},
        "source": "SYSTEM",
        "lastModified": {
          "actor": "urn:li:corpuser:sf_hex_user",
          "time": 1742904697868
        },
        "created": {
          "actor": "urn:li:corpuser:sf_hex_user",
          "time": 1742904697868
        },
        "origin": "urn:li:dataPlatform:hex"
      }
    }
  }
]
@@ -0,0 +1,44 @@
[
  {
    "urn": "urn:li:query:87fe9c2def1a5b7932ec9f12c4b55a56dcae6a29fa1d5b78902e34e73abcd123",
    "queryKey": {
      "value": {
        "id": "87fe9c2def1a5b7932ec9f12c4b55a56dcae6a29fa1d5b78902e34e73abcd123"
      }
    },
    "querySubjects": {
      "value": {
        "subjects": [
          {
            "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,long_tail_companions.analytics.visit_core,PROD)"
          },
          {
            "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,long_tail_companions.analytics.visit_details,PROD)"
          },
          {
            "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,long_tail_companions.analytics.visit_more_details,PROD)"
          }
        ]
      }
    },
    "queryProperties": {
      "value": {
        "statement": {
          "value": "select * from \"LONG_TAIL_COMPANIONS\".\"ANALYTICS\".\"VISIT_core\" join \"LONG_TAIL_COMPANIONS\".\"ANALYTICS\".\"VISIT_DETAILS\" join \"LONG_TAIL_COMPANIONS\".\"ANALYTICS\".\"VISIT_MORE_DETAILS\" limit 100\n-- Hex query metadata: {\"categories\": [\"Scratchpad\"], \"cell_type\": \"SQL\", \"connection\": \"Long Tail Companions\", \"context\": \"SCHEDULED_RUN\", \"project_id\": \"2ef730de-25ec-4131-94af-3517e743a738\", \"project_name\": \"Welcome to Hex!\", \"project_url\": \"https://app.hex.tech/some-hex-workspace/hex/2ef730de-25ec-4131-94af-3517e743a738/draft/logic?selectedCellId=3630g977-9cc7-54ed-9e63-9g633b049c1e\", \"status\": \"In development\", \"trace_id\": \"0195cd3713e6700088f1535fa4c874a6\", \"user_email\": \"bob@email.com\"}",
          "language": "SQL"
        },
        "customProperties": {},
        "source": "SYSTEM",
        "lastModified": {
          "actor": "urn:li:corpuser:sf_hex_user",
          "time": 1742904697969
        },
        "created": {
          "actor": "urn:li:corpuser:sf_hex_user",
          "time": 1742904697969
        },
        "origin": "urn:li:dataPlatform:hex"
      }
    }
  }
]
@@ -0,0 +1,14 @@
{
  "data": {
    "scrollAcrossEntities": {
      "nextScrollId": "page_2_scroll_id",
      "searchResults": [
        {
          "entity": {
            "urn": "urn:li:query:307d80903ebbc165944c52ae79efaeb1736f9bb37d9f8fa48fc0af69d725413f"
          }
        }
      ]
    }
  }
}
@@ -0,0 +1,14 @@
{
  "data": {
    "scrollAcrossEntities": {
      "nextScrollId": null,
      "searchResults": [
        {
          "entity": {
            "urn": "urn:li:query:87fe9c2def1a5b7932ec9f12c4b55a56dcae6a29fa1d5b78902e34e73abcd123"
          }
        }
      ]
    }
  }
}
@@ -7,10 +7,27 @@ services:
      - "8000:8000"
    volumes:
      - ./hex_projects_response.json:/app/hex_projects_response.json
      - ./mock_hex_server.py:/app/mock_hex_server.py
    command: ["python", "/app/mock_hex_server.py"]
    healthcheck:
      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:8000/health"]
      interval: 5s
      timeout: 5s
      retries: 3
  datahub-mock-api:
    image: python:3.9-alpine
    container_name: datahub-mock-api
    ports:
      - "8010:8010"
    volumes:
      - ./datahub_entities_v3_page1.json:/app/datahub_entities_v3_page1.json
      - ./datahub_entities_v3_page2.json:/app/datahub_entities_v3_page2.json
      - ./datahub_get_urns_by_filter_page1.json:/app/datahub_get_urns_by_filter_page1.json
      - ./datahub_get_urns_by_filter_page2.json:/app/datahub_get_urns_by_filter_page2.json
      - ./mock_datahub_server.py:/app/mock_datahub_server.py
    command: ["python", "/app/mock_datahub_server.py"]
    healthcheck:
      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:8010/health"]
      interval: 5s
      timeout: 5s
      retries: 3
metadata-ingestion/tests/integration/hex/docker/mock_datahub_server.py (new executable file, 118 lines)
@@ -0,0 +1,118 @@
#!/usr/bin/env python3
"""
Mock DataHub server that handles both GET and POST requests and supports pagination
"""

import http.server
import json
import socketserver
from http import HTTPStatus
from urllib.parse import urlparse

PORT = 8010

# Load the mock response data
with open("/app/datahub_entities_v3_page1.json", "r") as f:
    ENTITIES_V3_PAGE1_RESPONSE = f.read()

with open("/app/datahub_entities_v3_page2.json", "r") as f:
    ENTITIES_V3_PAGE2_RESPONSE = f.read()

with open("/app/datahub_get_urns_by_filter_page1.json", "r") as f:
    URNS_BY_FILTER_PAGE1_RESPONSE = f.read()

with open("/app/datahub_get_urns_by_filter_page2.json", "r") as f:
    URNS_BY_FILTER_PAGE2_RESPONSE = f.read()

# Global state flag to track if first page has been requested
FIRST_ENTITIES_PAGE_REQUESTED = False


class MockDataHubAPIHandler(http.server.SimpleHTTPRequestHandler):
    # Global state flag to track if first page has been requested across all instances; one instance per request
    first_entities_page_requested = False

    def do_GET(self):
        parsed_url = urlparse(self.path)
        path = parsed_url.path

        # Health check endpoint
        if path == "/health":
            self.send_response(HTTPStatus.OK)
            self.send_header("Content-type", "text/plain")
            self.end_headers()
            self.wfile.write(b"OK")
            return

        # Mock DataHub API endpoints
        if path.startswith("/config"):
            self.send_response(HTTPStatus.OK)
            self.send_header("Content-type", "application/json")
            self.send_header("Access-Control-Allow-Origin", "*")
            self.end_headers()
            self.wfile.write(json.dumps(dict(noCode="true")).encode())
            return

        # Default 404 response
        self.send_response(HTTPStatus.NOT_FOUND)
        self.send_header("Content-type", "application/json")
        self.end_headers()
        self.wfile.write(json.dumps({"error": "Not found", "path": self.path}).encode())

    def do_POST(self):
        parsed_url = urlparse(self.path)
        path = parsed_url.path

        # Get request body
        content_length = int(self.headers["Content-Length"])
        post_data = self.rfile.read(content_length)
        request_body = json.loads(post_data)

        if path == "/openapi/v3/entity/query/batchGet":
            self.send_response(HTTPStatus.OK)
            self.send_header("Content-type", "application/json")
            self.send_header("Access-Control-Allow-Origin", "*")
            self.end_headers()

            # Return the appropriate page of entity data in V3 format
            if not MockDataHubAPIHandler.first_entities_page_requested:
                self.wfile.write(ENTITIES_V3_PAGE1_RESPONSE.encode())
                MockDataHubAPIHandler.first_entities_page_requested = True
            else:
                self.wfile.write(ENTITIES_V3_PAGE2_RESPONSE.encode())
            return

        if path == "/api/graphql":
            self.send_response(HTTPStatus.OK)
            self.send_header("Content-type", "application/json")
            self.send_header("Access-Control-Allow-Origin", "*")
            self.end_headers()

            # Check if this is a scroll query with nextScrollId
            scroll_id = None
            if "variables" in request_body:
                scroll_id = request_body.get("variables", {}).get("scrollId")

            if scroll_id == "page_2_scroll_id":
                self.wfile.write(URNS_BY_FILTER_PAGE2_RESPONSE.encode())
            else:
                self.wfile.write(URNS_BY_FILTER_PAGE1_RESPONSE.encode())
            return

        # Default 404 response
        self.send_response(HTTPStatus.NOT_FOUND)
        self.send_header("Content-type", "application/json")
        self.end_headers()
        self.wfile.write(
            json.dumps(
                {"error": "Not found", "path": self.path, "method": "POST"}
            ).encode()
        )


# Set up the server
handler = MockDataHubAPIHandler
httpd = socketserver.TCPServer(("", PORT), handler)

print(f"Serving mock DataHub API at port {PORT}")
httpd.serve_forever()
(Golden file diff suppressed because it is too large.)

@@ -6,7 +6,6 @@ from datahub.ingestion.run.pipeline import Pipeline
from tests.test_helpers import mce_helpers
from tests.test_helpers.docker_helpers import wait_for_port

FROZEN_TIME = "2025-03-25 12:00:00"

pytestmark = pytest.mark.integration_batch_2
@@ -17,10 +16,10 @@ def test_resources_dir(pytestconfig):
    return pytestconfig.rootpath / "tests/integration/hex"


def is_mock_api_up(port: int) -> bool:
    """Check if the mock API server is up and running"""
    try:
        response = requests.get(f"http://localhost:{port}/health")
        response.raise_for_status()
        return True
    except (requests.RequestException, ConnectionError):
@@ -40,7 +39,14 @@ def hex_mock_api_runner(docker_compose_runner, test_resources_dir):
        "hex-mock-api",
        8000,
        timeout=30,
        checker=lambda: is_mock_api_up(8000),
    )
    wait_for_port(
        docker_services,
        "datahub-mock-api",
        8010,
        timeout=30,
        checker=lambda: is_mock_api_up(8010),
    )
    yield docker_services

@@ -48,7 +54,6 @@ def hex_mock_api_runner(docker_compose_runner, test_resources_dir):
@freeze_time(FROZEN_TIME)
@pytest.mark.integration
def test_hex_ingestion(pytestconfig, hex_mock_api_runner, test_resources_dir, tmp_path):
    # Path for the golden file
    golden_dir = test_resources_dir / "golden"
    golden_path = golden_dir / "hex_mce_golden.json"
@@ -56,14 +61,67 @@ def test_hex_ingestion(pytestconfig, hex_mock_api_runner, test_resources_dir, tm
    # Create the pipeline
    pipeline = Pipeline.create(
        {
            "pipeline_name": "test-hex",
            "source": {
                "type": "hex",
                "config": {
                    "workspace_name": "test-workspace",
                    "token": "test-token",
                    "base_url": "http://localhost:8000/api/v1",  # Mock Hex API URL
                    "platform_instance": "hex_test",
                    "include_lineage": False,
                },
            },
            "sink": {
                "type": "file",
                "config": {
                    "filename": f"{tmp_path}/hex_mces.json",
                },
            },
        }
    )

    # Run the pipeline
    pipeline.run()
    pipeline.raise_from_status()

    # Check against golden file
    mce_helpers.check_golden_file(
        pytestconfig,
        output_path=f"{tmp_path}/hex_mces.json",
        golden_path=golden_path,
        ignore_paths=mce_helpers.IGNORE_PATH_TIMESTAMPS,
    )


@freeze_time(FROZEN_TIME)
@pytest.mark.integration
def test_hex_ingestion_with_lineage(
    pytestconfig, hex_mock_api_runner, test_resources_dir, tmp_path
):
    # Path for the golden file
    golden_dir = test_resources_dir / "golden"
    golden_path = golden_dir / "hex_mce_golden_with_lineage.json"

    # Create the pipeline
    pipeline = Pipeline.create(
        {
            "pipeline_name": "test-hex-with-lineage",
            "datahub_api": {
                "server": "http://localhost:8010",  # Mock DataHub API URL
            },
            "source": {
                "type": "hex",
                "config": {
                    "workspace_name": "some-hex-workspace",
                    "token": "test-token",
                    "base_url": "http://localhost:8000/api/v1",  # Mock Hex API URL
                    "platform_instance": "hex_test",
                    "include_lineage": True,
                    "datahub_page_size": 1,  # Force pagination
                    "stateful_ingestion": {
                        "enabled": False,
                    },
                },
            },
            "sink": {
metadata-ingestion/tests/unit/hex/test_hex.py (new file, 157 lines)
@@ -0,0 +1,157 @@
import unittest
from datetime import datetime, timedelta, timezone

from datahub.ingestion.source.hex.hex import HexSourceConfig


def datetime_approx_equal(
    dt1: datetime, dt2: datetime, tolerance_seconds: int = 5
) -> bool:
    if dt1.tzinfo is None:
        dt1 = dt1.replace(tzinfo=timezone.utc)
    if dt2.tzinfo is None:
        dt2 = dt2.replace(tzinfo=timezone.utc)

    diff = abs((dt1 - dt2).total_seconds())
    return diff <= tolerance_seconds


class TestHexSourceConfig(unittest.TestCase):
    def setUp(self):
        self.minimum_input_config = {
            "workspace_name": "test-workspace",
            "token": "test-token",
        }

    def test_required_fields(self):
        with self.assertRaises(ValueError):
            input_config = {**self.minimum_input_config}
            del input_config["workspace_name"]
            HexSourceConfig.parse_obj(input_config)

        with self.assertRaises(ValueError):
            input_config = {**self.minimum_input_config}
            del input_config["token"]
            HexSourceConfig.parse_obj(input_config)

    def test_minimum_config(self):
        config = HexSourceConfig.parse_obj(self.minimum_input_config)

        assert config
        assert config.workspace_name == "test-workspace"
        assert config.token.get_secret_value() == "test-token"

    def test_lineage_config(self):
        config = HexSourceConfig.parse_obj(self.minimum_input_config)
        assert config and config.include_lineage

        input_config = {**self.minimum_input_config, "include_lineage": False}
        config = HexSourceConfig.parse_obj(input_config)
        assert config and not config.include_lineage

        # default values for lineage_start_time and lineage_end_time
        config = HexSourceConfig.parse_obj(self.minimum_input_config)
        assert (
            config.lineage_start_time
            and isinstance(config.lineage_start_time, datetime)
            and datetime_approx_equal(
                config.lineage_start_time,
                datetime.now(tz=timezone.utc) - timedelta(days=1),
            )
        )
        assert (
            config.lineage_end_time
            and isinstance(config.lineage_end_time, datetime)
            and datetime_approx_equal(
                config.lineage_end_time, datetime.now(tz=timezone.utc)
            )
        )

        # set values for lineage_start_time and lineage_end_time
        input_config = {
            **self.minimum_input_config,
            "lineage_start_time": "2025-03-24 12:00:00",
            "lineage_end_time": "2025-03-25 12:00:00",
        }
        config = HexSourceConfig.parse_obj(input_config)
        assert (
            config.lineage_start_time
            and isinstance(config.lineage_start_time, datetime)
            and datetime_approx_equal(
                config.lineage_start_time,
                datetime(2025, 3, 24, 12, 0, 0, tzinfo=timezone.utc),
            )
        )
        assert (
            config.lineage_end_time
            and isinstance(config.lineage_end_time, datetime)
            and datetime_approx_equal(
                config.lineage_end_time,
                datetime(2025, 3, 25, 12, 0, 0, tzinfo=timezone.utc),
            )
        )

        # set lineage_end_time only
        input_config = {
            **self.minimum_input_config,
            "lineage_end_time": "2025-03-25 12:00:00",
        }
        config = HexSourceConfig.parse_obj(input_config)
        assert (
            config.lineage_start_time
            and isinstance(config.lineage_start_time, datetime)
            and datetime_approx_equal(
                config.lineage_start_time,
                datetime(2025, 3, 25, 12, 0, 0, tzinfo=timezone.utc)
                - timedelta(days=1),
            )
        )
        assert (
            config.lineage_end_time
            and isinstance(config.lineage_end_time, datetime)
            and datetime_approx_equal(
                config.lineage_end_time,
                datetime(2025, 3, 25, 12, 0, 0, tzinfo=timezone.utc),
            )
        )

        # set lineage_start_time only
        input_config = {
            **self.minimum_input_config,
            "lineage_start_time": "2025-03-25 12:00:00",
        }
        config = HexSourceConfig.parse_obj(input_config)
        assert (
            config.lineage_start_time
            and isinstance(config.lineage_start_time, datetime)
            and datetime_approx_equal(
                config.lineage_start_time,
                datetime(2025, 3, 25, 12, 0, 0, tzinfo=timezone.utc),
            )
        )
        assert (
            config.lineage_end_time
            and isinstance(config.lineage_end_time, datetime)
            and datetime_approx_equal(
                config.lineage_end_time, datetime.now(tz=timezone.utc)
            )
        )

        # set relative times for lineage_start_time and lineage_end_time
        input_config = {
            **self.minimum_input_config,
            "lineage_start_time": "-3day",
            "lineage_end_time": "now",
        }
        config = HexSourceConfig.parse_obj(input_config)
        assert (
            config.lineage_start_time
            and isinstance(config.lineage_start_time, datetime)
            and datetime_approx_equal(
                config.lineage_start_time,
                datetime.now(tz=timezone.utc) - timedelta(days=3),
            )
        )
        assert (
            config.lineage_end_time
            and isinstance(config.lineage_end_time, datetime)
            and datetime_approx_equal(
                config.lineage_end_time, datetime.now(tz=timezone.utc)
            )
        )
@ -792,3 +792,95 @@ class TestMapper(unittest.TestCase):
        assert (
            dashboard_urn.urn() == "urn:li:dashboard:(hex,test-platform.dashboard_name)"
        )

    def test_dataset_edges(self):
        from datahub.metadata.schema_classes import EdgeClass
        from datahub.metadata.urns import DatasetUrn, SchemaFieldUrn

        mapper = Mapper(
            workspace_name=self.workspace_name,
        )

        # Test with empty list
        edges = mapper._dataset_edges([])
        assert not edges

        # Test with only DatasetUrns
        dataset_urn1 = DatasetUrn(
            platform="snowflake",
            name="test-dataset-1",
        )
        dataset_urn2 = DatasetUrn(
            platform="bigquery",
            name="test-dataset-2",
        )

        edges = mapper._dataset_edges([dataset_urn1, dataset_urn2])
        assert edges and len(edges) == 2
        assert all(isinstance(edge, EdgeClass) for edge in edges)
        assert edges[0].destinationUrn == dataset_urn1.urn()
        assert edges[1].destinationUrn == dataset_urn2.urn()

        # Test with mixed DatasetUrns and SchemaFieldUrns - should filter out SchemaFieldUrns
        schema_field_urn = SchemaFieldUrn(
            parent=dataset_urn1,
            field_path="test.field.path",
        )

        edges = mapper._dataset_edges([dataset_urn1, schema_field_urn, dataset_urn2])
        assert edges and len(edges) == 2  # SchemaFieldUrn should be filtered out
        assert edges[0].destinationUrn == dataset_urn1.urn()
        assert edges[1].destinationUrn == dataset_urn2.urn()

    def test_map_project_with_upstream_datasets(self):
        from datahub.metadata.urns import DatasetUrn, SchemaFieldUrn

        # Create a project with upstream datasets
        dataset_urn1 = DatasetUrn(
            platform="snowflake",
            name="test-dataset-1",
        )
        dataset_urn2 = DatasetUrn(
            platform="bigquery",
            name="test-dataset-2",
        )
        schema_field_urn = SchemaFieldUrn(
            parent=dataset_urn1,
            field_path="test.field.path",
        )

        project = Project(
            id="uuid1",
            title="Test Project With Lineage",
            description="A test project with upstream datasets",
            created_at=self.created_at,
            last_edited_at=self.last_edited_at,
            status=Status(name="Published"),
            creator=Owner(email="creator@example.com"),
            owner=Owner(email="owner@example.com"),
            upstream_datasets=[dataset_urn1, schema_field_urn, dataset_urn2],
        )

        mapper = Mapper(
            workspace_name=self.workspace_name,
            patch_metadata=False,
        )

        work_units = list(mapper.map_project(project))

        dashboard_info_wus = [
            wu for wu in work_units if wu.get_aspect_of_type(DashboardInfoClass)
        ]
        assert len(dashboard_info_wus) == 1
        dashboard_info = dashboard_info_wus[0].get_aspect_of_type(DashboardInfoClass)

        # Verify dataset edges
        assert (
            dashboard_info
            and dashboard_info.datasetEdges
            and len(dashboard_info.datasetEdges) == 2
        )
        edge_urns = [edge.destinationUrn for edge in dashboard_info.datasetEdges]
        assert dataset_urn1.urn() in edge_urns
        assert dataset_urn2.urn() in edge_urns
        assert schema_field_urn.urn() not in edge_urns  # Should be filtered out
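The two tests above pin down the edge-building behaviour: only dataset-level URNs become dashboard edges, and schema-field URNs are dropped. A minimal standalone sketch of that filtering follows; it is illustrative only and not the Mapper._dataset_edges implementation itself:

# Illustrative sketch: keep only DatasetUrn upstreams and wrap them as dashboard edges.
from typing import List, Union

from datahub.metadata.schema_classes import EdgeClass
from datahub.metadata.urns import DatasetUrn, SchemaFieldUrn


def dataset_edges(urns: List[Union[DatasetUrn, SchemaFieldUrn]]) -> List[EdgeClass]:
    return [
        EdgeClass(destinationUrn=urn.urn())
        for urn in urns
        if isinstance(urn, DatasetUrn)  # SchemaFieldUrn entries are filtered out
    ]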
386 metadata-ingestion/tests/unit/hex/test_query_fetcher.py Normal file
@ -0,0 +1,386 @@
import unittest
from datetime import datetime, timedelta
from typing import Dict, Optional, Tuple
from unittest.mock import MagicMock, patch

from datahub.ingestion.source.hex.constants import HEX_PLATFORM_URN
from datahub.ingestion.source.hex.query_fetcher import (
    HexQueryFetcher,
    HexQueryFetcherReport,
    QueryResponse,
)
from datahub.metadata.schema_classes import (
    AuditStampClass,
    QueryPropertiesClass,
    QueryStatementClass,
    QuerySubjectClass,
    QuerySubjectsClass,
)
from datahub.metadata.urns import DatasetUrn, QueryUrn


class TestHexQueryFetcherExtractHexMetadata(unittest.TestCase):
    """Test cases for HexQueryFetcher._extract_hex_metadata method"""

    def setUp(self):
        self.mock_client = MagicMock()
        self.workspace_name = "some-hex-workspace"
        self.start_datetime = datetime(2023, 1, 1)
        self.report = HexQueryFetcherReport()
        self.fetcher = HexQueryFetcher(
            datahub_client=self.mock_client,
            workspace_name=self.workspace_name,
            start_datetime=self.start_datetime,
            end_datetime=self.start_datetime - timedelta(days=1),
            report=self.report,
        )

    def test_extract_hex_metadata_with_matching_workspace(self):
        sql = """
        select *
        from "LONG_TAIL_COMPANIONS"."ANALYTICS"."PET_DETAILS"
        limit 100
        -- Hex query metadata: {"categories": ["Scratchpad"], "cell_type": "SQL", "connection": "Long Tail Companions", "context": "SCHEDULED_RUN", "project_id": "d73da67d-c87b-4dd8-9e7f-b79cb7f822cf", "project_name": "PlayNotebook", "project_url": "https://app.hex.tech/some-hex-workspace/hex/d73da67d-c87b-4dd8-9e7f-b79cb7f822cf/draft/logic?selectedCellId=67c38da0-e631-4005-9750-5bdae2a2ef3f", "status": "In development", "trace_id": "f316f99947454a7e8aff2947f848f73d", "user_email": "alice@mail.com"}
        """

        result = self.fetcher._extract_hex_metadata(sql)
        assert result is not None
        project_id, workspace_name = result
        assert project_id == "d73da67d-c87b-4dd8-9e7f-b79cb7f822cf"
        assert workspace_name == "some-hex-workspace"

    def test_extract_hex_metadata_with_non_matching_workspace(self):
        sql = """
        select *
        from "LONG_TAIL_COMPANIONS"."ANALYTICS"."PET_DETAILS"
        limit 100
        -- Hex query metadata: {"categories": ["Scratchpad"], "cell_type": "SQL", "connection": "Long Tail Companions", "context": "SCHEDULED_RUN", "project_id": "d73da67d-c87b-4dd8-9e7f-b79cb7f822cf", "project_name": "PlayNotebook", "project_url": "https://app.hex.tech/different-workspace/hex/d73da67d-c87b-4dd8-9e7f-b79cb7f822cf/draft/logic?selectedCellId=67c38da0-e631-4005-9750-5bdae2a2ef3f", "status": "In development", "trace_id": "f316f99947454a7e8aff2947f848f73d", "user_email": "alice@mail.com"}
        """

        result = self.fetcher._extract_hex_metadata(sql)
        assert result is not None
        project_id, workspace_name = result
        assert project_id == "d73da67d-c87b-4dd8-9e7f-b79cb7f822cf"
        assert workspace_name == "different-workspace"

    def test_extract_hex_metadata_without_url_returns_none(self):
        # missing project_url
        sql = """
        select *
        from "LONG_TAIL_COMPANIONS"."ANALYTICS"."PET_DETAILS"
        limit 100
        -- Hex query metadata: {"categories": ["Scratchpad"], "cell_type": "SQL", "connection": "Long Tail Companions", "context": "SCHEDULED_RUN", "project_id": "d73da67d-c87b-4dd8-9e7f-b79cb7f822cf", "project_name": "PlayNotebook", "status": "In development", "trace_id": "f316f99947454a7e8aff2947f848f73d", "user_email": "alice@mail.com"}
        """

        result = self.fetcher._extract_hex_metadata(sql)
        assert result is None

    def test_extract_hex_metadata_with_no_metadata(self):
        # no Hex metadata
        sql = """
        select *
        from "LONG_TAIL_COMPANIONS"."ANALYTICS"."PET_DETAILS"
        limit 100
        -- This is a regular comment
        """

        result = self.fetcher._extract_hex_metadata(sql)
        assert result is None

    def test_extract_hex_metadata_with_invalid_json(self):
        # invalid JSON in Hex metadata
        sql = """
        select *
        from "LONG_TAIL_COMPANIONS"."ANALYTICS"."PET_DETAILS"
        limit 100
        -- Hex query metadata: {"categories": ["Scratchpad"], "project_id": "d73da67d-c87b-4dd8-9e7f-b79cb7f822cf", INVALID_JSON}
        """

        result = self.fetcher._extract_hex_metadata(sql)
        assert result is None

    def test_extract_hex_metadata_with_missing_project_id(self):
        # missing project_id
        sql = """
        select *
        from "LONG_TAIL_COMPANIONS"."ANALYTICS"."PET_DETAILS"
        limit 100
        -- Hex query metadata: {"categories": ["Scratchpad"], "cell_type": "SQL", "connection": "Long Tail Companions", "project_url": "https://app.hex.tech/some-hex-workspace/hex/d73da67d-c87b-4dd8-9e7f-b79cb7f822cf/draft/logic"}
        """

        result = self.fetcher._extract_hex_metadata(sql)
        assert result is None

    def test_extract_hex_metadata_with_invalid_url_format_returns_none(self):
        # invalid URL format in project_url
        sql = """
        select *
        from "LONG_TAIL_COMPANIONS"."ANALYTICS"."PET_DETAILS"
        limit 100
        -- Hex query metadata: {"categories": ["Scratchpad"], "project_id": "d73da67d-c87b-4dd8-9e7f-b79cb7f822cf", "project_url": "https://invalid-url-format/d73da67d-c87b-4dd8-9e7f-b79cb7f822cf"}
        """

        result = self.fetcher._extract_hex_metadata(sql)
        assert result is None

    def test_extract_hex_metadata_with_custom_domain(self):
        # custom domain in project_url (single-tenant deployment)
        sql = """
        select *
        from "LONG_TAIL_COMPANIONS"."ANALYTICS"."PET_DETAILS"
        limit 100
        -- Hex query metadata: {"categories": ["Scratchpad"], "cell_type": "SQL", "connection": "Long Tail Companions", "context": "SCHEDULED_RUN", "project_id": "d73da67d-c87b-4dd8-9e7f-b79cb7f822cf", "project_name": "PlayNotebook", "project_url": "https://my-hex-instance.hex.tech/some-hex-workspace/hex/d73da67d-c87b-4dd8-9e7f-b79cb7f822cf/draft/logic", "status": "In development", "trace_id": "f316f99947454a7e8aff2947f848f73d", "user_email": "alice@mail.com"}
        """

        result = self.fetcher._extract_hex_metadata(sql)
        assert result is not None
        project_id, workspace_name = result
        assert project_id == "d73da67d-c87b-4dd8-9e7f-b79cb7f822cf"
        assert workspace_name == "some-hex-workspace"

    def test_extract_hex_metadata_with_http_protocol(self):
        # HTTP protocol (not HTTPS)
        sql = """
        select *
        from "LONG_TAIL_COMPANIONS"."ANALYTICS"."PET_DETAILS"
        limit 100
        -- Hex query metadata: {"categories": ["Scratchpad"], "cell_type": "SQL", "connection": "Long Tail Companions", "context": "SCHEDULED_RUN", "project_id": "d73da67d-c87b-4dd8-9e7f-b79cb7f822cf", "project_name": "PlayNotebook", "project_url": "http://app.hex.tech/some-hex-workspace/hex/d73da67d-c87b-4dd8-9e7f-b79cb7f822cf/draft/logic", "status": "In development", "trace_id": "f316f99947454a7e8aff2947f848f73d", "user_email": "alice@mail.com"}
        """

        result = self.fetcher._extract_hex_metadata(sql)
        assert result is not None
        project_id, workspace_name = result
        assert project_id == "d73da67d-c87b-4dd8-9e7f-b79cb7f822cf"
        assert workspace_name == "some-hex-workspace"

    def test_extract_hex_metadata_with_complex_urls(self):
        # complex workspace names and paths
        urls_to_test = [
            # URL with hyphens in workspace name
            """{"project_id": "123", "project_url": "https://app.hex.tech/my-complex-workspace-name/hex/project-id"}""",
            # URL with underscores
            """{"project_id": "123", "project_url": "https://app.hex.tech/workspace_with_underscores/hex/project-id"}""",
            # URL with special chars in domain
            """{"project_id": "123", "project_url": "https://my-custom-subdomain.hex.tech/some-hex-workspace/hex/project-id"}""",
            # URL with long path after /hex/
            """{"project_id": "123", "project_url": "https://app.hex.tech/some-hex-workspace/hex/project-id/draft/logic?selectedCellId=67c38da0-e631"}""",
        ]

        expected_workspaces = [
            "my-complex-workspace-name",
            "workspace_with_underscores",
            "some-hex-workspace",
            "some-hex-workspace",
        ]

        for i, url_json in enumerate(urls_to_test):
            sql = f"""
            select * from table
            -- Hex query metadata: {url_json}
            """

            result = self.fetcher._extract_hex_metadata(sql)
            assert result is not None, (
                f"Failed to extract metadata from URL: {url_json}"
            )
            project_id, workspace_name = result
            assert project_id == "123"
            assert workspace_name == expected_workspaces[i], (
                f"Expected workspace {expected_workspaces[i]} but got {workspace_name}"
            )


class TestHexQueryFetcherFetch(unittest.TestCase):
    """Test cases for the HexQueryFetcher.fetch method"""

    def setUp(self):
        self.mock_client = MagicMock()
        self.workspace_name = "workspace1"
        self.start_datetime = datetime(2023, 1, 1)
        self.report = HexQueryFetcherReport()

        self.fetcher = HexQueryFetcher(
            datahub_client=self.mock_client,
            workspace_name=self.workspace_name,
            start_datetime=self.start_datetime,
            end_datetime=self.start_datetime - timedelta(days=1),
            report=self.report,
        )

        # valid test data
        self.query_urn_1 = QueryUrn.from_string("urn:li:query:query1")
        self.query_urn_2 = QueryUrn.from_string("urn:li:query:query2")
        self.dataset_urn_1 = DatasetUrn.from_string(
            "urn:li:dataset:(urn:li:dataPlatform:snowflake,table1,PROD)"
        )
        self.dataset_urn_2 = DatasetUrn.from_string(
            "urn:li:dataset:(urn:li:dataPlatform:snowflake,table1,PROD)"
        )
        # self.entities_data matches the return type of HexQueryFetcher._fetch_query_entities
        self.entities_data: Dict[
            QueryUrn,
            Tuple[Optional[QueryPropertiesClass], Optional[QuerySubjectsClass]],
        ] = {
            self.query_urn_1: (
                QueryPropertiesClass(
                    created=AuditStampClass._construct_with_defaults(),
                    lastModified=AuditStampClass._construct_with_defaults(),
                    statement=QueryStatementClass(
                        value="""SELECT * FROM table -- Hex query metadata: {"project_id": "project1", "project_url": "https://app.hex.tech/workspace1/hex/project1"}"""
                    ),
                    source=HEX_PLATFORM_URN.urn(),
                ),
                QuerySubjectsClass(
                    subjects=[
                        QuerySubjectClass(entity=self.dataset_urn_1.urn()),
                        QuerySubjectClass(entity=self.dataset_urn_2.urn()),
                    ]
                ),
            ),
            self.query_urn_2: (
                QueryPropertiesClass(
                    created=AuditStampClass._construct_with_defaults(),
                    lastModified=AuditStampClass._construct_with_defaults(),
                    statement=QueryStatementClass(
                        value="""SELECT * FROM table -- Hex query metadata: {"project_id": "project2", "project_url": "https://app.hex.tech/workspace1/hex/project2"}"""
                    ),
                    source=HEX_PLATFORM_URN.urn(),
                ),
                QuerySubjectsClass(
                    subjects=[QuerySubjectClass(entity=self.dataset_urn_1.urn())]
                ),
            ),
        }

    @patch(
        "datahub.ingestion.source.hex.query_fetcher.HexQueryFetcher._fetch_query_urns_filter_hex_and_last_modified"
    )
    @patch(
        "datahub.ingestion.source.hex.query_fetcher.HexQueryFetcher._fetch_query_entities"
    )
    def test_fetch_with_valid_data(
        self, mock_fetch_query_entities, mock_fetch_query_urns
    ):
        mock_fetch_query_urns.return_value = [self.query_urn_1]
        mock_fetch_query_entities.return_value = self.entities_data

        results = list(self.fetcher.fetch())

        assert len(results) == 2
        assert all(isinstance(qr, QueryResponse) for qr in results)
        assert results[0].urn == self.query_urn_1
        assert results[0].hex_project_id == "project1"
        assert results[0].dataset_subjects == [self.dataset_urn_1, self.dataset_urn_2]
        assert results[1].urn == self.query_urn_2
        assert results[1].hex_project_id == "project2"
        assert results[1].dataset_subjects == [self.dataset_urn_1]

    @patch(
        "datahub.ingestion.source.hex.query_fetcher.HexQueryFetcher._fetch_query_urns_filter_hex_and_last_modified"
    )
    @patch(
        "datahub.ingestion.source.hex.query_fetcher.HexQueryFetcher._fetch_query_entities"
    )
    def test_fetch_with_missing_hex_query_metadata(
        self, mock_fetch_query_entities, mock_fetch_query_urns
    ):
        # force fail in query_urn_2
        self.entities_data[self.query_urn_2][0].statement.value = (  # type: ignore
            "SELECT * FROM table -- IT'S MISSING HERE"
        )
        mock_fetch_query_urns.return_value = [self.query_urn_1]
        mock_fetch_query_entities.return_value = self.entities_data

        results = list(self.fetcher.fetch())

        assert len(results) == 1
        assert all(isinstance(qr, QueryResponse) for qr in results)
        assert results[0].urn == self.query_urn_1
        assert results[0].hex_project_id == "project1"
        assert results[0].dataset_subjects == [self.dataset_urn_1, self.dataset_urn_2]

    @patch(
        "datahub.ingestion.source.hex.query_fetcher.HexQueryFetcher._fetch_query_urns_filter_hex_and_last_modified"
    )
    @patch(
        "datahub.ingestion.source.hex.query_fetcher.HexQueryFetcher._fetch_query_entities"
    )
    def test_fetch_with_missing_not_matching_workspace(
        self, mock_fetch_query_entities, mock_fetch_query_urns
    ):
        # force not match in query_urn_2
        self.entities_data[self.query_urn_2][0].statement.value = (  # type: ignore
            """SELECT * FROM table -- Hex query metadata: {"project_id": "project1", "project_url": "https://app.hex.tech/YET_ANOTHER_WORKSPACE/hex/project1"}"""
        )
        mock_fetch_query_urns.return_value = [self.query_urn_1]
        mock_fetch_query_entities.return_value = self.entities_data

        results = list(self.fetcher.fetch())

        assert len(results) == 1
        assert all(isinstance(qr, QueryResponse) for qr in results)
        assert results[0].urn == self.query_urn_1
        assert results[0].hex_project_id == "project1"
        assert results[0].dataset_subjects == [self.dataset_urn_1, self.dataset_urn_2]

    @patch(
        "datahub.ingestion.source.hex.query_fetcher.HexQueryFetcher._fetch_query_urns_filter_hex_and_last_modified"
    )
    @patch(
        "datahub.ingestion.source.hex.query_fetcher.HexQueryFetcher._fetch_query_entities"
    )
    def test_fetch_with_no_subjects(
        self, mock_fetch_query_entities, mock_fetch_query_urns
    ):
        # force no subjects query_urn_2
        self.entities_data[self.query_urn_2][1].subjects = []  # type: ignore
        mock_fetch_query_urns.return_value = [self.query_urn_1]
        mock_fetch_query_entities.return_value = self.entities_data

        results = list(self.fetcher.fetch())

        assert len(results) == 1
        assert all(isinstance(qr, QueryResponse) for qr in results)
        assert results[0].urn == self.query_urn_1
        assert results[0].hex_project_id == "project1"
        assert results[0].dataset_subjects == [self.dataset_urn_1, self.dataset_urn_2]

    @patch(
        "datahub.ingestion.source.hex.query_fetcher.HexQueryFetcher._fetch_query_urns_filter_hex_and_last_modified"
    )
    def test_fetch_with_no_query_urns_found(self, mock_fetch_query_urns):
        mock_fetch_query_urns.return_value = []

        results = list(self.fetcher.fetch())

        assert len(results) == 0

    @patch(
        "datahub.ingestion.source.hex.query_fetcher.HexQueryFetcher._fetch_query_urns_filter_hex_and_last_modified"
    )
    @patch(
        "datahub.ingestion.source.hex.query_fetcher.HexQueryFetcher._fetch_query_entities"
    )
    def test_fetch_query_entities_fail(
        self, mock_fetch_query_entities, mock_fetch_query_urns
    ):
        mock_fetch_query_urns.return_value = [self.query_urn_1]
        mock_fetch_query_entities.side_effect = Exception(
            "Failed to fetch query entities"
        )

        results = list(self.fetcher.fetch())

        assert len(results) == 0
        assert self.report.errors == 1

    @patch(
        "datahub.ingestion.source.hex.query_fetcher.HexQueryFetcher._fetch_query_urns_filter_hex_and_last_modified"
    )
    def test_fetch_query_urns_fail(self, mock_fetch_query_urns):
        mock_fetch_query_urns.side_effect = Exception("Failed to fetch query urns")

        results = list(self.fetcher.fetch())

        assert len(results) == 0
        assert self.report.errors == 1