mirror of https://github.com/datahub-project/datahub.git
synced 2025-10-11 08:54:00 +00:00

feat(ingest): new hex connector - part 2 (#12985)

This commit is contained in:
parent 7618af549c
commit d2bb33f7c5
@@ -20,4 +20,12 @@ Currently, the [Hex API](https://learn.hex.tech/docs/api/api-reference) has some

2. **Metadata Access**: There is no direct method to retrieve metadata for Collections, Status, or Categories. This information is only available indirectly through references within Projects and Components.

Please keep these limitations in mind when working with the Hex connector.

For the Dataset - Hex Project lineage, the connector relies on the [_Hex query metadata_](https://learn.hex.tech/docs/explore-data/cells/sql-cells/sql-cells-introduction#query-metadata) feature. Therefore, in order to extract lineage information, the required setup must include the following (see the example recipe after this list):

- A separate warehouse ingestion source (e.g. BigQuery, Snowflake, Redshift, ...) with `use_queries_v2` enabled so that Queries are fetched. This ingests the queries into DataHub as `Query` entities, and those triggered by Hex include the corresponding _Hex query metadata_.
- A DataHub server with version >= SaaS `0.3.10` or > OSS `1.0.0`, so that `Query` entities are properly indexed by source (Hex in this case) and can be fetched and processed by the Hex ingestor to emit the Dataset - Project lineage.
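For illustration only, a programmatic recipe with lineage enabled might look like the sketch below; the workspace name, token, and server values are placeholders, and the exact sink setup depends on your deployment:

```python
# Illustrative sketch: running the Hex source with lineage enabled.
# All values below are placeholders, not part of the connector docs.
from datahub.ingestion.run.pipeline import Pipeline

pipeline = Pipeline.create(
    {
        "pipeline_name": "hex-with-lineage",
        "datahub_api": {"server": "http://localhost:8080"},  # used to fetch Query entities
        "source": {
            "type": "hex",
            "config": {
                "workspace_name": "my-workspace",
                "token": "<hex-api-token>",
                "include_lineage": True,
                "lineage_start_time": "-7d",  # optional window for the Query search
            },
        },
        "sink": {"type": "datahub-rest", "config": {"server": "http://localhost:8080"}},
    }
)
pipeline.run()
pipeline.raise_from_status()
```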
@@ -1,3 +1,8 @@
from datahub.metadata.urns import DataPlatformUrn

HEX_PLATFORM_NAME = "hex"
HEX_PLATFORM_URN = DataPlatformUrn(platform_name=HEX_PLATFORM_NAME)
HEX_API_BASE_URL_DEFAULT = "https://app.hex.tech/api/v1"
HEX_API_PAGE_SIZE_DEFAULT = 100

DATAHUB_API_PAGE_SIZE_DEFAULT = 100
@@ -1,9 +1,12 @@
from dataclasses import dataclass
from datetime import datetime, timedelta, timezone
from typing import Any, Dict, Iterable, List, Optional

from pydantic import Field, SecretStr, root_validator
from typing_extensions import assert_never

from datahub.configuration.common import AllowDenyPattern
from datahub.configuration.datetimes import parse_user_datetime
from datahub.configuration.source_common import (
    EnvConfigMixin,
    PlatformInstanceConfigMixin,
@@ -21,12 +24,17 @@ from datahub.ingestion.api.source import MetadataWorkUnitProcessor
from datahub.ingestion.api.workunit import MetadataWorkUnit
from datahub.ingestion.source.hex.api import HexApi, HexApiReport
from datahub.ingestion.source.hex.constants import (
    DATAHUB_API_PAGE_SIZE_DEFAULT,
    HEX_API_BASE_URL_DEFAULT,
    HEX_API_PAGE_SIZE_DEFAULT,
    HEX_PLATFORM_NAME,
)
from datahub.ingestion.source.hex.mapper import Mapper
from datahub.ingestion.source.hex.model import Component, Project
from datahub.ingestion.source.hex.query_fetcher import (
    HexQueryFetcher,
    HexQueryFetcherReport,
)
from datahub.ingestion.source.state.stale_entity_removal_handler import (
    StaleEntityRemovalHandler,
    StaleEntityRemovalSourceReport,
@@ -34,9 +42,10 @@ from datahub.ingestion.source.state.stale_entity_removal_handler import (
)
from datahub.ingestion.source.state.stateful_ingestion_base import (
    StatefulIngestionConfigBase,
    StatefulIngestionSourceBase,
)
from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
from datahub.sdk.main_client import DataHubClient


class HexSourceConfig(
@@ -93,9 +102,73 @@ class HexSourceConfig(
        default=True,
        description="Set ownership identity from owner/creator email",
    )
    include_lineage: bool = Field(
        default=True,
        description='Include Hex lineage, being fetched from DataHub. See "Limitations" section in the docs for more details about the limitations of this feature.',
    )
    lineage_start_time: Optional[datetime] = Field(
        default=None,
        description="Earliest date of lineage to consider. Default: 1 day before lineage end time. You can specify absolute time like '2023-01-01' or relative time like '-7 days' or '-7d'.",
    )
    lineage_end_time: Optional[datetime] = Field(
        default=None,
        description="Latest date of lineage to consider. Default: Current time in UTC. You can specify absolute time like '2023-01-01' or relative time like '-1 day' or '-1d'.",
    )
    datahub_page_size: int = Field(
        default=DATAHUB_API_PAGE_SIZE_DEFAULT,
        description="Number of items to fetch per DataHub API call.",
    )

    @root_validator(pre=True)
    def validate_lineage_times(cls, data: Dict[str, Any]) -> Dict[str, Any]:
        # lineage_end_time default = now
        if "lineage_end_time" not in data or data["lineage_end_time"] is None:
            data["lineage_end_time"] = datetime.now(tz=timezone.utc)
        # if string is given, parse it
        if isinstance(data["lineage_end_time"], str):
            data["lineage_end_time"] = parse_user_datetime(data["lineage_end_time"])
        # if no timezone is given, assume UTC
        if data["lineage_end_time"].tzinfo is None:
            data["lineage_end_time"] = data["lineage_end_time"].replace(
                tzinfo=timezone.utc
            )
        # at this point, we ensure there is a non null datetime with UTC timezone for lineage_end_time
        assert (
            data["lineage_end_time"]
            and isinstance(data["lineage_end_time"], datetime)
            and data["lineage_end_time"].tzinfo is not None
            and data["lineage_end_time"].tzinfo == timezone.utc
        )

        # lineage_start_time default = lineage_end_time - 1 day
        if "lineage_start_time" not in data or data["lineage_start_time"] is None:
            data["lineage_start_time"] = data["lineage_end_time"] - timedelta(days=1)
        # if string is given, parse it
        if isinstance(data["lineage_start_time"], str):
            data["lineage_start_time"] = parse_user_datetime(data["lineage_start_time"])
        # if no timezone is given, assume UTC
        if data["lineage_start_time"].tzinfo is None:
            data["lineage_start_time"] = data["lineage_start_time"].replace(
                tzinfo=timezone.utc
            )
        # at this point, we ensure there is a non null datetime with UTC timezone for lineage_start_time
        assert (
            data["lineage_start_time"]
            and isinstance(data["lineage_start_time"], datetime)
            and data["lineage_start_time"].tzinfo is not None
            and data["lineage_start_time"].tzinfo == timezone.utc
        )

        return data

@dataclass
class HexReport(
    StaleEntityRemovalSourceReport,
    HexApiReport,
    IngestionStageReport,
    HexQueryFetcherReport,
):
    pass

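# Illustrative usage sketch, not part of this commit: the root validator above
# fills the lineage window defaults and normalizes everything to UTC. The
# values below are placeholders; behavior mirrors the unit tests further down.
_example = HexSourceConfig.parse_obj(
    {
        "workspace_name": "example-workspace",
        "token": "example-token",
        "lineage_start_time": "-7d",  # relative times are parsed by parse_user_datetime
    }
)
assert _example.lineage_end_time is not None  # defaults to "now" in UTC
assert _example.lineage_start_time is not None  # here, 7 days before the end time
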
@@ -110,7 +183,7 @@ class HexSource(StatefulIngestionSourceBase):
    def __init__(self, config: HexSourceConfig, ctx: PipelineContext):
        super().__init__(config, ctx)
        self.source_config = config
        self.report: HexReport = HexReport()
        self.platform = HEX_PLATFORM_NAME
        self.hex_api = HexApi(
            report=self.report,
@@ -129,6 +202,28 @@ class HexSource(StatefulIngestionSourceBase):
            categories_as_tags=self.source_config.categories_as_tags,
            set_ownership_from_email=self.source_config.set_ownership_from_email,
        )
        self.project_registry: Dict[str, Project] = {}
        self.component_registry: Dict[str, Component] = {}

        self.datahub_client: Optional[DataHubClient] = None
        self.query_fetcher: Optional[HexQueryFetcher] = None
        if self.source_config.include_lineage:
            graph = ctx.require_graph("Lineage")
            assert self.source_config.lineage_start_time and isinstance(
                self.source_config.lineage_start_time, datetime
            )
            assert self.source_config.lineage_end_time and isinstance(
                self.source_config.lineage_end_time, datetime
            )
            self.datahub_client = DataHubClient(graph=graph)
            self.query_fetcher = HexQueryFetcher(
                datahub_client=self.datahub_client,
                workspace_name=self.source_config.workspace_name,
                start_datetime=self.source_config.lineage_start_time,
                end_datetime=self.source_config.lineage_end_time,
                report=self.report,
                page_size=self.source_config.datahub_page_size,
            )

    @classmethod
    def create(cls, config_dict: Dict[str, Any], ctx: PipelineContext) -> "HexSource":
@@ -143,25 +238,58 @@ class HexSource(StatefulIngestionSourceBase):
            ).workunit_processor,
        ]

    def get_report(self) -> HexReport:
        return self.report

    def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
        with self.report.new_stage("Fetch Hex assets from Hex API"):
            for project_or_component in self.hex_api.fetch_projects():
                if isinstance(project_or_component, Project):
                    if self.source_config.project_title_pattern.allowed(
                        project_or_component.title
                    ):
                        self.project_registry[project_or_component.id] = (
                            project_or_component
                        )
                elif isinstance(project_or_component, Component):
                    if (
                        self.source_config.include_components
                        and self.source_config.component_title_pattern.allowed(
                            project_or_component.title
                        )
                    ):
                        self.component_registry[project_or_component.id] = (
                            project_or_component
                        )
                else:
                    assert_never(project_or_component)

        if self.source_config.include_lineage:
            assert self.datahub_client and self.query_fetcher

            with self.report.new_stage(
                "Fetch Hex lineage from existing Queries in DataHub"
            ):
                for query_metadata in self.query_fetcher.fetch():
                    project = self.project_registry.get(query_metadata.hex_project_id)
                    if project:
                        project.upstream_datasets.extend(
                            query_metadata.dataset_subjects
                        )
                        project.upstream_schema_fields.extend(
                            query_metadata.schema_field_subjects
                        )
                    else:
                        self.report.report_warning(
                            title="Missing project for lineage",
                            message="Lineage missed because missed project, likely due to filter patterns or deleted project.",
                            context=str(query_metadata),
                        )

        with self.report.new_stage("Emit"):
            yield from self.mapper.map_workspace()

            for project in self.project_registry.values():
                yield from self.mapper.map_project(project=project)
            for component in self.component_registry.values():
                yield from self.mapper.map_component(component=component)
@@ -1,6 +1,6 @@
import logging
from datetime import datetime
from typing import Iterable, List, Optional, Tuple, Union

from datahub._codegen.aspect import (
    _Aspect,  # TODO: is there a better import than this one?
@@ -46,6 +46,7 @@ from datahub.metadata.schema_classes import (
    DashboardInfoClass,
    DashboardUsageStatisticsClass,
    DataPlatformInstanceClass,
    EdgeClass,
    GlobalTagsClass,
    OwnerClass,
    OwnershipClass,
@@ -53,7 +54,14 @@ from datahub.metadata.schema_classes import (
    TagAssociationClass,
    TimeWindowSizeClass,
)
from datahub.metadata.urns import (
    ContainerUrn,
    CorpUserUrn,
    DashboardUrn,
    DatasetUrn,
    SchemaFieldUrn,
    Urn,
)

logger = logging.getLogger(__name__)

@@ -116,6 +124,8 @@ class Mapper:
            ),
            externalUrl=f"{self._base_url}/{self._workspace_name}/hex/{project.id}",
            customProperties=dict(id=project.id),
            datasetEdges=self._dataset_edges(project.upstream_datasets),
            # TODO: support schema field upstream, maybe InputFields?
        )

        subtypes = SubTypesClass(
@@ -343,6 +353,22 @@ class Mapper:
            else None,
        )

    def _dataset_edges(
        self, upstream: List[Union[DatasetUrn, SchemaFieldUrn]]
    ) -> Optional[List[EdgeClass]]:
        # TBC: is there support for CLL in Dashboards? for the moment, skip SchemaFieldUrns
        return (
            [
                EdgeClass(
                    destinationUrn=upstream_urn.urn(),
                )
                for upstream_urn in upstream
                if isinstance(upstream_urn, DatasetUrn)
            ]
            if upstream
            else None
        )

    def _yield_mcps(
        self, entity_urn: Urn, aspects: List[Optional[_Aspect]]
    ) -> Iterable[MetadataWorkUnit]:
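# Illustrative sketch, not part of this commit: _dataset_edges above keeps only
# DatasetUrn upstreams as DashboardInfo datasetEdges and drops SchemaFieldUrn
# entries (no column-level lineage on dashboards yet). Names are placeholders;
# see test_dataset_edges in the unit tests further below.
_ds = DatasetUrn(platform="snowflake", name="db.schema.table")
_sf = SchemaFieldUrn(parent=_ds, field_path="some_column")
_edges = Mapper(workspace_name="example-workspace")._dataset_edges([_ds, _sf])
assert _edges is not None and len(_edges) == 1
assert _edges[0].destinationUrn == _ds.urn()
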
@@ -1,6 +1,8 @@
from dataclasses import dataclass, field
from datetime import datetime
from typing import List, Optional, Union

from datahub.metadata.urns import DatasetUrn, SchemaFieldUrn


@dataclass
@@ -51,6 +53,12 @@ class Project:
    creator: Optional[Owner] = None
    owner: Optional[Owner] = None
    analytics: Optional[Analytics] = None
    upstream_datasets: List[Union[DatasetUrn, SchemaFieldUrn]] = field(
        default_factory=list
    )
    upstream_schema_fields: List[Union[DatasetUrn, SchemaFieldUrn]] = field(
        default_factory=list
    )


@dataclass
@@ -0,0 +1,297 @@
import logging
import re
from dataclasses import dataclass, field
from datetime import datetime
from typing import Dict, Iterable, List, Optional, Tuple

from datahub.ingestion.api.source import SourceReport
from datahub.ingestion.source.hex.constants import (
    DATAHUB_API_PAGE_SIZE_DEFAULT,
    HEX_PLATFORM_URN,
)
from datahub.metadata.schema_classes import QueryPropertiesClass, QuerySubjectsClass
from datahub.metadata.urns import DatasetUrn, QueryUrn, SchemaFieldUrn
from datahub.sdk.main_client import DataHubClient
from datahub.sdk.search_filters import FilterDsl as F
from datahub.utilities.time import datetime_to_ts_millis

logger = logging.getLogger(__name__)

# Pattern to extract both project_id and workspace_name from Hex metadata in SQL comments
HEX_METADATA_PATTERN = r'-- Hex query metadata: \{.*?"project_id": "([^"]+)".*?"project_url": "https?://[^/]+/([^/]+)/hex/.*?\}'
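# Illustrative sketch, not part of this commit: what the pattern captures from a
# single-line Hex metadata comment (sample values taken from the docstring and
# test fixtures below).
_sample = (
    '-- Hex query metadata: {"project_id": "d73da67d-c87b-4dd8-9e7f-b79cb7f822cf", '
    '"project_url": "https://app.hex.tech/acryl-partnership/hex/d73da67d-c87b-4dd8-9e7f-b79cb7f822cf/draft/logic"}'
)
_m = re.search(HEX_METADATA_PATTERN, _sample)
assert _m is not None
assert _m.group(1) == "d73da67d-c87b-4dd8-9e7f-b79cb7f822cf"  # project_id
assert _m.group(2) == "acryl-partnership"  # workspace name parsed from project_url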

@dataclass
class QueryResponse:
    """This is the public response model for the HexQueryFetcher."""

    urn: QueryUrn
    hex_project_id: str
    dataset_subjects: List[DatasetUrn] = field(default_factory=list)
    schema_field_subjects: List[SchemaFieldUrn] = field(default_factory=list)


@dataclass
class HexQueryFetcherReport(SourceReport):
    start_datetime: Optional[datetime] = None
    end_datetime: Optional[datetime] = None
    fetched_query_urns: int = 0
    fetched_query_objects: int = 0
    filtered_out_queries_missing_metadata: int = 0
    filtered_out_queries_different_workspace: int = 0
    filtered_out_queries_no_subjects: int = 0
    total_queries: int = 0
    total_dataset_subjects: int = 0
    total_schema_field_subjects: int = 0
    num_calls_fetch_query_entities: int = 0


class HexQueryFetcher:
    def __init__(
        self,
        datahub_client: DataHubClient,
        workspace_name: str,
        start_datetime: datetime,
        end_datetime: datetime,
        report: HexQueryFetcherReport,
        page_size: int = DATAHUB_API_PAGE_SIZE_DEFAULT,
    ):
        self.datahub_client = datahub_client
        self.workspace_name = workspace_name
        self.start_datetime = start_datetime
        self.end_datetime = end_datetime
        self.report = report
        self.page_size = page_size

        self.report.start_datetime = start_datetime
        self.report.end_datetime = end_datetime

    def fetch(self) -> Iterable[QueryResponse]:
        try:
            query_urns = self._fetch_query_urns_filter_hex_and_last_modified()
            assert all(isinstance(urn, QueryUrn) for urn in query_urns)
            self.report.fetched_query_urns = len(query_urns)

            entities_by_urn = self._fetch_query_entities(query_urns)
            self.report.fetched_query_objects = len(entities_by_urn)
        except Exception as e:
            self.report.failure(
                title="Error fetching Queries for lineage",
                message="Error fetching Queries will result on missing lineage",
                context=str(
                    dict(
                        workspace_name=self.workspace_name,
                        start_datetime=self.start_datetime,
                        end_datetime=self.end_datetime,
                    )
                ),
                exc=e,
            )
        else:
            if not query_urns or not entities_by_urn:
                self.report.warning(
                    title="No Queries found with Hex as origin",
                    message="No lineage because of no Queries found with Hex as origin in the given time range; you may consider extending the time range to fetch more queries.",
                    context=str(
                        dict(
                            workspace_name=self.workspace_name,
                            start_datetime=self.start_datetime,
                            end_datetime=self.end_datetime,
                        )
                    ),
                )
                return

            for query_urn, (
                query_properties,
                query_subjects,
            ) in entities_by_urn.items():
                maybe_query_response = self._build_query_response(
                    query_urn=query_urn,
                    query_properties=query_properties,
                    query_subjects=query_subjects,
                )
                if maybe_query_response:
                    yield maybe_query_response

    def _fetch_query_entities(
        self, query_urns: List[QueryUrn]
    ) -> Dict[
        QueryUrn, Tuple[Optional[QueryPropertiesClass], Optional[QuerySubjectsClass]]
    ]:
        entities_by_urn: Dict[
            QueryUrn,
            Tuple[Optional[QueryPropertiesClass], Optional[QuerySubjectsClass]],
        ] = {}
        for i in range(0, len(query_urns), self.page_size):
            batch = query_urns[i : i + self.page_size]

            logger.debug(f"Fetching query entities for {len(batch)} queries: {batch}")
            entities = self.datahub_client._graph.get_entities(
                entity_name=QueryUrn.ENTITY_TYPE,
                urns=[urn.urn() for urn in batch],
                aspects=[
                    QueryPropertiesClass.ASPECT_NAME,
                    QuerySubjectsClass.ASPECT_NAME,
                ],
                with_system_metadata=False,
            )
            self.report.num_calls_fetch_query_entities += 1
            logger.debug(f"Get entities response: {entities}")

            for urn, entity in entities.items():
                query_urn = QueryUrn.from_string(urn)

                properties_tuple = entity.get(
                    QueryPropertiesClass.ASPECT_NAME, (None, None)
                )
                query_properties: Optional[QueryPropertiesClass] = None
                if properties_tuple and properties_tuple[0]:
                    assert isinstance(properties_tuple[0], QueryPropertiesClass)
                    query_properties = properties_tuple[0]

                subjects_tuple = entity.get(
                    QuerySubjectsClass.ASPECT_NAME, (None, None)
                )
                query_subjects: Optional[QuerySubjectsClass] = None
                if subjects_tuple and subjects_tuple[0]:
                    assert isinstance(subjects_tuple[0], QuerySubjectsClass)
                    query_subjects = subjects_tuple[0]

                entities_by_urn[query_urn] = (query_properties, query_subjects)

        return entities_by_urn

    def _fetch_query_urns_filter_hex_and_last_modified(self) -> List[QueryUrn]:
        last_modified_start_at_millis = datetime_to_ts_millis(self.start_datetime)
        last_modified_end_at_millis = datetime_to_ts_millis(self.end_datetime)

        urns = self.datahub_client.search.get_urns(
            filter=F.and_(
                F.entity_type(QueryUrn.ENTITY_TYPE),
                F.custom_filter("origin", "EQUAL", [HEX_PLATFORM_URN.urn()]),
                F.custom_filter(
                    "lastModifiedAt",
                    "GREATER_THAN_OR_EQUAL_TO",
                    [str(last_modified_start_at_millis)],
                ),
                F.custom_filter(
                    "lastModifiedAt",
                    "LESS_THAN_OR_EQUAL_TO",
                    [str(last_modified_end_at_millis)],
                ),
            ),
        )
        logger.debug(f"Get URNS by filter: {urns}")
        return [QueryUrn.from_string(urn.urn()) for urn in urns]

    def _extract_hex_metadata(self, sql_statement: str) -> Optional[Tuple[str, str]]:
        """
        Extract project ID and workspace name from SQL statement.

        Looks for Hex metadata in SQL comments in the format:
        -- Hex query metadata: {"project_id": "...", "project_url": "https://app.hex.tech/{workspace_name}/hex/..."}

        Example:
        -- Hex query metadata: {"categories": ["Scratchpad"], "cell_type": "SQL", "connection": "Long Tail Companions", "context": "SCHEDULED_RUN", "project_id": "d73da67d-c87b-4dd8-9e7f-b79cb7f822cf", "project_url": "https://app.hex.tech/acryl-partnership/hex/d73da67d-c87b-4dd8-9e7f-b79cb7f822cf/draft/logic?selectedCellId=67c38da0-e631-4005-9750-5bdae2a2ef3f"}

        # TODO: Consider supporting multiline metadata format in the future:
        # -- Hex query metadata: {
        # --   "categories": ["Scratchpad"],
        # --   "project_id": "d73da67d-c87b-4dd8-9e7f-b79cb7f822cf",
        # --   ...
        # -- }

        Returns:
            A tuple of (project_id, workspace_name) if both are successfully extracted
            None if extraction fails for any reason
        """
        # Extract both project_id and workspace name in a single regex operation
        match = re.search(HEX_METADATA_PATTERN, sql_statement)

        if not match:
            return None

        try:
            project_id = match.group(1)
            workspace_name = match.group(2)
            return project_id, workspace_name
        except (IndexError, AttributeError) as e:
            self.report.warning(
                title="Failed to extract information from Hex query metadata",
                message="Failed to extract information from Hex query metadata will result on missing lineage",
                context=sql_statement,
                exc=e,
            )

        return None

    def _build_query_response(
        self,
        query_urn: QueryUrn,
        query_properties: Optional[QueryPropertiesClass],
        query_subjects: Optional[QuerySubjectsClass],
    ) -> Optional[QueryResponse]:
        # Skip if missing required aspects
        if (
            not query_properties
            or not query_properties.statement
            or not query_properties.statement.value
            or not query_subjects
            or query_subjects.subjects is None  # empty list is allowed
        ):
            logger.debug(
                f"Skipping query {query_urn} - missing required fields: {(query_properties, query_subjects)}"
            )
            self.report.filtered_out_queries_missing_metadata += 1
            return None

        # Extract hex metadata (project_id and workspace_name)
        metadata_result = self._extract_hex_metadata(query_properties.statement.value)
        if not metadata_result:
            logger.debug(f"Skipping query {query_urn} - failed to extract Hex metadata")
            self.report.filtered_out_queries_missing_metadata += 1
            return None

        hex_project_id, workspace_from_url = metadata_result

        # Validate workspace
        if workspace_from_url != self.workspace_name:
            logger.debug(
                f"Skipping query {query_urn} - workspace '{workspace_from_url}' doesn't match '{self.workspace_name}'"
            )
            self.report.filtered_out_queries_different_workspace += 1
            return None

        # Extract subjects
        dataset_subjects: List[DatasetUrn] = []
        schema_field_subjects: List[SchemaFieldUrn] = []
        for subject in query_subjects.subjects:
            if subject.entity and subject.entity.startswith("urn:li:dataset:"):
                dataset_subjects.append(DatasetUrn.from_string(subject.entity))
            elif subject.entity and subject.entity.startswith("urn:li:schemaField:"):
                schema_field_subjects.append(SchemaFieldUrn.from_string(subject.entity))

        if not dataset_subjects and not schema_field_subjects:
            self.report.filtered_out_queries_no_subjects += 1
            return None

        # Create response
        response = QueryResponse(
            urn=query_urn,
            hex_project_id=hex_project_id,
            dataset_subjects=dataset_subjects,
            schema_field_subjects=schema_field_subjects,
        )
        logger.debug(
            f"Succesfully extracted {len(dataset_subjects)} dataset subjects and {len(schema_field_subjects)} schema field subjects for query {query_urn}: {dataset_subjects} {schema_field_subjects}"
        )
        self.report.total_queries += 1
        self.report.total_dataset_subjects += len(dataset_subjects)
        self.report.total_schema_field_subjects += len(schema_field_subjects)

        logger.debug(
            f"Processed query {query_urn} with Hex project ID {hex_project_id}"
        )

        return response
@@ -0,0 +1,47 @@
[
  {
    "urn": "urn:li:query:307d80903ebbc165944c52ae79efaeb1736f9bb37d9f8fa48fc0af69d725413f",
    "queryKey": {
      "value": {
        "id": "307d80903ebbc165944c52ae79efaeb1736f9bb37d9f8fa48fc0af69d725413f"
      }
    },
    "querySubjects": {
      "value": {
        "subjects": [
          {
            "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,long_tail_companions.analytics.pet_details,PROD)"
          },
          {
            "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,long_tail_companions.analytics.pet_orders,PROD)"
          },
          {
            "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,long_tail_companions.analytics.pet_details,PROD),age_m)"
          },
          {
            "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,long_tail_companions.analytics.pet_details,PROD),age_y)"
          }
        ]
      }
    },
    "queryProperties": {
      "value": {
        "statement": {
          "value": "select * from \"LONG_TAIL_COMPANIONS\".\"ANALYTICS\".\"PET_DETAILS\" left outer join \"LONG_TAIL_COMPANIONS\".\"ANALYTICS\".\"PET_ORDERS\" limit 100\n-- Hex query metadata: {\"categories\": [\"Scratchpad\"], \"cell_type\": \"SQL\", \"connection\": \"Long Tail Companions\", \"context\": \"SCHEDULED_RUN\", \"project_id\": \"d73da67d-c87b-4dd8-9e7f-b79cb7f822cf\", \"project_name\": \"PlayNotebook\", \"project_url\": \"https://app.hex.tech/some-hex-workspace/hex/d73da67d-c87b-4dd8-9e7f-b79cb7f822cf/draft/logic?selectedCellId=2520f866-8bb6-43dc-8d52-8f522a938b9c\", \"status\": \"In development\", \"trace_id\": \"0195cd3713e6700088f1535fa4c874a5\", \"user_email\": \"alice@email.com\"}",
          "language": "SQL"
        },
        "customProperties": {},
        "source": "SYSTEM",
        "lastModified": {
          "actor": "urn:li:corpuser:sf_hex_user",
          "time": 1742904697868
        },
        "created": {
          "actor": "urn:li:corpuser:sf_hex_user",
          "time": 1742904697868
        },
        "origin": "urn:li:dataPlatform:hex"
      }
    }
  }
]
@@ -0,0 +1,44 @@
[
  {
    "urn": "urn:li:query:87fe9c2def1a5b7932ec9f12c4b55a56dcae6a29fa1d5b78902e34e73abcd123",
    "queryKey": {
      "value": {
        "id": "87fe9c2def1a5b7932ec9f12c4b55a56dcae6a29fa1d5b78902e34e73abcd123"
      }
    },
    "querySubjects": {
      "value": {
        "subjects": [
          {
            "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,long_tail_companions.analytics.visit_core,PROD)"
          },
          {
            "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,long_tail_companions.analytics.visit_details,PROD)"
          },
          {
            "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,long_tail_companions.analytics.visit_more_details,PROD)"
          }
        ]
      }
    },
    "queryProperties": {
      "value": {
        "statement": {
          "value": "select * from \"LONG_TAIL_COMPANIONS\".\"ANALYTICS\".\"VISIT_core\" join \"LONG_TAIL_COMPANIONS\".\"ANALYTICS\".\"VISIT_DETAILS\" join \"LONG_TAIL_COMPANIONS\".\"ANALYTICS\".\"VISIT_MORE_DETAILS\" limit 100\n-- Hex query metadata: {\"categories\": [\"Scratchpad\"], \"cell_type\": \"SQL\", \"connection\": \"Long Tail Companions\", \"context\": \"SCHEDULED_RUN\", \"project_id\": \"2ef730de-25ec-4131-94af-3517e743a738\", \"project_name\": \"Welcome to Hex!\", \"project_url\": \"https://app.hex.tech/some-hex-workspace/hex/2ef730de-25ec-4131-94af-3517e743a738/draft/logic?selectedCellId=3630g977-9cc7-54ed-9e63-9g633b049c1e\", \"status\": \"In development\", \"trace_id\": \"0195cd3713e6700088f1535fa4c874a6\", \"user_email\": \"bob@email.com\"}",
          "language": "SQL"
        },
        "customProperties": {},
        "source": "SYSTEM",
        "lastModified": {
          "actor": "urn:li:corpuser:sf_hex_user",
          "time": 1742904697969
        },
        "created": {
          "actor": "urn:li:corpuser:sf_hex_user",
          "time": 1742904697969
        },
        "origin": "urn:li:dataPlatform:hex"
      }
    }
  }
]
@@ -0,0 +1,14 @@
{
  "data": {
    "scrollAcrossEntities": {
      "nextScrollId": "page_2_scroll_id",
      "searchResults": [
        {
          "entity": {
            "urn": "urn:li:query:307d80903ebbc165944c52ae79efaeb1736f9bb37d9f8fa48fc0af69d725413f"
          }
        }
      ]
    }
  }
}
@@ -0,0 +1,14 @@
{
  "data": {
    "scrollAcrossEntities": {
      "nextScrollId": null,
      "searchResults": [
        {
          "entity": {
            "urn": "urn:li:query:87fe9c2def1a5b7932ec9f12c4b55a56dcae6a29fa1d5b78902e34e73abcd123"
          }
        }
      ]
    }
  }
}
@@ -7,10 +7,27 @@ services:
      - "8000:8000"
    volumes:
      - ./hex_projects_response.json:/app/hex_projects_response.json
      - ./mock_hex_server.py:/app/mock_hex_server.py
    command: ["python", "/app/mock_hex_server.py"]
    healthcheck:
      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:8000/health"]
      interval: 5s
      timeout: 5s
      retries: 3
  datahub-mock-api:
    image: python:3.9-alpine
    container_name: datahub-mock-api
    ports:
      - "8010:8010"
    volumes:
      - ./datahub_entities_v3_page1.json:/app/datahub_entities_v3_page1.json
      - ./datahub_entities_v3_page2.json:/app/datahub_entities_v3_page2.json
      - ./datahub_get_urns_by_filter_page1.json:/app/datahub_get_urns_by_filter_page1.json
      - ./datahub_get_urns_by_filter_page2.json:/app/datahub_get_urns_by_filter_page2.json
      - ./mock_datahub_server.py:/app/mock_datahub_server.py
    command: ["python", "/app/mock_datahub_server.py"]
    healthcheck:
      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:8010/health"]
      interval: 5s
      timeout: 5s
      retries: 3
metadata-ingestion/tests/integration/hex/docker/mock_datahub_server.py (new executable file, 118 lines)
@@ -0,0 +1,118 @@
#!/usr/bin/env python3
"""
Mock DataHub server that handles both GET and POST requests and supports pagination
"""

import http.server
import json
import socketserver
from http import HTTPStatus
from urllib.parse import urlparse

PORT = 8010

# Load the mock response data
with open("/app/datahub_entities_v3_page1.json", "r") as f:
    ENTITIES_V3_PAGE1_RESPONSE = f.read()

with open("/app/datahub_entities_v3_page2.json", "r") as f:
    ENTITIES_V3_PAGE2_RESPONSE = f.read()

with open("/app/datahub_get_urns_by_filter_page1.json", "r") as f:
    URNS_BY_FILTER_PAGE1_RESPONSE = f.read()

with open("/app/datahub_get_urns_by_filter_page2.json", "r") as f:
    URNS_BY_FILTER_PAGE2_RESPONSE = f.read()

# Global state flag to track if first page has been requested
FIRST_ENTITIES_PAGE_REQUESTED = False


class MockDataHubAPIHandler(http.server.SimpleHTTPRequestHandler):
    # Global state flag to track if first page has been requested across all instances; one instance per request
    first_entities_page_requested = False

    def do_GET(self):
        parsed_url = urlparse(self.path)
        path = parsed_url.path

        # Health check endpoint
        if path == "/health":
            self.send_response(HTTPStatus.OK)
            self.send_header("Content-type", "text/plain")
            self.end_headers()
            self.wfile.write(b"OK")
            return

        # Mock DataHub API endpoints
        if path.startswith("/config"):
            self.send_response(HTTPStatus.OK)
            self.send_header("Content-type", "application/json")
            self.send_header("Access-Control-Allow-Origin", "*")
            self.end_headers()
            self.wfile.write(json.dumps(dict(noCode="true")).encode())
            return

        # Default 404 response
        self.send_response(HTTPStatus.NOT_FOUND)
        self.send_header("Content-type", "application/json")
        self.end_headers()
        self.wfile.write(json.dumps({"error": "Not found", "path": self.path}).encode())

    def do_POST(self):
        parsed_url = urlparse(self.path)
        path = parsed_url.path

        # Get request body
        content_length = int(self.headers["Content-Length"])
        post_data = self.rfile.read(content_length)
        request_body = json.loads(post_data)

        if path == "/openapi/v3/entity/query/batchGet":
            self.send_response(HTTPStatus.OK)
            self.send_header("Content-type", "application/json")
            self.send_header("Access-Control-Allow-Origin", "*")
            self.end_headers()

            # Return the appropriate page of entity data in V3 format
            if not MockDataHubAPIHandler.first_entities_page_requested:
                self.wfile.write(ENTITIES_V3_PAGE1_RESPONSE.encode())
                MockDataHubAPIHandler.first_entities_page_requested = True
            else:
                self.wfile.write(ENTITIES_V3_PAGE2_RESPONSE.encode())
            return

        if path == "/api/graphql":
            self.send_response(HTTPStatus.OK)
            self.send_header("Content-type", "application/json")
            self.send_header("Access-Control-Allow-Origin", "*")
            self.end_headers()

            # Check if this is a scroll query with nextScrollId
            scroll_id = None
            if "variables" in request_body:
                scroll_id = request_body.get("variables", {}).get("scrollId")

            if scroll_id == "page_2_scroll_id":
                self.wfile.write(URNS_BY_FILTER_PAGE2_RESPONSE.encode())
            else:
                self.wfile.write(URNS_BY_FILTER_PAGE1_RESPONSE.encode())
            return

        # Default 404 response
        self.send_response(HTTPStatus.NOT_FOUND)
        self.send_header("Content-type", "application/json")
        self.end_headers()
        self.wfile.write(
            json.dumps(
                {"error": "Not found", "path": self.path, "method": "POST"}
            ).encode()
        )


# Set up the server
handler = MockDataHubAPIHandler
httpd = socketserver.TCPServer(("", PORT), handler)

print(f"Serving mock DataHub API at port {PORT}")
httpd.serve_forever()
(Golden file diff suppressed because it is too large.)

@@ -6,7 +6,6 @@ from datahub.ingestion.run.pipeline import Pipeline
from tests.test_helpers import mce_helpers
from tests.test_helpers.docker_helpers import wait_for_port

FROZEN_TIME = "2025-03-25 12:00:00"

pytestmark = pytest.mark.integration_batch_2
@@ -17,10 +16,10 @@ def test_resources_dir(pytestconfig):
    return pytestconfig.rootpath / "tests/integration/hex"


def is_mock_api_up(port: int) -> bool:
    """Check if the mock API server is up and running"""
    try:
        response = requests.get(f"http://localhost:{port}/health")
        response.raise_for_status()
        return True
    except (requests.RequestException, ConnectionError):
@@ -40,7 +39,14 @@ def hex_mock_api_runner(docker_compose_runner, test_resources_dir):
        "hex-mock-api",
        8000,
        timeout=30,
        checker=lambda: is_mock_api_up(8000),
    )
    wait_for_port(
        docker_services,
        "datahub-mock-api",
        8010,
        timeout=30,
        checker=lambda: is_mock_api_up(8010),
    )
    yield docker_services

@@ -48,7 +54,6 @@ def hex_mock_api_runner(docker_compose_runner, test_resources_dir):
@freeze_time(FROZEN_TIME)
@pytest.mark.integration
def test_hex_ingestion(pytestconfig, hex_mock_api_runner, test_resources_dir, tmp_path):
    # Path for the golden file
    golden_dir = test_resources_dir / "golden"
    golden_path = golden_dir / "hex_mce_golden.json"
@@ -56,14 +61,67 @@ def test_hex_ingestion(pytestconfig, hex_mock_api_runner, test_resources_dir, tm
    # Create the pipeline
    pipeline = Pipeline.create(
        {
            "pipeline_name": "test-hex",
            "source": {
                "type": "hex",
                "config": {
                    "workspace_name": "test-workspace",
                    "token": "test-token",
                    "base_url": "http://localhost:8000/api/v1",  # Mock Hex API URL
                    "platform_instance": "hex_test",
                    "include_lineage": False,
                },
            },
            "sink": {
                "type": "file",
                "config": {
                    "filename": f"{tmp_path}/hex_mces.json",
                },
            },
        }
    )

    # Run the pipeline
    pipeline.run()
    pipeline.raise_from_status()

    # Check against golden file
    mce_helpers.check_golden_file(
        pytestconfig,
        output_path=f"{tmp_path}/hex_mces.json",
        golden_path=golden_path,
        ignore_paths=mce_helpers.IGNORE_PATH_TIMESTAMPS,
    )


@freeze_time(FROZEN_TIME)
@pytest.mark.integration
def test_hex_ingestion_with_lineage(
    pytestconfig, hex_mock_api_runner, test_resources_dir, tmp_path
):
    # Path for the golden file
    golden_dir = test_resources_dir / "golden"
    golden_path = golden_dir / "hex_mce_golden_with_lineage.json"

    # Create the pipeline
    pipeline = Pipeline.create(
        {
            "pipeline_name": "test-hex-with-lineage",
            "datahub_api": {
                "server": "http://localhost:8010",  # Mock DataHub API URL
            },
            "source": {
                "type": "hex",
                "config": {
                    "workspace_name": "some-hex-workspace",
                    "token": "test-token",
                    "base_url": "http://localhost:8000/api/v1",  # Mock Hex API URL
                    "platform_instance": "hex_test",
                    "include_lineage": True,
                    "datahub_page_size": 1,  # Force pagination
                    "stateful_ingestion": {
                        "enabled": False,
                    },
                },
            },
            "sink": {
metadata-ingestion/tests/unit/hex/test_hex.py (new file, 157 lines)
@@ -0,0 +1,157 @@
import unittest
from datetime import datetime, timedelta, timezone

from datahub.ingestion.source.hex.hex import HexSourceConfig


def datetime_approx_equal(
    dt1: datetime, dt2: datetime, tolerance_seconds: int = 5
) -> bool:
    if dt1.tzinfo is None:
        dt1 = dt1.replace(tzinfo=timezone.utc)
    if dt2.tzinfo is None:
        dt2 = dt2.replace(tzinfo=timezone.utc)

    diff = abs((dt1 - dt2).total_seconds())
    return diff <= tolerance_seconds


class TestHexSourceConfig(unittest.TestCase):
    def setUp(self):
        self.minimum_input_config = {
            "workspace_name": "test-workspace",
            "token": "test-token",
        }

    def test_required_fields(self):
        with self.assertRaises(ValueError):
            input_config = {**self.minimum_input_config}
            del input_config["workspace_name"]
            HexSourceConfig.parse_obj(input_config)

        with self.assertRaises(ValueError):
            input_config = {**self.minimum_input_config}
            del input_config["token"]
            HexSourceConfig.parse_obj(input_config)

    def test_minimum_config(self):
        config = HexSourceConfig.parse_obj(self.minimum_input_config)

        assert config
        assert config.workspace_name == "test-workspace"
        assert config.token.get_secret_value() == "test-token"

    def test_lineage_config(self):
        config = HexSourceConfig.parse_obj(self.minimum_input_config)
        assert config and config.include_lineage

        input_config = {**self.minimum_input_config, "include_lineage": False}
        config = HexSourceConfig.parse_obj(input_config)
        assert config and not config.include_lineage

        # default values for lineage_start_time and lineage_end_time
        config = HexSourceConfig.parse_obj(self.minimum_input_config)
        assert (
            config.lineage_start_time
            and isinstance(config.lineage_start_time, datetime)
            and datetime_approx_equal(
                config.lineage_start_time,
                datetime.now(tz=timezone.utc) - timedelta(days=1),
            )
        )
        assert (
            config.lineage_end_time
            and isinstance(config.lineage_end_time, datetime)
            and datetime_approx_equal(
                config.lineage_end_time, datetime.now(tz=timezone.utc)
            )
        )

        # set values for lineage_start_time and lineage_end_time
        input_config = {
            **self.minimum_input_config,
            "lineage_start_time": "2025-03-24 12:00:00",
            "lineage_end_time": "2025-03-25 12:00:00",
        }
        config = HexSourceConfig.parse_obj(input_config)
        assert (
            config.lineage_start_time
            and isinstance(config.lineage_start_time, datetime)
            and datetime_approx_equal(
                config.lineage_start_time,
                datetime(2025, 3, 24, 12, 0, 0, tzinfo=timezone.utc),
            )
        )
        assert (
            config.lineage_end_time
            and isinstance(config.lineage_end_time, datetime)
            and datetime_approx_equal(
                config.lineage_end_time,
                datetime(2025, 3, 25, 12, 0, 0, tzinfo=timezone.utc),
            )
        )

        # set lineage_end_time only
        input_config = {
            **self.minimum_input_config,
            "lineage_end_time": "2025-03-25 12:00:00",
        }
        config = HexSourceConfig.parse_obj(input_config)
        assert (
            config.lineage_start_time
            and isinstance(config.lineage_start_time, datetime)
            and datetime_approx_equal(
                config.lineage_start_time,
                datetime(2025, 3, 25, 12, 0, 0, tzinfo=timezone.utc)
                - timedelta(days=1),
            )
        )
        assert (
            config.lineage_end_time
            and isinstance(config.lineage_end_time, datetime)
            and datetime_approx_equal(
                config.lineage_end_time,
                datetime(2025, 3, 25, 12, 0, 0, tzinfo=timezone.utc),
            )
        )

        # set lineage_start_time only
        input_config = {
            **self.minimum_input_config,
            "lineage_start_time": "2025-03-25 12:00:00",
        }
        config = HexSourceConfig.parse_obj(input_config)
        assert (
            config.lineage_start_time
            and isinstance(config.lineage_start_time, datetime)
            and datetime_approx_equal(
                config.lineage_start_time,
                datetime(2025, 3, 25, 12, 0, 0, tzinfo=timezone.utc),
            )
        )
        assert (
            config.lineage_end_time
            and isinstance(config.lineage_end_time, datetime)
            and datetime_approx_equal(
                config.lineage_end_time, datetime.now(tz=timezone.utc)
            )
        )

        # set relative times for lineage_start_time and lineage_end_time
        input_config = {
            **self.minimum_input_config,
            "lineage_start_time": "-3day",
            "lineage_end_time": "now",
        }
        config = HexSourceConfig.parse_obj(input_config)
        assert (
            config.lineage_start_time
            and isinstance(config.lineage_start_time, datetime)
            and datetime_approx_equal(
                config.lineage_start_time,
                datetime.now(tz=timezone.utc) - timedelta(days=3),
            )
        )
        assert (
            config.lineage_end_time
            and isinstance(config.lineage_end_time, datetime)
            and datetime_approx_equal(
                config.lineage_end_time, datetime.now(tz=timezone.utc)
            )
        )
@ -792,3 +792,95 @@ class TestMapper(unittest.TestCase):
        assert (
            dashboard_urn.urn() == "urn:li:dashboard:(hex,test-platform.dashboard_name)"
        )

    def test_dataset_edges(self):
        from datahub.metadata.schema_classes import EdgeClass
        from datahub.metadata.urns import DatasetUrn, SchemaFieldUrn

        mapper = Mapper(
            workspace_name=self.workspace_name,
        )

        # Test with empty list
        edges = mapper._dataset_edges([])
        assert not edges

        # Test with only DatasetUrns
        dataset_urn1 = DatasetUrn(
            platform="snowflake",
            name="test-dataset-1",
        )
        dataset_urn2 = DatasetUrn(
            platform="bigquery",
            name="test-dataset-2",
        )

        edges = mapper._dataset_edges([dataset_urn1, dataset_urn2])
        assert edges and len(edges) == 2
        assert all(isinstance(edge, EdgeClass) for edge in edges)
        assert edges[0].destinationUrn == dataset_urn1.urn()
        assert edges[1].destinationUrn == dataset_urn2.urn()

        # Test with mixed DatasetUrns and SchemaFieldUrns - should filter out SchemaFieldUrns
        schema_field_urn = SchemaFieldUrn(
            parent=dataset_urn1,
            field_path="test.field.path",
        )

        edges = mapper._dataset_edges([dataset_urn1, schema_field_urn, dataset_urn2])
        assert edges and len(edges) == 2  # SchemaFieldUrn should be filtered out
        assert edges[0].destinationUrn == dataset_urn1.urn()
        assert edges[1].destinationUrn == dataset_urn2.urn()

    def test_map_project_with_upstream_datasets(self):
        from datahub.metadata.urns import DatasetUrn, SchemaFieldUrn

        # Create a project with upstream datasets
        dataset_urn1 = DatasetUrn(
            platform="snowflake",
            name="test-dataset-1",
        )
        dataset_urn2 = DatasetUrn(
            platform="bigquery",
            name="test-dataset-2",
        )
        schema_field_urn = SchemaFieldUrn(
            parent=dataset_urn1,
            field_path="test.field.path",
        )

        project = Project(
            id="uuid1",
            title="Test Project With Lineage",
            description="A test project with upstream datasets",
            created_at=self.created_at,
            last_edited_at=self.last_edited_at,
            status=Status(name="Published"),
            creator=Owner(email="creator@example.com"),
            owner=Owner(email="owner@example.com"),
            upstream_datasets=[dataset_urn1, schema_field_urn, dataset_urn2],
        )

        mapper = Mapper(
            workspace_name=self.workspace_name,
            patch_metadata=False,
        )

        work_units = list(mapper.map_project(project))

        dashboard_info_wus = [
            wu for wu in work_units if wu.get_aspect_of_type(DashboardInfoClass)
        ]
        assert len(dashboard_info_wus) == 1
        dashboard_info = dashboard_info_wus[0].get_aspect_of_type(DashboardInfoClass)

        # Verify dataset edges
        assert (
            dashboard_info
            and dashboard_info.datasetEdges
            and len(dashboard_info.datasetEdges) == 2
        )
        edge_urns = [edge.destinationUrn for edge in dashboard_info.datasetEdges]
        assert dataset_urn1.urn() in edge_urns
        assert dataset_urn2.urn() in edge_urns
        assert schema_field_urn.urn() not in edge_urns  # Should be filtered out
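The two tests above pin down the edge-building behaviour: only dataset-level URNs become dashboard edges, and schema-field URNs are dropped. A minimal standalone sketch of that filtering follows; it is illustrative only and not the Mapper._dataset_edges implementation itself:

# Illustrative sketch: keep only DatasetUrn upstreams and wrap them as dashboard edges.
from typing import List, Union

from datahub.metadata.schema_classes import EdgeClass
from datahub.metadata.urns import DatasetUrn, SchemaFieldUrn


def dataset_edges(urns: List[Union[DatasetUrn, SchemaFieldUrn]]) -> List[EdgeClass]:
    return [
        EdgeClass(destinationUrn=urn.urn())
        for urn in urns
        if isinstance(urn, DatasetUrn)  # SchemaFieldUrn entries are filtered out
    ]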
386 metadata-ingestion/tests/unit/hex/test_query_fetcher.py Normal file
@ -0,0 +1,386 @@
import unittest
from datetime import datetime, timedelta
from typing import Dict, Optional, Tuple
from unittest.mock import MagicMock, patch

from datahub.ingestion.source.hex.constants import HEX_PLATFORM_URN
from datahub.ingestion.source.hex.query_fetcher import (
    HexQueryFetcher,
    HexQueryFetcherReport,
    QueryResponse,
)
from datahub.metadata.schema_classes import (
    AuditStampClass,
    QueryPropertiesClass,
    QueryStatementClass,
    QuerySubjectClass,
    QuerySubjectsClass,
)
from datahub.metadata.urns import DatasetUrn, QueryUrn


class TestHexQueryFetcherExtractHexMetadata(unittest.TestCase):
    """Test cases for HexQueryFetcher._extract_hex_metadata method"""

    def setUp(self):
        self.mock_client = MagicMock()
        self.workspace_name = "some-hex-workspace"
        self.start_datetime = datetime(2023, 1, 1)
        self.report = HexQueryFetcherReport()
        self.fetcher = HexQueryFetcher(
            datahub_client=self.mock_client,
            workspace_name=self.workspace_name,
            start_datetime=self.start_datetime,
            end_datetime=self.start_datetime - timedelta(days=1),
            report=self.report,
        )

    def test_extract_hex_metadata_with_matching_workspace(self):
        sql = """
        select *
        from "LONG_TAIL_COMPANIONS"."ANALYTICS"."PET_DETAILS"
        limit 100
        -- Hex query metadata: {"categories": ["Scratchpad"], "cell_type": "SQL", "connection": "Long Tail Companions", "context": "SCHEDULED_RUN", "project_id": "d73da67d-c87b-4dd8-9e7f-b79cb7f822cf", "project_name": "PlayNotebook", "project_url": "https://app.hex.tech/some-hex-workspace/hex/d73da67d-c87b-4dd8-9e7f-b79cb7f822cf/draft/logic?selectedCellId=67c38da0-e631-4005-9750-5bdae2a2ef3f", "status": "In development", "trace_id": "f316f99947454a7e8aff2947f848f73d", "user_email": "alice@mail.com"}
        """

        result = self.fetcher._extract_hex_metadata(sql)
        assert result is not None
        project_id, workspace_name = result
        assert project_id == "d73da67d-c87b-4dd8-9e7f-b79cb7f822cf"
        assert workspace_name == "some-hex-workspace"

    def test_extract_hex_metadata_with_non_matching_workspace(self):
        sql = """
        select *
        from "LONG_TAIL_COMPANIONS"."ANALYTICS"."PET_DETAILS"
        limit 100
        -- Hex query metadata: {"categories": ["Scratchpad"], "cell_type": "SQL", "connection": "Long Tail Companions", "context": "SCHEDULED_RUN", "project_id": "d73da67d-c87b-4dd8-9e7f-b79cb7f822cf", "project_name": "PlayNotebook", "project_url": "https://app.hex.tech/different-workspace/hex/d73da67d-c87b-4dd8-9e7f-b79cb7f822cf/draft/logic?selectedCellId=67c38da0-e631-4005-9750-5bdae2a2ef3f", "status": "In development", "trace_id": "f316f99947454a7e8aff2947f848f73d", "user_email": "alice@mail.com"}
        """

        result = self.fetcher._extract_hex_metadata(sql)
        assert result is not None
        project_id, workspace_name = result
        assert project_id == "d73da67d-c87b-4dd8-9e7f-b79cb7f822cf"
        assert workspace_name == "different-workspace"

    def test_extract_hex_metadata_without_url_returns_none(self):
        # missing project_url
        sql = """
        select *
        from "LONG_TAIL_COMPANIONS"."ANALYTICS"."PET_DETAILS"
        limit 100
        -- Hex query metadata: {"categories": ["Scratchpad"], "cell_type": "SQL", "connection": "Long Tail Companions", "context": "SCHEDULED_RUN", "project_id": "d73da67d-c87b-4dd8-9e7f-b79cb7f822cf", "project_name": "PlayNotebook", "status": "In development", "trace_id": "f316f99947454a7e8aff2947f848f73d", "user_email": "alice@mail.com"}
        """

        result = self.fetcher._extract_hex_metadata(sql)
        assert result is None

    def test_extract_hex_metadata_with_no_metadata(self):
        # no Hex metadata
        sql = """
        select *
        from "LONG_TAIL_COMPANIONS"."ANALYTICS"."PET_DETAILS"
        limit 100
        -- This is a regular comment
        """

        result = self.fetcher._extract_hex_metadata(sql)
        assert result is None

    def test_extract_hex_metadata_with_invalid_json(self):
        # invalid JSON in Hex metadata
        sql = """
        select *
        from "LONG_TAIL_COMPANIONS"."ANALYTICS"."PET_DETAILS"
        limit 100
        -- Hex query metadata: {"categories": ["Scratchpad"], "project_id": "d73da67d-c87b-4dd8-9e7f-b79cb7f822cf", INVALID_JSON}
        """

        result = self.fetcher._extract_hex_metadata(sql)
        assert result is None

    def test_extract_hex_metadata_with_missing_project_id(self):
        # missing project_id
        sql = """
        select *
        from "LONG_TAIL_COMPANIONS"."ANALYTICS"."PET_DETAILS"
        limit 100
        -- Hex query metadata: {"categories": ["Scratchpad"], "cell_type": "SQL", "connection": "Long Tail Companions", "project_url": "https://app.hex.tech/some-hex-workspace/hex/d73da67d-c87b-4dd8-9e7f-b79cb7f822cf/draft/logic"}
        """

        result = self.fetcher._extract_hex_metadata(sql)
        assert result is None

    def test_extract_hex_metadata_with_invalid_url_format_returns_none(self):
        # invalid URL format in project_url
        sql = """
        select *
        from "LONG_TAIL_COMPANIONS"."ANALYTICS"."PET_DETAILS"
        limit 100
        -- Hex query metadata: {"categories": ["Scratchpad"], "project_id": "d73da67d-c87b-4dd8-9e7f-b79cb7f822cf", "project_url": "https://invalid-url-format/d73da67d-c87b-4dd8-9e7f-b79cb7f822cf"}
        """

        result = self.fetcher._extract_hex_metadata(sql)
        assert result is None

    def test_extract_hex_metadata_with_custom_domain(self):
        # custom domain in project_url (single-tenant deployment)
        sql = """
        select *
        from "LONG_TAIL_COMPANIONS"."ANALYTICS"."PET_DETAILS"
        limit 100
        -- Hex query metadata: {"categories": ["Scratchpad"], "cell_type": "SQL", "connection": "Long Tail Companions", "context": "SCHEDULED_RUN", "project_id": "d73da67d-c87b-4dd8-9e7f-b79cb7f822cf", "project_name": "PlayNotebook", "project_url": "https://my-hex-instance.hex.tech/some-hex-workspace/hex/d73da67d-c87b-4dd8-9e7f-b79cb7f822cf/draft/logic", "status": "In development", "trace_id": "f316f99947454a7e8aff2947f848f73d", "user_email": "alice@mail.com"}
        """

        result = self.fetcher._extract_hex_metadata(sql)
        assert result is not None
        project_id, workspace_name = result
        assert project_id == "d73da67d-c87b-4dd8-9e7f-b79cb7f822cf"
        assert workspace_name == "some-hex-workspace"

    def test_extract_hex_metadata_with_http_protocol(self):
        # HTTP protocol (not HTTPS)
        sql = """
        select *
        from "LONG_TAIL_COMPANIONS"."ANALYTICS"."PET_DETAILS"
        limit 100
        -- Hex query metadata: {"categories": ["Scratchpad"], "cell_type": "SQL", "connection": "Long Tail Companions", "context": "SCHEDULED_RUN", "project_id": "d73da67d-c87b-4dd8-9e7f-b79cb7f822cf", "project_name": "PlayNotebook", "project_url": "http://app.hex.tech/some-hex-workspace/hex/d73da67d-c87b-4dd8-9e7f-b79cb7f822cf/draft/logic", "status": "In development", "trace_id": "f316f99947454a7e8aff2947f848f73d", "user_email": "alice@mail.com"}
        """

        result = self.fetcher._extract_hex_metadata(sql)
        assert result is not None
        project_id, workspace_name = result
        assert project_id == "d73da67d-c87b-4dd8-9e7f-b79cb7f822cf"
        assert workspace_name == "some-hex-workspace"

    def test_extract_hex_metadata_with_complex_urls(self):
        # complex workspace names and paths
        urls_to_test = [
            # URL with hyphens in workspace name
            """{"project_id": "123", "project_url": "https://app.hex.tech/my-complex-workspace-name/hex/project-id"}""",
            # URL with underscores
            """{"project_id": "123", "project_url": "https://app.hex.tech/workspace_with_underscores/hex/project-id"}""",
            # URL with special chars in domain
            """{"project_id": "123", "project_url": "https://my-custom-subdomain.hex.tech/some-hex-workspace/hex/project-id"}""",
            # URL with long path after /hex/
            """{"project_id": "123", "project_url": "https://app.hex.tech/some-hex-workspace/hex/project-id/draft/logic?selectedCellId=67c38da0-e631"}""",
        ]

        expected_workspaces = [
            "my-complex-workspace-name",
            "workspace_with_underscores",
            "some-hex-workspace",
            "some-hex-workspace",
        ]

        for i, url_json in enumerate(urls_to_test):
            sql = f"""
            select * from table
            -- Hex query metadata: {url_json}
            """

            result = self.fetcher._extract_hex_metadata(sql)
            assert result is not None, (
                f"Failed to extract metadata from URL: {url_json}"
            )
            project_id, workspace_name = result
            assert project_id == "123"
            assert workspace_name == expected_workspaces[i], (
                f"Expected workspace {expected_workspaces[i]} but got {workspace_name}"
            )


class TestHexQueryFetcherFetch(unittest.TestCase):
    """Test cases for the HexQueryFetcher.fetch method"""

    def setUp(self):
        self.mock_client = MagicMock()
        self.workspace_name = "workspace1"
        self.start_datetime = datetime(2023, 1, 1)
        self.report = HexQueryFetcherReport()

        self.fetcher = HexQueryFetcher(
            datahub_client=self.mock_client,
            workspace_name=self.workspace_name,
            start_datetime=self.start_datetime,
            end_datetime=self.start_datetime - timedelta(days=1),
            report=self.report,
        )

        # valid test data
        self.query_urn_1 = QueryUrn.from_string("urn:li:query:query1")
        self.query_urn_2 = QueryUrn.from_string("urn:li:query:query2")
        self.dataset_urn_1 = DatasetUrn.from_string(
            "urn:li:dataset:(urn:li:dataPlatform:snowflake,table1,PROD)"
        )
        self.dataset_urn_2 = DatasetUrn.from_string(
            "urn:li:dataset:(urn:li:dataPlatform:snowflake,table1,PROD)"
        )
        # self.entities_data matches the return type of HexQueryFetcher._fetch_query_entities
        self.entities_data: Dict[
            QueryUrn,
            Tuple[Optional[QueryPropertiesClass], Optional[QuerySubjectsClass]],
        ] = {
            self.query_urn_1: (
                QueryPropertiesClass(
                    created=AuditStampClass._construct_with_defaults(),
                    lastModified=AuditStampClass._construct_with_defaults(),
                    statement=QueryStatementClass(
                        value="""SELECT * FROM table -- Hex query metadata: {"project_id": "project1", "project_url": "https://app.hex.tech/workspace1/hex/project1"}"""
                    ),
                    source=HEX_PLATFORM_URN.urn(),
                ),
                QuerySubjectsClass(
                    subjects=[
                        QuerySubjectClass(entity=self.dataset_urn_1.urn()),
                        QuerySubjectClass(entity=self.dataset_urn_2.urn()),
                    ]
                ),
            ),
            self.query_urn_2: (
                QueryPropertiesClass(
                    created=AuditStampClass._construct_with_defaults(),
                    lastModified=AuditStampClass._construct_with_defaults(),
                    statement=QueryStatementClass(
                        value="""SELECT * FROM table -- Hex query metadata: {"project_id": "project2", "project_url": "https://app.hex.tech/workspace1/hex/project2"}"""
                    ),
                    source=HEX_PLATFORM_URN.urn(),
                ),
                QuerySubjectsClass(
                    subjects=[QuerySubjectClass(entity=self.dataset_urn_1.urn())]
                ),
            ),
        }

    @patch(
        "datahub.ingestion.source.hex.query_fetcher.HexQueryFetcher._fetch_query_urns_filter_hex_and_last_modified"
    )
    @patch(
        "datahub.ingestion.source.hex.query_fetcher.HexQueryFetcher._fetch_query_entities"
    )
    def test_fetch_with_valid_data(
        self, mock_fetch_query_entities, mock_fetch_query_urns
    ):
        mock_fetch_query_urns.return_value = [self.query_urn_1]
        mock_fetch_query_entities.return_value = self.entities_data

        results = list(self.fetcher.fetch())

        assert len(results) == 2
        assert all(isinstance(qr, QueryResponse) for qr in results)
        assert results[0].urn == self.query_urn_1
        assert results[0].hex_project_id == "project1"
        assert results[0].dataset_subjects == [self.dataset_urn_1, self.dataset_urn_2]
        assert results[1].urn == self.query_urn_2
        assert results[1].hex_project_id == "project2"
        assert results[1].dataset_subjects == [self.dataset_urn_1]

    @patch(
        "datahub.ingestion.source.hex.query_fetcher.HexQueryFetcher._fetch_query_urns_filter_hex_and_last_modified"
    )
    @patch(
        "datahub.ingestion.source.hex.query_fetcher.HexQueryFetcher._fetch_query_entities"
    )
    def test_fetch_with_missing_hex_query_metadata(
        self, mock_fetch_query_entities, mock_fetch_query_urns
    ):
        # force fail in query_urn_2
        self.entities_data[self.query_urn_2][0].statement.value = (  # type: ignore
            "SELECT * FROM table -- IT'S MISSING HERE"
        )
        mock_fetch_query_urns.return_value = [self.query_urn_1]
        mock_fetch_query_entities.return_value = self.entities_data

        results = list(self.fetcher.fetch())

        assert len(results) == 1
        assert all(isinstance(qr, QueryResponse) for qr in results)
        assert results[0].urn == self.query_urn_1
        assert results[0].hex_project_id == "project1"
        assert results[0].dataset_subjects == [self.dataset_urn_1, self.dataset_urn_2]

    @patch(
        "datahub.ingestion.source.hex.query_fetcher.HexQueryFetcher._fetch_query_urns_filter_hex_and_last_modified"
    )
    @patch(
        "datahub.ingestion.source.hex.query_fetcher.HexQueryFetcher._fetch_query_entities"
    )
    def test_fetch_with_missing_not_matching_workspace(
        self, mock_fetch_query_entities, mock_fetch_query_urns
    ):
        # force not match in query_urn_2
        self.entities_data[self.query_urn_2][0].statement.value = (  # type: ignore
            """SELECT * FROM table -- Hex query metadata: {"project_id": "project1", "project_url": "https://app.hex.tech/YET_ANOTHER_WORKSPACE/hex/project1"}"""
        )
        mock_fetch_query_urns.return_value = [self.query_urn_1]
        mock_fetch_query_entities.return_value = self.entities_data

        results = list(self.fetcher.fetch())

        assert len(results) == 1
        assert all(isinstance(qr, QueryResponse) for qr in results)
        assert results[0].urn == self.query_urn_1
        assert results[0].hex_project_id == "project1"
        assert results[0].dataset_subjects == [self.dataset_urn_1, self.dataset_urn_2]

    @patch(
        "datahub.ingestion.source.hex.query_fetcher.HexQueryFetcher._fetch_query_urns_filter_hex_and_last_modified"
    )
    @patch(
        "datahub.ingestion.source.hex.query_fetcher.HexQueryFetcher._fetch_query_entities"
    )
    def test_fetch_with_no_subjects(
        self, mock_fetch_query_entities, mock_fetch_query_urns
    ):
        # force no subjects query_urn_2
        self.entities_data[self.query_urn_2][1].subjects = []  # type: ignore
        mock_fetch_query_urns.return_value = [self.query_urn_1]
        mock_fetch_query_entities.return_value = self.entities_data

        results = list(self.fetcher.fetch())

        assert len(results) == 1
        assert all(isinstance(qr, QueryResponse) for qr in results)
        assert results[0].urn == self.query_urn_1
        assert results[0].hex_project_id == "project1"
        assert results[0].dataset_subjects == [self.dataset_urn_1, self.dataset_urn_2]

    @patch(
        "datahub.ingestion.source.hex.query_fetcher.HexQueryFetcher._fetch_query_urns_filter_hex_and_last_modified"
    )
    def test_fetch_with_no_query_urns_found(self, mock_fetch_query_urns):
        mock_fetch_query_urns.return_value = []

        results = list(self.fetcher.fetch())

        assert len(results) == 0

    @patch(
        "datahub.ingestion.source.hex.query_fetcher.HexQueryFetcher._fetch_query_urns_filter_hex_and_last_modified"
    )
    @patch(
        "datahub.ingestion.source.hex.query_fetcher.HexQueryFetcher._fetch_query_entities"
    )
    def test_fetch_query_entities_fail(
        self, mock_fetch_query_entities, mock_fetch_query_urns
    ):
        mock_fetch_query_urns.return_value = [self.query_urn_1]
        mock_fetch_query_entities.side_effect = Exception(
            "Failed to fetch query entities"
        )

        results = list(self.fetcher.fetch())

        assert len(results) == 0
        assert self.report.errors == 1

    @patch(
        "datahub.ingestion.source.hex.query_fetcher.HexQueryFetcher._fetch_query_urns_filter_hex_and_last_modified"
    )
    def test_fetch_query_urns_fail(self, mock_fetch_query_urns):
        mock_fetch_query_urns.side_effect = Exception("Failed to fetch query urns")

        results = list(self.fetcher.fetch())

        assert len(results) == 0
        assert self.report.errors == 1