feat(ingest/fivetran): add safeguards on table/column lineage (#11674)

Author: Harshal Sheth, 2024-10-20 23:59:45 -07:00 (committed by GitHub)
Parent: 2f85bc1ecb
Commit: dd1a06fb55
6 changed files with 156 additions and 123 deletions

metadata-ingestion/src/datahub/ingestion/source/fivetran/config.py

@@ -1,6 +1,6 @@
+import dataclasses
 import logging
-from dataclasses import dataclass, field as dataclass_field
-from typing import Dict, List, Optional
+from typing import Dict, Optional
 
 import pydantic
 from pydantic import Field, root_validator
@@ -23,6 +23,7 @@ from datahub.ingestion.source.state.stale_entity_removal_handler import (
 from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionConfigBase,
 )
+from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.perf_timer import PerfTimer
 
 logger = logging.getLogger(__name__)
@@ -114,24 +115,24 @@ class FivetranLogConfig(ConfigModel):
         return values
 
 
-@dataclass
+@dataclasses.dataclass
 class MetadataExtractionPerfReport(Report):
-    connectors_metadata_extraction_sec: PerfTimer = dataclass_field(
+    connectors_metadata_extraction_sec: PerfTimer = dataclasses.field(
         default_factory=PerfTimer
     )
-    connectors_lineage_extraction_sec: PerfTimer = dataclass_field(
+    connectors_lineage_extraction_sec: PerfTimer = dataclasses.field(
         default_factory=PerfTimer
     )
-    connectors_jobs_extraction_sec: PerfTimer = dataclass_field(
+    connectors_jobs_extraction_sec: PerfTimer = dataclasses.field(
         default_factory=PerfTimer
     )
 
 
-@dataclass
+@dataclasses.dataclass
 class FivetranSourceReport(StaleEntityRemovalSourceReport):
     connectors_scanned: int = 0
-    filtered_connectors: List[str] = dataclass_field(default_factory=list)
-    metadata_extraction_perf: MetadataExtractionPerfReport = dataclass_field(
+    filtered_connectors: LossyList[str] = dataclasses.field(default_factory=LossyList)
+    metadata_extraction_perf: MetadataExtractionPerfReport = dataclasses.field(
         default_factory=MetadataExtractionPerfReport
     )
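
The report now collects filtered connector names in a LossyList rather than a plain list, so a run that filters out thousands of connectors cannot bloat the ingestion report. A minimal sketch of the idea behind datahub.utilities.lossy_collections.LossyList; the cap, counter, and repr below are illustrative assumptions, not the real implementation:

from typing import Generic, List, TypeVar

T = TypeVar("T")


class BoundedList(Generic[T]):
    """Keeps at most max_elements items; counts anything beyond that instead of storing it."""

    def __init__(self, max_elements: int = 10) -> None:
        self._items: List[T] = []
        self._dropped = 0
        self._max = max_elements

    def append(self, item: T) -> None:
        if len(self._items) < self._max:
            self._items.append(item)
        else:
            self._dropped += 1  # remember the count, not the item

    def __repr__(self) -> str:
        suffix = f" ... and {self._dropped} more" if self._dropped else ""
        return f"{self._items!r}{suffix}"

A report only needs a representative sample plus a count, which keeps its size constant no matter how many connectors get filtered.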

metadata-ingestion/src/datahub/ingestion/source/fivetran/data_classes.py

@@ -24,7 +24,7 @@ class Connector:
     sync_frequency: int
     destination_id: str
     user_id: str
-    table_lineage: List[TableLineage]
+    lineage: List[TableLineage]
     jobs: List["Job"]

metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran.py

@@ -27,9 +27,10 @@ from datahub.ingestion.source.fivetran.config import (
     PlatformDetail,
 )
 from datahub.ingestion.source.fivetran.data_classes import Connector, Job
-from datahub.ingestion.source.fivetran.fivetran_log_api import (
+from datahub.ingestion.source.fivetran.fivetran_log_api import FivetranLogAPI
+from datahub.ingestion.source.fivetran.fivetran_query import (
     MAX_JOBS_PER_CONNECTOR,
-    FivetranLogAPI,
+    MAX_TABLE_LINEAGE_PER_CONNECTOR,
 )
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalHandler,
@@ -106,13 +107,21 @@ class FivetranSource(StatefulIngestionSourceBase):
                 f"Fivetran connector source type: {connector.connector_type} is not supported to mapped with Datahub dataset entity."
             )
 
-        for table_lineage in connector.table_lineage:
+        if len(connector.lineage) >= MAX_TABLE_LINEAGE_PER_CONNECTOR:
+            self.report.warning(
+                title="Table lineage truncated",
+                message=f"The connector had more than {MAX_TABLE_LINEAGE_PER_CONNECTOR} table lineage entries. "
+                f"Only the most recent {MAX_TABLE_LINEAGE_PER_CONNECTOR} entries were ingested.",
+                context=f"{connector.connector_name} (connector_id: {connector.connector_id})",
+            )
+
+        for lineage in connector.lineage:
             input_dataset_urn = DatasetUrn.create_from_ids(
                 platform_id=source_platform,
                 table_name=(
-                    f"{source_database.lower()}.{table_lineage.source_table}"
+                    f"{source_database.lower()}.{lineage.source_table}"
                     if source_database
-                    else table_lineage.source_table
+                    else lineage.source_table
                 ),
                 env=source_platform_detail.env,
                 platform_instance=source_platform_detail.platform_instance,
@@ -121,14 +130,14 @@ class FivetranSource(StatefulIngestionSourceBase):
 
             output_dataset_urn = DatasetUrn.create_from_ids(
                 platform_id=self.config.fivetran_log_config.destination_platform,
-                table_name=f"{self.audit_log.fivetran_log_database.lower()}.{table_lineage.destination_table}",
+                table_name=f"{self.audit_log.fivetran_log_database.lower()}.{lineage.destination_table}",
                 env=destination_platform_detail.env,
                 platform_instance=destination_platform_detail.platform_instance,
             )
 
             output_dataset_urn_list.append(output_dataset_urn)
             if self.config.include_column_lineage:
-                for column_lineage in table_lineage.column_lineage:
+                for column_lineage in lineage.column_lineage:
                     fine_grained_lineage.append(
                         FineGrainedLineage(
                             upstreamType=FineGrainedLineageUpstreamType.FIELD_SET,
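
The warning relies on the row cap that get_table_lineage_query enforces per connector (see fivetran_query.py below): the query can never return more than MAX_TABLE_LINEAGE_PER_CONNECTOR rows, so reaching the cap is the signal that entries were probably dropped, hence the >= comparison. A standalone sketch of the heuristic, with print standing in for self.report.warning:

MAX_TABLE_LINEAGE_PER_CONNECTOR = 100  # mirrors the constant in fivetran_query.py


def maybe_warn_truncated(lineage_count: int, connector_name: str) -> None:
    # The query caps results per connector, so hitting the cap exactly
    # means the connector likely had more lineage than was fetched.
    if lineage_count >= MAX_TABLE_LINEAGE_PER_CONNECTOR:
        print(
            f"warning: table lineage for {connector_name} truncated "
            f"to {MAX_TABLE_LINEAGE_PER_CONNECTOR} entries"
        )


maybe_warn_truncated(100, "postgres")  # fires: cap reached
maybe_warn_truncated(37, "postgres")  # silent: under the cap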

metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_log_api.py

@@ -1,6 +1,7 @@
 import functools
 import json
 import logging
+from collections import defaultdict
 from typing import Any, Dict, List, Optional, Tuple
 
 import sqlglot
import sqlglot import sqlglot
@@ -22,10 +23,6 @@ from datahub.ingestion.source.fivetran.fivetran_query import FivetranLogQuery
 logger: logging.Logger = logging.getLogger(__name__)
 
-# We don't want to generate a massive number of dataProcesses for a single connector.
-# This is primarily used as a safeguard to prevent performance issues.
-MAX_JOBS_PER_CONNECTOR = 1000
-
 
 class FivetranLogAPI:
     def __init__(self, fivetran_log_config: FivetranLogConfig) -> None:
@@ -91,55 +88,51 @@ class FivetranLogAPI:
         resp = self.engine.execute(query)
         return [row for row in resp]
 
-    def _get_column_lineage_metadata(self) -> Dict[str, List]:
+    def _get_column_lineage_metadata(self) -> Dict[Tuple[str, str], List]:
         """
-        Return's dict of column lineage metadata with key as '<SOURCE_TABLE_ID>-<DESTINATION_TABLE_ID>'
+        Returns dict of column lineage metadata with key as (<SOURCE_TABLE_ID>, <DESTINATION_TABLE_ID>)
         """
-        all_column_lineage: Dict[str, List] = {}
+        all_column_lineage = defaultdict(list)
         column_lineage_result = self._query(
             self.fivetran_log_query.get_column_lineage_query()
         )
         for column_lineage in column_lineage_result:
-            key = f"{column_lineage[Constant.SOURCE_TABLE_ID]}-{column_lineage[Constant.DESTINATION_TABLE_ID]}"
-            if key not in all_column_lineage:
-                all_column_lineage[key] = [column_lineage]
-            else:
-                all_column_lineage[key].append(column_lineage)
-        return all_column_lineage
+            key = (
+                column_lineage[Constant.SOURCE_TABLE_ID],
+                column_lineage[Constant.DESTINATION_TABLE_ID],
+            )
+            all_column_lineage[key].append(column_lineage)
+        return dict(all_column_lineage)
 
-    def _get_connectors_table_lineage_metadata(self) -> Dict[str, List]:
+    def _get_table_lineage_metadata(self) -> Dict[str, List]:
         """
-        Return's dict of table lineage metadata with key as 'CONNECTOR_ID'
+        Returns dict of table lineage metadata with key as 'CONNECTOR_ID'
         """
-        connectors_table_lineage_metadata: Dict[str, List] = {}
+        connectors_table_lineage_metadata = defaultdict(list)
         table_lineage_result = self._query(
             self.fivetran_log_query.get_table_lineage_query()
         )
         for table_lineage in table_lineage_result:
-            if (
-                table_lineage[Constant.CONNECTOR_ID]
-                not in connectors_table_lineage_metadata
-            ):
-                connectors_table_lineage_metadata[
-                    table_lineage[Constant.CONNECTOR_ID]
-                ] = [table_lineage]
-            else:
-                connectors_table_lineage_metadata[
-                    table_lineage[Constant.CONNECTOR_ID]
-                ].append(table_lineage)
-        return connectors_table_lineage_metadata
+            connectors_table_lineage_metadata[
+                table_lineage[Constant.CONNECTOR_ID]
+            ].append(table_lineage)
+        return dict(connectors_table_lineage_metadata)
 
-    def _get_table_lineage(
+    def _extract_connector_lineage(
         self,
-        column_lineage_metadata: Dict[str, List],
         table_lineage_result: Optional[List],
+        column_lineage_metadata: Dict[Tuple[str, str], List],
     ) -> List[TableLineage]:
         table_lineage_list: List[TableLineage] = []
         if table_lineage_result is None:
             return table_lineage_list
         for table_lineage in table_lineage_result:
+            # Join the column lineage into the table lineage.
             column_lineage_result = column_lineage_metadata.get(
-                f"{table_lineage[Constant.SOURCE_TABLE_ID]}-{table_lineage[Constant.DESTINATION_TABLE_ID]}"
+                (
+                    table_lineage[Constant.SOURCE_TABLE_ID],
+                    table_lineage[Constant.DESTINATION_TABLE_ID],
+                )
             )
             column_lineage_list: List[ColumnLineage] = []
             if column_lineage_result:
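
Two refactors land together in this hunk: defaultdict(list) replaces the manual if/else grouping, and the column-lineage join key becomes a (source_table_id, destination_table_id) tuple instead of an f-string. The tuple also closes a subtle correctness gap: if an ID can contain '-', two distinct pairs can collide under string concatenation. A self-contained sketch with made-up IDs:

from collections import defaultdict

rows = [
    {"src": "a-b", "dst": "c", "col": "x"},
    {"src": "a", "dst": "b-c", "col": "y"},  # a different table pair
]

by_str_key = defaultdict(list)
by_tuple_key = defaultdict(list)
for row in rows:
    by_str_key[f"{row['src']}-{row['dst']}"].append(row["col"])
    by_tuple_key[(row["src"], row["dst"])].append(row["col"])

print(by_str_key["a-b-c"])  # ['x', 'y'] -- both pairs collide on one string key
print(by_tuple_key[("a-b", "c")])  # ['x'] -- tuple keys keep the pairs distinct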
@@ -152,6 +145,7 @@ class FivetranLogAPI:
                     )
                     for column_lineage in column_lineage_result
                 ]
+
             table_lineage_list.append(
                 TableLineage(
                     source_table=f"{table_lineage[Constant.SOURCE_SCHEMA_NAME]}.{table_lineage[Constant.SOURCE_TABLE_NAME]}",
@@ -167,14 +161,9 @@
     ) -> Dict[str, Dict[str, Dict[str, Tuple[float, Optional[str]]]]]:
         sync_logs: Dict[str, Dict[str, Dict[str, Tuple[float, Optional[str]]]]] = {}
 
-        # Format connector_ids as a comma-separated string of quoted IDs
-        formatted_connector_ids = ", ".join(f"'{id}'" for id in connector_ids)
-
-        query = self.fivetran_log_query.get_sync_logs_query().format(
-            db_clause=self.fivetran_log_query.db_clause,
+        query = self.fivetran_log_query.get_sync_logs_query(
             syncs_interval=syncs_interval,
-            max_jobs_per_connector=MAX_JOBS_PER_CONNECTOR,
-            connector_ids=formatted_connector_ids,
+            connector_ids=connector_ids,
         )
 
         for row in self._query(query):
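
get_sync_logs_query now returns finished SQL instead of a template, so this layer no longer juggles db_clause, the jobs cap, or ID quoting. A runnable stand-in for the new call-site contract; the class below is a sketch, not FivetranLogQuery, and only the call shape mirrors this diff:

from typing import List


class QueryBuilderSketch:
    """Stand-in builder: owns the db prefix, quoting, and limits."""

    db_clause = "fivetran_log."

    def get_sync_logs_query(self, syncs_interval: int, connector_ids: List[str]) -> str:
        ids = ", ".join(f"'{cid}'" for cid in connector_ids)
        return (
            f"SELECT ... FROM {self.db_clause}log "
            f"WHERE time_stamp > CURRENT_TIMESTAMP - INTERVAL '{syncs_interval} days' "
            f"AND connector_id IN ({ids})"
        )


query = QueryBuilderSketch().get_sync_logs_query(
    syncs_interval=7, connector_ids=["calendar_elected"]
)
print(query)  # fully-formed SQL; the caller never touches db_clause or quoting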
@@ -234,13 +223,13 @@
             return None
         return self._get_users().get(user_id)
 
-    def _fill_connectors_table_lineage(self, connectors: List[Connector]) -> None:
-        table_lineage_metadata = self._get_connectors_table_lineage_metadata()
+    def _fill_connectors_lineage(self, connectors: List[Connector]) -> None:
+        table_lineage_metadata = self._get_table_lineage_metadata()
         column_lineage_metadata = self._get_column_lineage_metadata()
         for connector in connectors:
-            connector.table_lineage = self._get_table_lineage(
-                column_lineage_metadata=column_lineage_metadata,
+            connector.lineage = self._extract_connector_lineage(
                 table_lineage_result=table_lineage_metadata.get(connector.connector_id),
+                column_lineage_metadata=column_lineage_metadata,
             )
 
     def _fill_connectors_jobs(
@@ -262,6 +251,7 @@
     ) -> List[Connector]:
         connectors: List[Connector] = []
         with report.metadata_extraction_perf.connectors_metadata_extraction_sec:
+            logger.info("Fetching connector list")
             connector_list = self._query(self.fivetran_log_query.get_connectors_query())
         for connector in connector_list:
             if not connector_patterns.allowed(connector[Constant.CONNECTOR_NAME]):
@@ -279,12 +269,20 @@
                     sync_frequency=connector[Constant.SYNC_FREQUENCY],
                     destination_id=connector[Constant.DESTINATION_ID],
                     user_id=connector[Constant.CONNECTING_USER_ID],
-                    table_lineage=[],
-                    jobs=[],
+                    lineage=[],  # filled later
+                    jobs=[],  # filled later
                 )
             )
 
+        if not connectors:
+            # Some of our queries don't work well when there's no connectors, since
+            # we push down connector id filters.
+            return []
+
         with report.metadata_extraction_perf.connectors_lineage_extraction_sec:
-            self._fill_connectors_table_lineage(connectors)
+            logger.info("Fetching connector lineage")
+            self._fill_connectors_lineage(connectors)
         with report.metadata_extraction_perf.connectors_jobs_extraction_sec:
+            logger.info("Fetching connector job run history")
             self._fill_connectors_jobs(connectors, syncs_interval)
         return connectors
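
The new empty-list guard matters because the sync-log query pushes connector IDs down into an IN clause: joining an empty list yields connector_id IN (), which most SQL dialects reject as a syntax error. A short demonstration of the failure mode (generic SQL assembly, not DataHub code):

def in_clause(connector_ids):
    return ", ".join(f"'{cid}'" for cid in connector_ids)


print(f"WHERE connector_id IN ({in_clause(['a', 'b'])})")  # valid: IN ('a', 'b')
print(f"WHERE connector_id IN ({in_clause([])})")  # invalid SQL: IN ()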

metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_query.py

@@ -1,3 +1,11 @@
+from typing import List
+
+# Safeguards to prevent fetching massive amounts of data.
+MAX_TABLE_LINEAGE_PER_CONNECTOR = 100
+MAX_COLUMN_LINEAGE_PER_CONNECTOR = 3000
+MAX_JOBS_PER_CONNECTOR = 1000
+
+
 class FivetranLogQuery:
     # Note: All queries are written in Snowflake SQL.
     # They will be transpiled to the target database's SQL dialect at runtime.
@@ -24,20 +32,28 @@ SELECT
     destination_id
 FROM {self.db_clause}connector
 WHERE
-    _fivetran_deleted = FALSE\
+    _fivetran_deleted = FALSE
 """
 
     def get_users_query(self) -> str:
-        return f"""
+        return f"""\
 SELECT id as user_id,
 given_name,
 family_name,
 email
-FROM {self.db_clause}user"""
+FROM {self.db_clause}user
+"""
 
-    def get_sync_logs_query(self) -> str:
-        return """
-WITH ranked_syncs AS (
+    def get_sync_logs_query(
+        self,
+        syncs_interval: int,
+        connector_ids: List[str],
+    ) -> str:
+        # Format connector_ids as a comma-separated string of quoted IDs
+        formatted_connector_ids = ", ".join(f"'{id}'" for id in connector_ids)
+
+        return f"""\
+WITH ranked_syncs AS (
     SELECT
         connector_id,
         sync_id,
@@ -45,48 +61,59 @@ WHERE
         MAX(CASE WHEN message_event = 'sync_end' THEN time_stamp END) as end_time,
         MAX(CASE WHEN message_event = 'sync_end' THEN message_data END) as end_message_data,
         ROW_NUMBER() OVER (PARTITION BY connector_id ORDER BY MAX(time_stamp) DESC) as rn
-    FROM {db_clause}log
+    FROM {self.db_clause}log
     WHERE message_event in ('sync_start', 'sync_end')
     AND time_stamp > CURRENT_TIMESTAMP - INTERVAL '{syncs_interval} days'
-    AND connector_id IN ({connector_ids})
+    AND connector_id IN ({formatted_connector_ids})
     GROUP BY connector_id, sync_id
 )
 SELECT
     connector_id,
     sync_id,
     start_time,
     end_time,
     end_message_data
 FROM ranked_syncs
-WHERE rn <= {max_jobs_per_connector}
+WHERE rn <= {MAX_JOBS_PER_CONNECTOR}
     AND start_time IS NOT NULL
     AND end_time IS NOT NULL
 ORDER BY connector_id, end_time DESC
 """
 
     def get_table_lineage_query(self) -> str:
-        return f"""
-SELECT stm.connector_id as connector_id,
+        return f"""\
+SELECT
+    stm.connector_id as connector_id,
     stm.id as source_table_id,
     stm.name as source_table_name,
     ssm.name as source_schema_name,
     dtm.id as destination_table_id,
     dtm.name as destination_table_name,
     dsm.name as destination_schema_name
 FROM {self.db_clause}table_lineage as tl
 JOIN {self.db_clause}source_table_metadata as stm on tl.source_table_id = stm.id
 JOIN {self.db_clause}destination_table_metadata as dtm on tl.destination_table_id = dtm.id
 JOIN {self.db_clause}source_schema_metadata as ssm on stm.schema_id = ssm.id
-JOIN {self.db_clause}destination_schema_metadata as dsm on dtm.schema_id = dsm.id"""
+JOIN {self.db_clause}destination_schema_metadata as dsm on dtm.schema_id = dsm.id
+QUALIFY ROW_NUMBER() OVER (PARTITION BY stm.connector_id ORDER BY tl.created_at DESC) <= {MAX_TABLE_LINEAGE_PER_CONNECTOR}
+ORDER BY stm.connector_id, tl.created_at DESC
+"""
 
     def get_column_lineage_query(self) -> str:
-        return f"""
-SELECT scm.table_id as source_table_id,
+        return f"""\
+SELECT
+    scm.table_id as source_table_id,
     dcm.table_id as destination_table_id,
     scm.name as source_column_name,
     dcm.name as destination_column_name
 FROM {self.db_clause}column_lineage as cl
 JOIN {self.db_clause}source_column_metadata as scm
-    on cl.source_column_id = scm.id
+    ON cl.source_column_id = scm.id
 JOIN {self.db_clause}destination_column_metadata as dcm
-    on cl.destination_column_id = dcm.id"""
+    ON cl.destination_column_id = dcm.id
+-- Only joining source_table_metadata to get the connector_id.
+JOIN {self.db_clause}source_table_metadata as stm
+    ON scm.table_id = stm.id
+QUALIFY ROW_NUMBER() OVER (PARTITION BY stm.connector_id ORDER BY cl.created_at DESC) <= {MAX_COLUMN_LINEAGE_PER_CONNECTOR}
+ORDER BY stm.connector_id, cl.created_at DESC
+"""

metadata-ingestion/tests/integration/fivetran/test_fivetran.py

@@ -100,11 +100,9 @@ def default_query_results(
                 "email": "abc.xyz@email.com",
             }
         ]
-    elif query == fivetran_log_query.get_sync_logs_query().format(
-        db_clause=fivetran_log_query.db_clause,
+    elif query == fivetran_log_query.get_sync_logs_query(
         syncs_interval=7,
-        max_jobs_per_connector=1000,
-        connector_ids="'calendar_elected'",
+        connector_ids=["calendar_elected"],
     ):
         return [
             {
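
Because the query is now fully assembled inside get_sync_logs_query, the test fixture can route fake results by invoking the very same builder rather than re-creating the old .format() arguments; plain string equality then does the matching. A sketch of that routing pattern (the function name and returned row are illustrative, not the real fixture):

def fake_query_results(query: str, fivetran_log_query) -> list:
    # Both sides build the SQL through one method, so == is a reliable match.
    if query == fivetran_log_query.get_sync_logs_query(
        syncs_interval=7,
        connector_ids=["calendar_elected"],
    ):
        return [{"connector_id": "calendar_elected"}]  # placeholder row
    raise ValueError(f"Unexpected query: {query}")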