mirror of
https://github.com/datahub-project/datahub.git
synced 2025-12-28 18:38:17 +00:00
feat(ingest/fivetran): show connector filter reason (#11695)
This commit is contained in:
parent
326afc6308
commit
e96323a2a2
@ -25,7 +25,7 @@ source:
|
||||
client_email: "client_email"
|
||||
client_id: "client_id"
|
||||
private_key: "private_key"
|
||||
dataset: "fivetran_log_dataset"
|
||||
dataset: "fivetran_log_dataset"
|
||||
|
||||
# Optional - filter for certain connector names instead of ingesting everything.
|
||||
# connector_patterns:
|
||||
@ -35,7 +35,7 @@ source:
|
||||
# Optional -- A mapping of the connector's all sources to its database.
|
||||
# sources_to_database:
|
||||
# connector_id: source_db
|
||||
|
||||
|
||||
# Optional -- This mapping is optional and only required to configure platform-instance for source
|
||||
# A mapping of Fivetran connector id to data platform instance
|
||||
# sources_to_platform_instance:
|
||||
|
||||
@ -160,8 +160,7 @@ class FivetranSourceConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin
|
||||
)
|
||||
connector_patterns: AllowDenyPattern = Field(
|
||||
default=AllowDenyPattern.allow_all(),
|
||||
description="Filtering regex patterns for connector ids. "
|
||||
"They're visible in the Fivetran UI under Connectors -> Setup -> Fivetran Connector ID.",
|
||||
description="Filtering regex patterns for connector names.",
|
||||
)
|
||||
destination_patterns: AllowDenyPattern = Field(
|
||||
default=AllowDenyPattern.allow_all(),
|
||||
|
||||
@ -84,17 +84,21 @@ class FivetranLogAPI:
|
||||
query = sqlglot.parse_one(query, dialect="snowflake").sql(
|
||||
dialect=self.fivetran_log_config.destination_platform, pretty=True
|
||||
)
|
||||
logger.debug(f"Query : {query}")
|
||||
logger.info(f"Executing query: {query}")
|
||||
resp = self.engine.execute(query)
|
||||
return [row for row in resp]
|
||||
|
||||
def _get_column_lineage_metadata(self) -> Dict[Tuple[str, str], List]:
|
||||
def _get_column_lineage_metadata(
|
||||
self, connector_ids: List[str]
|
||||
) -> Dict[Tuple[str, str], List]:
|
||||
"""
|
||||
Returns dict of column lineage metadata with key as (<SOURCE_TABLE_ID>, <DESTINATION_TABLE_ID>)
|
||||
"""
|
||||
all_column_lineage = defaultdict(list)
|
||||
column_lineage_result = self._query(
|
||||
self.fivetran_log_query.get_column_lineage_query()
|
||||
self.fivetran_log_query.get_column_lineage_query(
|
||||
connector_ids=connector_ids
|
||||
)
|
||||
)
|
||||
for column_lineage in column_lineage_result:
|
||||
key = (
|
||||
@ -104,13 +108,13 @@ class FivetranLogAPI:
|
||||
all_column_lineage[key].append(column_lineage)
|
||||
return dict(all_column_lineage)
|
||||
|
||||
def _get_table_lineage_metadata(self) -> Dict[str, List]:
|
||||
def _get_table_lineage_metadata(self, connector_ids: List[str]) -> Dict[str, List]:
|
||||
"""
|
||||
Returns dict of table lineage metadata with key as 'CONNECTOR_ID'
|
||||
"""
|
||||
connectors_table_lineage_metadata = defaultdict(list)
|
||||
table_lineage_result = self._query(
|
||||
self.fivetran_log_query.get_table_lineage_query()
|
||||
self.fivetran_log_query.get_table_lineage_query(connector_ids=connector_ids)
|
||||
)
|
||||
for table_lineage in table_lineage_result:
|
||||
connectors_table_lineage_metadata[
|
||||
@ -224,8 +228,9 @@ class FivetranLogAPI:
|
||||
return self._get_users().get(user_id)
|
||||
|
||||
def _fill_connectors_lineage(self, connectors: List[Connector]) -> None:
|
||||
table_lineage_metadata = self._get_table_lineage_metadata()
|
||||
column_lineage_metadata = self._get_column_lineage_metadata()
|
||||
connector_ids = [connector.connector_id for connector in connectors]
|
||||
table_lineage_metadata = self._get_table_lineage_metadata(connector_ids)
|
||||
column_lineage_metadata = self._get_column_lineage_metadata(connector_ids)
|
||||
for connector in connectors:
|
||||
connector.lineage = self._extract_connector_lineage(
|
||||
table_lineage_result=table_lineage_metadata.get(connector.connector_id),
|
||||
@ -254,20 +259,25 @@ class FivetranLogAPI:
|
||||
logger.info("Fetching connector list")
|
||||
connector_list = self._query(self.fivetran_log_query.get_connectors_query())
|
||||
for connector in connector_list:
|
||||
if not connector_patterns.allowed(connector[Constant.CONNECTOR_NAME]):
|
||||
report.report_connectors_dropped(connector[Constant.CONNECTOR_NAME])
|
||||
connector_name = connector[Constant.CONNECTOR_NAME]
|
||||
if not connector_patterns.allowed(connector_name):
|
||||
report.report_connectors_dropped(connector_name)
|
||||
continue
|
||||
if not destination_patterns.allowed(connector[Constant.DESTINATION_ID]):
|
||||
report.report_connectors_dropped(connector[Constant.CONNECTOR_NAME])
|
||||
if not destination_patterns.allowed(
|
||||
destination_id := connector[Constant.DESTINATION_ID]
|
||||
):
|
||||
report.report_connectors_dropped(
|
||||
f"{connector_name} (destination_id: {destination_id})"
|
||||
)
|
||||
continue
|
||||
connectors.append(
|
||||
Connector(
|
||||
connector_id=connector[Constant.CONNECTOR_ID],
|
||||
connector_name=connector[Constant.CONNECTOR_NAME],
|
||||
connector_name=connector_name,
|
||||
connector_type=connector[Constant.CONNECTOR_TYPE_ID],
|
||||
paused=connector[Constant.PAUSED],
|
||||
sync_frequency=connector[Constant.SYNC_FREQUENCY],
|
||||
destination_id=connector[Constant.DESTINATION_ID],
|
||||
destination_id=destination_id,
|
||||
user_id=connector[Constant.CONNECTING_USER_ID],
|
||||
lineage=[], # filled later
|
||||
jobs=[], # filled later
|
||||
@ -279,6 +289,7 @@ class FivetranLogAPI:
|
||||
# we push down connector id filters.
|
||||
logger.info("No allowed connectors found")
|
||||
return []
|
||||
logger.info(f"Found {len(connectors)} allowed connectors")
|
||||
|
||||
with report.metadata_extraction_perf.connectors_lineage_extraction_sec:
|
||||
logger.info("Fetching connector lineage")
|
||||
|
||||
@ -80,7 +80,10 @@ WHERE rn <= {MAX_JOBS_PER_CONNECTOR}
|
||||
ORDER BY connector_id, end_time DESC
|
||||
"""
|
||||
|
||||
def get_table_lineage_query(self) -> str:
|
||||
def get_table_lineage_query(self, connector_ids: List[str]) -> str:
|
||||
# Format connector_ids as a comma-separated string of quoted IDs
|
||||
formatted_connector_ids = ", ".join(f"'{id}'" for id in connector_ids)
|
||||
|
||||
return f"""\
|
||||
SELECT
|
||||
stm.connector_id as connector_id,
|
||||
@ -95,11 +98,15 @@ JOIN {self.db_clause}source_table_metadata as stm on tl.source_table_id = stm.id
|
||||
JOIN {self.db_clause}destination_table_metadata as dtm on tl.destination_table_id = dtm.id
|
||||
JOIN {self.db_clause}source_schema_metadata as ssm on stm.schema_id = ssm.id
|
||||
JOIN {self.db_clause}destination_schema_metadata as dsm on dtm.schema_id = dsm.id
|
||||
WHERE stm.connector_id IN ({formatted_connector_ids})
|
||||
QUALIFY ROW_NUMBER() OVER (PARTITION BY stm.connector_id ORDER BY tl.created_at DESC) <= {MAX_TABLE_LINEAGE_PER_CONNECTOR}
|
||||
ORDER BY stm.connector_id, tl.created_at DESC
|
||||
"""
|
||||
|
||||
def get_column_lineage_query(self) -> str:
|
||||
def get_column_lineage_query(self, connector_ids: List[str]) -> str:
|
||||
# Format connector_ids as a comma-separated string of quoted IDs
|
||||
formatted_connector_ids = ", ".join(f"'{id}'" for id in connector_ids)
|
||||
|
||||
return f"""\
|
||||
SELECT
|
||||
scm.table_id as source_table_id,
|
||||
@ -114,6 +121,7 @@ JOIN {self.db_clause}destination_column_metadata as dcm
|
||||
-- Only joining source_table_metadata to get the connector_id.
|
||||
JOIN {self.db_clause}source_table_metadata as stm
|
||||
ON scm.table_id = stm.id
|
||||
WHERE stm.connector_id IN ({formatted_connector_ids})
|
||||
QUALIFY ROW_NUMBER() OVER (PARTITION BY stm.connector_id ORDER BY cl.created_at DESC) <= {MAX_COLUMN_LINEAGE_PER_CONNECTOR}
|
||||
ORDER BY stm.connector_id, cl.created_at DESC
|
||||
"""
|
||||
|
||||
@ -43,7 +43,9 @@ def default_query_results(
|
||||
return []
|
||||
elif query == fivetran_log_query.get_connectors_query():
|
||||
return connector_query_results
|
||||
elif query == fivetran_log_query.get_table_lineage_query():
|
||||
elif query == fivetran_log_query.get_table_lineage_query(
|
||||
connector_ids=["calendar_elected"]
|
||||
):
|
||||
return [
|
||||
{
|
||||
"connector_id": "calendar_elected",
|
||||
@ -64,7 +66,9 @@ def default_query_results(
|
||||
"destination_schema_name": "postgres_public",
|
||||
},
|
||||
]
|
||||
elif query == fivetran_log_query.get_column_lineage_query():
|
||||
elif query == fivetran_log_query.get_column_lineage_query(
|
||||
connector_ids=["calendar_elected"]
|
||||
):
|
||||
return [
|
||||
{
|
||||
"source_table_id": "10040",
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user