mirror of
https://github.com/datahub-project/datahub.git
synced 2025-11-04 20:58:36 +00:00
Revert "cll improvements"
This reverts commit 281b6df87568062c8d1d8e6b327d9e1d4236cbfd.
This commit is contained in:
parent
281b6df875
commit
fe1a042da5
@ -1,6 +1,5 @@
|
|||||||
import logging
|
import logging
|
||||||
import re
|
from typing import Dict, Iterable, List, Optional
|
||||||
from typing import Any, Dict, Iterable, List, Optional
|
|
||||||
|
|
||||||
import datahub.emitter.mce_builder as builder
|
import datahub.emitter.mce_builder as builder
|
||||||
from datahub.api.entities.datajob import DataFlow, DataJob
|
from datahub.api.entities.datajob import DataFlow, DataJob
|
||||||
@ -26,12 +25,7 @@ from datahub.ingestion.source.fivetran.config import (
|
|||||||
FivetranSourceReport,
|
FivetranSourceReport,
|
||||||
PlatformDetail,
|
PlatformDetail,
|
||||||
)
|
)
|
||||||
from datahub.ingestion.source.fivetran.data_classes import (
|
from datahub.ingestion.source.fivetran.data_classes import Connector, Job, TableLineage
|
||||||
ColumnLineage,
|
|
||||||
Connector,
|
|
||||||
Job,
|
|
||||||
TableLineage,
|
|
||||||
)
|
|
||||||
from datahub.ingestion.source.fivetran.fivetran_access import (
|
from datahub.ingestion.source.fivetran.fivetran_access import (
|
||||||
create_fivetran_access,
|
create_fivetran_access,
|
||||||
)
|
)
|
||||||
@ -1145,270 +1139,6 @@ class FivetranSource(StatefulIngestionSourceBase):
|
|||||||
dpi = self._generate_dpi_from_job(job, datajob)
|
dpi = self._generate_dpi_from_job(job, datajob)
|
||||||
yield from self._get_dpi_workunits(job, dpi)
|
yield from self._get_dpi_workunits(job, dpi)
|
||||||
|
|
||||||
def _enhance_missing_column_lineage(self, connector: Connector) -> None:
|
|
||||||
"""
|
|
||||||
Last-resort attempt to add column lineage to a connector's tables.
|
|
||||||
This tries various methods to find column information when all else has failed.
|
|
||||||
"""
|
|
||||||
if not connector.lineage:
|
|
||||||
return
|
|
||||||
|
|
||||||
# Keep track of tables we've enhanced
|
|
||||||
enhanced_tables = 0
|
|
||||||
|
|
||||||
# Process each table that doesn't have column lineage
|
|
||||||
for idx, table_lineage in enumerate(connector.lineage):
|
|
||||||
if table_lineage.column_lineage:
|
|
||||||
continue # Skip tables that already have column lineage
|
|
||||||
|
|
||||||
# Try to enhance this specific table's column lineage
|
|
||||||
if self._enhance_single_table_column_lineage(connector, idx, table_lineage):
|
|
||||||
enhanced_tables += 1
|
|
||||||
|
|
||||||
if enhanced_tables > 0:
|
|
||||||
logger.info(
|
|
||||||
f"Enhanced column lineage for {enhanced_tables} tables in connector {connector.connector_id}"
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
logger.warning(
|
|
||||||
f"Could not enhance column lineage for any tables in connector {connector.connector_id}"
|
|
||||||
)
|
|
||||||
|
|
||||||
def _enhance_single_table_column_lineage(
|
|
||||||
self, connector: Connector, idx: int, table_lineage: TableLineage
|
|
||||||
) -> bool:
|
|
||||||
"""Helper method to enhance column lineage for a single table to reduce complexity."""
|
|
||||||
source_table = table_lineage.source_table
|
|
||||||
destination_table = table_lineage.destination_table
|
|
||||||
|
|
||||||
if not source_table or not destination_table or "." not in source_table:
|
|
||||||
return False
|
|
||||||
|
|
||||||
schema_name, table_name = source_table.split(".", 1)
|
|
||||||
|
|
||||||
# Get destination platform
|
|
||||||
destination_platform = connector.additional_properties.get(
|
|
||||||
"destination_platform", "unknown"
|
|
||||||
)
|
|
||||||
|
|
||||||
try:
|
|
||||||
# Check if we're using the standard API implementation
|
|
||||||
from datahub.ingestion.source.fivetran.fivetran_standard_api import (
|
|
||||||
FivetranStandardAPI,
|
|
||||||
)
|
|
||||||
|
|
||||||
if isinstance(self.fivetran_access, FivetranStandardAPI):
|
|
||||||
# Use the standard API's get_columns_from_api method
|
|
||||||
columns = self.fivetran_access._get_columns_from_api(
|
|
||||||
source_table, connector.connector_id
|
|
||||||
)
|
|
||||||
if columns:
|
|
||||||
column_lineage = self._create_columns_lineage_from_data(
|
|
||||||
columns, destination_platform
|
|
||||||
)
|
|
||||||
if column_lineage:
|
|
||||||
connector.lineage[idx].column_lineage = column_lineage
|
|
||||||
logger.info(
|
|
||||||
f"Added {len(column_lineage)} columns to lineage for {table_lineage.source_table} -> {table_lineage.destination_table}"
|
|
||||||
)
|
|
||||||
return True
|
|
||||||
|
|
||||||
# Try a different approach - use the connector schemas method
|
|
||||||
# This should work with any implementation of FivetranAccessInterface
|
|
||||||
schemas = []
|
|
||||||
try:
|
|
||||||
# We need to get the schemas from the appropriate API
|
|
||||||
# Try to use list_connector_schemas indirectly through the fivetran_access
|
|
||||||
if hasattr(self.fivetran_access, "api_client") and hasattr(
|
|
||||||
self.fivetran_access.api_client, "list_connector_schemas"
|
|
||||||
):
|
|
||||||
schemas = self.fivetran_access.api_client.list_connector_schemas(
|
|
||||||
connector.connector_id
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
# Depending on your implementation, there might be other ways to get schemas
|
|
||||||
# For now, we'll just log and continue
|
|
||||||
logger.warning(
|
|
||||||
f"No suitable method found to get schemas for connector {connector.connector_id}"
|
|
||||||
)
|
|
||||||
except Exception as schema_e:
|
|
||||||
logger.warning(
|
|
||||||
f"Error getting schemas for connector {connector.connector_id}: {schema_e}"
|
|
||||||
)
|
|
||||||
|
|
||||||
if schemas:
|
|
||||||
return self._find_and_add_column_lineage(
|
|
||||||
connector,
|
|
||||||
idx,
|
|
||||||
schemas,
|
|
||||||
schema_name,
|
|
||||||
table_name,
|
|
||||||
destination_platform,
|
|
||||||
table_lineage,
|
|
||||||
)
|
|
||||||
except Exception as e:
|
|
||||||
logger.warning(f"Error enhancing column lineage for {source_table}: {e}")
|
|
||||||
|
|
||||||
return False
|
|
||||||
|
|
||||||
def _find_and_add_column_lineage(
|
|
||||||
self,
|
|
||||||
connector: Connector,
|
|
||||||
idx: int,
|
|
||||||
schemas: List[Dict],
|
|
||||||
schema_name: str,
|
|
||||||
table_name: str,
|
|
||||||
destination_platform: str,
|
|
||||||
table_lineage: TableLineage,
|
|
||||||
) -> bool:
|
|
||||||
"""Find table in schemas and add column lineage if found."""
|
|
||||||
for schema in schemas:
|
|
||||||
if schema.get("name") != schema_name:
|
|
||||||
continue
|
|
||||||
|
|
||||||
for table in schema.get("tables", []):
|
|
||||||
if not isinstance(table, dict) or table.get("name") != table_name:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Found the table, try to get columns
|
|
||||||
columns = self._get_columns_for_table(
|
|
||||||
connector, schema_name, table_name, table
|
|
||||||
)
|
|
||||||
if not columns:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Create column lineage
|
|
||||||
column_lineage = self._create_columns_lineage_from_data(
|
|
||||||
columns, destination_platform
|
|
||||||
)
|
|
||||||
|
|
||||||
if column_lineage:
|
|
||||||
connector.lineage[idx].column_lineage = column_lineage
|
|
||||||
logger.info(
|
|
||||||
f"Added {len(column_lineage)} columns to lineage for {table_lineage.source_table} -> {table_lineage.destination_table}"
|
|
||||||
)
|
|
||||||
return True
|
|
||||||
|
|
||||||
return False
|
|
||||||
|
|
||||||
def _get_columns_for_table(
|
|
||||||
self, connector: Connector, schema_name: str, table_name: str, table: Dict
|
|
||||||
) -> List[Any]:
|
|
||||||
"""Get columns for a table, trying multiple methods."""
|
|
||||||
# First try columns in table data
|
|
||||||
columns = table.get("columns", [])
|
|
||||||
if columns:
|
|
||||||
return columns
|
|
||||||
|
|
||||||
# Try direct API call as fallback - need to handle the interface correctly
|
|
||||||
try:
|
|
||||||
# Check if we're using the standard API implementation
|
|
||||||
from datahub.ingestion.source.fivetran.fivetran_standard_api import (
|
|
||||||
FivetranStandardAPI,
|
|
||||||
)
|
|
||||||
|
|
||||||
if isinstance(self.fivetran_access, FivetranStandardAPI):
|
|
||||||
# Use the standard API's method to get columns
|
|
||||||
return self.fivetran_access._get_columns_from_api(
|
|
||||||
f"{schema_name}.{table_name}", connector.connector_id
|
|
||||||
)
|
|
||||||
|
|
||||||
# If not using standard API, try alternative approach
|
|
||||||
if hasattr(self.fivetran_access, "api_client") and hasattr(
|
|
||||||
self.fivetran_access.api_client, "get_table_columns"
|
|
||||||
):
|
|
||||||
columns = self.fivetran_access.api_client.get_table_columns(
|
|
||||||
connector.connector_id, schema_name, table_name
|
|
||||||
)
|
|
||||||
if columns:
|
|
||||||
return columns
|
|
||||||
except Exception as e:
|
|
||||||
logger.debug(f"Failed to get columns via API: {e}")
|
|
||||||
|
|
||||||
return []
|
|
||||||
|
|
||||||
def _create_columns_lineage_from_data(
|
|
||||||
self, columns: List[Any], destination_platform: str
|
|
||||||
) -> List[ColumnLineage]:
|
|
||||||
"""Create column lineage objects from raw column data."""
|
|
||||||
column_lineage = []
|
|
||||||
is_bigquery = destination_platform.lower() == "bigquery"
|
|
||||||
|
|
||||||
# Process columns based on format
|
|
||||||
if isinstance(columns, list):
|
|
||||||
for column in columns:
|
|
||||||
col_name = self._extract_column_name(column)
|
|
||||||
if not col_name or col_name.startswith("_fivetran"):
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Get destination column name
|
|
||||||
dest_col_name = self._get_destination_column_name(
|
|
||||||
column, col_name, is_bigquery
|
|
||||||
)
|
|
||||||
|
|
||||||
column_lineage.append(
|
|
||||||
ColumnLineage(
|
|
||||||
source_column=col_name, destination_column=dest_col_name
|
|
||||||
)
|
|
||||||
)
|
|
||||||
# Handle dictionary format
|
|
||||||
elif isinstance(columns, dict):
|
|
||||||
for col_name, col_data in columns.items():
|
|
||||||
if col_name.startswith("_fivetran"):
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Get destination column name
|
|
||||||
dest_col_name = self._get_destination_column_name_from_dict(
|
|
||||||
col_data, col_name, is_bigquery
|
|
||||||
)
|
|
||||||
|
|
||||||
column_lineage.append(
|
|
||||||
ColumnLineage(
|
|
||||||
source_column=col_name, destination_column=dest_col_name
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
return column_lineage
|
|
||||||
|
|
||||||
def _extract_column_name(self, column: Any) -> Optional[str]:
|
|
||||||
"""Extract column name from column data."""
|
|
||||||
if isinstance(column, dict):
|
|
||||||
return column.get("name")
|
|
||||||
elif isinstance(column, str):
|
|
||||||
return column
|
|
||||||
return None
|
|
||||||
|
|
||||||
def _get_destination_column_name(
|
|
||||||
self, column: Any, col_name: str, is_bigquery: bool
|
|
||||||
) -> str:
|
|
||||||
"""Get destination column name, preferring name_in_destination if available."""
|
|
||||||
if isinstance(column, dict) and "name_in_destination" in column:
|
|
||||||
return column.get("name_in_destination")
|
|
||||||
else:
|
|
||||||
return self._transform_column_name_for_platform(col_name, is_bigquery)
|
|
||||||
|
|
||||||
def _get_destination_column_name_from_dict(
|
|
||||||
self, col_data: Any, col_name: str, is_bigquery: bool
|
|
||||||
) -> str:
|
|
||||||
"""Get destination column name from dictionary format column data."""
|
|
||||||
if isinstance(col_data, dict) and "name_in_destination" in col_data:
|
|
||||||
return col_data.get("name_in_destination")
|
|
||||||
else:
|
|
||||||
return self._transform_column_name_for_platform(col_name, is_bigquery)
|
|
||||||
|
|
||||||
def _transform_column_name_for_platform(
|
|
||||||
self, col_name: str, is_bigquery: bool
|
|
||||||
) -> str:
|
|
||||||
"""Transform column name based on destination platform."""
|
|
||||||
if is_bigquery:
|
|
||||||
# For BigQuery, convert to snake_case
|
|
||||||
s1 = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", col_name)
|
|
||||||
s2 = re.sub("([a-z0-9])([A-Z])", r"\1_\2", s1)
|
|
||||||
return s2.lower()
|
|
||||||
else:
|
|
||||||
# For other platforms like Snowflake, typically uppercase
|
|
||||||
return col_name.upper()
|
|
||||||
|
|
||||||
def _get_connector_workunits(
|
def _get_connector_workunits(
|
||||||
self, connector: Connector
|
self, connector: Connector
|
||||||
) -> Iterable[MetadataWorkUnit]:
|
) -> Iterable[MetadataWorkUnit]:
|
||||||
@ -1423,25 +1153,8 @@ class FivetranSource(StatefulIngestionSourceBase):
|
|||||||
# Store field lineage workunits to emit after dataset workunits
|
# Store field lineage workunits to emit after dataset workunits
|
||||||
field_lineage_workunits = []
|
field_lineage_workunits = []
|
||||||
|
|
||||||
# We'll only consider a connector to have valid lineage if it has table lineage
|
|
||||||
# AND at least one of those tables has column lineage
|
|
||||||
has_column_lineage = any(
|
|
||||||
table_lineage.column_lineage for table_lineage in connector.lineage
|
|
||||||
)
|
|
||||||
|
|
||||||
# Special handling for connectors with lineage but no job history
|
# Special handling for connectors with lineage but no job history
|
||||||
if not connector.jobs and connector.lineage:
|
if not connector.jobs and connector.lineage:
|
||||||
# Check if there's any column lineage to include
|
|
||||||
if not has_column_lineage:
|
|
||||||
logger.warning(
|
|
||||||
f"Connector {connector.connector_name} (ID: {connector.connector_id}) "
|
|
||||||
f"has {len(connector.lineage)} lineage entries but no column lineage. "
|
|
||||||
"Column-level lineage information will be missing."
|
|
||||||
)
|
|
||||||
|
|
||||||
# Try one last attempt to add column lineage
|
|
||||||
self._enhance_missing_column_lineage(connector)
|
|
||||||
|
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Connector {connector.connector_name} (ID: {connector.connector_id}) "
|
f"Connector {connector.connector_name} (ID: {connector.connector_id}) "
|
||||||
f"has {len(connector.lineage)} lineage entries but no job history. "
|
f"has {len(connector.lineage)} lineage entries but no job history. "
|
||||||
|
|||||||
@ -4,7 +4,7 @@
|
|||||||
import difflib
|
import difflib
|
||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
from typing import Any, Dict, List, Optional, Set, Tuple
|
from typing import Dict, List, Optional, Set, Tuple
|
||||||
|
|
||||||
from datahub.configuration.common import AllowDenyPattern
|
from datahub.configuration.common import AllowDenyPattern
|
||||||
from datahub.ingestion.source.fivetran.config import (
|
from datahub.ingestion.source.fivetran.config import (
|
||||||
@ -235,421 +235,83 @@ class FivetranStandardAPI(FivetranAccessInterface):
|
|||||||
|
|
||||||
def _fill_connectors_lineage(self, connectors: List[Connector]) -> None:
|
def _fill_connectors_lineage(self, connectors: List[Connector]) -> None:
|
||||||
"""
|
"""
|
||||||
Fill in lineage information for all connectors with special attention to column lineage.
|
Fill in lineage information for connectors by calling the API with enhanced diagnostics.
|
||||||
|
Ensures every connector with schema information gets lineage.
|
||||||
"""
|
"""
|
||||||
# Cache connectors for later use
|
|
||||||
self._connector_cache = connectors
|
self._connector_cache = connectors
|
||||||
|
|
||||||
# First process explicitly to make sure lineage is extracted
|
|
||||||
for connector in connectors:
|
for connector in connectors:
|
||||||
self._fill_connector_lineage(connector)
|
self._fill_connector_lineage(connector)
|
||||||
|
|
||||||
# Perform second pass for column lineage enhancement
|
|
||||||
self._enhance_column_lineage_for_connectors(connectors)
|
|
||||||
|
|
||||||
def _enhance_column_lineage_for_connectors(
|
|
||||||
self, connectors: List[Connector]
|
|
||||||
) -> None:
|
|
||||||
"""Enhance column lineage for connectors that have tables without column information."""
|
|
||||||
logger.info("Performing secondary column lineage enhancement pass")
|
|
||||||
|
|
||||||
for connector in connectors:
|
|
||||||
try:
|
|
||||||
if not connector.lineage:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Find tables that need column lineage enhancement
|
|
||||||
tables_missing_columns = self._find_tables_missing_columns(connector)
|
|
||||||
if not tables_missing_columns:
|
|
||||||
continue
|
|
||||||
|
|
||||||
logger.info(
|
|
||||||
f"Enhancing column lineage for {len(tables_missing_columns)} tables in connector {connector.connector_id}"
|
|
||||||
)
|
|
||||||
|
|
||||||
# Get schemas if we don't have them already
|
|
||||||
schemas = self.api_client.list_connector_schemas(connector.connector_id)
|
|
||||||
if not schemas:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Make sure we have column info
|
|
||||||
self._preload_all_columns(connector.connector_id, schemas)
|
|
||||||
|
|
||||||
# Process each table missing columns
|
|
||||||
self._process_tables_missing_columns(
|
|
||||||
connector, tables_missing_columns, schemas
|
|
||||||
)
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(
|
|
||||||
f"Error enhancing column lineage for connector {connector.connector_id}: {e}",
|
|
||||||
exc_info=True,
|
|
||||||
)
|
|
||||||
|
|
||||||
def _find_tables_missing_columns(
|
|
||||||
self, connector: Connector
|
|
||||||
) -> List[Tuple[int, TableLineage]]:
|
|
||||||
"""Find tables in a connector that don't have column lineage."""
|
|
||||||
tables_missing_columns = []
|
|
||||||
for idx, table_lineage in enumerate(connector.lineage):
|
|
||||||
if not table_lineage.column_lineage:
|
|
||||||
# Track tables that need column lineage enhancement
|
|
||||||
tables_missing_columns.append((idx, table_lineage))
|
|
||||||
return tables_missing_columns
|
|
||||||
|
|
||||||
def _process_tables_missing_columns(
|
|
||||||
self,
|
|
||||||
connector: Connector,
|
|
||||||
tables_missing_columns: List[Tuple[int, TableLineage]],
|
|
||||||
schemas: List[Dict],
|
|
||||||
) -> None:
|
|
||||||
"""Process tables that are missing column information."""
|
|
||||||
for idx, table_lineage in tables_missing_columns:
|
|
||||||
source_table = table_lineage.source_table
|
|
||||||
if "." not in source_table:
|
|
||||||
continue
|
|
||||||
|
|
||||||
schema_name, table_name = source_table.split(".", 1)
|
|
||||||
|
|
||||||
# Find this table in the schemas
|
|
||||||
self._find_and_enhance_table_columns(
|
|
||||||
connector, idx, table_lineage, schema_name, table_name, schemas
|
|
||||||
)
|
|
||||||
|
|
||||||
def _find_and_enhance_table_columns(
|
|
||||||
self,
|
|
||||||
connector: Connector,
|
|
||||||
idx: int,
|
|
||||||
table_lineage: TableLineage,
|
|
||||||
schema_name: str,
|
|
||||||
table_name: str,
|
|
||||||
schemas: List[Dict],
|
|
||||||
) -> None:
|
|
||||||
"""Find a table in schemas and enhance its column information."""
|
|
||||||
for schema in schemas:
|
|
||||||
if schema.get("name") != schema_name:
|
|
||||||
continue
|
|
||||||
|
|
||||||
for table in schema.get("tables", []):
|
|
||||||
if not isinstance(table, dict) or table.get("name") != table_name:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Found matching table, extract column lineage
|
|
||||||
self._extract_and_set_column_lineage(
|
|
||||||
connector, idx, table_lineage, table
|
|
||||||
)
|
|
||||||
return # Found our table, no need to continue searching
|
|
||||||
|
|
||||||
def _extract_and_set_column_lineage(
|
|
||||||
self, connector: Connector, idx: int, table_lineage: TableLineage, table: Dict
|
|
||||||
) -> None:
|
|
||||||
"""Extract column lineage from table and set it on the lineage object."""
|
|
||||||
# Found matching table, extract column lineage
|
|
||||||
columns = table.get("columns", [])
|
|
||||||
if not columns:
|
|
||||||
return
|
|
||||||
|
|
||||||
# Determine destination platform
|
|
||||||
destination_platform = self._get_destination_platform(connector)
|
|
||||||
|
|
||||||
# Extract column lineage
|
|
||||||
column_lineage = self._extract_column_lineage_from_columns(
|
|
||||||
columns, destination_platform
|
|
||||||
)
|
|
||||||
|
|
||||||
if column_lineage:
|
|
||||||
logger.info(
|
|
||||||
f"Added {len(column_lineage)} columns to lineage for {table_lineage.source_table}"
|
|
||||||
)
|
|
||||||
connector.lineage[idx].column_lineage = column_lineage
|
|
||||||
|
|
||||||
def _extract_column_lineage_from_columns(
|
|
||||||
self, columns: List[Any], destination_platform: str
|
|
||||||
) -> List[ColumnLineage]:
|
|
||||||
"""Extract column lineage from a list of columns."""
|
|
||||||
column_lineage = []
|
|
||||||
is_bigquery = destination_platform.lower() == "bigquery"
|
|
||||||
|
|
||||||
if not isinstance(columns, list):
|
|
||||||
return column_lineage
|
|
||||||
|
|
||||||
for column in columns:
|
|
||||||
col_name = None
|
|
||||||
if isinstance(column, dict):
|
|
||||||
col_name = column.get("name")
|
|
||||||
elif isinstance(column, str):
|
|
||||||
col_name = column
|
|
||||||
|
|
||||||
if col_name and not col_name.startswith("_fivetran"):
|
|
||||||
# Get destination column name - prefer name_in_destination if available
|
|
||||||
dest_col_name = None
|
|
||||||
if isinstance(column, dict) and "name_in_destination" in column:
|
|
||||||
dest_col_name = column.get("name_in_destination")
|
|
||||||
|
|
||||||
# If no name_in_destination, transform based on platform
|
|
||||||
if not dest_col_name:
|
|
||||||
dest_col_name = self._transform_column_name_for_platform(
|
|
||||||
col_name, is_bigquery
|
|
||||||
)
|
|
||||||
|
|
||||||
column_lineage.append(
|
|
||||||
ColumnLineage(
|
|
||||||
source_column=col_name, destination_column=dest_col_name
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
return column_lineage
|
|
||||||
|
|
||||||
def _create_synthetic_lineage(
|
def _create_synthetic_lineage(
|
||||||
self, connector: Connector, schemas: List[dict], destination_platform: str
|
self, connector: Connector, schemas: List[dict], destination_platform: str
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Create synthetic lineage for a connector based just on schema and table information."""
|
"""Create synthetic lineage for a connector based just on schema and table information."""
|
||||||
lineage_list = []
|
lineage_list = []
|
||||||
|
|
||||||
# Process each schema and its tables
|
|
||||||
for schema in schemas:
|
for schema in schemas:
|
||||||
schema_name = schema.get("name", "")
|
schema_name = schema.get("name", "")
|
||||||
if not schema_name:
|
if not schema_name:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
lineage_entries = self._process_schema_for_synthetic_lineage(
|
for table in schema.get("tables", []):
|
||||||
schema, schema_name, destination_platform
|
|
||||||
)
|
|
||||||
lineage_list.extend(lineage_entries)
|
|
||||||
|
|
||||||
if lineage_list:
|
|
||||||
logger.info(
|
|
||||||
f"Created {len(lineage_list)} synthetic table lineage entries for connector {connector.connector_id}"
|
|
||||||
)
|
|
||||||
# Store the lineage in the connector
|
|
||||||
connector.lineage = lineage_list
|
|
||||||
|
|
||||||
def _process_schema_for_synthetic_lineage(
|
|
||||||
self, schema: Dict, schema_name: str, destination_platform: str
|
|
||||||
) -> List[TableLineage]:
|
|
||||||
"""Process a schema to create synthetic lineage entries."""
|
|
||||||
lineage_entries = []
|
|
||||||
|
|
||||||
tables = schema.get("tables", [])
|
|
||||||
if not isinstance(tables, list):
|
|
||||||
return lineage_entries
|
|
||||||
|
|
||||||
for table in tables:
|
|
||||||
if not isinstance(table, dict):
|
if not isinstance(table, dict):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
lineage_entry = self._process_table_for_synthetic_lineage(
|
|
||||||
table, schema, schema_name, destination_platform
|
|
||||||
)
|
|
||||||
|
|
||||||
if lineage_entry:
|
|
||||||
lineage_entries.append(lineage_entry)
|
|
||||||
|
|
||||||
return lineage_entries
|
|
||||||
|
|
||||||
def _process_table_for_synthetic_lineage(
|
|
||||||
self, table: Dict, schema: Dict, schema_name: str, destination_platform: str
|
|
||||||
) -> Optional[TableLineage]:
|
|
||||||
"""Process a table to create a synthetic lineage entry."""
|
|
||||||
table_name = table.get("name", "")
|
table_name = table.get("name", "")
|
||||||
if not table_name or not table.get("enabled", True):
|
if not table_name or not table.get("enabled", True):
|
||||||
return None
|
continue
|
||||||
|
|
||||||
# Create source and destination table identifiers
|
|
||||||
source_table, destination_table = self._create_source_dest_table_names(
|
|
||||||
table, schema, schema_name, table_name, destination_platform
|
|
||||||
)
|
|
||||||
|
|
||||||
# Create column lineage
|
|
||||||
column_lineage = self._create_synthetic_column_lineage(
|
|
||||||
table, source_table, destination_platform
|
|
||||||
)
|
|
||||||
|
|
||||||
# Add this table's lineage if we have table information
|
|
||||||
if source_table and destination_table:
|
|
||||||
logger.info(
|
|
||||||
f"Creating lineage: {source_table} -> {destination_table} with {len(column_lineage)} columns"
|
|
||||||
)
|
|
||||||
return TableLineage(
|
|
||||||
source_table=source_table,
|
|
||||||
destination_table=destination_table,
|
|
||||||
column_lineage=column_lineage,
|
|
||||||
)
|
|
||||||
|
|
||||||
return None
|
|
||||||
|
|
||||||
def _create_source_dest_table_names(
|
|
||||||
self,
|
|
||||||
table: Dict,
|
|
||||||
schema: Dict,
|
|
||||||
schema_name: str,
|
|
||||||
table_name: str,
|
|
||||||
destination_platform: str,
|
|
||||||
) -> Tuple[str, str]:
|
|
||||||
"""Create source and destination table names with proper casing and naming conventions."""
|
|
||||||
# Create source table identifier
|
# Create source table identifier
|
||||||
source_table = f"{schema_name}.{table_name}"
|
source_table = f"{schema_name}.{table_name}"
|
||||||
|
|
||||||
# Get destination names - using name_in_destination if available
|
# Get destination names
|
||||||
schema_name_in_destination = schema.get("name_in_destination")
|
dest_schema = self._get_destination_schema_name(
|
||||||
dest_schema = (
|
schema_name, destination_platform
|
||||||
schema_name_in_destination
|
|
||||||
if schema_name_in_destination
|
|
||||||
else self._get_destination_schema_name(schema_name, destination_platform)
|
|
||||||
)
|
)
|
||||||
|
dest_table = self._get_destination_table_name(
|
||||||
table_name_in_destination = table.get("name_in_destination")
|
table_name, destination_platform
|
||||||
dest_table = (
|
|
||||||
table_name_in_destination
|
|
||||||
if table_name_in_destination
|
|
||||||
else self._get_destination_table_name(table_name, destination_platform)
|
|
||||||
)
|
)
|
||||||
|
|
||||||
destination_table = f"{dest_schema}.{dest_table}"
|
destination_table = f"{dest_schema}.{dest_table}"
|
||||||
|
|
||||||
return source_table, destination_table
|
# Create synthetic column lineage if we have column info
|
||||||
|
|
||||||
def _create_synthetic_column_lineage(
|
|
||||||
self, table: Dict, source_table: str, destination_platform: str
|
|
||||||
) -> List[ColumnLineage]:
|
|
||||||
"""Create synthetic column lineage for a table."""
|
|
||||||
column_lineage = []
|
column_lineage = []
|
||||||
|
|
||||||
# First try with list format columns
|
|
||||||
columns = table.get("columns", [])
|
columns = table.get("columns", [])
|
||||||
|
|
||||||
if isinstance(columns, list):
|
if isinstance(columns, list):
|
||||||
column_lineage = self._process_list_columns_for_lineage(
|
|
||||||
columns, destination_platform
|
|
||||||
)
|
|
||||||
|
|
||||||
# Try to extract columns from dict format if no columns found yet
|
|
||||||
if not column_lineage and isinstance(table.get("columns"), dict):
|
|
||||||
column_dict = table.get("columns", {})
|
|
||||||
column_lineage = self._process_dict_columns_for_lineage(
|
|
||||||
column_dict, destination_platform
|
|
||||||
)
|
|
||||||
|
|
||||||
# If we still don't have columns but we know the source table,
|
|
||||||
# try to get columns from API or similar tables
|
|
||||||
if not column_lineage:
|
|
||||||
logger.info(
|
|
||||||
f"No columns found for {source_table}, attempting additional discovery"
|
|
||||||
)
|
|
||||||
column_lineage = self._get_columns_from_additional_sources(
|
|
||||||
source_table, destination_platform
|
|
||||||
)
|
|
||||||
|
|
||||||
return column_lineage
|
|
||||||
|
|
||||||
def _process_list_columns_for_lineage(
|
|
||||||
self, columns: List[Any], destination_platform: str
|
|
||||||
) -> List[ColumnLineage]:
|
|
||||||
"""Process list format columns for lineage."""
|
|
||||||
column_lineage = []
|
|
||||||
|
|
||||||
for column in columns:
|
for column in columns:
|
||||||
if not column:
|
|
||||||
continue
|
|
||||||
|
|
||||||
col_name = None
|
col_name = None
|
||||||
if isinstance(column, dict):
|
if isinstance(column, dict):
|
||||||
col_name = column.get("name")
|
col_name = column.get("name")
|
||||||
elif isinstance(column, str):
|
elif isinstance(column, str):
|
||||||
col_name = column
|
col_name = column
|
||||||
|
|
||||||
if not col_name or col_name.startswith("_fivetran"):
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Get the destination column name
|
|
||||||
dest_col_name = self._get_destination_column_name(
|
|
||||||
column, col_name, destination_platform
|
|
||||||
)
|
|
||||||
|
|
||||||
# Add to lineage
|
|
||||||
column_lineage.append(
|
|
||||||
ColumnLineage(source_column=col_name, destination_column=dest_col_name)
|
|
||||||
)
|
|
||||||
|
|
||||||
return column_lineage
|
|
||||||
|
|
||||||
def _get_destination_column_name(
|
|
||||||
self, column: Any, col_name: str, destination_platform: str
|
|
||||||
) -> str:
|
|
||||||
"""Get destination column name with proper handling of name_in_destination."""
|
|
||||||
# First check for name_in_destination
|
|
||||||
dest_col_name = None
|
|
||||||
if isinstance(column, dict) and "name_in_destination" in column:
|
|
||||||
dest_col_name = column.get("name_in_destination")
|
|
||||||
logger.debug(
|
|
||||||
f"Using name_in_destination {dest_col_name} for column {col_name}"
|
|
||||||
)
|
|
||||||
|
|
||||||
# If no explicit mapping, transform based on destination platform
|
|
||||||
if not dest_col_name:
|
|
||||||
is_bigquery = destination_platform.lower() == "bigquery"
|
|
||||||
dest_col_name = self._transform_column_name_for_platform(
|
|
||||||
col_name, is_bigquery
|
|
||||||
)
|
|
||||||
logger.debug(f"Transformed column name: {col_name} -> {dest_col_name}")
|
|
||||||
|
|
||||||
return dest_col_name
|
|
||||||
|
|
||||||
def _process_dict_columns_for_lineage(
|
|
||||||
self, column_dict: Dict, destination_platform: str
|
|
||||||
) -> List[ColumnLineage]:
|
|
||||||
"""Process dictionary format columns for lineage."""
|
|
||||||
column_lineage = []
|
|
||||||
|
|
||||||
logger.info("Extracting columns from dictionary format")
|
|
||||||
|
|
||||||
for col_name, col_info in column_dict.items():
|
|
||||||
if col_name.startswith("_fivetran"):
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Get destination column name
|
|
||||||
dest_col_name = None
|
|
||||||
if isinstance(col_info, dict) and "name_in_destination" in col_info:
|
|
||||||
dest_col_name = col_info.get("name_in_destination")
|
|
||||||
|
|
||||||
if not dest_col_name:
|
|
||||||
is_bigquery = destination_platform.lower() == "bigquery"
|
|
||||||
dest_col_name = self._transform_column_name_for_platform(
|
|
||||||
col_name, is_bigquery
|
|
||||||
)
|
|
||||||
|
|
||||||
column_lineage.append(
|
|
||||||
ColumnLineage(source_column=col_name, destination_column=dest_col_name)
|
|
||||||
)
|
|
||||||
|
|
||||||
return column_lineage
|
|
||||||
|
|
||||||
def _get_columns_from_additional_sources(
|
|
||||||
self, source_table: str, destination_platform: str
|
|
||||||
) -> List[ColumnLineage]:
|
|
||||||
"""Try to get columns from additional sources when normal methods fail."""
|
|
||||||
column_lineage = []
|
|
||||||
|
|
||||||
# Try to infer from API
|
|
||||||
columns_from_api = self._get_columns_from_api(source_table)
|
|
||||||
if columns_from_api:
|
|
||||||
logger.info(
|
|
||||||
f"Found {len(columns_from_api)} columns from API for {source_table}"
|
|
||||||
)
|
|
||||||
|
|
||||||
is_bigquery = destination_platform.lower() == "bigquery"
|
|
||||||
for col in columns_from_api:
|
|
||||||
col_name = col.get("name") if isinstance(col, dict) else col
|
|
||||||
if col_name and not col_name.startswith("_fivetran"):
|
if col_name and not col_name.startswith("_fivetran"):
|
||||||
dest_col_name = self._transform_column_name_for_platform(
|
is_bigquery = destination_platform.lower() == "bigquery"
|
||||||
|
dest_col = self._transform_column_name_for_platform(
|
||||||
col_name, is_bigquery
|
col_name, is_bigquery
|
||||||
)
|
)
|
||||||
column_lineage.append(
|
column_lineage.append(
|
||||||
ColumnLineage(
|
ColumnLineage(
|
||||||
source_column=col_name, destination_column=dest_col_name
|
source_column=col_name, destination_column=dest_col
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
return column_lineage
|
# Add this table's lineage
|
||||||
|
lineage_list.append(
|
||||||
|
TableLineage(
|
||||||
|
source_table=source_table,
|
||||||
|
destination_table=destination_table,
|
||||||
|
column_lineage=column_lineage,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
if lineage_list:
|
||||||
|
logger.info(
|
||||||
|
f"Created {len(lineage_list)} synthetic table lineage entries for connector {connector.connector_id}"
|
||||||
|
)
|
||||||
|
# Set the lineage directly on the connector instead of using _lineage_cache
|
||||||
|
connector.lineage = lineage_list
|
||||||
|
|
||||||
def _process_connector(
|
def _process_connector(
|
||||||
self,
|
self,
|
||||||
@ -1356,13 +1018,8 @@ class FivetranStandardAPI(FivetranAccessInterface):
|
|||||||
|
|
||||||
return column_lineage
|
return column_lineage
|
||||||
|
|
||||||
def _get_columns_from_api(
|
def _get_columns_from_api(self, source_table: str) -> List[Dict]:
|
||||||
self, source_table: str, connector_id: str = None
|
"""Get columns directly from Fivetran API for a table."""
|
||||||
) -> List[Dict]:
|
|
||||||
"""
|
|
||||||
Get columns directly from Fivetran API for a table.
|
|
||||||
Enhanced to use provided connector_id when available.
|
|
||||||
"""
|
|
||||||
# Parse schema and table name
|
# Parse schema and table name
|
||||||
if "." not in source_table:
|
if "." not in source_table:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
@ -1372,24 +1029,23 @@ class FivetranStandardAPI(FivetranAccessInterface):
|
|||||||
|
|
||||||
schema_name, table_name = source_table.split(".", 1)
|
schema_name, table_name = source_table.split(".", 1)
|
||||||
|
|
||||||
# Use provided connector_id or try to find it
|
# Find the connector ID for this source table
|
||||||
conn_id = connector_id
|
connector_id = self._find_connector_id_for_source_table(source_table)
|
||||||
if not conn_id:
|
if not connector_id:
|
||||||
conn_id = self._find_connector_id_for_source_table(source_table)
|
|
||||||
|
|
||||||
if not conn_id:
|
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"Could not find connector ID for source table {source_table}"
|
f"Could not find connector ID for source table {source_table}"
|
||||||
)
|
)
|
||||||
return []
|
return []
|
||||||
|
|
||||||
logger.info(f"Using connector ID {conn_id} for source table {source_table}")
|
logger.info(
|
||||||
|
f"Found connector ID {connector_id} for source table {source_table}"
|
||||||
|
)
|
||||||
logger.info(f"Querying API for columns of {schema_name}.{table_name}")
|
logger.info(f"Querying API for columns of {schema_name}.{table_name}")
|
||||||
|
|
||||||
# Call the API to get columns using the direct columns endpoint
|
# Call the API to get columns using the direct columns endpoint
|
||||||
try:
|
try:
|
||||||
columns = self.api_client.get_table_columns(
|
columns = self.api_client.get_table_columns(
|
||||||
conn_id, schema_name, table_name
|
connector_id, schema_name, table_name
|
||||||
)
|
)
|
||||||
if columns:
|
if columns:
|
||||||
logger.info(
|
logger.info(
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user