Column-level lineage (CLL) improvements

commit 0816bf3155
parent 5642906be3
Author: Jonny Dixon
Date: 2025-03-19 23:24:54 +01:00
3 changed files with 604 additions and 375 deletions


@ -356,12 +356,16 @@ class FivetranSource(StatefulIngestionSourceBase):
def _transform_column_name_for_platform(
self, column_name: str, is_bigquery: bool
) -> str:
"""Transform column name based on the destination platform."""
"""Transform column name based on the destination platform with better handling of edge cases."""
if not column_name:
return ""
if is_bigquery:
# For BigQuery:
# 1. Convert to lowercase
# 2. Replace camelCase with snake_case
# 3. Clean up any invalid characters
import re
# Step 1: Convert camelCase to snake_case with regex
s1 = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", column_name)
@ -373,6 +377,10 @@ class FivetranSource(StatefulIngestionSourceBase):
# Step 3: Remove leading/trailing underscores and collapse multiple underscores
transformed = re.sub(r"_+", "_", transformed).strip("_")
# Log the transformation for debugging
if transformed != column_name.lower():
logger.debug(f"Transformed column: {column_name} -> {transformed}")
return transformed
else:
# For other platforms like Snowflake, typically uppercase
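For reference, a minimal standalone sketch of the two-step regex above (sample column names are made up; the behavior matches the steps documented in the comments):

import re

def to_snake_case(column_name: str) -> str:
    # Step 1: split camelCase boundaries ("APIKey" -> "API_Key", "userId" -> "user_Id")
    s1 = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", column_name)
    s2 = re.sub("([a-z0-9])([A-Z])", r"\1_\2", s1)
    # Step 2: lowercase and replace invalid characters with underscores
    cleaned = re.sub(r"[^a-zA-Z0-9_]", "_", s2.lower())
    # Step 3: collapse repeated underscores and trim leading/trailing ones
    return re.sub(r"_+", "_", cleaned).strip("_")

assert to_snake_case("userId") == "user_id"
assert to_snake_case("APIKey") == "api_key"
assert to_snake_case("order-total$") == "order_total"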
@ -444,88 +452,70 @@ class FivetranSource(StatefulIngestionSourceBase):
dest_urn: Optional[DatasetUrn],
fine_grained_lineage: List[FineGrainedLineage],
) -> None:
"""Create column-level lineage between source and destination tables with fuzzy matching."""
"""Create column-level lineage between source and destination tables with better diagnostics."""
if not source_urn or not dest_urn:
return
# Log details for debugging
logger.info(f"Creating column lineage from {source_urn} to {dest_urn}")
# Get destination platform
dest_platform = str(dest_urn).split(",")[0].split(":")[-1]
is_bigquery = dest_platform.lower() == "bigquery"
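# Example of the parse above (dataset name is illustrative):
# "urn:li:dataset:(urn:li:dataPlatform:bigquery,my_project.my_ds.my_table,PROD)"
#   .split(",")[0]  -> "urn:li:dataset:(urn:li:dataPlatform:bigquery"
#   .split(":")[-1] -> "bigquery"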
if not lineage.column_lineage:
logger.warning(
f"No column lineage data available for {lineage.source_table} -> {lineage.destination_table}"
)
fine_grained_lineage.append(
FineGrainedLineage(
upstreamType=FineGrainedLineageUpstreamType.FIELD_SET,
upstreams=[str(source_urn)],
downstreamType=FineGrainedLineageDownstreamType.FIELD,
downstreams=[str(dest_urn)],
)
)
return
logger.info(f"Processing {len(lineage.column_lineage)} column mappings")
valid_lineage = []
for column_lineage in lineage.column_lineage:
if (
not column_lineage.source_column
or not column_lineage.destination_column
or column_lineage.destination_column.startswith("_fivetran")
):
continue
valid_lineage.append(column_lineage)
if not valid_lineage:
logger.warning("No valid column mappings found after filtering")
return
# Process valid column mappings
for column_lineage in valid_lineage:
try:
# Log what we're processing
logger.debug(
f"Processing: {column_lineage.source_column} -> {column_lineage.destination_column}"
)
# Create field URNs
source_field_urn = builder.make_schema_field_urn(
str(source_urn),
column_lineage.source_column,
)
# For BigQuery, ensure proper case and format
dest_column = column_lineage.destination_column
if is_bigquery:
dest_column = dest_column.lower()
dest_field_urn = builder.make_schema_field_urn(
str(dest_urn),
dest_column,
)
# Add to fine-grained lineage
@ -543,43 +533,7 @@ class FivetranSource(StatefulIngestionSourceBase):
)
except Exception as e:
logger.warning(
f"Failed to create column lineage for {source_col} -> {dest_col}: {e}"
)
# Log the total number of lineage entries created
if fine_grained_lineage:
logger.info(
f"Created {len(fine_grained_lineage)} field lineage entries for {source_urn} -> {dest_urn}"
)
else:
logger.warning(
f"No valid column lineage mappings found for {source_urn} -> {dest_urn}"
)
else:
# No column mappings provided - log a warning
logger.warning(
f"No column lineage data available for {lineage.source_table} -> {lineage.destination_table}. "
f"This may indicate an issue with schema retrieval from the Fivetran API."
)
# Add a special note in the report
self.report.warning(
title="Missing column lineage",
message=(
"No column lineage information was available for some tables. "
"This may indicate an issue with schema retrieval from the Fivetran API."
),
context=f"{lineage.source_table}{lineage.destination_table}",
)
# Add a placeholder entry to indicate table-level lineage only
fine_grained_lineage.append(
FineGrainedLineage(
upstreamType=FineGrainedLineageUpstreamType.FIELD_SET,
upstreams=[str(source_urn)],
downstreamType=FineGrainedLineageDownstreamType.FIELD,
downstreams=[str(dest_urn)],
)
f"Failed to create column lineage for {column_lineage.source_column} -> {column_lineage.destination_column}: {e}"
)
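For context, a minimal sketch of the objects this method assembles, using made-up dataset names and the standard DataHub builder helpers:

import datahub.emitter.mce_builder as builder
from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
    FineGrainedLineage,
    FineGrainedLineageDownstreamType,
    FineGrainedLineageUpstreamType,
)

src = builder.make_dataset_urn("postgres", "mydb.public.users")
dst = builder.make_dataset_urn("bigquery", "my_project.my_ds.users")
entry = FineGrainedLineage(
    upstreamType=FineGrainedLineageUpstreamType.FIELD_SET,
    upstreams=[builder.make_schema_field_urn(src, "userId")],
    downstreamType=FineGrainedLineageDownstreamType.FIELD,
    downstreams=[builder.make_schema_field_urn(dst, "user_id")],
)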
def _create_field_lineage_mcp(


@ -166,6 +166,97 @@ class FivetranAPIClient:
)
return connector_data
def get_table_columns(
self, connector_id: str, schema_name: str, table_name: str
) -> List[Dict]:
"""
Get detailed column information for a specific table using the tables API endpoint.
This is more reliable for column information than the schemas endpoint.
Args:
connector_id: The Fivetran connector ID
schema_name: The schema name
table_name: The table name
Returns:
List of column dictionaries with name, type, and other properties
"""
try:
# URL-encode the schema and table names to handle special characters
import urllib.parse
encoded_schema = urllib.parse.quote(schema_name)
encoded_table = urllib.parse.quote(table_name)
logger.info(f"Fetching column info directly for {schema_name}.{table_name}")
# Make the API request for detailed table information
response = self._make_request(
"GET",
f"/connectors/{connector_id}/schemas/{encoded_schema}/tables/{encoded_table}",
)
# Extract column information
table_data = response.get("data", {})
logger.debug(f"Table API response structure: {list(table_data.keys())}")
columns_data = table_data.get("columns", {})
# Convert column data to a list format if it's a dictionary
columns = []
if isinstance(columns_data, dict):
for col_name, col_info in columns_data.items():
if isinstance(col_info, dict):
col_entry = (
col_info.copy()
) # Create a copy to avoid modifying the original
col_entry["name"] = col_name
# Ensure there's an enabled field
if "enabled" not in col_entry:
col_entry["enabled"] = True
# Add the column if it's enabled
if col_entry.get("enabled", True):
columns.append(col_entry)
else:
# Simple case where we just have column names
columns.append({"name": col_name, "enabled": True})
elif isinstance(columns_data, list):
columns = [col for col in columns_data if col.get("enabled", True)]
# Check if we have name_in_destination info
for col in columns:
if (
isinstance(col, dict)
and "name_in_destination" not in col
and "name" in col
):
# Add name_in_destination based on destination platform
destination = self.detect_destination_platform(
table_data.get("destination_id", "")
)
if destination.lower() == "bigquery":
# Convert to snake_case for BigQuery
import re
name = col["name"]
s1 = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", name)
s2 = re.sub("([a-z0-9])([A-Z])", r"\1_\2", s1)
col["name_in_destination"] = s2.lower()
elif destination.lower() in ["snowflake", "redshift"]:
# Convert to uppercase for Snowflake/Redshift
col["name_in_destination"] = col["name"].upper()
logger.info(
f"Retrieved {len(columns)} columns for {schema_name}.{table_name} via direct table API"
)
return columns
except Exception as e:
logger.warning(f"Failed to get columns for {schema_name}.{table_name}: {e}")
return []
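For illustration, a standalone sketch of the dict-to-list normalization above, run on a made-up payload (field names are representative, not from a real response):

columns_data = {
    "accountId": {"type_": "STRING", "enabled": True},
    "isDeleted": {"enabled": False},  # disabled columns are dropped
    "industry": True,  # bare values mean "column exists, assume enabled"
}
columns = []
for col_name, col_info in columns_data.items():
    if isinstance(col_info, dict):
        entry = {**col_info, "name": col_name}
        entry.setdefault("enabled", True)
        if entry["enabled"]:
            columns.append(entry)
    else:
        columns.append({"name": col_name, "enabled": True})
# columns -> [{"type_": "STRING", "enabled": True, "name": "accountId"},
#             {"name": "industry", "enabled": True}]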
def _enrich_salesforce_connector(
self, connector_id: str, connector_data: Dict
) -> None:
@ -254,12 +345,21 @@ class FivetranAPIClient:
Normalize schema information into a consistent format regardless of API response structure.
"""
schemas = []
# Log what we're working with
logger.debug(f"Raw schema response type: {type(raw_schemas)}")
if isinstance(raw_schemas, dict):
logger.debug(f"Schema keys: {list(raw_schemas.keys())}")
elif isinstance(raw_schemas, list):
logger.debug(f"Schema list length: {len(raw_schemas)}")
if raw_schemas:
logger.debug(f"First schema item type: {type(raw_schemas[0])}")
else:
logger.debug(f"Unexpected schema format: {str(raw_schemas)[:100]}...")
# Handle different response formats
if isinstance(raw_schemas, dict):
# Handle nested object format (older API versions)
logger.debug(f"Schema keys: {list(raw_schemas.keys())}")
logger.info(f"Converting nested schema format for connector {connector_id}")
for schema_name, schema_data in raw_schemas.items():
# Convert to the expected format
@ -308,10 +408,6 @@ class FivetranAPIClient:
schemas.append(schema_obj)
elif isinstance(raw_schemas, list):
logger.debug(f"Schema list length: {len(raw_schemas)}")
if raw_schemas:
logger.debug(f"First schema item type: {type(raw_schemas[0])}")
# Already in the expected list format
schemas = raw_schemas
@ -324,7 +420,6 @@ class FivetranAPIClient:
if "columns" not in table:
table["columns"] = []
else:
logger.debug(f"Unexpected schema format: {raw_schemas[:100]}...")
logger.warning(
f"Unexpected schema format type for connector {connector_id}: {type(raw_schemas)}"
)
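The two response shapes this normalization accepts, as hypothetical minimal payloads:

# Older nested-object format (keyed by schema name):
raw_schemas_dict = {
    "public": {"enabled": True, "tables": {"users": {"enabled": True}}},
}
# Newer list format (already close to the normalized output):
raw_schemas_list = [
    {"name": "public", "enabled": True, "tables": [{"name": "users"}]},
]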
@ -689,58 +784,6 @@ class FivetranAPIClient:
f"After retrieval attempts, {tables_still_missing} tables still missing column information"
)
def _process_column_data(self, columns: Any) -> List[Dict]:
"""
Process column data from various API response formats into a consistent format.
@ -1348,20 +1391,15 @@ class FivetranAPIClient:
Uses a generic approach that works for any connector type and properly handles name_in_destination.
"""
try:
# Get the connector details first
connector_details = self.get_connector(connector_id)
# Get destination information
destination_id = connector_details.get("group", {}).get("id", "")
destination_platform = self.detect_destination_platform(destination_id)
# Get schema information
schemas = self.list_connector_schemas(connector_id)
lineage_list = []
# Handle cases where schemas might be a string or invalid format
@ -1581,9 +1619,14 @@ class FivetranAPIClient:
"""
if destination_platform.lower() == "bigquery":
# BigQuery column names are case-sensitive and typically lowercase
# Also convert camelCase to snake_case
import re
s1 = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", column_name)
s2 = re.sub("([a-z0-9])([A-Z])", r"\1_\2", s1)
return s2.lower()
else:
# For other platforms like Snowflake, typically uppercase
return column_name.upper()
def _build_lineage_from_schemas(


@ -1,5 +1,6 @@
import logging
import re
from typing import Any, Dict, List, Optional, Set, Tuple
from datahub.configuration.common import AllowDenyPattern
from datahub.ingestion.source.fivetran.config import (
@ -360,44 +361,19 @@ class FivetranStandardAPI(FivetranAccessInterface):
return destinations_seen, destination_details
def _process_schemas_for_lineage(
self,
connector: Connector,
schemas: List[Dict],
source_table_columns: Dict[str, Dict[str, str]],
) -> List[TableLineage]:
"""
Process schemas to extract lineage information for a connector.
This was extracted from _fill_connectors_lineage to reduce complexity.
"""
lineage_list = []
destination_platform = self._get_destination_platform(connector)
# Process each schema
for schema in schemas:
try:
schema_name = schema.get("name", "")
@ -444,9 +420,7 @@ class FivetranStandardAPI(FivetranAccessInterface):
# Get destination table name - prefer name_in_destination if available
dest_table = None
table_name_in_destination = table.get("name_in_destination")
if table_name_in_destination:
dest_table = table_name_in_destination
logger.debug(
@ -493,6 +467,50 @@ class FivetranStandardAPI(FivetranAccessInterface):
f"Error processing schema {schema.get('name', 'unknown')}: {schema_e}"
)
return lineage_list
def _fill_connectors_lineage(self, connectors: List[Connector]) -> None:
"""
Fill in lineage information for connectors by calling the API with enhanced diagnostics and robust error handling.
"""
for connector in connectors:
try:
logger.info(
f"Extracting lineage for connector {connector.connector_id}"
)
# Determine destination platform
destination_platform = self._get_destination_platform(connector)
connector.additional_properties["destination_platform"] = (
destination_platform
)
logger.info(
f"Using destination platform {destination_platform} for connector {connector.connector_id}"
)
# Get schema information from API
schemas = self.api_client.list_connector_schemas(connector.connector_id)
# DIAGNOSTIC: Log detailed schema information
self._log_schema_diagnostics(schemas)
# If we have no columns at all, try direct fetching for each table
if self._should_fetch_missing_columns(schemas):
logger.warning(
"No columns found in initial schema fetch. Attempting direct table column fetching."
)
self._fetch_missing_columns(connector.connector_id, schemas)
self._log_schema_diagnostics(schemas) # Log updated stats
# First, collect all source columns with their types for each table
# This will help with generating column-level lineage
source_table_columns = self._collect_source_columns(schemas)
# Process schemas to extract lineage information
lineage_list = self._process_schemas_for_lineage(
connector, schemas, source_table_columns
)
# Truncate if necessary
if len(lineage_list) > MAX_TABLE_LINEAGE_PER_CONNECTOR:
logger.warning(
@ -503,9 +521,8 @@ class FivetranStandardAPI(FivetranAccessInterface):
connector.lineage = lineage_list
# Final stats logging
self._log_lineage_stats(lineage_list, connector.connector_id)
except Exception as e:
logger.error(
@ -514,6 +531,92 @@ class FivetranStandardAPI(FivetranAccessInterface):
)
connector.lineage = []
def _log_schema_diagnostics(self, schemas: List[Dict]) -> None:
"""Log diagnostic information about schemas and their columns."""
total_columns = 0
total_tables_with_columns = 0
total_tables = 0
for schema in schemas:
schema_name = schema.get("name", "")
for table in schema.get("tables", []):
total_tables += 1
table_name = table.get("name", "")
columns = table.get("columns", [])
if columns:
total_tables_with_columns += 1
total_columns += len(columns)
logger.info(
f"Table {schema_name}.{table_name} has {len(columns)} columns"
)
# DIAGNOSTIC: Print a sample of column names
column_names = [col.get("name", "unknown") for col in columns[:5]]
logger.info(f"Sample columns: {column_names}")
else:
logger.warning(f"Table {schema_name}.{table_name} has NO columns")
logger.info(
f"SCHEMA STATS: {total_tables_with_columns}/{total_tables} tables have columns, total {total_columns} columns"
)
def _should_fetch_missing_columns(self, schemas: List[Dict]) -> bool:
"""Determine if we need to fetch missing columns based on schema content."""
total_columns = 0
total_tables = 0
for schema in schemas:
for table in schema.get("tables", []):
total_tables += 1
columns = table.get("columns", [])
if columns:
total_columns += len(columns)
return total_columns == 0 and total_tables > 0
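A quick worked case for the check above, with a hypothetical schema payload:

schemas = [
    {"name": "public", "tables": [{"name": "users", "columns": []}, {"name": "orders"}]},
]
# total_tables == 2, total_columns == 0 -> True, so direct column fetching is attempted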
def _log_lineage_stats(
self, lineage_list: List[TableLineage], connector_id: str
) -> None:
"""Log statistics about lineage processing."""
tables_with_columns = len(
[
table_lineage
for table_lineage in lineage_list
if table_lineage.column_lineage
]
)
total_column_mappings = sum(
len(table_lineage.column_lineage) for table_lineage in lineage_list
)
logger.info(
f"Lineage stats for connector {connector_id}: "
f"{len(lineage_list)} table lineages, {tables_with_columns} tables with column lineage, "
f"{total_column_mappings} total column mappings"
)
def _fetch_missing_columns(self, connector_id: str, schemas: List[Dict]) -> None:
"""Attempt to fetch missing column information directly for each table."""
for schema in schemas:
schema_name = schema.get("name", "")
for table in schema.get("tables", []):
table_name = table.get("name", "")
if not table.get("columns") and table.get("enabled", True):
try:
# Try direct column fetching
columns = self.api_client.get_table_columns(
connector_id, schema_name, table_name
)
if columns:
table["columns"] = columns
logger.info(
f"Directly fetched {len(columns)} columns for {schema_name}.{table_name}"
)
except Exception as e:
logger.warning(
f"Failed to directly fetch columns for {schema_name}.{table_name}: {e}"
)
def _collect_source_columns(self, schemas: List[Dict]) -> Dict[str, Dict[str, str]]:
"""
Collect all source columns with their types for each table.
@ -559,6 +662,88 @@ class FivetranStandardAPI(FivetranAccessInterface):
return source_columns
def _process_columns_from_list(
self, columns: List[Any], is_bigquery: bool, source_table: str
) -> List[ColumnLineage]:
"""
Process columns from list format and create column lineage.
Extracted from _extract_column_lineage to reduce complexity.
"""
column_lineage = []
for column in columns:
col_name = None
if isinstance(column, dict):
col_name = column.get("name")
elif isinstance(column, str):
col_name = column
if not col_name:
continue
# Skip Fivetran system columns
if col_name.startswith("_fivetran"):
continue
# Get destination column name - prefer name_in_destination if available
dest_col_name = None
if isinstance(column, dict) and "name_in_destination" in column:
dest_col_name = column.get("name_in_destination")
logger.debug(
f"Using name_in_destination: {col_name} -> {dest_col_name}"
)
# If no name_in_destination, transform based on platform
if not dest_col_name:
dest_col_name = self._transform_column_name_for_platform(
col_name, is_bigquery
)
logger.debug(f"Transformed name: {col_name} -> {dest_col_name}")
# Add to lineage
column_lineage.append(
ColumnLineage(
source_column=col_name,
destination_column=dest_col_name,
)
)
logger.info(
f"Created {len(column_lineage)} column lineage entries for {source_table}"
)
return column_lineage
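Illustrative input for the method above (entries are made up), showing each branch:

columns = [
    {"name": "userId", "name_in_destination": "user_id"},  # explicit mapping wins
    {"name": "createdAt"},  # no mapping -> platform transform ("created_at" on BigQuery)
    "_fivetran_synced",  # bare string accepted, but skipped as a system column
]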
def _process_columns_from_source(
self, source_columns: Dict[str, str], is_bigquery: bool, source_table: str
) -> List[ColumnLineage]:
"""
Create column lineage from source column information.
Extracted from _extract_column_lineage to reduce complexity.
"""
column_lineage = []
logger.info(f"Using {len(source_columns)} columns from source table schema")
# Create lineage for each source column
for col_name in source_columns:
if col_name.startswith("_fivetran"):
continue
# Transform destination column name based on platform
dest_col_name = self._transform_column_name_for_platform(
col_name, is_bigquery
)
column_lineage.append(
ColumnLineage(
source_column=col_name,
destination_column=dest_col_name,
)
)
logger.info(f"Created {len(column_lineage)} fallback column lineage entries")
return column_lineage
def _extract_column_lineage(
self,
table: Dict,
@ -567,7 +752,7 @@ class FivetranStandardAPI(FivetranAccessInterface):
source_table_columns: Dict[str, Dict[str, str]],
) -> List[ColumnLineage]:
"""
Extract column-level lineage for a table with improved debugging and fallback.
Args:
table: Table data from API
@ -578,84 +763,111 @@ class FivetranStandardAPI(FivetranAccessInterface):
Returns:
List of ColumnLineage objects
"""
# Get detailed logging for debugging
logger.info(
f"Extracting column lineage for {source_table} to {destination_platform}"
)
logger.debug(f"Table data keys: {list(table.keys())}")
# Get columns from the API response
columns = table.get("columns", [])
# DIAGNOSTIC: Print details about what we got
self._log_column_diagnostics(columns)
# CRITICAL FIX: Convert dict format to list if needed
if isinstance(columns, dict):
columns = self._convert_column_dict_to_list(columns)
is_bigquery = destination_platform.lower() == "bigquery"
column_lineage = []
# If we have columns, create lineage mappings
if isinstance(columns, list) and columns:
column_lineage = self._process_columns_from_list(
columns, is_bigquery, source_table
)
else:
# No column information from API, try other methods
logger.warning(f"No usable column information for {source_table} from API")
# Use source_table_columns if available (fallback method)
source_columns = source_table_columns.get(source_table, {})
if source_columns:
column_lineage = self._process_columns_from_source(
source_columns, is_bigquery, source_table
)
else:
# Try to use column mapping from config if available
column_lineage = self._try_get_column_mapping_from_config(source_table)
# Final check and log
if column_lineage:
logger.info(
f"Returning {len(column_lineage)} column lineage mappings for {source_table}"
)
else:
logger.warning(f"No column lineage mappings created for {source_table}")
return column_lineage
def _log_column_diagnostics(self, columns: Any) -> None:
"""Log diagnostic information about column data."""
if isinstance(columns, list):
logger.info(f"Found {len(columns)} columns in list format")
if columns:
sample = columns[:2]
logger.debug(f"Sample columns: {sample}")
elif isinstance(columns, dict):
logger.info(f"Found {len(columns)} columns in dict format")
if columns:
sample_keys = list(columns.keys())[:2]
logger.debug(f"Sample column keys: {sample_keys}")
else:
logger.warning(f"Columns in unexpected format: {type(columns)}")
def _convert_column_dict_to_list(self, columns_dict: Dict) -> List[Dict]:
"""Convert column dictionary to list format for consistent processing."""
columns_list = []
for col_name, col_data in columns_dict.items():
if isinstance(col_data, dict):
col_data = col_data.copy()
col_data["name"] = col_name
columns_list.append(col_data)
else:
columns_list.append({"name": col_name})
logger.info(f"Converted dict format to list with {len(columns_list)} columns")
return columns_list
def _try_get_column_mapping_from_config(
self, source_table: str
) -> List[ColumnLineage]:
"""Try to get column mapping from configuration if available."""
column_lineage = []
# Check if there's a config attribute with column mapping info
if hasattr(self, "config") and self.config:
# Check for any attribute that might have column mappings
for attr_name in dir(self.config):
if "column_mapping" in attr_name.lower() and hasattr(
self.config, attr_name
):
mapping_attr = getattr(self.config, attr_name)
if isinstance(mapping_attr, dict) and source_table in mapping_attr:
column_mapping = mapping_attr.get(source_table, {})
logger.info(
f"Found config column mapping for {source_table}: {len(column_mapping)} columns"
)
for source_col, dest_col in column_mapping.items():
column_lineage.append(
ColumnLineage(
source_column=source_col,
destination_column=dest_col,
)
)
return column_lineage
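The attribute scan above is defensive; a hypothetical config shape it would pick up (the attribute name and structure are assumptions, not a documented option):

# config.table_column_mapping = {
#     "public.users": {"userId": "user_id", "createdAt": "created_at"},
# }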
def _get_destination_platform(self, connector: Connector) -> str:
@ -741,16 +953,36 @@ class FivetranStandardAPI(FivetranAccessInterface):
# For most other systems (Snowflake, Redshift, etc.), table names are uppercased
return table_name.upper()
def _transform_column_name_for_platform(
self, column_name: str, is_bigquery: bool
) -> str:
"""
Transform column name based on the destination platform with better handling of edge cases.
"""
if not column_name:
return ""
if is_bigquery:
# For BigQuery:
# 1. Convert to lowercase
# 2. Replace camelCase with snake_case
# 3. Clean up any invalid characters
# Step 1: Convert camelCase to snake_case with regex
s1 = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", column_name)
s2 = re.sub("([a-z0-9])([A-Z])", r"\1_\2", s1)
# Step 2: lowercase and replace non-alphanumeric with underscore
transformed = re.sub(r"[^a-zA-Z0-9_]", "_", s2.lower())
# Step 3: Remove leading/trailing underscores and collapse multiple underscores
transformed = re.sub(r"_+", "_", transformed).strip("_")
# Log the transformation for debugging
if transformed != column_name.lower():
logger.debug(f"Transformed column: {column_name} -> {transformed}")
return transformed
else:
# For other platforms like Snowflake, typically uppercase
return column_name.upper()