mirror of https://github.com/datahub-project/datahub.git
synced 2025-11-10 16:32:26 +00:00

commit 0816bf3155 (parent 5642906be3)

    cll improvements
@@ -356,12 +356,16 @@ class FivetranSource(StatefulIngestionSourceBase):
     def _transform_column_name_for_platform(
         self, column_name: str, is_bigquery: bool
     ) -> str:
-        """Transform column name based on the destination platform."""
+        """Transform column name based on the destination platform with better handling of edge cases."""
+        if not column_name:
+            return ""
+
         if is_bigquery:
             # For BigQuery:
             # 1. Convert to lowercase
             # 2. Replace camelCase with snake_case
             # 3. Clean up any invalid characters
             import re
+
             # Step 1: Convert camelCase to snake_case with regex
             s1 = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", column_name)
@@ -373,6 +377,10 @@ class FivetranSource(StatefulIngestionSourceBase):
             # Step 3: Remove leading/trailing underscores and collapse multiple underscores
             transformed = re.sub(r"_+", "_", transformed).strip("_")
 
+            # Log the transformation for debugging
+            if transformed != column_name.lower():
+                logger.debug(f"Transformed column: {column_name} -> {transformed}")
+
             return transformed
         else:
             # For other platforms like Snowflake, typically uppercase
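As a sanity check, the three-step BigQuery normalization above can be exercised on its own. A minimal sketch — the sample column names are hypothetical, but the regexes are exactly the ones in this diff:

import re

def to_bigquery_name(column_name: str) -> str:
    # Step 1: split camelCase boundaries
    s1 = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", column_name)
    s2 = re.sub("([a-z0-9])([A-Z])", r"\1_\2", s1)
    # Step 2: lowercase and replace non-alphanumeric characters with "_"
    transformed = re.sub(r"[^a-zA-Z0-9_]", "_", s2.lower())
    # Step 3: collapse runs of "_" and trim the ends
    return re.sub(r"_+", "_", transformed).strip("_")

assert to_bigquery_name("accountId") == "account_id"
assert to_bigquery_name("HTTPResponseCode") == "http_response_code"
assert to_bigquery_name("__weird--Name__") == "weird_name"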
@@ -444,135 +452,19 @@ class FivetranSource(StatefulIngestionSourceBase):
         dest_urn: Optional[DatasetUrn],
         fine_grained_lineage: List[FineGrainedLineage],
     ) -> None:
-        """Create column-level lineage between source and destination tables with fuzzy matching."""
+        """Create column-level lineage between source and destination tables with better diagnostics."""
         if not source_urn or not dest_urn:
             return
 
-        # Log details for debugging
         logger.info(f"Creating column lineage from {source_urn} to {dest_urn}")
 
-        # Get destination platform
         dest_platform = str(dest_urn).split(",")[0].split(":")[-1]
         is_bigquery = dest_platform.lower() == "bigquery"
 
-        # If there are explicit column mappings, use them directly
-        if lineage.column_lineage:
-            # Extract and normalize all source and destination columns
-            source_columns = []
-            dest_columns = []
-            original_mappings = {}
-
-            for column_lineage in lineage.column_lineage:
-                if (
-                    not column_lineage.source_column
-                    or not column_lineage.destination_column
-                    or column_lineage.destination_column.startswith("_fivetran")
-                ):
-                    continue
-
-                source_col = column_lineage.source_column
-                dest_col = column_lineage.destination_column
-
-                # Transform destination column based on platform
-                transformed_dest = self._transform_column_name_for_platform(
-                    dest_col, is_bigquery
-                )
-
-                # Store original and normalized versions
-                source_norm = self._normalize_column_name(source_col)
-                dest_norm = self._normalize_column_name(transformed_dest)
-
-                source_columns.append((source_col, source_norm))
-                dest_columns.append((transformed_dest, dest_norm))
-
-                # Keep track of original mappings
-                original_mappings[(source_col, dest_col)] = (
-                    source_col,
-                    transformed_dest,
-                )
-
-            # Apply fuzzy matching to find best matches where needed
-            best_matches = {}
-
-            for source_col, source_norm in source_columns:
-                # First try exact match with normalized column name
-                exact_match = None
-                for dest_col, dest_norm in dest_columns:
-                    if source_norm == dest_norm:
-                        exact_match = dest_col
-                        break
-
-                if exact_match:
-                    best_matches[source_col] = exact_match
-                    continue
-
-                # If no exact match, try fuzzy matching
-                best_match = self._find_best_fuzzy_match(
-                    source_col, source_norm, dest_columns
-                )
-                if best_match:
-                    best_matches[source_col] = best_match
-                    logger.info(f"Fuzzy matched: {source_col} -> {best_match}")
-
-            # Create lineage for each matched column
-            for source_col, dest_col in best_matches.items():
-                try:
-                    # Create field URNs
-                    source_field_urn = builder.make_schema_field_urn(
-                        str(source_urn),
-                        source_col,
-                    )
-
-                    dest_field_urn = builder.make_schema_field_urn(
-                        str(dest_urn),
-                        dest_col,
-                    )
-
-                    # Add to fine-grained lineage
-                    fine_grained_lineage.append(
-                        FineGrainedLineage(
-                            upstreamType=FineGrainedLineageUpstreamType.FIELD_SET,
-                            upstreams=[source_field_urn],
-                            downstreamType=FineGrainedLineageDownstreamType.FIELD,
-                            downstreams=[dest_field_urn],
-                        )
-                    )
-
-                    logger.debug(
-                        f"Added field lineage: {source_field_urn} -> {dest_field_urn}"
-                    )
-                except Exception as e:
-                    logger.warning(
-                        f"Failed to create column lineage for {source_col} -> {dest_col}: {e}"
-                    )
-
-            # Log the total number of lineage entries created
-            if fine_grained_lineage:
-                logger.info(
-                    f"Created {len(fine_grained_lineage)} field lineage entries for {source_urn} -> {dest_urn}"
-                )
-            else:
-                logger.warning(
-                    f"No valid column lineage mappings found for {source_urn} -> {dest_urn}"
-                )
-        else:
-            # No column mappings provided - log a warning
+        if not lineage.column_lineage:
             logger.warning(
-                f"No column lineage data available for {lineage.source_table} -> {lineage.destination_table}. "
-                f"This may indicate an issue with schema retrieval from the Fivetran API."
+                f"No column lineage data available for {lineage.source_table} -> {lineage.destination_table}"
             )
-
-            # Add a special note in the report
-            self.report.warning(
-                title="Missing column lineage",
-                message=(
-                    "No column lineage information was available for some tables. "
-                    "This may indicate an issue with schema retrieval from the Fivetran API."
-                ),
-                context=f"{lineage.source_table} → {lineage.destination_table}",
-            )
-
-            # Add a placeholder entry to indicate table-level lineage only
             fine_grained_lineage.append(
                 FineGrainedLineage(
                     upstreamType=FineGrainedLineageUpstreamType.FIELD_SET,
@@ -581,6 +473,68 @@ class FivetranSource(StatefulIngestionSourceBase):
                     downstreams=[str(dest_urn)],
                 )
             )
+            return
+
+        logger.info(f"Processing {len(lineage.column_lineage)} column mappings")
+
+        valid_lineage = []
+        for column_lineage in lineage.column_lineage:
+            if (
+                not column_lineage.source_column
+                or not column_lineage.destination_column
+            ):
+                continue
+
+            if column_lineage.destination_column.startswith("_fivetran"):
+                continue
+
+            valid_lineage.append(column_lineage)
+
+        if not valid_lineage:
+            logger.warning("No valid column mappings found after filtering")
+            return
+
+        # Process valid column mappings
+        for column_lineage in valid_lineage:
+            try:
+                # Log what we're processing
+                logger.debug(
+                    f"Processing: {column_lineage.source_column} -> {column_lineage.destination_column}"
+                )
+
+                # Create field URNs
+                source_field_urn = builder.make_schema_field_urn(
+                    str(source_urn),
+                    column_lineage.source_column,
+                )
+
+                # For BigQuery, ensure proper case and format
+                dest_column = column_lineage.destination_column
+                if is_bigquery:
+                    dest_column = dest_column.lower()
+
+                dest_field_urn = builder.make_schema_field_urn(
+                    str(dest_urn),
+                    dest_column,
+                )
+
+                # Add to fine-grained lineage
+                fine_grained_lineage.append(
+                    FineGrainedLineage(
+                        upstreamType=FineGrainedLineageUpstreamType.FIELD_SET,
+                        upstreams=[source_field_urn],
+                        downstreamType=FineGrainedLineageDownstreamType.FIELD,
+                        downstreams=[dest_field_urn],
+                    )
+                )
+
+                logger.debug(
+                    f"Added field lineage: {source_field_urn} -> {dest_field_urn}"
+                )
+            except Exception as e:
+                logger.warning(
+                    f"Failed to create column lineage for {column_lineage.source_column} -> {column_lineage.destination_column}: {e}"
+                )
 
     def _create_field_lineage_mcp(
         self,
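For orientation, each accepted (source_column, destination_column) pair above becomes one FIELD_SET -> FIELD edge. A minimal standalone sketch of that construction — the import paths follow DataHub convention but should be treated as assumptions here, and the URNs and column names are made up:

import datahub.emitter.mce_builder as builder
from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
    FineGrainedLineage,
    FineGrainedLineageDownstreamType,
    FineGrainedLineageUpstreamType,
)

# Hypothetical table pair; make_dataset_urn defaults to the PROD environment.
source_urn = builder.make_dataset_urn("postgres", "public.accounts")
dest_urn = builder.make_dataset_urn("bigquery", "project.dataset.accounts")

# One edge per column mapping, anchored on schema-field URNs.
entry = FineGrainedLineage(
    upstreamType=FineGrainedLineageUpstreamType.FIELD_SET,
    upstreams=[builder.make_schema_field_urn(source_urn, "accountId")],
    downstreamType=FineGrainedLineageDownstreamType.FIELD,
    downstreams=[builder.make_schema_field_urn(dest_urn, "account_id")],
)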
@@ -166,6 +166,97 @@ class FivetranAPIClient:
         )
         return connector_data
 
+    def get_table_columns(
+        self, connector_id: str, schema_name: str, table_name: str
+    ) -> List[Dict]:
+        """
+        Get detailed column information for a specific table using the tables API endpoint.
+        This is more reliable for column information than the schemas endpoint.
+
+        Args:
+            connector_id: The Fivetran connector ID
+            schema_name: The schema name
+            table_name: The table name
+
+        Returns:
+            List of column dictionaries with name, type, and other properties
+        """
+        try:
+            # URL-encode the schema and table names to handle special characters
+            import urllib.parse
+
+            encoded_schema = urllib.parse.quote(schema_name)
+            encoded_table = urllib.parse.quote(table_name)
+
+            logger.info(f"Fetching column info directly for {schema_name}.{table_name}")
+
+            # Make the API request for detailed table information
+            response = self._make_request(
+                "GET",
+                f"/connectors/{connector_id}/schemas/{encoded_schema}/tables/{encoded_table}",
+            )
+
+            # Extract column information
+            table_data = response.get("data", {})
+            logger.debug(f"Table API response structure: {list(table_data.keys())}")
+
+            columns_data = table_data.get("columns", {})
+
+            # Convert column data to a list format if it's a dictionary
+            columns = []
+            if isinstance(columns_data, dict):
+                for col_name, col_info in columns_data.items():
+                    if isinstance(col_info, dict):
+                        col_entry = (
+                            col_info.copy()
+                        )  # Create a copy to avoid modifying the original
+                        col_entry["name"] = col_name
+
+                        # Ensure there's an enabled field
+                        if "enabled" not in col_entry:
+                            col_entry["enabled"] = True
+
+                        # Add the column if it's enabled
+                        if col_entry.get("enabled", True):
+                            columns.append(col_entry)
+                    else:
+                        # Simple case where we just have column names
+                        columns.append({"name": col_name, "enabled": True})
+            elif isinstance(columns_data, list):
+                columns = [col for col in columns_data if col.get("enabled", True)]
+
+            # Check if we have name_in_destination info
+            for col in columns:
+                if (
+                    isinstance(col, dict)
+                    and "name_in_destination" not in col
+                    and "name" in col
+                ):
+                    # Add name_in_destination based on destination platform
+                    destination = self.detect_destination_platform(
+                        table_data.get("destination_id", "")
+                    )
+                    if destination.lower() == "bigquery":
+                        # Convert to snake_case for BigQuery
+                        import re
+
+                        name = col["name"]
+                        s1 = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", name)
+                        s2 = re.sub("([a-z0-9])([A-Z])", r"\1_\2", s1)
+                        col["name_in_destination"] = s2.lower()
+                    elif destination.lower() in ["snowflake", "redshift"]:
+                        # Convert to uppercase for Snowflake/Redshift
+                        col["name_in_destination"] = col["name"].upper()
+
+            logger.info(
+                f"Retrieved {len(columns)} columns for {schema_name}.{table_name} via direct table API"
+            )
+            return columns
+
+        except Exception as e:
+            logger.warning(f"Failed to get columns for {schema_name}.{table_name}: {e}")
+            return []
+
     def _enrich_salesforce_connector(
         self, connector_id: str, connector_data: Dict
     ) -> None:
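The URL-encoding step above matters whenever schema or table names contain characters that are not path-safe. A quick standalone illustration of the stdlib behavior (names are hypothetical):

import urllib.parse

print(urllib.parse.quote("my schema"))  # -> my%20schema
print(urllib.parse.quote("a+b"))        # -> a%2Bb
# Note: quote() leaves "/" unescaped by default (safe="/"); a name that itself
# contained a slash would need quote(name, safe="") to stay in one path segment.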
@@ -254,12 +345,21 @@ class FivetranAPIClient:
         Normalize schema information into a consistent format regardless of API response structure.
         """
         schemas = []
 
+        # Log what we're working with
         logger.debug(f"Raw schema response type: {type(raw_schemas)}")
+        if isinstance(raw_schemas, dict):
+            logger.debug(f"Schema keys: {list(raw_schemas.keys())}")
+        elif isinstance(raw_schemas, list):
+            logger.debug(f"Schema list length: {len(raw_schemas)}")
+            if raw_schemas:
+                logger.debug(f"First schema item type: {type(raw_schemas[0])}")
+        else:
+            logger.debug(f"Unexpected schema format: {str(raw_schemas)[:100]}...")
+
         # Handle different response formats
         if isinstance(raw_schemas, dict):
             # Handle nested object format (older API versions)
-            logger.debug(f"Schema keys: {list(raw_schemas.keys())}")
             logger.info(f"Converting nested schema format for connector {connector_id}")
             for schema_name, schema_data in raw_schemas.items():
                 # Convert to the expected format
@@ -308,10 +408,6 @@ class FivetranAPIClient:
 
                 schemas.append(schema_obj)
         elif isinstance(raw_schemas, list):
-            logger.debug(f"Schema list length: {len(raw_schemas)}")
-            if raw_schemas:
-                logger.debug(f"First schema item type: {type(raw_schemas[0])}")
-
             # Already in the expected list format
             schemas = raw_schemas
 
@@ -324,7 +420,6 @@ class FivetranAPIClient:
                     if "columns" not in table:
                         table["columns"] = []
         else:
-            logger.debug(f"Unexpected schema format: {raw_schemas[:100]}...")
             logger.warning(
                 f"Unexpected schema format type for connector {connector_id}: {type(raw_schemas)}"
             )
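To make the "nested object format" branch concrete: older API versions key schemas by name instead of returning a list, and the branch above flattens that shape. A minimal sketch of the conversion — the real schema_obj construction lies outside this hunk, so the exact field layout here is an assumption:

raw_schemas = {
    "public": {"tables": {"users": {"enabled": True}}},
}

schemas = []
for schema_name, schema_data in raw_schemas.items():
    schema_obj = {"name": schema_name, **schema_data}  # assumed shape
    schemas.append(schema_obj)

# -> [{"name": "public", "tables": {"users": {"enabled": True}}}]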
@@ -689,58 +784,6 @@ class FivetranAPIClient:
                 f"After retrieval attempts, {tables_still_missing} tables still missing column information"
             )
 
-    def get_table_columns(
-        self, connector_id: str, schema_name: str, table_name: str
-    ) -> List[Dict]:
-        """
-        Get detailed column information for a specific table using the tables API endpoint.
-        This is more reliable for column information than the schemas endpoint.
-        """
-        try:
-            # URL-encode the schema and table names to handle special characters
-            import urllib.parse
-
-            encoded_schema = urllib.parse.quote(schema_name)
-            encoded_table = urllib.parse.quote(table_name)
-
-            # Make the API request for detailed table information
-            response = self._make_request(
-                "GET",
-                f"/connectors/{connector_id}/schemas/{encoded_schema}/tables/{encoded_table}",
-            )
-
-            # Extract column information
-            table_data = response.get("data", {})
-            columns_data = table_data.get("columns", {})
-
-            # Convert column data to a list format if it's a dictionary
-            columns = []
-            if isinstance(columns_data, dict):
-                for col_name, col_info in columns_data.items():
-                    if isinstance(col_info, dict):
-                        col_info = (
-                            col_info.copy()
-                        )  # Create a copy to avoid modifying the original
-                        col_info["name"] = col_name
-                        if col_info.get(
-                            "enabled", True
-                        ):  # Only include enabled columns
-                            columns.append(col_info)
-                    else:
-                        # Simple case where we just have column names
-                        columns.append({"name": col_name, "enabled": True})
-            elif isinstance(columns_data, list):
-                columns = [col for col in columns_data if col.get("enabled", True)]
-
-            logger.info(
-                f"Retrieved {len(columns)} columns for {schema_name}.{table_name} via direct table API"
-            )
-            return columns
-
-        except Exception as e:
-            logger.warning(f"Failed to get columns for {schema_name}.{table_name}: {e}")
-            return []
-
     def _process_column_data(self, columns: Any) -> List[Dict]:
         """
         Process column data from various API response formats into a consistent format.
@@ -1348,20 +1391,15 @@ class FivetranAPIClient:
         Uses a generic approach that works for any connector type and properly handles name_in_destination.
         """
         try:
-            # Get the connector schemas first
-            schemas = self.list_connector_schemas(connector_id)
+            # Get the connector details first
+            connector_details = self.get_connector(connector_id)
 
-            # Log more details about what we retrieved
-            table_count = sum(len(schema.get("tables", [])) for schema in schemas)
-            logger.info(
-                f"Got {len(schemas)} schemas with {table_count} tables for connector {connector_id}"
-            )
-
-            # Get destination information for naming
-            connector = self.get_connector(connector_id)
-            destination_id = connector.get("group", {}).get("id", "")
+            # Get destination information
+            destination_id = connector_details.get("group", {}).get("id", "")
             destination_platform = self.detect_destination_platform(destination_id)
 
+            # Get schema information
+            schemas = self.list_connector_schemas(connector_id)
             lineage_list = []
 
             # Handle cases where schemas might be a string or invalid format
@@ -1581,9 +1619,14 @@ class FivetranAPIClient:
         """
         if destination_platform.lower() == "bigquery":
             # BigQuery column names are case-sensitive and typically lowercase
-            return column_name.lower()
+            # Also convert camelCase to snake_case
+            import re
+
+            s1 = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", column_name)
+            s2 = re.sub("([a-z0-9])([A-Z])", r"\1_\2", s1)
+            return s2.lower()
         else:
-            # For most other systems (Snowflake, Redshift, etc.), column names are uppercased
+            # For other platforms like Snowflake, typically uppercase
             return column_name.upper()
 
     def _build_lineage_from_schemas(
@@ -1,5 +1,6 @@
 import logging
-from typing import Dict, List, Optional, Set, Tuple
+import re
+from typing import Any, Dict, List, Optional, Set, Tuple
 
 from datahub.configuration.common import AllowDenyPattern
 from datahub.ingestion.source.fivetran.config import (
@@ -360,15 +361,117 @@ class FivetranStandardAPI(FivetranAccessInterface):
 
         return destinations_seen, destination_details
 
+    def _process_schemas_for_lineage(
+        self,
+        connector: Connector,
+        schemas: List[Dict],
+        source_table_columns: Dict[str, Dict[str, str]],
+    ) -> List[TableLineage]:
+        """
+        Process schemas to extract lineage information for a connector.
+        This was extracted from _fill_connectors_lineage to reduce complexity.
+        """
+        lineage_list = []
+        destination_platform = self._get_destination_platform(connector)
+
+        for schema in schemas:
+            try:
+                schema_name = schema.get("name", "")
+                if not schema_name:
+                    logger.warning(
+                        f"Skipping schema with no name in connector {connector.connector_id}"
+                    )
+                    continue
+
+                # Use name_in_destination if available for schema
+                schema_name_in_destination = schema.get("name_in_destination")
+
+                tables = schema.get("tables", [])
+                if not isinstance(tables, list):
+                    logger.warning(
+                        f"Schema {schema_name} has non-list tables: {type(tables)}"
+                    )
+                    continue
+
+                # Process each table in the schema
+                for table in tables:
+                    try:
+                        if not isinstance(table, dict):
+                            continue
+
+                        table_name = table.get("name", "")
+                        enabled = table.get("enabled", False)
+
+                        if not enabled or not table_name:
+                            continue
+
+                        # Create source table identifier
+                        source_table = f"{schema_name}.{table_name}"
+
+                        # Get destination schema name - prefer name_in_destination if available
+                        dest_schema = None
+                        if schema_name_in_destination:
+                            dest_schema = schema_name_in_destination
+                        else:
+                            # Fall back to case transformation if name_in_destination not available
+                            dest_schema = self._get_destination_schema_name(
+                                schema_name, destination_platform
+                            )
+
+                        # Get destination table name - prefer name_in_destination if available
+                        dest_table = None
+                        table_name_in_destination = table.get("name_in_destination")
+                        if table_name_in_destination:
+                            dest_table = table_name_in_destination
+                            logger.debug(
+                                f"Using provided name_in_destination '{dest_table}' for table {table_name}"
+                            )
+                        else:
+                            # Fall back to case transformation if name_in_destination not available
+                            dest_table = self._get_destination_table_name(
+                                table_name, destination_platform
+                            )
+                            logger.debug(
+                                f"No name_in_destination found for table {table_name}, using transformed name '{dest_table}'"
+                            )
+
+                        # Combine to create full destination table name
+                        destination_table = f"{dest_schema}.{dest_table}"
+
+                        # Process columns for lineage
+                        column_lineage = self._extract_column_lineage(
+                            table=table,
+                            source_table=source_table,
+                            destination_platform=destination_platform,
+                            source_table_columns=source_table_columns,
+                        )
+
+                        # Add this table's lineage
+                        lineage_list.append(
+                            TableLineage(
+                                source_table=source_table,
+                                destination_table=destination_table,
+                                column_lineage=column_lineage,
+                            )
+                        )
+
+                        logger.debug(
+                            f"Added lineage: {source_table} -> {destination_table} with {len(column_lineage)} columns"
+                        )
+                    except Exception as table_e:
+                        logger.warning(
+                            f"Error processing table {table.get('name', 'unknown')}: {table_e}"
+                        )
+            except Exception as schema_e:
+                logger.warning(
+                    f"Error processing schema {schema.get('name', 'unknown')}: {schema_e}"
+                )
+
+        return lineage_list
+
     def _fill_connectors_lineage(self, connectors: List[Connector]) -> None:
         """
-        Fill in lineage information for connectors by calling the API.
-
-        This enhanced implementation:
-        1. Correctly handles API response data
-        2. Uses name_in_destination when available
-        3. Implements fallback column-level lineage by matching column names
-        4. Handles case transformation based on destination platform
+        Fill in lineage information for connectors by calling the API with enhanced diagnostics and robust error handling.
         """
         for connector in connectors:
             try:
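The naming rule in _process_schemas_for_lineage reduces to: trust Fivetran's name_in_destination when the API provides it, otherwise fall back to a case transformation. A minimal standalone sketch of that decision (sample payloads are hypothetical):

def resolve_destination_name(obj: dict, transform) -> str:
    # Prefer the name Fivetran itself reports for the destination side.
    provided = obj.get("name_in_destination")
    if provided:
        return provided
    return transform(obj.get("name", ""))

table = {"name": "accountHistory", "name_in_destination": "account_history"}
assert resolve_destination_name(table, str.upper) == "account_history"
assert resolve_destination_name({"name": "accountHistory"}, str.upper) == "ACCOUNTHISTORY"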
@@ -387,111 +490,26 @@ class FivetranStandardAPI(FivetranAccessInterface):
 
                 # Get schema information from API
                 schemas = self.api_client.list_connector_schemas(connector.connector_id)
-                logger.info(
-                    f"Got {len(schemas)} schemas for connector {connector.connector_id}"
-                )
 
-                lineage_list = []
+                # DIAGNOSTIC: Log detailed schema information
+                self._log_schema_diagnostics(schemas)
+
+                # If we have no columns at all, try direct fetching for each table
+                if self._should_fetch_missing_columns(schemas):
+                    logger.warning(
+                        "No columns found in initial schema fetch. Attempting direct table column fetching."
+                    )
+                    self._fetch_missing_columns(connector.connector_id, schemas)
+                    self._log_schema_diagnostics(schemas)  # Log updated stats
 
                 # First, collect all source columns with their types for each table
                 # This will help with generating column-level lineage
                 source_table_columns = self._collect_source_columns(schemas)
 
-                # Process each schema
-                for schema in schemas:
-                    try:
-                        schema_name = schema.get("name", "")
-                        if not schema_name:
-                            logger.warning(
-                                f"Skipping schema with no name in connector {connector.connector_id}"
-                            )
-                            continue
-
-                        # Use name_in_destination if available for schema
-                        schema_name_in_destination = schema.get("name_in_destination")
-
-                        tables = schema.get("tables", [])
-                        if not isinstance(tables, list):
-                            logger.warning(
-                                f"Schema {schema_name} has non-list tables: {type(tables)}"
-                            )
-                            continue
-
-                        # Process each table in the schema
-                        for table in tables:
-                            try:
-                                if not isinstance(table, dict):
-                                    continue
-
-                                table_name = table.get("name", "")
-                                enabled = table.get("enabled", False)
-
-                                if not enabled or not table_name:
-                                    continue
-
-                                # Create source table identifier
-                                source_table = f"{schema_name}.{table_name}"
-
-                                # Get destination schema name - prefer name_in_destination if available
-                                dest_schema = None
-                                if schema_name_in_destination:
-                                    dest_schema = schema_name_in_destination
-                                else:
-                                    # Fall back to case transformation if name_in_destination not available
-                                    dest_schema = self._get_destination_schema_name(
-                                        schema_name, destination_platform
-                                    )
-
-                                # Get destination table name - prefer name_in_destination if available
-                                dest_table = None
-                                table_name_in_destination = table.get(
-                                    "name_in_destination"
-                                )
-                                if table_name_in_destination:
-                                    dest_table = table_name_in_destination
-                                    logger.debug(
-                                        f"Using provided name_in_destination '{dest_table}' for table {table_name}"
-                                    )
-                                else:
-                                    # Fall back to case transformation if name_in_destination not available
-                                    dest_table = self._get_destination_table_name(
-                                        table_name, destination_platform
-                                    )
-                                    logger.debug(
-                                        f"No name_in_destination found for table {table_name}, using transformed name '{dest_table}'"
-                                    )
-
-                                # Combine to create full destination table name
-                                destination_table = f"{dest_schema}.{dest_table}"
-
-                                # Process columns for lineage
-                                column_lineage = self._extract_column_lineage(
-                                    table=table,
-                                    source_table=source_table,
-                                    destination_platform=destination_platform,
-                                    source_table_columns=source_table_columns,
-                                )
-
-                                # Add this table's lineage
-                                lineage_list.append(
-                                    TableLineage(
-                                        source_table=source_table,
-                                        destination_table=destination_table,
-                                        column_lineage=column_lineage,
-                                    )
-                                )
-
-                                logger.debug(
-                                    f"Added lineage: {source_table} -> {destination_table} with {len(column_lineage)} columns"
-                                )
-                            except Exception as table_e:
-                                logger.warning(
-                                    f"Error processing table {table.get('name', 'unknown')}: {table_e}"
-                                )
-                    except Exception as schema_e:
-                        logger.warning(
-                            f"Error processing schema {schema.get('name', 'unknown')}: {schema_e}"
-                        )
+                # Process schemas to extract lineage information
+                lineage_list = self._process_schemas_for_lineage(
+                    connector, schemas, source_table_columns
+                )
 
                 # Truncate if necessary
                 if len(lineage_list) > MAX_TABLE_LINEAGE_PER_CONNECTOR:
@@ -503,9 +521,8 @@ class FivetranStandardAPI(FivetranAccessInterface):
 
                 connector.lineage = lineage_list
 
-                logger.info(
-                    f"Successfully extracted {len(lineage_list)} table lineages for connector {connector.connector_id}"
-                )
+                # Final stats logging
+                self._log_lineage_stats(lineage_list, connector.connector_id)
 
             except Exception as e:
                 logger.error(
@@ -514,6 +531,92 @@ class FivetranStandardAPI(FivetranAccessInterface):
                 )
                 connector.lineage = []
 
+    def _log_schema_diagnostics(self, schemas: List[Dict]) -> None:
+        """Log diagnostic information about schemas and their columns."""
+        total_columns = 0
+        total_tables_with_columns = 0
+        total_tables = 0
+
+        for schema in schemas:
+            schema_name = schema.get("name", "")
+            for table in schema.get("tables", []):
+                total_tables += 1
+                table_name = table.get("name", "")
+                columns = table.get("columns", [])
+
+                if columns:
+                    total_tables_with_columns += 1
+                    total_columns += len(columns)
+                    logger.info(
+                        f"Table {schema_name}.{table_name} has {len(columns)} columns"
+                    )
+
+                    # DIAGNOSTIC: Print a sample of column names
+                    column_names = [col.get("name", "unknown") for col in columns[:5]]
+                    logger.info(f"Sample columns: {column_names}")
+                else:
+                    logger.warning(f"Table {schema_name}.{table_name} has NO columns")
+
+        logger.info(
+            f"SCHEMA STATS: {total_tables_with_columns}/{total_tables} tables have columns, total {total_columns} columns"
+        )
+
+    def _should_fetch_missing_columns(self, schemas: List[Dict]) -> bool:
+        """Determine if we need to fetch missing columns based on schema content."""
+        total_columns = 0
+        total_tables = 0
+
+        for schema in schemas:
+            for table in schema.get("tables", []):
+                total_tables += 1
+                columns = table.get("columns", [])
+                if columns:
+                    total_columns += len(columns)
+
+        return total_columns == 0 and total_tables > 0
+
+    def _log_lineage_stats(
+        self, lineage_list: List[TableLineage], connector_id: str
+    ) -> None:
+        """Log statistics about lineage processing."""
+        tables_with_columns = len(
+            [
+                table_lineage
+                for table_lineage in lineage_list
+                if table_lineage.column_lineage
+            ]
+        )
+        total_column_mappings = sum(
+            len(table_lineage.column_lineage) for table_lineage in lineage_list
+        )
+        logger.info(
+            f"Lineage stats for connector {connector_id}: "
+            f"{len(lineage_list)} table lineages, {tables_with_columns} tables with column lineage, "
+            f"{total_column_mappings} total column mappings"
+        )
+
+    def _fetch_missing_columns(self, connector_id: str, schemas: List[Dict]) -> None:
+        """Attempt to fetch missing column information directly for each table."""
+        for schema in schemas:
+            schema_name = schema.get("name", "")
+            for table in schema.get("tables", []):
+                table_name = table.get("name", "")
+                if not table.get("columns") and table.get("enabled", True):
+                    try:
+                        # Try direct column fetching
+                        columns = self.api_client.get_table_columns(
+                            connector_id, schema_name, table_name
+                        )
+                        if columns:
+                            table["columns"] = columns
+                            logger.info(
+                                f"Directly fetched {len(columns)} columns for {schema_name}.{table_name}"
+                            )
+                    except Exception as e:
+                        logger.warning(
+                            f"Failed to directly fetch columns for {schema_name}.{table_name}: {e}"
+                        )
+
     def _collect_source_columns(self, schemas: List[Dict]) -> Dict[str, Dict[str, str]]:
         """
         Collect all source columns with their types for each table.
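The trigger for direct fetching is easy to verify in isolation: _should_fetch_missing_columns fires only when the schema listing contains tables but zero columns overall. A standalone sketch with hypothetical payloads:

def should_fetch_missing_columns(schemas: list) -> bool:
    total_tables = sum(len(s.get("tables", [])) for s in schemas)
    total_columns = sum(
        len(t.get("columns", [])) for s in schemas for t in s.get("tables", [])
    )
    return total_columns == 0 and total_tables > 0

assert should_fetch_missing_columns([{"tables": [{"name": "a"}]}]) is True
assert should_fetch_missing_columns(
    [{"tables": [{"name": "a", "columns": [{"name": "id"}]}]}]
) is False
assert should_fetch_missing_columns([{"tables": []}]) is False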
@@ -559,6 +662,88 @@ class FivetranStandardAPI(FivetranAccessInterface):
 
         return source_columns
 
+    def _process_columns_from_list(
+        self, columns: List[Any], is_bigquery: bool, source_table: str
+    ) -> List[ColumnLineage]:
+        """
+        Process columns from list format and create column lineage.
+        Extracted from _extract_column_lineage to reduce complexity.
+        """
+        column_lineage = []
+
+        for column in columns:
+            col_name = None
+            if isinstance(column, dict):
+                col_name = column.get("name")
+            elif isinstance(column, str):
+                col_name = column
+
+            if not col_name:
+                continue
+
+            # Skip Fivetran system columns
+            if col_name.startswith("_fivetran"):
+                continue
+
+            # Get destination column name - prefer name_in_destination if available
+            dest_col_name = None
+            if isinstance(column, dict) and "name_in_destination" in column:
+                dest_col_name = column.get("name_in_destination")
+                logger.debug(
+                    f"Using name_in_destination: {col_name} -> {dest_col_name}"
+                )
+
+            # If no name_in_destination, transform based on platform
+            if not dest_col_name:
+                dest_col_name = self._transform_column_name_for_platform(
+                    col_name, is_bigquery
+                )
+                logger.debug(f"Transformed name: {col_name} -> {dest_col_name}")
+
+            # Add to lineage
+            column_lineage.append(
+                ColumnLineage(
+                    source_column=col_name,
+                    destination_column=dest_col_name,
+                )
+            )
+
+        logger.info(
+            f"Created {len(column_lineage)} column lineage entries for {source_table}"
+        )
+        return column_lineage
+
+    def _process_columns_from_source(
+        self, source_columns: Dict[str, str], is_bigquery: bool, source_table: str
+    ) -> List[ColumnLineage]:
+        """
+        Create column lineage from source column information.
+        Extracted from _extract_column_lineage to reduce complexity.
+        """
+        column_lineage = []
+
+        logger.info(f"Using {len(source_columns)} columns from source table schema")
+
+        # Create lineage for each source column
+        for col_name in source_columns:
+            if col_name.startswith("_fivetran"):
+                continue
+
+            # Transform destination column name based on platform
+            dest_col_name = self._transform_column_name_for_platform(
+                col_name, is_bigquery
+            )
+
+            column_lineage.append(
+                ColumnLineage(
+                    source_column=col_name,
+                    destination_column=dest_col_name,
+                )
+            )
+
+        logger.info(f"Created {len(column_lineage)} fallback column lineage entries")
+        return column_lineage
+
     def _extract_column_lineage(
         self,
         table: Dict,
@@ -567,7 +752,7 @@ class FivetranStandardAPI(FivetranAccessInterface):
         source_table_columns: Dict[str, Dict[str, str]],
     ) -> List[ColumnLineage]:
         """
-        Extract column-level lineage for a table, with fallback to name matching if needed.
+        Extract column-level lineage for a table with improved debugging and fallback.
 
         Args:
             table: Table data from API
@@ -578,83 +763,110 @@ class FivetranStandardAPI(FivetranAccessInterface):
         Returns:
             List of ColumnLineage objects
         """
-        column_lineage = []
+        # Get detailed logging for debugging
+        logger.info(
+            f"Extracting column lineage for {source_table} to {destination_platform}"
+        )
+        logger.debug(f"Table data keys: {list(table.keys())}")
+
         # Get columns from the API response
         columns = table.get("columns", [])
+
+        # DIAGNOSTIC: Print details about what we got
+        self._log_column_diagnostics(columns)
+
+        # CRITICAL FIX: Convert dict format to list if needed
+        if isinstance(columns, dict):
+            columns = self._convert_column_dict_to_list(columns)
+
+        is_bigquery = destination_platform.lower() == "bigquery"
+        column_lineage = []
+
+        # If we have columns, create lineage mappings
         if isinstance(columns, list) and columns:
-            # API provided column information, use it
-            for column in columns:
-                if not isinstance(column, dict):
-                    continue
-
-                col_name = column.get("name", "")
-                if not col_name:
-                    continue
-
-                # Skip Fivetran system columns
-                if col_name.startswith("_fivetran"):
-                    continue
-
-                # Get destination column name - prefer name_in_destination if available
-                dest_col_name = None
-                column_name_in_destination = column.get("name_in_destination")
-
-                if column_name_in_destination:
-                    dest_col_name = column_name_in_destination
-                    logger.debug(
-                        f"Using provided name_in_destination '{dest_col_name}' for column {col_name}"
-                    )
-                else:
-                    # Fall back to case transformation if name_in_destination not available
-                    dest_col_name = self._get_destination_column_name(
-                        col_name, destination_platform
-                    )
-                    logger.debug(
-                        f"No name_in_destination found for column {col_name}, using transformed name '{dest_col_name}'"
-                    )
-
-                column_lineage.append(
-                    ColumnLineage(
-                        source_column=col_name,
-                        destination_column=dest_col_name,
-                    )
-                )
+            column_lineage = self._process_columns_from_list(
+                columns, is_bigquery, source_table
+            )
         else:
-            # No column information from API, use source table columns if available
+            # No column information from API, try other methods
+            logger.warning(f"No usable column information for {source_table} from API")
+
+            # Use source_table_columns if available (fallback method)
             source_columns = source_table_columns.get(source_table, {})
 
             if source_columns:
-                logger.info(
-                    f"No column information from API for {source_table}. "
-                    f"Using source table schema with {len(source_columns)} columns"
+                column_lineage = self._process_columns_from_source(
+                    source_columns, is_bigquery, source_table
                 )
+            else:
+                # Try to use column mapping from config if available
+                column_lineage = self._try_get_column_mapping_from_config(source_table)
 
-                # For each source column, create a lineage entry assuming it has the same name in destination
-                # with appropriate case transformation
-                for col_name in source_columns:
-                    # Skip Fivetran system columns
-                    if col_name.startswith("_fivetran"):
-                        continue
-
-                    # Transform column name based on destination platform
-                    dest_col_name = self._get_destination_column_name(
-                        col_name, destination_platform
-                    )
-
-                    column_lineage.append(
-                        ColumnLineage(
-                            source_column=col_name,
-                            destination_column=dest_col_name,
-                        )
-                    )
-
-        # Log whether we found column lineage
+        # Final check and log
         if column_lineage:
-            logger.info(f"Found {len(column_lineage)} columns for {source_table}")
+            logger.info(
+                f"Returning {len(column_lineage)} column lineage mappings for {source_table}"
+            )
         else:
-            logger.warning(f"No column lineage found for {source_table}")
+            logger.warning(f"No column lineage mappings created for {source_table}")
+
+        return column_lineage
+
+    def _log_column_diagnostics(self, columns: Any) -> None:
+        """Log diagnostic information about column data."""
+        if isinstance(columns, list):
+            logger.info(f"Found {len(columns)} columns in list format")
+            if columns:
+                sample = columns[:2]
+                logger.debug(f"Sample columns: {sample}")
+        elif isinstance(columns, dict):
+            logger.info(f"Found {len(columns)} columns in dict format")
+            if columns:
+                sample_keys = list(columns.keys())[:2]
+                logger.debug(f"Sample column keys: {sample_keys}")
+        else:
+            logger.warning(f"Columns in unexpected format: {type(columns)}")
+
+    def _convert_column_dict_to_list(self, columns_dict: Dict) -> List[Dict]:
+        """Convert column dictionary to list format for consistent processing."""
+        columns_list = []
+        for col_name, col_data in columns_dict.items():
+            if isinstance(col_data, dict):
+                col_data = col_data.copy()
+                col_data["name"] = col_name
+                columns_list.append(col_data)
+            else:
+                columns_list.append({"name": col_name})
+
+        logger.info(f"Converted dict format to list with {len(columns_list)} columns")
+        return columns_list
+
+    def _try_get_column_mapping_from_config(
+        self, source_table: str
+    ) -> List[ColumnLineage]:
+        """Try to get column mapping from configuration if available."""
+        column_lineage = []
+
+        # Check if there's a config attribute with column mapping info
+        if hasattr(self, "config") and self.config:
+            # Check for any attribute that might have column mappings
+            for attr_name in dir(self.config):
+                if "column_mapping" in attr_name.lower() and hasattr(
+                    self.config, attr_name
+                ):
+                    mapping_attr = getattr(self.config, attr_name)
+                    if isinstance(mapping_attr, dict) and source_table in mapping_attr:
+                        column_mapping = mapping_attr.get(source_table, {})
+                        logger.info(
+                            f"Found config column mapping for {source_table}: {len(column_mapping)} columns"
+                        )
+
+                        for source_col, dest_col in column_mapping.items():
+                            column_lineage.append(
+                                ColumnLineage(
+                                    source_column=source_col,
+                                    destination_column=dest_col,
+                                )
+                            )
 
         return column_lineage
 
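The dict-to-list conversion is the "CRITICAL FIX" referenced above: some API responses key columns by name rather than returning a list. A standalone sketch of the same reshaping (the sample payload is hypothetical):

columns_dict = {
    "id": {"type": "int", "enabled": True},
    "accountId": "string",  # bare value instead of a dict
}

columns_list = []
for col_name, col_data in columns_dict.items():
    if isinstance(col_data, dict):
        entry = col_data.copy()  # avoid mutating the API response
        entry["name"] = col_name
        columns_list.append(entry)
    else:
        columns_list.append({"name": col_name})

# -> [{"type": "int", "enabled": True, "name": "id"}, {"name": "accountId"}]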
@@ -741,16 +953,36 @@ class FivetranStandardAPI(FivetranAccessInterface):
             # For most other systems (Snowflake, Redshift, etc.), table names are uppercased
             return table_name.upper()
 
-    def _get_destination_column_name(
-        self, column_name: str, destination_platform: str
+    def _transform_column_name_for_platform(
+        self, column_name: str, is_bigquery: bool
     ) -> str:
         """
-        Get the destination column name based on the platform.
-        This is a helper method that applies appropriate case transformations.
+        Transform column name based on the destination platform with better handling of edge cases.
         """
-        if destination_platform.lower() == "bigquery":
-            # BigQuery column names are case-sensitive and typically lowercase
-            return column_name.lower()
+        if not column_name:
+            return ""
+
+        if is_bigquery:
+            # For BigQuery:
+            # 1. Convert to lowercase
+            # 2. Replace camelCase with snake_case
+            # 3. Clean up any invalid characters
+
+            # Step 1: Convert camelCase to snake_case with regex
+            s1 = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", column_name)
+            s2 = re.sub("([a-z0-9])([A-Z])", r"\1_\2", s1)
+
+            # Step 2: lowercase and replace non-alphanumeric with underscore
+            transformed = re.sub(r"[^a-zA-Z0-9_]", "_", s2.lower())
+
+            # Step 3: Remove leading/trailing underscores and collapse multiple underscores
+            transformed = re.sub(r"_+", "_", transformed).strip("_")
+
+            # Log the transformation for debugging
+            if transformed != column_name.lower():
+                logger.debug(f"Transformed column: {column_name} -> {transformed}")
+
+            return transformed
         else:
-            # For most other systems (Snowflake, Redshift, etc.), column names are uppercased
+            # For other platforms like Snowflake, typically uppercase
             return column_name.upper()