cll update

Jonny Dixon 2025-03-19 11:31:03 +01:00
parent fed49ac79e
commit 5642906be3


@@ -254,10 +254,12 @@ class FivetranAPIClient:
         Normalize schema information into a consistent format regardless of API response structure.
         """
         schemas = []
+        logger.debug(f"Raw schema response type: {type(raw_schemas)}")
 
         # Handle different response formats
         if isinstance(raw_schemas, dict):
             # Handle nested object format (older API versions)
+            logger.debug(f"Schema keys: {list(raw_schemas.keys())}")
             logger.info(f"Converting nested schema format for connector {connector_id}")
             for schema_name, schema_data in raw_schemas.items():
                 # Convert to the expected format
@@ -306,6 +308,10 @@
             schemas.append(schema_obj)
         elif isinstance(raw_schemas, list):
+            logger.debug(f"Schema list length: {len(raw_schemas)}")
+            if raw_schemas:
+                logger.debug(f"First schema item type: {type(raw_schemas[0])}")
+
             # Already in the expected list format
             schemas = raw_schemas
@@ -318,6 +324,7 @@
                     if "columns" not in table:
                         table["columns"] = []
         else:
+            logger.debug(f"Unexpected schema format: {raw_schemas[:100]}...")
             logger.warning(
                 f"Unexpected schema format type for connector {connector_id}: {type(raw_schemas)}"
             )
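
For context, a minimal sketch (not part of the commit) of the two raw shapes this normalization path distinguishes. The nested layout is an assumption inferred from the conversion loop above; the flat layout matches the normalized output the method produces.

    # Hypothetical example inputs, for illustration only.
    nested_format = {  # dict keyed by schema name (older API versions)
        "public": {"enabled": True, "tables": {"users": {"enabled": True}}},
    }
    flat_format = [  # list of schema objects (the target format)
        {"name": "public", "tables": [{"name": "users", "enabled": True, "columns": []}]},
    ]
    # Either shape should normalize to the flat list form, with "columns"
    # defaulting to [] for tables that lack column details.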
@@ -562,61 +569,177 @@ class FivetranAPIClient:
     ) -> None:
         """
         Ensure we have column information for tables by fetching additional details if needed.
+
+        Uses multiple strategies to get complete column information:
+        1. Check existing schema data first
+        2. Try dedicated table API endpoint for tables missing columns
+        3. Attempt to infer columns from metadata if available
         """
         tables_missing_columns = []
+        tables_with_columns = 0
+        total_tables = 0
 
         # Check if we have tables without column information
         for schema in schemas:
+            schema_name = schema.get("name", "")
             for table in schema.get("tables", []):
-                if not table.get("columns") and table.get("enabled", True):
+                total_tables += 1
+                table_name = table.get("name", "")
+
+                # Skip tables that aren't enabled
+                if not table.get("enabled", True):
+                    continue
+
+                # Check if table has column information
+                columns = table.get("columns", [])
+                if not columns:
                     # Add to list of tables needing column info
                     tables_missing_columns.append(
-                        {"schema": schema["name"], "table": table["name"]}
+                        {
+                            "schema": schema_name,
+                            "table": table_name,
+                            "table_obj": table,  # Keep reference to the table object for updates
+                        }
                     )
+                else:
+                    tables_with_columns += 1
+
+        # Log statistics about column availability
+        logger.info(
+            f"Column information stats for connector {connector_id}: "
+            f"{tables_with_columns} tables have columns, "
+            f"{len(tables_missing_columns)} tables missing columns, "
+            f"out of {total_tables} total tables"
+        )
 
         if not tables_missing_columns:
             return
 
+        # Limit the number of API calls to avoid rate limiting
+        tables_to_process = tables_missing_columns[:10]
         logger.info(
-            f"Found {len(tables_missing_columns)} tables without column information for connector {connector_id}"
+            f"Fetching column information for {len(tables_to_process)} tables out of {len(tables_missing_columns)} missing column info"
         )
 
         # Try to fetch column information for these tables
-        for table_info in tables_missing_columns[
-            :10
-        ]:  # Limit to avoid too many API calls
+        for table_info in tables_to_process:
             schema_name = table_info["schema"]
             table_name = table_info["table"]
+            table_obj = table_info["table_obj"]
 
-            try:
-                # Try to get column information from table metadata endpoint
-                table_path = f"/connectors/{connector_id}/schemas/{schema_name}/tables/{table_name}"
-
-                try:
-                    table_response = self._make_request("GET", table_path)
-                    table_data = table_response.get("data", {})
-
-                    if "columns" in table_data:
-                        columns = table_data.get("columns", [])
-
-                        # Update our schema information with these columns
-                        for schema in schemas:
-                            if schema["name"] == schema_name:
-                                for table in schema["tables"]:
-                                    if table["name"] == table_name:
-                                        table["columns"] = self._process_column_data(
-                                            columns
-                                        )
-                                        logger.info(
-                                            f"Added {len(table['columns'])} columns to {schema_name}.{table_name}"
-                                        )
-                except Exception as e:
-                    logger.debug(
-                        f"Could not get details for table {schema_name}.{table_name}: {e}"
-                    )
-            except Exception as e:
-                logger.warning(
-                    f"Error ensuring column information for {schema_name}.{table_name}: {e}"
-                )
+            # Get columns using the dedicated table API endpoint
+            columns = self.get_table_columns(connector_id, schema_name, table_name)
+
+            if columns:
+                # Update the table object directly with these columns
+                table_obj["columns"] = columns
+                logger.info(
+                    f"Updated {schema_name}.{table_name} with {len(columns)} columns from table API"
+                )
+            else:
+                # If API doesn't return columns, try to infer from metadata
+                logger.warning(
+                    f"Could not get columns for {schema_name}.{table_name} from API, attempting fallback methods"
+                )
+
+                # Try getting metadata that might have column information
+                try:
+                    metadata_path = f"/connectors/{connector_id}/metadata"
+                    metadata_response = self._make_request("GET", metadata_path)
+                    metadata = metadata_response.get("data", {})
+
+                    # Look for column information in metadata
+                    source_objects = metadata.get("source_objects", [])
+                    for obj in source_objects:
+                        if (
+                            isinstance(obj, dict)
+                            and obj.get("name") == table_name
+                            and obj.get("schema") == schema_name
+                        ):
+                            metadata_columns = obj.get("columns", [])
+                            if metadata_columns:
+                                # Convert to our expected format
+                                formatted_columns = []
+                                for col in metadata_columns:
+                                    if isinstance(col, dict) and "name" in col:
+                                        formatted_columns.append(
+                                            {
+                                                "name": col["name"],
+                                                "type": col.get("type", ""),
+                                                "enabled": True,
+                                            }
+                                        )
+                                if formatted_columns:
+                                    table_obj["columns"] = formatted_columns
+                                    logger.info(
+                                        f"Inferred {len(formatted_columns)} columns for {schema_name}.{table_name} from metadata"
+                                    )
+                            break
+                except Exception as e:
+                    logger.warning(f"Failed to get metadata for {connector_id}: {e}")
+
+        # Count how many tables we still don't have column info for
+        tables_still_missing = 0
+        for schema in schemas:
+            for table in schema.get("tables", []):
+                if table.get("enabled", True) and not table.get("columns"):
+                    tables_still_missing += 1
+
+        logger.info(
+            f"After retrieval attempts, {tables_still_missing} tables still missing column information"
+        )
+
+    def get_table_columns(
+        self, connector_id: str, schema_name: str, table_name: str
+    ) -> List[Dict]:
+        """
+        Get detailed column information for a specific table using the tables API endpoint.
+        This is more reliable for column information than the schemas endpoint.
+        """
+        try:
+            # URL-encode the schema and table names to handle special characters
+            import urllib.parse
+
+            encoded_schema = urllib.parse.quote(schema_name)
+            encoded_table = urllib.parse.quote(table_name)
+
+            # Make the API request for detailed table information
+            response = self._make_request(
+                "GET",
+                f"/connectors/{connector_id}/schemas/{encoded_schema}/tables/{encoded_table}",
+            )
+
+            # Extract column information
+            table_data = response.get("data", {})
+            columns_data = table_data.get("columns", {})
+
+            # Convert column data to a list format if it's a dictionary
+            columns = []
+            if isinstance(columns_data, dict):
+                for col_name, col_info in columns_data.items():
+                    if isinstance(col_info, dict):
+                        col_info = (
+                            col_info.copy()
+                        )  # Create a copy to avoid modifying the original
+                        col_info["name"] = col_name
+                        if col_info.get(
+                            "enabled", True
+                        ):  # Only include enabled columns
+                            columns.append(col_info)
+                    else:
+                        # Simple case where we just have column names
+                        columns.append({"name": col_name, "enabled": True})
+            elif isinstance(columns_data, list):
+                columns = [col for col in columns_data if col.get("enabled", True)]
+
+            logger.info(
+                f"Retrieved {len(columns)} columns for {schema_name}.{table_name} via direct table API"
+            )
+            return columns
+        except Exception as e:
+            logger.warning(f"Failed to get columns for {schema_name}.{table_name}: {e}")
+            return []
 
     def _process_column_data(self, columns: Any) -> List[Dict]:
         """
@@ -1225,15 +1348,20 @@ class FivetranAPIClient:
         Uses a generic approach that works for any connector type and properly handles name_in_destination.
         """
         try:
-            # Get the connector details first
-            connector_details = self.get_connector(connector_id)
-
-            # Get destination information
-            destination_id = connector_details.get("group", {}).get("id", "")
+            # Get the connector schemas first
+            schemas = self.list_connector_schemas(connector_id)
+
+            # Log more details about what we retrieved
+            table_count = sum(len(schema.get("tables", [])) for schema in schemas)
+            logger.info(
+                f"Got {len(schemas)} schemas with {table_count} tables for connector {connector_id}"
+            )
+
+            # Get destination information for naming
+            connector = self.get_connector(connector_id)
+            destination_id = connector.get("group", {}).get("id", "")
             destination_platform = self.detect_destination_platform(destination_id)
 
-            # Get schema information
-            schemas = self.list_connector_schemas(connector_id)
-
             lineage_list = []
 
             # Handle cases where schemas might be a string or invalid format
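
Since the docstring calls out name_in_destination handling, here is a minimal sketch of the naming resolution this implies; the fall-back-to-source-name behavior is an assumption, not shown in this hunk:

    # Hypothetical helper, assuming schema and table dicts may carry
    # "name_in_destination" alongside "name", as Fivetran's schema API returns.
    def destination_fqn(schema: dict, table: dict) -> str:
        schema_part = schema.get("name_in_destination") or schema.get("name", "")
        table_part = table.get("name_in_destination") or table.get("name", "")
        return f"{schema_part}.{table_part}"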