cll update

Jonny Dixon 2025-03-19 11:31:03 +01:00
parent fed49ac79e
commit 5642906be3


@@ -254,10 +254,12 @@ class FivetranAPIClient:
         Normalize schema information into a consistent format regardless of API response structure.
         """
         schemas = []
+        logger.debug(f"Raw schema response type: {type(raw_schemas)}")
 
         # Handle different response formats
         if isinstance(raw_schemas, dict):
             # Handle nested object format (older API versions)
+            logger.debug(f"Schema keys: {list(raw_schemas.keys())}")
             logger.info(f"Converting nested schema format for connector {connector_id}")
             for schema_name, schema_data in raw_schemas.items():
                 # Convert to the expected format
@@ -306,6 +308,10 @@
             schemas.append(schema_obj)
         elif isinstance(raw_schemas, list):
+            logger.debug(f"Schema list length: {len(raw_schemas)}")
+            if raw_schemas:
+                logger.debug(f"First schema item type: {type(raw_schemas[0])}")
+
             # Already in the expected list format
             schemas = raw_schemas
@@ -318,6 +324,7 @@
                     if "columns" not in table:
                         table["columns"] = []
         else:
+            logger.debug(f"Unexpected schema format: {raw_schemas[:100]}...")
             logger.warning(
                 f"Unexpected schema format type for connector {connector_id}: {type(raw_schemas)}"
             )
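
For context, a minimal sketch (not part of the commit) of the two raw shapes this normalization path distinguishes. The nested layout is an assumption inferred from the conversion loop above; the flat layout matches the normalized output the method produces.

    # Hypothetical example inputs, for illustration only.
    nested_format = {  # dict keyed by schema name (older API versions)
        "public": {"enabled": True, "tables": {"users": {"enabled": True}}},
    }
    flat_format = [  # list of schema objects (the target format)
        {"name": "public", "tables": [{"name": "users", "enabled": True, "columns": []}]},
    ]
    # Either shape should normalize to the flat list form, with "columns"
    # defaulting to [] for tables that lack column details.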
@@ -562,61 +569,177 @@ class FivetranAPIClient:
     ) -> None:
         """
         Ensure we have column information for tables by fetching additional details if needed.
+
+        Uses multiple strategies to get complete column information:
+        1. Check existing schema data first
+        2. Try dedicated table API endpoint for tables missing columns
+        3. Attempt to infer columns from metadata if available
         """
         tables_missing_columns = []
+        tables_with_columns = 0
+        total_tables = 0
 
         # Check if we have tables without column information
         for schema in schemas:
+            schema_name = schema.get("name", "")
             for table in schema.get("tables", []):
-                if not table.get("columns") and table.get("enabled", True):
+                total_tables += 1
+                table_name = table.get("name", "")
+
+                # Skip tables that aren't enabled
+                if not table.get("enabled", True):
+                    continue
+
+                # Check if table has column information
+                columns = table.get("columns", [])
+                if not columns:
                     # Add to list of tables needing column info
                     tables_missing_columns.append(
-                        {"schema": schema["name"], "table": table["name"]}
+                        {
+                            "schema": schema_name,
+                            "table": table_name,
+                            "table_obj": table,  # Keep reference to the table object for updates
+                        }
                     )
+                else:
+                    tables_with_columns += 1
+
+        # Log statistics about column availability
+        logger.info(
+            f"Column information stats for connector {connector_id}: "
+            f"{tables_with_columns} tables have columns, "
+            f"{len(tables_missing_columns)} tables missing columns, "
+            f"out of {total_tables} total tables"
+        )
 
         if not tables_missing_columns:
             return
 
+        # Limit the number of API calls to avoid rate limiting
+        tables_to_process = tables_missing_columns[:10]
         logger.info(
-            f"Found {len(tables_missing_columns)} tables without column information for connector {connector_id}"
+            f"Fetching column information for {len(tables_to_process)} tables out of {len(tables_missing_columns)} missing column info"
         )
 
         # Try to fetch column information for these tables
-        for table_info in tables_missing_columns[
-            :10
-        ]:  # Limit to avoid too many API calls
+        for table_info in tables_to_process:
             schema_name = table_info["schema"]
             table_name = table_info["table"]
+            table_obj = table_info["table_obj"]
 
-            try:
-                # Try to get column information from table metadata endpoint
-                table_path = f"/connectors/{connector_id}/schemas/{schema_name}/tables/{table_name}"
-
-                try:
-                    table_response = self._make_request("GET", table_path)
-                    table_data = table_response.get("data", {})
-
-                    if "columns" in table_data:
-                        columns = table_data.get("columns", [])
-
-                        # Update our schema information with these columns
-                        for schema in schemas:
-                            if schema["name"] == schema_name:
-                                for table in schema["tables"]:
-                                    if table["name"] == table_name:
-                                        table["columns"] = self._process_column_data(
-                                            columns
-                                        )
-                                        logger.info(
-                                            f"Added {len(table['columns'])} columns to {schema_name}.{table_name}"
-                                        )
-                except Exception as e:
-                    logger.debug(
-                        f"Could not get details for table {schema_name}.{table_name}: {e}"
-                    )
-            except Exception as e:
-                logger.warning(
-                    f"Error ensuring column information for {schema_name}.{table_name}: {e}"
-                )
+            # Get columns using the dedicated table API endpoint
+            columns = self.get_table_columns(connector_id, schema_name, table_name)
+
+            if columns:
+                # Update the table object directly with these columns
+                table_obj["columns"] = columns
+                logger.info(
+                    f"Updated {schema_name}.{table_name} with {len(columns)} columns from table API"
+                )
+            else:
+                # If API doesn't return columns, try to infer from metadata
+                logger.warning(
+                    f"Could not get columns for {schema_name}.{table_name} from API, attempting fallback methods"
+                )
+
+                # Try getting metadata that might have column information
+                try:
+                    metadata_path = f"/connectors/{connector_id}/metadata"
+                    metadata_response = self._make_request("GET", metadata_path)
+                    metadata = metadata_response.get("data", {})
+
+                    # Look for column information in metadata
+                    source_objects = metadata.get("source_objects", [])
+                    for obj in source_objects:
+                        if (
+                            isinstance(obj, dict)
+                            and obj.get("name") == table_name
+                            and obj.get("schema") == schema_name
+                        ):
+                            metadata_columns = obj.get("columns", [])
+                            if metadata_columns:
+                                # Convert to our expected format
+                                formatted_columns = []
+                                for col in metadata_columns:
+                                    if isinstance(col, dict) and "name" in col:
+                                        formatted_columns.append(
+                                            {
+                                                "name": col["name"],
+                                                "type": col.get("type", ""),
+                                                "enabled": True,
+                                            }
+                                        )
+                                if formatted_columns:
+                                    table_obj["columns"] = formatted_columns
+                                    logger.info(
+                                        f"Inferred {len(formatted_columns)} columns for {schema_name}.{table_name} from metadata"
+                                    )
+                            break
+                except Exception as e:
+                    logger.warning(f"Failed to get metadata for {connector_id}: {e}")
+
+        # Count how many tables we still don't have column info for
+        tables_still_missing = 0
+        for schema in schemas:
+            for table in schema.get("tables", []):
+                if table.get("enabled", True) and not table.get("columns"):
+                    tables_still_missing += 1
+
+        logger.info(
+            f"After retrieval attempts, {tables_still_missing} tables still missing column information"
+        )
+
+    def get_table_columns(
+        self, connector_id: str, schema_name: str, table_name: str
+    ) -> List[Dict]:
+        """
+        Get detailed column information for a specific table using the tables API endpoint.
+        This is more reliable for column information than the schemas endpoint.
+        """
+        try:
+            # URL-encode the schema and table names to handle special characters
+            import urllib.parse
+
+            encoded_schema = urllib.parse.quote(schema_name)
+            encoded_table = urllib.parse.quote(table_name)
+
+            # Make the API request for detailed table information
+            response = self._make_request(
+                "GET",
+                f"/connectors/{connector_id}/schemas/{encoded_schema}/tables/{encoded_table}",
+            )
+
+            # Extract column information
+            table_data = response.get("data", {})
+            columns_data = table_data.get("columns", {})
+
+            # Convert column data to a list format if it's a dictionary
+            columns = []
+            if isinstance(columns_data, dict):
+                for col_name, col_info in columns_data.items():
+                    if isinstance(col_info, dict):
+                        col_info = (
+                            col_info.copy()
+                        )  # Create a copy to avoid modifying the original
+                        col_info["name"] = col_name
+                        if col_info.get(
+                            "enabled", True
+                        ):  # Only include enabled columns
+                            columns.append(col_info)
+                    else:
+                        # Simple case where we just have column names
+                        columns.append({"name": col_name, "enabled": True})
+            elif isinstance(columns_data, list):
+                columns = [col for col in columns_data if col.get("enabled", True)]
+
+            logger.info(
+                f"Retrieved {len(columns)} columns for {schema_name}.{table_name} via direct table API"
+            )
+            return columns
+        except Exception as e:
+            logger.warning(f"Failed to get columns for {schema_name}.{table_name}: {e}")
+            return []
 
     def _process_column_data(self, columns: Any) -> List[Dict]:
         """
@@ -1225,15 +1348,20 @@ class FivetranAPIClient:
         Uses a generic approach that works for any connector type and properly handles name_in_destination.
         """
         try:
-            # Get the connector details first
-            connector_details = self.get_connector(connector_id)
-
-            # Get destination information
-            destination_id = connector_details.get("group", {}).get("id", "")
+            # Get the connector schemas first
+            schemas = self.list_connector_schemas(connector_id)
+
+            # Log more details about what we retrieved
+            table_count = sum(len(schema.get("tables", [])) for schema in schemas)
+            logger.info(
+                f"Got {len(schemas)} schemas with {table_count} tables for connector {connector_id}"
+            )
+
+            # Get destination information for naming
+            connector = self.get_connector(connector_id)
+            destination_id = connector.get("group", {}).get("id", "")
             destination_platform = self.detect_destination_platform(destination_id)
 
-            # Get schema information
-            schemas = self.list_connector_schemas(connector_id)
-
             lineage_list = []
 
             # Handle cases where schemas might be a string or invalid format
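
Since the docstring calls out name_in_destination handling, here is a minimal sketch of the naming resolution this implies; the fall-back-to-source-name behavior is an assumption, not shown in this hunk:

    # Hypothetical helper, assuming schema and table dicts may carry
    # "name_in_destination" alongside "name", as Fivetran's schema API returns.
    def destination_fqn(schema: dict, table: dict) -> str:
        schema_part = schema.get("name_in_destination") or schema.get("name", "")
        table_part = table.get("name_in_destination") or table.get("name", "")
        return f"{schema_part}.{table_part}"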