remove unused methods

Jonny Dixon 2025-03-20 22:45:59 +00:00
parent 23e2cb0273
commit d3516b234b
2 changed files with 124 additions and 381 deletions

View File

@@ -1,7 +1,5 @@
import difflib
import logging
import re
from typing import Dict, Iterable, List, Optional, Tuple
from typing import Dict, Iterable, List, Optional
import datahub.emitter.mce_builder as builder
from datahub.api.entities.datajob import DataFlow, DataJob
@@ -347,104 +345,6 @@ class FivetranSource(StatefulIngestionSourceBase):
)
return None
def _normalize_column_name(self, column_name: str) -> str:
"""Normalize column name for comparison by removing non-alphanumeric chars and converting to lowercase."""
# Remove non-alphanumeric characters and convert to lowercase
normalized = re.sub(r"[^a-zA-Z0-9]", "", column_name).lower()
return normalized
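For illustration, a standalone sketch of the normalization this removed helper performed (the function name here is made up):

import re

def normalize(column_name: str) -> str:
    # Strip every non-alphanumeric character, then lowercase, so that
    # differently cased or punctuated variants compare equal.
    return re.sub(r"[^a-zA-Z0-9]", "", column_name).lower()

# "Billing_Street", "billingStreet" and "BILLING-STREET" all normalize to "billingstreet".
assert normalize("Billing_Street") == normalize("billingStreet") == "billingstreet"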
def _transform_column_name_for_platform(
self, column_name: str, is_bigquery: bool
) -> str:
"""Transform column name based on the destination platform with better handling of edge cases."""
if not column_name:
return ""
if is_bigquery:
# For BigQuery:
# 1. Convert to lowercase
# 2. Replace camelCase with snake_case
# 3. Clean up any invalid characters
import re
# Step 1: Convert camelCase to snake_case with regex
s1 = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", column_name)
s2 = re.sub("([a-z0-9])([A-Z])", r"\1_\2", s1)
# Step 2: lowercase and replace non-alphanumeric with underscore
transformed = re.sub(r"[^a-zA-Z0-9_]", "_", s2.lower())
# Step 3: Remove leading/trailing underscores and collapse multiple underscores
transformed = re.sub(r"_+", "_", transformed).strip("_")
# Log the transformation for debugging
if transformed != column_name.lower():
logger.debug(f"Transformed column: {column_name} -> {transformed}")
return transformed
else:
# For other platforms like Snowflake, typically uppercase
return column_name.upper()
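For illustration, the camelCase-to-snake_case conversion applied above for BigQuery destinations, as a standalone sketch (the function name is made up):

import re

def to_bigquery_column(column_name: str) -> str:
    # Insert underscores at camelCase boundaries.
    s1 = re.sub(r"(.)([A-Z][a-z]+)", r"\1_\2", column_name)
    s2 = re.sub(r"([a-z0-9])([A-Z])", r"\1_\2", s1)
    # Lowercase, replace invalid characters, collapse and trim underscores.
    transformed = re.sub(r"[^a-zA-Z0-9_]", "_", s2.lower())
    return re.sub(r"_+", "_", transformed).strip("_")

print(to_bigquery_column("BillingStreet"))  # billing_street
print(to_bigquery_column("accountID"))      # account_id
print(to_bigquery_column("Last Modified"))  # last_modified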
def _find_best_fuzzy_match(
self, source_col: str, source_norm: str, dest_columns: List[Tuple[str, str]]
) -> Optional[str]:
"""Find best fuzzy match for a source column from destination columns.
Args:
source_col: Original source column name
source_norm: Normalized source column name
dest_columns: List of (original_dest, normalized_dest) tuples
Returns:
Best matching destination column name or None if no good match found
"""
# First try to match normalized versions with high cutoff
dest_norms = [dest_norm for _, dest_norm in dest_columns]
matches = difflib.get_close_matches(source_norm, dest_norms, n=1, cutoff=0.8)
if matches:
# Find original dest column with this normalized value
matched_norm = matches[0]
for dest_col, dest_norm in dest_columns:
if dest_norm == matched_norm:
return dest_col
# If no high-quality match found, try a lower threshold on original names
# This helps with acronyms and abbreviated field names
dest_cols = [dest_col for dest_col, _ in dest_columns]
matches = difflib.get_close_matches(source_col, dest_cols, n=1, cutoff=0.6)
if matches:
return matches[0]
# Try special patterns like converting "someField" to "some_field"
snake_case = re.sub("([a-z0-9])([A-Z])", r"\1_\2", source_col).lower()
for dest_col, _ in dest_columns:
if dest_col.lower() == snake_case:
return dest_col
# If source_col contains words that are also in a destination column, consider it a match
# This helps with "BillingStreet" matching "billing_street" or "street_billing"
words = re.findall(r"[A-Z][a-z]+|[a-z]+|[0-9]+", source_col)
if words:
word_matches = {}
for dest_col, _ in dest_columns:
# Count how many words from source appear in destination
dest_words = re.findall(r"[A-Z][a-z]+|[a-z]+|[0-9]+", dest_col)
common_words = len(
set(w.lower() for w in words) & set(w.lower() for w in dest_words)
)
if common_words > 0:
word_matches[dest_col] = common_words
# If we found matches based on common words, return the one with most matches
if word_matches:
return max(word_matches.items(), key=lambda x: x[1])[0]
# No good match found
return None
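The fuzzy matching above relies on difflib.get_close_matches; a minimal, self-contained example of that call with made-up column names:

import difflib

source_norm = "billingstreet"  # normalized source column
dest_norms = ["billing_street", "shipping_street", "billing_state"]

# n=1 returns at most one candidate; cutoff=0.8 keeps only strong matches.
print(difflib.get_close_matches(source_norm, dest_norms, n=1, cutoff=0.8))
# ['billing_street']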
def _create_column_lineage(
self,
lineage: TableLineage,
@@ -536,70 +436,6 @@ class FivetranSource(StatefulIngestionSourceBase):
f"Failed to create column lineage for {column_lineage.source_column} -> {column_lineage.destination_column}: {e}"
)
def _create_field_lineage_mcp(
self,
source_urn: DatasetUrn,
dest_urn: DatasetUrn,
lineage_field_map: Dict[str, List[str]],
) -> Optional[MetadataWorkUnit]:
"""
Create field-level lineage between datasets using MetadataChangeProposal.
Args:
source_urn: Source dataset URN
dest_urn: Destination dataset URN
lineage_field_map: Map of destination field URNs to lists of source field URNs
"""
try:
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
DatasetLineageType,
FineGrainedLineage,
FineGrainedLineageDownstreamType,
FineGrainedLineageUpstreamType,
Upstream,
UpstreamLineage,
)
# Create the upstream relationship
upstream = Upstream(
dataset=str(source_urn), type=DatasetLineageType.TRANSFORMED
)
# Create fine-grained lineages for each field mapping
fine_grained_lineages = []
for dest_field, source_fields in lineage_field_map.items():
fine_grained_lineages.append(
FineGrainedLineage(
upstreamType=FineGrainedLineageUpstreamType.FIELD_SET,
upstreams=source_fields,
downstreamType=FineGrainedLineageDownstreamType.FIELD,
downstreams=[dest_field],
)
)
# Create the lineage aspect
upstream_lineage = UpstreamLineage(
upstreams=[upstream], fineGrainedLineages=fine_grained_lineages
)
# Create and emit the MCP
lineage_mcp = MetadataChangeProposalWrapper(
entityUrn=str(dest_urn),
aspect=upstream_lineage,
)
# Now create a workunit from this MCP
wu = MetadataWorkUnit(id=f"{dest_urn}-field-lineage", mcp=lineage_mcp)
# Return the workunit - it will be collected and emitted by the main process
return wu
except Exception as e:
logger.error(f"Error creating field-level lineage MCP: {e}", exc_info=True)
return None
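As a rough sketch of the MCP construction above, using the same DataHub SDK classes with placeholder platform, table, and field names (assuming the standard builder helpers):

from datahub.emitter.mce_builder import make_dataset_urn, make_schema_field_urn
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
    DatasetLineageType,
    FineGrainedLineage,
    FineGrainedLineageDownstreamType,
    FineGrainedLineageUpstreamType,
    Upstream,
    UpstreamLineage,
)

# Placeholder URNs for a source and a destination table.
source_urn = make_dataset_urn("postgres", "public.customers")
dest_urn = make_dataset_urn("snowflake", "analytics.customers")

# One fine-grained edge: the destination field "email" is fed by the source field "email".
fine_grained = FineGrainedLineage(
    upstreamType=FineGrainedLineageUpstreamType.FIELD_SET,
    upstreams=[make_schema_field_urn(source_urn, "email")],
    downstreamType=FineGrainedLineageDownstreamType.FIELD,
    downstreams=[make_schema_field_urn(dest_urn, "email")],
)

# The aspect is attached to the destination dataset, mirroring the removed method.
lineage_mcp = MetadataChangeProposalWrapper(
    entityUrn=dest_urn,
    aspect=UpstreamLineage(
        upstreams=[Upstream(dataset=source_urn, type=DatasetLineageType.TRANSFORMED)],
        fineGrainedLineages=[fine_grained],
    ),
)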
def _build_lineage_properties(
self,
connector: Connector,

View File

@@ -473,6 +473,8 @@ class FivetranStandardAPI(FivetranAccessInterface):
"""
Fill in lineage information for connectors by calling the API with enhanced diagnostics and robust error handling.
"""
self._connector_cache = connectors
for connector in connectors:
try:
logger.info(
@@ -695,14 +697,128 @@ class FivetranStandardAPI(FivetranAccessInterface):
return source_columns
def _process_columns_from_list(
self, columns: List[Any], is_bigquery: bool, source_table: str
) -> List[ColumnLineage]:
"""
Process columns from list format and create column lineage.
Extracted from _extract_column_lineage to reduce complexity.
"""
def _extract_column_lineage(
self,
table: Dict,
source_table: str,
destination_platform: str,
source_table_columns: Dict[str, Dict[str, str]],
) -> List[ColumnLineage]:
"""Extract column lineage with enhanced retrieval and mapping."""
logger.info(
f"Extracting column lineage for {source_table} to {destination_platform}"
)
# Get columns from different sources
columns = self._get_columns_from_sources(
table, source_table, source_table_columns
)
# If we still don't have any columns, return empty list
if not columns:
logger.warning(f"No column information available for {source_table}")
return []
# Create column lineage
column_lineage = self._create_column_lineage_from_columns(
columns, source_table, destination_platform
)
# Log results
if column_lineage:
logger.info(
f"Created {len(column_lineage)} column lineage entries for {source_table}"
)
else:
logger.warning(f"Failed to create any column lineage for {source_table}")
return column_lineage
def _get_columns_from_sources(
self,
table: Dict,
source_table: str,
source_table_columns: Dict[str, Dict[str, str]],
) -> List[Dict]:
"""Get columns from various sources."""
# 1. First try to get columns from the table data
columns = table.get("columns", [])
# Handle different column formats
if isinstance(columns, dict):
# Convert dict format to list
columns = self._convert_column_dict_to_list(columns)
# 2. If no columns found, try to retrieve them from the schemas endpoint
if not columns:
columns = self._get_columns_from_schemas_endpoint(source_table)
# 3. If still no columns, try source_table_columns
if not columns and source_table in source_table_columns:
logger.info(f"Using columns from source_table_columns for {source_table}")
columns = [
{"name": col_name, "type": col_type}
for col_name, col_type in source_table_columns[source_table].items()
]
return columns
def _convert_column_dict_to_list(self, columns_dict: Dict) -> List[Dict]:
"""Convert column dictionary to list format."""
columns_list = []
for col_name, col_data in columns_dict.items():
if isinstance(col_data, dict):
col_data = col_data.copy()
col_data["name"] = col_name
columns_list.append(col_data)
else:
columns_list.append({"name": col_name})
return columns_list
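A small data example of the conversion above (column names and types are made up):

columns_dict = {
    "id": {"type": "int", "name_in_destination": "id"},
    "BillingStreet": {"type": "string"},
    "legacy_flag": True,  # non-dict values are reduced to just the column name
}
# _convert_column_dict_to_list(columns_dict) would return:
# [
#     {"type": "int", "name_in_destination": "id", "name": "id"},
#     {"type": "string", "name": "BillingStreet"},
#     {"name": "legacy_flag"},
# ]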
def _get_columns_from_schemas_endpoint(self, source_table: str) -> List[Dict]:
"""Try to get columns from the schemas endpoint."""
columns: List[Dict] = []
if not hasattr(self.api_client, "get_table_columns"):
return columns
logger.info("No columns found in table data, trying schemas endpoint")
schema_name, table_name = None, None
if "." in source_table:
schema_name, table_name = source_table.split(".", 1)
if not (schema_name and table_name):
return columns
try:
connector_id = self._find_connector_id_for_source_table(source_table)
if connector_id:
columns = self.api_client.get_table_columns(
connector_id, schema_name, table_name
)
logger.info(f"Retrieved {len(columns)} columns from schemas endpoint")
except Exception as e:
logger.warning(f"Failed to get columns from schemas endpoint: {e}")
return columns
def _find_connector_id_for_source_table(self, source_table: str) -> Optional[str]:
"""Find the connector ID for a source table."""
for conn in getattr(self, "_connector_cache", []):
if hasattr(conn, "connector_id") and source_table in str(conn.lineage):
return conn.connector_id
return None
def _create_column_lineage_from_columns(
self,
columns: List[Dict],
source_table: str,
destination_platform: str,
) -> List[ColumnLineage]:
"""Create column lineage objects from column data."""
column_lineage = []
is_bigquery = destination_platform.lower() == "bigquery"
for column in columns:
col_name = None
@@ -711,11 +827,7 @@ class FivetranStandardAPI(FivetranAccessInterface):
elif isinstance(column, str):
col_name = column
if not col_name:
continue
# Skip Fivetran system columns
if col_name.startswith("_fivetran"):
if not col_name or col_name.startswith("_fivetran"):
continue
# Get destination column name - prefer name_in_destination if available
@@ -741,167 +853,6 @@ class FivetranStandardAPI(FivetranAccessInterface):
)
)
logger.info(
f"Created {len(column_lineage)} column lineage entries for {source_table}"
)
return column_lineage
def _process_columns_from_source(
self, source_columns: Dict[str, str], is_bigquery: bool, source_table: str
) -> List[ColumnLineage]:
"""
Create column lineage from source column information.
Extracted from _extract_column_lineage to reduce complexity.
"""
column_lineage = []
logger.info(f"Using {len(source_columns)} columns from source table schema")
# Create lineage for each source column
for col_name in source_columns:
if col_name.startswith("_fivetran"):
continue
# Transform destination column name based on platform
dest_col_name = self._transform_column_name_for_platform(
col_name, is_bigquery
)
column_lineage.append(
ColumnLineage(
source_column=col_name,
destination_column=dest_col_name,
)
)
logger.info(f"Created {len(column_lineage)} fallback column lineage entries")
return column_lineage
def _extract_column_lineage(
self,
table: Dict,
source_table: str,
destination_platform: str,
source_table_columns: Dict[str, Dict[str, str]],
) -> List[ColumnLineage]:
"""
Extract column-level lineage for a table with improved matching techniques.
Args:
table: Table data from API
source_table: Full source table name (schema.table)
destination_platform: Destination platform type
source_table_columns: Dict mapping table names to column information
Returns:
List of ColumnLineage objects
"""
# Get detailed logging for debugging
logger.info(
f"Extracting column lineage for {source_table} to {destination_platform}"
)
logger.debug(f"Table data keys: {list(table.keys())}")
# Get columns from the API response
columns = table.get("columns", [])
# Log what we're working with
if isinstance(columns, list):
logger.info(f"Found {len(columns)} columns in list format")
elif isinstance(columns, dict):
logger.info(f"Found {len(columns)} columns in dict format")
# Convert dict format to list if needed
columns_list = []
for col_name, col_data in columns.items():
if isinstance(col_data, dict):
col_data = col_data.copy()
col_data["name"] = col_name
columns_list.append(col_data)
else:
columns_list.append({"name": col_name})
columns = columns_list
logger.info(f"Converted dict format to list with {len(columns)} columns")
else:
logger.warning(f"Columns in unexpected format: {type(columns)}")
is_bigquery = destination_platform.lower() == "bigquery"
column_lineage = []
# First try direct mapping from columns if available
if isinstance(columns, list) and columns:
for column in columns:
col_name = None
if isinstance(column, dict):
col_name = column.get("name")
elif isinstance(column, str):
col_name = column
if not col_name or col_name.startswith("_fivetran"):
continue
# Get destination column name - prefer name_in_destination if available
dest_col_name = None
if isinstance(column, dict) and "name_in_destination" in column:
dest_col_name = column.get("name_in_destination")
logger.debug(
f"Using name_in_destination: {col_name} -> {dest_col_name}"
)
# If no name_in_destination, transform based on platform
if not dest_col_name:
dest_col_name = self._transform_column_name_for_platform(
col_name, is_bigquery
)
logger.debug(f"Transformed name: {col_name} -> {dest_col_name}")
# Add to lineage
column_lineage.append(
ColumnLineage(
source_column=col_name,
destination_column=dest_col_name,
)
)
# If we got column lineage, return it
if column_lineage:
logger.info(
f"Created {len(column_lineage)} column lineage entries using direct mapping"
)
return column_lineage
# No direct column mapping, try to derive from source_table_columns
if source_table in source_table_columns:
logger.info(
f"Attempting to derive column lineage from source_table_columns for {source_table}"
)
source_cols = source_table_columns[source_table]
for col_name in source_cols:
if col_name.startswith("_fivetran"):
continue
# Transform destination column name based on platform
dest_col_name = self._transform_column_name_for_platform(
col_name, is_bigquery
)
column_lineage.append(
ColumnLineage(
source_column=col_name,
destination_column=dest_col_name,
)
)
logger.info(
f"Created {len(column_lineage)} column lineage entries using source_table_columns"
)
else:
logger.warning(f"No source_table_columns available for {source_table}")
# If we still have no lineage, warn about it
if not column_lineage:
logger.warning(f"Could not create any column lineage for {source_table}")
return column_lineage
def _transform_column_name_for_platform(
@@ -1028,50 +979,6 @@ class FivetranStandardAPI(FivetranAccessInterface):
else:
logger.warning(f"Columns in unexpected format: {type(columns)}")
def _convert_column_dict_to_list(self, columns_dict: Dict) -> List[Dict]:
"""Convert column dictionary to list format for consistent processing."""
columns_list = []
for col_name, col_data in columns_dict.items():
if isinstance(col_data, dict):
col_data = col_data.copy()
col_data["name"] = col_name
columns_list.append(col_data)
else:
columns_list.append({"name": col_name})
logger.info(f"Converted dict format to list with {len(columns_list)} columns")
return columns_list
def _try_get_column_mapping_from_config(
self, source_table: str
) -> List[ColumnLineage]:
"""Try to get column mapping from configuration if available."""
column_lineage = []
# Check if there's a config attribute with column mapping info
if hasattr(self, "config") and self.config:
# Check for any attribute that might have column mappings
for attr_name in dir(self.config):
if "column_mapping" in attr_name.lower() and hasattr(
self.config, attr_name
):
mapping_attr = getattr(self.config, attr_name)
if isinstance(mapping_attr, dict) and source_table in mapping_attr:
column_mapping = mapping_attr.get(source_table, {})
logger.info(
f"Found config column mapping for {source_table}: {len(column_mapping)} columns"
)
for source_col, dest_col in column_mapping.items():
column_lineage.append(
ColumnLineage(
source_column=source_col,
destination_column=dest_col,
)
)
return column_lineage
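The removed fallback above expected a config attribute named like column_mapping, shaped roughly as below (table and column names are hypothetical):

column_mapping = {
    "salesforce.Account": {
        "BillingStreet": "billing_street",
        "AccountNumber": "account_number",
    }
}
# For source_table "salesforce.Account" this would yield two ColumnLineage entries:
# BillingStreet -> billing_street and AccountNumber -> account_number.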
def _get_destination_platform(self, connector: Connector) -> str:
"""
Determine the destination platform based on the configuration and connector details.