mirror of https://github.com/datahub-project/datahub.git
feat(ingest): snowflake profile tables only if they have been updated since N days (#5132)
This commit is contained in:

parent 503208beaa
commit 7b143b06fc
@@ -11,6 +11,7 @@ This file documents any backwards-incompatible changes in DataHub and assists pe
 
 ### Deprecations
 
 ### Other notable Changes
+- #5132 Profile tables in the `snowflake` source only if they have been updated within the configured (default: `1`) number of days. Update the config `profiling.profile_if_updated_since_days` to match your profiling schedule, or set it to `None` to keep the old behaviour.
 
 ## `v0.8.38`
 
@@ -84,6 +84,11 @@ class GEProfilingConfig(ConfigModel):
         description="A positive integer that specifies the maximum number of columns to profile for any table. `None` implies all columns. The cost of profiling goes up significantly as the number of columns to profile goes up.",
     )
 
+    profile_if_updated_since_days: Optional[pydantic.PositiveFloat] = Field(
+        default=1,
+        description="Profile table only if it has been updated within the specified number of days. `None` implies profile all tables. Only Snowflake supports this.",
+    )
+
     # The default of (5 * cpu_count) is adopted from the default max_workers
     # parameter of ThreadPoolExecutor. Given that profiling is often an I/O-bound
     # task, it may make sense to increase this default value in the future.
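Since the new field is an `Optional[pydantic.PositiveFloat]`, fractional look-back windows are valid and zero or negative values are rejected when the config is parsed. A minimal sketch of that behaviour, assuming the module path `datahub.ingestion.source.ge_profiling_config` for the class shown in this hunk:

```python
import pydantic

from datahub.ingestion.source.ge_profiling_config import GEProfilingConfig  # assumed path

# Fractional days are allowed: only profile tables updated in the last 12 hours.
config = GEProfilingConfig.parse_obj({"profile_if_updated_since_days": 0.5})
assert config.profile_if_updated_since_days == 0.5

# PositiveFloat rejects non-positive values at parse time.
try:
    GEProfilingConfig.parse_obj({"profile_if_updated_since_days": -1})
except pydantic.ValidationError as err:
    print("rejected:", err.errors()[0]["msg"])
```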
@@ -810,13 +810,19 @@ WHERE
         return None, None
 
     def is_dataset_eligible_for_profiling(
-        self, dataset_name: str, sql_config: SQLAlchemyConfig
+        self,
+        dataset_name: str,
+        sql_config: SQLAlchemyConfig,
+        inspector: Inspector,
+        profile_candidates: Optional[List[str]],
     ) -> bool:
         """
         Overrides the default profiling filter, which checks eligibility against the allow/deny patterns.
         This override additionally skips sharded tables that are not the latest shard.
         """
-        if not super().is_dataset_eligible_for_profiling(dataset_name, sql_config):
+        if not super().is_dataset_eligible_for_profiling(
+            dataset_name, sql_config, inspector, profile_candidates
+        ):
             return False
 
         (project_id, schema, table) = dataset_name.split(".")
@@ -1,6 +1,7 @@
 import json
 import logging
 from collections import defaultdict
+from datetime import datetime
 from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union
 
 import pydantic
@@ -177,8 +178,12 @@ class SnowflakeSource(SQLAlchemySource):
             else:
                 self.report.report_dropped(db)
 
-    def get_identifier(self, *, schema: str, entity: str, **kwargs: Any) -> str:
-        regular = super().get_identifier(schema=schema, entity=entity, **kwargs)
+    def get_identifier(
+        self, *, schema: str, entity: str, inspector: Inspector, **kwargs: Any
+    ) -> str:
+        regular = super().get_identifier(
+            schema=schema, entity=entity, inspector=inspector, **kwargs
+        )
         return f"{self.current_database.lower()}.{regular}"
 
     def _populate_view_upstream_lineage(self, engine: sqlalchemy.engine.Engine) -> None:
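For orientation, the Snowflake source qualifies every identifier with the database currently being scanned and lowercases the database part, yielding three-part names. A tiny illustration (values are hypothetical; `regular` stands in for the `super().get_identifier(...)` result):

```python
# Hypothetical values; super().get_identifier() returns "schema.entity".
current_database = "ANALYTICS"
regular = "public.orders"

identifier = f"{current_database.lower()}.{regular}"
assert identifier == "analytics.public.orders"
```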
@@ -748,6 +753,37 @@ QUALIFY ROW_NUMBER() OVER (PARTITION BY downstream_table_name, upstream_table_na
             return False
         return True
 
+    def generate_profile_candidates(
+        self, inspector: Inspector, threshold_time: datetime
+    ) -> List[str]:
+        self.report.profile_if_updated_since = threshold_time
+        _profile_candidates = []
+
+        db_rows = inspector.engine.execute(
+            text(
+                """
+select table_catalog, table_schema, table_name
+from information_schema.tables
+where last_altered >= to_timestamp_ltz({timestamp}, 3) and table_type = 'BASE TABLE'
+            """.format(
+                    timestamp=round(threshold_time.timestamp() * 1000)
+                )
+            )
+        )
+
+        db_name = self.current_database
+        for db_row in db_rows:
+            _profile_candidates.append(
+                self.get_identifier(
+                    schema=db_row.table_schema,
+                    entity=db_row.table_name,
+                    inspector=inspector,
+                ).lower()
+            )
+        logger.debug(f"Generating profiling candidates for db {db_name}")
+        self.report.profile_candidates[db_name] = _profile_candidates
+        return _profile_candidates
+
     # Stateful Ingestion specific overrides
     # NOTE: There is no special state associated with this source beyond what is provided by sql_common.
     def get_platform_instance_id(self) -> str:
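The SQL above compares `last_altered` against an epoch-milliseconds value via Snowflake's `to_timestamp_ltz(<value>, 3)` (scale 3 = milliseconds). A standalone sketch of the threshold arithmetic, matching how the caller in sql_common computes `threshold_time` (variable names are illustrative):

```python
from datetime import datetime, timedelta, timezone

# Threshold: "now minus N days", in UTC.
profile_if_updated_since_days = 1.0
threshold_time = datetime.now(timezone.utc) - timedelta(days=profile_if_updated_since_days)

# The query embeds this as epoch milliseconds; to_timestamp_ltz(ms, 3) on the
# Snowflake side turns it back into a timestamp for comparison with last_altered.
timestamp_ms = round(threshold_time.timestamp() * 1000)
print(timestamp_ms)
```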
@@ -1185,6 +1185,15 @@ class SQLAlchemySource(StatefulIngestionSourceBase):
         except NotImplementedError:
             description: Optional[str] = None
             properties: Dict[str, str] = {}
+        except ProgrammingError as pe:
+            # Snowflake needs schema names quoted when fetching table comments.
+            logger.debug(
+                f"Encountered ProgrammingError. Retrying with quoted schema name for schema {schema} and view {view}",
+                pe,
+            )
+            description = None
+            properties = {}
+            view_info: dict = inspector.get_table_comment(view, f'"{schema}"')  # type: ignore
         else:
             description = view_info["text"]
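The addition above is a fall-back: fetch the comment with the bare schema name first, and if that raises, retry with the name double-quoted. A generic sketch of the same pattern against SQLAlchemy's inspector API (the helper name is illustrative, not this repository's API):

```python
from sqlalchemy.engine.reflection import Inspector

def get_comment_with_quoting_fallback(inspector: Inspector, table: str, schema: str) -> dict:
    """Fetch a table/view comment, retrying with the schema name double-quoted.

    Quoting preserves case for identifiers that Snowflake would otherwise fold,
    which is what triggers the ProgrammingError handled in the diff above.
    """
    try:
        return inspector.get_table_comment(table, schema)
    except Exception:  # the diff narrows this to sqlalchemy.exc.ProgrammingError
        return inspector.get_table_comment(table, f'"{schema}"')
```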
@@ -1308,13 +1317,27 @@ class SQLAlchemySource(StatefulIngestionSourceBase):
     ) -> Optional[bool]:
         return None
 
+    # Override if needed
+    def generate_profile_candidates(
+        self, inspector: Inspector, threshold_time: datetime.datetime
+    ) -> Optional[List[str]]:
+        raise NotImplementedError()
+
     # Override if you want to do additional checks
     def is_dataset_eligible_for_profiling(
-        self, dataset_name: str, sql_config: SQLAlchemyConfig
+        self,
+        dataset_name: str,
+        sql_config: SQLAlchemyConfig,
+        inspector: Inspector,
+        profile_candidates: Optional[List[str]],
     ) -> bool:
-        return sql_config.table_pattern.allowed(
-            dataset_name
-        ) and sql_config.profile_pattern.allowed(dataset_name)
+        return (
+            sql_config.table_pattern.allowed(dataset_name)
+            and sql_config.profile_pattern.allowed(dataset_name)
+        ) and (
+            profile_candidates is None
+            or (profile_candidates is not None and dataset_name in profile_candidates)
+        )
 
     def loop_profiler_requests(
         self,
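The new return expression composes two gates: the existing allow/deny patterns, and membership in the candidate list when one exists (`None` means no recency filter was computed, so everything passes the second gate). A minimal sketch of the same logic as a pure function (names are illustrative):

```python
from typing import List, Optional

def eligible(
    dataset_name: str, pattern_allowed: bool, profile_candidates: Optional[List[str]]
) -> bool:
    # Gate 1: allow/deny patterns. Gate 2: recency candidates, if computed.
    return pattern_allowed and (
        profile_candidates is None or dataset_name in profile_candidates
    )

assert eligible("db.sch.t1", True, None)                # no candidate list: patterns decide
assert eligible("db.sch.t1", True, ["db.sch.t1"])       # recently updated: kept
assert not eligible("db.sch.t2", True, ["db.sch.t1"])   # stale table: skipped
assert not eligible("db.sch.t1", False, ["db.sch.t1"])  # denied by pattern: skipped
```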
@@ -1325,6 +1348,19 @@ class SQLAlchemySource(StatefulIngestionSourceBase):
         from datahub.ingestion.source.ge_data_profiler import GEProfilerRequest
 
         tables_seen: Set[str] = set()
+        profile_candidates = None  # Default value if profile candidates not available.
+        if sql_config.profiling.profile_if_updated_since_days is not None:
+            try:
+                threshold_time: datetime.datetime = datetime.datetime.now(
+                    datetime.timezone.utc
+                ) - datetime.timedelta(
+                    sql_config.profiling.profile_if_updated_since_days  # type:ignore
+                )
+                profile_candidates = self.generate_profile_candidates(
+                    inspector, threshold_time
+                )
+            except NotImplementedError:
+                logger.debug("Source does not support generating profile candidates.")
 
         for table in inspector.get_table_names(schema):
             schema, table = self.standardize_schema_table_names(
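Because the base class raises `NotImplementedError` from `generate_profile_candidates`, sources that do not override it simply fall through to `profile_candidates = None` and profile everything the patterns allow. A stripped-down sketch of that dispatch (class names are illustrative):

```python
from typing import List, Optional

class BaseSource:
    def generate_profile_candidates(self) -> Optional[List[str]]:
        raise NotImplementedError()  # opt-in hook, mirroring the diff

class SnowflakeLikeSource(BaseSource):
    def generate_profile_candidates(self) -> Optional[List[str]]:
        return ["analytics.public.orders"]  # illustrative recently-updated tables

def resolve_candidates(source: BaseSource) -> Optional[List[str]]:
    try:
        return source.generate_profile_candidates()
    except NotImplementedError:
        return None  # source has no recency filter; patterns alone decide

assert resolve_candidates(BaseSource()) is None
assert resolve_candidates(SnowflakeLikeSource()) == ["analytics.public.orders"]
```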
@@ -1333,7 +1369,9 @@ class SQLAlchemySource(StatefulIngestionSourceBase):
             dataset_name = self.get_identifier(
                 schema=schema, entity=table, inspector=inspector
             )
-            if not self.is_dataset_eligible_for_profiling(dataset_name, sql_config):
+            if not self.is_dataset_eligible_for_profiling(
+                dataset_name, sql_config, inspector, profile_candidates
+            ):
                 if self.config.profiling.report_dropped_profiles:
                     self.report.report_dropped(f"profile of {dataset_name}")
                 continue
@@ -36,3 +36,6 @@ class SnowflakeReport(BaseSnowflakeReport, SQLSourceReport):
     role: str = ""
     check_role_grants: Optional[bool] = None
     role_grants: List[str] = field(default_factory=list)
+
+    profile_if_updated_since: Optional[datetime] = None
+    profile_candidates: Dict[str, List[str]] = field(default_factory=dict)