feat(ingest): snowflake-beta - populate size, add table level profiles for large tables (#5774)

Mayuri Nehate 2022-09-05 11:07:57 +05:30 committed by GitHub
parent 84b279a933
commit 4bf4236e29
2 changed files with 62 additions and 65 deletions


@ -1,4 +1,4 @@
To get all metadata from Snowflake, you need to use two plugins: `snowflake` and `snowflake-usage`. Both are described on this page. They require two separate recipes.
We encourage you to try out the new `snowflake-beta` plugin as an alternative to running both the `snowflake` and `snowflake-usage` plugins, and to share feedback. `snowflake-beta` is much faster than `snowflake` for extracting metadata. Please note that the `snowflake-beta` plugin currently does not support column-level profiling, unlike the `snowflake` plugin.
We encourage you to try out the new `snowflake-beta` plugin as an alternative to running both the `snowflake` and `snowflake-usage` plugins, and to share feedback. `snowflake-beta` is much faster than `snowflake` for extracting metadata.
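For reference, a `snowflake-beta` recipe can also be run programmatically through DataHub's `Pipeline` API. The sketch below is a minimal illustration, not part of this change: all connection values are placeholders, and the `profile_table_level_only` option shown corresponds to the table-level profiling support added in this commit.

```python
# A minimal sketch of running a snowflake-beta recipe programmatically;
# every connection value below is a placeholder for your own account.
from datahub.ingestion.run.pipeline import Pipeline

pipeline = Pipeline.create(
    {
        "source": {
            "type": "snowflake-beta",
            "config": {
                "account_id": "my_account",  # placeholder
                "username": "my_user",  # placeholder
                "password": "my_password",  # placeholder
                "warehouse": "MY_WAREHOUSE",  # placeholder
                "profiling": {
                    "enabled": True,
                    # Emit only cheap table-level profiles
                    # (row/column counts and size), no column stats.
                    "profile_table_level_only": True,
                },
            },
        },
        "sink": {
            "type": "datahub-rest",
            "config": {"server": "http://localhost:8080"},
        },
    }
)
pipeline.run()
pipeline.raise_from_status()
```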


@ -1,6 +1,7 @@
import dataclasses
import datetime
import logging
from typing import Callable, Dict, Iterable, List, Optional
from typing import Callable, Dict, Iterable, List, Optional, Tuple, cast
from sqlalchemy import create_engine, inspect
@ -19,10 +20,17 @@ from datahub.ingestion.source.snowflake.snowflake_schema import (
)
from datahub.ingestion.source.snowflake.snowflake_utils import SnowflakeCommonMixin
from datahub.metadata.com.linkedin.pegasus2avro.dataset import DatasetProfile
from datahub.metadata.schema_classes import DatasetProfileClass
logger = logging.getLogger(__name__)
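# Wraps a GE profiler request with the source Snowflake table, plus a flag
# marking requests that should receive only a cheap table-level profile.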
@dataclasses.dataclass
class SnowflakeProfilerRequest(GEProfilerRequest):
table: SnowflakeTable
profile_table_level_only: bool = False
class SnowflakeProfiler(SnowflakeCommonMixin):
def __init__(self, config: SnowflakeV2Config, report: SnowflakeV2Report) -> None:
self.config = config
@ -31,12 +39,6 @@ class SnowflakeProfiler(SnowflakeCommonMixin):
def get_workunits(self, databases: List[SnowflakeDatabase]) -> Iterable[WorkUnit]:
# If only table level profiling is enabled, report table profile and exit
if self.config.profiling.profile_table_level_only:
yield from self.get_table_level_profile_workunits(databases)
return
# Extra default SQLAlchemy option for better connection pooling and threading.
# https://docs.sqlalchemy.org/en/14/core/pooling.html#sqlalchemy.pool.QueuePool.params.max_overflow
if self.config.profiling.enabled:
@ -55,7 +57,7 @@ class SnowflakeProfiler(SnowflakeCommonMixin):
for table in schema.tables:
# Emit the profile work unit
profile_request = self.get_ge_profile_request(
profile_request = self.get_snowflake_profile_request(
table, schema.name, db.name
)
if profile_request is not None:
@ -63,13 +65,14 @@ class SnowflakeProfiler(SnowflakeCommonMixin):
if len(profile_requests) == 0:
continue
ge_profiler = self.get_profiler_instance(db.name)
for request, profile in ge_profiler.generate_profiles(
for request, profile in self.generate_profiles(
db.name,
profile_requests,
self.config.profiling.max_workers,
platform=self.platform,
profiler_args=self.get_profile_args(),
):
if profile is None:
continue
profile.sizeInBytes = request.table.size_in_bytes # type:ignore
dataset_name = request.pretty_name
@ -86,68 +89,26 @@ class SnowflakeProfiler(SnowflakeCommonMixin):
profile,
)
def get_table_level_profile_workunits(
self, databases: List[SnowflakeDatabase]
) -> Iterable[WorkUnit]:
for db in databases:
if not self.config.database_pattern.allowed(db.name):
continue
for schema in db.schemas:
if not self.config.schema_pattern.allowed(schema.name):
continue
for table in schema.tables:
dataset_name = self.get_dataset_identifier(
table.name, schema.name, db.name
)
skip_profiling = False
# No need to filter by size_in_bytes and row_count limits
# for table-level profiling, since it is not expensive
if not self.is_dataset_eligible_for_profiling(
dataset_name,
table.last_altered,
0,
0,
):
skip_profiling = True
if skip_profiling:
if self.config.profiling.report_dropped_profiles:
self.report.report_dropped(f"profile of {dataset_name}")
continue
self.report.report_entity_profiled(dataset_name)
dataset_urn = make_dataset_urn_with_platform_instance(
self.platform,
dataset_name,
self.config.platform_instance,
self.config.env,
)
yield self.wrap_aspect_as_workunit(
"dataset",
dataset_urn,
"datasetProfile",
DatasetProfile(
timestampMillis=round(
datetime.datetime.now().timestamp() * 1000
),
columnCount=len(table.columns),
rowCount=table.rows_count,
),
)
def get_ge_profile_request(
def get_snowflake_profile_request(
self,
table: SnowflakeTable,
schema_name: str,
db_name: str,
) -> Optional[GEProfilerRequest]:
) -> Optional[SnowflakeProfilerRequest]:
skip_profiling = False
profile_table_level_only = self.config.profiling.profile_table_level_only
dataset_name = self.get_dataset_identifier(table.name, schema_name, db_name)
if not self.is_dataset_eligible_for_profiling(
dataset_name, table.last_altered, table.size_in_bytes, table.rows_count
):
skip_profiling = True
# Fall back to a table-level-only profile if the dataset was excluded
# from full profiling due to size or row-count limits alone
if self.is_dataset_eligible_for_profiling(
dataset_name, table.last_altered, 0, 0
):
profile_table_level_only = True
else:
skip_profiling = True
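# A table with no columns cannot be profiled at any level.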
if len(table.columns) == 0:
skip_profiling = True
@ -159,9 +120,11 @@ class SnowflakeProfiler(SnowflakeCommonMixin):
self.report.report_entity_profiled(dataset_name)
logger.debug(f"Preparing profiling request for {dataset_name}")
profile_request = GEProfilerRequest(
profile_request = SnowflakeProfilerRequest(
pretty_name=dataset_name,
batch_kwargs=dict(schema=schema_name, table=table.name),
table=table,
profile_table_level_only=profile_table_level_only,
)
return profile_request
@ -237,3 +200,37 @@ class SnowflakeProfiler(SnowflakeCommonMixin):
return conn
return get_db_connection
def generate_profiles(
self,
db_name: str,
requests: List[SnowflakeProfilerRequest],
max_workers: int,
platform: Optional[str] = None,
profiler_args: Optional[Dict] = None,
) -> Iterable[Tuple[GEProfilerRequest, Optional[DatasetProfileClass]]]:
ge_profile_requests: List[GEProfilerRequest] = [
cast(GEProfilerRequest, request)
for request in requests
if not request.profile_table_level_only
]
table_level_profile_requests: List[SnowflakeProfilerRequest] = [
request for request in requests if request.profile_table_level_only
]
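# Table-level profiles are built from the table metadata already retrieved
# during schema extraction, so no additional profiling queries are run.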
for request in table_level_profile_requests:
profile = DatasetProfile(
timestampMillis=round(datetime.datetime.now().timestamp() * 1000),
columnCount=len(request.table.columns),
rowCount=request.table.rows_count,
sizeInBytes=request.table.size_in_bytes,
)
yield (request, profile)
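# The remaining requests need column statistics, so delegate them to GE.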
if len(ge_profile_requests) == 0:
return
ge_profiler = self.get_profiler_instance(db_name)
yield from ge_profiler.generate_profiles(
ge_profile_requests, max_workers, platform, profiler_args
)
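For illustration, the table-level path above amounts to building a `DatasetProfile` aspect from metadata alone. A minimal sketch, using hypothetical stand-in values for a table's columns, row count, and size:

```python
import datetime

from datahub.metadata.com.linkedin.pegasus2avro.dataset import DatasetProfile

# Hypothetical stand-ins for SnowflakeTable.columns/rows_count/size_in_bytes.
column_names = ["id", "name", "created_at"]
rows_count = 1_200_000
size_in_bytes = 5_368_709_120  # ~5 GiB

# Same construction as the table-level branch of generate_profiles.
profile = DatasetProfile(
    timestampMillis=round(datetime.datetime.now().timestamp() * 1000),
    columnCount=len(column_names),
    rowCount=rows_count,
    sizeInBytes=size_in_bytes,
)
assert profile.columnCount == 3
```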