feat(ingest): snowflake-beta - populate size, add table level profiles for large tables (#5774)

Mayuri Nehate 2022-09-05 11:07:57 +05:30 committed by GitHub
parent 84b279a933
commit 4bf4236e29
2 changed files with 62 additions and 65 deletions


@@ -1,4 +1,4 @@
 To get all metadata from Snowflake you need to use two plugins `snowflake` and `snowflake-usage`. Both of them are described in this page. These will require 2 separate recipes.
-We encourage you to try out the new `snowflake-beta` plugin as an alternative to running both `snowflake` and `snowflake-usage` plugins and share feedback. `snowflake-beta` is much faster than `snowflake` for extracting metadata. Please note that the `snowflake-beta` plugin currently does not support column level profiling, unlike the `snowflake` plugin.
+We encourage you to try out the new `snowflake-beta` plugin as an alternative to running both `snowflake` and `snowflake-usage` plugins and share feedback. `snowflake-beta` is much faster than `snowflake` for extracting metadata.
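
To make the docs' claim concrete, here is a sketch of a single `snowflake-beta` recipe replacing the two separate recipes, driven through DataHub's Python Pipeline API rather than the CLI. The account, credentials, and sink address are placeholders, not values from this commit.

from datahub.ingestion.run.pipeline import Pipeline

# Hypothetical recipe: one snowflake-beta source instead of two recipes.
recipe = {
    "source": {
        "type": "snowflake-beta",
        "config": {
            "account_id": "my_account",        # placeholder
            "username": "datahub_user",        # placeholder
            "password": "${SNOWFLAKE_PASS}",   # placeholder
            "profiling": {
                "enabled": True,
                # The cheap mode this commit reworks: no column-level stats.
                "profile_table_level_only": True,
            },
        },
    },
    "sink": {
        "type": "datahub-rest",
        "config": {"server": "http://localhost:8080"},  # placeholder
    },
}

Pipeline.create(recipe).run()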


@@ -1,6 +1,7 @@
+import dataclasses
 import datetime
 import logging
-from typing import Callable, Dict, Iterable, List, Optional
+from typing import Callable, Dict, Iterable, List, Optional, Tuple, cast

 from sqlalchemy import create_engine, inspect
@@ -19,10 +20,17 @@ from datahub.ingestion.source.snowflake.snowflake_schema import (
 )
 from datahub.ingestion.source.snowflake.snowflake_utils import SnowflakeCommonMixin
 from datahub.metadata.com.linkedin.pegasus2avro.dataset import DatasetProfile
+from datahub.metadata.schema_classes import DatasetProfileClass

 logger = logging.getLogger(__name__)


+@dataclasses.dataclass
+class SnowflakeProfilerRequest(GEProfilerRequest):
+    table: SnowflakeTable
+    profile_table_level_only: bool = False
+
+
 class SnowflakeProfiler(SnowflakeCommonMixin):
     def __init__(self, config: SnowflakeV2Config, report: SnowflakeV2Report) -> None:
         self.config = config
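
The new request type is plain dataclass inheritance: each profiling request now carries its SnowflakeTable, so later stages can read row counts and sizes without re-querying the warehouse. A minimal runnable sketch of the pattern, using stand-in types rather than the real DataHub classes:

import dataclasses
from typing import Dict, Optional

# Stand-ins for GEProfilerRequest and SnowflakeTable; field names mirror
# the diff above, but these are not the real DataHub classes.
@dataclasses.dataclass
class GEProfilerRequest:
    pretty_name: str
    batch_kwargs: Dict[str, str]

@dataclasses.dataclass
class SnowflakeTable:
    name: str
    rows_count: int
    size_in_bytes: Optional[int]

@dataclasses.dataclass
class SnowflakeProfilerRequest(GEProfilerRequest):
    # Subclass fields are appended after the base fields; `table` may go
    # without a default only because the base class has no defaulted fields.
    table: SnowflakeTable
    profile_table_level_only: bool = False

events = SnowflakeTable("events", rows_count=10**9, size_in_bytes=2 * 10**12)
request = SnowflakeProfilerRequest(
    pretty_name="db.schema.events",
    batch_kwargs={"schema": "schema", "table": "events"},
    table=events,
    profile_table_level_only=True,  # too large for column-level profiling
)
print(request.table.rows_count)  # table metadata travels with the request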
@@ -31,12 +39,6 @@ class SnowflakeProfiler(SnowflakeCommonMixin):

     def get_workunits(self, databases: List[SnowflakeDatabase]) -> Iterable[WorkUnit]:
-        # If only table level profiling is enabled, report table profile and exit
-        if self.config.profiling.profile_table_level_only:
-            yield from self.get_table_level_profile_workunits(databases)
-            return
-
         # Extra default SQLAlchemy option for better connection pooling and threading.
         # https://docs.sqlalchemy.org/en/14/core/pooling.html#sqlalchemy.pool.QueuePool.params.max_overflow
         if self.config.profiling.enabled:
@@ -55,7 +57,7 @@ class SnowflakeProfiler(SnowflakeCommonMixin):
                 for table in schema.tables:
                     # Emit the profile work unit
-                    profile_request = self.get_ge_profile_request(
+                    profile_request = self.get_snowflake_profile_request(
                         table, schema.name, db.name
                     )
                     if profile_request is not None:
@@ -63,13 +65,14 @@ class SnowflakeProfiler(SnowflakeCommonMixin):
             if len(profile_requests) == 0:
                 continue

-            ge_profiler = self.get_profiler_instance(db.name)
-            for request, profile in ge_profiler.generate_profiles(
+            for request, profile in self.generate_profiles(
+                db.name,
                 profile_requests,
                 self.config.profiling.max_workers,
                 platform=self.platform,
                 profiler_args=self.get_profile_args(),
             ):
                 if profile is None:
                     continue
+                # Populate size only after the None check; attaching it first
+                # would crash whenever GE yields no profile for a request.
+                profile.sizeInBytes = request.table.size_in_bytes  # type:ignore
                 dataset_name = request.pretty_name
@@ -86,68 +89,26 @@ class SnowflakeProfiler(SnowflakeCommonMixin):
                     profile,
                 )

-    def get_table_level_profile_workunits(
-        self, databases: List[SnowflakeDatabase]
-    ) -> Iterable[WorkUnit]:
-        for db in databases:
-            if not self.config.database_pattern.allowed(db.name):
-                continue
-            for schema in db.schemas:
-                if not self.config.schema_pattern.allowed(schema.name):
-                    continue
-                for table in schema.tables:
-                    dataset_name = self.get_dataset_identifier(
-                        table.name, schema.name, db.name
-                    )
-                    skip_profiling = False
-                    # no need to filter by size_in_bytes and row_count limits
-                    # if profiling only at table level, since it is not expensive
-                    if not self.is_dataset_eligible_for_profiling(
-                        dataset_name,
-                        table.last_altered,
-                        0,
-                        0,
-                    ):
-                        skip_profiling = True
-                    if skip_profiling:
-                        if self.config.profiling.report_dropped_profiles:
-                            self.report.report_dropped(f"profile of {dataset_name}")
-                        return None
-                    self.report.report_entity_profiled(dataset_name)
-                    dataset_urn = make_dataset_urn_with_platform_instance(
-                        self.platform,
-                        dataset_name,
-                        self.config.platform_instance,
-                        self.config.env,
-                    )
-                    yield self.wrap_aspect_as_workunit(
-                        "dataset",
-                        dataset_urn,
-                        "datasetProfile",
-                        DatasetProfile(
-                            timestampMillis=round(
-                                datetime.datetime.now().timestamp() * 1000
-                            ),
-                            columnCount=len(table.columns),
-                            rowCount=table.rows_count,
-                        ),
-                    )
-
-    def get_ge_profile_request(
+    def get_snowflake_profile_request(
         self,
         table: SnowflakeTable,
         schema_name: str,
         db_name: str,
-    ) -> Optional[GEProfilerRequest]:
+    ) -> Optional[SnowflakeProfilerRequest]:
         skip_profiling = False
+        profile_table_level_only = self.config.profiling.profile_table_level_only
         dataset_name = self.get_dataset_identifier(table.name, schema_name, db_name)
         if not self.is_dataset_eligible_for_profiling(
             dataset_name, table.last_altered, table.size_in_bytes, table.rows_count
         ):
-            skip_profiling = True
+            # Profile only at table level if the dataset is filtered from
+            # profiling due to size limits alone
+            if self.is_dataset_eligible_for_profiling(
+                dataset_name, table.last_altered, 0, 0
+            ):
+                profile_table_level_only = True
+            else:
+                skip_profiling = True

         if len(table.columns) == 0:
             skip_profiling = True
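
The two calls to is_dataset_eligible_for_profiling are the crux of the change: passing 0 for both size and row count makes the limit clauses pass trivially, so the second call asks "would this table be eligible if the size limits did not exist?". A simplified, self-contained sketch of that decision, with hypothetical limits and a stand-in filter rather than the real mixin method:

from typing import Optional

SIZE_LIMIT = 5 * 1024**3   # hypothetical 5 GB limit
ROW_LIMIT = 100_000_000    # hypothetical row limit

def eligible(name: str, size_in_bytes: Optional[int], rows: Optional[int]) -> bool:
    # Simplified stand-in for is_dataset_eligible_for_profiling.
    if name.startswith("tmp_"):   # stand-in for pattern/recency filters
        return False
    if size_in_bytes and size_in_bytes > SIZE_LIMIT:
        return False
    if rows and rows > ROW_LIMIT:
        return False
    return True

def decide(name: str, size_in_bytes: Optional[int], rows: Optional[int]) -> str:
    if not eligible(name, size_in_bytes, rows):
        # Re-check with 0/0: the limit clauses now pass trivially, which
        # isolates whether size/rows alone caused the rejection.
        if eligible(name, 0, 0):
            return "table-level profile only"
        return "skip profiling"
    return "full column profiling"

print(decide("events", 10 * 1024**3, None))   # -> table-level profile only
print(decide("tmp_scratch", 10, 10))          # -> skip profiling
print(decide("users", 1024**2, 10_000))       # -> full column profiling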
@@ -159,9 +120,11 @@ class SnowflakeProfiler(SnowflakeCommonMixin):
         self.report.report_entity_profiled(dataset_name)
         logger.debug(f"Preparing profiling request for {dataset_name}")
-        profile_request = GEProfilerRequest(
+        profile_request = SnowflakeProfilerRequest(
             pretty_name=dataset_name,
             batch_kwargs=dict(schema=schema_name, table=table.name),
+            table=table,
+            profile_table_level_only=profile_table_level_only,
         )
         return profile_request
@@ -237,3 +200,37 @@ class SnowflakeProfiler(SnowflakeCommonMixin):
                 return conn

         return get_db_connection
+
+    def generate_profiles(
+        self,
+        db_name: str,
+        requests: List[SnowflakeProfilerRequest],
+        max_workers: int,
+        platform: Optional[str] = None,
+        profiler_args: Optional[Dict] = None,
+    ) -> Iterable[Tuple[GEProfilerRequest, Optional[DatasetProfileClass]]]:
+        # Partition the requests: table-level-only profiles can be answered
+        # from metadata already in hand; the rest go through the GE profiler.
+        ge_profile_requests: List[GEProfilerRequest] = [
+            cast(GEProfilerRequest, request)
+            for request in requests
+            if not request.profile_table_level_only
+        ]
+        table_level_profile_requests: List[SnowflakeProfilerRequest] = [
+            request for request in requests if request.profile_table_level_only
+        ]
+        for request in table_level_profile_requests:
+            profile = DatasetProfile(
+                timestampMillis=round(datetime.datetime.now().timestamp() * 1000),
+                columnCount=len(request.table.columns),
+                rowCount=request.table.rows_count,
+                sizeInBytes=request.table.size_in_bytes,
+            )
+            yield (request, profile)
+
+        if len(ge_profile_requests) == 0:
+            return
+
+        ge_profiler = self.get_profiler_instance(db_name)
+        yield from ge_profiler.generate_profiles(
+            ge_profile_requests, max_workers, platform, profiler_args
+        )
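
Taken as a whole, generate_profiles is a partition-and-merge generator: requests that only need table-level numbers are answered inline from metadata Snowflake has already returned, and only the remainder pays for a GE profiling run. A self-contained sketch of that shape, with stand-in types and a fake expensive profiler (hypothetical names, not the DataHub API):

import datetime
from dataclasses import dataclass
from typing import Iterable, List, Optional, Tuple

@dataclass
class Request:                 # stand-in for SnowflakeProfilerRequest
    name: str
    table_level_only: bool
    rows: int
    size: Optional[int]

@dataclass
class Profile:                 # stand-in for DatasetProfile
    timestampMillis: int
    rowCount: int
    sizeInBytes: Optional[int] = None

def now_millis() -> int:
    return round(datetime.datetime.now().timestamp() * 1000)

def ge_generate_profiles(reqs: List[Request]) -> Iterable[Tuple[Request, Profile]]:
    # Stand-in for the GE profiler: imagine each request runs SQL here.
    for r in reqs:
        yield r, Profile(now_millis(), r.rows, r.size)

def generate_profiles(reqs: List[Request]) -> Iterable[Tuple[Request, Optional[Profile]]]:
    cheap = [r for r in reqs if r.table_level_only]
    costly = [r for r in reqs if not r.table_level_only]
    for r in cheap:
        # Answered from already-fetched metadata: no warehouse queries.
        yield r, Profile(now_millis(), r.rows, r.size)
    if costly:
        yield from ge_generate_profiles(costly)

for req, prof in generate_profiles(
    [Request("huge", True, 10**9, 10**12), Request("small", False, 10, 4096)]
):
    print(req.name, prof.rowCount if prof else None)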