mirror of
https://github.com/datahub-project/datahub.git
synced 2025-08-18 14:16:48 +00:00
feat(ingest): snowflake-beta - populate size, add table level profiles for large tables (#5774)
This commit is contained in:
parent
84b279a933
commit
4bf4236e29
@ -1,4 +1,4 @@
|
|||||||
To get all metadata from Snowflake you need to use two plugins `snowflake` and `snowflake-usage`. Both of them are described in this page. These will require 2 separate recipes.
|
To get all metadata from Snowflake you need to use two plugins `snowflake` and `snowflake-usage`. Both of them are described in this page. These will require 2 separate recipes.
|
||||||
|
|
||||||
|
|
||||||
We encourage you to try out new `snowflake-beta` plugin as alternative to running both `snowflake` and `snowflake-usage` plugins and share feedback. `snowflake-beta` is much faster than `snowflake` for extracting metadata . Please note that, `snowflake-beta` plugin currently does not support column level profiling, unlike `snowflake` plugin.
|
We encourage you to try out new `snowflake-beta` plugin as alternative to running both `snowflake` and `snowflake-usage` plugins and share feedback. `snowflake-beta` is much faster than `snowflake` for extracting metadata .
|
@ -1,6 +1,7 @@
|
|||||||
|
import dataclasses
|
||||||
import datetime
|
import datetime
|
||||||
import logging
|
import logging
|
||||||
from typing import Callable, Dict, Iterable, List, Optional
|
from typing import Callable, Dict, Iterable, List, Optional, Tuple, cast
|
||||||
|
|
||||||
from sqlalchemy import create_engine, inspect
|
from sqlalchemy import create_engine, inspect
|
||||||
|
|
||||||
@ -19,10 +20,17 @@ from datahub.ingestion.source.snowflake.snowflake_schema import (
|
|||||||
)
|
)
|
||||||
from datahub.ingestion.source.snowflake.snowflake_utils import SnowflakeCommonMixin
|
from datahub.ingestion.source.snowflake.snowflake_utils import SnowflakeCommonMixin
|
||||||
from datahub.metadata.com.linkedin.pegasus2avro.dataset import DatasetProfile
|
from datahub.metadata.com.linkedin.pegasus2avro.dataset import DatasetProfile
|
||||||
|
from datahub.metadata.schema_classes import DatasetProfileClass
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclasses.dataclass
|
||||||
|
class SnowflakeProfilerRequest(GEProfilerRequest):
|
||||||
|
table: SnowflakeTable
|
||||||
|
profile_table_level_only: bool = False
|
||||||
|
|
||||||
|
|
||||||
class SnowflakeProfiler(SnowflakeCommonMixin):
|
class SnowflakeProfiler(SnowflakeCommonMixin):
|
||||||
def __init__(self, config: SnowflakeV2Config, report: SnowflakeV2Report) -> None:
|
def __init__(self, config: SnowflakeV2Config, report: SnowflakeV2Report) -> None:
|
||||||
self.config = config
|
self.config = config
|
||||||
@ -31,12 +39,6 @@ class SnowflakeProfiler(SnowflakeCommonMixin):
|
|||||||
|
|
||||||
def get_workunits(self, databases: List[SnowflakeDatabase]) -> Iterable[WorkUnit]:
|
def get_workunits(self, databases: List[SnowflakeDatabase]) -> Iterable[WorkUnit]:
|
||||||
|
|
||||||
# If only table level profiling is enabled, report table profile and exit
|
|
||||||
if self.config.profiling.profile_table_level_only:
|
|
||||||
|
|
||||||
yield from self.get_table_level_profile_workunits(databases)
|
|
||||||
return
|
|
||||||
|
|
||||||
# Extra default SQLAlchemy option for better connection pooling and threading.
|
# Extra default SQLAlchemy option for better connection pooling and threading.
|
||||||
# https://docs.sqlalchemy.org/en/14/core/pooling.html#sqlalchemy.pool.QueuePool.params.max_overflow
|
# https://docs.sqlalchemy.org/en/14/core/pooling.html#sqlalchemy.pool.QueuePool.params.max_overflow
|
||||||
if self.config.profiling.enabled:
|
if self.config.profiling.enabled:
|
||||||
@ -55,7 +57,7 @@ class SnowflakeProfiler(SnowflakeCommonMixin):
|
|||||||
for table in schema.tables:
|
for table in schema.tables:
|
||||||
|
|
||||||
# Emit the profile work unit
|
# Emit the profile work unit
|
||||||
profile_request = self.get_ge_profile_request(
|
profile_request = self.get_snowflake_profile_request(
|
||||||
table, schema.name, db.name
|
table, schema.name, db.name
|
||||||
)
|
)
|
||||||
if profile_request is not None:
|
if profile_request is not None:
|
||||||
@ -63,13 +65,14 @@ class SnowflakeProfiler(SnowflakeCommonMixin):
|
|||||||
|
|
||||||
if len(profile_requests) == 0:
|
if len(profile_requests) == 0:
|
||||||
continue
|
continue
|
||||||
ge_profiler = self.get_profiler_instance(db.name)
|
for request, profile in self.generate_profiles(
|
||||||
for request, profile in ge_profiler.generate_profiles(
|
db.name,
|
||||||
profile_requests,
|
profile_requests,
|
||||||
self.config.profiling.max_workers,
|
self.config.profiling.max_workers,
|
||||||
platform=self.platform,
|
platform=self.platform,
|
||||||
profiler_args=self.get_profile_args(),
|
profiler_args=self.get_profile_args(),
|
||||||
):
|
):
|
||||||
|
profile.sizeInBytes = request.table.size_in_bytes # type:ignore
|
||||||
if profile is None:
|
if profile is None:
|
||||||
continue
|
continue
|
||||||
dataset_name = request.pretty_name
|
dataset_name = request.pretty_name
|
||||||
@ -86,68 +89,26 @@ class SnowflakeProfiler(SnowflakeCommonMixin):
|
|||||||
profile,
|
profile,
|
||||||
)
|
)
|
||||||
|
|
||||||
def get_table_level_profile_workunits(
|
def get_snowflake_profile_request(
|
||||||
self, databases: List[SnowflakeDatabase]
|
|
||||||
) -> Iterable[WorkUnit]:
|
|
||||||
for db in databases:
|
|
||||||
if not self.config.database_pattern.allowed(db.name):
|
|
||||||
continue
|
|
||||||
for schema in db.schemas:
|
|
||||||
if not self.config.schema_pattern.allowed(schema.name):
|
|
||||||
continue
|
|
||||||
for table in schema.tables:
|
|
||||||
dataset_name = self.get_dataset_identifier(
|
|
||||||
table.name, schema.name, db.name
|
|
||||||
)
|
|
||||||
skip_profiling = False
|
|
||||||
# no need to filter by size_in_bytes and row_count limits,
|
|
||||||
# if table level profilin, since its not expensive
|
|
||||||
if not self.is_dataset_eligible_for_profiling(
|
|
||||||
dataset_name,
|
|
||||||
table.last_altered,
|
|
||||||
0,
|
|
||||||
0,
|
|
||||||
):
|
|
||||||
skip_profiling = True
|
|
||||||
|
|
||||||
if skip_profiling:
|
|
||||||
if self.config.profiling.report_dropped_profiles:
|
|
||||||
self.report.report_dropped(f"profile of {dataset_name}")
|
|
||||||
return None
|
|
||||||
|
|
||||||
self.report.report_entity_profiled(dataset_name)
|
|
||||||
|
|
||||||
dataset_urn = make_dataset_urn_with_platform_instance(
|
|
||||||
self.platform,
|
|
||||||
dataset_name,
|
|
||||||
self.config.platform_instance,
|
|
||||||
self.config.env,
|
|
||||||
)
|
|
||||||
yield self.wrap_aspect_as_workunit(
|
|
||||||
"dataset",
|
|
||||||
dataset_urn,
|
|
||||||
"datasetProfile",
|
|
||||||
DatasetProfile(
|
|
||||||
timestampMillis=round(
|
|
||||||
datetime.datetime.now().timestamp() * 1000
|
|
||||||
),
|
|
||||||
columnCount=len(table.columns),
|
|
||||||
rowCount=table.rows_count,
|
|
||||||
),
|
|
||||||
)
|
|
||||||
|
|
||||||
def get_ge_profile_request(
|
|
||||||
self,
|
self,
|
||||||
table: SnowflakeTable,
|
table: SnowflakeTable,
|
||||||
schema_name: str,
|
schema_name: str,
|
||||||
db_name: str,
|
db_name: str,
|
||||||
) -> Optional[GEProfilerRequest]:
|
) -> Optional[SnowflakeProfilerRequest]:
|
||||||
skip_profiling = False
|
skip_profiling = False
|
||||||
|
profile_table_level_only = self.config.profiling.profile_table_level_only
|
||||||
dataset_name = self.get_dataset_identifier(table.name, schema_name, db_name)
|
dataset_name = self.get_dataset_identifier(table.name, schema_name, db_name)
|
||||||
if not self.is_dataset_eligible_for_profiling(
|
if not self.is_dataset_eligible_for_profiling(
|
||||||
dataset_name, table.last_altered, table.size_in_bytes, table.rows_count
|
dataset_name, table.last_altered, table.size_in_bytes, table.rows_count
|
||||||
):
|
):
|
||||||
skip_profiling = True
|
# Profile only table level if dataset is filtered from profiling
|
||||||
|
# due to size limits alone
|
||||||
|
if self.is_dataset_eligible_for_profiling(
|
||||||
|
dataset_name, table.last_altered, 0, 0
|
||||||
|
):
|
||||||
|
profile_table_level_only = True
|
||||||
|
else:
|
||||||
|
skip_profiling = True
|
||||||
|
|
||||||
if len(table.columns) == 0:
|
if len(table.columns) == 0:
|
||||||
skip_profiling = True
|
skip_profiling = True
|
||||||
@ -159,9 +120,11 @@ class SnowflakeProfiler(SnowflakeCommonMixin):
|
|||||||
|
|
||||||
self.report.report_entity_profiled(dataset_name)
|
self.report.report_entity_profiled(dataset_name)
|
||||||
logger.debug(f"Preparing profiling request for {dataset_name}")
|
logger.debug(f"Preparing profiling request for {dataset_name}")
|
||||||
profile_request = GEProfilerRequest(
|
profile_request = SnowflakeProfilerRequest(
|
||||||
pretty_name=dataset_name,
|
pretty_name=dataset_name,
|
||||||
batch_kwargs=dict(schema=schema_name, table=table.name),
|
batch_kwargs=dict(schema=schema_name, table=table.name),
|
||||||
|
table=table,
|
||||||
|
profile_table_level_only=profile_table_level_only,
|
||||||
)
|
)
|
||||||
return profile_request
|
return profile_request
|
||||||
|
|
||||||
@ -237,3 +200,37 @@ class SnowflakeProfiler(SnowflakeCommonMixin):
|
|||||||
return conn
|
return conn
|
||||||
|
|
||||||
return get_db_connection
|
return get_db_connection
|
||||||
|
|
||||||
|
def generate_profiles(
|
||||||
|
self,
|
||||||
|
db_name: str,
|
||||||
|
requests: List[SnowflakeProfilerRequest],
|
||||||
|
max_workers: int,
|
||||||
|
platform: Optional[str] = None,
|
||||||
|
profiler_args: Optional[Dict] = None,
|
||||||
|
) -> Iterable[Tuple[GEProfilerRequest, Optional[DatasetProfileClass]]]:
|
||||||
|
|
||||||
|
ge_profile_requests: List[GEProfilerRequest] = [
|
||||||
|
cast(GEProfilerRequest, request)
|
||||||
|
for request in requests
|
||||||
|
if not request.profile_table_level_only
|
||||||
|
]
|
||||||
|
table_level_profile_requests: List[SnowflakeProfilerRequest] = [
|
||||||
|
request for request in requests if request.profile_table_level_only
|
||||||
|
]
|
||||||
|
for request in table_level_profile_requests:
|
||||||
|
profile = DatasetProfile(
|
||||||
|
timestampMillis=round(datetime.datetime.now().timestamp() * 1000),
|
||||||
|
columnCount=len(request.table.columns),
|
||||||
|
rowCount=request.table.rows_count,
|
||||||
|
sizeInBytes=request.table.size_in_bytes,
|
||||||
|
)
|
||||||
|
yield (request, profile)
|
||||||
|
|
||||||
|
if len(ge_profile_requests) == 0:
|
||||||
|
return
|
||||||
|
|
||||||
|
ge_profiler = self.get_profiler_instance(db_name)
|
||||||
|
yield from ge_profiler.generate_profiles(
|
||||||
|
ge_profile_requests, max_workers, platform, profiler_args
|
||||||
|
)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user