Fixes #7422 - Implement FROM table SAMPLE ... for Snowflake sampling (#7555)

* Updated sampling for snowflake

* Fixed python style
This commit is contained in:
Teddy 2022-09-20 08:55:39 +02:00 committed by GitHub
parent e22036cc09
commit bdfdc50a5a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 20 additions and 4 deletions

View File

@ -426,7 +426,7 @@ def get_table_metrics(
dictionary of results
"""
try:
row = runner.select_first_from_table(*[metric().fn() for metric in metrics])
row = runner.select_first_from_sample(*[metric().fn() for metric in metrics])
if row:
return dict(row)

View File

@ -89,3 +89,12 @@ def _(*_, **__):
def _(*_, **__):
"""Oracle random logic"""
return "ABS(DBMS_RANDOM.VALUE) * 100"
@compiles(RandomNumFn, Dialects.Snowflake)
def _(*_, **__):
"""Snowflake random logic.

Sampling on Snowflake is delegated to the `FROM <table> SAMPLE SYSTEM (n)`
clause, so no per-row random number is generated here. Returning the
constant 0 makes the modulo expression evaluate to MOD(0, 100) = 0 for
every row, so all rows of the already-sampled result are kept when that
value is compared against profile_sample.
NOTE(review): the `< profile_sample` comparison is not visible in this
function; this assumes profile_sample > 0 -- confirm against the sampler.
"""
return "0"

View File

@ -21,6 +21,7 @@ from sqlalchemy.orm.util import AliasedClass
from metadata.generated.schema.entity.data.table import TableData
from metadata.orm_profiler.orm.functions.modulo import ModuloFn
from metadata.orm_profiler.orm.functions.random_num import RandomNumFn
from metadata.orm_profiler.orm.registry import Dialects
from metadata.orm_profiler.profiler.handle_partition import partition_filter_handler
RANDOM_LABEL = "random"
@ -50,9 +51,15 @@ class Sampler:
@partition_filter_handler(build_sample=True)
def get_sample_query(self) -> Query:
# Builds the CTE used as the sampling source: the columns of self.table plus
# a ModuloFn(RandomNumFn(), 100) column labelled RANDOM_LABEL; the CTE is
# named "<tablename>_rnd".
# NOTE(review): two return statements appear because this is a diff view
# (hunk "-50,9 +51,15"); the first looks like the removed version and the
# second like its replacement -- confirm against the actual file.
return self.session.query(
self.table, (ModuloFn(RandomNumFn(), 100)).label(RANDOM_LABEL)
).cte(f"{self.table.__tablename__}_rnd")
return (
self.session.query(
self.table, (ModuloFn(RandomNumFn(), 100)).label(RANDOM_LABEL)
)
# The dialect argument limits this suffix to the Snowflake dialect; it
# adds "SAMPLE SYSTEM (<profile_sample>)" to the generated statement so
# the sampling is done by the database itself.
.suffix_with(
f"SAMPLE SYSTEM ({self.profile_sample})", dialect=Dialects.Snowflake
)
.cte(f"{self.table.__tablename__}_rnd")
)
def random_sample(self) -> Union[DeclarativeMeta, AliasedClass]:
"""