diff --git a/ingestion/src/metadata/interfaces/sqa_interface.py b/ingestion/src/metadata/interfaces/sqa_interface.py index c40d78a64b6..ab0aadcd796 100644 --- a/ingestion/src/metadata/interfaces/sqa_interface.py +++ b/ingestion/src/metadata/interfaces/sqa_interface.py @@ -426,7 +426,7 @@ def get_table_metrics( dictionnary of results """ try: - row = runner.select_first_from_table(*[metric().fn() for metric in metrics]) + row = runner.select_first_from_sample(*[metric().fn() for metric in metrics]) if row: return dict(row) diff --git a/ingestion/src/metadata/orm_profiler/orm/functions/random_num.py b/ingestion/src/metadata/orm_profiler/orm/functions/random_num.py index ccd988e2332..4efe6033492 100644 --- a/ingestion/src/metadata/orm_profiler/orm/functions/random_num.py +++ b/ingestion/src/metadata/orm_profiler/orm/functions/random_num.py @@ -89,3 +89,12 @@ def _(*_, **__): def _(*_, **__): """Oracle random logic""" return "ABS(DBMS_RANDOM.VALUE) * 100" + + +@compiles(RandomNumFn, Dialects.Snowflake) +def _(*_, **__): + """We use FROM SAMPLE SYSTEM (n) for sampling + in snowflake. We'll return 0 to make sure we get all the rows + from the already sampled results when executing row::MOD(0, 100) < profile_sample. + """ + return "0" diff --git a/ingestion/src/metadata/orm_profiler/profiler/sampler.py b/ingestion/src/metadata/orm_profiler/profiler/sampler.py index 1794f104ed6..c0a267a93e2 100644 --- a/ingestion/src/metadata/orm_profiler/profiler/sampler.py +++ b/ingestion/src/metadata/orm_profiler/profiler/sampler.py @@ -21,6 +21,7 @@ from sqlalchemy.orm.util import AliasedClass from metadata.generated.schema.entity.data.table import TableData from metadata.orm_profiler.orm.functions.modulo import ModuloFn from metadata.orm_profiler.orm.functions.random_num import RandomNumFn +from metadata.orm_profiler.orm.registry import Dialects from metadata.orm_profiler.profiler.handle_partition import partition_filter_handler RANDOM_LABEL = "random" @@ -50,9 +51,15 @@ class Sampler: @partition_filter_handler(build_sample=True) def get_sample_query(self) -> Query: - return self.session.query( - self.table, (ModuloFn(RandomNumFn(), 100)).label(RANDOM_LABEL) - ).cte(f"{self.table.__tablename__}_rnd") + return ( + self.session.query( + self.table, (ModuloFn(RandomNumFn(), 100)).label(RANDOM_LABEL) + ) + .suffix_with( + f"SAMPLE SYSTEM ({self.profile_sample})", dialect=Dialects.Snowflake + ) + .cte(f"{self.table.__tablename__}_rnd") + ) def random_sample(self) -> Union[DeclarativeMeta, AliasedClass]: """