feat(ingest/snowflake): improve accuracy of computed sample-based pro… (#9600)

2025-09-12 10:41:31 +00:00 · 2024-01-10 23:29:09 +05:30 · 2024-01-10 23:29:09 +05:30 · af866eaf95
commit af866eaf95
parent 3f9b90158f
1 changed files with 6 additions and 8 deletions
--- a/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py
@@ -680,14 +680,12 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
        assert profile.rowCount is not None
        row_count: int  # used for null counts calculation
        if profile.partitionSpec and "SAMPLE" in profile.partitionSpec.partition:
-            # We can alternatively use `self._get_dataset_rows(profile)` to get
+            # Querying exact row count of sample using `_get_dataset_rows`.
-            # exact count of rows in sample, as actual rows involved in sample
+            # We are not using `self.config.sample_size` directly as actual row count
-            # may be slightly different (more or less) than configured `sample_size`.
+            # in sample may be slightly different (more or less) than configured `sample_size`.
-            # However not doing so to start with, as that adds another query overhead
+            self._get_dataset_rows(profile)
-            # plus approximate metrics should work for sampling based profiling.
+
-            row_count = self.config.sample_size
+        row_count = profile.rowCount
-        else:
-            row_count = profile.rowCount
        for column_spec in columns_profiling_queue:
            column = column_spec.column