From af866eaf955443f837f30d49b6681b68a8cbcb51 Mon Sep 17 00:00:00 2001 From: Mayuri Nehate <33225191+mayurinehate@users.noreply.github.com> Date: Wed, 10 Jan 2024 23:29:09 +0530 Subject: [PATCH] =?UTF-8?q?feat(ingest/snowflake):=20improve=20accuracy=20?= =?UTF-8?q?of=20computed=20sample-based=20pro=E2=80=A6=20(#9600)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../datahub/ingestion/source/ge_data_profiler.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py b/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py index 4f1ad00b1e..91f7c2a140 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py +++ b/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py @@ -680,14 +680,12 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase): assert profile.rowCount is not None row_count: int # used for null counts calculation if profile.partitionSpec and "SAMPLE" in profile.partitionSpec.partition: - # We can alternatively use `self._get_dataset_rows(profile)` to get - # exact count of rows in sample, as actual rows involved in sample - # may be slightly different (more or less) than configured `sample_size`. - # However not doing so to start with, as that adds another query overhead - # plus approximate metrics should work for sampling based profiling. - row_count = self.config.sample_size - else: - row_count = profile.rowCount + # Querying exact row count of sample using `_get_dataset_rows`. + # We are not using `self.config.sample_size` directly as actual row count + # in sample may be slightly different (more or less) than configured `sample_size`. + self._get_dataset_rows(profile) + + row_count = profile.rowCount for column_spec in columns_profiling_queue: column = column_spec.column