From af866eaf955443f837f30d49b6681b68a8cbcb51 Mon Sep 17 00:00:00 2001
From: Mayuri Nehate <33225191+mayurinehate@users.noreply.github.com>
Date: Wed, 10 Jan 2024 23:29:09 +0530
Subject: [PATCH] =?UTF-8?q?feat(ingest/snowflake):=20improve=20accuracy=20?=
 =?UTF-8?q?of=20computed=20sample-based=20pro=E2=80=A6=20(#9600)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../datahub/ingestion/source/ge_data_profiler.py   | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py b/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py
index 4f1ad00b1e..91f7c2a140 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py
@@ -680,14 +680,12 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
         assert profile.rowCount is not None
         row_count: int  # used for null counts calculation
         if profile.partitionSpec and "SAMPLE" in profile.partitionSpec.partition:
-            # We can alternatively use `self._get_dataset_rows(profile)` to get
-            # exact count of rows in sample, as actual rows involved in sample
-            # may be slightly different (more or less) than configured `sample_size`.
-            # However not doing so to start with, as that adds another query overhead
-            # plus approximate metrics should work for sampling based profiling.
-            row_count = self.config.sample_size
-        else:
-            row_count = profile.rowCount
+            # Query the exact row count of the sample using `_get_dataset_rows`.
+            # We do not use `self.config.sample_size` directly because the actual row
+            # count of the sample may differ slightly (more or less) from `sample_size`.
+            self._get_dataset_rows(profile)
+
+        row_count = profile.rowCount
 
         for column_spec in columns_profiling_queue:
             column = column_spec.column