diff --git a/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py b/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py index 4f1ad00b1e..91f7c2a140 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py +++ b/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py @@ -680,14 +680,12 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase): assert profile.rowCount is not None row_count: int # used for null counts calculation if profile.partitionSpec and "SAMPLE" in profile.partitionSpec.partition: - # We can alternatively use `self._get_dataset_rows(profile)` to get - # exact count of rows in sample, as actual rows involved in sample - # may be slightly different (more or less) than configured `sample_size`. - # However not doing so to start with, as that adds another query overhead - # plus approximate metrics should work for sampling based profiling. - row_count = self.config.sample_size - else: - row_count = profile.rowCount + # Querying exact row count of sample using `_get_dataset_rows`. + # We are not using `self.config.sample_size` directly as actual row count + # in sample may be slightly different (more or less) than configured `sample_size`. + self._get_dataset_rows(profile) + + row_count = profile.rowCount for column_spec in columns_profiling_queue: column = column_spec.column