feat(ingest/snowflake): improve accuracy of computed sample-based profiles (#9600)

This commit is contained in:
Mayuri Nehate 2024-01-10 23:29:09 +05:30 committed by GitHub
parent 3f9b90158f
commit af866eaf95
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@@ -680,14 +680,12 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
assert profile.rowCount is not None
row_count: int # used for null counts calculation
if profile.partitionSpec and "SAMPLE" in profile.partitionSpec.partition:
# We can alternatively use `self._get_dataset_rows(profile)` to get
# exact count of rows in sample, as actual rows involved in sample
# may be slightly different (more or less) than configured `sample_size`.
# However not doing so to start with, as that adds another query overhead
# plus approximate metrics should work for sampling based profiling.
row_count = self.config.sample_size
else:
row_count = profile.rowCount
# Querying exact row count of sample using `_get_dataset_rows`.
# We are not using `self.config.sample_size` directly as actual row count
# in sample may be slightly different (more or less) than configured `sample_size`.
self._get_dataset_rows(profile)
row_count = profile.rowCount
for column_spec in columns_profiling_queue:
column = column_spec.column