mirror of
https://github.com/datahub-project/datahub.git
synced 2025-07-22 17:10:30 +00:00
feat(ingest/snowflake): improve accuracy of computed sample-based pro… (#9600)
This commit is contained in:
parent
3f9b90158f
commit
af866eaf95
@@ -680,14 +680,12 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
|
||||
assert profile.rowCount is not None
|
||||
row_count: int # used for null counts calculation
|
||||
if profile.partitionSpec and "SAMPLE" in profile.partitionSpec.partition:
|
||||
# We can alternatively use `self._get_dataset_rows(profile)` to get
|
||||
# exact count of rows in sample, as actual rows involved in sample
|
||||
# may be slightly different (more or less) than configured `sample_size`.
|
||||
# However not doing so to start with, as that adds another query overhead
|
||||
# plus approximate metrics should work for sampling based profiling.
|
||||
row_count = self.config.sample_size
|
||||
else:
|
||||
row_count = profile.rowCount
|
||||
# Querying exact row count of sample using `_get_dataset_rows`.
|
||||
# We are not using `self.config.sample_size` directly as actual row count
|
||||
# in sample may be slightly different (more or less) than configured `sample_size`.
|
||||
self._get_dataset_rows(profile)
|
||||
|
||||
row_count = profile.rowCount
|
||||
|
||||
for column_spec in columns_profiling_queue:
|
||||
column = column_spec.column
|
||||
|
Loading…
x
Reference in New Issue
Block a user