mirror of
https://github.com/datahub-project/datahub.git
synced 2025-07-23 17:39:59 +00:00
feat(ingest/snowflake): improve accuracy of computed sample-based pro… (#9600)
This commit is contained in:
parent
3f9b90158f
commit
af866eaf95
@@ -680,14 +680,12 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
|
|||||||
assert profile.rowCount is not None
|
assert profile.rowCount is not None
|
||||||
row_count: int # used for null counts calculation
|
row_count: int # used for null counts calculation
|
||||||
if profile.partitionSpec and "SAMPLE" in profile.partitionSpec.partition:
|
if profile.partitionSpec and "SAMPLE" in profile.partitionSpec.partition:
|
||||||
# We can alternatively use `self._get_dataset_rows(profile)` to get
|
# Querying exact row count of sample using `_get_dataset_rows`.
|
||||||
# exact count of rows in sample, as actual rows involved in sample
|
# We are not using `self.config.sample_size` directly as actual row count
|
||||||
# may be slightly different (more or less) than configured `sample_size`.
|
# in sample may be slightly different (more or less) than configured `sample_size`.
|
||||||
# However not doing so to start with, as that adds another query overhead
|
self._get_dataset_rows(profile)
|
||||||
# plus approximate metrics should work for sampling based profiling.
|
|
||||||
row_count = self.config.sample_size
|
row_count = profile.rowCount
|
||||||
else:
|
|
||||||
row_count = profile.rowCount
|
|
||||||
|
|
||||||
for column_spec in columns_profiling_queue:
|
for column_spec in columns_profiling_queue:
|
||||||
column = column_spec.column
|
column = column_spec.column
|
||||||
|
Loading…
x
Reference in New Issue
Block a user