mirror of
https://github.com/datahub-project/datahub.git
synced 2025-11-01 19:25:56 +00:00
fix(ingest/bigquery): use correct row count in null count profiling calculation (#9123)
Co-authored-by: Harshal Sheth <hsheth2@gmail.com> Co-authored-by: Aseem Bansal <asmbansal2@gmail.com>
This commit is contained in:
parent
bab9d1c931
commit
12b41713b4
@@ -659,7 +659,16 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
|
||||
self.query_combiner.flush()
|
||||
|
||||
assert profile.rowCount is not None
|
||||
row_count: int = profile.rowCount
|
||||
row_count: int # used for null counts calculation
|
||||
if profile.partitionSpec and "SAMPLE" in profile.partitionSpec.partition:
|
||||
# We can alternatively use `self._get_dataset_rows(profile)` to get
|
||||
# exact count of rows in sample, as actual rows involved in sample
|
||||
# may be slightly different (more or less) than configured `sample_size`.
|
||||
# However not doing so to start with, as that adds another query overhead
|
||||
# plus approximate metrics should work for sampling based profiling.
|
||||
row_count = self.config.sample_size
|
||||
else:
|
||||
row_count = profile.rowCount
|
||||
|
||||
for column_spec in columns_profiling_queue:
|
||||
column = column_spec.column
|
||||
@@ -811,7 +820,7 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
|
||||
sample_pc = 100 * self.config.sample_size / profile.rowCount
|
||||
sql = (
|
||||
f"SELECT * FROM {str(self.dataset._table)} "
|
||||
+ f"TABLESAMPLE SYSTEM ({sample_pc:.3f} percent)"
|
||||
+ f"TABLESAMPLE SYSTEM ({sample_pc:.8f} percent)"
|
||||
)
|
||||
temp_table_name = create_bigquery_temp_table(
|
||||
self,
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user