fix(ingest/bigquery): use correct row count in null count profiling calculation (#9123)

Co-authored-by: Harshal Sheth <hsheth2@gmail.com>
Co-authored-by: Aseem Bansal <asmbansal2@gmail.com>
This commit is contained in:
Mayuri Nehate 2023-11-02 10:05:24 +05:30 committed by GitHub
parent bab9d1c931
commit 12b41713b4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -659,7 +659,16 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
self.query_combiner.flush()
assert profile.rowCount is not None
row_count: int = profile.rowCount
row_count: int # used for null counts calculation
if profile.partitionSpec and "SAMPLE" in profile.partitionSpec.partition:
# We could alternatively call `self._get_dataset_rows(profile)` to get the
# exact number of rows in the sample, since the actual number of sampled rows
# may differ slightly (more or less) from the configured `sample_size`.
# We are not doing so for now, because it adds the overhead of another query,
# and approximate metrics should be sufficient for sampling-based profiling.
row_count = self.config.sample_size
else:
row_count = profile.rowCount
for column_spec in columns_profiling_queue:
column = column_spec.column
@ -811,7 +820,7 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
sample_pc = 100 * self.config.sample_size / profile.rowCount
sql = (
f"SELECT * FROM {str(self.dataset._table)} "
+ f"TABLESAMPLE SYSTEM ({sample_pc:.3f} percent)"
+ f"TABLESAMPLE SYSTEM ({sample_pc:.8f} percent)"
)
temp_table_name = create_bigquery_temp_table(
self,