fix(ingest/profiling): ignore limit/offset when counting table row count in profiling (#9635)

This commit is contained in:
Tamas Nemeth 2024-02-13 01:06:29 +01:00 committed by GitHub
parent a4e5ed82fc
commit 89018e26fd
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -419,7 +419,22 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
self.dataset.engine.execute(get_estimate_script).scalar()
)
else:
dataset_profile.rowCount = self.dataset.get_row_count()
# If the configuration is not set to 'estimate only' mode, we directly obtain the row count from the dataset.
# However, if an offset or limit is set, we need to adjust how we calculate the row count.
# This is because applying a limit or offset could potentially skew the row count.
# For instance, if a limit is set and the actual row count exceeds this limit,
# the returned row count would incorrectly be the limit value.
#
# To address this, if a limit or offset is set, we use the original table name when calculating the row count.
# This ensures that the row count is based on the original table, not on a view which has limit or offset applied.
if (self.config.limit or self.config.offset) and not self.custom_sql:
# We don't want limit and offset to get applied to the row count.
# This is a kinda hacky way to do it, but every other way would require major refactoring.
dataset_profile.rowCount = self.dataset.get_row_count(
self.dataset_name.split(".")[-1]
)
else:
dataset_profile.rowCount = self.dataset.get_row_count()
@_run_with_query_combiner
def _get_dataset_column_min(