IceS2 06c7d82101
Improve dimensionality performance (#24488)
* Fix Bigquery Dimensionality Issue + Refactor

* Remove comment

* Improve Dimensionality Code and Changed Median to use Approx_Quantile for Snowflake

* Remove commented method

* Improve statistical validator failed row count strategy
2025-11-21 18:07:31 +01:00

129 lines
4.4 KiB
Python

# Copyright 2025 Collate
# Licensed under the Collate Community License, Version 1.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# https://github.com/open-metadata/OpenMetadata/blob/main/ingestion/LICENSE
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Validator for column value mean to be between test case
"""
from typing import List, Optional
from sqlalchemy import Column
from metadata.data_quality.validations.base_test_handler import (
DIMENSION_TOTAL_COUNT_KEY,
)
from metadata.data_quality.validations.column.base.columnValueMeanToBeBetween import (
BaseColumnValueMeanToBeBetweenValidator,
)
from metadata.data_quality.validations.mixins.sqa_validator_mixin import (
SQAValidatorMixin,
)
from metadata.generated.schema.tests.dimensionResult import DimensionResult
from metadata.profiler.metrics.registry import Metrics
from metadata.utils.logger import test_suite_logger
# Module-level logger for data-quality test-suite messages
logger = test_suite_logger()
class ColumnValueMeanToBeBetweenValidator(
    BaseColumnValueMeanToBeBetweenValidator, SQAValidatorMixin
):
    """Validator for column value mean to be between test case"""

    def _run_results(self, metric: Metrics, column: Column) -> Optional[int]:
        """Compute the metric value for the test case.

        Args:
            metric: metric to compute (e.g. Metrics.MEAN)
            column: column the metric is computed over

        Returns:
            Optional[int]: computed metric value, or None when unavailable
        """
        return self.run_query_results(self.runner, metric, column)

    def _execute_dimensional_validation(
        self,
        column: Column,
        dimension_col: Column,
        metrics_to_compute: dict,
        test_params: dict,
    ) -> List[DimensionResult]:
        """Execute dimensional validation for mean with proper weighted aggregation

        Uses the statistical aggregation helper to:
        1. Compute raw metrics (sum, count, mean) per dimension
        2. Calculate impact score based on whether mean is within bounds
        3. Aggregate "Others" using weighted mean: SUM(sums) / SUM(counts)

        Args:
            column: The column being validated
            dimension_col: The dimension column to group by
            metrics_to_compute: Dict mapping metric names to Metrics enums
                (currently unused here; the expressions are built directly)
            test_params: Test parameters (min/max bounds)

        Returns:
            List[DimensionResult]: Top N dimensions plus "Others"
        """
        dimension_results = []
        try:
            # Per-dimension SQL expressions: total row count plus the mean
            # of the validated column.
            row_count_expr = Metrics.ROW_COUNT().fn()
            mean_expr = Metrics.MEAN(column).fn()
            metric_expressions = {
                DIMENSION_TOTAL_COUNT_KEY: row_count_expr,
                Metrics.MEAN.name: mean_expr,
            }

            # PEP 8 (E731): a def instead of a named lambda. The parameter is
            # renamed so it no longer shadows the outer ``row_count_expr``.
            def failed_count_builder(cte, cte_row_count_expr):
                """Build the failed-row-count expression over the dimension CTE."""
                checker = self._get_validation_checker(test_params)
                return checker.build_agg_level_violation_sqa(
                    [getattr(cte.c, Metrics.MEAN.name)], cte_row_count_expr
                )

            normalized_dimension = self._get_normalized_dimension_expression(
                dimension_col
            )
            result_rows = self._run_dimensional_validation_query(
                source=self.runner.dataset,
                dimension_expr=normalized_dimension,
                metric_expressions=metric_expressions,
                failed_count_builder=failed_count_builder,
            )

            for row in result_rows:
                mean_value = row.get(Metrics.MEAN.name)
                if mean_value is None:
                    # Skip dimension groups with no computable mean
                    # (e.g. all-NULL column values in that group).
                    continue
                metric_values = {
                    Metrics.MEAN.name: mean_value,
                }
                evaluation = self._evaluate_test_condition(metric_values, test_params)
                dimension_result = self._create_dimension_result(
                    row,
                    dimension_col.name,
                    metric_values,
                    evaluation,
                    test_params,
                )
                dimension_results.append(dimension_result)
        except Exception as exc:
            # Best-effort: log the failure and return whatever results
            # were gathered before the error.
            logger.warning(f"Error executing dimensional query: {exc}")
            logger.debug("Full error details: ", exc_info=True)
        return dimension_results