IceS2 4e398d003b
Feature/dimensionality column values to be not null (#24211)
* Initial implementation for Dimensionality on Data Quality Tests

* Fix ColumnValuesToBeUnique and create TestCaseResult API

* Refactor dimension result

* Initial E2E Implementation without Impact Score

* Dimensionality Thin Slice

* Update generated TypeScript types

* Update generated TypeScript types

* Removed useless method to use the one we already had

* Fix Pandas Dimensionality checks

* Remove useless comments

* Implement PR comments, fix Tests

* Improve the code a bit

* Fix imports

* Implement Dimensionality for ColumnMeanToBeBetween

* Removed useless comments and improved minor things

* Implement UnitTests

* Fixes

* Moved import pandas to type checking

* Fix Min/Max being optional

* Fix Unittests

* small fixes

* Fix Unittests

* Fix Issue with counting total rows on mean

* Improve code

* Fix Merge

* Removed unused type

* Refactor to reduce code repetition and complexity

* Fix conflict

* Rename method

* Refactor some metrics

* Implement Dimensionality to ColumnLengthToBeBetween

* Implement Dimensionality for ColumnMedianToBeBetween in Pandas

* Implement Median Dimensionality for SQL

* Add database tests

* Fix median metric

* Implement Dimensionality SumToBeBetween

* Implement dimensionality for Column Values not In Set

* Implement Dimensionality for ColumnValuesToMatchRegex and ColumnValuesToNotMatchRegex

* Implement NotNull and MissingCount dimensionality

* Fix test

---------

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
2025-11-07 14:44:58 +01:00


# Copyright 2025 Collate
# Licensed under the Collate Community License, Version 1.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# https://github.com/open-metadata/OpenMetadata/blob/main/ingestion/LICENSE
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Validator for column value missing count to be equal test case
"""
from collections import defaultdict
from typing import List, Optional, cast

import pandas as pd

from metadata.data_quality.validations.base_test_handler import (
    DIMENSION_FAILED_COUNT_KEY,
    DIMENSION_TOTAL_COUNT_KEY,
    DIMENSION_VALUE_KEY,
)
from metadata.data_quality.validations.column.base.columnValuesMissingCount import (
    BaseColumnValuesMissingCountValidator,
)
from metadata.data_quality.validations.impact_score import (
    DEFAULT_TOP_DIMENSIONS,
    calculate_impact_score_pandas,
)
from metadata.data_quality.validations.mixins.pandas_validator_mixin import (
    PandasValidatorMixin,
    aggregate_others_statistical_pandas,
)
from metadata.generated.schema.tests.dimensionResult import DimensionResult
from metadata.profiler.metrics.core import add_props
from metadata.profiler.metrics.registry import Metrics
from metadata.utils.logger import test_suite_logger
from metadata.utils.sqa_like_column import SQALikeColumn

logger = test_suite_logger()

class ColumnValuesMissingCountValidator(
    BaseColumnValuesMissingCountValidator, PandasValidatorMixin
):
    """Validator for column value missing count to be equal test case"""

    def _run_results(
        self, metric: Metrics, column: SQALikeColumn, **kwargs
    ) -> Optional[int]:
        """Compute the result of the test case

        Args:
            metric: metric
            column: column
        """
        return self.run_dataframe_results(self.runner, metric, column, **kwargs)

    def _execute_dimensional_validation(
        self,
        column: SQALikeColumn,
        dimension_col: SQALikeColumn,
        metrics_to_compute: dict,
        test_params: dict,
    ) -> List[DimensionResult]:
        """Execute the dimensional query with impact scoring and "Others" aggregation for pandas.

        Follows the iterate pattern from the Mean metric's df_fn method to handle
        multiple dataframes efficiently without concatenating them in memory.

        Memory-efficient approach: instead of concatenating all dataframes (which creates
        a full copy in memory), we iterate over them and accumulate aggregates. This is
        especially important for large parquet files split across many chunks.

        For missing count validation, we accumulate null/missing counts across dataframes
        to accurately track how many missing values exist per dimension.

        Args:
            column: The column being validated
            dimension_col: Single SQALikeColumn object corresponding to the dimension column
            metrics_to_compute: Dictionary mapping Metrics enum names to Metrics objects
            test_params: Dictionary with test-specific parameters (MISSING_VALUE_MATCH, MISSING_COUNT_VALUE)

        Returns:
            List[DimensionResult]: Top N dimensions by impact score plus "Others"
        """
        dimension_results = []

        try:
            # The runner may be a single dataframe or a list of chunks; normalize
            # to a list so the accumulation loop below handles both cases.
            dfs = self.runner if isinstance(self.runner, list) else [self.runner]

            metric_expressions = {
                Metrics.NULL_MISSING_COUNT.name: Metrics.NULL_MISSING_COUNT(
                    column
                ).get_pandas_computation(),
                Metrics.ROW_COUNT.name: Metrics.ROW_COUNT().get_pandas_computation(),
            }

            missing_values = test_params.get(self.MISSING_VALUE_MATCH)
            missing_values_expected_count = test_params.get(self.MISSING_COUNT_VALUE, 0)

            if missing_values:
                # Values configured as "missing" markers are counted via
                # COUNT_IN_SET on top of the plain null/missing count.
                metric_expressions[Metrics.COUNT_IN_SET.name] = add_props(
                    values=missing_values
                )(Metrics.COUNT_IN_SET.value)(column).get_pandas_computation()

            dimension_aggregates = defaultdict(
                lambda: {
                    metric_name: metric.create_accumulator()
                    for metric_name, metric in metric_expressions.items()
                }
            )

            for df in dfs:
                df_typed = cast(pd.DataFrame, df)
                # dropna=False keeps rows whose dimension value is itself null,
                # so they are reported as their own dimension bucket.
                grouped = df_typed.groupby(dimension_col.name, dropna=False)
                for dimension_value, group_df in grouped:
                    dimension_value = self.format_dimension_value(dimension_value)
                    for metric_name, metric in metric_expressions.items():
                        dimension_aggregates[dimension_value][
                            metric_name
                        ] = metric.update_accumulator(
                            dimension_aggregates[dimension_value][metric_name], group_df
                        )
            results_data = []

            for dimension_value, agg in dimension_aggregates.items():
                total_missing_count = sum(
                    metric.aggregate_accumulator(agg[metric_name])
                    for metric_name, metric in metric_expressions.items()
                    if metric_name != Metrics.ROW_COUNT.name
                )
                total_rows = metric_expressions[
                    Metrics.ROW_COUNT.name
                ].aggregate_accumulator(agg[Metrics.ROW_COUNT.name])

                # Calculate initial deviation (will be recalculated for "Others")
                deviation = abs(total_missing_count - missing_values_expected_count)
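                # Worked example (hypothetical numbers): with an expected missing
                # count of 10, a dimension holding 13 missing values deviates by 3,
                # and one holding 7 also deviates by 3 -- both too many and too few
                # missing values count against the test.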
                results_data.append(
                    {
                        DIMENSION_VALUE_KEY: dimension_value,
                        self.TOTAL_MISSING_COUNT: total_missing_count,
                        DIMENSION_TOTAL_COUNT_KEY: total_rows,
                        DIMENSION_FAILED_COUNT_KEY: deviation,
                    }
                )

            results_df = pd.DataFrame(results_data)

            if not results_df.empty:
                # Define recalculation function for deviation after aggregation
                def recalculate_failed_count(df_aggregated, others_mask, metric_column):
                    """Recalculate failed_count (deviation) for 'Others' from aggregated total_missing_count"""
                    result = df_aggregated[metric_column].copy()
                    if others_mask.any():
                        others_total = df_aggregated.loc[
                            others_mask, self.TOTAL_MISSING_COUNT
                        ].iloc[0]
                        # Deviation is the failed_count
                        result.loc[others_mask] = abs(
                            others_total - missing_values_expected_count
                        )
                    return result
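                # Why recalculate instead of summing: absolute deviations are not
                # additive. For example, with an expected count of 10, dimensions
                # holding 4 and 6 missing values have deviations 6 and 4 (sum 10),
                # but combined into "Others" they hold 10 missing values, i.e. a
                # deviation of 0.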
                results_df = calculate_impact_score_pandas(
                    results_df,
                    failed_column=DIMENSION_FAILED_COUNT_KEY,
                    total_column=DIMENSION_TOTAL_COUNT_KEY,
                )

                results_df = aggregate_others_statistical_pandas(
                    results_df,
                    dimension_column=DIMENSION_VALUE_KEY,
                    top_n=DEFAULT_TOP_DIMENSIONS,
                    agg_functions={
                        self.TOTAL_MISSING_COUNT: "sum",  # Sum actual missing counts
                        DIMENSION_TOTAL_COUNT_KEY: "sum",
                        DIMENSION_FAILED_COUNT_KEY: "sum",  # Recalculated for "Others" below
                    },
                    final_metric_calculators={
                        DIMENSION_FAILED_COUNT_KEY: recalculate_failed_count,  # Recalculate deviation for "Others"
                    },
                    # No violation_predicate needed -- the deviation IS the failed_count
                )
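                # At this point results_df holds the top N dimensions by impact
                # score plus a single aggregated "Others" row, e.g. (hypothetical
                # values, expected missing count 10):
                #
                #     dimension_value | total_missing_count | total | failed
                #     "US"            | 13                  | 100   | 3
                #     "Others"        | 10                  | 400   | 0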
                for row_dict in results_df.to_dict("records"):
                    metric_values = self._build_metric_values_from_row(
                        row_dict, metrics_to_compute, test_params
                    )

                    # Add the derived total missing count, which is calculated
                    # above rather than populated by _build_metric_values_from_row
                    metric_values[self.TOTAL_MISSING_COUNT] = row_dict.get(
                        self.TOTAL_MISSING_COUNT
                    )

                    evaluation = self._evaluate_test_condition(
                        metric_values, test_params
                    )

                    dimension_result = self._create_dimension_result(
                        row_dict,
                        dimension_col.name,
                        metric_values,
                        evaluation,
                        test_params,
                    )

                    dimension_results.append(dimension_result)

        except Exception as exc:
            logger.warning(f"Error executing dimensional query: {exc}")
            logger.debug("Full error details: ", exc_info=True)

        return dimension_results
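

# ---------------------------------------------------------------------------
# Illustrative sketch only (not part of the validator above): the chunked
# accumulation pattern reduced to plain pandas. The names `chunks`, `column`,
# and `dimension` are hypothetical, and this assumes missing values are plain
# nulls, without the sentinel-value handling that COUNT_IN_SET adds.
# ---------------------------------------------------------------------------
def sketch_chunked_missing_count(
    chunks: List[pd.DataFrame], column: str, dimension: str
) -> pd.Series:
    """Accumulate per-dimension missing counts across chunks without concat."""
    totals = defaultdict(int)
    for chunk in chunks:
        # dropna=False keeps rows whose dimension value is itself null
        for dim_value, group in chunk.groupby(dimension, dropna=False):
            totals[dim_value] += int(group[column].isna().sum())
    return pd.Series(totals, dtype="int64")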