diff --git a/ingestion/src/metadata/data_quality/validations/base_test_handler.py b/ingestion/src/metadata/data_quality/validations/base_test_handler.py index 98e3a28ae6d..984bbd40623 100644 --- a/ingestion/src/metadata/data_quality/validations/base_test_handler.py +++ b/ingestion/src/metadata/data_quality/validations/base_test_handler.py @@ -347,13 +347,14 @@ class BaseTestValidator(ABC): if failed_rows is None: failed_rows = total_rows - passed_rows - # Calculate percentages with rounding to 2 decimal places - passed_rows_percentage = ( - round(passed_rows / total_rows * 100, 2) if total_rows > 0 else 0 - ) - failed_rows_percentage = ( - round(failed_rows / total_rows * 100, 2) if total_rows > 0 else 0 - ) + # Calculate percentages - derive one from the other to ensure they sum to 100% + if total_rows > 0: + passed_rows_percentage = round(passed_rows / total_rows * 100, 2) + # Derive failed percentage to ensure sum equals 100% + failed_rows_percentage = round(100 - passed_rows_percentage, 2) + else: + passed_rows_percentage = 0 + failed_rows_percentage = 0 # Convert dictionary to array of DimensionValue objects dimension_values_array = [ diff --git a/ingestion/src/metadata/data_quality/validations/column/base/columnValuesToBeUnique.py b/ingestion/src/metadata/data_quality/validations/column/base/columnValuesToBeUnique.py index cefb1289691..53d7d55e64f 100644 --- a/ingestion/src/metadata/data_quality/validations/column/base/columnValuesToBeUnique.py +++ b/ingestion/src/metadata/data_quality/validations/column/base/columnValuesToBeUnique.py @@ -51,7 +51,7 @@ class BaseColumnValuesToBeUniqueValidator(BaseTestValidator): try: column: Union[SQALikeColumn, Column] = self._get_column_name() count = self._run_results(Metrics.COUNT, column) - unique_count = self._get_unique_count(Metrics.UNIQUE_COUNT, column) + unique_count = self._get_unique_count(Metrics.DISTINCT_COUNT, column) except (ValueError, RuntimeError) as exc: msg = f"Error computing {self.test_case.fullyQualifiedName}: {exc}" # type: ignore logger.debug(traceback.format_exc()) diff --git a/ingestion/src/metadata/data_quality/validations/column/pandas/columnValuesToBeInSet.py b/ingestion/src/metadata/data_quality/validations/column/pandas/columnValuesToBeInSet.py index 364e54269e2..8b2d0d6d84d 100644 --- a/ingestion/src/metadata/data_quality/validations/column/pandas/columnValuesToBeInSet.py +++ b/ingestion/src/metadata/data_quality/validations/column/pandas/columnValuesToBeInSet.py @@ -143,19 +143,19 @@ class ColumnValuesToBeInSetValidator( { "dimension": dimension_value, "count_in_set": count_in_set, - "row_count": row_count, DIMENSION_TOTAL_COUNT_KEY: row_count, DIMENSION_FAILED_COUNT_KEY: failed_count, } ) else: - # Non-enum mode + # Non-enum mode: we only care about matches, not failures + # Following SQLAlchemy's logic exactly results_data.append( { "dimension": dimension_value, "count_in_set": count_in_set, - DIMENSION_TOTAL_COUNT_KEY: count_in_set, - DIMENSION_FAILED_COUNT_KEY: 0, + DIMENSION_TOTAL_COUNT_KEY: count_in_set, # Use count_in_set as total + DIMENSION_FAILED_COUNT_KEY: 0, # Don't track failures in non-enum mode } ) @@ -184,14 +184,17 @@ class ColumnValuesToBeInSetValidator( # Extract metric values count_in_set = int(row.get("count_in_set", 0)) + # Follow SQLAlchemy's exact logic if match_enum: - total_count = int(row.get("row_count", 0)) + # Enum mode: track actual totals and failures + total_count = int(row.get(DIMENSION_TOTAL_COUNT_KEY, 0)) failed_count = int(row.get(DIMENSION_FAILED_COUNT_KEY, 0)) - matched = total_count - count_in_set == 0 + matched = failed_count == 0 # All values must be in enum else: - total_count = count_in_set - failed_count = 0 - matched = count_in_set > 0 + # Non-enum mode: we only care about matches + matched = count_in_set > 0 # Pass if ANY values are in set + total_count = count_in_set # Use count_in_set as total + failed_count = 0 # Don't track failures impact_score = float(row.get("impact_score", 0.0))