Fix Pandas Dimensionality checks

This commit is contained in:
Pablo Takara 2025-09-25 12:30:27 +02:00
parent 03ede24eba
commit 6ea9c2d8ed
No known key found for this signature in database
GPG Key ID: 63381DDFBB2BF725
3 changed files with 21 additions and 17 deletions

View File

@@ -347,13 +347,14 @@ class BaseTestValidator(ABC):
if failed_rows is None:
failed_rows = total_rows - passed_rows
# Calculate percentages with rounding to 2 decimal places
passed_rows_percentage = (
round(passed_rows / total_rows * 100, 2) if total_rows > 0 else 0
)
failed_rows_percentage = (
round(failed_rows / total_rows * 100, 2) if total_rows > 0 else 0
)
# Calculate percentages - derive one from the other to ensure they sum to 100%
if total_rows > 0:
passed_rows_percentage = round(passed_rows / total_rows * 100, 2)
# Derive failed percentage to ensure sum equals 100%
failed_rows_percentage = round(100 - passed_rows_percentage, 2)
else:
passed_rows_percentage = 0
failed_rows_percentage = 0
# Convert dictionary to array of DimensionValue objects
dimension_values_array = [

View File

@@ -51,7 +51,7 @@ class BaseColumnValuesToBeUniqueValidator(BaseTestValidator):
try:
column: Union[SQALikeColumn, Column] = self._get_column_name()
count = self._run_results(Metrics.COUNT, column)
unique_count = self._get_unique_count(Metrics.UNIQUE_COUNT, column)
unique_count = self._get_unique_count(Metrics.DISTINCT_COUNT, column)
except (ValueError, RuntimeError) as exc:
msg = f"Error computing {self.test_case.fullyQualifiedName}: {exc}" # type: ignore
logger.debug(traceback.format_exc())

View File

@@ -143,19 +143,19 @@ class ColumnValuesToBeInSetValidator(
{
"dimension": dimension_value,
"count_in_set": count_in_set,
"row_count": row_count,
DIMENSION_TOTAL_COUNT_KEY: row_count,
DIMENSION_FAILED_COUNT_KEY: failed_count,
}
)
else:
# Non-enum mode
# Non-enum mode: we only care about matches, not failures
# Following SQLAlchemy's logic exactly
results_data.append(
{
"dimension": dimension_value,
"count_in_set": count_in_set,
DIMENSION_TOTAL_COUNT_KEY: count_in_set,
DIMENSION_FAILED_COUNT_KEY: 0,
DIMENSION_TOTAL_COUNT_KEY: count_in_set, # Use count_in_set as total
DIMENSION_FAILED_COUNT_KEY: 0, # Don't track failures in non-enum mode
}
)
@@ -184,14 +184,17 @@ class ColumnValuesToBeInSetValidator(
# Extract metric values
count_in_set = int(row.get("count_in_set", 0))
# Follow SQLAlchemy's exact logic
if match_enum:
total_count = int(row.get("row_count", 0))
# Enum mode: track actual totals and failures
total_count = int(row.get(DIMENSION_TOTAL_COUNT_KEY, 0))
failed_count = int(row.get(DIMENSION_FAILED_COUNT_KEY, 0))
matched = total_count - count_in_set == 0
matched = failed_count == 0 # All values must be in enum
else:
total_count = count_in_set
failed_count = 0
matched = count_in_set > 0
# Non-enum mode: we only care about matches
matched = count_in_set > 0 # Pass if ANY values are in set
total_count = count_in_set # Use count_in_set as total
failed_count = 0 # Don't track failures
impact_score = float(row.get("impact_score", 0.0))