fix: compute overall and regex count in the same query (#16962)

This commit is contained in:
Teddy 2024-07-09 14:05:13 +02:00 committed by GitHub
parent b4dc4df811
commit c8870a0f1c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 47 additions and 13 deletions

View File

@ -50,9 +50,8 @@ class BaseColumnValuesToMatchRegexValidator(BaseTestValidator):
)
try:
column: Union[SQALikeColumn, Column] = self._get_column_name()
count = self._run_results(Metrics.COUNT, column)
match_count = self._run_results(
Metrics.REGEX_COUNT, column, expression=regex
count, match_count = self._run_results(
(Metrics.COUNT, Metrics.REGEX_COUNT), column, expression=regex
)
except (ValueError, RuntimeError) as exc:
msg = f"Error computing {self.test_case.fullyQualifiedName}: {exc}" # type: ignore
@ -66,7 +65,7 @@ class BaseColumnValuesToMatchRegexValidator(BaseTestValidator):
)
if self.test_case.computePassedFailedRowCount:
row_count = self.get_row_count()
row_count = count
else:
row_count = None

View File

@ -13,7 +13,7 @@
Validator for column values to match regex test case
"""
from typing import Optional
from typing import Optional, Tuple
from metadata.data_quality.validations.column.base.columnValuesToMatchRegex import (
BaseColumnValuesToMatchRegexValidator,
@ -42,15 +42,21 @@ class ColumnValuesToMatchRegexValidator(
)
def _run_results(
    self, metric: Tuple[Metrics, ...], column: SQALikeColumn, **kwargs
) -> Tuple[Optional[int], Optional[int]]:
    """Compute the overall count and the regex match count for a column.

    Each metric in ``metric`` is evaluated through
    ``self.run_dataframe_results`` with the same ``column`` and ``kwargs``.

    Args:
        metric: metrics to compute; the caller passes
            ``(Metrics.COUNT, Metrics.REGEX_COUNT)``
        column: column the metrics are computed on
        **kwargs: forwarded unchanged to every metric computation
            (the caller passes ``expression=<regex>``)

    Returns:
        ``(count, regex_count)``. An entry is ``None`` when the matching
        metric was not part of ``metric``.
    """
    # Key results by metric name so the returned order is fixed
    # regardless of the order of `metric`.
    res = {
        mtr.name: self.run_dataframe_results(self.runner, mtr, column, **kwargs)
        for mtr in metric
    }
    return res.get(Metrics.COUNT.name), res.get(Metrics.REGEX_COUNT.name)
def compute_row_count(self, column: SQALikeColumn):
"""Compute row count for the given column

View File

@ -13,7 +13,7 @@
Validator for column values to match regex test case
"""
from typing import Optional
from typing import Optional, Tuple
from sqlalchemy import Column, inspect
from sqlalchemy.exc import CompileError, SQLAlchemyError
@ -46,7 +46,9 @@ class ColumnValuesToMatchRegexValidator(
inspect(self.runner.table).c,
)
def _run_results(
    self, metric: Tuple[Metrics, ...], column: Column, **kwargs
) -> Tuple[Optional[int], Optional[int]]:
    """Compute the overall count and the pattern match count together.

    Both expressions are handed to a single
    ``self.runner.dispatch_query_select_first`` call. ``REGEX_COUNT`` is
    tried first; on ``CompileError`` / ``SQLAlchemyError`` the computation
    is retried with ``LIKE_COUNT``.

    Args:
        metric: metrics requested by the caller. NOTE: this implementation
            does not read it; it always computes ``COUNT`` plus
            ``REGEX_COUNT`` (or ``LIKE_COUNT`` on fallback).
        column: column the metrics are computed on
        **kwargs: ``expression`` is read and set on the match metric

    Returns:
        ``(count, match_count)`` looked up in the result row by metric name.

    Raises:
        ValueError: if the query returned no usable result.
    """

    def _select_counts(match_metric):
        """Run COUNT and `match_metric` in one select; return the row as a dict."""
        match_metric.expression = kwargs.get("expression")
        # Dispatch the built SQL expression, not the metric object itself.
        match_fn = match_metric.fn()
        row = self.runner.dispatch_query_select_first(
            Metrics.COUNT(column).fn(),
            match_fn,
        )
        # dict(None) would raise TypeError and bypass the ValueError below.
        return dict(row) if row is not None else {}

    try:
        regex_count = Metrics.REGEX_COUNT(column)
        res = _select_counts(regex_count)
    except (CompileError, SQLAlchemyError) as err:
        logger.warning(
            f"Could not use `REGEXP` due to - {err}. Falling back to `LIKE`"
        )
        regex_count = Metrics.LIKE_COUNT(column)
        res = _select_counts(regex_count)

    if not res:
        # pylint: disable=line-too-long
        raise ValueError(
            f"\nQuery on table/column {column.name if column is not None else ''} returned None. Your table might be empty. "
            "If you confirmed your table is not empty and are still seeing this message you can:\n"
            "\t1. check the documentation: https://docs.open-metadata.org/v1.3.x/connectors/ingestion/workflows/data-quality/tests\n"
            "\t2. reach out to the Collate team for support"
        )
        # pylint: enable=line-too-long

    # `regex_count` is whichever metric produced `res` (REGEX or LIKE),
    # so its name is the key of the match count in the row.
    return res.get(Metrics.COUNT.name), res.get(regex_count.name())
def compute_row_count(self, column: Column):
"""Compute row count for the given column