fix: compute overall and regex count in the same query (#16962)

This commit is contained in:
Teddy 2024-07-09 14:05:13 +02:00 committed by GitHub
parent b4dc4df811
commit c8870a0f1c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 47 additions and 13 deletions

View File

@ -50,9 +50,8 @@ class BaseColumnValuesToMatchRegexValidator(BaseTestValidator):
) )
try: try:
column: Union[SQALikeColumn, Column] = self._get_column_name() column: Union[SQALikeColumn, Column] = self._get_column_name()
count = self._run_results(Metrics.COUNT, column) count, match_count = self._run_results(
match_count = self._run_results( (Metrics.COUNT, Metrics.REGEX_COUNT), column, expression=regex
Metrics.REGEX_COUNT, column, expression=regex
) )
except (ValueError, RuntimeError) as exc: except (ValueError, RuntimeError) as exc:
msg = f"Error computing {self.test_case.fullyQualifiedName}: {exc}" # type: ignore msg = f"Error computing {self.test_case.fullyQualifiedName}: {exc}" # type: ignore
@ -66,7 +65,7 @@ class BaseColumnValuesToMatchRegexValidator(BaseTestValidator):
) )
if self.test_case.computePassedFailedRowCount: if self.test_case.computePassedFailedRowCount:
row_count = self.get_row_count() row_count = count
else: else:
row_count = None row_count = None

View File

@ -13,7 +13,7 @@
Validator for column values to match regex test case Validator for column values to match regex test case
""" """
from typing import Optional from typing import Optional, Tuple
from metadata.data_quality.validations.column.base.columnValuesToMatchRegex import ( from metadata.data_quality.validations.column.base.columnValuesToMatchRegex import (
BaseColumnValuesToMatchRegexValidator, BaseColumnValuesToMatchRegexValidator,
@ -42,15 +42,21 @@ class ColumnValuesToMatchRegexValidator(
) )
def _run_results(
    self, metric: Tuple[Metrics, ...], column: SQALikeColumn, **kwargs
) -> Tuple[Optional[int], Optional[int]]:
    """Compute every requested metric on the dataframe column.

    Args:
        metric: metrics to compute; each one is evaluated with its own
            call to `run_dataframe_results`
        column: column the metrics are computed on
        **kwargs: forwarded unchanged to `run_dataframe_results`

    Returns:
        A pair made of the result stored under `Metrics.COUNT.name` and the
        result stored under `Metrics.REGEX_COUNT.name`. An entry is None
        when the corresponding metric is absent from `metric`.
    """
    # Results are keyed by metric name so the returned pair does not depend
    # on the order in which the caller listed the metrics.
    res = {
        mtr.name: self.run_dataframe_results(self.runner, mtr, column, **kwargs)
        for mtr in metric
    }
    return res.get(Metrics.COUNT.name), res.get(Metrics.REGEX_COUNT.name)
def compute_row_count(self, column: SQALikeColumn): def compute_row_count(self, column: SQALikeColumn):
"""Compute row count for the given column """Compute row count for the given column

View File

@ -13,7 +13,7 @@
Validator for column values to match regex test case Validator for column values to match regex test case
""" """
from typing import Optional from typing import Optional, Tuple
from sqlalchemy import Column, inspect from sqlalchemy import Column, inspect
from sqlalchemy.exc import CompileError, SQLAlchemyError from sqlalchemy.exc import CompileError, SQLAlchemyError
@ -46,7 +46,9 @@ class ColumnValuesToMatchRegexValidator(
inspect(self.runner.table).c, inspect(self.runner.table).c,
) )
def _run_results(
    self, metric: Tuple[Metrics, ...], column: Column, **kwargs
) -> Tuple[Optional[int], Optional[int]]:
    """Compute the overall count and the pattern-match count in one query.

    Args:
        metric: metrics requested by the caller. Not read here: the query
            always selects `Metrics.COUNT` together with
            `Metrics.REGEX_COUNT` (or `Metrics.LIKE_COUNT` on fallback).
        column: column the metrics are computed on
        **kwargs: `expression` carries the pattern assigned to the match
            metric

    Returns:
        A pair (count, match count) read from the result row using
        `Metrics.COUNT.name` and the `name()` of the match metric that was
        actually executed.

    Raises:
        ValueError: if the query yields no result
    """
    expression = kwargs.get("expression")

    def _select_counts(match_metric):
        """Select COUNT and `match_metric` for `column` in a single query."""
        match_count = match_metric(column)
        match_count.expression = expression
        # Build the match expression first, as the original code did, so a
        # failure there surfaces before the COUNT expression is created.
        match_count_fn = match_count.fn()
        row = self.runner.dispatch_query_select_first(
            Metrics.COUNT(column).fn(),
            match_count_fn,
        )
        # A missing row becomes an empty dict so the emptiness check below
        # raises ValueError instead of `dict(None)` raising TypeError.
        counts = dict(row) if row is not None else {}
        return match_count, counts

    try:
        regex_count, res = _select_counts(Metrics.REGEX_COUNT)
    except (CompileError, SQLAlchemyError) as err:
        logger.warning(
            f"Could not use `REGEXP` due to - {err}. Falling back to `LIKE`"
        )
        # The fallback now passes the built expression to the query, like
        # the REGEXP path, rather than the metric object itself.
        regex_count, res = _select_counts(Metrics.LIKE_COUNT)

    if not res:
        # pylint: disable=line-too-long
        raise ValueError(
            f"\nQuery on table/column {column.name if column is not None else ''} returned None. Your table might be empty. "
            "If you confirmed your table is not empty and are still seeing this message you can:\n"
            "\t1. check the documentation: https://docs.open-metadata.org/v1.3.x/connectors/ingestion/workflows/data-quality/tests\n"
            "\t2. reach out to the Collate team for support"
        )
        # pylint: enable=line-too-long

    # `regex_count` is whichever metric ran last, so its name matches the
    # column label present in the result row.
    return res.get(Metrics.COUNT.name), res.get(regex_count.name())
def compute_row_count(self, column: Column): def compute_row_count(self, column: Column):
"""Compute row count for the given column """Compute row count for the given column