From c8870a0f1c150ab04b1d04285d657e22625dacf0 Mon Sep 17 00:00:00 2001 From: Teddy Date: Tue, 9 Jul 2024 14:05:13 +0200 Subject: [PATCH] fix: compute overall and regex count in the same query (#16962) --- .../column/base/columnValuesToMatchRegex.py | 7 ++-- .../column/pandas/columnValuesToMatchRegex.py | 14 +++++-- .../sqlalchemy/columnValuesToMatchRegex.py | 39 ++++++++++++++++--- 3 files changed, 47 insertions(+), 13 deletions(-) diff --git a/ingestion/src/metadata/data_quality/validations/column/base/columnValuesToMatchRegex.py b/ingestion/src/metadata/data_quality/validations/column/base/columnValuesToMatchRegex.py index d52d6bb3d94..3e2e87fc462 100644 --- a/ingestion/src/metadata/data_quality/validations/column/base/columnValuesToMatchRegex.py +++ b/ingestion/src/metadata/data_quality/validations/column/base/columnValuesToMatchRegex.py @@ -50,9 +50,8 @@ class BaseColumnValuesToMatchRegexValidator(BaseTestValidator): ) try: column: Union[SQALikeColumn, Column] = self._get_column_name() - count = self._run_results(Metrics.COUNT, column) - match_count = self._run_results( - Metrics.REGEX_COUNT, column, expression=regex + count, match_count = self._run_results( + (Metrics.COUNT, Metrics.REGEX_COUNT), column, expression=regex ) except (ValueError, RuntimeError) as exc: msg = f"Error computing {self.test_case.fullyQualifiedName}: {exc}" # type: ignore @@ -66,7 +65,7 @@ class BaseColumnValuesToMatchRegexValidator(BaseTestValidator): ) if self.test_case.computePassedFailedRowCount: - row_count = self.get_row_count() + row_count = count else: row_count = None diff --git a/ingestion/src/metadata/data_quality/validations/column/pandas/columnValuesToMatchRegex.py b/ingestion/src/metadata/data_quality/validations/column/pandas/columnValuesToMatchRegex.py index fdfde310907..1cd05f61596 100644 --- a/ingestion/src/metadata/data_quality/validations/column/pandas/columnValuesToMatchRegex.py +++ b/ingestion/src/metadata/data_quality/validations/column/pandas/columnValuesToMatchRegex.py @@ -13,7 +13,7 @@ Validator for column values to match regex test case """ -from typing import Optional +from typing import Optional, Tuple from metadata.data_quality.validations.column.base.columnValuesToMatchRegex import ( BaseColumnValuesToMatchRegexValidator, @@ -42,15 +42,21 @@ class ColumnValuesToMatchRegexValidator( ) def _run_results( - self, metric: Metrics, column: SQALikeColumn, **kwargs - ) -> Optional[int]: + self, metric: Tuple[Metrics], column: SQALikeColumn, **kwargs + ) -> Tuple[Optional[int], Optional[int]]: """compute result of the test case Args: metric: metric column: column """ - return self.run_dataframe_results(self.runner, metric, column, **kwargs) + res = {} + for mtr in metric: + res[mtr.name] = self.run_dataframe_results( + self.runner, mtr, column, **kwargs + ) + + return res.get(Metrics.COUNT.name), res.get(Metrics.REGEX_COUNT.name) def compute_row_count(self, column: SQALikeColumn): """Compute row count for the given column diff --git a/ingestion/src/metadata/data_quality/validations/column/sqlalchemy/columnValuesToMatchRegex.py b/ingestion/src/metadata/data_quality/validations/column/sqlalchemy/columnValuesToMatchRegex.py index 1e5f570a237..be28e57963b 100644 --- a/ingestion/src/metadata/data_quality/validations/column/sqlalchemy/columnValuesToMatchRegex.py +++ b/ingestion/src/metadata/data_quality/validations/column/sqlalchemy/columnValuesToMatchRegex.py @@ -13,7 +13,7 @@ Validator for column values to match regex test case """ -from typing import Optional +from typing import Optional, Tuple from sqlalchemy import Column, inspect from sqlalchemy.exc import CompileError, SQLAlchemyError @@ -46,7 +46,9 @@ class ColumnValuesToMatchRegexValidator( inspect(self.runner.table).c, ) - def _run_results(self, metric: Metrics, column: Column, **kwargs) -> Optional[int]: + def _run_results( + self, metric: Tuple[Metrics], column: Column, **kwargs + ) -> Tuple[Optional[int], Optional[int]]: """compute result of the test case Args: @@ -54,15 +56,42 @@ class ColumnValuesToMatchRegexValidator( column: column """ try: - return self.run_query_results(self.runner, metric, column, **kwargs) + regex_count = Metrics.REGEX_COUNT(column) + regex_count.expression = kwargs.get("expression") + regex_count_fn = regex_count.fn() + + res = dict( + self.runner.dispatch_query_select_first( + Metrics.COUNT(column).fn(), + regex_count_fn, + ) + ) except (CompileError, SQLAlchemyError) as err: logger.warning( f"Could not use `REGEXP` due to - {err}. Falling back to `LIKE`" ) - return self.run_query_results( - self.runner, Metrics.LIKE_COUNT, column, **kwargs + regex_count = Metrics.LIKE_COUNT(column) + regex_count.expression = kwargs.get("expression") + regex_count_fn = regex_count.fn() + res = dict( + self.runner.dispatch_query_select_first( + Metrics.COUNT(column).fn(), + regex_count, + ) ) + if not res: + # pylint: disable=line-too-long + raise ValueError( + f"\nQuery on table/column {column.name if column is not None else ''} returned None. Your table might be empty. " + "If you confirmed your table is not empty and are still seeing this message you can:\n" + "\t1. check the documentation: https://docs.open-metadata.org/v1.3.x/connectors/ingestion/workflows/data-quality/tests\n" + "\t2. reach out to the Collate team for support" + ) + # pylint: enable=line-too-long + + return res.get(Metrics.COUNT.name), res.get(regex_count.name()) + def compute_row_count(self, column: Column): """Compute row count for the given column