fix: compute overall and regex count in the same query (#16962)

2025-12-18 11:07:41 +00:00 · 2024-07-09 14:05:13 +02:00 · 2024-07-09 14:05:13 +02:00 · c8870a0f1c
commit c8870a0f1c
parent b4dc4df811
3 changed files with 47 additions and 13 deletions
--- a/ingestion/src/metadata/data_quality/validations/column/base/columnValuesToMatchRegex.py
+++ b/ingestion/src/metadata/data_quality/validations/column/base/columnValuesToMatchRegex.py
@ -50,9 +50,8 @@ class BaseColumnValuesToMatchRegexValidator(BaseTestValidator):
        )
        try:
            column: Union[SQALikeColumn, Column] = self._get_column_name()
-            count = self._run_results(Metrics.COUNT, column)
+            count, match_count = self._run_results(
-            match_count = self._run_results(
+                (Metrics.COUNT, Metrics.REGEX_COUNT), column, expression=regex
-                Metrics.REGEX_COUNT, column, expression=regex
            )
        except (ValueError, RuntimeError) as exc:
            msg = f"Error computing {self.test_case.fullyQualifiedName}: {exc}"  # type: ignore
@ -66,7 +65,7 @@ class BaseColumnValuesToMatchRegexValidator(BaseTestValidator):
            )
        if self.test_case.computePassedFailedRowCount:
-            row_count = self.get_row_count()
+            row_count = count
        else:
            row_count = None
--- a/ingestion/src/metadata/data_quality/validations/column/pandas/columnValuesToMatchRegex.py
+++ b/ingestion/src/metadata/data_quality/validations/column/pandas/columnValuesToMatchRegex.py
@ -13,7 +13,7 @@
 Validator for column values to match regex test case
 """
-from typing import Optional
+from typing import Optional, Tuple
 from metadata.data_quality.validations.column.base.columnValuesToMatchRegex import (
    BaseColumnValuesToMatchRegexValidator,
@ -42,15 +42,21 @@ class ColumnValuesToMatchRegexValidator(
        )
    def _run_results(
-        self, metric: Metrics, column: SQALikeColumn, **kwargs
-    ) -> Optional[int]:
+        self, metric: Tuple[Metrics, ...], column: SQALikeColumn, **kwargs
+    ) -> Tuple[Optional[int], Optional[int]]:
-        """compute result of the test case
+        """Compute each requested metric on the dataframe column.
        Args:
-            metric: metric
-            column: column
+            metric: metrics to evaluate; every one is passed to
+                `run_dataframe_results` with the same `column` and `kwargs`
+            column: column the metrics are computed on
+        Returns:
+            The values stored under `Metrics.COUNT.name` and
+            `Metrics.REGEX_COUNT.name`, in that order. A position is None
+            when no result was stored under that name.
        """
-        return self.run_dataframe_results(self.runner, metric, column, **kwargs)
+        # Key the results by metric name so the returned order is fixed no
+        # matter how the caller ordered `metric`.
+        res = {
+            mtr.name: self.run_dataframe_results(self.runner, mtr, column, **kwargs)
+            for mtr in metric
+        }
+        return res.get(Metrics.COUNT.name), res.get(Metrics.REGEX_COUNT.name)
    def compute_row_count(self, column: SQALikeColumn):
        """Compute row count for the given column
--- a/ingestion/src/metadata/data_quality/validations/column/sqlalchemy/columnValuesToMatchRegex.py
+++ b/ingestion/src/metadata/data_quality/validations/column/sqlalchemy/columnValuesToMatchRegex.py
@ -13,7 +13,7 @@
 Validator for column values to match regex test case
 """
-from typing import Optional
+from typing import Optional, Tuple
 from sqlalchemy import Column, inspect
 from sqlalchemy.exc import CompileError, SQLAlchemyError
@ -46,7 +46,9 @@ class ColumnValuesToMatchRegexValidator(
            inspect(self.runner.table).c,
        )
-    def _run_results(self, metric: Metrics, column: Column, **kwargs) -> Optional[int]:
+    def _run_results(
+        self, metric: Tuple[Metrics, ...], column: Column, **kwargs
+    ) -> Tuple[Optional[int], Optional[int]]:
-        """compute result of the test case
+        """Compute the overall count and the pattern-match count in one query.
        Args:
@ -54,14 +56,41 @ class ColumnValuesToMatchRegexValidator(
            column: column
+        Returns:
+            (count, match count) read from the query result; the match count
+            comes from REGEX_COUNT, or from LIKE_COUNT when the regex query
+            raised CompileError / SQLAlchemyError.
+        Raises:
+            ValueError: if the query result is empty.
        """
+        # NOTE(review): `metric` is never read in this body; COUNT and the
+        # regex/like metric are hard-coded below.
        try:
-            return self.run_query_results(self.runner, metric, column, **kwargs)
+            # Select the overall count and the regex-match count together.
+            regex_count = Metrics.REGEX_COUNT(column)
+            regex_count.expression = kwargs.get("expression")
+            regex_count_fn = regex_count.fn()
+            res = dict(
+                self.runner.dispatch_query_select_first(
+                    Metrics.COUNT(column).fn(),
+                    regex_count_fn,
+                )
+            )
        except (CompileError, SQLAlchemyError) as err:
            logger.warning(
                f"Could not use `REGEXP` due to - {err}. Falling back to `LIKE`"
            )
-            return self.run_query_results(
-                self.runner, Metrics.LIKE_COUNT, column, **kwargs
+            regex_count = Metrics.LIKE_COUNT(column)
+            regex_count.expression = kwargs.get("expression")
+            regex_count_fn = regex_count.fn()
+            res = dict(
+                self.runner.dispatch_query_select_first(
+                    Metrics.COUNT(column).fn(),
+                    # Pass the built expression, as the REGEXP branch does,
+                    # not the metric object itself.
+                    regex_count_fn,
+                )
            )
+        # NOTE(review): dict(None) raises TypeError, so a None query result
+        # would fail above before reaching this check - confirm what
+        # dispatch_query_select_first returns when there is no row.
+        if not res:
+            # pylint: disable=line-too-long
+            raise ValueError(
+                f"\nQuery on table/column {column.name if column is not None else ''} returned None. Your table might be empty. "
+                "If you confirmed your table is not empty and are still seeing this message you can:\n"
+                "\t1. check the documentation: https://docs.open-metadata.org/v1.3.x/connectors/ingestion/workflows/data-quality/tests\n"
+                "\t2. reach out to the Collate team for support"
+            )
+            # pylint: enable=line-too-long
+        # `regex_count` is whichever metric produced `res` (REGEX_COUNT or
+        # LIKE_COUNT), so its name is used to look up the match count.
+        return res.get(Metrics.COUNT.name), res.get(regex_count.name())
    def compute_row_count(self, column: Column):
        """Compute row count for the given column