From c8870a0f1c150ab04b1d04285d657e22625dacf0 Mon Sep 17 00:00:00 2001
From: Teddy <teddy.crepineau@gmail.com>
Date: Tue, 9 Jul 2024 14:05:13 +0200
Subject: [PATCH] fix: compute overall and regex count in the same query
 (#16962)

---
 .../column/base/columnValuesToMatchRegex.py   |  7 ++--
 .../column/pandas/columnValuesToMatchRegex.py | 14 +++++--
 .../sqlalchemy/columnValuesToMatchRegex.py    | 39 ++++++++++++++++---
 3 files changed, 47 insertions(+), 13 deletions(-)

diff --git a/ingestion/src/metadata/data_quality/validations/column/base/columnValuesToMatchRegex.py b/ingestion/src/metadata/data_quality/validations/column/base/columnValuesToMatchRegex.py
index d52d6bb3d94..3e2e87fc462 100644
--- a/ingestion/src/metadata/data_quality/validations/column/base/columnValuesToMatchRegex.py
+++ b/ingestion/src/metadata/data_quality/validations/column/base/columnValuesToMatchRegex.py
@@ -50,9 +50,8 @@ class BaseColumnValuesToMatchRegexValidator(BaseTestValidator):
         )
         try:
             column: Union[SQALikeColumn, Column] = self._get_column_name()
-            count = self._run_results(Metrics.COUNT, column)
-            match_count = self._run_results(
-                Metrics.REGEX_COUNT, column, expression=regex
+            count, match_count = self._run_results(
+                (Metrics.COUNT, Metrics.REGEX_COUNT), column, expression=regex
             )
         except (ValueError, RuntimeError) as exc:
             msg = f"Error computing {self.test_case.fullyQualifiedName}: {exc}"  # type: ignore
@@ -66,7 +65,7 @@ class BaseColumnValuesToMatchRegexValidator(BaseTestValidator):
             )
 
         if self.test_case.computePassedFailedRowCount:
-            row_count = self.get_row_count()
+            row_count = count
         else:
             row_count = None
 
diff --git a/ingestion/src/metadata/data_quality/validations/column/pandas/columnValuesToMatchRegex.py b/ingestion/src/metadata/data_quality/validations/column/pandas/columnValuesToMatchRegex.py
index fdfde310907..1cd05f61596 100644
--- a/ingestion/src/metadata/data_quality/validations/column/pandas/columnValuesToMatchRegex.py
+++ b/ingestion/src/metadata/data_quality/validations/column/pandas/columnValuesToMatchRegex.py
@@ -13,7 +13,7 @@
 Validator for column values to match regex test case
 """
 
-from typing import Optional
+from typing import Optional, Tuple
 
 from metadata.data_quality.validations.column.base.columnValuesToMatchRegex import (
     BaseColumnValuesToMatchRegexValidator,
@@ -42,15 +42,21 @@ class ColumnValuesToMatchRegexValidator(
         )
 
     def _run_results(
-        self, metric: Metrics, column: SQALikeColumn, **kwargs
-    ) -> Optional[int]:
+        self, metric: Tuple[Metrics, ...], column: SQALikeColumn, **kwargs
+    ) -> Tuple[Optional[int], Optional[int]]:
         """compute result of the test case
 
         Args:
             metric: metric
             column: column
         """
-        return self.run_dataframe_results(self.runner, metric, column, **kwargs)
+        res = {}  # metric name -> value computed on the dataframe(s)
+        for mtr in metric:
+            res[mtr.name] = self.run_dataframe_results(
+                self.runner, mtr, column, **kwargs
+            )
+
+        return res.get(Metrics.COUNT.name), res.get(Metrics.REGEX_COUNT.name)
 
     def compute_row_count(self, column: SQALikeColumn):
         """Compute row count for the given column
diff --git a/ingestion/src/metadata/data_quality/validations/column/sqlalchemy/columnValuesToMatchRegex.py b/ingestion/src/metadata/data_quality/validations/column/sqlalchemy/columnValuesToMatchRegex.py
index 1e5f570a237..be28e57963b 100644
--- a/ingestion/src/metadata/data_quality/validations/column/sqlalchemy/columnValuesToMatchRegex.py
+++ b/ingestion/src/metadata/data_quality/validations/column/sqlalchemy/columnValuesToMatchRegex.py
@@ -13,7 +13,7 @@
 Validator for column values to match regex test case
 """
 
-from typing import Optional
+from typing import Optional, Tuple
 
 from sqlalchemy import Column, inspect
 from sqlalchemy.exc import CompileError, SQLAlchemyError
@@ -46,7 +46,9 @@ class ColumnValuesToMatchRegexValidator(
             inspect(self.runner.table).c,
         )
 
-    def _run_results(self, metric: Metrics, column: Column, **kwargs) -> Optional[int]:
+    def _run_results(
+        self, metric: Tuple[Metrics, ...], column: Column, **kwargs
+    ) -> Tuple[Optional[int], Optional[int]]:
         """compute result of the test case
 
         Args:
@@ -54,15 +56,42 @@ class ColumnValuesToMatchRegexValidator(
             column: column
         """
         try:
-            return self.run_query_results(self.runner, metric, column, **kwargs)
+            regex_count = Metrics.REGEX_COUNT(column)
+            regex_count.expression = kwargs.get("expression")
+            regex_count_fn = regex_count.fn()
+
+            res = dict(
+                self.runner.dispatch_query_select_first(
+                    Metrics.COUNT(column).fn(),
+                    regex_count_fn,
+                )
+            )
         except (CompileError, SQLAlchemyError) as err:
             logger.warning(
                 f"Could not use `REGEXP` due to - {err}. Falling back to `LIKE`"
             )
-            return self.run_query_results(
-                self.runner, Metrics.LIKE_COUNT, column, **kwargs
+            regex_count = Metrics.LIKE_COUNT(column)
+            regex_count.expression = kwargs.get("expression")
+            regex_count_fn = regex_count.fn()
+            res = dict(
+                self.runner.dispatch_query_select_first(
+                    Metrics.COUNT(column).fn(),
+                    regex_count_fn,  # built expression, as in the REGEXP branch
+                )
             )
 
+        if not res:
+            # pylint: disable=line-too-long
+            raise ValueError(
+                f"\nQuery on table/column {column.name if column is not None else ''} returned None. Your table might be empty. "
+                "If you confirmed your table is not empty and are still seeing this message you can:\n"
+                "\t1. check the documentation: https://docs.open-metadata.org/v1.3.x/connectors/ingestion/workflows/data-quality/tests\n"
+                "\t2. reach out to the Collate team for support"
+            )
+            # pylint: enable=line-too-long
+
+        return res.get(Metrics.COUNT.name), res.get(regex_count.name())
+
     def compute_row_count(self, column: Column):
         """Compute row count for the given column