MINOR: handle "ignored exception" error message when running table diff (#17322)

* fix: table diff implemented a safe iterator to handle the sneaky `KeyError` * changed method to safe_table_diff_iterator * format --------- Co-authored-by: Pere Miquel Brull <peremiquelbrull@gmail.com>
2025-11-10 15:59:57 +00:00 · 2024-08-07 17:21:17 +02:00 · 2024-08-07 17:21:17 +02:00 · 62c33e026c
commit 62c33e026c
parent af9f45b00a
1 changed files with 44 additions and 12 deletions
--- a/ingestion/src/metadata/data_quality/validations/table/sqlalchemy/tableDiff.py
+++ b/ingestion/src/metadata/data_quality/validations/table/sqlalchemy/tableDiff.py
@ -9,6 +9,7 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 # pylint: disable=missing-module-docstring
 import logging
 import traceback
 from itertools import islice
 from typing import Dict, Iterable, List, Optional, Tuple
@ -59,6 +60,21 @@ class UnsupportedDialectError(Exception):
        super().__init__(f"Unsupported dialect in param {param}: {dialect}")
 def masked(s: str, mask: bool = True) -> str:
    """Mask a string if masked is True otherwise return the string.
    Only for development purposes, do not use in production.
    Change it False if you want to see the data in the logs.
    Args:
        s: string to mask
        mask: boolean to mask the string
    Returns:
        masked string if mask is True otherwise return the string
    """
    return "***" if mask else s
 class TableDiffValidator(BaseTestValidator, SQAValidatorMixin):
    """
    Compare two tables and fail if the number of differences exceeds a threshold
@ -110,12 +126,14 @@ class TableDiffValidator(BaseTestValidator, SQAValidatorMixin):
            stats = table_diff_iter.get_stats_dict()
            if stats["total"] > 0:
                logger.debug("Sample of failed rows:")
-                for s in islice(self.get_table_diff(), 10):
+                # depending on the data, this require scanning a lot of data
-                    # since the data can contiant sensitive information, we don't want to log it
+                # so we only log the sample in debug mode. data can be sensitive
-                    # we can uncomment this line if we must see the data in the logs
+                # so it is masked by default
-                    # logger.debug(s)
+                for s in islice(
-                    # by default we will log the data masked
+                    self.safe_table_diff_iterator(),
-                    logger.debug([s[0], ["*" for _ in s[1]]])
+                    10 if logger.level <= logging.DEBUG else 0,
                ):
                    logger.debug("%s", str([s[0]] + [masked(st) for st in s[1]]))
            test_case_result = self.get_row_diff_test_case_result(
                threshold,
                stats["total"],
@ -222,12 +240,10 @@ class TableDiffValidator(BaseTestValidator, SQAValidatorMixin):
            "where": self.get_where(),
        }
        logger.debug(
-            "Calling table diff with parameters:"  # pylint: disable=consider-using-f-string
+            "Calling table diff with parameters: table1=%s, table2=%s, kwargs=%s",
-            " table1={}, table2={}, kwargs={}".format(
+            table1.table_path,
-                table1.table_path,
+            table2.table_path,
-                table2.table_path,
+            ",".join(f"{k}={v}" for k, v in data_diff_kwargs.items()),
                ",".join(f"{k}={v}" for k, v in data_diff_kwargs.items()),
            )
        )
        return data_diff.diff_tables(table1, table2, **data_diff_kwargs)  # type: ignore
@ -400,3 +416,19 @@ class TableDiffValidator(BaseTestValidator, SQAValidatorMixin):
            if len(key_set) > limit:
                len(key_set)
        return len(key_set)
    def safe_table_diff_iterator(self) -> DiffResultWrapper:
        """A safe iterator object which properly closes the diff object when the generator is exhausted.
        Otherwise the data_diff library will continue to hold the connection open and eventually
        raise a KeyError.
        """
        gen = self.get_table_diff()
        try:
            yield from gen
        finally:
            try:
                gen.diff.close()
            except KeyError as ex:
                if str(ex) == "2":
                    # This is a known issue in data_diff where the diff object is closed
                    pass