From d3325cda933ee326135c6d7a692f8e22861610e3 Mon Sep 17 00:00:00 2001 From: Teddy Date: Fri, 27 Sep 2024 14:36:30 +0200 Subject: [PATCH] MINOR - Fix General Profiler Bugs (#17995) * fix import issue * fix: better handle None values in profiler processing * fix: profiler errors * chore: fix comment * style: fix python linting failure * fix: null byte error with the database client --------- Co-authored-by: Chirag Madlani <12962843+chirag-madlani@users.noreply.github.com> (cherry picked from commit 8dc6b7d282a700c2a3d4659f904a3966e16843a2) --- .../profiler/metrics/composed/null_ratio.py | 9 +++++---- .../orm/functions/table_metric_computer.py | 2 ++ .../orm/types/custom_hex_byte_string.py | 19 ++++++++++++++++--- 3 files changed, 23 insertions(+), 7 deletions(-) diff --git a/ingestion/src/metadata/profiler/metrics/composed/null_ratio.py b/ingestion/src/metadata/profiler/metrics/composed/null_ratio.py index cdc809eeb3f..74c4c914902 100644 --- a/ingestion/src/metadata/profiler/metrics/composed/null_ratio.py +++ b/ingestion/src/metadata/profiler/metrics/composed/null_ratio.py @@ -50,8 +50,9 @@ class NullRatio(ComposedMetric): results of other Metrics """ - count = res.get(Count.name()) - null_count = res.get(NullCount.name()) - if count + null_count == 0: + count = res.get(Count.name(), 0) + null_count = res.get(NullCount.name(), 0) + total = count + null_count + if total == 0: return None - return null_count / (null_count + count) + return null_count / total diff --git a/ingestion/src/metadata/profiler/orm/functions/table_metric_computer.py b/ingestion/src/metadata/profiler/orm/functions/table_metric_computer.py index 17d0d84b396..5566600c61d 100644 --- a/ingestion/src/metadata/profiler/orm/functions/table_metric_computer.py +++ b/ingestion/src/metadata/profiler/orm/functions/table_metric_computer.py @@ -176,6 +176,8 @@ class SnowflakeTableMetricComputer(BaseTableMetricComputer): ) rest = self._runner._session.execute(query).first() + if not rest: + return None if 
rest.rowCount is None: # if we don't have any row count, fallback to the base logic return super().compute() diff --git a/ingestion/src/metadata/profiler/orm/types/custom_hex_byte_string.py b/ingestion/src/metadata/profiler/orm/types/custom_hex_byte_string.py index e87afd234e8..58fcfb6d94f 100644 --- a/ingestion/src/metadata/profiler/orm/types/custom_hex_byte_string.py +++ b/ingestion/src/metadata/profiler/orm/types/custom_hex_byte_string.py @@ -22,6 +22,7 @@ from sqlalchemy.sql.sqltypes import String, TypeDecorator from metadata.utils.logger import ingestion_logger logger = ingestion_logger() +NULL_BYTE = "\x00" class HexByteString(TypeDecorator): @@ -63,10 +64,22 @@ class HexByteString(TypeDecorator): detected_encoding = chardet.detect(bytes_value).get("encoding") if detected_encoding: try: - value = bytes_value.decode(encoding=detected_encoding) - return value + # Decode the bytes value with the detected encoding and replace errors with "�" (U+FFFD) + # if bytes cannot be decoded e.g. b"\x66\x6f\x6f\x9c", if detected_encoding="utf-8" + # will result in 'foo�' (instead of failing) + str_value = bytes_value.decode( + encoding=detected_encoding, errors="replace" + ) + # Replace NULL_BYTE with empty string to avoid errors with + # the database client (should be O(n)) + str_value = ( + str_value.replace(NULL_BYTE, "") + if NULL_BYTE in str_value + else str_value + ) + return str_value except Exception as exc: - logger.debug("Failed to parse bytes valud as string: %s", exc) + logger.debug("Failed to parse bytes value as string: %s", exc) logger.debug(traceback.format_exc()) return value.hex()