From 65c5b44eaa13ae860d5bd7ac386e977e997ca949 Mon Sep 17 00:00:00 2001 From: Keith Sirmons Date: Fri, 5 May 2023 14:45:30 -0500 Subject: [PATCH] Impala Connection Profiler is_nan rollback; Histogram fix. (#11388) --- .../profiler/metrics/hybrid/histogram.py | 33 ++++++++++++------- .../metadata/profiler/metrics/static/max.py | 7 ---- .../metadata/profiler/metrics/static/mean.py | 9 ----- .../metadata/profiler/metrics/static/min.py | 7 ---- .../profiler/metrics/static/stddev.py | 6 ---- .../metadata/profiler/orm/functions/median.py | 2 +- .../metadata/profiler/orm/functions/sum.py | 2 +- ingestion/tests/unit/profiler/test_metrics.py | 3 ++ 8 files changed, 26 insertions(+), 43 deletions(-) diff --git a/ingestion/src/metadata/profiler/metrics/hybrid/histogram.py b/ingestion/src/metadata/profiler/metrics/hybrid/histogram.py index 6d1718cd3bd..7c04a05f5f5 100644 --- a/ingestion/src/metadata/profiler/metrics/hybrid/histogram.py +++ b/ingestion/src/metadata/profiler/metrics/hybrid/histogram.py @@ -23,7 +23,8 @@ from metadata.profiler.metrics.core import HybridMetric from metadata.profiler.metrics.static.count import Count from metadata.profiler.metrics.static.max import Max from metadata.profiler.metrics.static.min import Min -from metadata.profiler.orm.registry import is_quantifiable +from metadata.profiler.orm.functions.length import LenFn +from metadata.profiler.orm.registry import is_concatenable, is_quantifiable from metadata.utils.helpers import format_large_string_numbers from metadata.utils.logger import profiler_logger from metadata.utils.sqa_utils import handle_array @@ -66,11 +67,11 @@ class Histogram(HybridMetric): res_min = res.get(Min.name()) res_max = res.get(Max.name()) - if any(var is None for var in [res_iqr, res_row_count, res_min, res_max]): + if any(var is None for var in [res_row_count, res_min, res_max]): return None return ( - float(res_iqr), + float(res_iqr) if res_iqr is not None else res_iqr, float(res_row_count), float(res_min), float(res_max), @@ -110,18 +111,22 @@ class Histogram(HybridMetric): res_min (float): minimum value res_max (float): maximum value """ - # freedman-diaconis rule - bin_width = self._get_bin_width(float(res_iqr), res_row_count) # type: ignore - num_bins = math.ceil((res_max - res_min) / bin_width) # type: ignore + # preinint num_bins over 100. On the normal path freedman-diaconis will readjust according to the algorithm + # when we must fallback to sturges rule due to res_iqr being None, then num_bins will be readjusted. + max_bin_count = 100 + if res_iqr is not None: + # freedman-diaconis rule + bin_width = self._get_bin_width(float(res_iqr), res_row_count) # type: ignore + num_bins = math.ceil((res_max - res_min) / bin_width) # type: ignore # sturge's rule - if num_bins > 100: + if res_iqr is None or num_bins > max_bin_count: num_bins = int(math.ceil(math.log2(res_row_count) + 1)) bin_width = (res_max - res_min) / num_bins - # fallback to 100 bins - if num_bins > 100: - num_bins = 100 + # fallback to max_bin_count bins + if num_bins > max_bin_count: + num_bins = max_bin_count bin_width = (res_max - res_min) / num_bins return num_bins, bin_width @@ -141,7 +146,7 @@ class Histogram(HybridMetric): "We are missing the session attribute to compute the Histogram." ) - if not is_quantifiable(self.col.type): + if not (is_quantifiable(self.col.type) or is_concatenable(self.col.type)): return None # get the metric need for the freedman-diaconis rule @@ -159,7 +164,11 @@ class Histogram(HybridMetric): starting_bin_bound = res_min res_min = cast(Union[float, int], res_min) # satisfy mypy ending_bin_bound = res_min + bin_width - col = column(self.col.name) # type: ignore + + if is_concatenable(self.col.type): + col = LenFn(column(self.col.name)) + else: + col = column(self.col.name) # type: ignore case_stmts = [] for bin_num in range(num_bins): diff --git a/ingestion/src/metadata/profiler/metrics/static/max.py b/ingestion/src/metadata/profiler/metrics/static/max.py index 67c39ea7897..43557fd4fca 100644 --- a/ingestion/src/metadata/profiler/metrics/static/max.py +++ b/ingestion/src/metadata/profiler/metrics/static/max.py @@ -22,7 +22,6 @@ from sqlalchemy.sql.functions import GenericFunction from metadata.profiler.metrics.core import CACHE, StaticMetric, _label from metadata.profiler.orm.functions.length import LenFn from metadata.profiler.orm.registry import ( - Dialects, is_concatenable, is_date_time, is_quantifiable, @@ -40,12 +39,6 @@ def _(element, compiler, **kw): return f"MAX({col})" -@compiles(MaxFn, Dialects.Impala) -def _(element, compiler, **kw): - col = compiler.process(element.clauses, **kw) - return f"MAX(if(is_nan({col}) or is_inf({col}), null, {col}))" - - class Max(StaticMetric): """ MAX Metric diff --git a/ingestion/src/metadata/profiler/metrics/static/mean.py b/ingestion/src/metadata/profiler/metrics/static/mean.py index 33443017434..881911eae6e 100644 --- a/ingestion/src/metadata/profiler/metrics/static/mean.py +++ b/ingestion/src/metadata/profiler/metrics/static/mean.py @@ -52,15 +52,6 @@ def _(element, compiler, **kw): return f"avg(cast({proc} as decimal))" -@compiles(avg, Dialects.Impala) -def _(element, compiler, **kw): - """ - Convert NaN and Inf values to null before computing average. - """ - proc = compiler.process(element.clauses, **kw) - return f"avg(if(is_nan({proc}) or is_inf({proc}), null, {proc}))" - - class Mean(StaticMetric): """ AVG Metric diff --git a/ingestion/src/metadata/profiler/metrics/static/min.py b/ingestion/src/metadata/profiler/metrics/static/min.py index 8971b2ff757..89f94914967 100644 --- a/ingestion/src/metadata/profiler/metrics/static/min.py +++ b/ingestion/src/metadata/profiler/metrics/static/min.py @@ -22,7 +22,6 @@ from sqlalchemy.sql.functions import GenericFunction from metadata.profiler.metrics.core import CACHE, StaticMetric, _label from metadata.profiler.orm.functions.length import LenFn from metadata.profiler.orm.registry import ( - Dialects, is_concatenable, is_date_time, is_quantifiable, @@ -40,12 +39,6 @@ def _(element, compiler, **kw): return f"MIN({col})" -@compiles(MinFn, Dialects.Impala) -def _(element, compiler, **kw): - col = compiler.process(element.clauses, **kw) - return f"MIN(if(is_nan({col}) or is_inf({col}), null, {col}))" - - class Min(StaticMetric): """ MIN Metric diff --git a/ingestion/src/metadata/profiler/metrics/static/stddev.py b/ingestion/src/metadata/profiler/metrics/static/stddev.py index 0bcf16aaf07..465972788de 100644 --- a/ingestion/src/metadata/profiler/metrics/static/stddev.py +++ b/ingestion/src/metadata/profiler/metrics/static/stddev.py @@ -44,12 +44,6 @@ def _(element, compiler, **kw): return "STDEVP(%s)" % compiler.process(element.clauses, **kw) -@compiles(StdDevFn, Dialects.Impala) -def _(element, compiler, **kw): - col = compiler.process(element.clauses, **kw) - return f"STDDEV_POP(if(is_nan({col}) or is_inf({col}), null, {col}))" - - @compiles(StdDevFn, Dialects.SQLite) # Needed for unit tests def _(element, compiler, **kw): """ diff --git a/ingestion/src/metadata/profiler/orm/functions/median.py b/ingestion/src/metadata/profiler/orm/functions/median.py index 8041420a9af..1826e52a8a9 100644 --- a/ingestion/src/metadata/profiler/orm/functions/median.py +++ b/ingestion/src/metadata/profiler/orm/functions/median.py @@ -111,7 +111,7 @@ def _(elements, compiler, **kwargs): col, _, percentile = [ compiler.process(element, **kwargs) for element in elements.clauses ] - return f"if({percentile} = .5, appx_median(if(is_nan({col}) or is_inf({col}), null, {col})), null)" + return f"if({percentile} = .5, appx_median({col}), null)" @compiles(MedianFn, Dialects.MySQL) diff --git a/ingestion/src/metadata/profiler/orm/functions/sum.py b/ingestion/src/metadata/profiler/orm/functions/sum.py index 3a1f449661e..534be4c8aa2 100644 --- a/ingestion/src/metadata/profiler/orm/functions/sum.py +++ b/ingestion/src/metadata/profiler/orm/functions/sum.py @@ -78,7 +78,7 @@ def _(element, compiler, **kw): def _(element, compiler, **kw): """Impala casting. Default BIGINT isn't big enough for some sums""" proc = compiler.process(element.clauses, **kw) - return f"SUM(CAST(if(is_nan({proc}) or is_inf({proc}), null, {proc}) AS DOUBLE))" + return f"SUM(CAST({proc} AS DOUBLE))" @compiles(SumFn, Dialects.IbmDbSa) diff --git a/ingestion/tests/unit/profiler/test_metrics.py b/ingestion/tests/unit/profiler/test_metrics.py index 08eb95b09ac..2c3bf082490 100644 --- a/ingestion/tests/unit/profiler/test_metrics.py +++ b/ingestion/tests/unit/profiler/test_metrics.py @@ -361,11 +361,14 @@ class MetricsTest(TestCase): age_histogram = res.get(User.age.name)[Metrics.HISTOGRAM.name] id_histogram = res.get(User.id.name)[Metrics.HISTOGRAM.name] + comments_histogram = res.get(User.comments.name)[Metrics.HISTOGRAM.name] assert age_histogram assert len(age_histogram["frequencies"]) == 1 assert id_histogram assert len(id_histogram["frequencies"]) == 2 + assert comments_histogram + assert len(comments_histogram["frequencies"]) == 1 def test_like_count(self): """