From 27984c25f329013e7254787ecbae5b545ba531a0 Mon Sep 17 00:00:00 2001
From: Keith Sirmons
Date: Thu, 6 Apr 2023 23:48:18 -0500
Subject: [PATCH] Fixes 10959: MSSQL profile has error when averaging large integer column (#10960)

* updated metadata to work with the impala query engine. Uses the describe function to grab column names, data types, and comments.
* added the ordinalPosition data point into the Column constructor.
* renamed variable to better describe its usage.
* updated profile errors. Hive connections now comment columns by default.
* removed print statements
* Cleaned up code by pulling check into its own function
* Updated median function to return null when it is being used for first and third quartiles.
* updated metadata to work with the impala query engine. Uses the describe function to grab column names, data types, and comments.
* added the ordinalPosition data point into the Column constructor.
* renamed variable to better describe its usage.
* updated profile errors. Hive connections now comment columns by default.
* removed print statements
* Cleaned up code by pulling check into its own function
* Updated median function to return null when it is being used for first and third quartiles.
* removed print statements and ran make py_format
* updated to fix some pylint errors. imported Dialects to remove string compare to "impala" engine
* moved huge comment into function docstring.
This comment shows us the sql to get quartiles in Impala
* added cast to decimal for column when running average in mean.py
* fixed lint error
---
 ingestion/src/metadata/profiler/metrics/static/mean.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/ingestion/src/metadata/profiler/metrics/static/mean.py b/ingestion/src/metadata/profiler/metrics/static/mean.py
index 01538152d52..7261d256377 100644
--- a/ingestion/src/metadata/profiler/metrics/static/mean.py
+++ b/ingestion/src/metadata/profiler/metrics/static/mean.py
@@ -41,6 +41,16 @@ def _(element, compiler, **kw):
     return f"if(isNaN(avg({proc})), null, avg({proc}))"
 
 
+@compiles(avg, Dialects.MSSQL)
+def _(element, compiler, **kw):
+    """
+    Cast to decimal(38, 10) to avoid Error 8115 (arithmetic overflow on int averages).
+    Bare `decimal` is decimal(18, 0) in MSSQL and would round off fractional values.
+    """
+    proc = compiler.process(element.clauses, **kw)
+    return f"avg(cast({proc} as decimal(38, 10)))"
+
+
 class Mean(StaticMetric):
     """
     AVG Metric