From eeda6d24ae469bab771e15c2ecbd76e09395d736 Mon Sep 17 00:00:00 2001 From: Ayush Shah Date: Fri, 12 Jul 2024 17:23:53 +0530 Subject: [PATCH] Fixes #16697: Modify the Query to avoid Numeric Data Overflow (#16920) --- .../metadata/profiler/metrics/static/mean.py | 9 ++++++++ .../metadata/profiler/orm/functions/sum.py | 7 +++++++ ingestion/tests/cli_e2e/test_cli_redshift.py | 21 ++++++++++--------- 3 files changed, 27 insertions(+), 10 deletions(-) diff --git a/ingestion/src/metadata/profiler/metrics/static/mean.py b/ingestion/src/metadata/profiler/metrics/static/mean.py index 2507e30692d..f680c22a7f8 100644 --- a/ingestion/src/metadata/profiler/metrics/static/mean.py +++ b/ingestion/src/metadata/profiler/metrics/static/mean.py @@ -52,6 +52,15 @@ def _(element, compiler, **kw): return f"if(isNaN(avg({proc})), null, avg({proc}))" +@compiles(avg, Dialects.Redshift) +def _(element, compiler, **kw): + """ + Cast to DECIMAL(38,0) to avoid integer (BIGINT) overflow; scale 0 keeps no fractional digits + """ + proc = compiler.process(element.clauses, **kw) + return f"avg(CAST({proc} AS DECIMAL(38,0)))" + + @compiles(avg, Dialects.MSSQL) def _(element, compiler, **kw): """ diff --git a/ingestion/src/metadata/profiler/orm/functions/sum.py b/ingestion/src/metadata/profiler/orm/functions/sum.py index 6cc19181b61..a89810d2ba0 100644 --- a/ingestion/src/metadata/profiler/orm/functions/sum.py +++ b/ingestion/src/metadata/profiler/orm/functions/sum.py @@ -32,6 +32,13 @@ def _(element, compiler, **kw): return f"SUM(CAST({proc} AS BIGINT))" +@compiles(SumFn, Dialects.Redshift) +def _(element, compiler, **kw): + """Cast to Decimal(38,0) to address overflow error from summing 64-bit (BIGINT) values in Redshift""" + proc = compiler.process(element.clauses, **kw) + return f"SUM(CAST({proc} AS Decimal(38,0)))" + + @compiles(SumFn, Dialects.Athena) @compiles(SumFn, Dialects.Trino) @compiles(SumFn, Dialects.Presto) diff --git a/ingestion/tests/cli_e2e/test_cli_redshift.py b/ingestion/tests/cli_e2e/test_cli_redshift.py index 
3780155da7a..405dfe3e9d2 100644 --- a/ingestion/tests/cli_e2e/test_cli_redshift.py +++ b/ingestion/tests/cli_e2e/test_cli_redshift.py @@ -26,7 +26,8 @@ class RedshiftCliTest(CliCommonDB.TestSuite, SQACommonMethods): CREATE TABLE IF NOT EXISTS e2e_cli_tests.dbt_jaffle.persons ( person_id int, full_name varchar(255), - birthdate date + birthdate date, + bigint_col bigint ) """ @@ -38,13 +39,13 @@ class RedshiftCliTest(CliCommonDB.TestSuite, SQACommonMethods): insert_data_queries: List[str] = [ """ - INSERT INTO e2e_cli_tests.dbt_jaffle.persons (person_id, full_name, birthdate) VALUES - (1,'Peter Parker', '2004-08-10'), - (2,'Bruce Banner', '1988-12-18'), - (3,'Steve Rogers', '1988-07-04'), - (4,'Natasha Romanoff', '1997-12-03'), - (5,'Wanda Maximoff', '1998-02-10'), - (6,'Diana Prince', '1976-03-17'); + INSERT INTO e2e_cli_tests.dbt_jaffle.persons (person_id, full_name, birthdate, bigint_col) VALUES + (1,'Peter Parker', '2004-08-10', 9223372036854775807), + (2,'Bruce Banner', '1988-12-18', 9223372036854775807), + (3,'Steve Rogers', '1988-07-04', 9223372036854775807), + (4,'Natasha Romanoff', '1997-12-03', 9223372036854775807), + (5,'Wanda Maximoff', '1998-02-10', 9223372036854775807), + (6,'Diana Prince', '1976-03-17', 9000000000000000007); """ ] @@ -193,13 +194,13 @@ class RedshiftCliTest(CliCommonDB.TestSuite, SQACommonMethods): "interQuartileRange": 467.7975, "max": 856.41, "maxLength": None, - "mean": -160.16, + "mean": -159.0, "median": -288.81, "min": -999.63, "minLength": None, "missingCount": None, "missingPercentage": None, - "nonParametricSkew": 0.24351799263849705, + "nonParametricSkew": 0.24571372424720792, "nullCount": 0.0, "nullProportion": 0.0, "stddev": 528.297718809555,