From eeda6d24ae469bab771e15c2ecbd76e09395d736 Mon Sep 17 00:00:00 2001
From: Ayush Shah <ayush@getcollate.io>
Date: Fri, 12 Jul 2024 17:23:53 +0530
Subject: [PATCH] Fixes #16697: Modify the Query to avoid Numeric Data Overflow
 (#16920)

---
 .../metadata/profiler/metrics/static/mean.py  |  9 ++++++++
 .../metadata/profiler/orm/functions/sum.py    |  7 +++++++
 ingestion/tests/cli_e2e/test_cli_redshift.py  | 21 ++++++++++---------
 3 files changed, 27 insertions(+), 10 deletions(-)

diff --git a/ingestion/src/metadata/profiler/metrics/static/mean.py b/ingestion/src/metadata/profiler/metrics/static/mean.py
index 2507e30692d..f680c22a7f8 100644
--- a/ingestion/src/metadata/profiler/metrics/static/mean.py
+++ b/ingestion/src/metadata/profiler/metrics/static/mean.py
@@ -52,6 +52,15 @@ def _(element, compiler, **kw):
     return f"if(isNaN(avg({proc})), null, avg({proc}))"
 
 
+@compiles(avg, Dialects.Redshift)
+def _(element, compiler, **kw):
+    """
+    Cast to decimal to get around potential integer overflow error
+    """
+    proc = compiler.process(element.clauses, **kw)
+    return f"avg(CAST({proc} AS DECIMAL(38,0)))"
+
+
 @compiles(avg, Dialects.MSSQL)
 def _(element, compiler, **kw):
     """
diff --git a/ingestion/src/metadata/profiler/orm/functions/sum.py b/ingestion/src/metadata/profiler/orm/functions/sum.py
index 6cc19181b61..a89810d2ba0 100644
--- a/ingestion/src/metadata/profiler/orm/functions/sum.py
+++ b/ingestion/src/metadata/profiler/orm/functions/sum.py
@@ -32,6 +32,13 @@ def _(element, compiler, **kw):
     return f"SUM(CAST({proc} AS BIGINT))"
 
 
+@compiles(SumFn, Dialects.Redshift)
+def _(element, compiler, **kw):
+    """Cast to Decimal to address overflow error from summing 32-bit int in most database dialects"""
+    proc = compiler.process(element.clauses, **kw)
+    return f"SUM(CAST({proc} AS Decimal(38,0)))"
+
+
 @compiles(SumFn, Dialects.Athena)
 @compiles(SumFn, Dialects.Trino)
 @compiles(SumFn, Dialects.Presto)
diff --git a/ingestion/tests/cli_e2e/test_cli_redshift.py b/ingestion/tests/cli_e2e/test_cli_redshift.py
index 3780155da7a..405dfe3e9d2 100644
--- a/ingestion/tests/cli_e2e/test_cli_redshift.py
+++ b/ingestion/tests/cli_e2e/test_cli_redshift.py
@@ -26,7 +26,8 @@ class RedshiftCliTest(CliCommonDB.TestSuite, SQACommonMethods):
         CREATE TABLE IF NOT EXISTS e2e_cli_tests.dbt_jaffle.persons (
             person_id int,
             full_name varchar(255),
-            birthdate date
+            birthdate date,
+            bigint_col bigint
         )
     """
 
@@ -38,13 +39,13 @@ class RedshiftCliTest(CliCommonDB.TestSuite, SQACommonMethods):
 
     insert_data_queries: List[str] = [
         """
-    INSERT INTO e2e_cli_tests.dbt_jaffle.persons (person_id, full_name, birthdate) VALUES
-        (1,'Peter Parker', '2004-08-10'),
-        (2,'Bruce Banner', '1988-12-18'),
-        (3,'Steve Rogers', '1988-07-04'),
-        (4,'Natasha Romanoff', '1997-12-03'),
-        (5,'Wanda Maximoff', '1998-02-10'),
-        (6,'Diana Prince', '1976-03-17');
+    INSERT INTO e2e_cli_tests.dbt_jaffle.persons (person_id, full_name, birthdate, bigint_col) VALUES
+        (1,'Peter Parker', '2004-08-10', 9223372036854775807),
+        (2,'Bruce Banner', '1988-12-18', 9223372036854775807),
+        (3,'Steve Rogers', '1988-07-04', 9223372036854775807),
+        (4,'Natasha Romanoff', '1997-12-03', 9223372036854775807),
+        (5,'Wanda Maximoff', '1998-02-10', 9223372036854775807),
+        (6,'Diana Prince', '1976-03-17', 9000000000000000007);
     """
     ]
 
@@ -193,13 +194,13 @@ class RedshiftCliTest(CliCommonDB.TestSuite, SQACommonMethods):
                         "interQuartileRange": 467.7975,
                         "max": 856.41,
                         "maxLength": None,
-                        "mean": -160.16,
+                        "mean": -159.0,
                         "median": -288.81,
                         "min": -999.63,
                         "minLength": None,
                         "missingCount": None,
                         "missingPercentage": None,
-                        "nonParametricSkew": 0.24351799263849705,
+                        "nonParametricSkew": 0.24571372424720792,
                         "nullCount": 0.0,
                         "nullProportion": 0.0,
                         "stddev": 528.297718809555,