From 342eaee0929530c15b3332bd345812e4a0400e3e Mon Sep 17 00:00:00 2001
From: gpby <gpby@users.noreply.github.com>
Date: Wed, 21 May 2025 10:12:15 +0300
Subject: [PATCH] Fixes #20956: Teradata profiler (#21292)

* add teradata functions

* fix teradata schema

* reformat code

* change random approach for teradata

---------

Co-authored-by: Teddy <teddy.crepineau@gmail.com>
---
 ingestion/src/metadata/profiler/orm/functions/length.py  | 1 +
 ingestion/src/metadata/profiler/orm/functions/md5.py     | 9 +++++++++
 ingestion/src/metadata/profiler/orm/functions/modulo.py  | 1 +
 .../src/metadata/profiler/orm/functions/random_num.py    | 4 ++++
 ingestion/src/metadata/profiler/orm/registry.py          | 1 +
 5 files changed, 16 insertions(+)
diff --git a/ingestion/src/metadata/profiler/orm/functions/length.py b/ingestion/src/metadata/profiler/orm/functions/length.py
index 153791479d0..8c9c71a9f32 100644
--- a/ingestion/src/metadata/profiler/orm/functions/length.py
+++ b/ingestion/src/metadata/profiler/orm/functions/length.py
@@ -52,6 +52,7 @@ def _(element, compiler, **kw):
 @compiles(LenFn, Dialects.Hana)
 @compiles(LenFn, Dialects.Druid)
 @compiles(LenFn, Dialects.Doris)
+@compiles(LenFn, Dialects.Teradata)
 def _(element, compiler, **kw):
     return "LENGTH(%s)" % compiler.process(element.clauses, **kw)
 
diff --git a/ingestion/src/metadata/profiler/orm/functions/md5.py b/ingestion/src/metadata/profiler/orm/functions/md5.py
index 2a493e95f12..76f96c9792c 100644
--- a/ingestion/src/metadata/profiler/orm/functions/md5.py
+++ b/ingestion/src/metadata/profiler/orm/functions/md5.py
@@ -40,3 +40,12 @@ def _(element, compiler, **kw):
 @compiles(MD5, PythonDialects.BigQuery.value)
 def _(element, compiler, **kw):
     return f"TO_HEX(MD5(CAST({compiler.process(element.clauses, **kw)} AS STRING)))"
+
+
+@compiles(MD5, PythonDialects.Teradata.value)
+def _(element, compiler, **kw):
+    # Teradata has no built-in MD5 (or any other hash) function,
+    # but we can use the hash_md5 UDF published by the Teradata Community
+    return (
+        f"HASH_MD5(CAST({compiler.process(element.clauses, **kw)} AS VARCHAR(32000)))"
+    )
diff --git a/ingestion/src/metadata/profiler/orm/functions/modulo.py b/ingestion/src/metadata/profiler/orm/functions/modulo.py
index 699461e5ef6..1986a1df8fa 100644
--- a/ingestion/src/metadata/profiler/orm/functions/modulo.py
+++ b/ingestion/src/metadata/profiler/orm/functions/modulo.py
@@ -61,6 +61,7 @@ def _(element, compiler, **kw):
 @compiles(ModuloFn, Dialects.Vertica)
 @compiles(ModuloFn, Dialects.Hana)
 @compiles(ModuloFn, Dialects.Cockroach)
+@compiles(ModuloFn, Dialects.Teradata)
 def _(element, compiler, **kw):
     """Modulo function for specific dialect"""
     value, base = validate_and_compile(element, compiler, **kw)
diff --git a/ingestion/src/metadata/profiler/orm/functions/random_num.py b/ingestion/src/metadata/profiler/orm/functions/random_num.py
index 18351b886c7..36ea3a633b7 100644
--- a/ingestion/src/metadata/profiler/orm/functions/random_num.py
+++ b/ingestion/src/metadata/profiler/orm/functions/random_num.py
@@ -98,10 +98,14 @@ def _(*_, **__):
 
 
 @compiles(RandomNumFn, Dialects.Snowflake)
+@compiles(RandomNumFn, Dialects.Teradata)
 def _(*_, **__):
     """We use FROM <table> SAMPLE BERNOULLI (n) for sampling
     in snowflake. We'll return 0 to make sure we get all the rows
     from the already sampled results when executing row::MOD(0, 100) < profile_sample.
+
+    Teradata's RANDOM(0,100) function can't be used inside an ORDER BY clause, so
+    we use the same trick for Teradata.
     """
     return "0"
 
diff --git a/ingestion/src/metadata/profiler/orm/registry.py b/ingestion/src/metadata/profiler/orm/registry.py
index 38a618a3433..5b44265c457 100644
--- a/ingestion/src/metadata/profiler/orm/registry.py
+++ b/ingestion/src/metadata/profiler/orm/registry.py
@@ -86,6 +86,7 @@ class PythonDialects(Enum):
     SingleStore = "singlestore"
     SQLite = "sqlite"
     Snowflake = "snowflake"
+    Teradata = "teradatasql"
     Trino = "trino"
     Vertica = "vertica"