From 342eaee0929530c15b3332bd345812e4a0400e3e Mon Sep 17 00:00:00 2001 From: gpby Date: Wed, 21 May 2025 10:12:15 +0300 Subject: [PATCH] Fixes #20956: Teradata profiler (#21292) * add teradata functions * fix teradata schema * reformat code * change random approach for teradata --------- Co-authored-by: Teddy --- ingestion/src/metadata/profiler/orm/functions/length.py | 1 + ingestion/src/metadata/profiler/orm/functions/md5.py | 9 +++++++++ ingestion/src/metadata/profiler/orm/functions/modulo.py | 1 + .../src/metadata/profiler/orm/functions/random_num.py | 4 ++++ ingestion/src/metadata/profiler/orm/registry.py | 1 + 5 files changed, 16 insertions(+) diff --git a/ingestion/src/metadata/profiler/orm/functions/length.py b/ingestion/src/metadata/profiler/orm/functions/length.py index 153791479d0..8c9c71a9f32 100644 --- a/ingestion/src/metadata/profiler/orm/functions/length.py +++ b/ingestion/src/metadata/profiler/orm/functions/length.py @@ -52,6 +52,7 @@ def _(element, compiler, **kw): @compiles(LenFn, Dialects.Hana) @compiles(LenFn, Dialects.Druid) @compiles(LenFn, Dialects.Doris) +@compiles(LenFn, Dialects.Teradata) def _(element, compiler, **kw): return "LENGTH(%s)" % compiler.process(element.clauses, **kw) diff --git a/ingestion/src/metadata/profiler/orm/functions/md5.py b/ingestion/src/metadata/profiler/orm/functions/md5.py index 2a493e95f12..76f96c9792c 100644 --- a/ingestion/src/metadata/profiler/orm/functions/md5.py +++ b/ingestion/src/metadata/profiler/orm/functions/md5.py @@ -40,3 +40,12 @@ def _(element, compiler, **kw): @compiles(MD5, PythonDialects.BigQuery.value) def _(element, compiler, **kw): return f"TO_HEX(MD5(CAST({compiler.process(element.clauses, **kw)} AS STRING)))" + + +@compiles(MD5, PythonDialects.Teradata.value) +def _(element, compiler, **kw): + # Teradata has no built-in MD5 or any other hash function, + # but we can use the hash_md5 UDF published by the Teradata Community + return ( + f"HASH_MD5(CAST({compiler.process(element.clauses, **kw)} AS 
VARCHAR(32000)))" + ) diff --git a/ingestion/src/metadata/profiler/orm/functions/modulo.py b/ingestion/src/metadata/profiler/orm/functions/modulo.py index 699461e5ef6..1986a1df8fa 100644 --- a/ingestion/src/metadata/profiler/orm/functions/modulo.py +++ b/ingestion/src/metadata/profiler/orm/functions/modulo.py @@ -61,6 +61,7 @@ def _(element, compiler, **kw): @compiles(ModuloFn, Dialects.Vertica) @compiles(ModuloFn, Dialects.Hana) @compiles(ModuloFn, Dialects.Cockroach) +@compiles(ModuloFn, Dialects.Teradata) def _(element, compiler, **kw): """Modulo function for specific dialect""" value, base = validate_and_compile(element, compiler, **kw) diff --git a/ingestion/src/metadata/profiler/orm/functions/random_num.py b/ingestion/src/metadata/profiler/orm/functions/random_num.py index 18351b886c7..36ea3a633b7 100644 --- a/ingestion/src/metadata/profiler/orm/functions/random_num.py +++ b/ingestion/src/metadata/profiler/orm/functions/random_num.py @@ -98,10 +98,14 @@ def _(*_, **__): @compiles(RandomNumFn, Dialects.Snowflake) +@compiles(RandomNumFn, Dialects.Teradata) def _(*_, **__): """We use FROM SAMPLE BERNOULLI (n) for sampling in snowflake. We'll return 0 to make sure we get all the rows from the already sampled results when executing row::MOD(0, 100) < profile_sample. + + The Teradata RANDOM(0,100) function can't be used inside an ORDER BY clause, + so we use the same trick for Teradata. """ return "0" diff --git a/ingestion/src/metadata/profiler/orm/registry.py b/ingestion/src/metadata/profiler/orm/registry.py index 38a618a3433..5b44265c457 100644 --- a/ingestion/src/metadata/profiler/orm/registry.py +++ b/ingestion/src/metadata/profiler/orm/registry.py @@ -86,6 +86,7 @@ class PythonDialects(Enum): SingleStore = "singlestore" SQLite = "sqlite" Snowflake = "snowflake" + Teradata = "teradatasql" Trino = "trino" Vertica = "vertica"