From 3c5fbffeaa633d1aa1393d9460fddd4f9cf8cc45 Mon Sep 17 00:00:00 2001 From: Teddy Date: Mon, 2 Jun 2025 17:55:48 +0200 Subject: [PATCH] feat: add regex support for dbx (#21514) --- .../metrics/static/not_regexp_match_count.py | 7 ++-- .../metrics/static/regexp_match_count.py | 5 ++- .../metadata/profiler/orm/functions/regexp.py | 40 +++++++++++++++++++ 3 files changed, 47 insertions(+), 5 deletions(-) create mode 100644 ingestion/src/metadata/profiler/orm/functions/regexp.py diff --git a/ingestion/src/metadata/profiler/metrics/static/not_regexp_match_count.py b/ingestion/src/metadata/profiler/metrics/static/not_regexp_match_count.py index ebb851afe6a..d1bd893225f 100644 --- a/ingestion/src/metadata/profiler/metrics/static/not_regexp_match_count.py +++ b/ingestion/src/metadata/profiler/metrics/static/not_regexp_match_count.py @@ -18,6 +18,7 @@ from sqlalchemy import case, column, not_ from metadata.generated.schema.configuration.profilerConfiguration import MetricType from metadata.profiler.metrics.core import StaticMetric, _label +from metadata.profiler.orm.functions.regexp import RegexpMatchFn from metadata.profiler.orm.functions.sum import SumFn from metadata.profiler.orm.registry import is_concatenable @@ -58,9 +59,9 @@ class NotRegexCount(StaticMetric): [ ( not_( - column(self.col.name, self.col.type).regexp_match( - self.expression - ) + RegexpMatchFn( + column(self.col.name, self.col.type), self.expression + ), ), 0, ) diff --git a/ingestion/src/metadata/profiler/metrics/static/regexp_match_count.py b/ingestion/src/metadata/profiler/metrics/static/regexp_match_count.py index 95e629a45f3..35bf379ea32 100644 --- a/ingestion/src/metadata/profiler/metrics/static/regexp_match_count.py +++ b/ingestion/src/metadata/profiler/metrics/static/regexp_match_count.py @@ -18,6 +18,7 @@ from sqlalchemy import case, column from metadata.generated.schema.configuration.profilerConfiguration import MetricType from metadata.profiler.metrics.core import StaticMetric, _label +from 
metadata.profiler.orm.functions.regexp import RegexpMatchFn from metadata.profiler.orm.functions.sum import SumFn from metadata.profiler.orm.registry import is_concatenable @@ -57,8 +58,8 @@ class RegexCount(StaticMetric): case( [ ( - column(self.col.name, self.col.type).regexp_match( - self.expression + RegexpMatchFn( + column(self.col.name, self.col.type), self.expression ), 1, ) diff --git a/ingestion/src/metadata/profiler/orm/functions/regexp.py b/ingestion/src/metadata/profiler/orm/functions/regexp.py new file mode 100644 index 00000000000..01617d86da9 --- /dev/null +++ b/ingestion/src/metadata/profiler/orm/functions/regexp.py @@ -0,0 +1,40 @@ +# Copyright 2025 Collate +# Licensed under the Collate Community License, Version 1.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# https://github.com/open-metadata/OpenMetadata/blob/main/ingestion/LICENSE +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+"""Regexp match SQL function with a default and a Databricks compilation."""
+
+from sqlalchemy.ext.compiler import compiles
+from sqlalchemy.sql.functions import GenericFunction
+
+from metadata.profiler.metrics.core import CACHE
+from metadata.profiler.orm.registry import Dialects
+
+
+class RegexpMatchFn(GenericFunction):
+    # Expects two clauses (column, pattern); rendered by the @compiles hooks.
+    name = "regexp_match"
+    inherit_cache = CACHE
+
+
+@compiles(RegexpMatchFn)
+def _(element, compiler, **kw):
+    """Default compilation: delegate to the column's ``regexp_match`` operator."""
+    operand, regex = element.clauses
+    return compiler.process(operand.regexp_match(regex), **kw)
+
+
+@compiles(RegexpMatchFn, Dialects.Databricks)
+def _(element, compiler, **kw):
+    """Databricks compilation: emit ``REGEXP_LIKE(<column>, <pattern>)``."""
+    operand, regex = element.clauses
+    compiled_column = compiler.process(operand, **kw)
+    compiled_pattern = compiler.process(regex, **kw)
+    return f"REGEXP_LIKE({compiled_column}, {compiled_pattern})"