diff --git a/ingestion/src/metadata/profiler/interface/profiler_interface_factory.py b/ingestion/src/metadata/profiler/interface/profiler_interface_factory.py index 10c30f810e9..661bf4f3d8e 100644 --- a/ingestion/src/metadata/profiler/interface/profiler_interface_factory.py +++ b/ingestion/src/metadata/profiler/interface/profiler_interface_factory.py @@ -24,6 +24,9 @@ from metadata.generated.schema.entity.services.connections.database.databricksCo from metadata.generated.schema.entity.services.connections.database.datalakeConnection import ( DatalakeConnection, ) +from metadata.generated.schema.entity.services.connections.database.mariaDBConnection import ( + MariaDBConnection, +) from metadata.generated.schema.entity.services.connections.database.singleStoreConnection import ( SingleStoreConnection, ) @@ -47,6 +50,9 @@ from metadata.profiler.interface.sqlalchemy.bigquery.profiler_interface import ( from metadata.profiler.interface.sqlalchemy.databricks.profiler_interface import ( DatabricksProfilerInterface, ) +from metadata.profiler.interface.sqlalchemy.mariadb.profiler_interface import ( + MariaDBProfilerInterface, +) from metadata.profiler.interface.sqlalchemy.profiler_interface import ( SQAProfilerInterface, ) @@ -100,6 +106,7 @@ profilers = { BigQueryConnection.__name__: BigQueryProfilerInterface, SingleStoreConnection.__name__: SingleStoreProfilerInterface, DatalakeConnection.__name__: PandasProfilerInterface, + MariaDBConnection.__name__: MariaDBProfilerInterface, SnowflakeConnection.__name__: SnowflakeProfilerInterface, TrinoConnection.__name__: TrinoProfilerInterface, UnityCatalogConnection.__name__: UnityCatalogProfilerInterface, diff --git a/ingestion/src/metadata/profiler/interface/sqlalchemy/mariadb/profiler_interface.py b/ingestion/src/metadata/profiler/interface/sqlalchemy/mariadb/profiler_interface.py new file mode 100644 index 00000000000..153a04a852c --- /dev/null +++ b/ingestion/src/metadata/profiler/interface/sqlalchemy/mariadb/profiler_interface.py @@ -0,0 +1,85 @@ +# Copyright 2021 Collate +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Interfaces with database for all database engine +supporting sqlalchemy abstraction layer +""" + +from typing import List + +from sqlalchemy.exc import ProgrammingError + +from metadata.profiler.interface.sqlalchemy.profiler_interface import ( + SQAProfilerInterface, + handle_query_exception, +) +from metadata.profiler.metrics.registry import Metrics +from metadata.profiler.processor.runner import QueryRunner +from metadata.profiler.source.mariadb.metrics.window.first_quartile import ( + MariaDBFirstQuartile, +) +from metadata.profiler.source.mariadb.metrics.window.median import MariaDBMedian +from metadata.profiler.source.mariadb.metrics.window.third_quartile import ( + MariaDBThirdQuartile, +) +from metadata.utils.logger import profiler_interface_registry_logger + +logger = profiler_interface_registry_logger() + + +class MariaDBProfilerInterface(SQAProfilerInterface): + """ + Interface to interact with registry supporting + sqlalchemy. + """ + + def _compute_window_metrics( + self, + metrics: List[Metrics], + runner: QueryRunner, + *args, + **kwargs, + ): + """Given a list of metrics, compute the given results + and returns the values + + Args: + column: the column to compute the metrics against + metrics: list of metrics to compute + Returns: + dictionnary of results + """ + session = kwargs.get("session") + column = kwargs.get("column") + + if not metrics: + return None + + try: + # we patch the metrics at runtime to use the MariaDB specific functions + # as we can't compile the query based on the dialect as it return `mysql` + metrics = [MariaDBFirstQuartile, MariaDBMedian, MariaDBThirdQuartile] # type: ignore + row = runner.select_first_from_sample( + *[metric(column).fn() for metric in metrics], + ) + if row: + return dict(row) + except ProgrammingError: + logger.info( + f"Skipping window metrics for {runner.table.__tablename__}.{column.name} due to overflow" + ) + return None + + except Exception as exc: + msg = f"Error trying to compute profile for {runner.table.__tablename__}.{column.name}: {exc}" + handle_query_exception(msg, exc, session) + return None diff --git a/ingestion/src/metadata/profiler/source/mariadb/functions/median.py b/ingestion/src/metadata/profiler/source/mariadb/functions/median.py new file mode 100644 index 00000000000..bb90356c64d --- /dev/null +++ b/ingestion/src/metadata/profiler/source/mariadb/functions/median.py @@ -0,0 +1,20 @@ +"""Median function for MariaDB""" + +from sqlalchemy.ext.compiler import compiles +from sqlalchemy.sql.functions import FunctionElement + +from metadata.profiler.metrics.core import CACHE + + +class MariaDBMedianFn(FunctionElement): + inherit_cache = CACHE + + +@compiles(MariaDBMedianFn) +def _(elements, compiler, **kwargs): # pylint: disable=unused-argument + col = compiler.process(elements.clauses.clauses[0]) + percentile = elements.clauses.clauses[2].value + # According to the documentation available at https://mariadb.com/kb/en/median/#description, + # the PERCENTILE_CONT function can be utilized to calculate the median. Therefore, it is + # being used in this context. + return f"PERCENTILE_CONT({percentile:.2f}) WITHIN GROUP (ORDER BY {col}) OVER()" diff --git a/ingestion/src/metadata/profiler/source/mariadb/metrics/window/first_quartile.py b/ingestion/src/metadata/profiler/source/mariadb/metrics/window/first_quartile.py new file mode 100644 index 00000000000..a56cf84cca6 --- /dev/null +++ b/ingestion/src/metadata/profiler/source/mariadb/metrics/window/first_quartile.py @@ -0,0 +1,10 @@ +"""Override first quartile metric definition for MariaDB""" + +from metadata.profiler.metrics.window.first_quartile import FirstQuartile +from metadata.profiler.source.mariadb.functions.median import MariaDBMedianFn + + +class MariaDBFirstQuartile(FirstQuartile): + def _compute_sqa_fn(self, column, table, percentile): + """Generic method to compute the quartile using sqlalchemy""" + return MariaDBMedianFn(column, table, percentile) diff --git a/ingestion/src/metadata/profiler/source/mariadb/metrics/window/median.py b/ingestion/src/metadata/profiler/source/mariadb/metrics/window/median.py new file mode 100644 index 00000000000..a5903175f11 --- /dev/null +++ b/ingestion/src/metadata/profiler/source/mariadb/metrics/window/median.py @@ -0,0 +1,10 @@ +"""Override first quartile metric definition for MariaDB""" + +from metadata.profiler.metrics.window.median import Median +from metadata.profiler.source.mariadb.functions.median import MariaDBMedianFn + + +class MariaDBMedian(Median): + def _compute_sqa_fn(self, column, table, percentile): + """Generic method to compute the quartile using sqlalchemy""" + return MariaDBMedianFn(column, table, percentile) diff --git a/ingestion/src/metadata/profiler/source/mariadb/metrics/window/third_quartile.py b/ingestion/src/metadata/profiler/source/mariadb/metrics/window/third_quartile.py new file mode 100644 index 00000000000..9211f5dfd75 --- /dev/null +++ b/ingestion/src/metadata/profiler/source/mariadb/metrics/window/third_quartile.py @@ -0,0 +1,10 @@ +"""Override first quartile metric definition for MariaDB""" + +from metadata.profiler.metrics.window.third_quartile import ThirdQuartile +from metadata.profiler.source.mariadb.functions.median import MariaDBMedianFn + + +class MariaDBThirdQuartile(ThirdQuartile): + def _compute_sqa_fn(self, column, table, percentile): + """Generic method to compute the quartile using sqlalchemy""" + return MariaDBMedianFn(column, table, percentile) diff --git a/ingestion/tests/unit/profiler/test_profiler_interface.py b/ingestion/tests/unit/profiler/test_profiler_interface.py index 2dc878a8e37..bff960b031c 100644 --- a/ingestion/tests/unit/profiler/test_profiler_interface.py +++ b/ingestion/tests/unit/profiler/test_profiler_interface.py @@ -39,6 +39,9 @@ from metadata.generated.schema.entity.services.connections.database.databricksCo from metadata.generated.schema.entity.services.connections.database.datalakeConnection import ( DatalakeConnection, ) +from metadata.generated.schema.entity.services.connections.database.mariaDBConnection import ( + MariaDBConnection, +) from metadata.generated.schema.entity.services.connections.database.singleStoreConnection import ( SingleStoreConnection, ) @@ -75,6 +78,9 @@ from metadata.profiler.interface.sqlalchemy.bigquery.profiler_interface import ( from metadata.profiler.interface.sqlalchemy.databricks.profiler_interface import ( DatabricksProfilerInterface, ) +from metadata.profiler.interface.sqlalchemy.mariadb.profiler_interface import ( + MariaDBProfilerInterface, +) from metadata.profiler.interface.sqlalchemy.profiler_interface import ( SQAProfilerInterface, ) @@ -369,6 +375,7 @@ class ProfilerInterfaceTest(TestCase): TrinoConnection.__name__: TrinoProfilerInterface, UnityCatalogConnection.__name__: UnityCatalogProfilerInterface, DatabricksConnection.__name__: DatabricksProfilerInterface, + MariaDBConnection.__name__: MariaDBProfilerInterface, } # Register profiles