From 603d61eaa25eb24f49e68e3d1e2f12c79eaa3bf8 Mon Sep 17 00:00:00 2001 From: Pere Miquel Brull Date: Wed, 26 Feb 2025 20:30:42 +0100 Subject: [PATCH] Fix #19856 - Set the db in mysql/mariadb for metric computations (#19994) * use db for mysql/mariadb * format --- .../metadata/mixins/sqalchemy/sqa_mixin.py | 27 +++++++++++++------ .../sqlalchemy/profiler_interface.py | 2 +- .../profiler/metrics/window/first_quartile.py | 2 +- .../profiler/metrics/window/median.py | 2 +- .../profiler/metrics/window/third_quartile.py | 2 +- .../sampler/sqlalchemy/azuresql/sampler.py | 4 ++- 6 files changed, 26 insertions(+), 13 deletions(-) diff --git a/ingestion/src/metadata/mixins/sqalchemy/sqa_mixin.py b/ingestion/src/metadata/mixins/sqalchemy/sqa_mixin.py index 32cda7c1008..a869db6ee49 100644 --- a/ingestion/src/metadata/mixins/sqalchemy/sqa_mixin.py +++ b/ingestion/src/metadata/mixins/sqalchemy/sqa_mixin.py @@ -24,6 +24,12 @@ from metadata.generated.schema.entity.data.table import Table from metadata.generated.schema.entity.services.connections.database.databricksConnection import ( DatabricksConnection, ) +from metadata.generated.schema.entity.services.connections.database.mariaDBConnection import ( + MariaDBConnection, +) +from metadata.generated.schema.entity.services.connections.database.mysqlConnection import ( + MysqlConnection, +) from metadata.generated.schema.entity.services.connections.database.snowflakeConnection import ( SnowflakeType, ) @@ -79,21 +85,26 @@ class SQAInterfaceMixin(Root): ) def set_catalog(self, session) -> None: - """Set catalog for the session. Right now only databricks and unity catalog requires it + """Set the catalog or database for the session. Args: session (Session): sqa session object """ - if not isinstance( + if isinstance( self.service_connection_config, (UnityCatalogConnection, DatabricksConnection), ): - return - bind = session.get_bind() - bind.execute( - "USE CATALOG %(catalog)s;", - {"catalog": self.service_connection_config.catalog}, - ).first() + session.get_bind().execute( + "USE CATALOG %(catalog)s;", + {"catalog": self.service_connection_config.catalog}, + ).first() + + if isinstance( + self.service_connection_config, (MysqlConnection, MariaDBConnection) + ): + session.get_bind().execute( + f"USE {self.table_entity.databaseSchema.name};", + ) def close(self): """close session""" diff --git a/ingestion/src/metadata/profiler/interface/sqlalchemy/profiler_interface.py b/ingestion/src/metadata/profiler/interface/sqlalchemy/profiler_interface.py index 53ca20c6e4c..5614d42e665 100644 --- a/ingestion/src/metadata/profiler/interface/sqlalchemy/profiler_interface.py +++ b/ingestion/src/metadata/profiler/interface/sqlalchemy/profiler_interface.py @@ -297,7 +297,7 @@ class SQAProfilerInterface(ProfilerInterface, SQAInterfaceMixin): column: the column to compute the metrics against metrics: list of metrics to compute Returns: - dictionnary of results + dictionary of results """ if not metrics: diff --git a/ingestion/src/metadata/profiler/metrics/window/first_quartile.py b/ingestion/src/metadata/profiler/metrics/window/first_quartile.py index 2b20bd3af6e..fc0ca051475 100644 --- a/ingestion/src/metadata/profiler/metrics/window/first_quartile.py +++ b/ingestion/src/metadata/profiler/metrics/window/first_quartile.py @@ -88,7 +88,7 @@ class FirstQuartile(StaticMetric, PercentilMixin): except MemoryError: logger.error( f"Unable to compute Median for {self.col.name} due to memory constraints." - f"We recommend using a smaller sample size or partitionning." + f"We recommend using a smaller sample size or partitioning." ) return None # check if nan diff --git a/ingestion/src/metadata/profiler/metrics/window/median.py b/ingestion/src/metadata/profiler/metrics/window/median.py index c12878de918..8f34f5f7a95 100644 --- a/ingestion/src/metadata/profiler/metrics/window/median.py +++ b/ingestion/src/metadata/profiler/metrics/window/median.py @@ -87,7 +87,7 @@ class Median(StaticMetric, PercentilMixin): except MemoryError: logger.error( f"Unable to compute Median for {self.col.name} due to memory constraints." - f"We recommend using a smaller sample size or partitionning." + f"We recommend using a smaller sample size or partitioning." ) return None try: diff --git a/ingestion/src/metadata/profiler/metrics/window/third_quartile.py b/ingestion/src/metadata/profiler/metrics/window/third_quartile.py index 8f0479a097c..a8b629b91a0 100644 --- a/ingestion/src/metadata/profiler/metrics/window/third_quartile.py +++ b/ingestion/src/metadata/profiler/metrics/window/third_quartile.py @@ -88,7 +88,7 @@ class ThirdQuartile(StaticMetric, PercentilMixin): except MemoryError: logger.error( f"Unable to compute Median for {self.col.name} due to memory constraints." - f"We recommend using a smaller sample size or partitionning." + f"We recommend using a smaller sample size or partitioning." ) return None # check if nan diff --git a/ingestion/src/metadata/sampler/sqlalchemy/azuresql/sampler.py b/ingestion/src/metadata/sampler/sqlalchemy/azuresql/sampler.py index 56289b63a87..09974ff8f96 100644 --- a/ingestion/src/metadata/sampler/sqlalchemy/azuresql/sampler.py +++ b/ingestion/src/metadata/sampler/sqlalchemy/azuresql/sampler.py @@ -51,7 +51,9 @@ class AzureSQLSampler(SQASampler): def get_sample_query(self, *, column=None) -> Query: """get query for sample data""" - rnd = self._base_sample_query(column).cte(f"{self.get_sampler_table_name()}_rnd") + rnd = self._base_sample_query(column).cte( + f"{self.get_sampler_table_name()}_rnd" + ) query = self.client.query(rnd) return query.cte(f"{self.get_sampler_table_name()}_sample")