diff --git a/ingestion/src/metadata/orm_profiler/metrics/core.py b/ingestion/src/metadata/orm_profiler/metrics/core.py index d4c3995617d..faf22373968 100644 --- a/ingestion/src/metadata/orm_profiler/metrics/core.py +++ b/ingestion/src/metadata/orm_profiler/metrics/core.py @@ -123,6 +123,15 @@ class Metric(ABC): """ return True + @classmethod + def is_window_metric(cls) -> bool: + """ + Marks the metric as a window metric. + + By default, assume it is not a window metric + """ + return False + @property def metric_type(self): """ diff --git a/ingestion/src/metadata/orm_profiler/metrics/registry.py b/ingestion/src/metadata/orm_profiler/metrics/registry.py index 4b18e5589b4..7decfc9ffdc 100644 --- a/ingestion/src/metadata/orm_profiler/metrics/registry.py +++ b/ingestion/src/metadata/orm_profiler/metrics/registry.py @@ -35,7 +35,6 @@ from metadata.orm_profiler.metrics.static.like_count import LikeCount from metadata.orm_profiler.metrics.static.max import Max from metadata.orm_profiler.metrics.static.max_length import MaxLength from metadata.orm_profiler.metrics.static.mean import Mean -from metadata.orm_profiler.metrics.static.median import Median from metadata.orm_profiler.metrics.static.min import Min from metadata.orm_profiler.metrics.static.min_length import MinLength from metadata.orm_profiler.metrics.static.not_like_count import NotLikeCount @@ -44,6 +43,7 @@ from metadata.orm_profiler.metrics.static.row_count import RowCount from metadata.orm_profiler.metrics.static.stddev import StdDev from metadata.orm_profiler.metrics.static.sum import Sum from metadata.orm_profiler.metrics.static.unique_count import UniqueCount +from metadata.orm_profiler.metrics.window.median import Median from metadata.orm_profiler.registry import MetricRegistry diff --git a/ingestion/src/metadata/orm_profiler/metrics/static/median.py b/ingestion/src/metadata/orm_profiler/metrics/window/median.py similarity index 95% rename from ingestion/src/metadata/orm_profiler/metrics/static/median.py rename to ingestion/src/metadata/orm_profiler/metrics/window/median.py index e343187a540..be487b50a9c 100644 --- a/ingestion/src/metadata/orm_profiler/metrics/static/median.py +++ b/ingestion/src/metadata/orm_profiler/metrics/window/median.py @@ -37,6 +37,10 @@ class Median(StaticMetric): def name(cls): return "median" + @classmethod + def is_window_metric(cls): + return True + @property def metric_type(self): return float diff --git a/ingestion/src/metadata/orm_profiler/profiler/core.py b/ingestion/src/metadata/orm_profiler/profiler/core.py index 96d8e614cf1..1f179139177 100644 --- a/ingestion/src/metadata/orm_profiler/profiler/core.py +++ b/ingestion/src/metadata/orm_profiler/profiler/core.py @@ -18,6 +18,7 @@ from typing import Any, Dict, Generic, List, Optional, Tuple, Type, Union from pydantic import ValidationError from sqlalchemy import Column, inspect +from sqlalchemy.engine.row import Row from sqlalchemy.orm import DeclarativeMeta from sqlalchemy.orm.session import Session from sqlalchemy.orm.util import AliasedClass @@ -241,7 +242,11 @@ class Profiler(Generic[TMetric]): try: row = self.runner.select_first_from_sample( - *[metric(col).fn() for metric in col_metrics] + *[ + metric(col).fn() + for metric in col_metrics + if not metric.is_window_metric() + ] ) self._column_results[col.name].update(dict(row)) except (TimeoutError, Exception) as err: @@ -342,6 +347,39 @@ class Profiler(Generic[TMetric]): current_col_results ) + def run_window_metrics(self, col: Column): + """ + Run windown metrics in isolation + + Args: + col: column name + """ + + col_metrics = [ + metric + for metric in self.get_col_metrics(self.static_metrics) + if metric.is_window_metric() + ] + + if not col_metrics: + return None + + for metric in col_metrics: + try: + row = self.runner.select_first_from_sample(metric(col).fn()) + self._column_results[col.name].update( + dict(row) + if isinstance(row, Row) + else { + metric.name(): row + } # Snowflake does not return a Row object when table is empty throwing an error + ) + except (Exception) as err: + logger.warning( + f"Error trying to compute column profile for {col.name} - {err}" + ) + self.session.rollback() + def execute_column(self, col: Column) -> None: """ Run the profiler on all the columns that @@ -351,6 +389,7 @@ class Profiler(Generic[TMetric]): columns are of allowed types """ self.run_static_metrics(col) + self.run_window_metrics(col) self.run_query_metrics(col) self.run_composed_metrics(col) diff --git a/ingestion/src/metadata/orm_profiler/profiler/default.py b/ingestion/src/metadata/orm_profiler/profiler/default.py index 8a991506a51..c1f33de38cd 100644 --- a/ingestion/src/metadata/orm_profiler/profiler/default.py +++ b/ingestion/src/metadata/orm_profiler/profiler/default.py @@ -31,7 +31,7 @@ def get_default_metrics(table: DeclarativeMeta) -> List[Metric]: add_props(table=table)(Metrics.COLUMN_COUNT.value), add_props(table=table)(Metrics.COLUMN_NAMES.value), # Column Metrics - # Metrics.MEDIAN.value, # TODO: enable it back after #5866 + Metrics.MEDIAN.value, Metrics.MEAN.value, Metrics.COUNT.value, Metrics.DISTINCT_COUNT.value, diff --git a/ingestion/tests/unit/profiler/test_profiler.py b/ingestion/tests/unit/profiler/test_profiler.py index 0c8f17a7595..04f228c980b 100644 --- a/ingestion/tests/unit/profiler/test_profiler.py +++ b/ingestion/tests/unit/profiler/test_profiler.py @@ -104,6 +104,7 @@ class ProfilerTest(TestCase): variance=None, distinctCount=2.0, distinctProportion=1.0, + median=30.5, # histogram=Histogram( # boundaries=["30.0 to 30.25", "31.0 to 31.25"], frequencies=[1, 1] # ),