Fixes #5866 -- Implement Support for Median Metric (#5887)

* Added additional table + test coverage

* Added logic for front end input fields

* Added comment for median metric

* skipping `Update owner and check description` cypress test

* Added support to run window metrics for the profiler

* Fix except code smell

* moved median metric to windown folder

* Fix pyformat

Co-authored-by: Shailesh Parmar <shailesh.parmar.webdev@gmail.com>
This commit is contained in:
Teddy 2022-07-07 06:58:42 +02:00 committed by GitHub
parent 09b37d28f2
commit 48f6553fb3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 56 additions and 3 deletions

View File

@ -123,6 +123,15 @@ class Metric(ABC):
"""
return True
@classmethod
def is_window_metric(cls) -> bool:
"""
Marks the metric as a window metric.
By default, assume it is not a window metric
"""
return False
@property
def metric_type(self):
"""

View File

@ -35,7 +35,6 @@ from metadata.orm_profiler.metrics.static.like_count import LikeCount
from metadata.orm_profiler.metrics.static.max import Max
from metadata.orm_profiler.metrics.static.max_length import MaxLength
from metadata.orm_profiler.metrics.static.mean import Mean
from metadata.orm_profiler.metrics.static.median import Median
from metadata.orm_profiler.metrics.static.min import Min
from metadata.orm_profiler.metrics.static.min_length import MinLength
from metadata.orm_profiler.metrics.static.not_like_count import NotLikeCount
@ -44,6 +43,7 @@ from metadata.orm_profiler.metrics.static.row_count import RowCount
from metadata.orm_profiler.metrics.static.stddev import StdDev
from metadata.orm_profiler.metrics.static.sum import Sum
from metadata.orm_profiler.metrics.static.unique_count import UniqueCount
from metadata.orm_profiler.metrics.window.median import Median
from metadata.orm_profiler.registry import MetricRegistry

View File

@ -37,6 +37,10 @@ class Median(StaticMetric):
def name(cls):
return "median"
@classmethod
def is_window_metric(cls):
return True
@property
def metric_type(self):
return float

View File

@ -18,6 +18,7 @@ from typing import Any, Dict, Generic, List, Optional, Tuple, Type, Union
from pydantic import ValidationError
from sqlalchemy import Column, inspect
from sqlalchemy.engine.row import Row
from sqlalchemy.orm import DeclarativeMeta
from sqlalchemy.orm.session import Session
from sqlalchemy.orm.util import AliasedClass
@ -241,7 +242,11 @@ class Profiler(Generic[TMetric]):
try:
row = self.runner.select_first_from_sample(
*[metric(col).fn() for metric in col_metrics]
*[
metric(col).fn()
for metric in col_metrics
if not metric.is_window_metric()
]
)
self._column_results[col.name].update(dict(row))
except (TimeoutError, Exception) as err:
@ -342,6 +347,39 @@ class Profiler(Generic[TMetric]):
current_col_results
)
def run_window_metrics(self, col: Column):
"""
Run windown metrics in isolation
Args:
col: column name
"""
col_metrics = [
metric
for metric in self.get_col_metrics(self.static_metrics)
if metric.is_window_metric()
]
if not col_metrics:
return None
for metric in col_metrics:
try:
row = self.runner.select_first_from_sample(metric(col).fn())
self._column_results[col.name].update(
dict(row)
if isinstance(row, Row)
else {
metric.name(): row
} # Snowflake does not return a Row object when table is empty throwing an error
)
except (Exception) as err:
logger.warning(
f"Error trying to compute column profile for {col.name} - {err}"
)
self.session.rollback()
def execute_column(self, col: Column) -> None:
"""
Run the profiler on all the columns that
@ -351,6 +389,7 @@ class Profiler(Generic[TMetric]):
columns are of allowed types
"""
self.run_static_metrics(col)
self.run_window_metrics(col)
self.run_query_metrics(col)
self.run_composed_metrics(col)

View File

@ -31,7 +31,7 @@ def get_default_metrics(table: DeclarativeMeta) -> List[Metric]:
add_props(table=table)(Metrics.COLUMN_COUNT.value),
add_props(table=table)(Metrics.COLUMN_NAMES.value),
# Column Metrics
# Metrics.MEDIAN.value, # TODO: enable it back after #5866
Metrics.MEDIAN.value,
Metrics.MEAN.value,
Metrics.COUNT.value,
Metrics.DISTINCT_COUNT.value,

View File

@ -104,6 +104,7 @@ class ProfilerTest(TestCase):
variance=None,
distinctCount=2.0,
distinctProportion=1.0,
median=30.5,
# histogram=Histogram(
# boundaries=["30.0 to 30.25", "31.0 to 31.25"], frequencies=[1, 1]
# ),