mirror of
https://github.com/open-metadata/OpenMetadata.git
synced 2025-12-14 17:17:15 +00:00
parent
18883ba939
commit
b3087d08b9
@ -53,7 +53,7 @@ from metadata.ingestion.source.sql_source_common import (
|
|||||||
SQLSourceStatus,
|
SQLSourceStatus,
|
||||||
)
|
)
|
||||||
from metadata.orm_profiler.orm.converter import ometa_to_orm
|
from metadata.orm_profiler.orm.converter import ometa_to_orm
|
||||||
from metadata.orm_profiler.profiles.default import DefaultProfiler
|
from metadata.orm_profiler.profiler.default import DefaultProfiler
|
||||||
from metadata.utils.column_type_parser import ColumnTypeParser
|
from metadata.utils.column_type_parser import ColumnTypeParser
|
||||||
from metadata.utils.engines import create_and_bind_session, get_engine
|
from metadata.utils.engines import create_and_bind_session, get_engine
|
||||||
from metadata.utils.helpers import get_database_service_or_create, ingest_lineage
|
from metadata.utils.helpers import get_database_service_or_create, ingest_lineage
|
||||||
|
|||||||
@ -19,7 +19,7 @@ from typing import Optional
|
|||||||
|
|
||||||
from metadata.config.common import ConfigModel
|
from metadata.config.common import ConfigModel
|
||||||
from metadata.generated.schema.entity.data.table import Table, TableProfile
|
from metadata.generated.schema.entity.data.table import Table, TableProfile
|
||||||
from metadata.orm_profiler.profiles.models import ProfilerDef
|
from metadata.orm_profiler.profiler.models import ProfilerDef
|
||||||
from metadata.orm_profiler.validations.models import TestDef, TestSuite
|
from metadata.orm_profiler.validations.models import TestDef, TestSuite
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -36,8 +36,8 @@ from metadata.ingestion.ometa.openmetadata_rest import MetadataServerConfig
|
|||||||
from metadata.orm_profiler.api.models import ProfilerProcessorConfig, ProfilerResponse
|
from metadata.orm_profiler.api.models import ProfilerProcessorConfig, ProfilerResponse
|
||||||
from metadata.orm_profiler.metrics.registry import Metrics
|
from metadata.orm_profiler.metrics.registry import Metrics
|
||||||
from metadata.orm_profiler.orm.converter import ometa_to_orm
|
from metadata.orm_profiler.orm.converter import ometa_to_orm
|
||||||
from metadata.orm_profiler.profiles.core import Profiler
|
from metadata.orm_profiler.profiler.core import Profiler
|
||||||
from metadata.orm_profiler.profiles.default import DefaultProfiler
|
from metadata.orm_profiler.profiler.default import DefaultProfiler, get_default_metrics
|
||||||
from metadata.orm_profiler.validations.core import validate
|
from metadata.orm_profiler.validations.core import validate
|
||||||
from metadata.orm_profiler.validations.models import TestDef
|
from metadata.orm_profiler.validations.models import TestDef
|
||||||
|
|
||||||
@ -147,7 +147,12 @@ class OrmProfilerProcessor(Processor[Table]):
|
|||||||
)
|
)
|
||||||
|
|
||||||
# Here we will need to add the logic to pass kwargs to the metrics
|
# Here we will need to add the logic to pass kwargs to the metrics
|
||||||
metrics = [Metrics.get(name) for name in self.config.profiler.metrics]
|
# TODO: add_props when needed for incoming metrics
|
||||||
|
metrics = (
|
||||||
|
[Metrics.get(name) for name in self.config.profiler.metrics]
|
||||||
|
if self.config.profiler.metrics
|
||||||
|
else get_default_metrics(orm)
|
||||||
|
)
|
||||||
|
|
||||||
return Profiler(
|
return Profiler(
|
||||||
*metrics,
|
*metrics,
|
||||||
@ -155,6 +160,7 @@ class OrmProfilerProcessor(Processor[Table]):
|
|||||||
table=orm,
|
table=orm,
|
||||||
profile_date=self.execution_date,
|
profile_date=self.execution_date,
|
||||||
profile_sample=profile_sample,
|
profile_sample=profile_sample,
|
||||||
|
timeout_seconds=self.config.profiler.timeout_seconds,
|
||||||
)
|
)
|
||||||
|
|
||||||
def profile_entity(self, orm: DeclarativeMeta, table: Table) -> TableProfile:
|
def profile_entity(self, orm: DeclarativeMeta, table: Table) -> TableProfile:
|
||||||
|
|||||||
@ -12,6 +12,7 @@
|
|||||||
"""
|
"""
|
||||||
Main Profile definition and queries to execute
|
Main Profile definition and queries to execute
|
||||||
"""
|
"""
|
||||||
|
import traceback
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from typing import Any, Dict, Generic, List, Optional, Tuple, Type, Union
|
from typing import Any, Dict, Generic, List, Optional, Tuple, Type, Union
|
||||||
|
|
||||||
@ -30,9 +31,12 @@ from metadata.orm_profiler.metrics.core import (
|
|||||||
StaticMetric,
|
StaticMetric,
|
||||||
)
|
)
|
||||||
from metadata.orm_profiler.metrics.static.row_count import RowCount
|
from metadata.orm_profiler.metrics.static.row_count import RowCount
|
||||||
from metadata.orm_profiler.orm.functions.random_num import RandomNumFn
|
|
||||||
from metadata.orm_profiler.orm.registry import NOT_COMPUTE
|
from metadata.orm_profiler.orm.registry import NOT_COMPUTE
|
||||||
|
from metadata.orm_profiler.profiler.runner import QueryRunner
|
||||||
|
from metadata.orm_profiler.profiler.sampler import Sampler
|
||||||
from metadata.orm_profiler.utils import logger
|
from metadata.orm_profiler.utils import logger
|
||||||
|
from metadata.utils.constants import TEN_MIN
|
||||||
|
from metadata.utils.timeout import cls_timeout
|
||||||
|
|
||||||
logger = logger()
|
logger = logger()
|
||||||
|
|
||||||
@ -63,6 +67,7 @@ class Profiler(Generic[MetricType]):
|
|||||||
ignore_cols: Optional[List[str]] = None,
|
ignore_cols: Optional[List[str]] = None,
|
||||||
use_cols: Optional[List[Column]] = None,
|
use_cols: Optional[List[Column]] = None,
|
||||||
profile_sample: Optional[float] = None,
|
profile_sample: Optional[float] = None,
|
||||||
|
timeout_seconds: Optional[int] = TEN_MIN,
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
:param metrics: Metrics to run. We are receiving the uninitialized classes
|
:param metrics: Metrics to run. We are receiving the uninitialized classes
|
||||||
@ -81,9 +86,10 @@ class Profiler(Generic[MetricType]):
|
|||||||
self._ignore_cols = ignore_cols
|
self._ignore_cols = ignore_cols
|
||||||
self._use_cols = use_cols
|
self._use_cols = use_cols
|
||||||
self._profile_sample = profile_sample
|
self._profile_sample = profile_sample
|
||||||
|
|
||||||
self._profile_date = profile_date
|
self._profile_date = profile_date
|
||||||
|
|
||||||
|
self.validate_composed_metric()
|
||||||
|
|
||||||
# Initialize profiler results
|
# Initialize profiler results
|
||||||
self._table_results: Dict[str, Any] = {}
|
self._table_results: Dict[str, Any] = {}
|
||||||
self._column_results: Dict[str, Any] = {}
|
self._column_results: Dict[str, Any] = {}
|
||||||
@ -92,9 +98,15 @@ class Profiler(Generic[MetricType]):
|
|||||||
self._columns: Optional[List[Column]] = None
|
self._columns: Optional[List[Column]] = None
|
||||||
|
|
||||||
# We will compute the sample from the property
|
# We will compute the sample from the property
|
||||||
|
self._sampler = Sampler(
|
||||||
|
session=session, table=table, profile_sample=profile_sample
|
||||||
|
)
|
||||||
self._sample: Optional[Union[DeclarativeMeta, AliasedClass]] = None
|
self._sample: Optional[Union[DeclarativeMeta, AliasedClass]] = None
|
||||||
|
|
||||||
self.validate_composed_metric()
|
# Prepare a timeout controlled query runner
|
||||||
|
self.runner: QueryRunner = cls_timeout(timeout_seconds)(
|
||||||
|
QueryRunner(session=session, table=table, sample=self.sample)
|
||||||
|
)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def session(self) -> Session:
|
def session(self) -> Session:
|
||||||
@ -123,10 +135,6 @@ class Profiler(Generic[MetricType]):
|
|||||||
"""
|
"""
|
||||||
return self._use_cols
|
return self._use_cols
|
||||||
|
|
||||||
@property
|
|
||||||
def profile_sample(self) -> Optional[float]:
|
|
||||||
return self._profile_sample
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def profile_date(self) -> datetime:
|
def profile_date(self) -> datetime:
|
||||||
return self._profile_date
|
return self._profile_date
|
||||||
@ -181,31 +189,8 @@ class Profiler(Generic[MetricType]):
|
|||||||
|
|
||||||
@property
|
@property
|
||||||
def sample(self):
|
def sample(self):
|
||||||
"""
|
|
||||||
Either return a sampled CTE of table, or
|
|
||||||
the full table if no sampling is required.
|
|
||||||
"""
|
|
||||||
if not self._sample:
|
if not self._sample:
|
||||||
|
self._sample = self._sampler.random_sample()
|
||||||
if not self.profile_sample:
|
|
||||||
# Use the full table
|
|
||||||
self._sample = self.table
|
|
||||||
|
|
||||||
else:
|
|
||||||
# Add new RandomNumFn column
|
|
||||||
rnd = self.session.query(
|
|
||||||
self.table, (RandomNumFn() % 100).label("random")
|
|
||||||
).cte(f"{self.table.__tablename__}_rnd")
|
|
||||||
|
|
||||||
# Prepare sampled CTE
|
|
||||||
sampled = (
|
|
||||||
self.session.query(rnd)
|
|
||||||
.where(rnd.c.random <= self.profile_sample)
|
|
||||||
.cte(f"{self.table.__tablename__}_sample")
|
|
||||||
)
|
|
||||||
|
|
||||||
# Assign as an alias
|
|
||||||
self._sample = aliased(self.table, sampled)
|
|
||||||
|
|
||||||
return self._sample
|
return self._sample
|
||||||
|
|
||||||
@ -222,7 +207,7 @@ class Profiler(Generic[MetricType]):
|
|||||||
f"We need {metric.required_metrics()} for {metric.name}, but only got {names} in the profiler"
|
f"We need {metric.required_metrics()} for {metric.name}, but only got {names} in the profiler"
|
||||||
)
|
)
|
||||||
|
|
||||||
def sql_col_run(self, col: Column):
|
def run_static_metrics(self, col: Column):
|
||||||
"""
|
"""
|
||||||
Run the profiler and store its results
|
Run the profiler and store its results
|
||||||
|
|
||||||
@ -238,19 +223,17 @@ class Profiler(Generic[MetricType]):
|
|||||||
return
|
return
|
||||||
|
|
||||||
try:
|
try:
|
||||||
query = self.session.query(
|
row = self.runner.select_first_from_sample(
|
||||||
*[metric(col).fn() for metric in col_metrics]
|
*[metric(col).fn() for metric in col_metrics]
|
||||||
).select_from(self.sample)
|
)
|
||||||
|
|
||||||
row = query.first()
|
|
||||||
self._column_results[col.name].update(dict(row))
|
self._column_results[col.name].update(dict(row))
|
||||||
except Exception as err:
|
except (TimeoutError, Exception) as err:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"Error trying to compute column profile for {col.name} - {err}"
|
f"Error trying to compute column profile for {col.name} - {err}"
|
||||||
)
|
)
|
||||||
self.session.rollback()
|
self.session.rollback()
|
||||||
|
|
||||||
def sql_table_run(self):
|
def run_table_metrics(self):
|
||||||
"""
|
"""
|
||||||
Run Table Static metrics
|
Run Table Static metrics
|
||||||
|
|
||||||
@ -266,15 +249,13 @@ class Profiler(Generic[MetricType]):
|
|||||||
if not table_metrics:
|
if not table_metrics:
|
||||||
return
|
return
|
||||||
|
|
||||||
query = self.session.query(
|
row = self.runner.select_first_from_table(
|
||||||
*[metric().fn() for metric in table_metrics]
|
*[metric().fn() for metric in table_metrics]
|
||||||
).select_from(self.table)
|
)
|
||||||
|
|
||||||
row = query.first()
|
|
||||||
if row:
|
if row:
|
||||||
self._table_results.update(dict(row))
|
self._table_results.update(dict(row))
|
||||||
|
|
||||||
def sql_col_query_run(self, col: Column) -> None:
|
def run_query_metrics(self, col: Column) -> None:
|
||||||
"""
|
"""
|
||||||
Run QueryMetrics
|
Run QueryMetrics
|
||||||
"""
|
"""
|
||||||
@ -292,7 +273,7 @@ class Profiler(Generic[MetricType]):
|
|||||||
if not metric_query:
|
if not metric_query:
|
||||||
continue
|
continue
|
||||||
if col_metric.metric_type == dict:
|
if col_metric.metric_type == dict:
|
||||||
query_res = metric_query.all()
|
query_res = self.runner.select_all_from_query(metric_query)
|
||||||
# query_res has the shape of List[Row], where each row is a dict,
|
# query_res has the shape of List[Row], where each row is a dict,
|
||||||
# e.g., [{colA: 1, colB: 2},...]
|
# e.g., [{colA: 1, colB: 2},...]
|
||||||
# We are going to transform this into a Dict[List] by pivoting, so that
|
# We are going to transform this into a Dict[List] by pivoting, so that
|
||||||
@ -303,16 +284,18 @@ class Profiler(Generic[MetricType]):
|
|||||||
self._column_results[col.name].update({metric.name(): data})
|
self._column_results[col.name].update({metric.name(): data})
|
||||||
|
|
||||||
else:
|
else:
|
||||||
row = metric_query.first()
|
row = self.runner.select_first_from_query(metric_query)
|
||||||
self._column_results[col.name].update(dict(row))
|
self._column_results[col.name].update(dict(row))
|
||||||
|
|
||||||
except Exception as err: # pylint: disable=broad-except
|
except (TimeoutError, Exception) as err: # pylint: disable=broad-except
|
||||||
|
print(err)
|
||||||
|
print(traceback.format_exc())
|
||||||
logger.error(
|
logger.error(
|
||||||
f"Exception encountered computing {metric.name()} for {self.table.__tablename__}.{col.name} - {err}"
|
f"Error computing query metric {metric.name()} for {self.table.__tablename__}.{col.name} - {err}"
|
||||||
)
|
)
|
||||||
self.session.rollback()
|
self.session.rollback()
|
||||||
|
|
||||||
def post_col_run(self, col: Column):
|
def run_composed_metrics(self, col: Column):
|
||||||
"""
|
"""
|
||||||
Run this after the metrics have been computed
|
Run this after the metrics have been computed
|
||||||
|
|
||||||
@ -336,15 +319,6 @@ class Profiler(Generic[MetricType]):
|
|||||||
current_col_results
|
current_col_results
|
||||||
)
|
)
|
||||||
|
|
||||||
def execute_table(self) -> None:
|
|
||||||
"""
|
|
||||||
Run table metrics
|
|
||||||
|
|
||||||
So far we only support Static Metrics
|
|
||||||
for Table Metrics
|
|
||||||
"""
|
|
||||||
self.sql_table_run()
|
|
||||||
|
|
||||||
def execute_column(self, col: Column) -> None:
|
def execute_column(self, col: Column) -> None:
|
||||||
"""
|
"""
|
||||||
Run the profiler on all the columns that
|
Run the profiler on all the columns that
|
||||||
@ -353,9 +327,9 @@ class Profiler(Generic[MetricType]):
|
|||||||
We can assume from this point onwards that
|
We can assume from this point onwards that
|
||||||
columns are of allowed types
|
columns are of allowed types
|
||||||
"""
|
"""
|
||||||
self.sql_col_run(col)
|
self.run_static_metrics(col)
|
||||||
self.sql_col_query_run(col)
|
self.run_query_metrics(col)
|
||||||
self.post_col_run(col)
|
self.run_composed_metrics(col)
|
||||||
|
|
||||||
def execute(self) -> "Profiler":
|
def execute(self) -> "Profiler":
|
||||||
"""
|
"""
|
||||||
@ -364,7 +338,7 @@ class Profiler(Generic[MetricType]):
|
|||||||
|
|
||||||
logger.debug(f"Running profiler for {self.table.__tablename__}")
|
logger.debug(f"Running profiler for {self.table.__tablename__}")
|
||||||
|
|
||||||
self.execute_table()
|
self.run_table_metrics()
|
||||||
|
|
||||||
for col in self.columns:
|
for col in self.columns:
|
||||||
logger.debug(
|
logger.debug(
|
||||||
@ -18,27 +18,14 @@ from typing import List, Optional
|
|||||||
from sqlalchemy.orm import DeclarativeMeta
|
from sqlalchemy.orm import DeclarativeMeta
|
||||||
from sqlalchemy.orm.session import Session
|
from sqlalchemy.orm.session import Session
|
||||||
|
|
||||||
from metadata.orm_profiler.metrics.core import add_props
|
from metadata.orm_profiler.metrics.core import Metric, add_props
|
||||||
from metadata.orm_profiler.metrics.registry import Metrics
|
from metadata.orm_profiler.metrics.registry import Metrics
|
||||||
from metadata.orm_profiler.profiles.core import Profiler
|
from metadata.orm_profiler.profiler.core import Profiler
|
||||||
|
from metadata.utils.constants import TEN_MIN
|
||||||
|
|
||||||
|
|
||||||
class DefaultProfiler(Profiler):
|
def get_default_metrics(table: DeclarativeMeta) -> List[Metric]:
|
||||||
"""
|
return [
|
||||||
Pre-built profiler with a simple
|
|
||||||
set of metrics that we can use as
|
|
||||||
a default.
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(
|
|
||||||
self,
|
|
||||||
session: Session,
|
|
||||||
table: DeclarativeMeta,
|
|
||||||
ignore_cols: Optional[List[str]] = None,
|
|
||||||
profile_date: datetime = datetime.now(),
|
|
||||||
profile_sample: Optional[float] = None,
|
|
||||||
):
|
|
||||||
_metrics = [
|
|
||||||
# Table Metrics
|
# Table Metrics
|
||||||
Metrics.ROW_COUNT.value,
|
Metrics.ROW_COUNT.value,
|
||||||
add_props(table=table)(Metrics.COLUMN_COUNT.value),
|
add_props(table=table)(Metrics.COLUMN_COUNT.value),
|
||||||
@ -59,6 +46,27 @@ class DefaultProfiler(Profiler):
|
|||||||
Metrics.UNIQUE_RATIO.value,
|
Metrics.UNIQUE_RATIO.value,
|
||||||
Metrics.HISTOGRAM.value,
|
Metrics.HISTOGRAM.value,
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
class DefaultProfiler(Profiler):
|
||||||
|
"""
|
||||||
|
Pre-built profiler with a simple
|
||||||
|
set of metrics that we can use as
|
||||||
|
a default.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
session: Session,
|
||||||
|
table: DeclarativeMeta,
|
||||||
|
ignore_cols: Optional[List[str]] = None,
|
||||||
|
profile_date: datetime = datetime.now(),
|
||||||
|
profile_sample: Optional[float] = None,
|
||||||
|
timeout_seconds: Optional[int] = TEN_MIN,
|
||||||
|
):
|
||||||
|
|
||||||
|
_metrics = get_default_metrics(table)
|
||||||
|
|
||||||
super().__init__(
|
super().__init__(
|
||||||
*_metrics,
|
*_metrics,
|
||||||
session=session,
|
session=session,
|
||||||
@ -66,4 +74,5 @@ class DefaultProfiler(Profiler):
|
|||||||
ignore_cols=ignore_cols,
|
ignore_cols=ignore_cols,
|
||||||
profile_date=profile_date,
|
profile_date=profile_date,
|
||||||
profile_sample=profile_sample,
|
profile_sample=profile_sample,
|
||||||
|
timeout_seconds=timeout_seconds,
|
||||||
)
|
)
|
||||||
@ -10,10 +10,10 @@
|
|||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Models to map profiles definitions
|
Models to map profiler definitions
|
||||||
JSON workflows to the profiler
|
JSON workflows to the profiler
|
||||||
"""
|
"""
|
||||||
from typing import List
|
from typing import List, Optional
|
||||||
|
|
||||||
from pydantic import BaseModel, validator
|
from pydantic import BaseModel, validator
|
||||||
|
|
||||||
@ -27,7 +27,12 @@ class ProfilerDef(BaseModel):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
name: str # Profiler name
|
name: str # Profiler name
|
||||||
metrics: List[str] # names of currently supported Static and Composed metrics
|
timeout_seconds: Optional[
|
||||||
|
int
|
||||||
|
] = None # Stop running a query after X seconds and continue
|
||||||
|
metrics: Optional[
|
||||||
|
List[str]
|
||||||
|
] = None # names of currently supported Static and Composed metrics
|
||||||
# TBD:
|
# TBD:
|
||||||
# time_metrics: List[TimeMetricDef] = None
|
# time_metrics: List[TimeMetricDef] = None
|
||||||
# custom_metrics: List[CustomMetricDef] = None
|
# custom_metrics: List[CustomMetricDef] = None
|
||||||
71
ingestion/src/metadata/orm_profiler/profiler/runner.py
Normal file
71
ingestion/src/metadata/orm_profiler/profiler/runner.py
Normal file
@ -0,0 +1,71 @@
|
|||||||
|
# Copyright 2021 Collate
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
"""
|
||||||
|
Module in charge of running the queries against
|
||||||
|
the session.
|
||||||
|
|
||||||
|
This is useful to centralise the running logic
|
||||||
|
and manage behavior such as timeouts.
|
||||||
|
"""
|
||||||
|
from typing import Union
|
||||||
|
|
||||||
|
from sqlalchemy.orm import DeclarativeMeta, Query, Session
|
||||||
|
from sqlalchemy.orm.util import AliasedClass
|
||||||
|
|
||||||
|
|
||||||
|
class QueryRunner:
|
||||||
|
"""
|
||||||
|
Handles the query runs and returns the results
|
||||||
|
to the caller.
|
||||||
|
|
||||||
|
The goal of this class is abstract a bit
|
||||||
|
how to get the query results. Moreover,
|
||||||
|
we can then wrap it up with a timeout
|
||||||
|
to make sure that methods executed from this class
|
||||||
|
won't take more than X seconds to execute.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
session: Session,
|
||||||
|
table: DeclarativeMeta,
|
||||||
|
sample: Union[DeclarativeMeta, AliasedClass],
|
||||||
|
):
|
||||||
|
self._session = session
|
||||||
|
self._table = table
|
||||||
|
self._sample = sample
|
||||||
|
|
||||||
|
def _build_query(self, *entities, **kwargs) -> Query:
|
||||||
|
return self._session.query(*entities, **kwargs)
|
||||||
|
|
||||||
|
def _select_from_sample(self, *entities, **kwargs):
|
||||||
|
return self._build_query(*entities, **kwargs).select_from(self._sample)
|
||||||
|
|
||||||
|
def select_first_from_table(self, *entities, **kwargs):
|
||||||
|
return self._build_query(*entities, **kwargs).select_from(self._table).first()
|
||||||
|
|
||||||
|
def select_all_from_table(self, *entities, **kwargs):
|
||||||
|
return self._build_query(*entities, **kwargs).select_from(self._table).all()
|
||||||
|
|
||||||
|
def select_first_from_sample(self, *entities, **kwargs):
|
||||||
|
return self._select_from_sample(*entities, **kwargs).first()
|
||||||
|
|
||||||
|
def select_all_from_sample(self, *entities, **kwargs):
|
||||||
|
return self._select_from_sample(*entities, **kwargs).all()
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def select_first_from_query(query: Query):
|
||||||
|
return query.first()
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def select_all_from_query(query: Query):
|
||||||
|
return query.all()
|
||||||
62
ingestion/src/metadata/orm_profiler/profiler/sampler.py
Normal file
62
ingestion/src/metadata/orm_profiler/profiler/sampler.py
Normal file
@ -0,0 +1,62 @@
|
|||||||
|
# Copyright 2021 Collate
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
"""
|
||||||
|
Helper module to handle data sampling
|
||||||
|
for the profiler
|
||||||
|
"""
|
||||||
|
from typing import Optional, Union
|
||||||
|
|
||||||
|
from sqlalchemy.orm import DeclarativeMeta, Session, aliased
|
||||||
|
from sqlalchemy.orm.util import AliasedClass
|
||||||
|
|
||||||
|
from metadata.orm_profiler.orm.functions.random_num import RandomNumFn
|
||||||
|
|
||||||
|
|
||||||
|
class Sampler:
|
||||||
|
"""
|
||||||
|
Generates a sample of the data to not
|
||||||
|
run the query in the whole table.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
session: Session,
|
||||||
|
table: DeclarativeMeta,
|
||||||
|
profile_sample: Optional[float] = None,
|
||||||
|
):
|
||||||
|
self.profile_sample = profile_sample
|
||||||
|
self.session = session
|
||||||
|
self.table = table
|
||||||
|
|
||||||
|
def random_sample(self) -> Union[DeclarativeMeta, AliasedClass]:
|
||||||
|
"""
|
||||||
|
Either return a sampled CTE of table, or
|
||||||
|
the full table if no sampling is required.
|
||||||
|
"""
|
||||||
|
|
||||||
|
if not self.profile_sample:
|
||||||
|
# Use the full table
|
||||||
|
return self.table
|
||||||
|
|
||||||
|
# Add new RandomNumFn column
|
||||||
|
rnd = self.session.query(self.table, (RandomNumFn() % 100).label("random")).cte(
|
||||||
|
f"{self.table.__tablename__}_rnd"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Prepare sampled CTE
|
||||||
|
sampled = (
|
||||||
|
self.session.query(rnd)
|
||||||
|
.where(rnd.c.random <= self.profile_sample)
|
||||||
|
.cte(f"{self.table.__tablename__}_sample")
|
||||||
|
)
|
||||||
|
|
||||||
|
# Assign as an alias
|
||||||
|
return aliased(self.table, sampled)
|
||||||
@ -19,7 +19,7 @@ from sqlalchemy import inspect
|
|||||||
from sqlalchemy.orm import DeclarativeMeta, Session
|
from sqlalchemy.orm import DeclarativeMeta, Session
|
||||||
|
|
||||||
from metadata.orm_profiler.metrics.core import Metric
|
from metadata.orm_profiler.metrics.core import Metric
|
||||||
from metadata.orm_profiler.profiles.core import Profiler
|
from metadata.orm_profiler.profiler.core import Profiler
|
||||||
|
|
||||||
|
|
||||||
def run_col_metric(
|
def run_col_metric(
|
||||||
|
|||||||
@ -14,3 +14,4 @@ Define constants useful for the metadata ingestion
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
DOT = "_DOT_"
|
DOT = "_DOT_"
|
||||||
|
TEN_MIN = 10 * 60
|
||||||
|
|||||||
73
ingestion/src/metadata/utils/timeout.py
Normal file
73
ingestion/src/metadata/utils/timeout.py
Normal file
@ -0,0 +1,73 @@
|
|||||||
|
# Copyright 2021 Collate
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
"""
|
||||||
|
Timeout utilities
|
||||||
|
"""
|
||||||
|
import errno
|
||||||
|
import functools
|
||||||
|
import inspect
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import signal
|
||||||
|
import traceback
|
||||||
|
|
||||||
|
from metadata.utils.constants import TEN_MIN
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def _handle_timeout(signum, frame):
|
||||||
|
"""
|
||||||
|
Handler for signal timeout
|
||||||
|
"""
|
||||||
|
logger.debug(traceback.print_stack(frame))
|
||||||
|
raise TimeoutError(f"[SIGNUM {signum}] {os.strerror(errno.ETIME)}")
|
||||||
|
|
||||||
|
|
||||||
|
def timeout(seconds: int = TEN_MIN):
|
||||||
|
"""
|
||||||
|
Decorator factory to handle timeouts in functions. Defaults
|
||||||
|
to 10 min
|
||||||
|
:param seconds: seconds to wait until raising the timeout
|
||||||
|
"""
|
||||||
|
|
||||||
|
def decorator(fn):
|
||||||
|
@functools.wraps(fn)
|
||||||
|
def inner(*args, **kwargs):
|
||||||
|
signal.signal(signal.SIGALRM, _handle_timeout)
|
||||||
|
signal.alarm(seconds)
|
||||||
|
try:
|
||||||
|
result = fn(*args, **kwargs)
|
||||||
|
finally:
|
||||||
|
signal.alarm(0)
|
||||||
|
return result
|
||||||
|
|
||||||
|
return inner
|
||||||
|
|
||||||
|
return decorator
|
||||||
|
|
||||||
|
|
||||||
|
def cls_timeout(seconds: int = TEN_MIN):
|
||||||
|
"""
|
||||||
|
Decorates with `timeout` all methods
|
||||||
|
of a class cls
|
||||||
|
:param seconds: timeout to use
|
||||||
|
:return: class with decorated methods
|
||||||
|
"""
|
||||||
|
|
||||||
|
def inner(cls):
|
||||||
|
for attr_name, attr in inspect.getmembers(cls, inspect.isfunction):
|
||||||
|
setattr(cls, attr_name, timeout(seconds)(getattr(cls, attr_name)))
|
||||||
|
|
||||||
|
return cls
|
||||||
|
|
||||||
|
return inner
|
||||||
@ -119,6 +119,7 @@ class ProfilerWorkflowTest(TestCase):
|
|||||||
"config": {
|
"config": {
|
||||||
"profiler": {
|
"profiler": {
|
||||||
"name": "my_profiler",
|
"name": "my_profiler",
|
||||||
|
"timeout_seconds": 60,
|
||||||
"metrics": ["row_count", "min", "max", "COUNT", "null_count"],
|
"metrics": ["row_count", "min", "max", "COUNT", "null_count"],
|
||||||
},
|
},
|
||||||
"test_suite": {
|
"test_suite": {
|
||||||
|
|||||||
@ -19,7 +19,7 @@ from sqlalchemy.orm import declarative_base
|
|||||||
|
|
||||||
from metadata.orm_profiler.metrics.core import add_props
|
from metadata.orm_profiler.metrics.core import add_props
|
||||||
from metadata.orm_profiler.metrics.registry import Metrics
|
from metadata.orm_profiler.metrics.registry import Metrics
|
||||||
from metadata.orm_profiler.profiles.core import Profiler
|
from metadata.orm_profiler.profiler.core import Profiler
|
||||||
from metadata.utils.engines import create_and_bind_session
|
from metadata.utils.engines import create_and_bind_session
|
||||||
|
|
||||||
Base = declarative_base()
|
Base = declarative_base()
|
||||||
|
|||||||
@ -21,8 +21,8 @@ from sqlalchemy.orm import declarative_base
|
|||||||
from metadata.generated.schema.entity.data.table import ColumnProfile, Histogram
|
from metadata.generated.schema.entity.data.table import ColumnProfile, Histogram
|
||||||
from metadata.orm_profiler.metrics.core import add_props
|
from metadata.orm_profiler.metrics.core import add_props
|
||||||
from metadata.orm_profiler.metrics.registry import Metrics
|
from metadata.orm_profiler.metrics.registry import Metrics
|
||||||
from metadata.orm_profiler.profiles.core import MissingMetricException, Profiler
|
from metadata.orm_profiler.profiler.core import MissingMetricException, Profiler
|
||||||
from metadata.orm_profiler.profiles.default import DefaultProfiler
|
from metadata.orm_profiler.profiler.default import DefaultProfiler
|
||||||
from metadata.utils.engines import create_and_bind_session
|
from metadata.utils.engines import create_and_bind_session
|
||||||
|
|
||||||
Base = declarative_base()
|
Base = declarative_base()
|
||||||
|
|||||||
154
ingestion/tests/unit/profiler/test_runner.py
Normal file
154
ingestion/tests/unit/profiler/test_runner.py
Normal file
@ -0,0 +1,154 @@
|
|||||||
|
# Copyright 2021 Collate
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
"""
|
||||||
|
Test Sample behavior
|
||||||
|
"""
|
||||||
|
import time
|
||||||
|
from unittest import TestCase
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from sqlalchemy import TEXT, Column, Integer, String, create_engine, func
|
||||||
|
from sqlalchemy.orm import DeclarativeMeta, declarative_base
|
||||||
|
|
||||||
|
from metadata.orm_profiler.profiler.runner import QueryRunner
|
||||||
|
from metadata.orm_profiler.profiler.sampler import Sampler
|
||||||
|
from metadata.utils.engines import create_and_bind_session
|
||||||
|
from metadata.utils.timeout import cls_timeout
|
||||||
|
|
||||||
|
Base = declarative_base()
|
||||||
|
|
||||||
|
|
||||||
|
class User(Base):
|
||||||
|
__tablename__ = "users"
|
||||||
|
id = Column(Integer, primary_key=True)
|
||||||
|
name = Column(String(256))
|
||||||
|
fullname = Column(String(256))
|
||||||
|
nickname = Column(String(256))
|
||||||
|
comments = Column(TEXT)
|
||||||
|
age = Column(Integer)
|
||||||
|
|
||||||
|
|
||||||
|
class Timer:
|
||||||
|
"""
|
||||||
|
Helper to test timeouts
|
||||||
|
"""
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def slow():
|
||||||
|
time.sleep(10)
|
||||||
|
return 1
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def fast():
|
||||||
|
return 1
|
||||||
|
|
||||||
|
|
||||||
|
class RunnerTest(TestCase):
|
||||||
|
"""
|
||||||
|
Run checks on different metrics
|
||||||
|
"""
|
||||||
|
|
||||||
|
engine = create_engine("sqlite+pysqlite:///:memory:", echo=False, future=True)
|
||||||
|
session = create_and_bind_session(engine)
|
||||||
|
|
||||||
|
sampler = Sampler(session=session, table=User, profile_sample=50.0)
|
||||||
|
sample = sampler.random_sample()
|
||||||
|
|
||||||
|
raw_runner = QueryRunner(session=session, table=User, sample=sample)
|
||||||
|
|
||||||
|
timeout_runner: Timer = cls_timeout(1)(Timer())
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def setUpClass(cls) -> None:
|
||||||
|
"""
|
||||||
|
Prepare Ingredients
|
||||||
|
"""
|
||||||
|
User.__table__.create(bind=cls.engine)
|
||||||
|
|
||||||
|
# Insert 30 rows
|
||||||
|
for i in range(10):
|
||||||
|
data = [
|
||||||
|
User(
|
||||||
|
name="John",
|
||||||
|
fullname="John Doe",
|
||||||
|
nickname="johnny b goode",
|
||||||
|
comments="no comments",
|
||||||
|
age=30,
|
||||||
|
),
|
||||||
|
User(
|
||||||
|
name="Jane",
|
||||||
|
fullname="Jone Doe",
|
||||||
|
nickname=None,
|
||||||
|
comments="maybe some comments",
|
||||||
|
age=31,
|
||||||
|
),
|
||||||
|
User(
|
||||||
|
name="John",
|
||||||
|
fullname="John Doe",
|
||||||
|
nickname=None,
|
||||||
|
comments=None,
|
||||||
|
age=None,
|
||||||
|
),
|
||||||
|
]
|
||||||
|
cls.session.add_all(data)
|
||||||
|
cls.session.commit()
|
||||||
|
|
||||||
|
def test_select_from_table(self):
|
||||||
|
"""
|
||||||
|
We can run queries against the table
|
||||||
|
"""
|
||||||
|
res = self.raw_runner.select_first_from_table(func.count())
|
||||||
|
assert res[0] == 30
|
||||||
|
|
||||||
|
res = self.raw_runner.select_all_from_table(Column(User.name.name))
|
||||||
|
assert len(res) == 30
|
||||||
|
|
||||||
|
def test_select_from_sample(self):
|
||||||
|
"""
|
||||||
|
We can run queries against the sample
|
||||||
|
"""
|
||||||
|
res = self.raw_runner.select_first_from_sample(func.count())
|
||||||
|
assert res[0] < 30
|
||||||
|
|
||||||
|
# Note how we need to pass the column by name, not from the table
|
||||||
|
# object, or it will run a cartesian product.
|
||||||
|
res = self.raw_runner.select_all_from_sample(Column(User.name.name))
|
||||||
|
assert len(res) < 30
|
||||||
|
|
||||||
|
def test_select_from_query(self):
|
||||||
|
"""
|
||||||
|
We can pick up results from a given query
|
||||||
|
"""
|
||||||
|
query = self.session.query(func.count()).select_from(User)
|
||||||
|
res = self.raw_runner.select_first_from_query(query)
|
||||||
|
assert res[0] == 30
|
||||||
|
|
||||||
|
query = self.session.query(func.count()).select_from(self.sample)
|
||||||
|
res = self.raw_runner.select_first_from_query(query)
|
||||||
|
assert res[0] < 30
|
||||||
|
|
||||||
|
query = self.session.query(Column(User.name.name)).select_from(User)
|
||||||
|
res = self.raw_runner.select_all_from_query(query)
|
||||||
|
assert len(res) == 30
|
||||||
|
|
||||||
|
query = self.session.query(func.count()).select_from(self.sample)
|
||||||
|
res = self.raw_runner.select_all_from_query(query)
|
||||||
|
assert len(res) < 30
|
||||||
|
|
||||||
|
def test_timeout_runner(self):
|
||||||
|
"""
|
||||||
|
Check that timeout alarms get executed
|
||||||
|
"""
|
||||||
|
assert self.timeout_runner.fast() == 1
|
||||||
|
|
||||||
|
with pytest.raises(TimeoutError):
|
||||||
|
self.timeout_runner.slow()
|
||||||
@ -18,7 +18,8 @@ from sqlalchemy import TEXT, Column, Integer, String, create_engine, func
|
|||||||
from sqlalchemy.orm import DeclarativeMeta, declarative_base
|
from sqlalchemy.orm import DeclarativeMeta, declarative_base
|
||||||
|
|
||||||
from metadata.orm_profiler.metrics.registry import Metrics
|
from metadata.orm_profiler.metrics.registry import Metrics
|
||||||
from metadata.orm_profiler.profiles.core import Profiler
|
from metadata.orm_profiler.profiler.core import Profiler
|
||||||
|
from metadata.orm_profiler.profiler.sampler import Sampler
|
||||||
from metadata.utils.engines import create_and_bind_session
|
from metadata.utils.engines import create_and_bind_session
|
||||||
|
|
||||||
Base = declarative_base()
|
Base = declarative_base()
|
||||||
@ -77,6 +78,16 @@ class SampleTest(TestCase):
|
|||||||
cls.session.add_all(data)
|
cls.session.add_all(data)
|
||||||
cls.session.commit()
|
cls.session.commit()
|
||||||
|
|
||||||
|
def test_random_sampler(self):
|
||||||
|
"""
|
||||||
|
The random sampler should be able to
|
||||||
|
generate a random subset of data
|
||||||
|
"""
|
||||||
|
sampler = Sampler(session=self.session, table=User, profile_sample=50.0)
|
||||||
|
random_sample = sampler.random_sample()
|
||||||
|
res = self.session.query(func.count()).select_from(random_sample).first()
|
||||||
|
assert res[0] < 30
|
||||||
|
|
||||||
def test_sample_property(self):
|
def test_sample_property(self):
|
||||||
"""
|
"""
|
||||||
Sample property should be properly generated
|
Sample property should be properly generated
|
||||||
|
|||||||
@ -34,8 +34,8 @@ from metadata.ingestion.ometa.openmetadata_rest import MetadataServerConfig
|
|||||||
from metadata.ingestion.source.sqlite import SQLiteConfig
|
from metadata.ingestion.source.sqlite import SQLiteConfig
|
||||||
from metadata.orm_profiler.api.workflow import ProfilerWorkflow
|
from metadata.orm_profiler.api.workflow import ProfilerWorkflow
|
||||||
from metadata.orm_profiler.processor.orm_profiler import OrmProfilerProcessor
|
from metadata.orm_profiler.processor.orm_profiler import OrmProfilerProcessor
|
||||||
from metadata.orm_profiler.profiles.default import DefaultProfiler
|
from metadata.orm_profiler.profiler.default import DefaultProfiler
|
||||||
from metadata.orm_profiler.profiles.models import ProfilerDef
|
from metadata.orm_profiler.profiler.models import ProfilerDef
|
||||||
from metadata.orm_profiler.validations.models import TestDef, TestSuite
|
from metadata.orm_profiler.validations.models import TestDef, TestSuite
|
||||||
|
|
||||||
config = {
|
config = {
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user