2022-02-18 07:48:38 +01:00
|
|
|
# Copyright 2021 Collate
|
|
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
# you may not use this file except in compliance with the License.
|
|
|
|
# You may obtain a copy of the License at
|
|
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
# See the License for the specific language governing permissions and
|
|
|
|
# limitations under the License.
|
|
|
|
|
|
|
|
"""
|
|
|
|
Test Profiler behavior
|
|
|
|
"""
|
2022-07-29 10:41:53 +02:00
|
|
|
import os
|
2022-12-16 17:01:12 +01:00
|
|
|
from concurrent.futures import TimeoutError
|
2023-03-08 18:01:25 +01:00
|
|
|
from datetime import datetime
|
2022-02-18 07:48:38 +01:00
|
|
|
from unittest import TestCase
|
2022-10-11 15:57:25 +02:00
|
|
|
from unittest.mock import patch
|
2022-08-04 07:22:47 -07:00
|
|
|
from uuid import uuid4
|
2022-02-18 07:48:38 +01:00
|
|
|
|
2022-02-24 07:08:39 +01:00
|
|
|
import pytest
|
2022-05-30 06:53:16 +02:00
|
|
|
import sqlalchemy.types
|
2022-07-20 17:54:10 +02:00
|
|
|
from sqlalchemy import Column, Integer, String
|
2022-02-18 07:48:38 +01:00
|
|
|
from sqlalchemy.orm import declarative_base
|
|
|
|
|
2022-09-15 17:39:29 +02:00
|
|
|
from metadata.generated.schema.api.data.createTableProfile import (
|
|
|
|
CreateTableProfileRequest,
|
|
|
|
)
|
2022-08-04 07:22:47 -07:00
|
|
|
from metadata.generated.schema.entity.data.table import Column as EntityColumn
|
|
|
|
from metadata.generated.schema.entity.data.table import (
|
|
|
|
ColumnName,
|
|
|
|
ColumnProfile,
|
2023-04-19 16:09:13 +02:00
|
|
|
ColumnProfilerConfig,
|
2022-08-04 07:22:47 -07:00
|
|
|
DataType,
|
2023-03-13 19:43:51 +05:30
|
|
|
Histogram,
|
2022-08-04 07:22:47 -07:00
|
|
|
Table,
|
2022-09-15 17:39:29 +02:00
|
|
|
TableProfile,
|
2023-04-19 16:09:13 +02:00
|
|
|
TableProfilerConfig,
|
2022-08-04 07:22:47 -07:00
|
|
|
)
|
2022-07-20 17:54:10 +02:00
|
|
|
from metadata.generated.schema.entity.services.connections.database.sqliteConnection import (
|
|
|
|
SQLiteConnection,
|
|
|
|
SQLiteScheme,
|
|
|
|
)
|
2023-11-17 17:51:39 +01:00
|
|
|
from metadata.generated.schema.tests.customMetric import CustomMetric
|
2024-06-05 21:18:37 +02:00
|
|
|
from metadata.generated.schema.type.basic import Timestamp
|
2022-05-30 06:53:16 +02:00
|
|
|
from metadata.ingestion.source import sqa_types
|
2023-06-22 12:51:56 +05:30
|
|
|
from metadata.profiler.interface.sqlalchemy.profiler_interface import (
|
2023-03-01 08:20:38 +01:00
|
|
|
SQAProfilerInterface,
|
|
|
|
)
|
2023-04-19 16:09:13 +02:00
|
|
|
from metadata.profiler.metrics.core import MetricTypes, add_props
|
2023-04-04 17:16:44 +02:00
|
|
|
from metadata.profiler.metrics.registry import Metrics
|
|
|
|
from metadata.profiler.processor.core import MissingMetricException, Profiler
|
|
|
|
from metadata.profiler.processor.default import DefaultProfiler
|
2022-02-18 07:48:38 +01:00
|
|
|
|
|
|
|
# Declarative base class that the ORM models defined in this test module inherit from.
Base = declarative_base()
|
|
|
|
|
|
|
|
|
|
|
|
class User(Base):
    """ORM model for the ``users`` table that the profiler tests run against."""

    __tablename__ = "users"

    # Integer primary key.
    id = Column(Integer, primary_key=True)

    # Free-text attributes, each limited to 256 characters.
    name = Column(String(256))
    fullname = Column(String(256))
    nickname = Column(String(256))

    # Numeric attribute.
    age = Column(Integer)
|
|
|
|
|
|
|
|
|
|
|
|
class ProfilerTest(TestCase):
    """
    Run checks on different metrics

    The class body builds one SQLite-backed ``SQAProfilerInterface`` that is
    shared by all tests; ``setUpClass`` creates and populates the ``users``
    table and ``tearDownClass`` removes the database file.
    """

    # SQLite file placed next to this test module and named after it.
    # Only the base name is joined to the directory so the path stays correct
    # even when ``__file__`` is a relative path.
    db_path = os.path.join(
        os.path.dirname(__file__),
        f"{os.path.splitext(os.path.basename(__file__))[0]}.db",
    )
    sqlite_conn = SQLiteConnection(
        scheme=SQLiteScheme.sqlite_pysqlite,
        databaseMode=db_path + "?check_same_thread=False",
    )

    # Table entity handed to the profiler interface: one ``id`` column
    # carrying a column-level custom metric, plus two table-level custom
    # metrics.
    table_entity = Table(
        id=uuid4(),
        name="user",
        columns=[
            EntityColumn(
                name=ColumnName("id"),
                dataType=DataType.INT,
                customMetrics=[
                    CustomMetric(
                        name="custom_metric",
                        description="custom metric",
                        expression="SELECT cos(id) FROM users",
                    )
                ],
            )
        ],
        customMetrics=[
            CustomMetric(
                name="custom_metric",
                description="custom metric",
                expression="SELECT COUNT(id) / COUNT(age) FROM users",
            ),
            CustomMetric(
                name="custom_metric_two",
                description="custom metric",
                expression="SELECT COUNT(id) * COUNT(age) FROM users",
            ),
        ],
    )

    # ``_convert_table_to_orm_object`` is patched so the interface works on
    # the local ``User`` model.
    with patch.object(
        SQAProfilerInterface, "_convert_table_to_orm_object", return_value=User
    ):
        # NOTE(review): the trailing 5 and 43200 are presumably the thread
        # count and the timeout in seconds -- confirm against the
        # SQAProfilerInterface signature.
        sqa_profiler_interface = SQAProfilerInterface(
            sqlite_conn, None, table_entity, None, None, None, None, None, 5, 43200
        )

    @classmethod
    def setUpClass(cls) -> None:
        """
        Prepare Ingredients

        Creates the ``users`` table on the interface's bound engine and
        inserts two rows.
        """
        User.__table__.create(bind=cls.sqa_profiler_interface.session.get_bind())

        data = [
            User(name="John", fullname="John Doe", nickname="johnny b goode", age=30),
            User(name="Jane", fullname="Jone Doe", nickname=None, age=31),
        ]
        cls.sqa_profiler_interface.session.add_all(data)
        cls.sqa_profiler_interface.session.commit()

    def test_default_profiler(self):
        """
        Check our pre-cooked profiler
        """
        simple = DefaultProfiler(
            profiler_interface=self.sqa_profiler_interface,
        )
        simple.compute_metrics()

        profile = simple.get_profile()

        assert profile.tableProfile.rowCount == 2
        assert profile.tableProfile.columnCount == 5

        age_profile = next(
            (
                col_profile
                for col_profile in profile.columnProfile
                if col_profile.name == "age"
            ),
            None,
        )

        # Fail with a clear message instead of an AttributeError on
        # ``age_profile.timestamp`` below when the column profile is missing.
        assert age_profile is not None, "no column profile computed for 'age'"

        assert age_profile == ColumnProfile(
            name="age",
            valuesCount=2,
            valuesPercentage=None,
            validCount=None,
            duplicateCount=None,
            nullCount=0,
            nullProportion=0.0,
            uniqueCount=2,
            uniqueProportion=1.0,
            min=30.0,
            max=31.0,
            minLength=None,
            maxLength=None,
            mean=30.5,
            sum=61.0,
            stddev=0.25,
            variance=None,
            distinctCount=2.0,
            distinctProportion=1.0,
            median=30.0,
            # The timestamp is generated at run time, so it is copied from
            # the computed profile rather than pinned.
            timestamp=age_profile.timestamp,
            firstQuartile=30.0,
            thirdQuartile=31.0,
            interQuartileRange=1.0,
            nonParametricSkew=2.0,
            histogram=Histogram(boundaries=["30.000 and up"], frequencies=[2]),
        )

    def test_required_metrics(self):
        """
        Check that we raise properly MissingMetricException
        when not building the profiler with all the
        required ingredients
        """
        like = add_props(expression="J%")(Metrics.LIKE_COUNT.value)
        count = Metrics.COUNT.value
        like_ratio = Metrics.LIKE_RATIO.value

        # This should run properly
        Profiler(
            like,
            count,
            like_ratio,
            profiler_interface=self.sqa_profiler_interface,
        )

        with pytest.raises(MissingMetricException):
            # We are missing ingredients here
            Profiler(
                like,
                like_ratio,
                profiler_interface=self.sqa_profiler_interface,
            )

    def test_skipped_types(self):
        """
        Check that we are properly skipping computations for
        not supported types
        """

        # NOTE(review): this model is declared but never handed to the
        # profiler below, which is built on the shared ``User`` interface.
        class NotCompute(Base):
            __tablename__ = "not_compute"
            id = Column(Integer, primary_key=True)
            null_col = Column(sqlalchemy.types.NULLTYPE)
            array_col = Column(sqlalchemy.ARRAY(Integer, dimensions=2))
            json_col = Column(sqlalchemy.JSON)
            map_col = Column(sqa_types.SQAMap)
            struct_col = Column(sqa_types.SQAStruct)

        profiler = Profiler(
            Metrics.COUNT.value,
            profiler_interface=self.sqa_profiler_interface,
        )

        assert not profiler.column_results

    def test__check_profile_and_handle(self):
        """test _check_profile_and_handle returns as expected"""
        profiler = Profiler(
            Metrics.COUNT.value,
            profiler_interface=self.sqa_profiler_interface,
        )

        # A profile that carries a column count is expected to pass.
        profiler._check_profile_and_handle(
            CreateTableProfileRequest(
                tableProfile=TableProfile(
                    timestamp=Timestamp(int(datetime.now().timestamp())), columnCount=10
                )
            )
        )

        # A profile that only carries the sample size is expected to raise.
        with pytest.raises(Exception):
            profiler._check_profile_and_handle(
                CreateTableProfileRequest(
                    tableProfile=TableProfile(
                        timestamp=Timestamp(int(datetime.now().timestamp())),
                        profileSample=100,
                    )
                )
            )

    def test__prepare_column_metrics(self):
        """test _prepare_column_metrics returns as expected"""
        profiler = Profiler(
            Metrics.FIRST_QUARTILE.value,
            profiler_interface=self.sqa_profiler_interface,
        )

        metrics = profiler._prepare_column_metrics()
        for metric in metrics:
            if metric.metrics:
                # The first entry is either the custom metric or the
                # requested first-quartile metric.
                if isinstance(metric.metrics[0], CustomMetric):
                    assert metric.metrics[0].name.root == "custom_metric"
                else:
                    assert metric.metrics[0].name() == "firstQuartile"

    def test__prepare_table_metrics(self):
        """test _prepare_table_metrics returns as expected"""
        profiler = Profiler(
            Metrics.COLUMN_COUNT.value,
            profiler_interface=self.sqa_profiler_interface,
        )
        metrics = profiler._prepare_table_metrics()
        self.assertEqual(2, len(metrics))

    def test_profiler_with_timeout(self):
        """check timeout is properly used"""

        # Build a dedicated interface with a zero-second timeout so the
        # shared one is left untouched.
        with patch.object(
            SQAProfilerInterface, "_convert_table_to_orm_object", return_value=User
        ):
            sqa_profiler_interface = SQAProfilerInterface(
                self.sqlite_conn,
                None,
                self.table_entity,
                None,
                None,
                None,
                None,
                None,
                timeout_seconds=0,
            )

        simple = DefaultProfiler(
            profiler_interface=sqa_profiler_interface,
        )

        with pytest.raises(TimeoutError):
            simple.compute_metrics()

    def test_profiler_get_col_metrics(self):
        """check get column metrics"""
        metric_filter = ["mean", "min", "max", "firstQuartile"]
        custom_metric_filter = ["custom_metric"]

        # The table entity is shared by every test in this class, so the
        # previous profiler config is restored once this test finishes.
        shared_entity = self.sqa_profiler_interface.table_entity
        self.addCleanup(
            setattr,
            shared_entity,
            "tableProfilerConfig",
            shared_entity.tableProfilerConfig,
        )

        self.sqa_profiler_interface.table_entity.tableProfilerConfig = (
            TableProfilerConfig(
                includeColumns=[
                    ColumnProfilerConfig(columnName="id", metrics=metric_filter)
                ]
            )
        )  # type: ignore

        default_profiler = DefaultProfiler(
            profiler_interface=self.sqa_profiler_interface,
        )

        column_metrics = default_profiler._prepare_column_metrics()
        for metric in column_metrics:
            if (
                metric.metric_type is not MetricTypes.Table
                and metric.column.name == "id"
            ):
                # Every built-in metric kept for ``id`` must be in the filter.
                assert all(
                    metric_filter.count(m.name())
                    for m in metric.metrics
                    if not isinstance(m, CustomMetric)
                )
                # Every custom metric kept for ``id`` must be the expected one.
                assert all(
                    custom_metric_filter.count(m.name.root)
                    for m in metric.metrics
                    if isinstance(m, CustomMetric)
                )

    @classmethod
    def tearDownClass(cls) -> None:
        """Remove the SQLite database file used by the tests."""
        os.remove(cls.db_path)
|