2022-02-18 07:48:38 +01:00
|
|
|
# Copyright 2021 Collate
|
|
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
# you may not use this file except in compliance with the License.
|
|
|
|
# You may obtain a copy of the License at
|
|
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
# See the License for the specific language governing permissions and
|
|
|
|
# limitations under the License.
|
|
|
|
|
|
|
|
"""
|
|
|
|
Test Metrics behavior
|
|
|
|
"""
|
2022-07-13 15:23:03 -04:00
|
|
|
import datetime
|
2022-07-29 10:41:53 +02:00
|
|
|
import os
|
2022-02-18 07:48:38 +01:00
|
|
|
from unittest import TestCase
|
2022-10-11 15:57:25 +02:00
|
|
|
from unittest.mock import patch
|
2022-08-04 07:22:47 -07:00
|
|
|
from uuid import uuid4
|
2022-02-18 07:48:38 +01:00
|
|
|
|
2022-07-29 10:41:53 +02:00
|
|
|
from sqlalchemy import TEXT, Column, Date, DateTime, Integer, String, Time
|
2022-02-18 07:48:38 +01:00
|
|
|
from sqlalchemy.orm import declarative_base
|
|
|
|
|
2022-08-04 07:22:47 -07:00
|
|
|
from metadata.generated.schema.entity.data.table import Column as EntityColumn
|
|
|
|
from metadata.generated.schema.entity.data.table import ColumnName, DataType, Table
|
2022-07-20 17:54:10 +02:00
|
|
|
from metadata.generated.schema.entity.services.connections.database.sqliteConnection import (
|
|
|
|
SQLiteConnection,
|
|
|
|
SQLiteScheme,
|
|
|
|
)
|
2023-11-17 17:51:39 +01:00
|
|
|
from metadata.generated.schema.tests.customMetric import CustomMetric
|
2023-06-22 12:51:56 +05:30
|
|
|
from metadata.profiler.interface.sqlalchemy.profiler_interface import (
|
2023-04-04 17:16:44 +02:00
|
|
|
SQAProfilerInterface,
|
|
|
|
)
|
2023-03-01 08:20:38 +01:00
|
|
|
from metadata.profiler.metrics.core import add_props
|
|
|
|
from metadata.profiler.metrics.registry import Metrics
|
|
|
|
from metadata.profiler.orm.functions.sum import SumFn
|
2023-04-04 17:16:44 +02:00
|
|
|
from metadata.profiler.processor.core import Profiler
|
2022-02-18 07:48:38 +01:00
|
|
|
|
|
|
|
# Declarative base shared by every ORM model declared in this test module.
Base = declarative_base()


class User(Base):
    """
    ORM fixture mapped to the ``users`` table that the metrics are run against
    """

    __tablename__ = "users"

    # Surrogate key
    id = Column(Integer, primary_key=True)

    # Bounded string columns
    name = Column(String(length=256))
    fullname = Column(String(length=256))
    nickname = Column(String(length=256))

    # Unbounded text column
    comments = Column(TEXT)

    # Numeric column
    age = Column(Integer)

    # Temporal columns, one per SQLAlchemy temporal type
    dob = Column(DateTime)  # date of birth
    tob = Column(Time)  # time of birth
    doe = Column(Date)  # date of employment
|
2022-02-18 07:48:38 +01:00
|
|
|
|
|
|
|
|
|
|
|
class MetricsTest(TestCase):
|
|
|
|
"""
|
|
|
|
Run checks on different metrics
|
|
|
|
"""
|
|
|
|
|
2022-07-29 10:41:53 +02:00
|
|
|
db_path = os.path.join(
|
|
|
|
os.path.dirname(__file__), f"{os.path.splitext(__file__)[0]}.db"
|
|
|
|
)
|
|
|
|
sqlite_conn = SQLiteConnection(
|
|
|
|
scheme=SQLiteScheme.sqlite_pysqlite,
|
|
|
|
databaseMode=db_path + "?check_same_thread=False",
|
|
|
|
)
|
2022-08-17 12:53:16 +02:00
|
|
|
|
2022-08-04 07:22:47 -07:00
|
|
|
table_entity = Table(
|
|
|
|
id=uuid4(),
|
|
|
|
name="user",
|
|
|
|
columns=[
|
|
|
|
EntityColumn(
|
2024-06-05 21:18:37 +02:00
|
|
|
name=ColumnName("id"),
|
2022-08-04 07:22:47 -07:00
|
|
|
dataType=DataType.INT,
|
|
|
|
)
|
|
|
|
],
|
|
|
|
)
|
2022-02-18 07:48:38 +01:00
|
|
|
|
|
|
|
@classmethod
def setUpClass(cls) -> None:
    """
    Create the shared profiler interface, the ``users`` table and its rows

    ``SQAProfilerInterface._convert_table_to_orm_object`` is patched to return
    ``User`` while the interface is being constructed. The table is then
    created on the engine bound to the interface session and three rows are
    inserted and committed.
    """
    orm_patch = patch.object(
        SQAProfilerInterface, "_convert_table_to_orm_object", return_value=User
    )
    with orm_patch:
        cls.sqa_profiler_interface = SQAProfilerInterface(
            cls.sqlite_conn,
            None,
            cls.table_entity,
            None,
            None,
            None,
            None,
            None,
            thread_count=1,
        )
    cls.engine = cls.sqa_profiler_interface.session.get_bind()

    User.__table__.create(bind=cls.engine)

    # One dict per row; the third row leaves nickname, comments and age
    # as NULL.
    seed_rows = [
        {
            "name": "John",
            "fullname": "John Doe",
            "nickname": "johnny b goode",
            "comments": "no comments",
            "age": 30,
            "dob": datetime.datetime(1992, 5, 17),
            "tob": datetime.time(11, 2, 32),
            "doe": datetime.date(2020, 1, 12),
        },
        {
            "name": "Jane",
            "fullname": "Jone Doe",
            "nickname": None,
            "comments": "maybe some comments",
            "age": 31,
            "dob": datetime.datetime(1991, 4, 4),
            "tob": datetime.time(10, 1, 31),
            "doe": datetime.date(2009, 11, 11),
        },
        {
            "name": "John",
            "fullname": "John Doe",
            "nickname": None,
            "comments": None,
            "age": None,
            "dob": datetime.datetime(1982, 2, 2),
            "tob": datetime.time(9, 3, 25),
            "doe": datetime.date(2012, 12, 1),
        },
    ]
    users = [User(**row) for row in seed_rows]
    cls.sqa_profiler_interface.session.add_all(users)
    cls.sqa_profiler_interface.session.commit()
|
|
|
|
|
2022-03-16 06:05:59 +01:00
|
|
|
def test_count(self):
    """
    COUNT on the ``name`` column equals the three seeded rows
    """
    column_results = (
        Profiler(
            Metrics.COUNT.value,
            profiler_interface=self.sqa_profiler_interface,
        )
        .compute_metrics()
        ._column_results
    )

    # Results are keyed by column name first and by metric name second.
    name_metrics = column_results.get(User.name.name)
    assert name_metrics.get(Metrics.COUNT.name) == 3
|
|
|
|
|
2022-02-18 07:48:38 +01:00
|
|
|
def test_min(self):
    """
    MIN returns 30 for ``age`` and 11 for ``comments``
    """
    column_results = (
        Profiler(
            Metrics.MIN.value,
            profiler_interface=self.sqa_profiler_interface,
        )
        .compute_metrics()
        ._column_results
    )

    # Results are keyed by column name first and by metric name second.
    age_metrics = column_results.get(User.age.name)
    assert age_metrics.get(Metrics.MIN.name) == 30

    # 11 is the length of the shortest seeded comment ("no comments").
    comments_metrics = column_results.get(User.comments.name)
    assert comments_metrics.get(Metrics.MIN.name) == 11
|
2022-02-18 07:48:38 +01:00
|
|
|
|
|
|
|
def test_std(self):
    """
    STDDEV on ``age`` returns 0.25 with the SQLite implementation
    """
    column_results = (
        Profiler(
            Metrics.STDDEV.value,
            profiler_interface=self.sqa_profiler_interface,
        )
        .compute_metrics()
        ._column_results
    )

    # The custom SQLite STD implementation yields the squared STD, so the
    # expected value is only meaningful for testing purposes.
    age_metrics = column_results.get(User.age.name)
    assert age_metrics.get(Metrics.STDDEV.name) == 0.25
|
2022-02-18 07:48:38 +01:00
|
|
|
|
2022-07-13 15:23:03 -04:00
|
|
|
def test_earliest_time(self):
    """
    MIN on the DateTime, Time and Date columns returns the earliest value
    """
    column_results = (
        Profiler(
            Metrics.MIN.value,
            profiler_interface=self.sqa_profiler_interface,
        )
        .compute_metrics()
        ._column_results
    )

    # Expected earliest value per temporal column, as returned strings.
    expected_earliest = (
        (User.dob.name, "1982-02-02 00:00:00.000000"),
        (User.tob.name, "09:03:25.000000"),
        (User.doe.name, "2009-11-11"),
    )
    for column_name, earliest in expected_earliest:
        assert column_results.get(column_name).get(Metrics.MIN.name) == earliest
|
|
|
|
|
|
|
|
def test_latest_time(self):
    """
    MAX on the DateTime, Time and Date columns returns the latest value
    """
    column_results = (
        Profiler(
            Metrics.MAX.value,
            profiler_interface=self.sqa_profiler_interface,
        )
        .compute_metrics()
        ._column_results
    )

    # Expected latest value per temporal column, as returned strings.
    expected_latest = (
        (User.dob.name, "1992-05-17 00:00:00.000000"),
        (User.tob.name, "11:02:32.000000"),
        (User.doe.name, "2020-01-12"),
    )
    for column_name, latest in expected_latest:
        assert column_results.get(column_name).get(Metrics.MAX.name) == latest
|
|
|
|
|
2022-02-18 07:48:38 +01:00
|
|
|
def test_null_count(self):
    """
    NULL_COUNT on ``nickname`` equals the two rows seeded without a nickname
    """
    column_results = (
        Profiler(
            Metrics.NULL_COUNT.value,
            profiler_interface=self.sqa_profiler_interface,
        )
        .compute_metrics()
        ._column_results
    )

    nickname_metrics = column_results.get(User.nickname.name)
    assert nickname_metrics.get(Metrics.NULL_COUNT.name) == 2
|
2022-02-18 07:48:38 +01:00
|
|
|
|
|
|
|
def test_null_ratio(self):
    """
    NULL_RATIO runs as a composed metric alongside COUNT and NULL_COUNT
    """
    # The ratio is built from the other two metrics, so all three are
    # handed to the same profiler.
    requested_metrics = (
        Metrics.COUNT.value,
        Metrics.NULL_COUNT.value,
        Metrics.NULL_RATIO.value,
    )
    column_results = (
        Profiler(
            *requested_metrics,
            profiler_interface=self.sqa_profiler_interface,
        )
        .compute_metrics()
        ._column_results
    )

    nickname_ratio = column_results.get(User.nickname.name).get(
        Metrics.NULL_RATIO.name
    )
    assert str(round(nickname_ratio, 2)) == "0.67"
|
2022-02-18 07:48:38 +01:00
|
|
|
|
2022-03-04 06:59:47 +01:00
|
|
|
def test_table_row_count(self):
    """
    ROW_COUNT is a table-level metric and equals the three seeded rows
    """
    table_results = (
        Profiler(
            Metrics.ROW_COUNT.value,
            profiler_interface=self.sqa_profiler_interface,
        )
        .compute_metrics()
        ._table_results
    )

    assert table_results.get(Metrics.ROW_COUNT.name) == 3
|
2022-02-22 08:09:02 +01:00
|
|
|
|
2022-03-04 06:59:47 +01:00
|
|
|
def test_table_column_count(self):
    """
    COLUMN_COUNT reports the nine columns declared on ``User``
    """
    # The metric needs to know which table to inspect.
    column_count_metric = add_props(table=User)(Metrics.COLUMN_COUNT.value)
    table_results = (
        Profiler(
            column_count_metric,
            profiler_interface=self.sqa_profiler_interface,
        )
        .compute_metrics()
        ._table_results
    )

    assert table_results.get(Metrics.COLUMN_COUNT.name) == 9
|
2022-03-04 06:59:47 +01:00
|
|
|
|
2022-02-22 08:09:02 +01:00
|
|
|
def test_avg(self):
    """
    Check avg for distinct types

    A single profiler run produces the column results for every column, so
    the MEAN metric is computed once and then checked for an Integer, a
    String and a Text column.
    """
    avg = Metrics.MEAN.value
    res = (
        Profiler(
            avg,
            profiler_interface=self.sqa_profiler_interface,
        )
        .compute_metrics()
        ._column_results
    )

    # Integer: ages are 30, 31 and NULL
    assert res.get(User.age.name)[Metrics.MEAN.name] == 30.5

    # String: every seeded name is four characters long
    assert res.get(User.name.name)[Metrics.MEAN.name] == 4.0

    # Text: the two non-null comments are 11 and 19 characters long
    assert res.get(User.comments.name)[Metrics.MEAN.name] == 15.0
|
2022-02-22 08:09:02 +01:00
|
|
|
|
|
|
|
def test_duplicate_count(self):
    """
    DUPLICATE_COUNT runs as a composed metric and is 0 for ``age``
    """
    # DUPLICATE_COUNT is requested together with COUNT and DISTINCT_COUNT.
    requested_metrics = (
        Metrics.COUNT.value,
        Metrics.DISTINCT_COUNT.value,
        Metrics.DUPLICATE_COUNT.value,
    )
    profiler = Profiler(
        *requested_metrics,
        profiler_interface=self.sqa_profiler_interface,
    )
    column_results = profiler.compute_metrics()._column_results

    age_metrics = column_results.get(User.age.name)
    assert age_metrics[Metrics.DUPLICATE_COUNT.name] == 0
|
2022-02-22 08:09:02 +01:00
|
|
|
|
|
|
|
def test_histogram(self):
    """
    Check histogram computation

    The histogram is requested together with COUNT, MIN, MAX, the first and
    third quartiles and the IQR, and the number of ``frequencies`` buckets
    is checked for the ``age``, ``id`` and ``comments`` columns.
    """

    hist = Metrics.HISTOGRAM.value
    count = Metrics.COUNT.value
    # Leading underscore keeps the ``min`` / ``max`` builtins unshadowed.
    _min = Metrics.MIN.value
    _max = Metrics.MAX.value
    first_quartile = Metrics.FIRST_QUARTILE.value
    third_quartile = Metrics.THIRD_QUARTILE.value
    iqr = Metrics.IQR.value

    res = (
        Profiler(
            hist,
            count,
            _min,
            _max,
            first_quartile,
            third_quartile,
            iqr,
            profiler_interface=self.sqa_profiler_interface,
        )
        .compute_metrics()
        ._column_results
    )

    age_histogram = res.get(User.age.name)[Metrics.HISTOGRAM.name]
    id_histogram = res.get(User.id.name)[Metrics.HISTOGRAM.name]
    comments_histogram = res.get(User.comments.name)[Metrics.HISTOGRAM.name]

    assert age_histogram
    assert len(age_histogram["frequencies"]) == 1
    assert id_histogram
    assert len(id_histogram["frequencies"]) == 2
    assert comments_histogram
    assert len(comments_histogram["frequencies"]) == 1
|
2022-02-24 07:08:39 +01:00
|
|
|
|
|
|
|
def test_like_count(self):
    """
    LIKE_COUNT on ``name`` returns the expected count for each pattern
    """
    # SQLite's LIKE is case-insensitive by default, so this only verifies
    # that the metric runs correctly, not the implementation logic.
    patterns = (
        ("J%", 3),
        ("Jo%", 2),
    )
    for pattern, expected_matches in patterns:
        like_metric = add_props(expression=pattern)(Metrics.LIKE_COUNT.value)
        profiler = Profiler(
            like_metric,
            profiler_interface=self.sqa_profiler_interface,
        )
        column_results = profiler.compute_metrics()._column_results

        name_metrics = column_results.get(User.name.name)
        assert name_metrics[Metrics.LIKE_COUNT.name] == expected_matches
|
2022-02-24 07:08:39 +01:00
|
|
|
|
|
|
|
def test_ilike_count(self):
    """
    ILIKE_COUNT (case-insensitive LIKE) on ``name`` for two patterns
    """
    patterns = (
        ("j%", 3),
        ("ja%", 1),
    )
    for pattern, expected_matches in patterns:
        ilike_metric = add_props(expression=pattern)(Metrics.ILIKE_COUNT.value)
        profiler = Profiler(
            ilike_metric,
            profiler_interface=self.sqa_profiler_interface,
        )
        column_results = profiler.compute_metrics()._column_results

        name_metrics = column_results.get(User.name.name)
        assert name_metrics[Metrics.ILIKE_COUNT.name] == expected_matches
|
2022-02-24 07:08:39 +01:00
|
|
|
|
|
|
|
def test_like_ratio(self):
    """
    LIKE_RATIO on ``name`` is 1.0 when every name matches ``J%``
    """
    like_metric = add_props(expression="J%")(Metrics.LIKE_COUNT.value)
    # The ratio is requested together with LIKE_COUNT and COUNT.
    profiler = Profiler(
        like_metric,
        Metrics.COUNT.value,
        Metrics.LIKE_RATIO.value,
        profiler_interface=self.sqa_profiler_interface,
    )
    column_results = profiler.compute_metrics()._column_results

    name_metrics = column_results.get(User.name.name)
    assert name_metrics[Metrics.LIKE_RATIO.name] == 1.0
|
2022-02-24 07:08:39 +01:00
|
|
|
|
|
|
|
def test_ilike_ratio(self):
    """
    ILIKE_RATIO on ``name`` is 1.0 when every name matches ``J%``
    """
    # SQLite's LIKE is case-insensitive by default, so this only verifies
    # that the metric runs correctly, not the implementation logic.
    ilike_metric = add_props(expression="J%")(Metrics.ILIKE_COUNT.value)
    # The ratio is requested together with ILIKE_COUNT and COUNT.
    profiler = Profiler(
        ilike_metric,
        Metrics.COUNT.value,
        Metrics.ILIKE_RATIO.value,
        profiler_interface=self.sqa_profiler_interface,
    )
    column_results = profiler.compute_metrics()._column_results

    name_metrics = column_results.get(User.name.name)
    assert name_metrics[Metrics.ILIKE_RATIO.name] == 1.0
|
2022-02-24 07:08:39 +01:00
|
|
|
|
|
|
|
def test_max(self):
    """
    MAX returns 31 for ``age`` and 4 for ``name``
    """
    profiler = Profiler(
        Metrics.MAX.value,
        profiler_interface=self.sqa_profiler_interface,
    )
    column_results = profiler.compute_metrics()._column_results

    age_metrics = column_results.get(User.age.name)
    assert age_metrics[Metrics.MAX.name] == 31

    # 4 is the length of every seeded name.
    name_metrics = column_results.get(User.name.name)
    assert name_metrics.get(Metrics.MAX.name) == 4
|
2022-02-25 18:26:30 +01:00
|
|
|
|
2022-02-24 07:08:39 +01:00
|
|
|
def test_min_length(self):
    """
    Check MIN_LENGTH metric

    A single profiler run produces the column results for every column, so
    the metric is computed once and then checked for an Integer, a String
    and a Text column.
    """

    min_length = Metrics.MIN_LENGTH.value
    res = (
        Profiler(
            min_length,
            profiler_interface=self.sqa_profiler_interface,
        )
        .compute_metrics()
        ._column_results
    )

    # Integer: no length is reported for a numeric column
    assert res.get(User.age.name).get(Metrics.MIN_LENGTH.name) is None

    # String: every seeded name is four characters long
    assert res.get(User.name.name)[Metrics.MIN_LENGTH.name] == 4

    # Text: the shortest comment is "no comments"
    assert res.get(User.comments.name)[Metrics.MIN_LENGTH.name] == 11
|
2022-02-24 07:08:39 +01:00
|
|
|
|
|
|
|
def test_max_length(self):
    """
    Check MAX_LENGTH metric

    A single profiler run produces the column results for every column, so
    the metric is computed once and then checked for an Integer, a String
    and a Text column.
    """
    max_length = Metrics.MAX_LENGTH.value
    res = (
        Profiler(
            max_length,
            profiler_interface=self.sqa_profiler_interface,
        )
        .compute_metrics()
        ._column_results
    )

    # Integer: no length is reported for a numeric column
    assert res.get(User.age.name).get(Metrics.MAX_LENGTH.name) is None

    # String: every seeded name is four characters long
    assert res.get(User.name.name)[Metrics.MAX_LENGTH.name] == 4

    # Text: the longest comment is "maybe some comments"
    assert res.get(User.comments.name)[Metrics.MAX_LENGTH.name] == 19
|
2022-02-24 07:08:39 +01:00
|
|
|
|
|
|
|
def test_sum(self):
    """
    Check SUM Metric

    A single profiler run produces the column results for every column, so
    the metric is computed once and then checked for both ``age`` and
    ``name``.
    """
    _sum = Metrics.SUM.value

    res = (
        Profiler(
            _sum,
            profiler_interface=self.sqa_profiler_interface,
        )
        .compute_metrics()
        ._column_results
    )

    # Integer: 30 + 31, the third row has no age
    assert res.get(User.age.name)[Metrics.SUM.name] == 61

    # String: three names of four characters each
    assert res.get(User.name.name).get(Metrics.SUM.name) == 12
|
2022-02-24 07:08:39 +01:00
|
|
|
|
|
|
|
def test_unique_count(self):
    """
    UNIQUE_COUNT on ``name`` is 1 for the seeded John, Jane, John rows
    """
    profiler = Profiler(
        Metrics.UNIQUE_COUNT.value,
        profiler_interface=self.sqa_profiler_interface,
    )
    column_results = profiler.compute_metrics()._column_results

    name_metrics = column_results.get(User.name.name)
    assert name_metrics[Metrics.UNIQUE_COUNT.name] == 1
|
2022-02-24 07:08:39 +01:00
|
|
|
|
|
|
|
def test_unique_ratio(self):
    """
    UNIQUE_RATIO runs as a composed metric and rounds to 0.33 for ``name``
    """
    # The ratio is requested together with COUNT and UNIQUE_COUNT.
    requested_metrics = (
        Metrics.COUNT.value,
        Metrics.UNIQUE_COUNT.value,
        Metrics.UNIQUE_RATIO.value,
    )
    profiler = Profiler(
        *requested_metrics,
        profiler_interface=self.sqa_profiler_interface,
    )
    column_results = profiler.compute_metrics()._column_results

    name_ratio = column_results.get(User.name.name)[Metrics.UNIQUE_RATIO.name]
    assert str(round(name_ratio, 2)) == "0.33"
|
|
|
|
|
|
|
|
def test_distinct_count(self):
    """
    DISTINCT_COUNT on ``name`` is 2 for the seeded John, Jane, John rows
    """
    profiler = Profiler(
        Metrics.DISTINCT_COUNT.value,
        profiler_interface=self.sqa_profiler_interface,
    )
    column_results = profiler.compute_metrics()._column_results

    name_metrics = column_results.get(User.name.name)
    assert name_metrics[Metrics.DISTINCT_COUNT.name] == 2.0
|
|
|
|
|
|
|
|
def test_distinct_ratio(self):
    """
    DISTINCT_RATIO runs as a composed metric and rounds to 0.67 for ``name``
    """
    # The ratio is requested together with COUNT and DISTINCT_COUNT.
    requested_metrics = (
        Metrics.COUNT.value,
        Metrics.DISTINCT_COUNT.value,
        Metrics.DISTINCT_RATIO.value,
    )
    profiler = Profiler(
        *requested_metrics,
        profiler_interface=self.sqa_profiler_interface,
    )
    column_results = profiler.compute_metrics()._column_results

    name_ratio = column_results.get(User.name.name)[Metrics.DISTINCT_RATIO.name]
    assert str(round(name_ratio, 2)) == "0.67"
|
2022-03-04 06:59:47 +01:00
|
|
|
|
|
|
|
def test_count_in_set(self):
    """
    COUNT_IN_SET on ``name`` counts the rows whose name is in the given set
    """
    # Each entry pairs the allowed values with the expected row count.
    scenarios = (
        (["John"], 2.0),
        (["John", "Jane"], 3),
    )
    for allowed_names, expected_rows in scenarios:
        in_set_metric = add_props(values=allowed_names)(
            Metrics.COUNT_IN_SET.value
        )
        profiler = Profiler(
            in_set_metric,
            profiler_interface=self.sqa_profiler_interface,
        )
        column_results = profiler.compute_metrics()._column_results

        name_metrics = column_results.get(User.name.name)
        assert name_metrics[Metrics.COUNT_IN_SET.name] == expected_rows
|
2022-03-22 15:55:44 +01:00
|
|
|
|
|
|
|
def test_histogram_empty(self):
    """
    HISTOGRAM on a table without rows yields no entry for ``age``
    """

    class EmptyUser(Base):
        # Local model: its table is created below and never populated.
        __tablename__ = "empty_users"
        id = Column(Integer, primary_key=True)
        name = Column(String(256))
        fullname = Column(String(256))
        nickname = Column(String(256))
        comments = Column(TEXT)
        age = Column(Integer)

    EmptyUser.__table__.create(bind=self.engine)

    # Build a dedicated interface whose ORM conversion returns EmptyUser.
    orm_patch = patch.object(
        SQAProfilerInterface, "_convert_table_to_orm_object", return_value=EmptyUser
    )
    with orm_patch:
        empty_interface = SQAProfilerInterface(
            self.sqlite_conn,
            None,
            self.table_entity,
            None,
            None,
            None,
            None,
            None,
        )

    histogram_profiler = Profiler(
        Metrics.HISTOGRAM.value,
        profiler_interface=empty_interface,
    )
    column_results = histogram_profiler.compute_metrics()._column_results

    assert column_results.get(EmptyUser.age.name) is None
|
2022-07-06 10:12:29 +02:00
|
|
|
|
|
|
|
def test_not_like_count(self):
    """
    NOT_LIKE_COUNT on ``name`` for several patterns, one sub-test each
    """
    # SQLite's LIKE is case-insensitive by default, so this only verifies
    # that the metric runs correctly, not the implementation logic.
    scenarios = (
        ("b%", 0),
        ("Jo%", 2),
        ("Ja%", 1),
        ("J%", 3),
    )

    for pattern, expected_count in scenarios:
        with self.subTest(expression=pattern, expected=expected_count):
            not_like_metric = add_props(expression=pattern)(
                Metrics.NOT_LIKE_COUNT.value
            )
            profiler = Profiler(
                not_like_metric,
                profiler_interface=self.sqa_profiler_interface,
            )
            column_results = profiler.compute_metrics()._column_results

            name_metrics = column_results.get(User.name.name)
            assert name_metrics[Metrics.NOT_LIKE_COUNT.name] == expected_count
|
|
|
|
|
|
|
|
def test_median(self):
    """
    MEDIAN returns 30 for ``age`` and 11 for ``comments``
    """
    profiler = Profiler(
        Metrics.MEDIAN.value,
        profiler_interface=self.sqa_profiler_interface,
    )
    column_results = profiler.compute_metrics()._column_results

    age_metrics = column_results.get(User.age.name)
    assert age_metrics[Metrics.MEDIAN.name] == 30

    comments_metrics = column_results.get(User.comments.name)
    assert comments_metrics[Metrics.MEDIAN.name] == 11
|
2022-07-29 10:41:53 +02:00
|
|
|
|
2023-03-03 21:56:32 +01:00
|
|
|
def test_first_quartile(self):
    """
    FIRST_QUARTILE returns 30 for ``age`` and 11 for ``comments``
    """
    profiler = Profiler(
        Metrics.FIRST_QUARTILE.value,
        profiler_interface=self.sqa_profiler_interface,
    )
    column_results = profiler.compute_metrics()._column_results

    age_metrics = column_results.get(User.age.name)
    assert age_metrics[Metrics.FIRST_QUARTILE.name] == 30

    comments_metrics = column_results.get(User.comments.name)
    assert comments_metrics[Metrics.FIRST_QUARTILE.name] == 11
|
2023-03-03 21:56:32 +01:00
|
|
|
|
|
|
|
def test_third_quartile(self):
    """
    Check the THIRD_QUARTILE metric on the age and comments columns
    """
    profiler = Profiler(
        Metrics.THIRD_QUARTILE.value,
        profiler_interface=self.sqa_profiler_interface,
    )
    column_results = profiler.compute_metrics()._column_results
    metric_key = Metrics.THIRD_QUARTILE.name

    assert column_results.get(User.age.name)[metric_key] == 31
    assert column_results.get(User.comments.name)[metric_key] == 19
|
2023-03-03 21:56:32 +01:00
|
|
|
|
|
|
|
def test_iqr(self):
    """Check the IQR metric when computed together with both quartiles"""
    # The quartile metrics are handed to the profiler ahead of IQR, in the
    # same order as before.
    requested_metrics = (
        Metrics.FIRST_QUARTILE.value,
        Metrics.THIRD_QUARTILE.value,
        Metrics.IQR.value,
    )
    profiler = Profiler(
        *requested_metrics,
        profiler_interface=self.sqa_profiler_interface,
    )
    column_results = profiler.compute_metrics()._column_results
    metric_key = Metrics.IQR.name

    assert column_results.get(User.age.name)[metric_key] == 1
    assert column_results.get(User.comments.name)[metric_key] == 8
|
2023-03-03 21:56:32 +01:00
|
|
|
|
2022-07-30 18:39:36 +02:00
|
|
|
def test_sum_function(self):
    """Check that the overwritten SumFn sums User.age to 61"""
    db_session = self.sqa_profiler_interface.session
    age_sum_query = db_session.query(SumFn(User.age)).select_from(User)
    total_age = age_sum_query.scalar()

    assert total_age == 61
|
|
|
|
|
2022-12-07 14:33:30 +01:00
|
|
|
def test_system_metric(self):
    """Smoke test: build the SYSTEM metric and call its sql() on the session"""
    with_props = add_props(table=User, ometa_client=None, db_service=None)
    system_metric_cls = with_props(Metrics.SYSTEM.value)
    db_session = self.sqa_profiler_interface.session

    # No assertion on the result: the test only requires the call not to raise.
    system_metric = system_metric_cls()
    system_metric.sql(db_session)
|
|
|
|
|
2023-11-17 17:51:39 +01:00
|
|
|
def test_table_custom_metric(self):
    """Check custom metrics declared at table level"""
    id_column = EntityColumn(
        name=ColumnName("id"),
        dataType=DataType.INT,
    )
    table_custom_metrics = [
        CustomMetric(
            name="CustomerBornedAfter1991",
            expression="SELECT COUNT(id) FROM users WHERE dob > '1991-01-01'",
        ),
        CustomMetric(
            name="AverageAge",
            expression="SELECT SUM(age)/COUNT(*) FROM users",
        ),
    ]
    user_table = Table(
        id=uuid4(),
        name="user",
        columns=[id_column],
        customMetrics=table_custom_metrics,
    )

    # While the interface is being built, the table-to-ORM conversion is
    # patched to hand back the User model directly.
    with patch.object(
        SQAProfilerInterface, "_convert_table_to_orm_object", return_value=User
    ):
        self.sqa_profiler_interface = SQAProfilerInterface(
            self.sqlite_conn,
            None,
            user_table,
            None,
            None,
            None,
            None,
            None,
            thread_count=1,
        )

    computed = Profiler(
        profiler_interface=self.sqa_profiler_interface,
    ).compute_metrics()

    # Values are only checked for metrics that actually show up in the results.
    expected_values = (
        ("CustomerBornedAfter1991", 2),
        ("AverageAge", 20.0),
    )
    for table_result in computed._table_results.values():
        for metric in table_result:
            for metric_name, expected in expected_values:
                if metric.name == metric_name:
                    assert metric.value == expected
|
|
|
|
|
|
|
|
def test_column_custom_metric(self):
    """Check custom metrics declared on the id column"""
    id_custom_metrics = [
        CustomMetric(
            name="CustomerBornedAfter1991",
            columnName="id",
            expression="SELECT SUM(id) FROM users WHERE dob > '1991-01-01'",
        ),
        CustomMetric(
            name="AverageAge",
            columnName="id",
            expression="SELECT SUM(age)/COUNT(*) FROM users",
        ),
    ]
    id_column = EntityColumn(
        name=ColumnName("id"),
        dataType=DataType.INT,
        customMetrics=id_custom_metrics,
    )
    user_table = Table(
        id=uuid4(),
        name="user",
        columns=[id_column],
    )

    # While the interface is being built, the table-to-ORM conversion is
    # patched to hand back the User model directly.
    with patch.object(
        SQAProfilerInterface, "_convert_table_to_orm_object", return_value=User
    ):
        self.sqa_profiler_interface = SQAProfilerInterface(
            self.sqlite_conn,
            None,
            user_table,
            None,
            None,
            None,
            None,
            None,
            thread_count=1,
        )

    computed = Profiler(
        profiler_interface=self.sqa_profiler_interface,
    ).compute_metrics()

    # Values are only checked for metrics that actually show up in the results;
    # columns without a "customMetrics" entry contribute nothing.
    expected_values = (
        ("CustomerBornedAfter1991", 3.0),
        ("AverageAge", 20.0),
    )
    for column_result in computed._column_results.values():
        for metric in column_result.get("customMetrics", []):
            for metric_name, expected in expected_values:
                if metric.name == metric_name:
                    assert metric.value == expected
|
|
|
|
|
2022-07-29 10:41:53 +02:00
|
|
|
@classmethod
def tearDownClass(cls) -> None:
    """Remove the file at cls.db_path, then run the parent class teardown"""
    database_file = cls.db_path
    os.remove(database_file)
    return super().tearDownClass()
|