838 lines
24 KiB
Python
Raw Normal View History

# Copyright 2021 Collate
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Test Metrics behavior
"""
import datetime
from unittest import TestCase
from sqlalchemy import (
TEXT,
Column,
Date,
DateTime,
Integer,
String,
Time,
create_engine,
)
from sqlalchemy.orm import declarative_base
from metadata.generated.schema.entity.services.connections.database.sqliteConnection import (
SQLiteConnection,
SQLiteScheme,
)
from metadata.orm_profiler.interfaces.sqa_profiler_interface import SQAProfilerInterface
from metadata.orm_profiler.metrics.core import add_props
from metadata.orm_profiler.metrics.registry import Metrics
from metadata.orm_profiler.profiler.core import Profiler
# Declarative base class that the ORM models defined in this module inherit from.
Base = declarative_base()
class User(Base):
    """
    ORM model for the ``users`` table that the metric tests profile
    """

    __tablename__ = "users"
    id = Column(Integer, primary_key=True)
    name = Column(String(256))
    fullname = Column(String(256))
    nickname = Column(String(256))
    comments = Column(TEXT)
    age = Column(Integer)
    dob = Column(DateTime)  # date of birth
    tob = Column(Time)  # time of birth
    doe = Column(Date)  # date of employment
class MetricsTest(TestCase):
    """
    Run checks on different metrics

    Every test builds a Profiler on top of the shared profiler interface
    declared below and asserts on the computed column or table results.
    """

    # Connection definition for SQLite using the pysqlite scheme
    sqlite_conn = SQLiteConnection(scheme=SQLiteScheme.sqlite_pysqlite)
    # Profiler interface created once, when the class body runs, and shared by all tests
    sqa_profiler_interface = SQAProfilerInterface(sqlite_conn)
    # Engine behind the interface's session; used to create the test tables
    engine = sqa_profiler_interface.session.get_bind()
@classmethod
def setUpClass(cls) -> None:
    """
    Create the ``users`` table and load the three fixture rows

    The rows contain NULLs (nickname, comments, age) and a repeated name;
    the null, unique and distinct metric tests rely on those values.
    """
    User.__table__.create(bind=cls.engine)

    first_john = User(
        name="John",
        fullname="John Doe",
        nickname="johnny b goode",
        comments="no comments",
        age=30,
        dob=datetime.datetime(1992, 5, 17),
        tob=datetime.time(11, 2, 32),
        doe=datetime.date(2020, 1, 12),
    )
    jane = User(
        name="Jane",
        fullname="Jone Doe",
        nickname=None,
        comments="maybe some comments",
        age=31,
        dob=datetime.datetime(1991, 4, 4),
        tob=datetime.time(10, 1, 31),
        doe=datetime.date(2009, 11, 11),
    )
    second_john = User(
        name="John",
        fullname="John Doe",
        nickname=None,
        comments=None,
        age=None,
        dob=datetime.datetime(1982, 2, 2),
        tob=datetime.time(9, 3, 25),
        doe=datetime.date(2012, 12, 1),
    )

    # Persist the rows in the same order they were declared
    session = cls.sqa_profiler_interface.session
    session.add_all([first_john, jane, second_john])
    session.commit()
def setUp(self) -> None:
    """
    Create the sampler and the runner for ``User`` before each test
    """
    interface = self.sqa_profiler_interface
    interface.create_sampler(User)
    interface.create_runner(User)
def test_count(self):
    """
    COUNT on ``name`` must report the three fixture values
    """
    results = (
        Profiler(
            Metrics.COUNT.value,
            profiler_interface=self.sqa_profiler_interface,
            table=User,
            use_cols=[User.name],
        )
        .execute()
        ._column_results
    )
    # Results are keyed by column name first, then by metric name
    name_metrics = results.get(User.name.name)
    assert name_metrics.get(Metrics.COUNT.name) == 3
def test_min(self):
    """
    MIN on ``age`` must return the smallest non-null age
    """
    results = (
        Profiler(
            Metrics.MIN.value,
            profiler_interface=self.sqa_profiler_interface,
            table=User,
            use_cols=[User.age],
        )
        .execute()
        ._column_results
    )
    # Results are keyed by column name first, then by metric name
    age_metrics = results.get(User.age.name)
    assert age_metrics.get(Metrics.MIN.name) == 30
def test_std(self):
    """
    STDDEV on ``age`` runs and yields the value expected for SQLite
    """
    results = (
        Profiler(
            Metrics.STDDEV.value,
            profiler_interface=self.sqa_profiler_interface,
            table=User,
            use_cols=[User.age],
        )
        .execute()
        ._column_results
    )
    age_metrics = results.get(User.age.name)
    # The custom STD implementation used for SQLite returns the squared STD,
    # so this value is only meaningful for testing purposes.
    assert age_metrics.get(Metrics.STDDEV.name) == 0.25
def test_earliest_time(self):
    """
    MIN on the datetime, time and date columns returns the earliest value
    """
    results = (
        Profiler(
            Metrics.MIN.value,
            profiler_interface=self.sqa_profiler_interface,
            table=User,
            use_cols=[User.dob, User.tob, User.doe],
        )
        .execute()
        ._column_results
    )
    # The results are compared against their string representation
    expected_minimums = (
        (User.dob, "1982-02-02 00:00:00.000000"),
        (User.tob, "09:03:25.000000"),
        (User.doe, "2009-11-11"),
    )
    for column, earliest in expected_minimums:
        assert results.get(column.name).get(Metrics.MIN.name) == earliest
def test_latest_time(self):
    """
    MAX on the datetime, time and date columns returns the latest value
    """
    results = (
        Profiler(
            Metrics.MAX.value,
            profiler_interface=self.sqa_profiler_interface,
            table=User,
            use_cols=[User.dob, User.tob, User.doe],
        )
        .execute()
        ._column_results
    )
    # The results are compared against their string representation
    expected_maximums = (
        (User.dob, "1992-05-17 00:00:00.000000"),
        (User.tob, "11:02:32.000000"),
        (User.doe, "2020-01-12"),
    )
    for column, latest in expected_maximums:
        assert results.get(column.name).get(Metrics.MAX.name) == latest
def test_null_count(self):
    """
    NULL_COUNT on ``nickname`` must find the two rows without a nickname
    """
    results = (
        Profiler(
            Metrics.NULL_COUNT.value,
            profiler_interface=self.sqa_profiler_interface,
            table=User,
            use_cols=[User.nickname],
        )
        .execute()
        ._column_results
    )
    nickname_metrics = results.get(User.nickname.name)
    assert nickname_metrics.get(Metrics.NULL_COUNT.name) == 2
def test_null_ratio(self):
    """
    NULL_RATIO is a composed metric: run it together with COUNT and
    NULL_COUNT and check the ratio for ``nickname``
    """
    # The ratio is built from the other two metrics, so all three are passed
    results = (
        Profiler(
            Metrics.COUNT.value,
            Metrics.NULL_COUNT.value,
            Metrics.NULL_RATIO.value,
            profiler_interface=self.sqa_profiler_interface,
            table=User,
            use_cols=[User.nickname],
        )
        .execute()
        ._column_results
    )
    ratio = results.get(User.nickname.name).get(Metrics.NULL_RATIO.name)
    # Two of the three nicknames are NULL
    assert str(round(ratio, 2)) == "0.67"
def test_table_row_count(self):
    """
    ROW_COUNT is a table metric: it must report the three fixture rows
    """
    table_results = (
        Profiler(
            Metrics.ROW_COUNT.value,
            profiler_interface=self.sqa_profiler_interface,
            table=User,
        )
        .execute()
        ._table_results
    )
    assert table_results.get(Metrics.ROW_COUNT.name) == 3
def test_table_column_count(self):
    """
    COLUMN_COUNT must report the nine columns declared on ``User``
    """
    # The metric receives the table through add_props
    column_count_metric = add_props(table=User)(Metrics.COLUMN_COUNT.value)
    table_results = (
        Profiler(
            column_count_metric,
            profiler_interface=self.sqa_profiler_interface,
            table=User,
        )
        .execute()
        ._table_results
    )
    assert table_results.get(Metrics.COLUMN_COUNT.name) == 9
def test_avg(self):
    """
    MEAN for distinct column types

    For the Integer column the expected value is the average age; for the
    String and Text columns the expected values match the average length
    of the stored values.
    """
    expected_means = (
        (User.age, 30.5),  # Integer
        (User.name, 4.0),  # String
        (User.comments, 15.0),  # Text
    )
    for column, expected in expected_means:
        profiler = Profiler(
            Metrics.MEAN.value,
            profiler_interface=self.sqa_profiler_interface,
            table=User,
            use_cols=[column],
        )
        results = profiler.execute()._column_results
        assert results.get(column.name)[Metrics.MEAN.name] == expected
def test_duplicate_count(self):
    """
    DUPLICATE_COUNT is a composed metric: run it together with COUNT and
    DISTINCT_COUNT and check that ``age`` has no duplicates
    """
    profiler = Profiler(
        Metrics.COUNT.value,
        Metrics.DISTINCT_COUNT.value,
        Metrics.DUPLICATE_COUNT.value,
        profiler_interface=self.sqa_profiler_interface,
        table=User,
        use_cols=[User.age],
    )
    results = profiler.execute()._column_results
    age_metrics = results.get(User.age.name)
    assert age_metrics[Metrics.DUPLICATE_COUNT.name] == 0
def test_histogram(self):
    """
    HISTOGRAM on ``age`` with five requested bins
    """
    histogram_metric = add_props(bins=5)(Metrics.HISTOGRAM.value)
    profiler = Profiler(
        histogram_metric,
        profiler_interface=self.sqa_profiler_interface,
        table=User,
        use_cols=[User.age],
    )
    results = profiler.execute()._column_results
    histogram = results.get(User.age.name)[Metrics.HISTOGRAM.name]
    # A result must have been produced at all
    assert histogram
    # Too few values to fill the requested bins. Original note: counts nulls
    assert len(histogram["frequencies"]) == 3
def test_like_count(self):
    """
    LIKE_COUNT on ``name`` for two different expressions
    """
    # In sqlite, LIKE is insensitive by default, so we just check here
    # that the metrics runs correctly rather than the implementation logic.
    expected_counts = (
        ("J%", 3),
        ("Jo%", 2),
    )
    for expression, expected in expected_counts:
        like_metric = add_props(expression=expression)(Metrics.LIKE_COUNT.value)
        profiler = Profiler(
            like_metric,
            profiler_interface=self.sqa_profiler_interface,
            table=User,
            use_cols=[User.name],
        )
        results = profiler.execute()._column_results
        assert results.get(User.name.name)[Metrics.LIKE_COUNT.name] == expected

    # Disabled check, kept for reference: the metric without an expression
    # on the Integer column.
    # Running safely
    # with pytest.raises(AttributeError):
    #     Profiler(
    #         Metrics.LIKE_COUNT.value,
    #         profiler_interface=self.sqa_profiler_interface,
    #         table=User,
    #         use_cols=[User.age],
    #     ).execute()
def test_ilike_count(self):
    """
    ILIKE_COUNT (case-insensitive LIKE) on ``name`` for two expressions
    """
    # Lowercase patterns are matched against the capitalised fixture names
    expected_counts = (
        ("j%", 3),
        ("ja%", 1),
    )
    for expression, expected in expected_counts:
        ilike_metric = add_props(expression=expression)(Metrics.ILIKE_COUNT.value)
        profiler = Profiler(
            ilike_metric,
            profiler_interface=self.sqa_profiler_interface,
            table=User,
            use_cols=[User.name],
        )
        results = profiler.execute()._column_results
        assert results.get(User.name.name)[Metrics.ILIKE_COUNT.name] == expected

    # Disabled check, kept for reference: the metric without an expression
    # on the Integer column.
    # Running safely
    # with pytest.raises(AttributeError):
    #     Profiler(
    #         Metrics.ILIKE_COUNT.value,
    #         profiler_interface=self.sqa_profiler_interface,
    #         table=User,
    #         use_cols=[User.age],
    #     ).execute()
def test_like_ratio(self):
    """
    LIKE_RATIO is a composed metric: run it together with LIKE_COUNT and
    COUNT and check that every name matches ``J%``
    """
    like_metric = add_props(expression="J%")(Metrics.LIKE_COUNT.value)
    profiler = Profiler(
        like_metric,
        Metrics.COUNT.value,
        Metrics.LIKE_RATIO.value,
        profiler_interface=self.sqa_profiler_interface,
        table=User,
        use_cols=[User.name],
    )
    results = profiler.execute()._column_results
    name_metrics = results.get(User.name.name)
    assert name_metrics[Metrics.LIKE_RATIO.name] == 1.0
def test_ilike_ratio(self):
    """
    ILIKE_RATIO is a composed metric: run it together with ILIKE_COUNT and
    COUNT and check that every name matches ``J%``
    """
    # In sqlite, LIKE is insensitive by default, so we just check here
    # that the metrics runs correctly rather than the implementation logic.
    ilike_metric = add_props(expression="J%")(Metrics.ILIKE_COUNT.value)
    profiler = Profiler(
        ilike_metric,
        Metrics.COUNT.value,
        Metrics.ILIKE_RATIO.value,
        profiler_interface=self.sqa_profiler_interface,
        table=User,
        use_cols=[User.name],
    )
    results = profiler.execute()._column_results
    name_metrics = results.get(User.name.name)
    assert name_metrics[Metrics.ILIKE_RATIO.name] == 1.0
def test_max(self):
    """
    MAX on ``age`` must return the largest non-null age
    """
    profiler = Profiler(
        Metrics.MAX.value,
        profiler_interface=self.sqa_profiler_interface,
        table=User,
        use_cols=[User.age],
    )
    results = profiler.execute()._column_results
    age_metrics = results.get(User.age.name)
    assert age_metrics[Metrics.MAX.name] == 31

    # TMP disable min/max on strings
    # res = (
    #     Profiler(_max, profiler_interface=self.sqa_profiler_interface, table=User, use_cols=[User.name])
    #     .execute()
    #     ._column_results
    # )
    # assert res.get(User.name.name)[Metrics.MAX.name] == "John"
def test_min_length(self):
    """
    MIN_LENGTH for distinct column types

    The Integer column must produce no value; the String and Text columns
    must report the length of their shortest stored value.
    """
    metric = Metrics.MIN_LENGTH.value

    def metrics_for(column):
        """Profile a single column and return its metric results."""
        profiler = Profiler(
            metric,
            profiler_interface=self.sqa_profiler_interface,
            table=User,
            use_cols=[column],
        )
        return profiler.execute()._column_results.get(column.name)

    # Integer
    assert metrics_for(User.age).get(Metrics.MIN_LENGTH.name) is None
    # String
    assert metrics_for(User.name)[Metrics.MIN_LENGTH.name] == 4
    # Text
    assert metrics_for(User.comments)[Metrics.MIN_LENGTH.name] == 11
def test_max_length(self):
    """
    MAX_LENGTH for distinct column types

    The Integer column must produce no value; the String and Text columns
    must report the length of their longest stored value.
    """
    metric = Metrics.MAX_LENGTH.value

    def metrics_for(column):
        """Profile a single column and return its metric results."""
        profiler = Profiler(
            metric,
            profiler_interface=self.sqa_profiler_interface,
            table=User,
            use_cols=[column],
        )
        return profiler.execute()._column_results.get(column.name)

    # Integer
    assert metrics_for(User.age).get(Metrics.MAX_LENGTH.name) is None
    # String
    assert metrics_for(User.name)[Metrics.MAX_LENGTH.name] == 4
    # Text
    assert metrics_for(User.comments)[Metrics.MAX_LENGTH.name] == 19
def test_sum(self):
    """
    SUM adds up the ages and produces no value for the String column
    """
    metric = Metrics.SUM.value

    # Integer column: 30 + 31, the NULL age does not contribute
    age_profiler = Profiler(
        metric,
        profiler_interface=self.sqa_profiler_interface,
        table=User,
        use_cols=[User.age],
    )
    age_results = age_profiler.execute()._column_results
    assert age_results.get(User.age.name)[Metrics.SUM.name] == 61

    # String column: no SUM entry is expected
    name_profiler = Profiler(
        metric,
        profiler_interface=self.sqa_profiler_interface,
        table=User,
        use_cols=[User.name],
    )
    name_results = name_profiler.execute()._column_results
    assert name_results.get(User.name.name).get(Metrics.SUM.name) is None
def test_unique_count(self):
    """
    UNIQUE_COUNT on ``name``: only one name appears exactly once
    """
    profiler = Profiler(
        Metrics.UNIQUE_COUNT.value,
        profiler_interface=self.sqa_profiler_interface,
        table=User,
        use_cols=[User.name],
    )
    results = profiler.execute()._column_results
    name_metrics = results.get(User.name.name)
    assert name_metrics[Metrics.UNIQUE_COUNT.name] == 1
def test_unique_ratio(self):
    """
    UNIQUE_RATIO is a composed metric: run it together with COUNT and
    UNIQUE_COUNT and check the ratio for ``name``
    """
    profiler = Profiler(
        Metrics.COUNT.value,
        Metrics.UNIQUE_COUNT.value,
        Metrics.UNIQUE_RATIO.value,
        profiler_interface=self.sqa_profiler_interface,
        table=User,
        use_cols=[User.name],
    )
    results = profiler.execute()._column_results
    ratio = results.get(User.name.name)[Metrics.UNIQUE_RATIO.name]
    # One unique name out of three values
    assert str(round(ratio, 2)) == "0.33"
def test_distinct_count(self):
    """
    DISTINCT_COUNT on ``name``: the fixture holds two different names
    """
    profiler = Profiler(
        Metrics.DISTINCT_COUNT.value,
        profiler_interface=self.sqa_profiler_interface,
        table=User,
        use_cols=[User.name],
    )
    results = profiler.execute()._column_results
    name_metrics = results.get(User.name.name)
    assert name_metrics[Metrics.DISTINCT_COUNT.name] == 2.0
def test_distinct_ratio(self):
    """
    DISTINCT_RATIO is a composed metric: run it together with COUNT and
    DISTINCT_COUNT and check the ratio for ``name``
    """
    profiler = Profiler(
        Metrics.COUNT.value,
        Metrics.DISTINCT_COUNT.value,
        Metrics.DISTINCT_RATIO.value,
        profiler_interface=self.sqa_profiler_interface,
        table=User,
        use_cols=[User.name],
    )
    results = profiler.execute()._column_results
    ratio = results.get(User.name.name)[Metrics.DISTINCT_RATIO.name]
    # Two distinct names out of three values
    assert str(round(ratio, 2)) == "0.67"
def test_count_in_set(self):
    """
    COUNT_IN_SET on ``name`` for a one-value and a two-value set
    """
    expected_counts = (
        (["John"], 2.0),
        (["John", "Jane"], 3),
    )
    for values, expected in expected_counts:
        in_set_metric = add_props(values=values)(Metrics.COUNT_IN_SET.value)
        profiler = Profiler(
            in_set_metric,
            profiler_interface=self.sqa_profiler_interface,
            table=User,
            use_cols=[User.name],
        )
        results = profiler.execute()._column_results
        assert results.get(User.name.name)[Metrics.COUNT_IN_SET.name] == expected
def test_histogram_empty(self):
    """
    HISTOGRAM on a table without rows must not produce a result
    """

    class EmptyUser(Base):
        """Model for a table that is created but never populated."""

        __tablename__ = "empty_users"
        id = Column(Integer, primary_key=True)
        name = Column(String(256))
        fullname = Column(String(256))
        nickname = Column(String(256))
        comments = Column(TEXT)
        age = Column(Integer)

    # Re-target the shared interface at the empty table for this test
    interface = self.sqa_profiler_interface
    interface.create_sampler(EmptyUser)
    interface.create_runner(EmptyUser)
    EmptyUser.__table__.create(bind=self.engine)

    histogram_metric = add_props(bins=5)(Metrics.HISTOGRAM.value)
    profiler = Profiler(
        histogram_metric,
        profiler_interface=self.sqa_profiler_interface,
        table=EmptyUser,
        use_cols=[EmptyUser.age],
    )
    results = profiler.execute()._column_results
    age_metrics = results.get(EmptyUser.age.name)
    assert age_metrics.get(Metrics.HISTOGRAM.name) is None
def test_not_like_count(self):
    """
    NOT_LIKE_COUNT on ``name`` for several expressions, one sub-test each
    """
    # In sqlite, LIKE is insensitive by default, so we just check here
    # that the metrics runs correctly rather than the implementation logic.
    # NOTE(review): the expected values equal the number of fixture names
    # that match each pattern - confirm this is the intended metric semantics.
    expected_by_expression = {
        "b%": 0,
        "Jo%": 2,
        "Ja%": 1,
        "J%": 3,
    }

    for expression, expected in expected_by_expression.items():
        with self.subTest(expression=expression, expected=expected):
            not_like_metric = add_props(expression=expression)(
                Metrics.NOT_LIKE_COUNT.value
            )
            profiler = Profiler(
                not_like_metric,
                profiler_interface=self.sqa_profiler_interface,
                table=User,
                use_cols=[User.name],
            )
            results = profiler.execute()._column_results
            name_metrics = results.get(User.name.name)
            assert name_metrics[Metrics.NOT_LIKE_COUNT.name] == expected
def test_median(self):
    """
    MEDIAN on ``age`` must return 30 for the fixture data
    """
    profiler = Profiler(
        Metrics.MEDIAN.value,
        profiler_interface=self.sqa_profiler_interface,
        table=User,
        use_cols=[User.age],
    )
    results = profiler.execute()._column_results
    age_metrics = results.get(User.age.name)
    assert age_metrics[Metrics.MEDIAN.name] == 30