mirror of
https://github.com/open-metadata/OpenMetadata.git
synced 2026-01-01 01:46:57 +00:00
* Implement Cardinality Metric for String and Enum * Add Unit Tests * Update generated TypeScript types * Update ingestion/src/metadata/profiler/metrics/hybrid/cardinality_distribution.py Co-authored-by: Teddy <teddy.crepineau@gmail.com> * Fix CTE to simplify it to work with sqlite * Fix CTE to simplify it to work with sqlite * Update generated TypeScript types * Update generated TypeScript types * Add 'cardinalityDistribution' metric to profiler configuration * Update generated TypeScript types --------- Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com> Co-authored-by: Teddy <teddy.crepineau@gmail.com> Co-authored-by: Shailesh Parmar <shailesh.parmar.webdev@gmail.com>
1181 lines
35 KiB
Python
1181 lines
35 KiB
Python
# Copyright 2025 Collate
|
|
# Licensed under the Collate Community License, Version 1.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
# https://github.com/open-metadata/OpenMetadata/blob/main/ingestion/LICENSE
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
"""
|
|
Test Metrics behavior
|
|
"""
|
|
import datetime
|
|
import math
|
|
import os
|
|
from unittest import TestCase
|
|
from unittest.mock import patch
|
|
from uuid import uuid4
|
|
|
|
from sqlalchemy import TEXT, Column, Date, DateTime, Float, Integer, String, Time
|
|
from sqlalchemy.orm import declarative_base
|
|
|
|
from metadata.generated.schema.entity.data.table import Column as EntityColumn
|
|
from metadata.generated.schema.entity.data.table import ColumnName, DataType, Table
|
|
from metadata.generated.schema.entity.services.connections.database.sqliteConnection import (
|
|
SQLiteConnection,
|
|
SQLiteScheme,
|
|
)
|
|
from metadata.generated.schema.tests.customMetric import CustomMetric
|
|
from metadata.profiler.interface.sqlalchemy.profiler_interface import (
|
|
SQAProfilerInterface,
|
|
)
|
|
from metadata.profiler.metrics.core import add_props
|
|
from metadata.profiler.metrics.registry import Metrics
|
|
from metadata.profiler.orm.functions.sum import SumFn
|
|
from metadata.profiler.processor.core import Profiler
|
|
from metadata.sampler.sqlalchemy.sampler import SQASampler
|
|
|
|
Base = declarative_base()
|
|
|
|
|
|
class User(Base):
    """ORM model for the ``users`` table that the tests in this module profile."""

    __tablename__ = "users"
    id = Column(Integer, primary_key=True)
    name = Column(String(256))
    fullname = Column(String(256))
    nickname = Column(String(256))
    comments = Column(TEXT)
    age = Column(Integer)
    dob = Column(DateTime)  # date of birth
    tob = Column(Time)  # time of birth
    doe = Column(Date)  # date of employment
|
|
|
|
|
|
class MetricsTest(TestCase):
    """
    Run checks on different metrics
    against a file-backed SQLite database shared by every test of the class
    """

    # SQLite file stored next to this module and named after it (``<module>.db``).
    # ``abspath`` is applied first because joining ``dirname(__file__)`` with a
    # path that is itself derived from ``__file__`` repeats the directory part
    # whenever ``__file__`` is relative.
    db_path = f"{os.path.splitext(os.path.abspath(__file__))[0]}.db"
    sqlite_conn = SQLiteConnection(
        scheme=SQLiteScheme.sqlite_pysqlite,
        databaseMode=db_path + "?check_same_thread=False",
    )

    # Table entity handed to the profiler interfaces built by the tests.
    table_entity = Table(
        id=uuid4(),
        name="user",
        columns=[
            EntityColumn(
                name=ColumnName("id"),
                dataType=DataType.INT,
            )
        ],
    )
|
|
|
|
@classmethod
def setUpClass(cls) -> None:
    """
    Build the shared profiler interface, create the ``users`` table and
    seed it with three rows
    """
    # ``build_table_orm`` is patched to return the local ``User`` model while
    # the sampler is constructed.
    with patch.object(SQASampler, "build_table_orm", return_value=User):
        users_sampler = SQASampler(
            service_connection_config=cls.sqlite_conn,
            ometa_client=None,
            entity=None,
        )
    cls.sqa_profiler_interface = SQAProfilerInterface(
        cls.sqlite_conn,
        None,
        cls.table_entity,
        None,
        users_sampler,
        1,
        43200,
    )

    cls.engine = cls.sqa_profiler_interface.session.get_bind()

    User.__table__.create(bind=cls.engine)

    john = User(
        name="John",
        fullname="John Doe",
        nickname="johnny b goode",
        comments="no comments",
        age=30,
        dob=datetime.datetime(1992, 5, 17),
        tob=datetime.time(11, 2, 32),
        doe=datetime.date(2020, 1, 12),
    )
    jane = User(
        name="Jane",
        fullname="Jone Doe",
        nickname=None,
        comments="maybe some comments",
        age=31,
        dob=datetime.datetime(1991, 4, 4),
        tob=datetime.time(10, 1, 31),
        doe=datetime.date(2009, 11, 11),
    )
    # Second "John": repeats name/fullname and leaves several columns NULL.
    john_with_nulls = User(
        name="John",
        fullname="John Doe",
        nickname=None,
        comments=None,
        age=None,
        dob=datetime.datetime(1982, 2, 2),
        tob=datetime.time(9, 3, 25),
        doe=datetime.date(2012, 12, 1),
    )
    cls.sqa_profiler_interface.session.add_all([john, jane, john_with_nulls])
    cls.sqa_profiler_interface.session.commit()
|
|
|
|
def test_count(self):
    """
    COUNT on the ``name`` column must match the three seeded rows
    """
    column_results = (
        Profiler(
            Metrics.COUNT.value,
            profiler_interface=self.sqa_profiler_interface,
        )
        .compute_metrics()
        ._column_results
    )

    # Results are looked up by column name first, then by metric name.
    name_metrics = column_results.get(User.name.name)
    assert name_metrics.get(Metrics.COUNT.name) == 3
|
|
|
|
def test_min(self):
    """
    MIN on an integer column and on a text column
    """
    column_results = (
        Profiler(
            Metrics.MIN.value,
            profiler_interface=self.sqa_profiler_interface,
        )
        .compute_metrics()
        ._column_results
    )

    # Results are looked up by column name first, then by metric name.
    assert column_results.get(User.age.name).get(Metrics.MIN.name) == 30
    # 11 matches len("no comments") -- presumably MIN works on the string
    # length for text columns; confirm against the metric implementation.
    assert column_results.get(User.comments.name).get(Metrics.MIN.name) == 11
|
|
|
|
def test_std(self):
    """
    STDDEV on the ``age`` column
    """
    column_results = (
        Profiler(
            Metrics.STDDEV.value,
            profiler_interface=self.sqa_profiler_interface,
        )
        .compute_metrics()
        ._column_results
    )
    age_metrics = column_results.get(User.age.name)
    # The custom SQLite STD implementation yields the squared STD, so this
    # expected value is only meaningful for the test backend.
    assert age_metrics.get(Metrics.STDDEV.name) == 0.25
|
|
|
|
def test_earliest_time(self):
    """
    MIN on the DateTime, Time and Date columns returns the earliest value
    """
    column_results = (
        Profiler(
            Metrics.MIN.value,
            profiler_interface=self.sqa_profiler_interface,
        )
        .compute_metrics()
        ._column_results
    )

    earliest_by_column = {
        User.dob.name: datetime.datetime(1982, 2, 2),
        User.tob.name: datetime.time(9, 3, 25),
        User.doe.name: datetime.date(2009, 11, 11),
    }
    for column_name, earliest in earliest_by_column.items():
        assert column_results.get(column_name).get(Metrics.MIN.name) == earliest
|
|
|
|
def test_latest_time(self):
    """
    MAX on the DateTime, Time and Date columns returns the latest value
    """
    column_results = (
        Profiler(
            Metrics.MAX.value,
            profiler_interface=self.sqa_profiler_interface,
        )
        .compute_metrics()
        ._column_results
    )

    latest_by_column = {
        User.dob.name: datetime.datetime(1992, 5, 17),
        User.tob.name: datetime.time(11, 2, 32),
        User.doe.name: datetime.date(2020, 1, 12),
    }
    for column_name, latest in latest_by_column.items():
        assert column_results.get(column_name).get(Metrics.MAX.name) == latest
|
|
|
|
def test_null_count(self):
    """
    NULL_COUNT on ``nickname``, which is NULL in two of the seeded rows
    """
    column_results = (
        Profiler(
            Metrics.NULL_COUNT.value,
            profiler_interface=self.sqa_profiler_interface,
        )
        .compute_metrics()
        ._column_results
    )

    nickname_metrics = column_results.get(User.nickname.name)
    assert nickname_metrics.get(Metrics.NULL_COUNT.name) == 2
|
|
|
|
def test_null_ratio(self):
    """
    NULL_RATIO computed together with COUNT and NULL_COUNT
    """
    # The ratio is requested alongside the two metrics it is built from.
    requested_metrics = (
        Metrics.COUNT.value,
        Metrics.NULL_COUNT.value,
        Metrics.NULL_RATIO.value,
    )

    column_results = (
        Profiler(
            *requested_metrics,
            profiler_interface=self.sqa_profiler_interface,
        )
        .compute_metrics()
        ._column_results
    )

    nickname_null_ratio = column_results.get(User.nickname.name).get(
        Metrics.NULL_RATIO.name
    )
    assert str(round(nickname_null_ratio, 2)) == "0.67"
|
|
|
|
def test_non_numeric(self):
    """
    NULL_COUNT and NULL_RATIO on a float column holding NaN and infinities
    """

    class NonNumericNumbers(Base):
        __tablename__ = "non_numeric_numbers"
        id = Column(Integer, primary_key=True)
        float_col = Column(Float())

    NonNumericNumbers.__table__.create(bind=self.engine)
    with patch.object(
        SQASampler, "build_table_orm", return_value=NonNumericNumbers
    ):
        float_sampler = SQASampler(
            service_connection_config=self.sqlite_conn,
            ometa_client=None,
            entity=self.table_entity,
        )
    sqa_profiler_interface = SQAProfilerInterface(
        self.sqlite_conn,
        None,
        self.table_entity,
        None,
        float_sampler,
        1,
        43200,
    )

    # Six rows: NaN, both infinities, two ordinary numbers and one NULL.
    seeded_values = (math.nan, math.inf, -math.inf, 10, 20, None)
    rows = [NonNumericNumbers(float_col=value) for value in seeded_values]
    sqa_profiler_interface.session.add_all(rows)
    sqa_profiler_interface.session.commit()

    # The ratio is requested alongside the two metrics it is built from.
    column_results = (
        Profiler(
            Metrics.COUNT.value,
            Metrics.NULL_COUNT.value,
            Metrics.NULL_RATIO.value,
            profiler_interface=sqa_profiler_interface,
        )
        .compute_metrics()
        ._column_results
    )

    float_col_name = NonNumericNumbers.float_col.name
    assert column_results.get(float_col_name).get(Metrics.NULL_COUNT.name) == 2
    null_ratio = column_results.get(float_col_name).get(Metrics.NULL_RATIO.name)
    assert str(round(null_ratio, 2)) == "0.33"
|
|
|
|
def test_table_row_count(self):
    """
    ROW_COUNT is reported in the table-level results
    """
    table_results = (
        Profiler(
            Metrics.ROW_COUNT.value,
            profiler_interface=self.sqa_profiler_interface,
        )
        .compute_metrics()
        ._table_results
    )
    assert table_results.get(Metrics.ROW_COUNT.name) == 3
|
|
|
|
def test_table_column_count(self):
    """
    COLUMN_COUNT bound to the ``User`` table reports its nine columns
    """
    bind_table = add_props(table=User)
    column_count_metric = bind_table(Metrics.COLUMN_COUNT.value)
    table_results = (
        Profiler(
            column_count_metric,
            profiler_interface=self.sqa_profiler_interface,
        )
        .compute_metrics()
        ._table_results
    )
    assert table_results.get(Metrics.COLUMN_COUNT.name) == 9
|
|
|
|
def test_avg(self):
    """
    Check avg for distinct types

    The MEAN metric is computed once for every column; the integer, string
    and text expectations are all read from that single result set instead
    of re-running an identical profiler for each column type.
    """
    profiler = Profiler(
        Metrics.MEAN.value,
        profiler_interface=self.sqa_profiler_interface,
    )
    res = profiler.compute_metrics()._column_results

    # Integer
    assert res.get(User.age.name)[Metrics.MEAN.name] == 30.5

    # String -- 4.0 matches the length of every seeded name (presumably the
    # mean is taken over string lengths; confirm against the metric).
    assert res.get(User.name.name)[Metrics.MEAN.name] == 4.0

    # Text
    assert res.get(User.comments.name)[Metrics.MEAN.name] == 15.0
|
|
|
|
def test_duplicate_count(self):
    """
    DUPLICATE_COUNT computed together with COUNT and DISTINCT_COUNT
    """
    profiler = Profiler(
        Metrics.COUNT.value,
        Metrics.DISTINCT_COUNT.value,
        Metrics.DUPLICATE_COUNT.value,
        profiler_interface=self.sqa_profiler_interface,
    )
    column_results = profiler.compute_metrics()._column_results

    age_metrics = column_results.get(User.age.name)
    assert age_metrics[Metrics.DUPLICATE_COUNT.name] == 0
|
|
|
|
def test_histogram(self):
    """
    Check histogram computation

    HISTOGRAM is requested together with COUNT, MIN, MAX, the first and
    third quartiles and IQR. The metric objects are passed straight to the
    profiler so that no local name shadows the ``min``/``max`` builtins.
    """
    profiler = Profiler(
        Metrics.HISTOGRAM.value,
        Metrics.COUNT.value,
        Metrics.MIN.value,
        Metrics.MAX.value,
        Metrics.FIRST_QUARTILE.value,
        Metrics.THIRD_QUARTILE.value,
        Metrics.IQR.value,
        profiler_interface=self.sqa_profiler_interface,
    )
    res = profiler.compute_metrics()._column_results

    age_histogram = res.get(User.age.name)[Metrics.HISTOGRAM.name]
    id_histogram = res.get(User.id.name)[Metrics.HISTOGRAM.name]
    comments_histogram = res.get(User.comments.name)[Metrics.HISTOGRAM.name]

    assert age_histogram
    assert len(age_histogram["frequencies"]) == 1
    assert id_histogram
    assert len(id_histogram["frequencies"]) == 2
    # The text column is expected to carry no histogram at all.
    assert comments_histogram is None
|
|
|
|
def test_cardinality_distribution(self):
    """
    Cardinality distribution of a string column with repeated values
    """
    profiler = Profiler(
        Metrics.CARDINALITY_DISTRIBUTION.value,
        Metrics.COUNT.value,
        Metrics.DISTINCT_COUNT.value,
        profiler_interface=self.sqa_profiler_interface,
    )
    column_results = profiler.compute_metrics()._column_results

    # ``name`` contains "John" twice in the seeded data.
    distribution = column_results.get(User.name.name)[
        Metrics.CARDINALITY_DISTRIBUTION.name
    ]

    assert distribution
    expected_keys = ("categories", "counts", "percentages")
    for key in expected_keys:
        assert key in distribution
    for key in expected_keys:
        assert len(distribution[key]) > 0

    # "John" must be reported as a category with a count of two.
    assert "John" in distribution["categories"]
    john_position = distribution["categories"].index("John")
    assert distribution["counts"][john_position] == 2

    # Percentages must add up to roughly 100%.
    total_percentage = sum(distribution["percentages"])
    assert abs(total_percentage - 100.0) < 0.1
|
|
|
|
def test_cardinality_distribution_all_distinct(self):
    """
    Cardinality distribution is None for a column whose values are all distinct
    """
    profiler = Profiler(
        Metrics.CARDINALITY_DISTRIBUTION.value,
        Metrics.COUNT.value,
        Metrics.DISTINCT_COUNT.value,
        profiler_interface=self.sqa_profiler_interface,
    )
    column_results = profiler.compute_metrics()._column_results

    # ``id`` is the primary key, so every value is distinct.
    id_metrics = column_results.get(User.id.name)
    assert id_metrics[Metrics.CARDINALITY_DISTRIBUTION.name] is None
|
|
|
|
def test_cardinality_distribution_unsupported_type(self):
    """
    Cardinality distribution is None for a column of an unsupported type
    """
    profiler = Profiler(
        Metrics.CARDINALITY_DISTRIBUTION.value,
        Metrics.COUNT.value,
        Metrics.DISTINCT_COUNT.value,
        profiler_interface=self.sqa_profiler_interface,
    )
    column_results = profiler.compute_metrics()._column_results

    # ``age`` is an integer column (not concatenable).
    age_metrics = column_results.get(User.age.name)
    assert age_metrics[Metrics.CARDINALITY_DISTRIBUTION.name] is None
|
|
|
|
def test_cardinality_distribution_empty_table(self):
    """
    Cardinality distribution is None when the profiled table has no rows
    """

    # Dedicated table that is created but never populated.
    class EmptyUser(Base):
        __tablename__ = "empty_users"
        id = Column(Integer, primary_key=True)
        name = Column(String(256))

    EmptyUser.__table__.create(bind=self.engine)

    with patch.object(SQASampler, "build_table_orm", return_value=EmptyUser):
        empty_sampler = SQASampler(
            service_connection_config=self.sqlite_conn,
            ometa_client=None,
            entity=None,
        )
    empty_profiler_interface = SQAProfilerInterface(
        self.sqlite_conn,
        None,
        self.table_entity,
        None,
        empty_sampler,
        1,
        43200,
    )

    profiler = Profiler(
        Metrics.CARDINALITY_DISTRIBUTION.value,
        Metrics.COUNT.value,
        Metrics.DISTINCT_COUNT.value,
        profiler_interface=empty_profiler_interface,
    )
    column_results = profiler.compute_metrics()._column_results

    name_metrics = column_results.get(EmptyUser.name.name)
    assert name_metrics[Metrics.CARDINALITY_DISTRIBUTION.name] is None
|
|
|
|
def test_like_count(self):
    """
    LIKE_COUNT on ``name`` for two patterns
    """
    # SQLite's LIKE is case-insensitive by default, so this only verifies that
    # the metric runs correctly, not the matching logic itself.
    expected_by_pattern = (("J%", 3), ("Jo%", 2))

    for pattern, expected_matches in expected_by_pattern:
        like_metric = add_props(expression=pattern)(Metrics.LIKE_COUNT.value)
        profiler = Profiler(
            like_metric,
            profiler_interface=self.sqa_profiler_interface,
        )
        column_results = profiler.compute_metrics()._column_results

        name_metrics = column_results.get(User.name.name)
        assert name_metrics[Metrics.LIKE_COUNT.name] == expected_matches
|
|
|
|
def test_ilike_count(self):
    """
    ILIKE_COUNT (case-insensitive LIKE) on ``name`` for two patterns
    """
    expected_by_pattern = (("j%", 3), ("ja%", 1))

    for pattern, expected_matches in expected_by_pattern:
        ilike_metric = add_props(expression=pattern)(Metrics.ILIKE_COUNT.value)
        profiler = Profiler(
            ilike_metric,
            profiler_interface=self.sqa_profiler_interface,
        )
        column_results = profiler.compute_metrics()._column_results

        name_metrics = column_results.get(User.name.name)
        assert name_metrics[Metrics.ILIKE_COUNT.name] == expected_matches
|
|
|
|
def test_like_ratio(self):
    """
    LIKE_RATIO computed together with LIKE_COUNT and COUNT
    """
    like_metric = add_props(expression="J%")(Metrics.LIKE_COUNT.value)
    profiler = Profiler(
        like_metric,
        Metrics.COUNT.value,
        Metrics.LIKE_RATIO.value,
        profiler_interface=self.sqa_profiler_interface,
    )
    column_results = profiler.compute_metrics()._column_results

    name_metrics = column_results.get(User.name.name)
    assert name_metrics[Metrics.LIKE_RATIO.name] == 1.0
|
|
|
|
def test_ilike_ratio(self):
    """
    ILIKE_RATIO computed together with ILIKE_COUNT and COUNT
    """
    # SQLite's LIKE is case-insensitive by default, so this only verifies that
    # the metric runs correctly, not the matching logic itself.
    ilike_metric = add_props(expression="J%")(Metrics.ILIKE_COUNT.value)
    profiler = Profiler(
        ilike_metric,
        Metrics.COUNT.value,
        Metrics.ILIKE_RATIO.value,
        profiler_interface=self.sqa_profiler_interface,
    )
    column_results = profiler.compute_metrics()._column_results

    name_metrics = column_results.get(User.name.name)
    assert name_metrics[Metrics.ILIKE_RATIO.name] == 1.0
|
|
|
|
def test_max(self):
    """
    MAX on an integer column and on a string column
    """
    profiler = Profiler(
        Metrics.MAX.value,
        profiler_interface=self.sqa_profiler_interface,
    )
    column_results = profiler.compute_metrics()._column_results

    assert column_results.get(User.age.name)[Metrics.MAX.name] == 31
    # 4 matches the length of every seeded name -- presumably MAX works on the
    # string length for string columns; confirm against the metric.
    assert column_results.get(User.name.name).get(Metrics.MAX.name) == 4
|
|
|
|
def test_min_length(self):
    """
    Check MIN_LENGTH metric

    The metric is computed once for every column; the integer, string and
    text expectations are all read from that single result set instead of
    re-running an identical profiler for each column type.
    """
    profiler = Profiler(
        Metrics.MIN_LENGTH.value,
        profiler_interface=self.sqa_profiler_interface,
    )
    res = profiler.compute_metrics()._column_results

    # Integer: no length is reported
    assert res.get(User.age.name).get(Metrics.MIN_LENGTH.name) is None

    # String
    assert res.get(User.name.name)[Metrics.MIN_LENGTH.name] == 4

    # Text: 11 == len("no comments")
    assert res.get(User.comments.name)[Metrics.MIN_LENGTH.name] == 11
|
|
|
|
def test_max_length(self):
    """
    Check MAX_LENGTH metric

    The metric is computed once for every column; the integer, string and
    text expectations are all read from that single result set instead of
    re-running an identical profiler for each column type.
    """
    profiler = Profiler(
        Metrics.MAX_LENGTH.value,
        profiler_interface=self.sqa_profiler_interface,
    )
    res = profiler.compute_metrics()._column_results

    # Integer: no length is reported
    assert res.get(User.age.name).get(Metrics.MAX_LENGTH.name) is None

    # String
    assert res.get(User.name.name)[Metrics.MAX_LENGTH.name] == 4

    # Text: 19 == len("maybe some comments")
    assert res.get(User.comments.name)[Metrics.MAX_LENGTH.name] == 19
|
|
|
|
def test_sum(self):
    """
    Check SUM Metric

    The metric is computed once; both the integer and the string
    expectations are read from that single result set instead of re-running
    an identical profiler.
    """
    profiler = Profiler(
        Metrics.SUM.value,
        profiler_interface=self.sqa_profiler_interface,
    )
    res = profiler.compute_metrics()._column_results

    # Integer: 30 + 31, the NULL age does not contribute
    assert res.get(User.age.name)[Metrics.SUM.name] == 61

    # String: 12 matches the summed lengths of the three four-letter names
    # (presumably SUM works on string lengths; confirm against the metric).
    assert res.get(User.name.name).get(Metrics.SUM.name) == 12
|
|
|
|
def test_unique_count(self):
    """
    UNIQUE_COUNT on ``name``: only "Jane" occurs exactly once
    """
    profiler = Profiler(
        Metrics.UNIQUE_COUNT.value,
        profiler_interface=self.sqa_profiler_interface,
    )
    column_results = profiler.compute_metrics()._column_results

    name_metrics = column_results.get(User.name.name)
    assert name_metrics[Metrics.UNIQUE_COUNT.name] == 1
|
|
|
|
def test_unique_ratio(self):
    """
    UNIQUE_RATIO computed together with COUNT and UNIQUE_COUNT
    """
    profiler = Profiler(
        Metrics.COUNT.value,
        Metrics.UNIQUE_COUNT.value,
        Metrics.UNIQUE_RATIO.value,
        profiler_interface=self.sqa_profiler_interface,
    )
    column_results = profiler.compute_metrics()._column_results

    name_unique_ratio = column_results.get(User.name.name)[
        Metrics.UNIQUE_RATIO.name
    ]
    assert str(round(name_unique_ratio, 2)) == "0.33"
|
|
|
|
def test_distinct_count(self):
    """
    DISTINCT_COUNT on ``name``: the seeded names are "John" and "Jane"
    """
    profiler = Profiler(
        Metrics.DISTINCT_COUNT.value,
        profiler_interface=self.sqa_profiler_interface,
    )
    column_results = profiler.compute_metrics()._column_results

    name_metrics = column_results.get(User.name.name)
    assert name_metrics[Metrics.DISTINCT_COUNT.name] == 2.0
|
|
|
|
def test_distinct_ratio(self):
    """
    DISTINCT_RATIO computed together with COUNT and DISTINCT_COUNT
    """
    profiler = Profiler(
        Metrics.COUNT.value,
        Metrics.DISTINCT_COUNT.value,
        Metrics.DISTINCT_RATIO.value,
        profiler_interface=self.sqa_profiler_interface,
    )
    column_results = profiler.compute_metrics()._column_results

    name_distinct_ratio = column_results.get(User.name.name)[
        Metrics.DISTINCT_RATIO.name
    ]
    assert str(round(name_distinct_ratio, 2)) == "0.67"
|
|
|
|
def test_count_in_set(self):
    """
    COUNT_IN_SET on ``name`` for a one-value and a two-value set
    """
    expected_by_values = (
        (["John"], 2.0),
        (["John", "Jane"], 3),
    )

    for allowed_values, expected_count in expected_by_values:
        in_set_metric = add_props(values=allowed_values)(
            Metrics.COUNT_IN_SET.value
        )
        profiler = Profiler(
            in_set_metric,
            profiler_interface=self.sqa_profiler_interface,
        )
        column_results = profiler.compute_metrics()._column_results

        name_metrics = column_results.get(User.name.name)
        assert name_metrics[Metrics.COUNT_IN_SET.name] == expected_count
|
|
|
|
def test_histogram_empty(self):
    """
    HISTOGRAM on a table without rows yields no result for ``age``
    """

    # Dedicated table that is created but never populated.
    class EmptyUser2(Base):
        __tablename__ = "empty_users2"
        id = Column(Integer, primary_key=True)
        name = Column(String(256))
        fullname = Column(String(256))
        nickname = Column(String(256))
        comments = Column(TEXT)
        age = Column(Integer)

    EmptyUser2.__table__.create(bind=self.engine)

    with patch.object(SQASampler, "build_table_orm", return_value=EmptyUser2):
        empty_sampler = SQASampler(
            service_connection_config=self.sqlite_conn,
            ometa_client=None,
            entity=self.table_entity,
        )
    sqa_profiler_interface = SQAProfilerInterface(
        self.sqlite_conn,
        None,
        self.table_entity,
        None,
        empty_sampler,
        1,
        43200,
    )

    profiler = Profiler(
        Metrics.HISTOGRAM.value,
        profiler_interface=sqa_profiler_interface,
    )
    column_results = profiler.compute_metrics()._column_results

    assert column_results.get(EmptyUser2.age.name) is None
|
|
|
|
def test_not_like_count(self):
    """
    NOT_LIKE_COUNT on ``name`` for several patterns, one sub-test per pattern
    """
    # SQLite's LIKE is case-insensitive by default, so this only verifies that
    # the metric runs correctly, not the matching logic itself.
    expected_by_pattern = {
        "b%": 0,
        "Jo%": 2,
        "Ja%": 1,
        "J%": 3,
    }

    for expression, expected in expected_by_pattern.items():
        with self.subTest(expression=expression, expected=expected):
            bind_expression = add_props(expression=expression)
            not_like_metric = bind_expression(Metrics.NOT_LIKE_COUNT.value)
            profiler = Profiler(
                not_like_metric,
                profiler_interface=self.sqa_profiler_interface,
            )
            column_results = profiler.compute_metrics()._column_results

            name_metrics = column_results.get(User.name.name)
            assert name_metrics[Metrics.NOT_LIKE_COUNT.name] == expected
|
|
|
|
def test_median(self):
    """
    MEDIAN on an integer column and on a text column
    """
    profiler = Profiler(
        Metrics.MEDIAN.value,
        profiler_interface=self.sqa_profiler_interface,
    )
    column_results = profiler.compute_metrics()._column_results

    expected_medians = ((User.age.name, 30), (User.comments.name, 11))
    for column_name, expected in expected_medians:
        assert column_results.get(column_name)[Metrics.MEDIAN.name] == expected
|
|
|
|
def test_first_quartile(self):
    """
    FIRST_QUARTILE on an integer column and on a text column
    """
    profiler = Profiler(
        Metrics.FIRST_QUARTILE.value,
        profiler_interface=self.sqa_profiler_interface,
    )
    column_results = profiler.compute_metrics()._column_results

    metric_name = Metrics.FIRST_QUARTILE.name
    expected_quartiles = ((User.age.name, 30), (User.comments.name, 11))
    for column_name, expected in expected_quartiles:
        assert column_results.get(column_name)[metric_name] == expected
|
|
|
|
def test_third_quartile(self):
    """
    THIRD_QUARTILE on an integer column and on a text column
    """
    profiler = Profiler(
        Metrics.THIRD_QUARTILE.value,
        profiler_interface=self.sqa_profiler_interface,
    )
    column_results = profiler.compute_metrics()._column_results

    metric_name = Metrics.THIRD_QUARTILE.name
    expected_quartiles = ((User.age.name, 31), (User.comments.name, 19))
    for column_name, expected in expected_quartiles:
        assert column_results.get(column_name)[metric_name] == expected
|
|
|
|
def test_iqr(self):
    """IQR computed together with the first and third quartiles"""
    profiler = Profiler(
        Metrics.FIRST_QUARTILE.value,
        Metrics.THIRD_QUARTILE.value,
        Metrics.IQR.value,
        profiler_interface=self.sqa_profiler_interface,
    )
    column_results = profiler.compute_metrics()._column_results

    expected_ranges = ((User.age.name, 1), (User.comments.name, 8))
    for column_name, expected in expected_ranges:
        assert column_results.get(column_name)[Metrics.IQR.name] == expected
|
|
|
|
def test_sum_function(self):
    """Run the overridden ``SumFn`` directly through the session"""
    age_sum_query = self.sqa_profiler_interface.session.query(
        SumFn(User.age)
    ).select_from(User)
    total_age = age_sum_query.scalar()

    assert total_age == 61
|
|
|
|
def test_table_custom_metric(self):
    """
    Check table-level custom metrics

    Two SQL custom metrics are attached to the table entity and the values
    reported in the table results are compared with the expected ones. The
    comparison also fails when a metric is absent from the results, so the
    test cannot pass without the metrics having been computed.
    """
    table_entity = Table(
        id=uuid4(),
        name="user",
        columns=[
            EntityColumn(
                name=ColumnName("id"),
                dataType=DataType.INT,
            )
        ],
        customMetrics=[
            CustomMetric(
                name="CustomerBornAfter1991",
                expression="SELECT COUNT(id) FROM users WHERE dob > '1991-01-01'",
            ),
            CustomMetric(
                name="AverageAge",
                expression="SELECT SUM(age)/COUNT(*) FROM users",
            ),
        ],
    )
    with patch.object(SQASampler, "build_table_orm", return_value=User):
        sampler = SQASampler(
            service_connection_config=self.sqlite_conn,
            ometa_client=None,
            entity=None,
        )
    sqa_profiler_interface = SQAProfilerInterface(
        self.sqlite_conn,
        None,
        table_entity,
        None,
        sampler,
        1,
        43200,
    )

    profiler = Profiler(
        profiler_interface=sqa_profiler_interface,
    )
    metrics = profiler.compute_metrics()

    expected = {"CustomerBornAfter1991": 2, "AverageAge": 20.0}
    # Gather the reported value of each expected metric; comparing whole
    # dicts makes a missing metric a failure instead of a silent pass.
    observed = {
        metric.name: metric.value
        for computed in metrics._table_results.values()
        for metric in computed
        if metric.name in expected
    }
    assert observed == expected
|
|
|
|
def test_column_custom_metric(self):
    """
    Check column-level custom metrics

    Two SQL custom metrics are attached to the ``id`` column and the values
    reported under ``customMetrics`` in the column results are compared with
    the expected ones. The comparison also fails when a metric is absent
    from the results, so the test cannot pass without the metrics having
    been computed.
    """
    table_entity = Table(
        id=uuid4(),
        name="user",
        columns=[
            EntityColumn(
                name=ColumnName("id"),
                dataType=DataType.INT,
                customMetrics=[
                    CustomMetric(
                        name="CustomerBornAfter1991",
                        columnName="id",
                        expression="SELECT SUM(id) FROM users WHERE dob > '1991-01-01'",
                    ),
                    CustomMetric(
                        name="AverageAge",
                        columnName="id",
                        expression="SELECT SUM(age)/COUNT(*) FROM users",
                    ),
                ],
            )
        ],
    )
    with patch.object(SQASampler, "build_table_orm", return_value=User):
        sampler = SQASampler(
            service_connection_config=self.sqlite_conn,
            ometa_client=None,
            entity=None,
        )
    sqa_profiler_interface = SQAProfilerInterface(
        self.sqlite_conn,
        None,
        table_entity,
        None,
        sampler,
        1,
        43200,
    )

    profiler = Profiler(
        profiler_interface=sqa_profiler_interface,
    )
    metrics = profiler.compute_metrics()

    expected = {"CustomerBornAfter1991": 3.0, "AverageAge": 20.0}
    # Gather the reported value of each expected metric; comparing whole
    # dicts makes a missing metric a failure instead of a silent pass.
    observed = {
        metric.name: metric.value
        for column_result in metrics._column_results.values()
        for metric in column_result.get("customMetrics", [])
        if metric.name in expected
    }
    assert observed == expected
|
|
|
|
@classmethod
def tearDownClass(cls) -> None:
    """
    Delete the SQLite database file, then run the parent class teardown
    """
    database_file = cls.db_path
    os.remove(database_file)
    return super().tearDownClass()
|