# Copyright 2021 Collate # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # http://www.apache.org/licenses/LICENSE-2.0 # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Test Metrics behavior """ from unittest import TestCase from sqlalchemy import TEXT, Column, Integer, String, create_engine from sqlalchemy.orm import declarative_base from metadata.orm_profiler.engines import create_and_bind_session from metadata.orm_profiler.metrics.core import add_props from metadata.orm_profiler.metrics.registry import Metrics from metadata.orm_profiler.profiles.core import Profiler Base = declarative_base() class User(Base): __tablename__ = "users" id = Column(Integer, primary_key=True) name = Column(String(256)) fullname = Column(String(256)) nickname = Column(String(256)) comments = Column(TEXT) age = Column(Integer) class MetricsTest(TestCase): """ Run checks on different metrics """ engine = create_engine("sqlite+pysqlite:///:memory:", echo=False, future=True) session = create_and_bind_session(engine) @classmethod def setUpClass(cls) -> None: """ Prepare Ingredients """ User.__table__.create(bind=cls.engine) data = [ User( name="John", fullname="John Doe", nickname="johnny b goode", comments="no comments", age=30, ), User( name="Jane", fullname="Jone Doe", nickname=None, comments="maybe some comments", age=31, ), ] cls.session.add_all(data) cls.session.commit() def test_min(self): """ Check the Min metric """ min_age = Metrics.MIN.value profiler = Profiler( min_age, session=self.session, table=User, use_cols=[User.age] ) res = profiler.execute()._column_results # Note how we can get the result value by passing the metrics name assert res.get(User.age.name).get(Metrics.MIN.name) == 30 def test_std(self): """ Check STD metric """ std_age = Metrics.STDDEV.value profiler = Profiler( std_age, session=self.session, table=User, use_cols=[User.age] ) res = profiler.execute()._column_results # SQLITE STD custom implementation returns the squared STD. # Only useful for testing purposes assert res.get(User.age.name).get(Metrics.STDDEV.name) == 0.25 def test_null_count(self): """ Check null count """ null_count = Metrics.NULL_COUNT.value profiler = Profiler( null_count, session=self.session, table=User, use_cols=[User.nickname] ) res = profiler.execute()._column_results assert res.get(User.nickname.name).get(Metrics.NULL_COUNT.name) == 1 def test_null_ratio(self): """ Check composed metric run """ count = Metrics.COUNT.value null_count = Metrics.NULL_COUNT.value # Build the ratio based on the other two metrics null_ratio = Metrics.NULL_RATIO.value profiler = Profiler( count, null_count, null_ratio, session=self.session, table=User, use_cols=[User.nickname], ) res = profiler.execute()._column_results assert res.get(User.nickname.name).get(Metrics.NULL_RATIO.name) == 0.5 def test_table_count(self): """ Check Table Metric run """ table_count = Metrics.ROW_COUNT.value profiler = Profiler(table_count, session=self.session, table=User) res = profiler.execute()._table_results assert res.get(Metrics.ROW_COUNT.name) == 2 def test_avg(self): """ Check avg for distinct types """ # Integer avg = Metrics.MEAN.value res = ( Profiler(avg, session=self.session, table=User, use_cols=[User.age]) .execute() ._column_results ) assert res.get(User.age.name)[Metrics.MEAN.name] == 30.5 # String avg = Metrics.MEAN.value res = ( Profiler(avg, session=self.session, table=User, use_cols=[User.name]) .execute() ._column_results ) assert res.get(User.name.name)[Metrics.MEAN.name] == 4.0 # Text avg = Metrics.MEAN.value res = ( Profiler(avg, session=self.session, table=User, use_cols=[User.comments]) .execute() ._column_results ) assert res.get(User.comments.name)[Metrics.MEAN.name] == 15.0 def test_duplicate_count(self): """ Check composed duplicate count """ count = Metrics.COUNT.value unique = Metrics.UNIQUE_COUNT.value dup_count = Metrics.DUPLICATE_COUNT.value res = ( Profiler( count, unique, dup_count, session=self.session, table=User, use_cols=[User.age], ) .execute() ._column_results ) assert res.get(User.age.name)[Metrics.DUPLICATE_COUNT.name] == 0 def test_histogram(self): """ Check histogram computation """ hist = add_props(bins=5)(Metrics.HISTOGRAM.value) res = ( Profiler(hist, session=self.session, table=User, use_cols=[User.age]) .execute() ._column_results ) assert res.get(User.age.name)[Metrics.HISTOGRAM.name] assert ( len(res.get(User.age.name)[Metrics.HISTOGRAM.name]["frequencies"]) == 2 # Too little values ) def test_like_count(self): """ Check LIKE count """ # In sqlite, LIKE is insensitive by default, so we just check here # that the metrics runs correctly rather than the implementation logic. like = add_props(expression="J%")(Metrics.LIKE_COUNT.value) res = ( Profiler(like, session=self.session, table=User, use_cols=[User.age]) .execute() ._column_results ) assert res.get(User.age.name)[Metrics.LIKE_COUNT.name] == 2 # Running safely # with pytest.raises(AttributeError): # Profiler( # Metrics.LIKE_COUNT.value, # session=self.session, # table=User, # use_cols=[User.age], # ).execute() def test_ilike_count(self): """ Check ILIKE count: case-insensitive LIKE """ ilike = add_props(expression="J%")(Metrics.ILIKE_COUNT.value) res = ( Profiler(ilike, session=self.session, table=User, use_cols=[User.age]) .execute() ._column_results ) assert res.get(User.age.name)[Metrics.ILIKE_COUNT.name] == 2 # Running safely # with pytest.raises(AttributeError): # Profiler( # Metrics.ILIKE_COUNT.value, # session=self.session, # table=User, # use_cols=[User.age], # ).execute() def test_like_ratio(self): """ Check LIKE ratio """ like = add_props(expression="J%")(Metrics.LIKE_COUNT.value) count = Metrics.COUNT.value like_ratio = Metrics.LIKE_RATIO.value res = ( Profiler( like, count, like_ratio, session=self.session, table=User, use_cols=[User.name], ) .execute() ._column_results ) assert res.get(User.name.name)[Metrics.LIKE_RATIO.name] == 1.0 def test_ilike_ratio(self): """ Check LIKE ratio """ # In sqlite, LIKE is insensitive by default, so we just check here # that the metrics runs correctly rather than the implementation logic. ilike = add_props(expression="J%")(Metrics.ILIKE_COUNT.value) count = Metrics.COUNT.value ilike_ratio = Metrics.ILIKE_RATIO.value res = ( Profiler( ilike, count, ilike_ratio, session=self.session, table=User, use_cols=[User.name], ) .execute() ._column_results ) assert res.get(User.name.name)[Metrics.ILIKE_RATIO.name] == 1.0 def test_max(self): """ Check MAX metric """ _max = Metrics.MAX.value res = ( Profiler(_max, session=self.session, table=User, use_cols=[User.age]) .execute() ._column_results ) assert res.get(User.age.name)[Metrics.MAX.name] == 31 # TMP disable min/max on strings # res = ( # Profiler(_max, session=self.session, table=User, use_cols=[User.name]) # .execute() # ._column_results # ) # assert res.get(User.name.name)[Metrics.MAX.name] == "John" def test_min_length(self): """ Check MIN_LENGTH metric """ min_length = Metrics.MIN_LENGTH.value # Integer res = ( Profiler(min_length, session=self.session, table=User, use_cols=[User.age]) .execute() ._column_results ) assert res.get(User.age.name).get(Metrics.MIN_LENGTH.name) is None # String res = ( Profiler(min_length, session=self.session, table=User, use_cols=[User.name]) .execute() ._column_results ) assert res.get(User.name.name)[Metrics.MIN_LENGTH.name] == 4 # Text res = ( Profiler( min_length, session=self.session, table=User, use_cols=[User.comments] ) .execute() ._column_results ) assert res.get(User.comments.name)[Metrics.MIN_LENGTH.name] == 11 def test_max_length(self): """ Check MAX_LENGTH metric """ max_length = Metrics.MAX_LENGTH.value # Integer res = ( Profiler(max_length, session=self.session, table=User, use_cols=[User.age]) .execute() ._column_results ) assert res.get(User.age.name).get(Metrics.MAX_LENGTH.name) is None # String res = ( Profiler(max_length, session=self.session, table=User, use_cols=[User.name]) .execute() ._column_results ) assert res.get(User.name.name)[Metrics.MAX_LENGTH.name] == 4 # Text res = ( Profiler( max_length, session=self.session, table=User, use_cols=[User.comments] ) .execute() ._column_results ) assert res.get(User.comments.name)[Metrics.MAX_LENGTH.name] == 19 def test_sum(self): """ Check SUM Metric """ _sum = Metrics.SUM.value res = ( Profiler(_sum, session=self.session, table=User, use_cols=[User.age]) .execute() ._column_results ) assert res.get(User.age.name)[Metrics.SUM.name] == 61 res = ( Profiler(_sum, session=self.session, table=User, use_cols=[User.name]) .execute() ._column_results ) assert res.get(User.name.name).get(Metrics.SUM.name) is None def test_unique_count(self): """ Check Unique Count metric """ unique_count = Metrics.UNIQUE_COUNT.value res = ( Profiler( unique_count, session=self.session, table=User, use_cols=[User.age] ) .execute() ._column_results ) assert res.get(User.age.name)[Metrics.UNIQUE_COUNT.name] == 2 def test_unique_ratio(self): """ Check Unique Count metric """ count = Metrics.COUNT.value unique_count = Metrics.UNIQUE_COUNT.value unique_ratio = Metrics.UNIQUE_RATIO.value res = ( Profiler( count, unique_count, unique_ratio, session=self.session, table=User, use_cols=[User.age], ) .execute() ._column_results ) assert res.get(User.age.name)[Metrics.UNIQUE_RATIO.name] == 1.0