# OpenMetadata/ingestion/tests/unit/profiler/test_metrics.py

#  Copyright 2021 Collate
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#  http://www.apache.org/licenses/LICENSE-2.0
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.

"""
Test Metrics behavior
"""
import datetime
from unittest import TestCase

from sqlalchemy import (
    TEXT,
    Column,
    Date,
    DateTime,
    Integer,
    String,
    Time,
    create_engine,
)
from sqlalchemy.orm import declarative_base

from metadata.generated.schema.entity.services.connections.database.sqliteConnection import (
    SQLiteConnection,
    SQLiteScheme,
)
from metadata.orm_profiler.interfaces.sqa_profiler_interface import SQAProfilerInterface
from metadata.orm_profiler.metrics.core import add_props
from metadata.orm_profiler.metrics.registry import Metrics
from metadata.orm_profiler.profiler.core import Profiler

Base = declarative_base()


class User(Base):
    """
    Declarative model for the `users` table that the metric tests profile
    """

    __tablename__ = "users"
    id = Column(Integer, primary_key=True)
    # Bounded string columns
    name = Column(String(256))
    fullname = Column(String(256))
    nickname = Column(String(256))
    # Unbounded text column
    comments = Column(TEXT)
    age = Column(Integer)
    # One column per temporal type: DateTime, Time and Date
    dob = Column(DateTime)  # date of birth
    tob = Column(Time)  # time of birth
    doe = Column(Date)  # date of employment


class MetricsTest(TestCase):
    """
    Run checks on different metrics
    """

    sqlite_conn = SQLiteConnection(scheme=SQLiteScheme.sqlite_pysqlite)
    sqa_profiler_interface = SQAProfilerInterface(sqlite_conn)
    engine = sqa_profiler_interface.session.get_bind()

    @classmethod
    def setUpClass(cls) -> None:
        """
        Create the `users` table and load the three fixture rows
        """
        User.__table__.create(bind=cls.engine)

        # Fully populated row
        john = User(
            name="John",
            fullname="John Doe",
            nickname="johnny b goode",
            comments="no comments",
            age=30,
            dob=datetime.datetime(1992, 5, 17),
            tob=datetime.time(11, 2, 32),
            doe=datetime.date(2020, 1, 12),
        )
        # Row without a nickname
        # NOTE(review): "Jone Doe" may be a typo for "Jane Doe" - confirm
        jane = User(
            name="Jane",
            fullname="Jone Doe",
            nickname=None,
            comments="maybe some comments",
            age=31,
            dob=datetime.datetime(1991, 4, 4),
            tob=datetime.time(10, 1, 31),
            doe=datetime.date(2009, 11, 11),
        )
        # Same name as the first row, with nickname, comments and age unset
        john_with_nulls = User(
            name="John",
            fullname="John Doe",
            nickname=None,
            comments=None,
            age=None,
            dob=datetime.datetime(1982, 2, 2),
            tob=datetime.time(9, 3, 25),
            doe=datetime.date(2012, 12, 1),
        )

        session = cls.sqa_profiler_interface.session
        session.add_all([john, jane, john_with_nulls])
        session.commit()

    def setUp(self) -> None:
        """
        Build the sampler and runner for `User` on the shared interface
        before every test
        """
        interface = self.sqa_profiler_interface
        interface.create_sampler(User)
        interface.create_runner(User)

    def test_count(self):
        """
        COUNT on the `name` column is expected to be 3
        """
        profiler = Profiler(
            Metrics.COUNT.value,
            profiler_interface=self.sqa_profiler_interface,
            table=User,
            use_cols=[User.name],
        )
        column_results = profiler.execute()._column_results

        # Results are keyed by column name first, then by metric name
        name_metrics = column_results.get(User.name.name)
        assert name_metrics.get(Metrics.COUNT.name) == 3

    def test_min(self):
        """
        MIN on the `age` column is expected to be 30
        """
        profiler = Profiler(
            Metrics.MIN.value,
            profiler_interface=self.sqa_profiler_interface,
            table=User,
            use_cols=[User.age],
        )
        column_results = profiler.execute()._column_results

        # Results are keyed by column name first, then by metric name
        age_metrics = column_results.get(User.age.name)
        assert age_metrics.get(Metrics.MIN.name) == 30

    def test_std(self):
        """
        STDDEV on the `age` column
        """
        profiler = Profiler(
            Metrics.STDDEV.value,
            profiler_interface=self.sqa_profiler_interface,
            table=User,
            use_cols=[User.age],
        )
        column_results = profiler.execute()._column_results
        age_metrics = column_results.get(User.age.name)

        # SQLITE STD custom implementation returns the squared STD.
        # Only useful for testing purposes
        assert age_metrics.get(Metrics.STDDEV.name) == 0.25

    def test_earliest_time(self):
        """
        MIN on the DateTime, Time and Date columns returns the earliest
        fixture value of each
        """
        profiler = Profiler(
            Metrics.MIN.value,
            profiler_interface=self.sqa_profiler_interface,
            table=User,
            use_cols=[User.dob, User.tob, User.doe],
        )
        column_results = profiler.execute()._column_results

        # Column name -> expected earliest value, as the string returned
        expected_minimums = {
            User.dob.name: "1982-02-02 00:00:00.000000",
            User.tob.name: "09:03:25.000000",
            User.doe.name: "2009-11-11",
        }
        for column_name, expected in expected_minimums.items():
            assert column_results.get(column_name).get(Metrics.MIN.name) == expected

    def test_latest_time(self):
        """
        MAX on the DateTime, Time and Date columns returns the latest
        fixture value of each
        """
        temporal_columns = [User.dob, User.tob, User.doe]
        profiler = Profiler(
            Metrics.MAX.value,
            profiler_interface=self.sqa_profiler_interface,
            table=User,
            use_cols=temporal_columns,
        )
        column_results = profiler.execute()._column_results

        latest_dob = column_results.get(User.dob.name).get(Metrics.MAX.name)
        assert latest_dob == "1992-05-17 00:00:00.000000"

        latest_tob = column_results.get(User.tob.name).get(Metrics.MAX.name)
        assert latest_tob == "11:02:32.000000"

        latest_doe = column_results.get(User.doe.name).get(Metrics.MAX.name)
        assert latest_doe == "2020-01-12"

    def test_null_count(self):
        """
        NULL_COUNT on `nickname`: two of the three fixture rows have no nickname
        """
        profiler = Profiler(
            Metrics.NULL_COUNT.value,
            profiler_interface=self.sqa_profiler_interface,
            table=User,
            use_cols=[User.nickname],
        )
        column_results = profiler.execute()._column_results

        nickname_metrics = column_results.get(User.nickname.name)
        assert nickname_metrics.get(Metrics.NULL_COUNT.name) == 2

    def test_null_ratio(self):
        """
        NULL_RATIO is a composed metric: it is run together with COUNT and
        NULL_COUNT and is expected to round to 0.67 for `nickname`
        """
        # The ratio is built on the other two metrics, so all three are passed
        metrics = (
            Metrics.COUNT.value,
            Metrics.NULL_COUNT.value,
            Metrics.NULL_RATIO.value,
        )

        profiler = Profiler(
            *metrics,
            profiler_interface=self.sqa_profiler_interface,
            table=User,
            use_cols=[User.nickname],
        )
        column_results = profiler.execute()._column_results

        ratio = column_results.get(User.nickname.name).get(Metrics.NULL_RATIO.name)
        assert str(round(ratio, 2)) == "0.67"

    def test_table_row_count(self):
        """
        ROW_COUNT is a table-level metric: no columns are selected and the
        value is read from the table results
        """
        profiler = Profiler(
            Metrics.ROW_COUNT.value,
            profiler_interface=self.sqa_profiler_interface,
            table=User,
        )
        table_results = profiler.execute()._table_results

        assert table_results.get(Metrics.ROW_COUNT.name) == 3

    def test_table_column_count(self):
        """
        COLUMN_COUNT for `User` must match the nine columns the model declares
        """
        # The metric receives the table through add_props
        with_table = add_props(table=User)
        column_count_metric = with_table(Metrics.COLUMN_COUNT.value)

        profiler = Profiler(
            column_count_metric,
            profiler_interface=self.sqa_profiler_interface,
            table=User,
        )
        table_results = profiler.execute()._table_results

        assert table_results.get(Metrics.COLUMN_COUNT.name) == 9

    def test_avg(self):
        """
        MEAN across Integer, String and Text columns.

        For the String and Text columns the expected values equal the mean
        length of the non-null fixture strings.
        """
        mean = Metrics.MEAN.value

        expectations = (
            (User.age, 30.5),  # Integer
            (User.name, 4.0),  # String
            (User.comments, 15.0),  # Text
        )

        for column, expected_mean in expectations:
            profiler = Profiler(
                mean,
                profiler_interface=self.sqa_profiler_interface,
                table=User,
                use_cols=[column],
            )
            column_results = profiler.execute()._column_results

            assert column_results.get(column.name)[Metrics.MEAN.name] == expected_mean

    def test_duplicate_count(self):
        """
        DUPLICATE_COUNT is run together with COUNT and DISTINCT_COUNT and is
        expected to be 0 for `age`
        """
        metrics = (
            Metrics.COUNT.value,
            Metrics.DISTINCT_COUNT.value,
            Metrics.DUPLICATE_COUNT.value,
        )
        profiler = Profiler(
            *metrics,
            profiler_interface=self.sqa_profiler_interface,
            table=User,
            use_cols=[User.age],
        )
        column_results = profiler.execute()._column_results

        age_metrics = column_results.get(User.age.name)
        assert age_metrics[Metrics.DUPLICATE_COUNT.name] == 0

    def test_histogram(self):
        """
        HISTOGRAM on `age`, configured with five bins
        """
        histogram_metric = add_props(bins=5)(Metrics.HISTOGRAM.value)
        profiler = Profiler(
            histogram_metric,
            profiler_interface=self.sqa_profiler_interface,
            table=User,
            use_cols=[User.age],
        )
        column_results = profiler.execute()._column_results

        histogram = column_results.get(User.age.name)[Metrics.HISTOGRAM.name]
        assert histogram
        # Too little values. Counts nulls
        assert len(histogram["frequencies"]) == 3

    def test_like_count(self):
        """
        LIKE_COUNT on `name` for a couple of patterns
        """
        # In sqlite, LIKE is insensitive by default, so we just check here
        # that the metrics runs correctly rather than the implementation logic.
        cases = (
            ("J%", 3),
            ("Jo%", 2),
        )

        for pattern, expected_count in cases:
            like = add_props(expression=pattern)(Metrics.LIKE_COUNT.value)
            profiler = Profiler(
                like,
                profiler_interface=self.sqa_profiler_interface,
                table=User,
                use_cols=[User.name],
            )
            column_results = profiler.execute()._column_results

            name_metrics = column_results.get(User.name.name)
            assert name_metrics[Metrics.LIKE_COUNT.name] == expected_count

        # Disabled check: running the metric without an expression
        # with pytest.raises(AttributeError):
        #     Profiler(
        #         Metrics.LIKE_COUNT.value,
        #         profiler_interface=self.sqa_profiler_interface,
        #         table=User,
        #         use_cols=[User.age],
        #     ).execute()

    def test_ilike_count(self):
        """
        ILIKE_COUNT on `name`: case-insensitive LIKE, exercised with
        lower-case patterns
        """
        cases = (
            ("j%", 3),
            ("ja%", 1),
        )

        for pattern, expected_count in cases:
            ilike = add_props(expression=pattern)(Metrics.ILIKE_COUNT.value)
            profiler = Profiler(
                ilike,
                profiler_interface=self.sqa_profiler_interface,
                table=User,
                use_cols=[User.name],
            )
            column_results = profiler.execute()._column_results

            name_metrics = column_results.get(User.name.name)
            assert name_metrics[Metrics.ILIKE_COUNT.name] == expected_count

        # Disabled check: running the metric without an expression
        # with pytest.raises(AttributeError):
        #     Profiler(
        #         Metrics.ILIKE_COUNT.value,
        #         profiler_interface=self.sqa_profiler_interface,
        #         table=User,
        #         use_cols=[User.age],
        #     ).execute()

    def test_like_ratio(self):
        """
        LIKE_RATIO is run together with LIKE_COUNT and COUNT; every fixture
        name matches "J%", so the expected ratio is 1.0
        """
        like_count = add_props(expression="J%")(Metrics.LIKE_COUNT.value)
        profiler = Profiler(
            like_count,
            Metrics.COUNT.value,
            Metrics.LIKE_RATIO.value,
            profiler_interface=self.sqa_profiler_interface,
            table=User,
            use_cols=[User.name],
        )
        column_results = profiler.execute()._column_results

        name_metrics = column_results.get(User.name.name)
        assert name_metrics[Metrics.LIKE_RATIO.name] == 1.0

    def test_ilike_ratio(self):
        """
        ILIKE_RATIO is run together with ILIKE_COUNT and COUNT; every fixture
        name matches "J%", so the expected ratio is 1.0
        """
        # In sqlite, LIKE is insensitive by default, so we just check here
        # that the metrics runs correctly rather than the implementation logic.
        ilike_count = add_props(expression="J%")(Metrics.ILIKE_COUNT.value)
        profiler = Profiler(
            ilike_count,
            Metrics.COUNT.value,
            Metrics.ILIKE_RATIO.value,
            profiler_interface=self.sqa_profiler_interface,
            table=User,
            use_cols=[User.name],
        )
        column_results = profiler.execute()._column_results

        name_metrics = column_results.get(User.name.name)
        assert name_metrics[Metrics.ILIKE_RATIO.name] == 1.0

    def test_max(self):
        """
        MAX on the `age` column is expected to be 31
        """
        max_metric = Metrics.MAX.value

        profiler = Profiler(
            max_metric,
            profiler_interface=self.sqa_profiler_interface,
            table=User,
            use_cols=[User.age],
        )
        column_results = profiler.execute()._column_results

        assert column_results.get(User.age.name)[Metrics.MAX.name] == 31

        # TMP disable min/max on strings
        # res = (
        #     Profiler(_max, profiler_interface=self.sqa_profiler_interface, table=User, use_cols=[User.name])
        #     .execute()
        #     ._column_results
        # )

        # assert res.get(User.name.name)[Metrics.MAX.name] == "John"

    def test_min_length(self):
        """
        MIN_LENGTH yields no value for the Integer column and the shortest
        fixture string length for the String and Text columns
        """
        min_length = Metrics.MIN_LENGTH.value

        # Integer: no MIN_LENGTH value is expected
        int_results = (
            Profiler(
                min_length,
                profiler_interface=self.sqa_profiler_interface,
                table=User,
                use_cols=[User.age],
            )
            .execute()
            ._column_results
        )
        assert int_results.get(User.age.name).get(Metrics.MIN_LENGTH.name) is None

        # String and Text
        text_expectations = (
            (User.name, 4),
            (User.comments, 11),
        )
        for column, expected_length in text_expectations:
            profiler = Profiler(
                min_length,
                profiler_interface=self.sqa_profiler_interface,
                table=User,
                use_cols=[column],
            )
            column_results = profiler.execute()._column_results

            column_metrics = column_results.get(column.name)
            assert column_metrics[Metrics.MIN_LENGTH.name] == expected_length

    def test_max_length(self):
        """
        MAX_LENGTH yields no value for the Integer column and the longest
        fixture string length for the String and Text columns
        """
        max_length = Metrics.MAX_LENGTH.value

        # Integer: no MAX_LENGTH value is expected
        int_results = (
            Profiler(
                max_length,
                profiler_interface=self.sqa_profiler_interface,
                table=User,
                use_cols=[User.age],
            )
            .execute()
            ._column_results
        )
        assert int_results.get(User.age.name).get(Metrics.MAX_LENGTH.name) is None

        # String and Text
        text_expectations = (
            (User.name, 4),
            (User.comments, 19),
        )
        for column, expected_length in text_expectations:
            profiler = Profiler(
                max_length,
                profiler_interface=self.sqa_profiler_interface,
                table=User,
                use_cols=[column],
            )
            column_results = profiler.execute()._column_results

            column_metrics = column_results.get(column.name)
            assert column_metrics[Metrics.MAX_LENGTH.name] == expected_length

    def test_sum(self):
        """
        SUM adds up the `age` values and yields no value for the String
        column `name`
        """
        sum_metric = Metrics.SUM.value

        # Integer column
        age_profiler = Profiler(
            sum_metric,
            profiler_interface=self.sqa_profiler_interface,
            table=User,
            use_cols=[User.age],
        )
        age_results = age_profiler.execute()._column_results
        assert age_results.get(User.age.name)[Metrics.SUM.name] == 61

        # String column: no SUM value is expected
        name_profiler = Profiler(
            sum_metric,
            profiler_interface=self.sqa_profiler_interface,
            table=User,
            use_cols=[User.name],
        )
        name_results = name_profiler.execute()._column_results
        assert name_results.get(User.name.name).get(Metrics.SUM.name) is None

    def test_unique_count(self):
        """
        UNIQUE_COUNT on `name`: only "Jane" appears exactly once in the fixtures
        """
        profiler = Profiler(
            Metrics.UNIQUE_COUNT.value,
            profiler_interface=self.sqa_profiler_interface,
            table=User,
            use_cols=[User.name],
        )
        column_results = profiler.execute()._column_results

        name_metrics = column_results.get(User.name.name)
        assert name_metrics[Metrics.UNIQUE_COUNT.name] == 1

    def test_unique_ratio(self):
        """
        UNIQUE_RATIO is run together with COUNT and UNIQUE_COUNT and is
        expected to round to 0.33 for `name`
        """
        metrics = (
            Metrics.COUNT.value,
            Metrics.UNIQUE_COUNT.value,
            Metrics.UNIQUE_RATIO.value,
        )
        profiler = Profiler(
            *metrics,
            profiler_interface=self.sqa_profiler_interface,
            table=User,
            use_cols=[User.name],
        )
        column_results = profiler.execute()._column_results

        ratio = column_results.get(User.name.name)[Metrics.UNIQUE_RATIO.name]
        assert str(round(ratio, 2)) == "0.33"

    def test_distinct_count(self):
        """
        DISTINCT_COUNT on `name`: the fixtures hold two different names
        """
        profiler = Profiler(
            Metrics.DISTINCT_COUNT.value,
            profiler_interface=self.sqa_profiler_interface,
            table=User,
            use_cols=[User.name],
        )
        column_results = profiler.execute()._column_results

        name_metrics = column_results.get(User.name.name)
        assert name_metrics[Metrics.DISTINCT_COUNT.name] == 2.0

    def test_distinct_ratio(self):
        """
        DISTINCT_RATIO is run together with COUNT and DISTINCT_COUNT and is
        expected to round to 0.67 for `name`
        """
        metrics = (
            Metrics.COUNT.value,
            Metrics.DISTINCT_COUNT.value,
            Metrics.DISTINCT_RATIO.value,
        )
        profiler = Profiler(
            *metrics,
            profiler_interface=self.sqa_profiler_interface,
            table=User,
            use_cols=[User.name],
        )
        column_results = profiler.execute()._column_results

        ratio = column_results.get(User.name.name)[Metrics.DISTINCT_RATIO.name]
        assert str(round(ratio, 2)) == "0.67"

    def test_count_in_set(self):
        """
        COUNT_IN_SET on `name` for a one-value and a two-value set
        """
        cases = (
            (["John"], 2.0),
            (["John", "Jane"], 3),
        )

        for allowed_values, expected_count in cases:
            set_count = add_props(values=allowed_values)(Metrics.COUNT_IN_SET.value)
            profiler = Profiler(
                set_count,
                profiler_interface=self.sqa_profiler_interface,
                table=User,
                use_cols=[User.name],
            )
            column_results = profiler.execute()._column_results

            name_metrics = column_results.get(User.name.name)
            assert name_metrics[Metrics.COUNT_IN_SET.name] == expected_count

    def test_histogram_empty(self):
        """
        HISTOGRAM on a table without rows must not produce a value
        """

        class EmptyUser(Base):
            __tablename__ = "empty_users"
            id = Column(Integer, primary_key=True)
            name = Column(String(256))
            fullname = Column(String(256))
            nickname = Column(String(256))
            comments = Column(TEXT)
            age = Column(Integer)

        # Re-target the shared interface at the new model; setUp points it
        # back at User before the next test
        interface = self.sqa_profiler_interface
        interface.create_sampler(EmptyUser)
        interface.create_runner(EmptyUser)

        # The table is created but never populated
        EmptyUser.__table__.create(bind=self.engine)

        histogram_metric = add_props(bins=5)(Metrics.HISTOGRAM.value)
        profiler = Profiler(
            histogram_metric,
            profiler_interface=self.sqa_profiler_interface,
            table=EmptyUser,
            use_cols=[EmptyUser.age],
        )
        column_results = profiler.execute()._column_results

        age_metrics = column_results.get(EmptyUser.age.name)
        assert age_metrics.get(Metrics.HISTOGRAM.name) is None

    def test_not_like_count(self):
        """
        NOT_LIKE_COUNT on `name`, one sub-test per pattern
        """
        # In sqlite, LIKE is insensitive by default, so we just check here
        # that the metrics runs correctly rather than the implementation logic.

        # NOTE(review): with the fixture names (John, Jane, John) each expected
        # value equals the number of rows that DO match the pattern - confirm
        # this is the intended semantics of NOT_LIKE_COUNT.
        expected_by_pattern = (
            ("b%", 0),
            ("Jo%", 2),
            ("Ja%", 1),
            ("J%", 3),
        )

        for expression, expected in expected_by_pattern:
            with self.subTest(expression=expression, expected=expected):
                metric = add_props(expression=expression)(Metrics.NOT_LIKE_COUNT.value)
                profiler = Profiler(
                    metric,
                    profiler_interface=self.sqa_profiler_interface,
                    table=User,
                    use_cols=[User.name],
                )
                column_results = profiler.execute()._column_results

                name_metrics = column_results.get(User.name.name)
                assert name_metrics[Metrics.NOT_LIKE_COUNT.name] == expected

    def test_median(self):
        """
        MEDIAN on the `age` column is expected to be 30
        """
        profiler = Profiler(
            Metrics.MEDIAN.value,
            profiler_interface=self.sqa_profiler_interface,
            table=User,
            use_cols=[User.age],
        )
        column_results = profiler.execute()._column_results

        age_metrics = column_results.get(User.age.name)
        assert age_metrics[Metrics.MEDIAN.name] == 30