OpenMetadata/ingestion/tests/unit/profiler/test_metrics.py

#  Copyright 2021 Collate
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#  http://www.apache.org/licenses/LICENSE-2.0
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.

"""
Test Metrics behavior
"""
from unittest import TestCase

from sqlalchemy import TEXT, Column, Integer, String, create_engine
from sqlalchemy.orm import declarative_base

from metadata.orm_profiler.metrics.core import add_props
from metadata.orm_profiler.metrics.registry import Metrics
from metadata.orm_profiler.profiler.core import Profiler
from metadata.utils.connections import create_and_bind_session

Base = declarative_base()


class User(Base):
    """
    Declarative model for the ``users`` table profiled by the tests below.

    The table has six columns: an integer primary key, three bounded string
    columns, one free-form text column and one integer column.
    """

    __tablename__ = "users"
    # Integer primary key
    id = Column(Integer, primary_key=True)
    # Bounded string columns (up to 256 characters)
    name = Column(String(256))
    fullname = Column(String(256))
    nickname = Column(String(256))
    # Free-form text column
    comments = Column(TEXT)
    # Integer column used by the numeric metric tests
    age = Column(Integer)


class MetricsTest(TestCase):
    """
    Run checks on different metrics
    """

    engine = create_engine("sqlite+pysqlite:///:memory:", echo=False, future=True)
    session = create_and_bind_session(engine)

    @classmethod
    def setUpClass(cls) -> None:
        """
        Create the users table and load the three fixture rows shared by
        every test in this class
        """
        User.__table__.create(bind=cls.engine)

        field_names = ("name", "fullname", "nickname", "comments", "age")
        # One tuple per row, in the same order as field_names
        fixture_rows = (
            ("John", "John Doe", "johnny b goode", "no comments", 30),
            ("Jane", "Jone Doe", None, "maybe some comments", 31),
            ("John", "John Doe", None, None, None),
        )

        users = [User(**dict(zip(field_names, row))) for row in fixture_rows]
        cls.session.add_all(users)
        cls.session.commit()

    def test_count(self):
        """
        Validate the COUNT metric on the name column
        """
        column_results = (
            Profiler(
                Metrics.COUNT.value,
                session=self.session,
                table=User,
                use_cols=[User.name],
            )
            .execute()
            ._column_results
        )

        # Column results are keyed by column name, then by metric name
        name_metrics = column_results.get(User.name.name)
        assert name_metrics.get(Metrics.COUNT.name) == 3

    def test_min(self):
        """
        Validate the MIN metric on the age column
        """
        column_results = (
            Profiler(
                Metrics.MIN.value,
                session=self.session,
                table=User,
                use_cols=[User.age],
            )
            .execute()
            ._column_results
        )

        # Column results are keyed by column name, then by metric name
        age_metrics = column_results.get(User.age.name)
        assert age_metrics.get(Metrics.MIN.name) == 30

    def test_std(self):
        """
        Validate the STDDEV metric on the age column
        """
        column_results = (
            Profiler(
                Metrics.STDDEV.value,
                session=self.session,
                table=User,
                use_cols=[User.age],
            )
            .execute()
            ._column_results
        )

        # The custom STD implementation for SQLite returns the squared STD,
        # so the value checked here is only useful for testing purposes.
        age_metrics = column_results.get(User.age.name)
        assert age_metrics.get(Metrics.STDDEV.name) == 0.25

    def test_null_count(self):
        """
        Validate the NULL_COUNT metric on the nickname column
        """
        column_results = (
            Profiler(
                Metrics.NULL_COUNT.value,
                session=self.session,
                table=User,
                use_cols=[User.nickname],
            )
            .execute()
            ._column_results
        )

        # Two of the three fixture rows have no nickname
        nickname_metrics = column_results.get(User.nickname.name)
        assert nickname_metrics.get(Metrics.NULL_COUNT.name) == 2

    def test_null_ratio(self):
        """
        Validate a composed metric: NULL_RATIO on the nickname column
        """
        # The ratio is built on top of the other two metrics,
        # so all three are handed to the profiler together.
        metrics = (
            Metrics.COUNT.value,
            Metrics.NULL_COUNT.value,
            Metrics.NULL_RATIO.value,
        )

        column_results = (
            Profiler(
                *metrics,
                session=self.session,
                table=User,
                use_cols=[User.nickname],
            )
            .execute()
            ._column_results
        )

        ratio = column_results.get(User.nickname.name).get(Metrics.NULL_RATIO.name)
        assert str(round(ratio, 2)) == "0.67"

    def test_table_row_count(self):
        """
        Validate a table-level metric: ROW_COUNT
        """
        table_results = (
            Profiler(Metrics.ROW_COUNT.value, session=self.session, table=User)
            .execute()
            ._table_results
        )

        # Table results are keyed directly by metric name
        assert table_results.get(Metrics.ROW_COUNT.name) == 3

    def test_table_column_count(self):
        """
        Validate the table-level COLUMN_COUNT metric
        """
        # The table is handed to the metric through add_props
        column_count_metric = add_props(table=User)(Metrics.COLUMN_COUNT.value)

        table_results = (
            Profiler(column_count_metric, session=self.session, table=User)
            .execute()
            ._table_results
        )

        # User declares six columns
        assert table_results.get(Metrics.COLUMN_COUNT.name) == 6

    def test_avg(self):
        """
        Validate the MEAN metric on Integer, String and Text columns
        """
        mean_metric = Metrics.MEAN.value

        # (column, expected mean) for the Integer, String and Text columns.
        # For the string-like columns the expected values match the mean
        # length of the non-null fixture values.
        scenarios = (
            (User.age, 30.5),
            (User.name, 4.0),
            (User.comments, 15.0),
        )

        for column, expected_mean in scenarios:
            profiler = Profiler(
                mean_metric, session=self.session, table=User, use_cols=[column]
            )
            column_metrics = profiler.execute()._column_results.get(column.name)

            assert column_metrics[Metrics.MEAN.name] == expected_mean

    def test_duplicate_count(self):
        """
        Validate the composed DUPLICATE_COUNT metric on the age column
        """
        # DUPLICATE_COUNT is profiled together with COUNT and DISTINCT_COUNT
        metrics = (
            Metrics.COUNT.value,
            Metrics.DISTINCT_COUNT.value,
            Metrics.DUPLICATE_COUNT.value,
        )

        profiler = Profiler(
            *metrics, session=self.session, table=User, use_cols=[User.age]
        )
        age_metrics = profiler.execute()._column_results.get(User.age.name)

        assert age_metrics[Metrics.DUPLICATE_COUNT.name] == 0

    def test_histogram(self):
        """
        Validate the HISTOGRAM metric on the age column
        """
        histogram_metric = add_props(bins=5)(Metrics.HISTOGRAM.value)

        profiler = Profiler(
            histogram_metric, session=self.session, table=User, use_cols=[User.age]
        )
        age_metrics = profiler.execute()._column_results.get(User.age.name)
        histogram = age_metrics[Metrics.HISTOGRAM.name]

        assert histogram
        # Too few values to fill the requested bins; nulls are counted
        assert len(histogram["frequencies"]) == 3

    def test_like_count(self):
        """
        Validate the LIKE_COUNT metric on the name column
        """
        # LIKE is case-insensitive by default in sqlite, so this only checks
        # that the metric runs correctly, not the implementation logic.
        scenarios = (
            ("J%", 3),
            ("Jo%", 2),
        )

        for expression, expected_count in scenarios:
            like_metric = add_props(expression=expression)(Metrics.LIKE_COUNT.value)
            profiler = Profiler(
                like_metric, session=self.session, table=User, use_cols=[User.name]
            )
            name_metrics = profiler.execute()._column_results.get(User.name.name)

            assert name_metrics[Metrics.LIKE_COUNT.name] == expected_count

        # Running safely (check currently disabled)
        # with pytest.raises(AttributeError):
        #     Profiler(
        #         Metrics.LIKE_COUNT.value,
        #         session=self.session,
        #         table=User,
        #         use_cols=[User.age],
        #     ).execute()

    def test_ilike_count(self):
        """
        Validate the ILIKE_COUNT metric (case-insensitive LIKE) on the name column
        """
        # Lower-case patterns are matched against the capitalised fixture names
        scenarios = (
            ("j%", 3),
            ("ja%", 1),
        )

        for expression, expected_count in scenarios:
            ilike_metric = add_props(expression=expression)(
                Metrics.ILIKE_COUNT.value
            )
            profiler = Profiler(
                ilike_metric, session=self.session, table=User, use_cols=[User.name]
            )
            name_metrics = profiler.execute()._column_results.get(User.name.name)

            assert name_metrics[Metrics.ILIKE_COUNT.name] == expected_count

        # Running safely (check currently disabled)
        # with pytest.raises(AttributeError):
        #     Profiler(
        #         Metrics.ILIKE_COUNT.value,
        #         session=self.session,
        #         table=User,
        #         use_cols=[User.age],
        #     ).execute()

    def test_like_ratio(self):
        """
        Validate the composed LIKE_RATIO metric on the name column
        """
        # LIKE_RATIO is profiled together with LIKE_COUNT and COUNT
        metrics = (
            add_props(expression="J%")(Metrics.LIKE_COUNT.value),
            Metrics.COUNT.value,
            Metrics.LIKE_RATIO.value,
        )

        profiler = Profiler(
            *metrics, session=self.session, table=User, use_cols=[User.name]
        )
        name_metrics = profiler.execute()._column_results.get(User.name.name)

        # Every fixture name starts with "J"
        assert name_metrics[Metrics.LIKE_RATIO.name] == 1.0

    def test_ilike_ratio(self):
        """
        Validate the composed ILIKE_RATIO metric on the name column
        """
        # LIKE is case-insensitive by default in sqlite, so this only checks
        # that the metric runs correctly, not the implementation logic.
        metrics = (
            add_props(expression="J%")(Metrics.ILIKE_COUNT.value),
            Metrics.COUNT.value,
            Metrics.ILIKE_RATIO.value,
        )

        profiler = Profiler(
            *metrics, session=self.session, table=User, use_cols=[User.name]
        )
        name_metrics = profiler.execute()._column_results.get(User.name.name)

        assert name_metrics[Metrics.ILIKE_RATIO.name] == 1.0

    def test_max(self):
        """
        Validate the MAX metric on the age column
        """
        max_metric = Metrics.MAX.value

        profiler = Profiler(
            max_metric, session=self.session, table=User, use_cols=[User.age]
        )
        age_metrics = profiler.execute()._column_results.get(User.age.name)

        assert age_metrics[Metrics.MAX.name] == 31

        # TMP disable min/max on strings
        # res = (
        #     Profiler(_max, session=self.session, table=User, use_cols=[User.name])
        #     .execute()
        #     ._column_results
        # )

        # assert res.get(User.name.name)[Metrics.MAX.name] == "John"

    def test_min_length(self):
        """
        Validate the MIN_LENGTH metric on Integer, String and Text columns
        """
        min_length_metric = Metrics.MIN_LENGTH.value

        def profile(column):
            """Profile one column and return its metric results"""
            profiler = Profiler(
                min_length_metric,
                session=self.session,
                table=User,
                use_cols=[column],
            )
            return profiler.execute()._column_results.get(column.name)

        # Integer: no MIN_LENGTH value is expected
        assert profile(User.age).get(Metrics.MIN_LENGTH.name) is None

        # String: every fixture name is four characters long
        assert profile(User.name)[Metrics.MIN_LENGTH.name] == 4

        # Text: the shortest comment is "no comments"
        assert profile(User.comments)[Metrics.MIN_LENGTH.name] == 11

    def test_max_length(self):
        """
        Validate the MAX_LENGTH metric on Integer, String and Text columns
        """
        max_length_metric = Metrics.MAX_LENGTH.value

        def profile(column):
            """Profile one column and return its metric results"""
            profiler = Profiler(
                max_length_metric,
                session=self.session,
                table=User,
                use_cols=[column],
            )
            return profiler.execute()._column_results.get(column.name)

        # Integer: no MAX_LENGTH value is expected
        assert profile(User.age).get(Metrics.MAX_LENGTH.name) is None

        # String: every fixture name is four characters long
        assert profile(User.name)[Metrics.MAX_LENGTH.name] == 4

        # Text: the longest comment is "maybe some comments"
        assert profile(User.comments)[Metrics.MAX_LENGTH.name] == 19

    def test_sum(self):
        """
        Validate the SUM metric on an Integer and a String column
        """
        sum_metric = Metrics.SUM.value

        def profile(column):
            """Profile one column and return its metric results"""
            profiler = Profiler(
                sum_metric, session=self.session, table=User, use_cols=[column]
            )
            return profiler.execute()._column_results.get(column.name)

        # Integer: 30 + 31, the null age does not contribute
        assert profile(User.age)[Metrics.SUM.name] == 61

        # String: no SUM value is expected
        assert profile(User.name).get(Metrics.SUM.name) is None

    def test_unique_count(self):
        """
        Validate the UNIQUE_COUNT metric on the name column
        """
        profiler = Profiler(
            Metrics.UNIQUE_COUNT.value,
            session=self.session,
            table=User,
            use_cols=[User.name],
        )
        name_metrics = profiler.execute()._column_results.get(User.name.name)

        # "Jane" is the only name that appears exactly once in the fixtures
        assert name_metrics[Metrics.UNIQUE_COUNT.name] == 1

    def test_unique_ratio(self):
        """
        Validate the composed UNIQUE_RATIO metric on the name column
        """
        # UNIQUE_RATIO is profiled together with COUNT and UNIQUE_COUNT
        metrics = (
            Metrics.COUNT.value,
            Metrics.UNIQUE_COUNT.value,
            Metrics.UNIQUE_RATIO.value,
        )

        profiler = Profiler(
            *metrics, session=self.session, table=User, use_cols=[User.name]
        )
        name_metrics = profiler.execute()._column_results.get(User.name.name)

        ratio = name_metrics[Metrics.UNIQUE_RATIO.name]
        assert str(round(ratio, 2)) == "0.33"

    def test_distinct_count(self):
        """
        Validate the DISTINCT_COUNT metric on the name column
        """
        distinct_count_metric = Metrics.DISTINCT_COUNT.value

        profiler = Profiler(
            distinct_count_metric,
            session=self.session,
            table=User,
            use_cols=[User.name],
        )
        name_metrics = profiler.execute()._column_results.get(User.name.name)

        # The fixtures contain two different names: "John" and "Jane"
        assert name_metrics[Metrics.DISTINCT_COUNT.name] == 2.0

    def test_distinct_ratio(self):
        """
        Validate the composed DISTINCT_RATIO metric on the name column
        """
        # DISTINCT_RATIO is profiled together with COUNT and DISTINCT_COUNT
        metrics = (
            Metrics.COUNT.value,
            Metrics.DISTINCT_COUNT.value,
            Metrics.DISTINCT_RATIO.value,
        )

        profiler = Profiler(
            *metrics, session=self.session, table=User, use_cols=[User.name]
        )
        name_metrics = profiler.execute()._column_results.get(User.name.name)

        ratio = name_metrics[Metrics.DISTINCT_RATIO.name]
        assert str(round(ratio, 2)) == "0.67"

    def test_count_in_set(self):
        """
        Validate the COUNT_IN_SET metric on the name column
        """
        # (values to look for, expected number of matching rows)
        scenarios = (
            (["John"], 2.0),
            (["John", "Jane"], 3),
        )

        for values, expected_count in scenarios:
            in_set_metric = add_props(values=values)(Metrics.COUNT_IN_SET.value)
            profiler = Profiler(
                in_set_metric, session=self.session, table=User, use_cols=[User.name]
            )
            name_metrics = profiler.execute()._column_results.get(User.name.name)

            assert name_metrics[Metrics.COUNT_IN_SET.name] == expected_count

    def test_histogram_empty(self):
        """
        Run the histogram on an empty table

        A second model with the same columns as User is declared and its
        table created without inserting any row. The HISTOGRAM metric is
        then expected to yield no value for the age column.
        """

        class EmptyUser(Base):
            __tablename__ = "empty_users"
            id = Column(Integer, primary_key=True)
            name = Column(String(256))
            fullname = Column(String(256))
            nickname = Column(String(256))
            comments = Column(TEXT)
            age = Column(Integer)

        EmptyUser.__table__.create(bind=self.engine)

        hist = add_props(bins=5)(Metrics.HISTOGRAM.value)
        # Profile EmptyUser's own column: User.age belongs to the populated
        # users table, so it would not exercise the empty table.
        res = (
            Profiler(
                hist,
                session=self.session,
                table=EmptyUser,
                use_cols=[EmptyUser.age],
            )
            .execute()
            ._column_results
        )

        assert res.get(EmptyUser.age.name).get(Metrics.HISTOGRAM.name) is None

    def test_not_like_count(self):
        """
        Validate the NOT_LIKE_COUNT metric on the name column
        """
        # LIKE is case-insensitive by default in sqlite, so this only checks
        # that the metric runs correctly, not the implementation logic.
        #
        # NOTE(review): with the fixture names (John, Jane, John) the expected
        # values below equal the number of names that match each pattern.
        # Confirm this is the intended NOT_LIKE_COUNT semantics.
        expected_by_expression = (
            ("b%", 0),
            ("Jo%", 2),
            ("Ja%", 1),
            ("J%", 3),
        )

        for expression, expected in expected_by_expression:
            with self.subTest(expression=expression, expected=expected):
                not_like_metric = add_props(expression=expression)(
                    Metrics.NOT_LIKE_COUNT.value
                )
                profiler = Profiler(
                    not_like_metric,
                    session=self.session,
                    table=User,
                    use_cols=[User.name],
                )
                name_metrics = profiler.execute()._column_results.get(User.name.name)

                assert name_metrics[Metrics.NOT_LIKE_COUNT.name] == expected

    def test_median(self):
        """
        Validate the MEDIAN metric on the age column
        """
        median_metric = Metrics.MEDIAN.value

        profiler = Profiler(
            median_metric, session=self.session, table=User, use_cols=[User.age]
        )
        age_metrics = profiler.execute()._column_results.get(User.age.name)

        assert age_metrics[Metrics.MEDIAN.name] == 30