OpenMetadata/ingestion/tests/unit/profiler/test_sample.py

#  Copyright 2021 Collate
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#  http://www.apache.org/licenses/LICENSE-2.0
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.

"""
Test Sample behavior
"""
from unittest import TestCase

from sqlalchemy import TEXT, Column, Integer, String, create_engine, func
from sqlalchemy.orm import DeclarativeMeta, declarative_base

from metadata.orm_profiler.metrics.registry import Metrics
from metadata.orm_profiler.profiler.core import Profiler
from metadata.orm_profiler.profiler.sampler import Sampler
from metadata.utils.connections import create_and_bind_session

Base = declarative_base()


class User(Base):
    __tablename__ = "users"
    id = Column(Integer, primary_key=True)
    name = Column(String(256))
    fullname = Column(String(256))
    nickname = Column(String(256))
    comments = Column(TEXT)
    age = Column(Integer)


class SampleTest(TestCase):
    """
    Run checks on different metrics
    """

    engine = create_engine("sqlite+pysqlite:///:memory:", echo=False, future=True)
    session = create_and_bind_session(engine)

    @classmethod
    def setUpClass(cls) -> None:
        """
        Prepare Ingredients
        """
        User.__table__.create(bind=cls.engine)

        # Insert 30 rows
        for i in range(10):
            data = [
                User(
                    name="John",
                    fullname="John Doe",
                    nickname="johnny b goode",
                    comments="no comments",
                    age=30,
                ),
                User(
                    name="Jane",
                    fullname="Jone Doe",
                    nickname=None,
                    comments="maybe some comments",
                    age=31,
                ),
                User(
                    name="John",
                    fullname="John Doe",
                    nickname=None,
                    comments=None,
                    age=None,
                ),
            ]
            cls.session.add_all(data)
            cls.session.commit()

    def test_random_sampler(self):
        """
        The random sampler should be able to
        generate a random subset of data
        """
        sampler = Sampler(session=self.session, table=User, profile_sample=50.0)
        random_sample = sampler.random_sample()
        res = self.session.query(func.count()).select_from(random_sample).first()
        assert res[0] < 30

    def test_sample_property(self):
        """
        Sample property should be properly generated
        """

        # Randomly pick table_count to init the Profiler, we don't care for this test
        table_count = Metrics.ROW_COUNT.value
        profiler = Profiler(
            table_count, session=self.session, table=User, profile_sample=50.0
        )

        res = self.session.query(func.count()).select_from(profiler.sample).first()
        assert res[0] < 30

    def test_table_row_count(self):
        """
        Profile sample should be ignored in row count
        """
        table_count = Metrics.ROW_COUNT.value
        profiler = Profiler(
            table_count, session=self.session, table=User, profile_sample=50.0
        )
        res = profiler.execute()._table_results
        assert res.get(Metrics.ROW_COUNT.name) == 30

    def test_random_sample_count(self):
        """
        Check we can properly sample data.

        There's a random component, so we cannot ensure to always
        get 15 rows, but for sure we should get less than 30.
        """
        count = Metrics.COUNT.value
        profiler = Profiler(
            count,
            session=self.session,
            table=User,
            profile_sample=50.0,
            use_cols=[User.name],
        )
        res = profiler.execute()._column_results
        assert res.get(User.name.name)[Metrics.COUNT.name] < 30

        profiler = Profiler(
            count,
            session=self.session,
            table=User,
            profile_sample=100.0,
            use_cols=[User.name],
        )
        res = profiler.execute()._column_results
        assert res.get(User.name.name)[Metrics.COUNT.name] == 30

    def test_random_sample_histogram(self):
        """
        Histogram should run correctly
        """
        hist = Metrics.HISTOGRAM.value
        profiler = Profiler(
            hist,
            session=self.session,
            table=User,
            profile_sample=50.0,
            use_cols=[User.id],
        )
        res = profiler.execute()._column_results

        # The sum of all frequencies should be sampled
        assert sum(res.get(User.id.name)[Metrics.HISTOGRAM.name]["frequencies"]) < 30

        profiler = Profiler(
            hist,
            session=self.session,
            table=User,
            profile_sample=100.0,
            use_cols=[User.id],
        )
        res = profiler.execute()._column_results

        # The sum of all frequencies should be sampled
        assert sum(res.get(User.id.name)[Metrics.HISTOGRAM.name]["frequencies"]) == 30.0

    def test_random_sample_unique_count(self):
        """
        Unique count should run correctly
        """
        hist = Metrics.UNIQUE_COUNT.value
        profiler = Profiler(
            hist,
            session=self.session,
            table=User,
            profile_sample=50.0,
            use_cols=[User.name],
        )
        res = profiler.execute()._column_results

        # As we repeat data, we expect 0 unique counts.
        # This tests might very rarely, fail, depending on the sampled random data.
        assert res.get(User.name.name)[Metrics.UNIQUE_COUNT.name] <= 1

        profiler = Profiler(
            hist,
            session=self.session,
            table=User,
            profile_sample=100.0,
            use_cols=[User.name],
        )
        res = profiler.execute()._column_results

        # As we repeat data, we expect 0 unique counts.
        # This tests might very rarely, fail, depending on the sampled random data.
        assert res.get(User.name.name)[Metrics.UNIQUE_COUNT.name] == 0

    def test_sample_data(self):
        """
        We should be able to pick up sample data from the sampler
        """
        sampler = Sampler(session=self.session, table=User)
        sample_data = sampler.fetch_sample_data()

        assert len(sample_data.columns) == 6
        assert len(sample_data.rows) == 30

        # Order matters, this is how we'll present the data
        names = [str(col.__root__) for col in sample_data.columns]
        assert names == ["id", "name", "fullname", "nickname", "comments", "age"]

    def test_sample_from_user_query(self):
        """
        Test sample data are returned based on user query
        """
        stmt = "SELECT id, name FROM users"
        sampler = Sampler(session=self.session, table=User, profile_sample_query=stmt)
        sample_data = sampler.fetch_sample_data()

        assert len(sample_data.columns) == 2
        names = [col.__root__ for col in sample_data.columns]
        assert names == ["id", "name"]
Fix #3371 - Run Profiler and Tests on a % of the data (#3424) Fix #3371 - Run Profiler and Tests on a % of the data (#3424) 2022-03-16 06:05:59 +01:00			`# Copyright 2021 Collate`
			`# Licensed under the Apache License, Version 2.0 (the "License");`
			`# you may not use this file except in compliance with the License.`
			`# You may obtain a copy of the License at`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`

			`"""`
			`Test Sample behavior`
			`"""`
			`from unittest import TestCase`

			`from sqlalchemy import TEXT, Column, Integer, String, create_engine, func`
			`from sqlalchemy.orm import DeclarativeMeta, declarative_base`

			`from metadata.orm_profiler.metrics.registry import Metrics`
Fix #3522 - Add timeout to profiler (#3707) Fix #3522 - Add timeout to profiler (#3707) 2022-03-30 08:54:27 +02:00			`from metadata.orm_profiler.profiler.core import Profiler`
			`from metadata.orm_profiler.profiler.sampler import Sampler`
Fixing Test Connection for Dynamo & Glue (#4316) * Fixing Test Connection for Dynamo * Fixed Glue Connector * renamed engine to connection * Fixed the return signature * Added dataclass 2022-04-22 11:30:59 +05:30			`from metadata.utils.connections import create_and_bind_session`
Fix #3371 - Run Profiler and Tests on a % of the data (#3424) Fix #3371 - Run Profiler and Tests on a % of the data (#3424) 2022-03-16 06:05:59 +01:00
			`Base = declarative_base()`


			`class User(Base):`
			`__tablename__ = "users"`
			`id = Column(Integer, primary_key=True)`
			`name = Column(String(256))`
			`fullname = Column(String(256))`
			`nickname = Column(String(256))`
			`comments = Column(TEXT)`
			`age = Column(Integer)`


			`class SampleTest(TestCase):`
			`"""`
			`Run checks on different metrics`
			`"""`

			`engine = create_engine("sqlite+pysqlite:///:memory:", echo=False, future=True)`
			`session = create_and_bind_session(engine)`

			`@classmethod`
			`def setUpClass(cls) -> None:`
			`"""`
			`Prepare Ingredients`
			`"""`
			`User.__table__.create(bind=cls.engine)`

			`# Insert 30 rows`
			`for i in range(10):`
			`data = [`
			`User(`
			`name="John",`
			`fullname="John Doe",`
			`nickname="johnny b goode",`
			`comments="no comments",`
			`age=30,`
			`),`
			`User(`
			`name="Jane",`
			`fullname="Jone Doe",`
			`nickname=None,`
			`comments="maybe some comments",`
			`age=31,`
			`),`
			`User(`
			`name="John",`
			`fullname="John Doe",`
			`nickname=None,`
			`comments=None,`
			`age=None,`
			`),`
			`]`
			`cls.session.add_all(data)`
			`cls.session.commit()`

Fix #3522 - Add timeout to profiler (#3707) Fix #3522 - Add timeout to profiler (#3707) 2022-03-30 08:54:27 +02:00			`def test_random_sampler(self):`
			`"""`
			`The random sampler should be able to`
			`generate a random subset of data`
			`"""`
			`sampler = Sampler(session=self.session, table=User, profile_sample=50.0)`
			`random_sample = sampler.random_sample()`
			`res = self.session.query(func.count()).select_from(random_sample).first()`
			`assert res[0] < 30`

Fix #3371 - Run Profiler and Tests on a % of the data (#3424) Fix #3371 - Run Profiler and Tests on a % of the data (#3424) 2022-03-16 06:05:59 +01:00			`def test_sample_property(self):`
			`"""`
			`Sample property should be properly generated`
			`"""`

			`# Randomly pick table_count to init the Profiler, we don't care for this test`
			`table_count = Metrics.ROW_COUNT.value`
			`profiler = Profiler(`
			`table_count, session=self.session, table=User, profile_sample=50.0`
			`)`

			`res = self.session.query(func.count()).select_from(profiler.sample).first()`
			`assert res[0] < 30`

			`def test_table_row_count(self):`
			`"""`
			`Profile sample should be ignored in row count`
			`"""`
			`table_count = Metrics.ROW_COUNT.value`
			`profiler = Profiler(`
			`table_count, session=self.session, table=User, profile_sample=50.0`
			`)`
			`res = profiler.execute()._table_results`
			`assert res.get(Metrics.ROW_COUNT.name) == 30`

			`def test_random_sample_count(self):`
			`"""`
			`Check we can properly sample data.`

			`There's a random component, so we cannot ensure to always`
			`get 15 rows, but for sure we should get less than 30.`
			`"""`
			`count = Metrics.COUNT.value`
			`profiler = Profiler(`
			`count,`
			`session=self.session,`
			`table=User,`
			`profile_sample=50.0,`
			`use_cols=[User.name],`
			`)`
			`res = profiler.execute()._column_results`
			`assert res.get(User.name.name)[Metrics.COUNT.name] < 30`

			`profiler = Profiler(`
			`count,`
			`session=self.session,`
			`table=User,`
			`profile_sample=100.0,`
			`use_cols=[User.name],`
			`)`
			`res = profiler.execute()._column_results`
			`assert res.get(User.name.name)[Metrics.COUNT.name] == 30`

			`def test_random_sample_histogram(self):`
			`"""`
			`Histogram should run correctly`
			`"""`
			`hist = Metrics.HISTOGRAM.value`
			`profiler = Profiler(`
			`hist,`
			`session=self.session,`
			`table=User,`
			`profile_sample=50.0,`
			`use_cols=[User.id],`
			`)`
			`res = profiler.execute()._column_results`

			`# The sum of all frequencies should be sampled`
			`assert sum(res.get(User.id.name)[Metrics.HISTOGRAM.name]["frequencies"]) < 30`

			`profiler = Profiler(`
			`hist,`
			`session=self.session,`
			`table=User,`
			`profile_sample=100.0,`
			`use_cols=[User.id],`
			`)`
			`res = profiler.execute()._column_results`

			`# The sum of all frequencies should be sampled`
			`assert sum(res.get(User.id.name)[Metrics.HISTOGRAM.name]["frequencies"]) == 30.0`

			`def test_random_sample_unique_count(self):`
			`"""`
			`Unique count should run correctly`
			`"""`
			`hist = Metrics.UNIQUE_COUNT.value`
			`profiler = Profiler(`
			`hist,`
			`session=self.session,`
			`table=User,`
			`profile_sample=50.0,`
			`use_cols=[User.name],`
			`)`
			`res = profiler.execute()._column_results`

			`# As we repeat data, we expect 0 unique counts.`
			`# This tests might very rarely, fail, depending on the sampled random data.`
			`assert res.get(User.name.name)[Metrics.UNIQUE_COUNT.name] <= 1`

			`profiler = Profiler(`
			`hist,`
			`session=self.session,`
			`table=User,`
			`profile_sample=100.0,`
			`use_cols=[User.name],`
			`)`
			`res = profiler.execute()._column_results`

			`# As we repeat data, we expect 0 unique counts.`
			`# This tests might very rarely, fail, depending on the sampled random data.`
			`assert res.get(User.name.name)[Metrics.UNIQUE_COUNT.name] == 0`
Fix #3573 - Sample Data refactor & ORM converter improvements (#5265) Fix #3573 - Sample Data refactor & ORM converter improvements (#5265) 2022-06-08 16:10:40 +02:00
			`def test_sample_data(self):`
			`"""`
			`We should be able to pick up sample data from the sampler`
			`"""`
			`sampler = Sampler(session=self.session, table=User)`
			`sample_data = sampler.fetch_sample_data()`

			`assert len(sample_data.columns) == 6`
			`assert len(sample_data.rows) == 30`

			`# Order matters, this is how we'll present the data`
			`names = [str(col.__root__) for col in sample_data.columns]`
			`assert names == ["id", "name", "fullname", "nickname", "comments", "age"]`
[Issue-5188] Implement User Query for Sampler and Profiler (#5578) * Added custom query sample for sample data ingestion * Added logic to run table profiling against user's query * Added tests for user query logic in profiler and sampler * Added user profiling to tableProfile + fixed format * staging commit * Added logic to add profileQuery to table entity * Added limit to sample rows 2022-06-24 14:46:34 +02:00
			`def test_sample_from_user_query(self):`
			`"""`
			`Test sample data are returned based on user query`
			`"""`
			`stmt = "SELECT id, name FROM users"`
			`sampler = Sampler(session=self.session, table=User, profile_sample_query=stmt)`
			`sample_data = sampler.fetch_sample_data()`

			`assert len(sample_data.columns) == 2`
			`names = [col.__root__ for col in sample_data.columns]`
			`assert names == ["id", "name"]`