from unittest import TestCase
from unittest.mock import patch
from uuid import uuid4
import pytest

try:
    import pyodbc  # noqa: F401
except ImportError:
    # Skip the whole module if pyodbc cannot be imported, either because it is
    # not installed or because of a broken or missing dynamic library.
    pytest.skip("pyodbc not properly installed", allow_module_level=True)
from sqlalchemy import Column, Integer
from sqlalchemy.orm import declarative_base
from sqlalchemy.sql.selectable import CTE
from metadata.generated.schema.entity.data.table import Column as EntityColumn
from metadata.generated.schema.entity.data.table import (
    ColumnName,
    DataType,
    PartitionIntervalTypes,
    PartitionProfilerConfig,
    ProfileSampleType,
    Table,
)
from metadata.generated.schema.entity.services.connections.database.azureSQLConnection import (
    AzureSQLConnection,
)
from metadata.profiler.interface.sqlalchemy.profiler_interface import (
    SQAProfilerInterface,
)
from metadata.sampler.models import SampleConfig
from metadata.sampler.sqlalchemy.azuresql.sampler import AzureSQLSampler
from metadata.sampler.sqlalchemy.sampler import SQASampler
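
# These tests check the sampling SQL generated by AzureSQLSampler for percentage,
# row-count, and partition-filtered sampling configurations, using a small
# SQLAlchemy model in place of a real AzureSQL table.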
Base = declarative_base()


class User(Base):
    __tablename__ = "users"

    id = Column(Integer, primary_key=True)
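
# build_table_orm is patched below so the sampler works against the User model
# defined above rather than a real table.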
@patch.object(SQASampler, "build_table_orm", return_value=User)
class SampleTest(TestCase):
    @classmethod
    @patch.object(SQASampler, "build_table_orm", return_value=User)
    def setUpClass(cls, sampler_mock):
        cls.table_entity = Table(
            id=uuid4(),
            name="user",
            columns=[
                EntityColumn(
                    name=ColumnName("id"),
                    dataType=DataType.INT,
                ),
            ],
        )
        cls.azuresql_conn = AzureSQLConnection(
            username="myuser",
            password="myaccount",
            database="mywarehouse",
            hostPort="host//foo.bar:1433",
        )
        sampler = SQASampler(
            service_connection_config=cls.azuresql_conn,
            ometa_client=None,
            entity=None,
        )
        cls.sqa_profiler_interface = SQAProfilerInterface(
            cls.azuresql_conn,
            None,
            cls.table_entity,
            None,
            sampler,
            5,
            43200,
        )
        cls.session = cls.sqa_profiler_interface.session
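
    # Each test builds an AzureSQLSampler with a different SampleConfig and compares
    # the SQL from get_sample_query() against the expected TABLESAMPLE-based CTE,
    # casefolded so the comparison is case-insensitive.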
    def test_omit_sampling_method_type(self, sampler_mock):
        """
        Test that percentage sampling is used when no sampling method type is specified.
        """
        sampler = AzureSQLSampler(
            service_connection_config=self.azuresql_conn,
            ometa_client=None,
            entity=self.table_entity,
            sample_config=SampleConfig(
                profileSampleType=ProfileSampleType.PERCENTAGE, profileSample=50.0
            ),
        )
        query: CTE = sampler.get_sample_query()
        expected_query = (
            'WITH "9bc65c2abec141778ffaa729489f3e87_rnd" AS \n(SELECT users_1.id AS id \n'
            "FROM users AS users_1 TABLESAMPLE system(50.0 PERCENT))\n "
            'SELECT "9bc65c2abec141778ffaa729489f3e87_rnd".id \nFROM "9bc65c2abec141778ffaa729489f3e87_rnd"'
        )
        assert (
            expected_query.casefold()
            == str(query.compile(compile_kwargs={"literal_binds": True})).casefold()
        )

    def test_row_sampling(self, sampler_mock):
        """
        Test that row-based sampling is used when the profile sample type is ROWS.
        """
        sampler = AzureSQLSampler(
            service_connection_config=self.azuresql_conn,
            ometa_client=None,
            entity=self.table_entity,
            sample_config=SampleConfig(
                profileSampleType=ProfileSampleType.ROWS, profileSample=50
            ),
        )
        query: CTE = sampler.get_sample_query()
        expected_query = (
            'WITH "9bc65c2abec141778ffaa729489f3e87_rnd" AS \n(SELECT users_1.id AS id '
            "\nFROM users AS users_1 TABLESAMPLE system(50 ROWS))\n "
            'SELECT "9bc65c2abec141778ffaa729489f3e87_rnd".id \nFROM "9bc65c2abec141778ffaa729489f3e87_rnd"'
        )
        assert (
            expected_query.casefold()
            == str(query.compile(compile_kwargs={"literal_binds": True})).casefold()
        )

    def test_sampling_with_partition(self, sampler_mock):
        """
        Test that the sampling query filters on the configured partition column values.
        """
        sampler = AzureSQLSampler(
            service_connection_config=self.azuresql_conn,
            ometa_client=None,
            entity=self.table_entity,
            sample_config=SampleConfig(
                profileSampleType=ProfileSampleType.PERCENTAGE,
                profileSample=50.0,
            ),
            partition_details=PartitionProfilerConfig(
                enablePartitioning=True,
                partitionColumnName="id",
                partitionIntervalType=PartitionIntervalTypes.COLUMN_VALUE,
                partitionValues=["1", "2"],
            ),
        )
        query: CTE = sampler.get_sample_query()
        expected_query = (
            'WITH "9bc65c2abec141778ffaa729489f3e87_rnd" AS \n(SELECT users_1.id AS id \n'
            "FROM users AS users_1 TABLESAMPLE system(50.0 PERCENT) "
            "\nWHERE id IN ('1', '2'))\n SELECT \"9bc65c2abec141778ffaa729489f3e87_rnd\".id "
            '\nFROM "9bc65c2abec141778ffaa729489f3e87_rnd"'
        )
        assert (
            expected_query.casefold()
            == str(query.compile(compile_kwargs={"literal_binds": True})).casefold()
        )