OpenMetadata/ingestion/tests/unit/profiler/test_workflow.py

#  Copyright 2021 Collate
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#  http://www.apache.org/licenses/LICENSE-2.0
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.

"""
Validate workflow configs and filters
"""
import uuid
from copy import deepcopy

from metadata.generated.schema.entity.data.table import Column, DataType, Table
from metadata.generated.schema.type.entityReference import EntityReference
from metadata.ingestion.ometa.openmetadata_rest import MetadataServerConfig
from metadata.ingestion.source.sqlite import SQLiteConfig
from metadata.orm_profiler.api.workflow import ProfilerWorkflow
from metadata.orm_profiler.profiles.models import ProfilerDef

config = {
    "source": {"type": "sqlite", "config": {"service_name": "my_service"}},
    "sink": {"type": "metadata-rest", "config": {}},
    "metadata_server": {
        "type": "metadata-server",
        "config": {
            "api_endpoint": "http://localhost:8585/api",
            "auth_provider_type": "no-auth",
        },
    },
}

workflow = ProfilerWorkflow.create(config)


def test_init_workflow():
    """
    We can initialise the workflow from a config
    """
    assert isinstance(workflow.source_config, SQLiteConfig)
    assert isinstance(workflow.metadata_config, MetadataServerConfig)
    assert workflow.config.profiler is None
    assert workflow.config.tests is None


def test_filter_entities():
    """
    We can properly filter entities depending on the
    workflow configuration
    """

    service_name = "service"
    db_reference1 = EntityReference(id=uuid.uuid4(), name="one_db", type="database")
    db_reference2 = EntityReference(id=uuid.uuid4(), name="another_db", type="database")

    all_tables = [
        Table(
            id=uuid.uuid4(),
            name="table1",
            database=db_reference1,
            fullyQualifiedName=f"{service_name}.{db_reference1.name}.table1",
            columns=[Column(name="id", dataType=DataType.BIGINT)],
        ),
        Table(
            id=uuid.uuid4(),
            name="table2",
            database=db_reference1,
            fullyQualifiedName=f"{service_name}.{db_reference1.name}.table2",
            columns=[Column(name="id", dataType=DataType.BIGINT)],
        ),
        Table(
            id=uuid.uuid4(),
            name="table3",
            database=db_reference2,
            fullyQualifiedName=f"{service_name}.{db_reference2.name}.table3",
            columns=[Column(name="id", dataType=DataType.BIGINT)],
        ),
    ]

    # Simple workflow does not filter
    assert len(list(workflow.filter_entities(all_tables))) == 3

    # We can exclude based on the schema name
    exclude_filter_schema_config = deepcopy(config)
    exclude_filter_schema_config["source"]["config"]["schema_filter_pattern"] = {
        "excludes": ["one_db"]
    }

    exclude_filter_schema_workflow = ProfilerWorkflow.create(
        exclude_filter_schema_config
    )
    assert len(list(exclude_filter_schema_workflow.filter_entities(all_tables))) == 1

    # We can include based on the schema name
    include_filter_schema_config = deepcopy(config)
    include_filter_schema_config["source"]["config"]["schema_filter_pattern"] = {
        "includes": ["another_db"]
    }

    include_filter_schema_workflow = ProfilerWorkflow.create(
        include_filter_schema_config
    )
    assert len(list(include_filter_schema_workflow.filter_entities(all_tables))) == 1

    # We can exclude based on the table name
    exclude_filter_table_config = deepcopy(config)
    exclude_filter_table_config["source"]["config"]["table_filter_pattern"] = {
        "excludes": ["tab*"]
    }

    exclude_filter_table_workflow = ProfilerWorkflow.create(exclude_filter_table_config)
    assert len(list(exclude_filter_table_workflow.filter_entities(all_tables))) == 0

    # We can include based on the table name
    include_filter_table_config = deepcopy(config)
    include_filter_table_config["source"]["config"]["table_filter_pattern"] = {
        "includes": ["table1"]
    }

    include_filter_table_workflow = ProfilerWorkflow.create(include_filter_table_config)
    assert len(list(include_filter_table_workflow.filter_entities(all_tables))) == 1


def test_profile_def():
    """
    Validate the definitions of the profile in the JSON
    """
    profile_config = deepcopy(config)
    profile_config["profiler"] = {
        "name": "my_profiler",
        "table_metrics": ["row_number"],
        "metrics": ["min", "COUNT"],
    }

    profile_workflow = ProfilerWorkflow.create(profile_config)

    profile_definition = ProfilerDef(
        name="my_profiler",
        table_metrics=["ROW_NUMBER"],
        metrics=["MIN", "COUNT"],
        time_metrics=None,
        custom_metrics=None,
    )

    assert profile_workflow.config.profiler == profile_definition


def test_tests_def():
    """
    Validate the test case definition
    """
    test_config = deepcopy(config)
    test_config["tests"] = {
        "name": "my_tests",
        "table_tests": [
            {
                "name": "first_test",
                "table": "service.db.name",
                "expression": "row_number > 100",
                "enabled": False,
            },
            {
                "name": "another_test",
                "table": "service.db.name",
                "expression": "row_number > 1000 & row_number < 2000",
            },
        ],
        "column_tests": [
            {
                "table": "service.db.name",
                "name": "set_of_col_tests",
                "columns": [
                    {
                        "name": "first_col_test",
                        "column": "column_name_1",
                        "expression": "min > 5",
                    },
                    {
                        "name": "another_col_test",
                        "column": "column_name_1",
                        "expression": "min > 5 & min < 10",
                    },
                    {
                        "name": "second_col_test",
                        "column": "column_name_2",
                        "expression": "null_ratio < 0.1",
                    },
                ],
            }
        ],
    }

    test_workflow = ProfilerWorkflow.create(test_config)

    tests = test_workflow.config.tests

    assert tests.name == "my_tests"

    # Check cardinality
    assert len(tests.table_tests) == 2
    assert len(tests.column_tests) == 1
    assert len(tests.column_tests[0].columns) == 3

    assert tests.table_tests[0].name == "first_test"
    assert tests.table_tests[0].table == "service.db.name"
    assert tests.table_tests[0].expression[0].metric == "ROWNUMBER"
    assert not tests.table_tests[0].enabled

    assert tests.column_tests[0].columns[0].name == "first_col_test"
    assert tests.column_tests[0].columns[0].column == "column_name_1"
    assert tests.column_tests[0].columns[0].expression[0].metric == "MIN"
    assert tests.column_tests[0].columns[0].enabled

    # We cannot do a 1:1 general assertion because we are dynamically
    # creating the Validation classes. Then, the internal IDs don't match
    # and the assertion fails. However, and for visual representation,
    # the resulting class looks like follows:

    # TestDef(
    #     name="my_tests",
    #     table_tests=[
    #         # I can have multiple tests on the same table
    #         TableTest(
    #             name="first_test",
    #             table="service.db.name",
    #             expression="row_number > 100",  # This will be one Validation
    #             enabled=False,
    #         ),
    #         TableTest(
    #             name="another_test",
    #             table="service.db.name",
    #             expression="row_number > 1000 & row_number < 2000",  # This will be two Validations
    #         ),
    #     ],
    #     column_tests=[
    #         ColumnTest(
    #             table="service.db.name",
    #             name="set_of_col_tests",
    #             columns=[
    #                 ColumnTestExpression(
    #                     name="first_col_test",
    #                     column="column_name_1",
    #                     expression="min > 5",  # One Validation
    #                 ),
    #                 ColumnTestExpression(
    #                     name="another_col_test",
    #                     column="column_name_1",
    #                     expression="min > 5 & min < 10",  # Two Validations
    #                 ),
    #                 ColumnTestExpression(
    #                     name="second_col_test",
    #                     column="column_name_2",
    #                     expression="null_ratio < 0.1",  # One Validation
    #                 ),
    #             ],
    #         )
    #     ],
    # )
Fix #2845 - Init Profiler Workflow (#2862) * Fix list typing * Add sqlite service * Add sqlite service * Add sqlite service * Refactor validation into class * Refactor validation into class * Prepare table simple profiler * Add note * test ORM conversion * Prepare workflow config utilities * Prepare workflow skeleton * Use new core Validation * Refactor workflow config parsing * Add comment * Simplify workflow validations * Fix table metric check * Add init for convenience, otherwise interpreter cries when trying to __call__ the get result * Fix table metric check * Format * Format * Fix table list and metrics init * Prepare profiler workflow integration tests * Bump version * Fix pycharm imports * format 2022-02-20 17:55:12 +01:00			`# Copyright 2021 Collate`
			`# Licensed under the Apache License, Version 2.0 (the "License");`
			`# you may not use this file except in compliance with the License.`
			`# You may obtain a copy of the License at`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`

			`"""`
			`Validate workflow configs and filters`
			`"""`
			`import uuid`
			`from copy import deepcopy`

			`from metadata.generated.schema.entity.data.table import Column, DataType, Table`
			`from metadata.generated.schema.type.entityReference import EntityReference`
			`from metadata.ingestion.ometa.openmetadata_rest import MetadataServerConfig`
			`from metadata.ingestion.source.sqlite import SQLiteConfig`
			`from metadata.orm_profiler.api.workflow import ProfilerWorkflow`
			`from metadata.orm_profiler.profiles.models import ProfilerDef`

			`config = {`
			`"source": {"type": "sqlite", "config": {"service_name": "my_service"}},`
			`"sink": {"type": "metadata-rest", "config": {}},`
			`"metadata_server": {`
			`"type": "metadata-server",`
			`"config": {`
			`"api_endpoint": "http://localhost:8585/api",`
			`"auth_provider_type": "no-auth",`
			`},`
			`},`
			`}`

			`workflow = ProfilerWorkflow.create(config)`


			`def test_init_workflow():`
			`"""`
			`We can initialise the workflow from a config`
			`"""`
			`assert isinstance(workflow.source_config, SQLiteConfig)`
			`assert isinstance(workflow.metadata_config, MetadataServerConfig)`
			`assert workflow.config.profiler is None`
			`assert workflow.config.tests is None`


			`def test_filter_entities():`
			`"""`
			`We can properly filter entities depending on the`
			`workflow configuration`
			`"""`

			`service_name = "service"`
			`db_reference1 = EntityReference(id=uuid.uuid4(), name="one_db", type="database")`
			`db_reference2 = EntityReference(id=uuid.uuid4(), name="another_db", type="database")`

			`all_tables = [`
			`Table(`
			`id=uuid.uuid4(),`
			`name="table1",`
			`database=db_reference1,`
			`fullyQualifiedName=f"{service_name}.{db_reference1.name}.table1",`
			`columns=[Column(name="id", dataType=DataType.BIGINT)],`
			`),`
			`Table(`
			`id=uuid.uuid4(),`
			`name="table2",`
			`database=db_reference1,`
			`fullyQualifiedName=f"{service_name}.{db_reference1.name}.table2",`
			`columns=[Column(name="id", dataType=DataType.BIGINT)],`
			`),`
			`Table(`
			`id=uuid.uuid4(),`
			`name="table3",`
			`database=db_reference2,`
			`fullyQualifiedName=f"{service_name}.{db_reference2.name}.table3",`
			`columns=[Column(name="id", dataType=DataType.BIGINT)],`
			`),`
			`]`

			`# Simple workflow does not filter`
			`assert len(list(workflow.filter_entities(all_tables))) == 3`

			`# We can exclude based on the schema name`
			`exclude_filter_schema_config = deepcopy(config)`
			`exclude_filter_schema_config["source"]["config"]["schema_filter_pattern"] = {`
			`"excludes": ["one_db"]`
			`}`

			`exclude_filter_schema_workflow = ProfilerWorkflow.create(`
			`exclude_filter_schema_config`
			`)`
			`assert len(list(exclude_filter_schema_workflow.filter_entities(all_tables))) == 1`

			`# We can include based on the schema name`
			`include_filter_schema_config = deepcopy(config)`
			`include_filter_schema_config["source"]["config"]["schema_filter_pattern"] = {`
			`"includes": ["another_db"]`
			`}`

			`include_filter_schema_workflow = ProfilerWorkflow.create(`
			`include_filter_schema_config`
			`)`
			`assert len(list(include_filter_schema_workflow.filter_entities(all_tables))) == 1`

			`# We can exclude based on the table name`
			`exclude_filter_table_config = deepcopy(config)`
			`exclude_filter_table_config["source"]["config"]["table_filter_pattern"] = {`
			`"excludes": ["tab*"]`
			`}`

			`exclude_filter_table_workflow = ProfilerWorkflow.create(exclude_filter_table_config)`
			`assert len(list(exclude_filter_table_workflow.filter_entities(all_tables))) == 0`

			`# We can include based on the table name`
			`include_filter_table_config = deepcopy(config)`
			`include_filter_table_config["source"]["config"]["table_filter_pattern"] = {`
			`"includes": ["table1"]`
			`}`

			`include_filter_table_workflow = ProfilerWorkflow.create(include_filter_table_config)`
			`assert len(list(include_filter_table_workflow.filter_entities(all_tables))) == 1`


			`def test_profile_def():`
			`"""`
			`Validate the definitions of the profile in the JSON`
			`"""`
			`profile_config = deepcopy(config)`
			`profile_config["profiler"] = {`
			`"name": "my_profiler",`
			`"table_metrics": ["row_number"],`
			`"metrics": ["min", "COUNT"],`
			`}`

			`profile_workflow = ProfilerWorkflow.create(profile_config)`

			`profile_definition = ProfilerDef(`
			`name="my_profiler",`
			`table_metrics=["ROW_NUMBER"],`
			`metrics=["MIN", "COUNT"],`
			`time_metrics=None,`
			`custom_metrics=None,`
			`)`

			`assert profile_workflow.config.profiler == profile_definition`


			`def test_tests_def():`
			`"""`
			`Validate the test case definition`
			`"""`
			`test_config = deepcopy(config)`
			`test_config["tests"] = {`
			`"name": "my_tests",`
			`"table_tests": [`
			`{`
			`"name": "first_test",`
			`"table": "service.db.name",`
			`"expression": "row_number > 100",`
			`"enabled": False,`
			`},`
			`{`
			`"name": "another_test",`
			`"table": "service.db.name",`
			`"expression": "row_number > 1000 & row_number < 2000",`
			`},`
			`],`
			`"column_tests": [`
			`{`
			`"table": "service.db.name",`
			`"name": "set_of_col_tests",`
			`"columns": [`
			`{`
			`"name": "first_col_test",`
			`"column": "column_name_1",`
			`"expression": "min > 5",`
			`},`
			`{`
			`"name": "another_col_test",`
			`"column": "column_name_1",`
			`"expression": "min > 5 & min < 10",`
			`},`
			`{`
			`"name": "second_col_test",`
			`"column": "column_name_2",`
			`"expression": "null_ratio < 0.1",`
			`},`
			`],`
			`}`
			`],`
			`}`

			`test_workflow = ProfilerWorkflow.create(test_config)`

			`tests = test_workflow.config.tests`

			`assert tests.name == "my_tests"`

			`# Check cardinality`
			`assert len(tests.table_tests) == 2`
			`assert len(tests.column_tests) == 1`
			`assert len(tests.column_tests[0].columns) == 3`

			`assert tests.table_tests[0].name == "first_test"`
			`assert tests.table_tests[0].table == "service.db.name"`
			`assert tests.table_tests[0].expression[0].metric == "ROWNUMBER"`
			`assert not tests.table_tests[0].enabled`

			`assert tests.column_tests[0].columns[0].name == "first_col_test"`
			`assert tests.column_tests[0].columns[0].column == "column_name_1"`
			`assert tests.column_tests[0].columns[0].expression[0].metric == "MIN"`
			`assert tests.column_tests[0].columns[0].enabled`

			`# We cannot do a 1:1 general assertion because we are dynamically`
			`# creating the Validation classes. Then, the internal IDs don't match`
			`# and the assertion fails. However, and for visual representation,`
			`# the resulting class looks like follows:`

			`# TestDef(`
			`# name="my_tests",`
			`# table_tests=[`
			`# # I can have multiple tests on the same table`
			`# TableTest(`
			`# name="first_test",`
			`# table="service.db.name",`
			`# expression="row_number > 100", # This will be one Validation`
			`# enabled=False,`
			`# ),`
			`# TableTest(`
			`# name="another_test",`
			`# table="service.db.name",`
			`# expression="row_number > 1000 & row_number < 2000", # This will be two Validations`
			`# ),`
			`# ],`
			`# column_tests=[`
			`# ColumnTest(`
			`# table="service.db.name",`
			`# name="set_of_col_tests",`
			`# columns=[`
			`# ColumnTestExpression(`
			`# name="first_col_test",`
			`# column="column_name_1",`
			`# expression="min > 5", # One Validation`
			`# ),`
			`# ColumnTestExpression(`
			`# name="another_col_test",`
			`# column="column_name_1",`
			`# expression="min > 5 & min < 10", # Two Validations`
			`# ),`
			`# ColumnTestExpression(`
			`# name="second_col_test",`
			`# column="column_name_2",`
			`# expression="null_ratio < 0.1", # One Validation`
			`# ),`
			`# ],`
			`# )`
			`# ],`
			`# )`