#  Copyright 2021 Collate
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#  http://www.apache.org/licenses/LICENSE-2.0
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
"""
Validate workflow configs and filters
"""
import uuid
from copy import deepcopy

from metadata.generated.schema.entity.data.table import Column, DataType, Table
from metadata.generated.schema.type.entityReference import EntityReference
from metadata.ingestion.ometa.openmetadata_rest import MetadataServerConfig
from metadata.ingestion.source.sqlite import SQLiteConfig
from metadata.orm_profiler.api.workflow import ProfilerWorkflow
from metadata.orm_profiler.profiles.models import ProfilerDef

# Minimal workflow configuration shared (via deepcopy) by every test below
config = {
    "source": {"type": "sqlite", "config": {"service_name": "my_service"}},
    "sink": {"type": "metadata-rest", "config": {}},
    "metadata_server": {
        "type": "metadata-server",
        "config": {
            "api_endpoint": "http://localhost:8585/api",
            "auth_provider_type": "no-auth",
        },
    },
}

# Baseline workflow with no profiler, tests or filters configured
workflow = ProfilerWorkflow.create(config)


def test_init_workflow():
    """
    We can initialise the workflow from a config
    """
    assert isinstance(workflow.source_config, SQLiteConfig)
    assert isinstance(workflow.metadata_config, MetadataServerConfig)

    assert workflow.config.profiler is None
    assert workflow.config.tests is None


def test_filter_entities():
    """
    We can properly filter entities depending on the workflow configuration
    """
    service_name = "service"

    db_one = EntityReference(id=uuid.uuid4(), name="one_db", type="database")
    db_two = EntityReference(id=uuid.uuid4(), name="another_db", type="database")

    def _table(name, db_ref):
        # Minimal Table entity: one BIGINT column, FQN derived from its database
        return Table(
            id=uuid.uuid4(),
            name=name,
            database=db_ref,
            fullyQualifiedName=f"{service_name}.{db_ref.name}.{name}",
            columns=[Column(name="id", dataType=DataType.BIGINT)],
        )

    all_tables = [
        _table("table1", db_one),
        _table("table2", db_one),
        _table("table3", db_two),
    ]

    def _filtered_workflow(pattern_key, pattern):
        # Clone the base config, attach a single filter pattern, build a workflow
        cfg = deepcopy(config)
        cfg["source"]["config"][pattern_key] = pattern
        return ProfilerWorkflow.create(cfg)

    # The baseline workflow has no filters, so nothing is dropped
    assert len(list(workflow.filter_entities(all_tables))) == 3

    # Excluding a schema drops every table that lives in it
    flt = _filtered_workflow("schema_filter_pattern", {"excludes": ["one_db"]})
    assert len(list(flt.filter_entities(all_tables))) == 1

    # Including a schema keeps only the tables that live in it
    flt = _filtered_workflow("schema_filter_pattern", {"includes": ["another_db"]})
    assert len(list(flt.filter_entities(all_tables))) == 1

    # Table exclusion patterns support wildcards
    flt = _filtered_workflow("table_filter_pattern", {"excludes": ["tab*"]})
    assert len(list(flt.filter_entities(all_tables))) == 0

    # Table inclusion patterns match by name
    flt = _filtered_workflow("table_filter_pattern", {"includes": ["table1"]})
    assert len(list(flt.filter_entities(all_tables))) == 1


def test_profile_def():
    """
    Validate the definitions of the profile in the JSON
    """
    cfg = deepcopy(config)
    cfg["profiler"] = {
        "name": "my_profiler",
        "table_metrics": ["row_number"],
        "metrics": ["min", "COUNT"],
    }
    profile_workflow = ProfilerWorkflow.create(cfg)

    # Metric names are normalised to upper case by the workflow
    expected = ProfilerDef(
        name="my_profiler",
        table_metrics=["ROW_NUMBER"],
        metrics=["MIN", "COUNT"],
        time_metrics=None,
        custom_metrics=None,
    )

    assert profile_workflow.config.profiler == expected


def test_tests_def():
    """
    Validate the test case definition
    """
    cfg = deepcopy(config)
    cfg["tests"] = {
        "name": "my_tests",
        "table_tests": [
            {
                "name": "first_test",
                "table": "service.db.name",
                "expression": "row_number > 100",
                "enabled": False,
            },
            {
                "name": "another_test",
                "table": "service.db.name",
                "expression": "row_number > 1000 & row_number < 2000",
            },
        ],
        "column_tests": [
            {
                "table": "service.db.name",
                "name": "set_of_col_tests",
                "columns": [
                    {
                        "name": "first_col_test",
                        "column": "column_name_1",
                        "expression": "min > 5",
                    },
                    {
                        "name": "another_col_test",
                        "column": "column_name_1",
                        "expression": "min > 5 & min < 10",
                    },
                    {
                        "name": "second_col_test",
                        "column": "column_name_2",
                        "expression": "null_ratio < 0.1",
                    },
                ],
            }
        ],
    }

    test_workflow = ProfilerWorkflow.create(cfg)
    tests = test_workflow.config.tests

    assert tests.name == "my_tests"

    # Check cardinality
    assert len(tests.table_tests) == 2
    assert len(tests.column_tests) == 1
    assert len(tests.column_tests[0].columns) == 3

    first_table_test = tests.table_tests[0]
    assert first_table_test.name == "first_test"
    assert first_table_test.table == "service.db.name"
    assert first_table_test.expression[0].metric == "ROWNUMBER"
    assert not first_table_test.enabled

    first_col_test = tests.column_tests[0].columns[0]
    assert first_col_test.name == "first_col_test"
    assert first_col_test.column == "column_name_1"
    assert first_col_test.expression[0].metric == "MIN"
    assert first_col_test.enabled

    # We cannot do a 1:1 general assertion because we are dynamically
    # creating the Validation classes. Then, the internal IDs don't match
    # and the assertion fails. However, and for visual representation,
    # the resulting class looks like follows:
    # TestDef(
    #     name="my_tests",
    #     table_tests=[  # I can have multiple tests on the same table
    #         TableTest(
    #             name="first_test",
    #             table="service.db.name",
    #             expression="row_number > 100",  # This will be one Validation
    #             enabled=False,
    #         ),
    #         TableTest(
    #             name="another_test",
    #             table="service.db.name",
    #             expression="row_number > 1000 & row_number < 2000",  # This will be two Validations
    #         ),
    #     ],
    #     column_tests=[
    #         ColumnTest(
    #             table="service.db.name",
    #             name="set_of_col_tests",
    #             columns=[
    #                 ColumnTestExpression(
    #                     name="first_col_test",
    #                     column="column_name_1",
    #                     expression="min > 5",  # One Validation
    #                 ),
    #                 ColumnTestExpression(
    #                     name="another_col_test",
    #                     column="column_name_1",
    #                     expression="min > 5 & min < 10",  # Two Validations
    #                 ),
    #                 ColumnTestExpression(
    #                     name="second_col_test",
    #                     column="column_name_2",
    #                     expression="null_ratio < 0.1",  # One Validation
    #                 ),
    #             ],
    #         )
    #     ],
    # )