From 2444b884bf7809e268695b3245e86bf44dbfbfe4 Mon Sep 17 00:00:00 2001 From: Pere Miquel Brull Date: Mon, 7 Mar 2022 07:19:13 +0100 Subject: [PATCH] Fix #3150 - Missing count & value length tests (#3193) Fix #3150 - Missing count & value length tests (#3193) --- .../column_values_length_to_be_between.py | 70 ++++++++++++++ ...column_values_missing_count_to_be_equal.py | 96 +++++++++++++++++++ .../column/column_values_not_in_set.py | 25 ++--- .../column/column_values_to_match_regex.py | 20 ++-- .../metadata/orm_profiler/validations/core.py | 8 ++ .../orm_profiler/validations/utils.py | 54 +++++++++++ .../ometa/test_ometa_database_service_api.py | 1 - .../unit/profiler/test_session_validations.py | 76 ++++++++++++++- .../tests/unit/profiler/test_validations.py | 53 ++++++++++ 9 files changed, 369 insertions(+), 34 deletions(-) create mode 100644 ingestion/src/metadata/orm_profiler/validations/column/column_values_length_to_be_between.py create mode 100644 ingestion/src/metadata/orm_profiler/validations/column/column_values_missing_count_to_be_equal.py create mode 100644 ingestion/src/metadata/orm_profiler/validations/utils.py diff --git a/ingestion/src/metadata/orm_profiler/validations/column/column_values_length_to_be_between.py b/ingestion/src/metadata/orm_profiler/validations/column/column_values_length_to_be_between.py new file mode 100644 index 00000000000..27220896275 --- /dev/null +++ b/ingestion/src/metadata/orm_profiler/validations/column/column_values_length_to_be_between.py @@ -0,0 +1,70 @@ +# Copyright 2021 Collate +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+ColumnValueLengthsToBeBetween validation implementation
+"""
+
+from datetime import datetime
+
+from metadata.generated.schema.entity.data.table import ColumnProfile
+from metadata.generated.schema.tests.basic import TestCaseResult, TestCaseStatus
+from metadata.generated.schema.tests.column.columnValuesLengthsToBeBetween import (
+    ColumnValueLengthsToBeBetween,
+)
+from metadata.generated.schema.tests.column.columnValuesToBeBetween import (
+    ColumnValuesToBeBetween,  # NOTE(review): not referenced anywhere in this module
+)
+from metadata.orm_profiler.utils import logger
+
+logger = logger()
+
+
+def column_value_length_to_be_between(
+    test_case: ColumnValueLengthsToBeBetween,
+    col_profile: ColumnProfile,
+    execution_date: datetime,
+    **__,
+) -> TestCaseResult:
+    """
+    Validate that the profiled minLength & maxLength fall within the test case bounds
+    :param test_case: ColumnValueLengthsToBeBetween carrying the expected minValue & maxValue
+    :param col_profile: should contain minLength & maxLength metrics
+    :param execution_date: Datetime when the tests ran, used as the result executionTime
+    :return: TestCaseResult, Aborted if a length metric is missing, otherwise Success or Failed
+    """
+
+    if col_profile.minLength is None or col_profile.maxLength is None:
+        msg = (
+            "We expect `minLength` & `maxLength` to be informed on the profiler for ColumnValueLengthsToBeBetween"
+            + f" but got minLength={col_profile.minLength}, maxLength={col_profile.maxLength}."
+        )
+        logger.error(msg)
+        return TestCaseResult(
+            executionTime=execution_date.timestamp(),
+            testCaseStatus=TestCaseStatus.Aborted,
+            result=msg,
+        )
+
+    status = (
+        TestCaseStatus.Success
+        if col_profile.minLength >= test_case.minValue
+        and col_profile.maxLength <= test_case.maxValue
+        else TestCaseStatus.Failed
+    )
+    result = (
+        f"Found minLength={col_profile.minLength}, maxLength={col_profile.maxLength} vs."
+        + f" the expected minLength={test_case.minValue}, maxLength={test_case.maxValue}."
+    )
+
+    return TestCaseResult(
+        executionTime=execution_date.timestamp(), testCaseStatus=status, result=result
+    )
diff --git a/ingestion/src/metadata/orm_profiler/validations/column/column_values_missing_count_to_be_equal.py b/ingestion/src/metadata/orm_profiler/validations/column/column_values_missing_count_to_be_equal.py
new file mode 100644
index 00000000000..e284c686adc
--- /dev/null
+++ b/ingestion/src/metadata/orm_profiler/validations/column/column_values_missing_count_to_be_equal.py
@@ -0,0 +1,96 @@
+# Copyright 2021 Collate
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+ColumnValuesMissingCount validation implementation
+"""
+
+from datetime import datetime
+from typing import Optional
+
+from sqlalchemy.orm import DeclarativeMeta, Session
+
+from metadata.generated.schema.entity.data.table import ColumnProfile
+from metadata.generated.schema.tests.basic import TestCaseResult, TestCaseStatus
+from metadata.generated.schema.tests.column.columnValuesMissingCountToBeEqual import (
+    ColumnValuesMissingCount,
+)
+from metadata.orm_profiler.metrics.core import add_props
+from metadata.orm_profiler.metrics.registry import Metrics
+from metadata.orm_profiler.utils import logger
+from metadata.orm_profiler.validations.utils import run_col_metric
+
+logger = logger()
+
+
+def column_values_missing_count_to_be_equal(
+    test_case: ColumnValuesMissingCount,
+    col_profile: ColumnProfile,
+    execution_date: datetime,
+    session: Optional[Session] = None,
+    table: Optional[DeclarativeMeta] = None,
+) -> TestCaseResult:
+    """
+    Validate that nullCount, plus the count of missingValueMatch values if given, equals missingCountValue
+    :param test_case: ColumnValuesMissingCount carrying missingCountValue and the optional missingValueMatch
+    :param col_profile: should contain the nullCount metric; its name selects the column to query
+    :param execution_date: Datetime when the tests ran, used as the result executionTime
+    :param session: SQLAlchemy Session, used to run COUNT_IN_SET. NOTE(review): defaults to None but the except branch calls session.rollback()
+    :param table: SQLAlchemy Table, used to run COUNT_IN_SET. NOTE(review): defaults to None but the except branch reads table.__tablename__
+    :return: TestCaseResult, Aborted if nullCount is missing or the metric fails, otherwise Success or Failed
+    """
+
+    if col_profile.nullCount is None:
+        msg = "We expect `nullCount` to be informed on the profiler for ColumnValuesMissingCount."
+        logger.error(msg)
+        return TestCaseResult(
+            executionTime=execution_date.timestamp(),
+            testCaseStatus=TestCaseStatus.Aborted,
+            result=msg,
+        )
+
+    missing_count = col_profile.nullCount
+    if test_case.missingValueMatch:
+        set_count = add_props(values=test_case.missingValueMatch)(
+            Metrics.COUNT_IN_SET.value
+        )
+
+        try:
+            set_count_res = run_col_metric(
+                metric=set_count,
+                session=session,
+                table=table,
+                column=col_profile.name,
+            )
+
+            # Rows matching any missingValueMatch value are counted as missing on top of the nulls
+            missing_count += set_count_res
+
+        except Exception as err:  # pylint: disable=broad-except
+            session.rollback()
+            msg = f"Error computing {test_case.__class__.__name__} for {table.__tablename__}.{col_profile.name} - {err}"
+            logger.error(msg)
+            return TestCaseResult(
+                executionTime=execution_date.timestamp(),
+                testCaseStatus=TestCaseStatus.Aborted,
+                result=msg,
+            )
+
+    status = (
+        TestCaseStatus.Success
+        if missing_count == test_case.missingCountValue
+        else TestCaseStatus.Failed
+    )
+    result = f"Found missingCount={missing_count}. It should be {test_case.missingCountValue}."
+
+    return TestCaseResult(
+        executionTime=execution_date.timestamp(), testCaseStatus=status, result=result
+    )
diff --git a/ingestion/src/metadata/orm_profiler/validations/column/column_values_not_in_set.py b/ingestion/src/metadata/orm_profiler/validations/column/column_values_not_in_set.py
index c02e9f35f19..bbaa5b9bed9 100644
--- a/ingestion/src/metadata/orm_profiler/validations/column/column_values_not_in_set.py
+++ b/ingestion/src/metadata/orm_profiler/validations/column/column_values_not_in_set.py
@@ -16,7 +16,6 @@ ColumnValuesToBeNotNull validation implementation
 from datetime import datetime
 from typing import Optional
 
-from sqlalchemy import inspect
 from sqlalchemy.orm import DeclarativeMeta, Session
 
 from metadata.generated.schema.entity.data.table import ColumnProfile
@@ -26,8 +25,8 @@ from metadata.generated.schema.tests.column.columnValuesToBeNotInSet import (
 )
 from metadata.orm_profiler.metrics.core import add_props
 from metadata.orm_profiler.metrics.registry import Metrics
-from metadata.orm_profiler.profiles.core import Profiler
 from metadata.orm_profiler.utils import logger
+from metadata.orm_profiler.validations.utils import run_col_metric
 
 logger = logger()
 
@@ -52,26 +51,16 @@ def column_values_not_in_set(
     set_count = add_props(values=test_case.forbiddenValues)(Metrics.COUNT_IN_SET.value)
 
     try:
-        col = next(
-            iter([col for col in inspect(table).c if col.name == col_profile.name]),
-            None,
+        set_count_res = run_col_metric(
+            metric=set_count,
+            session=session,
+            table=table,
+            column=col_profile.name,
         )
 
-        if col is None:
-            raise ValueError(
-                f"Cannot find the configured column {col_profile.name} for ColumnValuesToBeNotInSet"
-            )
-
-        res = (
-            Profiler(set_count, session=session, table=table, use_cols=[col])
-            .execute()
-            .column_results
-        )
-        set_count_res = res.get(col.name)[Metrics.COUNT_IN_SET.name]
-
     except Exception as err:  # pylint: disable=broad-except
         session.rollback()
-        msg = f"Error computing ColumnValuesToBeNotInSet for
{col_profile.name} - {err}" + msg = f"Error computing {test_case.__class__.__name__} for {table.__tablename__}.{col_profile.name} - {err}" logger.error(msg) return TestCaseResult( executionTime=execution_date.timestamp(), diff --git a/ingestion/src/metadata/orm_profiler/validations/column/column_values_to_match_regex.py b/ingestion/src/metadata/orm_profiler/validations/column/column_values_to_match_regex.py index 3472dde685f..2424ae50e9f 100644 --- a/ingestion/src/metadata/orm_profiler/validations/column/column_values_to_match_regex.py +++ b/ingestion/src/metadata/orm_profiler/validations/column/column_values_to_match_regex.py @@ -28,6 +28,7 @@ from metadata.orm_profiler.metrics.core import add_props from metadata.orm_profiler.metrics.registry import Metrics from metadata.orm_profiler.profiles.core import Profiler from metadata.orm_profiler.utils import logger +from metadata.orm_profiler.validations.utils import run_col_metric logger = logger() @@ -61,22 +62,13 @@ def column_values_to_match_regex( ) try: - col = next( - iter([col for col in inspect(table).c if col.name == col_profile.name]), - None, - ) - if col is None: - raise ValueError( - f"Cannot find the configured column {col_profile.name} for ColumnValuesToMatchRegex" - ) - - res = ( - Profiler(like_count, session=session, table=table, use_cols=[col]) - .execute() - .column_results + like_count_res = run_col_metric( + metric=like_count, + session=session, + table=table, + column=col_profile.name, ) - like_count_res = res.get(col.name)[Metrics.LIKE_COUNT.name] except Exception as err: # pylint: disable=broad-except session.rollback() diff --git a/ingestion/src/metadata/orm_profiler/validations/core.py b/ingestion/src/metadata/orm_profiler/validations/core.py index 2416c1667ca..8be483a9a29 100644 --- a/ingestion/src/metadata/orm_profiler/validations/core.py +++ b/ingestion/src/metadata/orm_profiler/validations/core.py @@ -24,6 +24,12 @@ from functools import singledispatch from 
metadata.generated.schema.tests.basic import TestCaseResult from metadata.orm_profiler.utils import logger +from metadata.orm_profiler.validations.column.column_values_length_to_be_between import ( + column_value_length_to_be_between, +) +from metadata.orm_profiler.validations.column.column_values_missing_count_to_be_equal import ( + column_values_missing_count_to_be_equal, +) from metadata.orm_profiler.validations.column.column_values_not_in_set import ( column_values_not_in_set, ) @@ -73,7 +79,9 @@ validate.register(table_column_count_to_equal) validate.register(column_values_to_be_between) validate.register(column_values_to_be_unique) validate.register(column_values_to_be_not_null) +validate.register(column_value_length_to_be_between) # Column Session Tests validate.register(column_values_not_in_set) validate.register(column_values_to_match_regex) +validate.register(column_values_missing_count_to_be_equal) diff --git a/ingestion/src/metadata/orm_profiler/validations/utils.py b/ingestion/src/metadata/orm_profiler/validations/utils.py new file mode 100644 index 00000000000..13c231a1e0d --- /dev/null +++ b/ingestion/src/metadata/orm_profiler/validations/utils.py @@ -0,0 +1,54 @@ +# Copyright 2021 Collate +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+"""
+Validation Utilities
+"""
+
+from typing import Type
+
+from sqlalchemy import inspect
+from sqlalchemy.orm import DeclarativeMeta, Session
+
+from metadata.orm_profiler.metrics.core import Metric
+from metadata.orm_profiler.profiles.core import Profiler
+
+
+def run_col_metric(
+    metric: Type[Metric], session: Session, table: DeclarativeMeta, column: str
+) -> int:
+    """
+    Runs a single metric on one table column through a Profiler and returns that metric's result
+
+    :param metric: Metric to run; its name() is the key used to read the value from the column results
+    :param session: SQLAlchemy session
+    :param table: ORM table
+    :param column: column name; ValueError is raised when the table has no column with this name
+    :return: metric result. NOTE(review): the ValueError text always names ColumnValuesToBeNotInSet, whichever test called this helper
+    """
+
+    col = next(
+        iter([col for col in inspect(table).c if col.name == column]),
+        None,
+    )
+
+    if col is None:
+        raise ValueError(
+            f"Cannot find the configured column {column} for ColumnValuesToBeNotInSet"
+        )
+
+    res = (
+        Profiler(metric, session=session, table=table, use_cols=[col])
+        .execute()
+        .column_results
+    )
+
+    return res.get(col.name)[metric.name()]
diff --git a/ingestion/tests/integration/ometa/test_ometa_database_service_api.py b/ingestion/tests/integration/ometa/test_ometa_database_service_api.py
index b056f4a9417..56b85e0c6c1 100644
--- a/ingestion/tests/integration/ometa/test_ometa_database_service_api.py
+++ b/ingestion/tests/integration/ometa/test_ometa_database_service_api.py
@@ -101,7 +101,6 @@ class OMetaDatabaseServiceTest(TestCase):
             name="test-db-service",
             serviceType=DatabaseServiceType.MySQL,
             databaseConnection=new_connection,
-            href="http://resource-uri/",
         )
 
         updated_res = self.metadata.create_or_update(data=update_request)
diff --git a/ingestion/tests/unit/profiler/test_session_validations.py b/ingestion/tests/unit/profiler/test_session_validations.py
index 2cdd2c733f5..a6d70174ccf 100644
--- a/ingestion/tests/unit/profiler/test_session_validations.py
+++ b/ingestion/tests/unit/profiler/test_session_validations.py
@@ -20,6 +20,9 @@ from sqlalchemy.orm import declarative_base
 
 from metadata.generated.schema.entity.data.table import ColumnProfile
from metadata.generated.schema.tests.basic import TestCaseResult, TestCaseStatus
+from metadata.generated.schema.tests.column.columnValuesMissingCountToBeEqual import (
+    ColumnValuesMissingCount,
+)
 from metadata.generated.schema.tests.column.columnValuesToBeNotInSet import (
     ColumnValuesToBeNotInSet,
 )
@@ -123,7 +126,7 @@ class MetricsTest(TestCase):
             executionTime=EXECUTION_DATE.timestamp(),
             testCaseStatus=TestCaseStatus.Aborted,
             result=(
-                "Error computing ColumnValuesToBeNotInSet for random - Cannot find"
+                "Error computing ColumnValuesToBeNotInSet for users.random - Cannot find"
                 + " the configured column random for ColumnValuesToBeNotInSet"
             ),
         )
@@ -177,3 +180,74 @@ class MetricsTest(TestCase):
                 "We expect `valuesCount` to be informed for ColumnValuesToMatchRegex."
             ),
         )
+
+    def test_column_values_missing_count_to_be_equal(self):
+        """
+        Check the Success (with and without missingValueMatch), Failed and Aborted results of ColumnValuesMissingCount
+        """
+        column_profile = ColumnProfile(name="nickname", nullCount=1)
+
+        res_ok = validate(
+            ColumnValuesMissingCount(missingCountValue=1),
+            col_profile=column_profile,
+            execution_date=EXECUTION_DATE,
+            session=self.session,
+            table=User,
+        )
+
+        assert res_ok == TestCaseResult(
+            executionTime=EXECUTION_DATE.timestamp(),
+            testCaseStatus=TestCaseStatus.Success,
+            result="Found missingCount=1.0. It should be 1.",
+        )
+
+        res_ok_2 = validate(
+            ColumnValuesMissingCount(
+                missingCountValue=2,
+                missingValueMatch=["johnny b goode"],
+            ),
+            col_profile=column_profile,
+            execution_date=EXECUTION_DATE,
+            session=self.session,
+            table=User,
+        )
+
+        assert res_ok_2 == TestCaseResult(
+            executionTime=EXECUTION_DATE.timestamp(),
+            testCaseStatus=TestCaseStatus.Success,
+            result="Found missingCount=2.0. It should be 2.",
+        )
+
+        res_ko = validate(
+            ColumnValuesMissingCount(
+                missingCountValue=0,
+            ),
+            col_profile=column_profile,
+            execution_date=EXECUTION_DATE,
+            session=self.session,
+            table=User,
+        )
+
+        assert res_ko == TestCaseResult(
+            executionTime=EXECUTION_DATE.timestamp(),
+            testCaseStatus=TestCaseStatus.Failed,
+            result="Found missingCount=1.0. It should be 0.",
+        )
+
+        res_aborted = validate(
+            ColumnValuesMissingCount(
+                missingCountValue=0,
+            ),
+            col_profile=ColumnProfile(name="nickname"),
+            execution_date=EXECUTION_DATE,
+            session=self.session,
+            table=User,
+        )
+
+        assert res_aborted == TestCaseResult(
+            executionTime=EXECUTION_DATE.timestamp(),
+            testCaseStatus=TestCaseStatus.Aborted,
+            result=(
+                "We expect `nullCount` to be informed on the profiler for ColumnValuesMissingCount."
+            ),
+        )
diff --git a/ingestion/tests/unit/profiler/test_validations.py b/ingestion/tests/unit/profiler/test_validations.py
index 314982b13c4..9188acc987c 100644
--- a/ingestion/tests/unit/profiler/test_validations.py
+++ b/ingestion/tests/unit/profiler/test_validations.py
@@ -19,6 +19,9 @@ from datetime import datetime
 
 from metadata.generated.schema.entity.data.table import ColumnProfile, TableProfile
 from metadata.generated.schema.tests.basic import TestCaseResult, TestCaseStatus
+from metadata.generated.schema.tests.column.columnValuesLengthsToBeBetween import (
+    ColumnValueLengthsToBeBetween,
+)
 from metadata.generated.schema.tests.column.columnValuesToBeBetween import (
     ColumnValuesToBeBetween,
 )
@@ -364,3 +367,53 @@ def test_column_values_to_be_not_null():
             "We expect `nullCount` to be informed on the profiler for ColumnValuesToBeNotNull."
),
     )
+
+
+def test_column_value_length_to_be_between():
+    """
+    Check the Success, Failed and Aborted results of ColumnValueLengthsToBeBetween
+    """
+    col_profile = ColumnProfile(
+        minLength=4,
+        maxLength=16,
+    )
+
+    res_ok = validate(
+        ColumnValueLengthsToBeBetween(minValue=2, maxValue=20),
+        col_profile=col_profile,
+        execution_date=EXECUTION_DATE,
+    )
+    assert res_ok == TestCaseResult(
+        executionTime=EXECUTION_DATE.timestamp(),
+        testCaseStatus=TestCaseStatus.Success,
+        result="Found minLength=4.0, maxLength=16.0 vs. the expected minLength=2, maxLength=20.",
+    )
+
+    res_ko = validate(
+        ColumnValueLengthsToBeBetween(minValue=10, maxValue=20),
+        col_profile=col_profile,
+        execution_date=EXECUTION_DATE,
+    )
+
+    assert res_ko == TestCaseResult(
+        executionTime=EXECUTION_DATE.timestamp(),
+        testCaseStatus=TestCaseStatus.Failed,
+        result="Found minLength=4.0, maxLength=16.0 vs. the expected minLength=10, maxLength=20.",
+    )
+
+    col_profile_aborted = ColumnProfile(minLength=4)
+
+    res_aborted = validate(
+        ColumnValueLengthsToBeBetween(minValue=2, maxValue=20),
+        col_profile=col_profile_aborted,
+        execution_date=EXECUTION_DATE,
+    )
+
+    assert res_aborted == TestCaseResult(
+        executionTime=EXECUTION_DATE.timestamp(),
+        testCaseStatus=TestCaseStatus.Aborted,
+        result=(
+            "We expect `minLength` & `maxLength` to be informed on the profiler for ColumnValueLengthsToBeBetween"
+            + " but got minLength=4.0, maxLength=None."
+        ),
+    )