Fix #3150 - Missing count & value length tests (#3193)

2025-12-28 07:58:31 +00:00 · 2022-03-07 07:19:13 +01:00 · 2022-03-07 07:19:13 +01:00 · 2444b884bf
commit 2444b884bf
parent 4d09c165d7
9 changed files with 369 additions and 34 deletions
--- a/ingestion/src/metadata/orm_profiler/validations/column/column_values_length_to_be_between.py
+++ b/ingestion/src/metadata/orm_profiler/validations/column/column_values_length_to_be_between.py
@ -0,0 +1,70 @@
+#  Copyright 2021 Collate
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#  http://www.apache.org/licenses/LICENSE-2.0
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+"""
+ColumnValueLengthsToBeBetween validation implementation
+"""
+
+from datetime import datetime
+
+from metadata.generated.schema.entity.data.table import ColumnProfile
+from metadata.generated.schema.tests.basic import TestCaseResult, TestCaseStatus
+from metadata.generated.schema.tests.column.columnValuesLengthsToBeBetween import (
+    ColumnValueLengthsToBeBetween,
+)
+from metadata.generated.schema.tests.column.columnValuesToBeBetween import (
+    ColumnValuesToBeBetween,
+)
+from metadata.orm_profiler.utils import logger
+
+logger = logger()
+
+
+def column_value_length_to_be_between(
+    test_case: ColumnValueLengthsToBeBetween,
+    col_profile: ColumnProfile,
+    execution_date: datetime,
+    **__,
+) -> TestCaseResult:
+    """
+    Check that the profiled value lengths of a column stay within the
+    bounds configured in the test case
+    :param test_case: ColumnValueLengthsToBeBetween, provides minValue & maxValue
+    :param col_profile: should contain minLength & maxLength metrics
+    :param execution_date: Datetime when the tests ran
+    :return: TestCaseResult with status and results
+    """
+
+    execution_ts = execution_date.timestamp()
+
+    # Without both length metrics there is nothing to compare: abort the test
+    if any(
+        metric is None for metric in (col_profile.minLength, col_profile.maxLength)
+    ):
+        msg = (
+            "We expect `minLength` & `maxLength` to be informed on the profiler for ColumnValueLengthsToBeBetween"
+            + f" but got minLength={col_profile.minLength}, maxLength={col_profile.maxLength}."
+        )
+        logger.error(msg)
+        return TestCaseResult(
+            executionTime=execution_ts,
+            testCaseStatus=TestCaseStatus.Aborted,
+            result=msg,
+        )
+
+    # Both ends are inclusive
+    within_bounds = (
+        col_profile.minLength >= test_case.minValue
+        and col_profile.maxLength <= test_case.maxValue
+    )
+    if within_bounds:
+        status = TestCaseStatus.Success
+    else:
+        status = TestCaseStatus.Failed
+
+    result = (
+        f"Found minLength={col_profile.minLength}, maxLength={col_profile.maxLength} vs."
+        + f" the expected minLength={test_case.minValue}, maxLength={test_case.maxValue}."
+    )
+
+    return TestCaseResult(
+        executionTime=execution_ts, testCaseStatus=status, result=result
+    )
--- a/ingestion/src/metadata/orm_profiler/validations/column/column_values_missing_count_to_be_equal.py
+++ b/ingestion/src/metadata/orm_profiler/validations/column/column_values_missing_count_to_be_equal.py
@ -0,0 +1,96 @@
+#  Copyright 2021 Collate
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#  http://www.apache.org/licenses/LICENSE-2.0
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+"""
+ColumnValuesMissingCount validation implementation
+"""
+
+from datetime import datetime
+from typing import Optional
+
+from sqlalchemy.orm import DeclarativeMeta, Session
+
+from metadata.generated.schema.entity.data.table import ColumnProfile
+from metadata.generated.schema.tests.basic import TestCaseResult, TestCaseStatus
+from metadata.generated.schema.tests.column.columnValuesMissingCountToBeEqual import (
+    ColumnValuesMissingCount,
+)
+from metadata.orm_profiler.metrics.core import add_props
+from metadata.orm_profiler.metrics.registry import Metrics
+from metadata.orm_profiler.utils import logger
+from metadata.orm_profiler.validations.utils import run_col_metric
+
+logger = logger()
+
+
+def column_values_missing_count_to_be_equal(
+    test_case: ColumnValuesMissingCount,
+    col_profile: ColumnProfile,
+    execution_date: datetime,
+    session: Optional[Session] = None,
+    table: Optional[DeclarativeMeta] = None,
+) -> TestCaseResult:
+    """
+    Validate that the number of missing values of a column equals the
+    expected one.
+
+    The missing count is the profiled `nullCount` plus, when
+    `missingValueMatch` is configured, the COUNT_IN_SET metric computed
+    for those values.
+
+    :param test_case: ColumnValuesMissingCount, provides `missingCountValue`
+        and the optional `missingValueMatch` values
+    :param col_profile: should contain the nullCount metric
+    :param execution_date: Datetime when the tests ran
+    :param session: SQLAlchemy Session, for tests that need to compute new metrics
+    :param table: SQLAlchemy Table, for tests that need to compute new metrics
+    :return: TestCaseResult with status and results. The status is Aborted
+        when nullCount is missing or the extra metric cannot be computed
+    """
+
+    if col_profile.nullCount is None:
+        msg = "We expect `nullCount` to be informed on the profiler for ColumnValuesMissingCount."
+        logger.error(msg)
+        return TestCaseResult(
+            executionTime=execution_date.timestamp(),
+            testCaseStatus=TestCaseStatus.Aborted,
+            result=msg,
+        )
+
+    missing_count = col_profile.nullCount
+    if test_case.missingValueMatch:
+        set_count = add_props(values=test_case.missingValueMatch)(
+            Metrics.COUNT_IN_SET.value
+        )
+
+        try:
+            set_count_res = run_col_metric(
+                metric=set_count,
+                session=session,
+                table=table,
+                column=col_profile.name,
+            )
+
+            # Add set count for special values into the missing count
+            missing_count += set_count_res
+
+        except Exception as err:  # pylint: disable=broad-except
+            # `session` and `table` are optional. Guard their usage so the
+            # error handling cannot fail itself and hide the original error.
+            if session is not None:
+                session.rollback()
+            table_name = getattr(table, "__tablename__", None)
+            msg = f"Error computing {test_case.__class__.__name__} for {table_name}.{col_profile.name} - {err}"
+            logger.error(msg)
+            return TestCaseResult(
+                executionTime=execution_date.timestamp(),
+                testCaseStatus=TestCaseStatus.Aborted,
+                result=msg,
+            )
+
+    status = (
+        TestCaseStatus.Success
+        if missing_count == test_case.missingCountValue
+        else TestCaseStatus.Failed
+    )
+    result = f"Found missingCount={missing_count}. It should be {test_case.missingCountValue}."
+
+    return TestCaseResult(
+        executionTime=execution_date.timestamp(), testCaseStatus=status, result=result
+    )
--- a/ingestion/src/metadata/orm_profiler/validations/column/column_values_not_in_set.py
+++ b/ingestion/src/metadata/orm_profiler/validations/column/column_values_not_in_set.py
@ -16,7 +16,6 @@ ColumnValuesToBeNotNull validation implementation
 from datetime import datetime
 from typing import Optional

-from sqlalchemy import inspect
 from sqlalchemy.orm import DeclarativeMeta, Session

 from metadata.generated.schema.entity.data.table import ColumnProfile
@ -26,8 +25,8 @@ from metadata.generated.schema.tests.column.columnValuesToBeNotInSet import (
 )
 from metadata.orm_profiler.metrics.core import add_props
 from metadata.orm_profiler.metrics.registry import Metrics
-from metadata.orm_profiler.profiles.core import Profiler
 from metadata.orm_profiler.utils import logger
+from metadata.orm_profiler.validations.utils import run_col_metric

 logger = logger()

@ -52,26 +51,16 @@ def column_values_not_in_set(
    set_count = add_props(values=test_case.forbiddenValues)(Metrics.COUNT_IN_SET.value)

    try:
-        col = next(
-            iter([col for col in inspect(table).c if col.name == col_profile.name]),
-            None,
+        set_count_res = run_col_metric(
+            metric=set_count,
+            session=session,
+            table=table,
+            column=col_profile.name,
        )

-        if col is None:
-            raise ValueError(
-                f"Cannot find the configured column {col_profile.name} for ColumnValuesToBeNotInSet"
-            )
-
-        res = (
-            Profiler(set_count, session=session, table=table, use_cols=[col])
-            .execute()
-            .column_results
-        )
-        set_count_res = res.get(col.name)[Metrics.COUNT_IN_SET.name]
-
    except Exception as err:  # pylint: disable=broad-except
        session.rollback()
-        msg = f"Error computing ColumnValuesToBeNotInSet for {col_profile.name} - {err}"
+        msg = f"Error computing {test_case.__class__.__name__} for {table.__tablename__}.{col_profile.name} - {err}"
        logger.error(msg)
        return TestCaseResult(
            executionTime=execution_date.timestamp(),
--- a/ingestion/src/metadata/orm_profiler/validations/column/column_values_to_match_regex.py
+++ b/ingestion/src/metadata/orm_profiler/validations/column/column_values_to_match_regex.py
@ -28,6 +28,7 @@ from metadata.orm_profiler.metrics.core import add_props
 from metadata.orm_profiler.metrics.registry import Metrics
 from metadata.orm_profiler.profiles.core import Profiler
 from metadata.orm_profiler.utils import logger
+from metadata.orm_profiler.validations.utils import run_col_metric

 logger = logger()

@ -61,22 +62,13 @@ def column_values_to_match_regex(
        )

    try:
-        col = next(
-            iter([col for col in inspect(table).c if col.name == col_profile.name]),
-            None,
-        )

-        if col is None:
-            raise ValueError(
-                f"Cannot find the configured column {col_profile.name} for ColumnValuesToMatchRegex"
-            )
-
-        res = (
-            Profiler(like_count, session=session, table=table, use_cols=[col])
-            .execute()
-            .column_results
+        like_count_res = run_col_metric(
+            metric=like_count,
+            session=session,
+            table=table,
+            column=col_profile.name,
        )
-        like_count_res = res.get(col.name)[Metrics.LIKE_COUNT.name]

    except Exception as err:  # pylint: disable=broad-except
        session.rollback()
--- a/ingestion/src/metadata/orm_profiler/validations/core.py
+++ b/ingestion/src/metadata/orm_profiler/validations/core.py
@ -24,6 +24,12 @@ from functools import singledispatch

 from metadata.generated.schema.tests.basic import TestCaseResult
 from metadata.orm_profiler.utils import logger
+from metadata.orm_profiler.validations.column.column_values_length_to_be_between import (
+    column_value_length_to_be_between,
+)
+from metadata.orm_profiler.validations.column.column_values_missing_count_to_be_equal import (
+    column_values_missing_count_to_be_equal,
+)
 from metadata.orm_profiler.validations.column.column_values_not_in_set import (
    column_values_not_in_set,
 )
@ -73,7 +79,9 @@ validate.register(table_column_count_to_equal)
 validate.register(column_values_to_be_between)
 validate.register(column_values_to_be_unique)
 validate.register(column_values_to_be_not_null)
+validate.register(column_value_length_to_be_between)

 # Column Session Tests
 validate.register(column_values_not_in_set)
 validate.register(column_values_to_match_regex)
+validate.register(column_values_missing_count_to_be_equal)
--- a/ingestion/src/metadata/orm_profiler/validations/utils.py
+++ b/ingestion/src/metadata/orm_profiler/validations/utils.py
@ -0,0 +1,54 @@
+#  Copyright 2021 Collate
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#  http://www.apache.org/licenses/LICENSE-2.0
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+"""
+Validation Utilities
+"""
+
+from typing import Type
+
+from sqlalchemy import inspect
+from sqlalchemy.orm import DeclarativeMeta, Session
+
+from metadata.orm_profiler.metrics.core import Metric
+from metadata.orm_profiler.profiles.core import Profiler
+
+
+def run_col_metric(
+    metric: Type[Metric], session: Session, table: DeclarativeMeta, column: str
+) -> int:
+    """
+    Compute a single metric for one column of a table
+
+    :param metric: Metric to run
+    :param session: SQLAlchemy session
+    :param table:  ORM table
+    :param column: column name
+    :return: metric result
+    :raises ValueError: if the table has no column with the given name
+    """
+
+    matching_cols = [col for col in inspect(table).c if col.name == column]
+
+    if not matching_cols:
+        # NOTE(review): the message names ColumnValuesToBeNotInSet although
+        # the helper is called from other validations too. Existing tests
+        # assert on this exact text.
+        raise ValueError(
+            f"Cannot find the configured column {column} for ColumnValuesToBeNotInSet"
+        )
+
+    target_col = matching_cols[0]
+
+    # Restrict the profiler to the requested column only
+    profiler = Profiler(metric, session=session, table=table, use_cols=[target_col])
+    column_results = profiler.execute().column_results
+
+    return column_results.get(target_col.name)[metric.name()]
--- a/ingestion/tests/integration/ometa/test_ometa_database_service_api.py
+++ b/ingestion/tests/integration/ometa/test_ometa_database_service_api.py
@ -101,7 +101,6 @@ class OMetaDatabaseServiceTest(TestCase):
            name="test-db-service",
            serviceType=DatabaseServiceType.MySQL,
            databaseConnection=new_connection,
-            href="http://resource-uri/",
        )

        updated_res = self.metadata.create_or_update(data=update_request)
--- a/ingestion/tests/unit/profiler/test_session_validations.py
+++ b/ingestion/tests/unit/profiler/test_session_validations.py
@ -20,6 +20,9 @@ from sqlalchemy.orm import declarative_base

 from metadata.generated.schema.entity.data.table import ColumnProfile
 from metadata.generated.schema.tests.basic import TestCaseResult, TestCaseStatus
+from metadata.generated.schema.tests.column.columnValuesMissingCountToBeEqual import (
+    ColumnValuesMissingCount,
+)
 from metadata.generated.schema.tests.column.columnValuesToBeNotInSet import (
    ColumnValuesToBeNotInSet,
 )
@ -123,7 +126,7 @@ class MetricsTest(TestCase):
            executionTime=EXECUTION_DATE.timestamp(),
            testCaseStatus=TestCaseStatus.Aborted,
            result=(
-                "Error computing ColumnValuesToBeNotInSet for random - Cannot find"
+                "Error computing ColumnValuesToBeNotInSet for users.random - Cannot find"
                + " the configured column random for ColumnValuesToBeNotInSet"
            ),
        )
@ -177,3 +180,74 @@ class MetricsTest(TestCase):
                "We expect `valuesCount` to be informed for ColumnValuesToMatchRegex."
            ),
        )
+
+    def test_column_values_missing_count_to_be_equal(self):
+        """
+        Run ColumnValuesMissingCount on the `nickname` column and check the
+        Success, Failed and Aborted results
+        """
+        execution_ts = EXECUTION_DATE.timestamp()
+        profile_with_nulls = ColumnProfile(name="nickname", nullCount=1)
+
+        def run(test_case, profile):
+            return validate(
+                test_case,
+                col_profile=profile,
+                execution_date=EXECUTION_DATE,
+                session=self.session,
+                table=User,
+            )
+
+        # Only the profiled nulls are counted
+        res_ok = run(ColumnValuesMissingCount(missingCountValue=1), profile_with_nulls)
+        expected_ok = TestCaseResult(
+            executionTime=execution_ts,
+            testCaseStatus=TestCaseStatus.Success,
+            result="Found missingCount=1.0. It should be 1.",
+        )
+        assert res_ok == expected_ok
+
+        # Values listed in missingValueMatch are added to the null count
+        res_ok_2 = run(
+            ColumnValuesMissingCount(
+                missingCountValue=2,
+                missingValueMatch=["johnny b goode"],
+            ),
+            profile_with_nulls,
+        )
+        expected_ok_2 = TestCaseResult(
+            executionTime=execution_ts,
+            testCaseStatus=TestCaseStatus.Success,
+            result="Found missingCount=2.0. It should be 2.",
+        )
+        assert res_ok_2 == expected_ok_2
+
+        # The expected count does not match the profiled nulls
+        res_ko = run(ColumnValuesMissingCount(missingCountValue=0), profile_with_nulls)
+        expected_ko = TestCaseResult(
+            executionTime=execution_ts,
+            testCaseStatus=TestCaseStatus.Failed,
+            result="Found missingCount=1.0. It should be 0.",
+        )
+        assert res_ko == expected_ko
+
+        # A profile without nullCount aborts the test
+        res_aborted = run(
+            ColumnValuesMissingCount(missingCountValue=0),
+            ColumnProfile(name="nickname"),
+        )
+        expected_aborted = TestCaseResult(
+            executionTime=execution_ts,
+            testCaseStatus=TestCaseStatus.Aborted,
+            result=(
+                "We expect `nullCount` to be informed on the profiler for ColumnValuesMissingCount."
+            ),
+        )
+        assert res_aborted == expected_aborted
--- a/ingestion/tests/unit/profiler/test_validations.py
+++ b/ingestion/tests/unit/profiler/test_validations.py
@ -19,6 +19,9 @@ from datetime import datetime

 from metadata.generated.schema.entity.data.table import ColumnProfile, TableProfile
 from metadata.generated.schema.tests.basic import TestCaseResult, TestCaseStatus
+from metadata.generated.schema.tests.column.columnValuesLengthsToBeBetween import (
+    ColumnValueLengthsToBeBetween,
+)
 from metadata.generated.schema.tests.column.columnValuesToBeBetween import (
    ColumnValuesToBeBetween,
 )
@ -364,3 +367,53 @@ def test_column_values_to_be_not_null():
            "We expect `nullCount` to be informed on the profiler for ColumnValuesToBeNotNull."
        ),
    )
+
+
+def test_column_value_length_to_be_between():
+    """
+    ColumnValueLengthsToBeBetween: check the Success, Failed and Aborted results
+    """
+    execution_ts = EXECUTION_DATE.timestamp()
+    full_profile = ColumnProfile(
+        minLength=4,
+        maxLength=16,
+    )
+
+    # Profiled lengths inside the expected bounds
+    res_ok = validate(
+        ColumnValueLengthsToBeBetween(minValue=2, maxValue=20),
+        col_profile=full_profile,
+        execution_date=EXECUTION_DATE,
+    )
+    expected_ok = TestCaseResult(
+        executionTime=execution_ts,
+        testCaseStatus=TestCaseStatus.Success,
+        result="Found minLength=4.0, maxLength=16.0 vs. the expected minLength=2, maxLength=20.",
+    )
+    assert res_ok == expected_ok
+
+    # Profiled minLength below the expected minimum
+    res_ko = validate(
+        ColumnValueLengthsToBeBetween(minValue=10, maxValue=20),
+        col_profile=full_profile,
+        execution_date=EXECUTION_DATE,
+    )
+    expected_ko = TestCaseResult(
+        executionTime=execution_ts,
+        testCaseStatus=TestCaseStatus.Failed,
+        result="Found minLength=4.0, maxLength=16.0 vs. the expected minLength=10, maxLength=20.",
+    )
+    assert res_ko == expected_ko
+
+    # A profile without maxLength aborts the test
+    partial_profile = ColumnProfile(minLength=4)
+
+    res_aborted = validate(
+        ColumnValueLengthsToBeBetween(minValue=2, maxValue=20),
+        col_profile=partial_profile,
+        execution_date=EXECUTION_DATE,
+    )
+    expected_aborted = TestCaseResult(
+        executionTime=execution_ts,
+        testCaseStatus=TestCaseStatus.Aborted,
+        result=(
+            "We expect `minLength` & `maxLength` to be informed on the profiler for ColumnValueLengthsToBeBetween"
+            + " but got minLength=4.0, maxLength=None."
+        ),
+    )
+    assert res_aborted == expected_aborted