Fix #3150 - Missing count & value length tests (#3193)

Fix #3150 - Missing count & value length tests (#3193)
This commit is contained in:
Pere Miquel Brull 2022-03-07 07:19:13 +01:00 committed by GitHub
parent 4d09c165d7
commit 2444b884bf
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 369 additions and 34 deletions

View File

@ -0,0 +1,70 @@
# Copyright 2021 Collate
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
ColumnValueLengthsToBeBetween validation implementation
"""
from datetime import datetime
from metadata.generated.schema.entity.data.table import ColumnProfile
from metadata.generated.schema.tests.basic import TestCaseResult, TestCaseStatus
from metadata.generated.schema.tests.column.columnValuesLengthsToBeBetween import (
ColumnValueLengthsToBeBetween,
)
from metadata.generated.schema.tests.column.columnValuesToBeBetween import (
ColumnValuesToBeBetween,
)
from metadata.orm_profiler.utils import logger
# Module-level logger instance. Note that this rebinds the name `logger`, imported
# above as a callable from metadata.orm_profiler.utils, to the object it returns.
logger = logger()
def column_value_length_to_be_between(
    test_case: ColumnValueLengthsToBeBetween,
    col_profile: ColumnProfile,
    execution_date: datetime,
    **__,
) -> TestCaseResult:
    """
    Check that the profiled value lengths of a column fit the configured bounds.

    The test succeeds when the profiled `minLength` is at least `minValue`
    and the profiled `maxLength` is at most `maxValue`. If either length
    metric is missing from the profile the test is reported as Aborted.

    :param test_case: ColumnValueLengthsToBeBetween holding minValue & maxValue
    :param col_profile: should contain minLength & maxLength metrics
    :param execution_date: Datetime when the tests ran
    :param __: extra keyword arguments are accepted and ignored
    :return: TestCaseResult with status and results
    """
    observed_min = col_profile.minLength
    observed_max = col_profile.maxLength

    metrics_available = observed_min is not None and observed_max is not None
    if not metrics_available:
        # Without both length metrics there is nothing to compare against
        msg = (
            "We expect `minLength` & `maxLength` to be informed on the profiler for ColumnValueLengthsToBeBetween"
            f" but got minLength={observed_min}, maxLength={observed_max}."
        )
        logger.error(msg)
        return TestCaseResult(
            executionTime=execution_date.timestamp(),
            testCaseStatus=TestCaseStatus.Aborted,
            result=msg,
        )

    if observed_min >= test_case.minValue and observed_max <= test_case.maxValue:
        status = TestCaseStatus.Success
    else:
        status = TestCaseStatus.Failed

    result = (
        f"Found minLength={observed_min}, maxLength={observed_max} vs."
        f" the expected minLength={test_case.minValue}, maxLength={test_case.maxValue}."
    )

    return TestCaseResult(
        executionTime=execution_date.timestamp(),
        testCaseStatus=status,
        result=result,
    )

View File

@ -0,0 +1,96 @@
# Copyright 2021 Collate
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
ColumnValuesMissingCount validation implementation
"""
from datetime import datetime
from typing import Optional
from sqlalchemy.orm import DeclarativeMeta, Session
from metadata.generated.schema.entity.data.table import ColumnProfile
from metadata.generated.schema.tests.basic import TestCaseResult, TestCaseStatus
from metadata.generated.schema.tests.column.columnValuesMissingCountToBeEqual import (
ColumnValuesMissingCount,
)
from metadata.orm_profiler.metrics.core import add_props
from metadata.orm_profiler.metrics.registry import Metrics
from metadata.orm_profiler.utils import logger
from metadata.orm_profiler.validations.utils import run_col_metric
# Module-level logger instance. Note that this rebinds the name `logger`, imported
# above as a callable from metadata.orm_profiler.utils, to the object it returns.
logger = logger()
def column_values_missing_count_to_be_equal(
    test_case: ColumnValuesMissingCount,
    col_profile: ColumnProfile,
    execution_date: datetime,
    session: Optional[Session] = None,
    table: Optional[DeclarativeMeta] = None,
) -> TestCaseResult:
    """
    Validate that the number of missing values of a column equals the expected one.

    The missing count starts from the profiled `nullCount`. When the test case
    configures `missingValueMatch`, the COUNT_IN_SET metric is run for those
    values through `run_col_metric` and added on top.

    :param test_case: ColumnValuesMissingCount holding `missingCountValue` and,
        optionally, `missingValueMatch`
    :param col_profile: should contain the nullCount metric
    :param execution_date: Datetime when the tests ran
    :param session: SQLAlchemy Session, used to compute COUNT_IN_SET when
        `missingValueMatch` is configured
    :param table: SQLAlchemy Table, used to compute COUNT_IN_SET when
        `missingValueMatch` is configured
    :return: TestCaseResult with status and results. The status is Aborted when
        `nullCount` is missing or when computing the extra metric fails.
    """
    if col_profile.nullCount is None:
        msg = "We expect `nullCount` to be informed on the profiler for ColumnValuesMissingCount."
        logger.error(msg)
        return TestCaseResult(
            executionTime=execution_date.timestamp(),
            testCaseStatus=TestCaseStatus.Aborted,
            result=msg,
        )

    missing_count = col_profile.nullCount

    if test_case.missingValueMatch:
        set_count = add_props(values=test_case.missingValueMatch)(
            Metrics.COUNT_IN_SET.value
        )

        try:
            set_count_res = run_col_metric(
                metric=set_count,
                session=session,
                table=table,
                column=col_profile.name,
            )

            # Add set count for special values into the missing count
            missing_count += set_count_res

        except Exception as err:  # pylint: disable=broad-except
            # `session` and `table` both default to None. The handler must not
            # raise on its own, otherwise the original error is lost and the
            # caller gets an AttributeError instead of an Aborted result.
            if session is not None:
                session.rollback()
            table_name = getattr(table, "__tablename__", None)
            msg = f"Error computing {test_case.__class__.__name__} for {table_name}.{col_profile.name} - {err}"
            logger.error(msg)
            return TestCaseResult(
                executionTime=execution_date.timestamp(),
                testCaseStatus=TestCaseStatus.Aborted,
                result=msg,
            )

    status = (
        TestCaseStatus.Success
        if missing_count == test_case.missingCountValue
        else TestCaseStatus.Failed
    )
    result = f"Found missingCount={missing_count}. It should be {test_case.missingCountValue}."

    return TestCaseResult(
        executionTime=execution_date.timestamp(), testCaseStatus=status, result=result
    )

View File

@ -16,7 +16,6 @@ ColumnValuesToBeNotNull validation implementation
from datetime import datetime
from typing import Optional
from sqlalchemy import inspect
from sqlalchemy.orm import DeclarativeMeta, Session
from metadata.generated.schema.entity.data.table import ColumnProfile
@ -26,8 +25,8 @@ from metadata.generated.schema.tests.column.columnValuesToBeNotInSet import (
)
from metadata.orm_profiler.metrics.core import add_props
from metadata.orm_profiler.metrics.registry import Metrics
from metadata.orm_profiler.profiles.core import Profiler
from metadata.orm_profiler.utils import logger
from metadata.orm_profiler.validations.utils import run_col_metric
logger = logger()
@ -52,26 +51,16 @@ def column_values_not_in_set(
set_count = add_props(values=test_case.forbiddenValues)(Metrics.COUNT_IN_SET.value)
try:
col = next(
iter([col for col in inspect(table).c if col.name == col_profile.name]),
None,
set_count_res = run_col_metric(
metric=set_count,
session=session,
table=table,
column=col_profile.name,
)
if col is None:
raise ValueError(
f"Cannot find the configured column {col_profile.name} for ColumnValuesToBeNotInSet"
)
res = (
Profiler(set_count, session=session, table=table, use_cols=[col])
.execute()
.column_results
)
set_count_res = res.get(col.name)[Metrics.COUNT_IN_SET.name]
except Exception as err: # pylint: disable=broad-except
session.rollback()
msg = f"Error computing ColumnValuesToBeNotInSet for {col_profile.name} - {err}"
msg = f"Error computing {test_case.__class__.__name__} for {table.__tablename__}.{col_profile.name} - {err}"
logger.error(msg)
return TestCaseResult(
executionTime=execution_date.timestamp(),

View File

@ -28,6 +28,7 @@ from metadata.orm_profiler.metrics.core import add_props
from metadata.orm_profiler.metrics.registry import Metrics
from metadata.orm_profiler.profiles.core import Profiler
from metadata.orm_profiler.utils import logger
from metadata.orm_profiler.validations.utils import run_col_metric
logger = logger()
@ -61,22 +62,13 @@ def column_values_to_match_regex(
)
try:
col = next(
iter([col for col in inspect(table).c if col.name == col_profile.name]),
None,
)
if col is None:
raise ValueError(
f"Cannot find the configured column {col_profile.name} for ColumnValuesToMatchRegex"
)
res = (
Profiler(like_count, session=session, table=table, use_cols=[col])
.execute()
.column_results
like_count_res = run_col_metric(
metric=like_count,
session=session,
table=table,
column=col_profile.name,
)
like_count_res = res.get(col.name)[Metrics.LIKE_COUNT.name]
except Exception as err: # pylint: disable=broad-except
session.rollback()

View File

@ -24,6 +24,12 @@ from functools import singledispatch
from metadata.generated.schema.tests.basic import TestCaseResult
from metadata.orm_profiler.utils import logger
from metadata.orm_profiler.validations.column.column_values_length_to_be_between import (
column_value_length_to_be_between,
)
from metadata.orm_profiler.validations.column.column_values_missing_count_to_be_equal import (
column_values_missing_count_to_be_equal,
)
from metadata.orm_profiler.validations.column.column_values_not_in_set import (
column_values_not_in_set,
)
@ -73,7 +79,9 @@ validate.register(table_column_count_to_equal)
validate.register(column_values_to_be_between)
validate.register(column_values_to_be_unique)
validate.register(column_values_to_be_not_null)
validate.register(column_value_length_to_be_between)
# Column Session Tests
validate.register(column_values_not_in_set)
validate.register(column_values_to_match_regex)
validate.register(column_values_missing_count_to_be_equal)

View File

@ -0,0 +1,54 @@
# Copyright 2021 Collate
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Validation Utilities
"""
from typing import Type
from sqlalchemy import inspect
from sqlalchemy.orm import DeclarativeMeta, Session
from metadata.orm_profiler.metrics.core import Metric
from metadata.orm_profiler.profiles.core import Profiler
def run_col_metric(
    metric: Type[Metric], session: Session, table: DeclarativeMeta, column: str
) -> int:
    """
    Compute a single metric for one column of an ORM table.

    The column is looked up by name among the inspected columns of `table`.
    A Profiler restricted to that column is then executed and the value of
    `metric` is read from its column results.

    NOTE: the error message mentions ColumnValuesToBeNotInSet regardless of
    which validation calls this helper.

    :param metric: Metric to run
    :param session: SQLAlchemy session
    :param table: ORM table
    :param column: column name
    :return: metric result
    :raises ValueError: if `table` has no column named `column`
    """
    matching_cols = (
        candidate for candidate in inspect(table).c if candidate.name == column
    )
    target_col = next(matching_cols, None)

    if target_col is None:
        raise ValueError(
            f"Cannot find the configured column {column} for ColumnValuesToBeNotInSet"
        )

    profiler = Profiler(metric, session=session, table=table, use_cols=[target_col])
    column_results = profiler.execute().column_results

    return column_results.get(target_col.name)[metric.name()]

View File

@ -101,7 +101,6 @@ class OMetaDatabaseServiceTest(TestCase):
name="test-db-service",
serviceType=DatabaseServiceType.MySQL,
databaseConnection=new_connection,
href="http://resource-uri/",
)
updated_res = self.metadata.create_or_update(data=update_request)

View File

@ -20,6 +20,9 @@ from sqlalchemy.orm import declarative_base
from metadata.generated.schema.entity.data.table import ColumnProfile
from metadata.generated.schema.tests.basic import TestCaseResult, TestCaseStatus
from metadata.generated.schema.tests.column.columnValuesMissingCountToBeEqual import (
ColumnValuesMissingCount,
)
from metadata.generated.schema.tests.column.columnValuesToBeNotInSet import (
ColumnValuesToBeNotInSet,
)
@ -123,7 +126,7 @@ class MetricsTest(TestCase):
executionTime=EXECUTION_DATE.timestamp(),
testCaseStatus=TestCaseStatus.Aborted,
result=(
"Error computing ColumnValuesToBeNotInSet for random - Cannot find"
"Error computing ColumnValuesToBeNotInSet for users.random - Cannot find"
+ " the configured column random for ColumnValuesToBeNotInSet"
),
)
@ -177,3 +180,74 @@ class MetricsTest(TestCase):
"We expect `valuesCount` to be informed for ColumnValuesToMatchRegex."
),
)
def test_column_values_missing_count_to_be_equal(self):
    """
    Run ColumnValuesMissingCount on the `nickname` column of User and check
    the Success, Failed and Aborted outcomes, including the case where
    `missingValueMatch` adds matching values on top of the null count.
    """
    profile_with_nulls = ColumnProfile(name="nickname", nullCount=1)
    run_kwargs = {
        "execution_date": EXECUTION_DATE,
        "session": self.session,
        "table": User,
    }
    executed_at = EXECUTION_DATE.timestamp()

    # Null count alone matches the expected missing count
    only_nulls = validate(
        ColumnValuesMissingCount(missingCountValue=1),
        col_profile=profile_with_nulls,
        **run_kwargs,
    )
    assert only_nulls == TestCaseResult(
        executionTime=executed_at,
        testCaseStatus=TestCaseStatus.Success,
        result="Found missingCount=1.0. It should be 1.",
    )

    # Values listed in missingValueMatch are counted as missing as well
    nulls_plus_matches = validate(
        ColumnValuesMissingCount(
            missingCountValue=2,
            missingValueMatch=["johnny b goode"],
        ),
        col_profile=profile_with_nulls,
        **run_kwargs,
    )
    assert nulls_plus_matches == TestCaseResult(
        executionTime=executed_at,
        testCaseStatus=TestCaseStatus.Success,
        result="Found missingCount=2.0. It should be 2.",
    )

    # Expected count differs from what the profile reports
    mismatch = validate(
        ColumnValuesMissingCount(missingCountValue=0),
        col_profile=profile_with_nulls,
        **run_kwargs,
    )
    assert mismatch == TestCaseResult(
        executionTime=executed_at,
        testCaseStatus=TestCaseStatus.Failed,
        result="Found missingCount=1.0. It should be 0.",
    )

    # A profile without nullCount cannot be validated
    no_null_count = validate(
        ColumnValuesMissingCount(missingCountValue=0),
        col_profile=ColumnProfile(name="nickname"),
        **run_kwargs,
    )
    assert no_null_count == TestCaseResult(
        executionTime=executed_at,
        testCaseStatus=TestCaseStatus.Aborted,
        result="We expect `nullCount` to be informed on the profiler for ColumnValuesMissingCount.",
    )

View File

@ -19,6 +19,9 @@ from datetime import datetime
from metadata.generated.schema.entity.data.table import ColumnProfile, TableProfile
from metadata.generated.schema.tests.basic import TestCaseResult, TestCaseStatus
from metadata.generated.schema.tests.column.columnValuesLengthsToBeBetween import (
ColumnValueLengthsToBeBetween,
)
from metadata.generated.schema.tests.column.columnValuesToBeBetween import (
ColumnValuesToBeBetween,
)
@ -364,3 +367,53 @@ def test_column_values_to_be_not_null():
"We expect `nullCount` to be informed on the profiler for ColumnValuesToBeNotNull."
),
)
def test_column_value_length_to_be_between():
    """
    Check ColumnValueLengthsToBeBetween for the Success, Failed and Aborted
    outcomes using in-memory column profiles.
    """
    executed_at = EXECUTION_DATE.timestamp()
    full_profile = ColumnProfile(minLength=4, maxLength=16)

    # Profiled lengths [4, 16] fit inside the expected [2, 20]
    within_bounds = validate(
        ColumnValueLengthsToBeBetween(minValue=2, maxValue=20),
        col_profile=full_profile,
        execution_date=EXECUTION_DATE,
    )
    assert within_bounds == TestCaseResult(
        executionTime=executed_at,
        testCaseStatus=TestCaseStatus.Success,
        result="Found minLength=4.0, maxLength=16.0 vs. the expected minLength=2, maxLength=20.",
    )

    # Profiled minLength 4 is below the expected minimum of 10
    out_of_bounds = validate(
        ColumnValueLengthsToBeBetween(minValue=10, maxValue=20),
        col_profile=full_profile,
        execution_date=EXECUTION_DATE,
    )
    assert out_of_bounds == TestCaseResult(
        executionTime=executed_at,
        testCaseStatus=TestCaseStatus.Failed,
        result="Found minLength=4.0, maxLength=16.0 vs. the expected minLength=10, maxLength=20.",
    )

    # maxLength is not informed, so the validation cannot run
    partial_profile = ColumnProfile(minLength=4)
    missing_metric = validate(
        ColumnValueLengthsToBeBetween(minValue=2, maxValue=20),
        col_profile=partial_profile,
        execution_date=EXECUTION_DATE,
    )
    assert missing_metric == TestCaseResult(
        executionTime=executed_at,
        testCaseStatus=TestCaseStatus.Aborted,
        result=(
            "We expect `minLength` & `maxLength` to be informed on the profiler for ColumnValueLengthsToBeBetween"
            " but got minLength=4.0, maxLength=None."
        ),
    )