mirror of
https://github.com/open-metadata/OpenMetadata.git
synced 2025-12-04 11:33:07 +00:00
parent
4d09c165d7
commit
2444b884bf
@ -0,0 +1,70 @@
|
||||
# Copyright 2021 Collate
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""
|
||||
ColumnValueLengthsToBeBetween validation implementation
|
||||
"""
|
||||
|
||||
from datetime import datetime
|
||||
|
||||
from metadata.generated.schema.entity.data.table import ColumnProfile
|
||||
from metadata.generated.schema.tests.basic import TestCaseResult, TestCaseStatus
|
||||
from metadata.generated.schema.tests.column.columnValuesLengthsToBeBetween import (
|
||||
ColumnValueLengthsToBeBetween,
|
||||
)
|
||||
from metadata.generated.schema.tests.column.columnValuesToBeBetween import (
|
||||
ColumnValuesToBeBetween,
|
||||
)
|
||||
from metadata.orm_profiler.utils import logger
|
||||
|
||||
logger = logger()
|
||||
|
||||
|
||||
def column_value_length_to_be_between(
    test_case: ColumnValueLengthsToBeBetween,
    col_profile: ColumnProfile,
    execution_date: datetime,
    **__,
) -> TestCaseResult:
    """
    Check that the profiled value lengths of a column stay within the
    bounds configured in the test case

    :param test_case: ColumnValueLengthsToBeBetween, read for `minValue` and `maxValue`
    :param col_profile: profiler results, read for `minLength` and `maxLength`
    :param execution_date: Datetime when the tests ran
    :return: TestCaseResult with status and results. Aborted when any of
        the two length metrics is missing from the profile
    """
    found_min = col_profile.minLength
    found_max = col_profile.maxLength
    executed_at = execution_date.timestamp()

    # Both metrics are required: without them there is nothing to compare
    if found_min is None or found_max is None:
        msg = (
            "We expect `minLength` & `maxLength` to be informed on the profiler for ColumnValueLengthsToBeBetween"
            + f" but got minLength={found_min}, maxLength={found_max}."
        )
        logger.error(msg)
        return TestCaseResult(
            executionTime=executed_at,
            testCaseStatus=TestCaseStatus.Aborted,
            result=msg,
        )

    # The bounds are inclusive on both ends
    within_bounds = (
        found_min >= test_case.minValue and found_max <= test_case.maxValue
    )
    if within_bounds:
        status = TestCaseStatus.Success
    else:
        status = TestCaseStatus.Failed

    result = (
        f"Found minLength={found_min}, maxLength={found_max} vs."
        + f" the expected minLength={test_case.minValue}, maxLength={test_case.maxValue}."
    )

    return TestCaseResult(
        executionTime=executed_at,
        testCaseStatus=status,
        result=result,
    )
|
||||
@ -0,0 +1,96 @@
|
||||
# Copyright 2021 Collate
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""
|
||||
ColumnValuesMissingCount validation implementation
|
||||
"""
|
||||
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
|
||||
from sqlalchemy.orm import DeclarativeMeta, Session
|
||||
|
||||
from metadata.generated.schema.entity.data.table import ColumnProfile
|
||||
from metadata.generated.schema.tests.basic import TestCaseResult, TestCaseStatus
|
||||
from metadata.generated.schema.tests.column.columnValuesMissingCountToBeEqual import (
|
||||
ColumnValuesMissingCount,
|
||||
)
|
||||
from metadata.orm_profiler.metrics.core import add_props
|
||||
from metadata.orm_profiler.metrics.registry import Metrics
|
||||
from metadata.orm_profiler.utils import logger
|
||||
from metadata.orm_profiler.validations.utils import run_col_metric
|
||||
|
||||
logger = logger()
|
||||
|
||||
|
||||
def column_values_missing_count_to_be_equal(
    test_case: ColumnValuesMissingCount,
    col_profile: ColumnProfile,
    execution_date: datetime,
    session: Optional[Session] = None,
    table: Optional[DeclarativeMeta] = None,
) -> TestCaseResult:
    """
    Validate that the number of missing values of a column equals the
    expected one.

    The missing count starts from the profiled `nullCount`. When the test
    case defines `missingValueMatch`, the COUNT_IN_SET metric is run for
    those values and its result is added on top of the null count.

    :param test_case: ColumnValuesMissingCount, read for `missingCountValue`
        and the optional `missingValueMatch`
    :param col_profile: should contain the `nullCount` metric and the column name
    :param execution_date: Datetime when the tests ran
    :param session: SQLAlchemy Session, only used when `missingValueMatch` is set
    :param table: SQLAlchemy Table, only used when `missingValueMatch` is set
    :return: TestCaseResult with status and results. Aborted when `nullCount`
        is missing or the extra metric cannot be computed
    """

    if col_profile.nullCount is None:
        msg = "We expect `nullCount` to be informed on the profiler for ColumnValuesMissingCount."
        logger.error(msg)
        return TestCaseResult(
            executionTime=execution_date.timestamp(),
            testCaseStatus=TestCaseStatus.Aborted,
            result=msg,
        )

    missing_count = col_profile.nullCount
    if test_case.missingValueMatch:
        set_count = add_props(values=test_case.missingValueMatch)(
            Metrics.COUNT_IN_SET.value
        )

        try:
            set_count_res = run_col_metric(
                metric=set_count,
                session=session,
                table=table,
                column=col_profile.name,
            )

            # Add set count for special values into the missing count
            missing_count += set_count_res

        except Exception as err:  # pylint: disable=broad-except
            # `session` and `table` default to None. Guard both so that the
            # handler itself cannot raise and hide the original error: the
            # caller must always get an Aborted result from this branch.
            if session is not None:
                session.rollback()
            table_name = getattr(table, "__tablename__", None)
            msg = f"Error computing {test_case.__class__.__name__} for {table_name}.{col_profile.name} - {err}"
            logger.error(msg)
            return TestCaseResult(
                executionTime=execution_date.timestamp(),
                testCaseStatus=TestCaseStatus.Aborted,
                result=msg,
            )

    status = (
        TestCaseStatus.Success
        if missing_count == test_case.missingCountValue
        else TestCaseStatus.Failed
    )
    result = f"Found missingCount={missing_count}. It should be {test_case.missingCountValue}."

    return TestCaseResult(
        executionTime=execution_date.timestamp(), testCaseStatus=status, result=result
    )
|
||||
@ -16,7 +16,6 @@ ColumnValuesToBeNotNull validation implementation
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
|
||||
from sqlalchemy import inspect
|
||||
from sqlalchemy.orm import DeclarativeMeta, Session
|
||||
|
||||
from metadata.generated.schema.entity.data.table import ColumnProfile
|
||||
@ -26,8 +25,8 @@ from metadata.generated.schema.tests.column.columnValuesToBeNotInSet import (
|
||||
)
|
||||
from metadata.orm_profiler.metrics.core import add_props
|
||||
from metadata.orm_profiler.metrics.registry import Metrics
|
||||
from metadata.orm_profiler.profiles.core import Profiler
|
||||
from metadata.orm_profiler.utils import logger
|
||||
from metadata.orm_profiler.validations.utils import run_col_metric
|
||||
|
||||
logger = logger()
|
||||
|
||||
@ -52,26 +51,16 @@ def column_values_not_in_set(
|
||||
set_count = add_props(values=test_case.forbiddenValues)(Metrics.COUNT_IN_SET.value)
|
||||
|
||||
try:
|
||||
col = next(
|
||||
iter([col for col in inspect(table).c if col.name == col_profile.name]),
|
||||
None,
|
||||
set_count_res = run_col_metric(
|
||||
metric=set_count,
|
||||
session=session,
|
||||
table=table,
|
||||
column=col_profile.name,
|
||||
)
|
||||
|
||||
if col is None:
|
||||
raise ValueError(
|
||||
f"Cannot find the configured column {col_profile.name} for ColumnValuesToBeNotInSet"
|
||||
)
|
||||
|
||||
res = (
|
||||
Profiler(set_count, session=session, table=table, use_cols=[col])
|
||||
.execute()
|
||||
.column_results
|
||||
)
|
||||
set_count_res = res.get(col.name)[Metrics.COUNT_IN_SET.name]
|
||||
|
||||
except Exception as err: # pylint: disable=broad-except
|
||||
session.rollback()
|
||||
msg = f"Error computing ColumnValuesToBeNotInSet for {col_profile.name} - {err}"
|
||||
msg = f"Error computing {test_case.__class__.__name__} for {table.__tablename__}.{col_profile.name} - {err}"
|
||||
logger.error(msg)
|
||||
return TestCaseResult(
|
||||
executionTime=execution_date.timestamp(),
|
||||
|
||||
@ -28,6 +28,7 @@ from metadata.orm_profiler.metrics.core import add_props
|
||||
from metadata.orm_profiler.metrics.registry import Metrics
|
||||
from metadata.orm_profiler.profiles.core import Profiler
|
||||
from metadata.orm_profiler.utils import logger
|
||||
from metadata.orm_profiler.validations.utils import run_col_metric
|
||||
|
||||
logger = logger()
|
||||
|
||||
@ -61,22 +62,13 @@ def column_values_to_match_regex(
|
||||
)
|
||||
|
||||
try:
|
||||
col = next(
|
||||
iter([col for col in inspect(table).c if col.name == col_profile.name]),
|
||||
None,
|
||||
)
|
||||
|
||||
if col is None:
|
||||
raise ValueError(
|
||||
f"Cannot find the configured column {col_profile.name} for ColumnValuesToMatchRegex"
|
||||
)
|
||||
|
||||
res = (
|
||||
Profiler(like_count, session=session, table=table, use_cols=[col])
|
||||
.execute()
|
||||
.column_results
|
||||
like_count_res = run_col_metric(
|
||||
metric=like_count,
|
||||
session=session,
|
||||
table=table,
|
||||
column=col_profile.name,
|
||||
)
|
||||
like_count_res = res.get(col.name)[Metrics.LIKE_COUNT.name]
|
||||
|
||||
except Exception as err: # pylint: disable=broad-except
|
||||
session.rollback()
|
||||
|
||||
@ -24,6 +24,12 @@ from functools import singledispatch
|
||||
|
||||
from metadata.generated.schema.tests.basic import TestCaseResult
|
||||
from metadata.orm_profiler.utils import logger
|
||||
from metadata.orm_profiler.validations.column.column_values_length_to_be_between import (
|
||||
column_value_length_to_be_between,
|
||||
)
|
||||
from metadata.orm_profiler.validations.column.column_values_missing_count_to_be_equal import (
|
||||
column_values_missing_count_to_be_equal,
|
||||
)
|
||||
from metadata.orm_profiler.validations.column.column_values_not_in_set import (
|
||||
column_values_not_in_set,
|
||||
)
|
||||
@ -73,7 +79,9 @@ validate.register(table_column_count_to_equal)
|
||||
validate.register(column_values_to_be_between)
|
||||
validate.register(column_values_to_be_unique)
|
||||
validate.register(column_values_to_be_not_null)
|
||||
validate.register(column_value_length_to_be_between)
|
||||
|
||||
# Column Session Tests
|
||||
validate.register(column_values_not_in_set)
|
||||
validate.register(column_values_to_match_regex)
|
||||
validate.register(column_values_missing_count_to_be_equal)
|
||||
|
||||
54
ingestion/src/metadata/orm_profiler/validations/utils.py
Normal file
54
ingestion/src/metadata/orm_profiler/validations/utils.py
Normal file
@ -0,0 +1,54 @@
|
||||
# Copyright 2021 Collate
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""
|
||||
Validation Utilities
|
||||
"""
|
||||
|
||||
from typing import Type
|
||||
|
||||
from sqlalchemy import inspect
|
||||
from sqlalchemy.orm import DeclarativeMeta, Session
|
||||
|
||||
from metadata.orm_profiler.metrics.core import Metric
|
||||
from metadata.orm_profiler.profiles.core import Profiler
|
||||
|
||||
|
||||
def run_col_metric(
    metric: Type[Metric], session: Session, table: DeclarativeMeta, column: str
) -> int:
    """
    Compute a single metric for one column of an ORM table

    :param metric: Metric to run
    :param session: SQLAlchemy session handed over to the Profiler
    :param table: ORM table that should own the column
    :param column: name of the column to compute the metric on
    :return: metric result for the column
    :raises ValueError: if the table has no column named `column`
    """

    # Resolve the column object from its name, keeping the first match
    matched_col = None
    for candidate in inspect(table).c:
        if candidate.name == column:
            matched_col = candidate
            break

    if matched_col is None:
        raise ValueError(
            f"Cannot find the configured column {column} for ColumnValuesToBeNotInSet"
        )

    # Profile only the resolved column with the requested metric
    profiler = Profiler(metric, session=session, table=table, use_cols=[matched_col])
    column_results = profiler.execute().column_results

    # Results are looked up by column name first, then by metric name
    return column_results.get(matched_col.name)[metric.name()]
|
||||
@ -101,7 +101,6 @@ class OMetaDatabaseServiceTest(TestCase):
|
||||
name="test-db-service",
|
||||
serviceType=DatabaseServiceType.MySQL,
|
||||
databaseConnection=new_connection,
|
||||
href="http://resource-uri/",
|
||||
)
|
||||
|
||||
updated_res = self.metadata.create_or_update(data=update_request)
|
||||
|
||||
@ -20,6 +20,9 @@ from sqlalchemy.orm import declarative_base
|
||||
|
||||
from metadata.generated.schema.entity.data.table import ColumnProfile
|
||||
from metadata.generated.schema.tests.basic import TestCaseResult, TestCaseStatus
|
||||
from metadata.generated.schema.tests.column.columnValuesMissingCountToBeEqual import (
|
||||
ColumnValuesMissingCount,
|
||||
)
|
||||
from metadata.generated.schema.tests.column.columnValuesToBeNotInSet import (
|
||||
ColumnValuesToBeNotInSet,
|
||||
)
|
||||
@ -123,7 +126,7 @@ class MetricsTest(TestCase):
|
||||
executionTime=EXECUTION_DATE.timestamp(),
|
||||
testCaseStatus=TestCaseStatus.Aborted,
|
||||
result=(
|
||||
"Error computing ColumnValuesToBeNotInSet for random - Cannot find"
|
||||
"Error computing ColumnValuesToBeNotInSet for users.random - Cannot find"
|
||||
+ " the configured column random for ColumnValuesToBeNotInSet"
|
||||
),
|
||||
)
|
||||
@ -177,3 +180,74 @@ class MetricsTest(TestCase):
|
||||
"We expect `valuesCount` to be informed for ColumnValuesToMatchRegex."
|
||||
),
|
||||
)
|
||||
|
||||
def test_column_values_missing_count_to_be_equal(self):
    """
    ColumnValuesMissingCount: the metric runs and the Success, Failed and
    Aborted outcomes are reported with the expected messages
    """
    nickname_profile = ColumnProfile(name="nickname", nullCount=1)
    executed_at = EXECUTION_DATE.timestamp()

    # Arguments shared by every validation call below
    common_args = {
        "execution_date": EXECUTION_DATE,
        "session": self.session,
        "table": User,
    }

    # Only nulls are counted and they match the expectation
    only_nulls = validate(
        ColumnValuesMissingCount(missingCountValue=1),
        col_profile=nickname_profile,
        **common_args,
    )
    expected_only_nulls = TestCaseResult(
        executionTime=executed_at,
        testCaseStatus=TestCaseStatus.Success,
        result="Found missingCount=1.0. It should be 1.",
    )
    assert only_nulls == expected_only_nulls

    # Values listed in missingValueMatch are added to the null count
    nulls_and_matches = validate(
        ColumnValuesMissingCount(
            missingCountValue=2,
            missingValueMatch=["johnny b goode"],
        ),
        col_profile=nickname_profile,
        **common_args,
    )
    expected_nulls_and_matches = TestCaseResult(
        executionTime=executed_at,
        testCaseStatus=TestCaseStatus.Success,
        result="Found missingCount=2.0. It should be 2.",
    )
    assert nulls_and_matches == expected_nulls_and_matches

    # The found count differs from the expected one
    mismatch = validate(
        ColumnValuesMissingCount(missingCountValue=0),
        col_profile=nickname_profile,
        **common_args,
    )
    expected_mismatch = TestCaseResult(
        executionTime=executed_at,
        testCaseStatus=TestCaseStatus.Failed,
        result="Found missingCount=1.0. It should be 0.",
    )
    assert mismatch == expected_mismatch

    # The profile does not inform nullCount
    no_null_count = validate(
        ColumnValuesMissingCount(missingCountValue=0),
        col_profile=ColumnProfile(name="nickname"),
        **common_args,
    )
    expected_no_null_count = TestCaseResult(
        executionTime=executed_at,
        testCaseStatus=TestCaseStatus.Aborted,
        result=(
            "We expect `nullCount` to be informed on the profiler for ColumnValuesMissingCount."
        ),
    )
    assert no_null_count == expected_no_null_count
|
||||
|
||||
@ -19,6 +19,9 @@ from datetime import datetime
|
||||
|
||||
from metadata.generated.schema.entity.data.table import ColumnProfile, TableProfile
|
||||
from metadata.generated.schema.tests.basic import TestCaseResult, TestCaseStatus
|
||||
from metadata.generated.schema.tests.column.columnValuesLengthsToBeBetween import (
|
||||
ColumnValueLengthsToBeBetween,
|
||||
)
|
||||
from metadata.generated.schema.tests.column.columnValuesToBeBetween import (
|
||||
ColumnValuesToBeBetween,
|
||||
)
|
||||
@ -364,3 +367,53 @@ def test_column_values_to_be_not_null():
|
||||
"We expect `nullCount` to be informed on the profiler for ColumnValuesToBeNotNull."
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
def test_column_value_length_to_be_between():
    """
    ColumnValueLengthsToBeBetween: Success, Failed and Aborted outcomes
    """
    executed_at = EXECUTION_DATE.timestamp()
    lengths_profile = ColumnProfile(minLength=4, maxLength=16)

    # Profiled lengths fit inside the configured bounds
    in_range = validate(
        ColumnValueLengthsToBeBetween(minValue=2, maxValue=20),
        col_profile=lengths_profile,
        execution_date=EXECUTION_DATE,
    )
    expected_in_range = TestCaseResult(
        executionTime=executed_at,
        testCaseStatus=TestCaseStatus.Success,
        result="Found minLength=4.0, maxLength=16.0 vs. the expected minLength=2, maxLength=20.",
    )
    assert in_range == expected_in_range

    # The profiled minLength is below the configured minimum
    out_of_range = validate(
        ColumnValueLengthsToBeBetween(minValue=10, maxValue=20),
        col_profile=lengths_profile,
        execution_date=EXECUTION_DATE,
    )
    expected_out_of_range = TestCaseResult(
        executionTime=executed_at,
        testCaseStatus=TestCaseStatus.Failed,
        result="Found minLength=4.0, maxLength=16.0 vs. the expected minLength=10, maxLength=20.",
    )
    assert out_of_range == expected_out_of_range

    # maxLength is not informed in the profile
    incomplete_profile = ColumnProfile(minLength=4)
    aborted = validate(
        ColumnValueLengthsToBeBetween(minValue=2, maxValue=20),
        col_profile=incomplete_profile,
        execution_date=EXECUTION_DATE,
    )
    expected_aborted = TestCaseResult(
        executionTime=executed_at,
        testCaseStatus=TestCaseStatus.Aborted,
        result=(
            "We expect `minLength` & `maxLength` to be informed on the profiler for ColumnValueLengthsToBeBetween"
            + " but got minLength=4.0, maxLength=None."
        ),
    )
    assert aborted == expected_aborted
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user