OpenMetadata/ingestion/tests/unit/profiler/test_session_validations.py
Mayur Singal db0e34c709
Fixing Test Connection for Dynamo & Glue (#4316)
* Fixing Test Connection for Dynamo

* Fixed Glue Connector

* renamed engine to connection

* Fixed the return signature

* Added dataclass
2022-04-22 11:30:59 +05:30

254 lines
8.2 KiB
Python

# Copyright 2021 Collate
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Test validations that need a session configured to run
"""
from datetime import datetime
from unittest import TestCase
from sqlalchemy import TEXT, Column, Integer, String, create_engine
from sqlalchemy.orm import declarative_base
from metadata.generated.schema.entity.data.table import ColumnProfile
from metadata.generated.schema.tests.basic import TestCaseResult, TestCaseStatus
from metadata.generated.schema.tests.column.columnValuesMissingCountToBeEqual import (
ColumnValuesMissingCount,
)
from metadata.generated.schema.tests.column.columnValuesToBeNotInSet import (
ColumnValuesToBeNotInSet,
)
from metadata.generated.schema.tests.column.columnValuesToMatchRegex import (
ColumnValuesToMatchRegex,
)
from metadata.orm_profiler.validations.core import validate
from metadata.utils.connections import create_and_bind_session
# Fixed, naive (no tzinfo) execution date shared by every validation in this
# module so the expected `executionTime` values are reproducible.
EXECUTION_DATE = datetime(year=2021, month=7, day=3)
# Declarative base class for the ORM models declared in this test module.
Base = declarative_base()
class User(Base):
    """
    ORM model for the `users` table that the validations are run against
    """

    __tablename__ = "users"

    id = Column(Integer, primary_key=True)
    # Bounded-length text columns
    name = Column(String(length=256))
    fullname = Column(String(length=256))
    nickname = Column(String(length=256))
    # Free-form text column
    comments = Column(TEXT)
    age = Column(Integer)
class MetricsTest(TestCase):
    """
    Run the session-backed column validations against an in-memory SQLite table.

    Each test calls `validate` for one test-case definition and compares the
    returned value with the expected `TestCaseResult` for the Success, Failed
    and Aborted outcomes.
    """

    # Built at class-definition time; every test method shares this engine
    # and the session bound to it.
    engine = create_engine("sqlite+pysqlite:///:memory:", echo=False, future=True)
    session = create_and_bind_session(engine)

    @classmethod
    def setUpClass(cls) -> None:
        """
        Create the `users` table and load the two fixture rows
        """
        User.__table__.create(bind=cls.engine)

        data = [
            User(
                name="John",
                fullname="John Doe",
                nickname="johnny b goode",
                comments="no comments",
                age=30,
            ),
            User(
                name="Jane",
                fullname="Jone Doe",
                nickname=None,  # the single NULL nickname the missing-count test relies on
                comments="maybe some comments",
                age=31,
            ),
        ]
        cls.session.add_all(data)
        cls.session.commit()

    @classmethod
    def tearDownClass(cls) -> None:
        """
        Release the shared session and engine once every test has run
        """
        # NOTE(review): assumes `create_and_bind_session` returns a SQLAlchemy
        # Session (it is used with `add_all`/`commit` above) - confirm.
        try:
            cls.session.close()
        finally:
            # Dispose the engine even if closing the session failed.
            cls.engine.dispose()

    def _run_validation(self, test_case, col_profile: ColumnProfile):
        """
        Call `validate` for `test_case` with the shared session, table and date

        :param test_case: test-case definition passed to `validate` as its first argument
        :param col_profile: column profile the validation is evaluated against
        :return: whatever `validate` returns
        """
        return validate(
            test_case,
            col_profile=col_profile,
            execution_date=EXECUTION_DATE,
            session=self.session,
            table=User,
        )

    def _assert_result(self, actual, status: TestCaseStatus, message: str) -> None:
        """
        Assert that `actual` equals the TestCaseResult built from `status` and `message`

        :param actual: value returned by `validate`
        :param status: expected test case status
        :param message: expected result message
        """
        expected = TestCaseResult(
            executionTime=EXECUTION_DATE.timestamp(),
            testCaseStatus=status,
            result=message,
        )
        # assertEqual is not stripped under `python -O` and reports both
        # values on failure, unlike a bare `assert`.
        self.assertEqual(actual, expected)

    def test_column_values_not_in_set(self):
        """
        Check ColumnValuesToBeNotInSet for the success, failure and aborted outcomes
        """
        column_profile = ColumnProfile(name="name")  # column name

        res_ok = self._run_validation(
            ColumnValuesToBeNotInSet(forbiddenValues=["random", "forbidden"]),
            column_profile,
        )
        self._assert_result(
            res_ok,
            TestCaseStatus.Success,
            "Found countInSet=0. It should be 0.",
        )

        # "John" is present in the fixture rows, so one forbidden value is found.
        res_ko = self._run_validation(
            ColumnValuesToBeNotInSet(forbiddenValues=["John", "forbidden"]),
            column_profile,
        )
        self._assert_result(
            res_ko,
            TestCaseStatus.Failed,
            "Found countInSet=1. It should be 0.",
        )

        # `random` is not a column declared on User.
        res_aborted = self._run_validation(
            ColumnValuesToBeNotInSet(forbiddenValues=["John", "forbidden"]),
            ColumnProfile(name="random"),
        )
        self._assert_result(
            res_aborted,
            TestCaseStatus.Aborted,
            (
                "Error computing ColumnValuesToBeNotInSet for users.random - Cannot find"
                + " the configured column random for ColumnValuesToBeNotInSet"
            ),
        )

    def test_column_values_to_match_regex(self):
        """
        Check ColumnValuesToMatchRegex for the success, failure and aborted outcomes
        """
        column_profile = ColumnProfile(name="name", valuesCount=2)  # column name

        # Both fixture names ("John", "Jane") start with "J".
        res_ok = self._run_validation(
            ColumnValuesToMatchRegex(regex="J%"),
            column_profile,
        )
        self._assert_result(
            res_ok,
            TestCaseStatus.Success,
            "Found likeCount=2 & valuesCount=2.0. They should be equal.",
        )

        # Only "John" starts with "Jo".
        res_ko = self._run_validation(
            ColumnValuesToMatchRegex(regex="Jo%"),
            column_profile,
        )
        self._assert_result(
            res_ko,
            TestCaseStatus.Failed,
            "Found likeCount=1 & valuesCount=2.0. They should be equal.",
        )

        # The profile carries no `valuesCount` here.
        res_aborted = self._run_validation(
            ColumnValuesToMatchRegex(regex="J%"),
            ColumnProfile(name="name"),
        )
        self._assert_result(
            res_aborted,
            TestCaseStatus.Aborted,
            "We expect `valuesCount` to be informed for ColumnValuesToMatchRegex.",
        )

    def test_column_values_missing_count_to_be_equal(self):
        """
        Check ColumnValuesMissingCount for the success, failure and aborted outcomes
        """
        column_profile = ColumnProfile(name="nickname", nullCount=1)

        res_ok = self._run_validation(
            ColumnValuesMissingCount(missingCountValue=1),
            column_profile,
        )
        self._assert_result(
            res_ok,
            TestCaseStatus.Success,
            "Found missingCount=1.0. It should be 1.",
        )

        # With an extra `missingValueMatch` entry equal to John's nickname the
        # expected missing count is 2.
        res_ok_2 = self._run_validation(
            ColumnValuesMissingCount(
                missingCountValue=2,
                missingValueMatch=["johnny b goode"],
            ),
            column_profile,
        )
        self._assert_result(
            res_ok_2,
            TestCaseStatus.Success,
            "Found missingCount=2.0. It should be 2.",
        )

        res_ko = self._run_validation(
            ColumnValuesMissingCount(
                missingCountValue=0,
            ),
            column_profile,
        )
        self._assert_result(
            res_ko,
            TestCaseStatus.Failed,
            "Found missingCount=1.0. It should be 0.",
        )

        # The profile carries no `nullCount` here.
        res_aborted = self._run_validation(
            ColumnValuesMissingCount(
                missingCountValue=0,
            ),
            ColumnProfile(name="nickname"),
        )
        self._assert_result(
            res_aborted,
            TestCaseStatus.Aborted,
            "We expect `nullCount` to be informed on the profiler for ColumnValuesMissingCount.",
        )