Ayush Shah 9880f06b2c
Fixes #17489: Allow non numeric numbers to be sent via Json, Replace NaN value… (#17490)
* fix: Allow non numeric numbers to be sent via Json, Replace NaN values with None in SQAProfilerInterface

Replace NaN values with None in the SQAProfilerInterface class to maintain database parity. NaN values will be cast to null in OpenMetadata. This change ensures that data handling processes account for this conversion.

* fix: histogram overflow error

* test: Add Unit Test for Null and Null Ratio Metric

* chore: Address comments

* chore: Address comments

* fix: checkstyle and message

* fix: failing tests as null count works as expected
2024-08-20 16:33:55 +05:30

300 lines
11 KiB
Python

# Copyright 2021 Collate
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
End-to-end validation of the test suite workflow, run against a SQLite database service.
"""
import os
import unittest
from datetime import datetime, timedelta
import sqlalchemy as sqa
from sqlalchemy.orm import Session, declarative_base
from metadata.generated.schema.api.data.createDatabase import CreateDatabaseRequest
from metadata.generated.schema.api.data.createDatabaseSchema import (
CreateDatabaseSchemaRequest,
)
from metadata.generated.schema.api.data.createTable import CreateTableRequest
from metadata.generated.schema.api.services.createDatabaseService import (
CreateDatabaseServiceRequest,
)
from metadata.generated.schema.entity.data.database import Database
from metadata.generated.schema.entity.data.databaseSchema import DatabaseSchema
from metadata.generated.schema.entity.data.table import Column, DataType
from metadata.generated.schema.entity.services.connections.database.sqliteConnection import (
SQLiteConnection,
SQLiteScheme,
)
from metadata.generated.schema.entity.services.connections.metadata.openMetadataConnection import (
OpenMetadataConnection,
)
from metadata.generated.schema.entity.services.databaseService import (
DatabaseConnection,
DatabaseService,
DatabaseServiceType,
)
from metadata.generated.schema.tests.testCase import TestCase
from metadata.ingestion.ometa.ometa_api import OpenMetadata
from metadata.workflow.data_quality import TestSuiteWorkflow
# Table-level test: the column count of the target table must lie in [1, 5].
_COLUMN_COUNT_TEST_CASE = {
    "name": "my_test_case",
    "testDefinitionName": "tableColumnCountToBeBetween",
    "parameterValues": [
        {"name": "minColValue", "value": "1"},
        {"name": "maxColValue", "value": "5"},
    ],
}

# Column-level test on ``id``; also asks for passed/failed row counts.
_NOT_NULL_TEST_CASE = {
    "name": "table_column_to_be_not_null",
    "testDefinitionName": "columnValuesToBeNotNull",
    "columnName": "id",
    "computePassedFailedRowCount": True,
}

# Connection settings for the OpenMetadata server the workflow talks to.
# NOTE(review): the hard-coded JWT is presumably the default admin token of a
# local development server -- confirm before reusing it anywhere else.
_OPENMETADATA_SERVER_CONFIG = {
    "hostPort": "http://localhost:8585/api",
    "authProvider": "openmetadata",
    "securityConfig": {
        "jwtToken": "eyJraWQiOiJHYjM4OWEtOWY3Ni1nZGpzLWE5MmotMDI0MmJrOTQzNTYiLCJ0eXAiOiJKV1QiLCJhbGciOiJSUzI1NiJ9.eyJzdWIiOiJhZG1pbiIsImlzQm90IjpmYWxzZSwiaXNzIjoib3Blbi1tZXRhZGF0YS5vcmciLCJpYXQiOjE2NjM5Mzg0NjIsImVtYWlsIjoiYWRtaW5Ab3Blbm1ldGFkYXRhLm9yZyJ9.tS8um_5DKu7HgzGBzS1VTA5uUjKWOCU0B_j08WXBiEC0mr0zNREkqVfwFDD-d24HlNEbrqioLsBuFRiwIWKc1m_ZlVQbG7P36RUxhuv2vbSp80FKyNM-Tj93FDzq91jsyNmsQhyNv_fNr3TXfzzSPjHt8Go0FMMP66weoKMgW2PbXlhVKwEuXUHyakLLzewm9UMeQaEiRzhiTMU3UkLXcKbYEJJvfNFcLwSl9W8JCO_l0Yj3ud-qt_nQYEZwqW6u5nfdQllN133iikV4fM5QZsMCnm8Rq1mvLR0y9bmJiD7fwM1tmJ791TUWqmKaTnP49U493VanKpUAfzIiOiIbhg"
    },
}

# Workflow configuration: source (which table to test), processor (which
# tests to run), sink and server connection.
test_suite_config = {
    "source": {
        "type": "custom-database",
        "serviceName": "test_suite_service_test",
        "sourceConfig": {
            "config": {
                "type": "TestSuite",
                "entityFullyQualifiedName": "test_suite_service_test.test_suite_database.test_suite_database_schema.users",
            }
        },
    },
    "processor": {
        "type": "orm-test-runner",
        "config": {
            "testCases": [_COLUMN_COUNT_TEST_CASE, _NOT_NULL_TEST_CASE],
        },
    },
    "sink": {"type": "metadata-rest", "config": {}},
    "workflowConfig": {"openMetadataServerConfig": _OPENMETADATA_SERVER_CONFIG},
}
# Declarative base class that the ORM models defined in this module inherit from.
Base = declarative_base()
class User(Base):
    """ORM model mapped to the ``users`` table."""

    __tablename__ = "users"

    # Integer primary key.
    id = sqa.Column(sqa.Integer, primary_key=True)

    # Name-like text columns, each limited to 256 characters.
    name = sqa.Column(sqa.String(256))
    fullname = sqa.Column(sqa.String(256))
    nickname = sqa.Column(sqa.String(256))

    age = sqa.Column(sqa.Integer)
class EmptyUser(Base):
    """ORM model mapped to the ``empty_users`` table."""

    __tablename__ = "empty_users"

    # Integer primary key.
    id = sqa.Column(sqa.Integer, primary_key=True)

    # Name-like text columns, each limited to 256 characters.
    name = sqa.Column(sqa.String(256))
    fullname = sqa.Column(sqa.String(256))
    nickname = sqa.Column(sqa.String(256))

    age = sqa.Column(sqa.Integer)
class TestE2EWorkflow(unittest.TestCase):
    """End-to-end test of the test suite workflow against a SQLite service.

    ``setUpClass`` registers a SQLite database service, a database, a schema
    and two tables (``users`` and ``empty_users``) in OpenMetadata, creates
    the matching SQLite tables and inserts rows into ``users`` only.
    ``tearDownClass`` hard-deletes the service and removes the SQLite file.
    """

    # Client built from the server settings in the module-level config.
    metadata = OpenMetadata(
        OpenMetadataConnection.model_validate(
            test_suite_config["workflowConfig"]["openMetadataServerConfig"]
        )
    )

    # SQLite file stored next to this module and named after it. The path is
    # made absolute first so that a relative ``__file__`` does not end up with
    # its directory component duplicated.
    db_path = f"{os.path.splitext(os.path.abspath(__file__))[0]}.db"

    sqlite_conn = DatabaseConnection(
        config=SQLiteConnection(
            scheme=SQLiteScheme.sqlite_pysqlite,
            databaseMode=db_path + "?check_same_thread=False",
        )
    )

    @classmethod
    def setUpClass(cls):
        """Create the OpenMetadata entities and seed the SQLite database."""
        service: DatabaseService = cls.metadata.create_or_update(
            CreateDatabaseServiceRequest(
                name="test_suite_service_test",
                serviceType=DatabaseServiceType.SQLite,
                connection=cls.sqlite_conn,
            )
        )
        database: Database = cls.metadata.create_or_update(
            CreateDatabaseRequest(
                name="test_suite_database",
                service=service.fullyQualifiedName,
            )
        )
        database_schema: DatabaseSchema = cls.metadata.create_or_update(
            CreateDatabaseSchemaRequest(
                name="test_suite_database_schema",
                database=database.fullyQualifiedName,
            )
        )

        # Both tables are registered with the same column layout.
        column_specs = (
            ("id", DataType.INT),
            ("name", DataType.STRING),
            ("fullname", DataType.STRING),
            ("nickname", DataType.STRING),
            ("age", DataType.INT),
        )
        for table_name in ("users", "empty_users"):
            cls.metadata.create_or_update(
                CreateTableRequest(
                    name=table_name,
                    columns=[
                        Column(name=column_name, dataType=data_type)
                        for column_name, data_type in column_specs
                    ],
                    databaseSchema=database_schema.fullyQualifiedName,
                )
            )

        engine = sqa.create_engine(f"sqlite:///{cls.sqlite_conn.config.databaseMode}")
        try:
            User.__table__.create(bind=engine)
            EmptyUser.__table__.create(bind=engine)
            # The context manager closes the session even if seeding fails.
            with Session(bind=engine) as session:
                for _ in range(10):
                    session.add_all(
                        [
                            User(
                                name="John",
                                fullname="John Doe",
                                nickname="johnny b goode",
                                age=30,
                            ),
                            User(
                                name="Jane",
                                fullname="Jone Doe",
                                nickname="Johnny d",
                                age=31,
                            ),
                            User(
                                name="John",
                                fullname="John Doe",
                                nickname=None,
                                age=None,
                            ),
                        ]
                    )
                    session.commit()
        finally:
            # Release pooled connections so no handle on the SQLite file
            # outlives the seeding step.
            engine.dispose()

    @classmethod
    def tearDownClass(cls) -> None:
        """Hard-delete the database service and remove the SQLite file."""
        try:
            service_db_id = str(
                cls.metadata.get_by_name(
                    entity=DatabaseService, fqn="test_suite_service_test"
                ).id.root
            )
            cls.metadata.delete(
                entity=DatabaseService,
                entity_id=service_db_id,
                recursive=True,
                hard_delete=True,
            )
        finally:
            # Always remove the file: a leftover database would make the next
            # run fail when it tries to create the tables again.
            os.remove(cls.db_path)
        return super().tearDownClass()

    def test_e2e_cli_workflow(self):
        """Run the workflow for each table and check the stored results."""
        # Local import: ``copy`` is not among the module-level imports.
        from copy import deepcopy

        schema_fqn = (
            "test_suite_service_test.test_suite_database.test_suite_database_schema"
        )
        parameters = [
            {"table_name": "users", "status": "Success"},
            {"table_name": "empty_users", "status": "Success"},
        ]
        for param in parameters:
            with self.subTest(param=param):
                table_fqn = f"{schema_fqn}.{param['table_name']}"

                # Work on a copy so the shared module-level config is not
                # mutated from one sub-test (or test module) to the next.
                config = deepcopy(test_suite_config)
                config["source"]["sourceConfig"]["config"][
                    "entityFullyQualifiedName"
                ] = table_fqn

                workflow = TestSuiteWorkflow.create(config)
                workflow.execute()
                workflow.raise_from_status()

                # Query results in a +/- 3 day window, in epoch milliseconds.
                now = datetime.now()
                window = {
                    "startTs": int((now - timedelta(days=3)).timestamp()) * 1000,
                    "endTs": int((now + timedelta(days=3)).timestamp()) * 1000,
                }

                # (test case name relative to the table, expected status)
                expectations = [
                    ("my_test_case", "Success"),
                    ("id.table_column_to_be_not_null", param["status"]),
                ]
                for relative_name, expected_status in expectations:
                    test_case_fqn = f"{table_fqn}.{relative_name}"

                    test_case = self.metadata.get_by_name(
                        entity=TestCase,
                        fqn=test_case_fqn,
                        fields=["testDefinition", "testSuite"],
                    )
                    self.assertTrue(
                        test_case, f"test case {test_case_fqn} was not created"
                    )

                    response = self.metadata.client.get(
                        f"/dataQuality/testCases/{test_case_fqn}/testCaseResult",
                        data=window,
                    )
                    results: list = response.get("data")  # type: ignore
                    self.assertTrue(
                        results, f"no results returned for {test_case_fqn}"
                    )
                    self.assertEqual(
                        results[0]["testCaseStatus"], expected_status
                    )