OpenMetadata/ingestion/tests/unit/test_suite/test_validations_databases.py
Imri Paran b960b60965
Fix #16421: add tableDiff test case (#16554)
* feat: add tableDiff test case

This changed introduces a "table diff" test case which
compares two tables and fails if they are not identical.
The similarity is made based on a specific "key" (because the test only makes sense when performed on ordered collections).

1. Added the `tableDiff` test definition.
2. Implemented a "runtime" parameters feature which injects additional parameters for the test at runtime.
3. Integration tests (because of course).

This feature was not tested end-to-end yet because "array" data

* pydantic v2

* format

* format

* format and added data diff to setup.py

* format

* fixed param issue which has type ARRAY

* fixed runtime_parameter_setter

* moved models to parent directory

* handle errors in table diff

* fixed issue with edit test case

* format

* added more details to pytest skip

* format

* refactor: Improve createTestCaseParameters function in DataQualityUtils

* fixed unit test

* removed unused fixture

* removed validator.py

* fixed tests

* added validate kwarg to tests_mixin

* removed "postgres" data diff extra as they interfere with psycopg2-binary

* fixed tests

* pinned tenacity for tests

* reverted tenacity pinning

* added ui support for test diff

* fixed dq cypress and added edit flow

* organized the test case

* added dialect support

* fixed tests

* option style fix

* fixed calculation for passing/failing rows

* restrict the tableDiff test to limited services

* set where to None if blank string

* fixed where clause

* fixed tests for where clause

* use displayName in place of name in edit form

* added docs for RuntimeParameterSetter

* fixed cypress

---------

Co-authored-by: Shailesh Parmar <shailesh.parmar.webdev@gmail.com>
2024-06-20 16:54:12 +02:00

498 lines
14 KiB
Python

# Copyright 2021 Collate
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Test Table and Column Tests' validate implementations.
Each test should validate the Success, Failure and Aborted statuses
"""
from datetime import date, datetime
from unittest.mock import patch
import pytest
from metadata.generated.schema.tests.basic import TestCaseResult, TestCaseStatus
from metadata.utils.importer import import_test_case_class
EXECUTION_DATE = datetime.strptime("2021-07-03", "%Y-%m-%d")
# pylint: disable=line-too-long
@pytest.mark.parametrize(
"test_case_name,test_case_type,test_type,expected",
[
(
"test_case_column_values_to_be_between_date",
"columnValuesToBeBetween",
"COLUMN",
(
TestCaseResult,
"2021-07-01 00:00:00",
"2021-07-01 23:59:59.999999",
TestCaseStatus.Failed,
0.0,
30.0,
0.0,
100.0,
),
),
(
"test_case_column_values_to_be_between_datetime",
"columnValuesToBeBetween",
"COLUMN",
(
TestCaseResult,
"2021-07-01 10:37:59",
"2021-07-01 10:37:59",
TestCaseStatus.Success,
None,
None,
None,
None,
),
),
(
"test_case_column_value_length_to_be_between",
"columnValueLengthsToBeBetween",
"COLUMN",
(
TestCaseResult,
"0",
"8",
TestCaseStatus.Failed,
20.0,
10.0,
66.67,
33.33,
),
),
(
"test_case_column_value_length_to_be_between_col_space",
"columnValueLengthsToBeBetween",
"COLUMN",
(TestCaseResult, "2", "3", TestCaseStatus.Success, 30.0, 0.0, 100.0, 0.0),
),
(
"test_case_column_value_length_to_be_between_no_min",
"columnValueLengthsToBeBetween",
"COLUMN",
(TestCaseResult, None, None, TestCaseStatus.Success, 30.0, 0.0, 100.0, 0.0),
),
(
"test_case_column_value_max_to_be_between",
"columnValueMaxToBeBetween",
"COLUMN",
(TestCaseResult, "31", None, TestCaseStatus.Failed, None, None, None, None),
),
(
"test_case_column_value_max_to_be_between_no_min",
"columnValueMaxToBeBetween",
"COLUMN",
(TestCaseResult, None, None, TestCaseStatus.Failed, None, None, None, None),
),
(
"test_case_column_value_mean_to_be_between",
"columnValueMeanToBeBetween",
"COLUMN",
(
TestCaseResult,
"30.5",
None,
TestCaseStatus.Failed,
None,
None,
None,
None,
),
),
(
"test_case_column_value_mean_to_be_between_no_max",
"columnValueMeanToBeBetween",
"COLUMN",
(
TestCaseResult,
None,
None,
TestCaseStatus.Success,
None,
None,
None,
None,
),
),
(
"test_case_column_value_median_to_be_between",
"columnValueMedianToBeBetween",
"COLUMN",
(TestCaseResult, "30", None, TestCaseStatus.Failed, None, None, None, None),
),
(
"test_case_column_value_min_to_be_between",
"columnValueMinToBeBetween",
"COLUMN",
(
TestCaseResult,
"30",
None,
TestCaseStatus.Success,
None,
None,
None,
None,
),
),
(
"test_case_column_value_min_to_be_between_no_min",
"columnValueMinToBeBetween",
"COLUMN",
(
TestCaseResult,
None,
None,
TestCaseStatus.Success,
None,
None,
None,
None,
),
),
(
"test_case_column_value_stddev_to_be_between",
"columnValueStdDevToBeBetween",
"COLUMN",
(
TestCaseResult,
"0.25",
None,
TestCaseStatus.Failed,
None,
None,
None,
None,
),
),
(
"test_case_column_value_stddev_to_be_between_no_min",
"columnValueStdDevToBeBetween",
"COLUMN",
(
TestCaseResult,
None,
None,
TestCaseStatus.Success,
None,
None,
None,
None,
),
),
(
"test_case_column_value_in_set",
"columnValuesToBeInSet",
"COLUMN",
(
TestCaseResult,
"20",
None,
TestCaseStatus.Success,
20.0,
10.0,
66.67,
33.33,
),
),
(
"test_case_column_values_missing_count_to_be_equal",
"columnValuesMissingCount",
"COLUMN",
(
TestCaseResult,
"20",
None,
TestCaseStatus.Failed,
None,
None,
None,
None,
),
),
(
"test_case_column_values_missing_count_to_be_equal_missing_values",
"columnValuesMissingCount",
"COLUMN",
(TestCaseResult, "30", None, TestCaseStatus.Failed, None, None, None, None),
),
(
"test_case_column_values_not_in_set",
"columnValuesToBeNotInSet",
"COLUMN",
(
TestCaseResult,
"20",
None,
TestCaseStatus.Failed,
10.0,
20.0,
33.33,
66.67,
),
),
(
"test_case_column_sum_to_be_between",
"columnValuesSumToBeBetween",
"COLUMN",
(
TestCaseResult,
"610",
None,
TestCaseStatus.Failed,
None,
None,
None,
None,
),
),
(
"test_case_column_values_to_be_between",
"columnValuesToBeBetween",
"COLUMN",
(TestCaseResult, "30", None, TestCaseStatus.Success, 30.0, 0.0, 100.0, 0.0),
),
(
"test_case_column_values_to_be_not_null",
"columnValuesToBeNotNull",
"COLUMN",
(
TestCaseResult,
"10",
None,
TestCaseStatus.Failed,
20.0,
10.0,
66.67,
33.33,
),
),
(
"test_case_column_values_to_be_unique",
"columnValuesToBeUnique",
"COLUMN",
(TestCaseResult, "20", "0", TestCaseStatus.Failed, 0.0, 20.0, 0.0, 100.0),
),
(
"test_case_column_values_to_match_regex",
"columnValuesToMatchRegex",
"COLUMN",
(TestCaseResult, "30", None, TestCaseStatus.Success, 30.0, 0.0, 100.0, 0.0),
),
(
"test_case_column_values_to_not_match_regex",
"columnValuesToNotMatchRegex",
"COLUMN",
(TestCaseResult, "0", None, TestCaseStatus.Success, 30.0, 0.0, 100.0, 0.0),
),
(
"test_case_table_column_count_to_be_between",
"tableColumnCountToBeBetween",
"TABLE",
(TestCaseResult, "7", None, TestCaseStatus.Success, None, None, None, None),
),
(
"test_case_table_column_count_to_equal",
"tableColumnCountToEqual",
"TABLE",
(TestCaseResult, "7", None, TestCaseStatus.Failed, None, None, None, None),
),
(
"test_case_table_column_name_to_exist",
"tableColumnNameToExist",
"TABLE",
(TestCaseResult, "1", None, TestCaseStatus.Success, None, None, None, None),
),
(
"test_case_column_to_match_set",
"tableColumnToMatchSet",
"TABLE",
(
TestCaseResult,
"0",
None,
TestCaseStatus.Failed,
None,
None,
None,
None,
),
),
(
"test_case_column_to_match_set_ordered",
"tableColumnToMatchSet",
"TABLE",
(TestCaseResult, None, None, TestCaseStatus.Failed, None, None, None, None),
),
(
"test_case_table_custom_sql_query",
"tableCustomSQLQuery",
"TABLE",
(TestCaseResult, "20", None, TestCaseStatus.Failed, None, None, None, None),
),
(
"test_case_table_custom_sql_query_success",
"tableCustomSQLQuery",
"TABLE",
(TestCaseResult, "0", None, TestCaseStatus.Success, None, None, None, None),
),
(
"test_case_table_row_count_to_be_between",
"tableRowCountToBeBetween",
"TABLE",
(
TestCaseResult,
"30",
None,
TestCaseStatus.Success,
None,
None,
None,
None,
),
),
(
"test_case_table_row_count_to_be_equal",
"tableRowCountToEqual",
"TABLE",
(TestCaseResult, "30", None, TestCaseStatus.Failed, None, None, None, None),
),
(
"test_case_table_row_inserted_count_to_be_between",
"tableRowInsertedCountToBeBetween",
"TABLE",
(TestCaseResult, "6", None, TestCaseStatus.Success, None, None, None, None),
),
(
"test_case_table_custom_sql_query_with_threshold_success",
"tableCustomSQLQuery",
"TABLE",
(
TestCaseResult,
"10",
None,
TestCaseStatus.Success,
None,
None,
None,
None,
),
),
(
"test_case_table_custom_sql_unsafe_query_aborted",
"tableCustomSQLQuery",
"TABLE",
(
TestCaseResult,
None,
None,
TestCaseStatus.Aborted,
None,
None,
None,
None,
),
),
],
)
def test_suite_validation_database(
test_case_name,
test_case_type,
test_type,
expected,
request,
create_sqlite_table,
):
"""Generic test runner for test validations"""
test_case = request.getfixturevalue(test_case_name)
(
type_,
val_1,
val_2,
status,
passed_rows,
failed_rows,
passed_percentage,
failed_percentage,
) = expected
if test_case_name == "test_case_column_values_to_be_between_date":
with patch(
"metadata.data_quality.validations.column.sqlalchemy.columnValuesToBeBetween.ColumnValuesToBeBetweenValidator._run_results",
return_value=date(2021, 7, 1),
):
test_handler_obj = import_test_case_class(
test_type,
"sqlalchemy",
test_case_type,
)
test_handler = test_handler_obj(
create_sqlite_table,
test_case=test_case,
execution_date=EXECUTION_DATE.timestamp(),
)
res = test_handler.run_validation()
elif test_case_name == "test_case_column_values_to_be_between_datetime":
with patch(
"metadata.data_quality.validations.column.sqlalchemy.columnValuesToBeBetween.ColumnValuesToBeBetweenValidator._run_results",
return_value=datetime(2021, 7, 1, 10, 37, 59),
):
test_handler_obj = import_test_case_class(
test_type,
"sqlalchemy",
test_case_type,
)
test_handler = test_handler_obj(
create_sqlite_table,
test_case=test_case,
execution_date=EXECUTION_DATE.timestamp(),
)
res = test_handler.run_validation()
else:
test_handler_obj = import_test_case_class(
test_type,
"sqlalchemy",
test_case_type,
)
test_handler = test_handler_obj(
create_sqlite_table,
test_case=test_case,
execution_date=EXECUTION_DATE.timestamp(),
)
res = test_handler.run_validation()
assert isinstance(res, type_)
if val_1:
assert res.testResultValue[0].value == val_1
if val_2:
assert res.testResultValue[1].value == val_2
if passed_rows:
assert res.passedRows == passed_rows
if failed_rows:
assert res.failedRows == failed_rows
if passed_percentage:
assert round(res.passedRowsPercentage, 2) == passed_percentage
if failed_percentage:
assert round(res.failedRowsPercentage, 2) == failed_percentage
assert res.testCaseStatus == status