2024-11-11 10:07:23 +01:00
|
|
|
from unittest.mock import Mock, patch
|
|
|
|
|
|
|
|
import pytest
|
|
|
|
|
|
|
|
from metadata.data_quality.validations.models import (
|
|
|
|
TableDiffRuntimeParameters,
|
|
|
|
TableParameter,
|
|
|
|
)
|
|
|
|
from metadata.data_quality.validations.table.sqlalchemy.tableDiff import (
|
|
|
|
TableDiffValidator,
|
|
|
|
compile_and_clauses,
|
|
|
|
)
|
|
|
|
from metadata.generated.schema.entity.data.table import (
|
|
|
|
Column,
|
|
|
|
DataType,
|
|
|
|
ProfileSampleType,
|
|
|
|
TableProfilerConfig,
|
|
|
|
)
|
|
|
|
from metadata.generated.schema.entity.services.databaseService import (
|
|
|
|
DatabaseServiceType,
|
|
|
|
)
|
2024-11-15 08:22:13 +01:00
|
|
|
from metadata.generated.schema.tests.testCase import TestCase, TestCaseParameterValue
|
2024-11-11 10:07:23 +01:00
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize(
    "elements, expected",
    [
        # A bare string is returned unchanged.
        ("a", "a"),
        # Flat lists are joined with " and ".
        (["a", "b"], "a and b"),
        # Nested lists with several members are parenthesised.
        (["a", ["b", "c"]], "a and (b and c)"),
        (["a", ["b", ["c", "d"]]], "a and (b and (c and d))"),
        (["a", ["b", "c"], "d"], "a and (b and c) and d"),
        # Empty inputs compile to an empty string.
        ([], ""),
        ("", ""),
        # Single-element lists, nested or not, collapse to the element itself.
        (["a"], "a"),
        ([["a"]], "a"),
    ],
)
def test_compile_and_clauses(elements, expected):
    """Check that ``compile_and_clauses`` renders nested clauses as one AND expression.

    Args:
        elements: A clause string or an arbitrarily nested list of clause strings.
        expected: The exact string ``compile_and_clauses`` must return.
    """
    assert compile_and_clauses(elements) == expected
|
|
|
|
|
|
|
|
|
|
|
|
def _string_columns(*names):
    """Return a fresh list of STRING-typed ``Column`` objects, one per name."""
    return [Column(name=name, dataType=DataType.STRING) for name in names]


def _postgres_table(*column_names):
    """Build an unvalidated Postgres ``TableParameter`` with the given STRING columns."""
    return TableParameter.model_construct(
        database_service_type=DatabaseServiceType.Postgres,
        columns=_string_columns(*column_names),
    )


def _profile(sample_type, sample):
    """Build a ``TableProfilerConfig`` with the given sample type and sample size."""
    return TableProfilerConfig(profileSampleType=sample_type, profileSample=sample)


def _on_both_tables(clause):
    """Return the expected pair when both tables get the same where clause."""
    return (clause, clause)


@pytest.mark.parametrize(
    "config,expected",
    [
        # 10 percent sample, single key column.
        (
            TableDiffRuntimeParameters.model_construct(
                database_service_type="BigQuery",
                table_profile_config=_profile(ProfileSampleType.PERCENTAGE, 10),
                table1=_postgres_table("id", "name"),
                table2=_postgres_table("id", "name"),
                keyColumns=["id"],
            ),
            _on_both_tables("SUBSTRING(MD5(id || 'a'), 1, 8) < '19999999'"),
        ),
        # 20 percent sample, single key column.
        (
            TableDiffRuntimeParameters.model_construct(
                database_service_type="BigQuery",
                table_profile_config=_profile(ProfileSampleType.PERCENTAGE, 20),
                table1=_postgres_table("id", "name"),
                table2=_postgres_table("id", "name"),
                keyColumns=["id"],
            ),
            _on_both_tables("SUBSTRING(MD5(id || 'a'), 1, 8) < '33333333'"),
        ),
        # 10 percent sample, composite key: both key columns are hashed.
        (
            TableDiffRuntimeParameters.model_construct(
                database_service_type="BigQuery",
                table_profile_config=_profile(ProfileSampleType.PERCENTAGE, 10),
                table1=_postgres_table("id", "name"),
                table2=_postgres_table("id", "name"),
                keyColumns=["id", "name"],
            ),
            _on_both_tables(
                "SUBSTRING(MD5(id || name || 'a'), 1, 8) < '19999999'"
            ),
        ),
        # Row-based sample (20 rows; the test body mocks the row count to 10_000).
        (
            TableDiffRuntimeParameters.model_construct(
                database_service_type="BigQuery",
                table_profile_config=_profile(ProfileSampleType.ROWS, 20),
                table1=_postgres_table("id", "name"),
                table2=_postgres_table("id", "name"),
                keyColumns=["id", "name"],
            ),
            _on_both_tables(
                "SUBSTRING(MD5(id || name || 'a'), 1, 8) < '0083126e'"
            ),
        ),
        # The key column is spelled "ID" in table2, so the two tables get
        # different clauses: the second one is expected to quote the name.
        (
            TableDiffRuntimeParameters.model_construct(
                table_profile_config=_profile(ProfileSampleType.ROWS, 20),
                table1=_postgres_table("id", "name"),
                table2=_postgres_table("ID", "name"),
                keyColumns=["id"],
            ),
            (
                "SUBSTRING(MD5(id || 'a'), 1, 8) < '0083126e'",
                "SUBSTRING(MD5(\"ID\" || 'a'), 1, 8) < '0083126e'",
            ),
        ),
        # No profiler config: no sampling clause for either table.
        (
            TableDiffRuntimeParameters.model_construct(
                table_profile_config=None,
                table1=_postgres_table("id", "name"),
                table2=_postgres_table("id", "name"),
                keyColumns=["id"],
            ),
            (None, None),
        ),
    ],
)
def test_sample_where_clauses(config, expected):
    """Check the per-table sampling clauses produced by ``sample_where_clause``.

    Args:
        config: Runtime parameters assigned to the validator before the call.
        expected: The value ``sample_where_clause()`` must return; every case
            above supplies a two-element tuple.
    """
    case_sensitivity = TestCaseParameterValue(
        name="caseSensitiveColumns", value="false"
    )
    test_case = TestCase.model_construct(parameterValues=[case_sensitivity])
    validator = TableDiffValidator(None, test_case, None)
    validator.runtime_params = config

    profile_config = config.table_profile_config
    if profile_config and profile_config.profileSampleType == ProfileSampleType.ROWS:
        # Row-based sampling looks up the table size; stub it with a fixed count.
        validator.get_row_count = Mock(return_value=10_000)

    # Make random.choices deterministic; presumably it supplies the 'a' salt
    # that appears in the expected clauses.
    with patch("random.choices", Mock(return_value=["a"])):
        assert validator.sample_where_clause() == expected
|