mirror of
https://github.com/open-metadata/OpenMetadata.git
synced 2025-08-24 08:58:06 +00:00

* feat(data-quality): use sampling config in data diff - get the table profiling config - use hashing to deterministically sample the same ids from each table - use dirty-equals to assert results of stochastic processes * - reverted missing md5 - added missing database service type * - use a custom substr sql function * fixed nonce * added failure for mssql with sampling because it requires a larger change in the data-diff library * fixed unit tests * updated range for sampling
179 lines
6.7 KiB
Python
179 lines
6.7 KiB
Python
from unittest.mock import Mock, patch
|
|
|
|
import pytest
|
|
|
|
from metadata.data_quality.validations.models import (
|
|
TableDiffRuntimeParameters,
|
|
TableParameter,
|
|
)
|
|
from metadata.data_quality.validations.table.sqlalchemy.tableDiff import (
|
|
TableDiffValidator,
|
|
compile_and_clauses,
|
|
)
|
|
from metadata.generated.schema.entity.data.table import (
|
|
Column,
|
|
DataType,
|
|
ProfileSampleType,
|
|
TableProfilerConfig,
|
|
)
|
|
from metadata.generated.schema.entity.services.databaseService import (
|
|
DatabaseServiceType,
|
|
)
|
|
|
|
|
|
@pytest.mark.parametrize(
    "elements, expected",
    [
        # A bare string is passed through unchanged.
        ("a", "a"),
        # Flat lists are joined with " and ".
        (["a", "b"], "a and b"),
        # Nested lists with more than one element are parenthesised.
        (["a", ["b", "c"]], "a and (b and c)"),
        (["a", ["b", ["c", "d"]]], "a and (b and (c and d))"),
        (["a", ["b", "c"], "d"], "a and (b and c) and d"),
        # Empty inputs compile to an empty string.
        ([], ""),
        ("", ""),
        # Single-element lists (even nested ones) get no parentheses.
        (["a"], "a"),
        ([["a"]], "a"),
    ],
)
def test_compile_and_clauses(elements, expected):
    """Check that ``compile_and_clauses`` renders ``elements`` as ``expected``.

    Args:
        elements: A clause string or an arbitrarily nested list of clause
            strings.
        expected: The exact string ``compile_and_clauses`` must return.
    """
    assert compile_and_clauses(elements) == expected
|
|
|
|
|
|
def _runtime_params(key_columns, table_profile_config, **extra):
    """Build unvalidated table-diff runtime parameters for the sampling tests.

    Both tables are declared as Postgres; only ``table1`` carries columns
    (``id`` and ``name``, both ``STRING``). ``model_construct`` is used on
    purpose so the remaining model fields can be left unset.

    Args:
        key_columns: Column names stored under ``keyColumns``.
        table_profile_config: A ``TableProfilerConfig`` or ``None``.
        **extra: Additional top-level fields (e.g. ``database_service_type``)
            forwarded verbatim to ``model_construct``.

    Returns:
        A ``TableDiffRuntimeParameters`` instance built without validation.
    """
    return TableDiffRuntimeParameters.model_construct(
        **{
            **extra,
            "table_profile_config": table_profile_config,
            "table1": TableParameter.model_construct(
                **{
                    "database_service_type": DatabaseServiceType.Postgres,
                    "columns": [
                        Column(name="id", dataType=DataType.STRING),
                        Column(name="name", dataType=DataType.STRING),
                    ],
                }
            ),
            "table2": TableParameter.model_construct(
                **{"database_service_type": DatabaseServiceType.Postgres}
            ),
            "keyColumns": key_columns,
        }
    )


@pytest.mark.parametrize(
    "config,expected",
    [
        # 10 % sample on a single key column.
        (
            _runtime_params(
                ["id"],
                TableProfilerConfig(
                    profileSampleType=ProfileSampleType.PERCENTAGE,
                    profileSample=10,
                ),
                database_service_type="BigQuery",
            ),
            "SUBSTRING(MD5(id || 'a'), 1, 8) < '19999999'",
        ),
        # 20 % sample on a single key column.
        (
            _runtime_params(
                ["id"],
                TableProfilerConfig(
                    profileSampleType=ProfileSampleType.PERCENTAGE,
                    profileSample=20,
                ),
                database_service_type="BigQuery",
            ),
            "SUBSTRING(MD5(id || 'a'), 1, 8) < '33333333'",
        ),
        # 10 % sample with a composite key: both columns are concatenated.
        (
            _runtime_params(
                ["id", "name"],
                TableProfilerConfig(
                    profileSampleType=ProfileSampleType.PERCENTAGE,
                    profileSample=10,
                ),
                database_service_type="BigQuery",
            ),
            "SUBSTRING(MD5(id || name || 'a'), 1, 8) < '19999999'",
        ),
        # Row-based sample: 20 rows out of the mocked 10_000-row count.
        (
            _runtime_params(
                ["id", "name"],
                TableProfilerConfig(
                    profileSampleType=ProfileSampleType.ROWS,
                    profileSample=20,
                ),
                database_service_type="BigQuery",
            ),
            "SUBSTRING(MD5(id || name || 'a'), 1, 8) < '0083126e'",
        ),
        # No profiler config: no sampling clause is expected.
        (
            _runtime_params(["id", "name"], None),
            None,
        ),
    ],
)
def test_sample_where_clauses(config, expected):
    """Check the sampling WHERE clause produced for each profiler config.

    ``random.choices`` is patched to return ``["a"]`` so that the random
    suffix appearing in the expected SQL is deterministic. For ROWS sampling
    the validator's ``get_row_count`` is replaced by a mock returning 10_000,
    so no database is needed.

    Args:
        config: Runtime parameters assigned to the validator.
        expected: The expected clause string, or ``None`` when no profiler
            config is set.
    """
    validator = TableDiffValidator(None, None, None)
    validator.runtime_params = config

    profile_config = config.table_profile_config
    if profile_config and profile_config.profileSampleType == ProfileSampleType.ROWS:
        validator.get_row_count = Mock(return_value=10_000)

    with patch("random.choices", Mock(return_value=["a"])):
        assert validator.sample_where_clause() == expected
|