Eugenio bb50514a00
Fixes #16983: can't sample data from trino tables with complex types (#23478)
* Update test data for `tests.integration.trino`

This is to create tables with complex data types.

Using raw SQL because creating tables with pandas didn't get the right types for the structs

* Update tests to reproduce the issue

Also included the new tables in the other tests to make sure complex data types do not break anything else

Reference: [issue 16983](https://github.com/open-metadata/OpenMetadata/issues/16983)

* Added `TypeDecorator`s to handle `trino.types.NamedRowTuple`

This is because pydantic couldn't figure out how to create python objects when receiving `NamedRowTuple`s, which broke the sampling process.

This makes sure the data we receive from the trino interface is compatible with Pydantic
2025-09-26 08:13:28 +02:00

228 lines
7.3 KiB
Python

from collections.abc import Callable
from copy import deepcopy
from dataclasses import dataclass, field
from typing import List

import pytest

from _openmetadata_testutils.dict import merge
from _openmetadata_testutils.pydantic.test_utils import assert_equal_pydantic_objects
from metadata.generated.schema.entity.data.table import (
    ColumnProfile,
    Table,
    TableProfile,
)
from metadata.generated.schema.metadataIngestion.databaseServiceProfilerPipeline import (
    DatabaseServiceProfilerPipeline,
)
from metadata.generated.schema.type.basic import Timestamp
from metadata.ingestion.lineage.sql_lineage import search_cache
from metadata.workflow.metadata import MetadataWorkflow
from metadata.workflow.profiler import ProfilerWorkflow
@pytest.fixture(
    scope="module",
    params=[
        pytest.param({}, id="no_changes"),
        pytest.param(
            {"source": {"sourceConfig": {"config": {"useStatistics": True}}}},
            id="useStatistics=True",
        ),
    ],
)
def run_profiler(
    patch_passwords_for_db_services,
    run_workflow,
    ingestion_config,
    profiler_config,
    create_test_data,
    request,
):
    """Run the metadata and profiler workflows once per parametrized config.

    The fixture is module-scoped and parametrized with a dict of overrides
    (``request.param``): either no overrides or ``useStatistics=True``.

    Returns:
        The profiler configuration that was actually executed, so tests can
        inspect which variant is active.
    """
    search_cache.clear()

    # Work on a private copy so the incoming ``profiler_config`` fixture value
    # is never modified by the per-parameter overrides.
    effective_config = deepcopy(profiler_config)
    # NOTE(review): ``merge`` is used for its side effect only (return value
    # ignored) - presumably it folds the overrides into ``effective_config``.
    merge(request.param, effective_config)

    # Order matters: metadata ingestion first, then the profiler.
    for workflow_class, workflow_config in (
        (MetadataWorkflow, ingestion_config),
        (ProfilerWorkflow, effective_config),
    ):
        run_workflow(workflow_class, workflow_config)

    return effective_config
@dataclass
class ProfilerTestParameters:
    """Describe one parametrized profiler test case.

    ``expected_column_profiles`` defaults to an empty list rather than ``None``
    so that it always matches its ``List`` annotation and can be iterated
    safely by the tests even when a case does not specify any column profiles.
    """

    # Table FQN template; the tests fill in ``{database_service}`` via str.format.
    table_fqn: str
    # Table-level profile the stored profile is compared against.
    expected_table_profile: TableProfile
    # Column-level profiles to verify; empty means "check no columns".
    expected_column_profiles: List[ColumnProfile] = field(default_factory=list)
    # Receives the profiler pipeline config; the tests skip the case when it
    # returns False. The default accepts every configuration.
    config_predicate: Callable[[DatabaseServiceProfilerPipeline], bool] = lambda x: True
@pytest.mark.parametrize(
    "parameters",
    [
        ProfilerTestParameters(
            "{database_service}.minio.my_schema.table",
            TableProfile(timestamp=Timestamp(0), rowCount=3),
            [
                ColumnProfile(
                    name="three",
                    timestamp=Timestamp(0),
                    valuesCount=2,
                    nullCount=1,
                )
            ],
        ),
        ProfilerTestParameters(
            "{database_service}.minio.my_schema.titanic",
            TableProfile(timestamp=Timestamp(0), rowCount=891),
            [
                ColumnProfile(
                    name="name",
                    timestamp=Timestamp(0),
                    valuesCount=891,
                    nullCount=0,
                )
            ],
        ),
        ProfilerTestParameters(
            "{database_service}.minio.my_schema.iris",
            TableProfile(timestamp=Timestamp(0), rowCount=150),
            [
                ColumnProfile(
                    name="petal.length",
                    timestamp=Timestamp(0),
                    valuesCount=150,
                    nullCount=0,
                )
            ],
        ),
        ProfilerTestParameters(
            "{database_service}.minio.my_schema.userdata",
            TableProfile(timestamp=Timestamp(0), rowCount=1000),
            [
                ColumnProfile(
                    name="gender",
                    timestamp=Timestamp(0),
                    valuesCount=1000,
                    nullCount=0,
                )
            ],
        ),
        # The "empty" table is listed twice: the expected rowCount differs
        # depending on whether the profiler ran with useStatistics enabled.
        ProfilerTestParameters(
            "{database_service}.minio.my_schema.empty",
            TableProfile(timestamp=Timestamp(0), rowCount=0),
            [
                ColumnProfile(
                    name="a",
                    timestamp=Timestamp(0),
                    valuesCount=0,
                    nullCount=0,
                )
            ],
            lambda x: x.useStatistics == False,
        ),
        ProfilerTestParameters(
            "{database_service}.minio.my_schema.empty",
            TableProfile(timestamp=Timestamp(0), rowCount=None),
            [
                ColumnProfile(
                    name="a",
                    timestamp=Timestamp(0),
                    valuesCount=0,
                    nullCount=0,
                )
            ],
            lambda x: x.useStatistics == True,
        ),
        ProfilerTestParameters(
            "{database_service}.minio.my_schema.complex_and_simple",  # complex types ignored
            TableProfile(timestamp=Timestamp(0), rowCount=2),
            [
                ColumnProfile(
                    name="promotiontransactionid",
                    timestamp=Timestamp(0),
                    valuesCount=2,
                    nullCount=0,
                ),
                ColumnProfile(
                    name="validto", timestamp=Timestamp(0), valuesCount=2, nullCount=0
                ),
                ColumnProfile(
                    name="vouchercode",
                    timestamp=Timestamp(0),
                    valuesCount=2,
                    nullCount=0,
                ),
            ],
        ),
        ProfilerTestParameters(
            "{database_service}.minio.my_schema.only_complex",  # complex types ignored
            TableProfile(timestamp=Timestamp(0), rowCount=2),
            [],
        ),
    ],
    ids=lambda x: x.table_fqn.split(".")[-1],
)
def test_profiler(
    run_profiler, metadata, db_service, parameters: ProfilerTestParameters
):
    """Compare the stored table and column profiles with the expected ones.

    The case is skipped when its ``config_predicate`` rejects the profiler
    configuration produced by the ``run_profiler`` fixture. Timestamps are
    excluded from the comparison by overwriting them with the expected value.

    Raises:
        AssertionError: if an expected column profile cannot be found on the
            table, or if a profile differs from the expected one.
    """
    if not parameters.config_predicate(
        DatabaseServiceProfilerPipeline.model_validate(
            run_profiler["source"]["sourceConfig"]["config"]
        )
    ):
        pytest.skip(
            "Skipping test because it's not supported for this profiler configuration"
        )
    table: Table = metadata.get_latest_table_profile(
        parameters.table_fqn.format(database_service=db_service.fullyQualifiedName.root)
    )
    assert_equal_pydantic_objects(
        parameters.expected_table_profile,
        # we don't want to validate the timestamp because it will be different for each run
        table.profile.model_copy(
            update={"timestamp": parameters.expected_table_profile.timestamp}
        ),
    )
    # ``expected_column_profiles`` may be None (the dataclass default in the
    # original definition); treat that the same as "no columns to check".
    for profile in parameters.expected_column_profiles or []:
        column = next(
            (
                col
                for col in table.columns
                # Skip columns that carry no profile instead of crashing on
                # ``None.name``. NOTE(review): presumably columns the profiler
                # ignores (e.g. complex types) have ``profile`` unset - confirm.
                if col.profile is not None and col.profile.name == profile.name
            ),
            None,
        )
        if column is None:
            raise AssertionError(
                f"Column [{profile.name}] not found in table [{table.fullyQualifiedName.root}]"
            )
        assert_equal_pydantic_objects(
            profile,
            column.profile.model_copy(update={"timestamp": profile.timestamp}),
        )
@pytest.mark.parametrize(
    "parameters",
    [
        ProfilerTestParameters(
            "{database_service}.minio.my_schema.empty",
            TableProfile(timestamp=Timestamp(0), rowCount=None),
            [],
            lambda x: x.useStatistics == True,
        ),
    ],
    ids=lambda x: x.table_fqn.split(".")[-1],
)
def test_no_statistics(
    run_profiler, metadata, db_service, parameters: ProfilerTestParameters
):
    """Check that the table profile has no row count when statistics are used.

    The only case targets the ``empty`` table and applies solely to the
    ``useStatistics == True`` profiler configuration; any other configuration
    is skipped.
    """
    if not parameters.config_predicate(
        DatabaseServiceProfilerPipeline.model_validate(
            run_profiler["source"]["sourceConfig"]["config"]
        )
    ):
        # Same wording as the skip message in test_profiler.
        pytest.skip(
            "Skipping test because it's not supported for this profiler configuration"
        )
    table: Table = metadata.get_latest_table_profile(
        parameters.table_fqn.format(database_service=db_service.fullyQualifiedName.root)
    )
    assert (
        table.profile.rowCount is None
    ), "expected empty row count for a table with no collected statistics"