Eugenio bb50514a00
Fixes #16983: can't sample data from trino tables with complex types (#23478)
* Update test data for `tests.integration.trino`

This is to create tables with complex data types.

Using raw SQL because creating tables with pandas didn't get the right types for the structs

* Update tests to reproduce the issue

Also included the new tables in the other tests to make sure complex data types do not break anything else

Reference: [issue 16983](https://github.com/open-metadata/OpenMetadata/issues/16983)

* Added `TypeDecorator`s to handle `trino.types.NamedRowTuple`

This is because pydantic couldn't figure out how to create python objects when receiving `NamedRowTuple`s, which broke the sampling process.

This makes sure the data we receive from the trino interface is compatible with Pydantic
2025-09-26 08:13:28 +02:00

64 lines
1.9 KiB
Python

from copy import deepcopy
import pytest
from metadata.generated.schema.entity.data.table import Table
from metadata.generated.schema.metadataIngestion.databaseServiceAutoClassificationPipeline import (
DatabaseServiceAutoClassificationPipeline,
)
from metadata.ingestion.lineage.sql_lineage import search_cache
from metadata.ingestion.ometa.ometa_api import OpenMetadata
from metadata.workflow.classification import AutoClassificationWorkflow
from metadata.workflow.metadata import MetadataWorkflow
@pytest.fixture(scope="module")
def sampling_only_classifier_config(
    db_service, sink_config, workflow_config, classifier_config
):
    """Build a classifier config with auto classification switched off.

    The incoming ``classifier_config`` is deep-copied, so the original
    fixture value is left untouched; only the copy has
    ``enableAutoClassification`` set to ``False``.

    ``db_service``, ``sink_config`` and ``workflow_config`` are not read in
    this body (presumably requested so they are set up first -- confirm
    against the conftest).
    """
    sampling_config = deepcopy(classifier_config)
    source_settings = sampling_config["source"]["sourceConfig"]["config"]
    source_settings["enableAutoClassification"] = False
    return sampling_config
@pytest.fixture(scope="module")
def run_classifier(
    patch_passwords_for_db_services,
    run_workflow,
    ingestion_config,
    sampling_only_classifier_config,
    create_test_data,
    request,
):
    """Run metadata ingestion followed by the sampling-only classifier.

    The lineage search cache is cleared first, then ``run_workflow`` is
    invoked for ``MetadataWorkflow`` with ``ingestion_config`` and for
    ``AutoClassificationWorkflow`` with ``sampling_only_classifier_config``,
    in that order. Returns ``ingestion_config``.
    """
    search_cache.clear()
    # Order matters: metadata ingestion runs before the classification step.
    workflow_steps = (
        (MetadataWorkflow, ingestion_config),
        (AutoClassificationWorkflow, sampling_only_classifier_config),
    )
    for workflow_class, workflow_settings in workflow_steps:
        run_workflow(workflow_class, workflow_settings)
    return ingestion_config
@pytest.mark.parametrize(
    "table_name",
    (
        "{database_service}.minio.my_schema.table",
        "{database_service}.minio.my_schema.titanic",
        "{database_service}.minio.my_schema.iris",
        "{database_service}.minio.my_schema.userdata",
        "{database_service}.minio.my_schema.empty",
        "{database_service}.minio.my_schema.complex_and_simple",
        "{database_service}.minio.my_schema.only_complex",
    ),
)
def test_auto_classification_workflow(
    run_classifier,
    metadata: OpenMetadata,
    table_name: str,
    db_service: DatabaseServiceAutoClassificationPipeline,
):
    """Check that sample data can be fetched for each parametrized table.

    ``table_name`` is a template; its ``{database_service}`` placeholder is
    filled with ``db_service.fullyQualifiedName.root`` to build the name
    used for the lookup. The ``run_classifier`` fixture is requested so the
    workflows have run before the lookup happens.
    """
    table_fqn = table_name.format(
        database_service=db_service.fullyQualifiedName.root
    )
    table = metadata.get_by_name(Table, table_fqn)
    # Guard the lookup so a missing table fails here, naming the table,
    # rather than somewhere inside get_sample_data.
    assert table is not None, f"Table {table_fqn} was not found"
    assert metadata.get_sample_data(table) is not None