Mirror of https://github.com/open-metadata/OpenMetadata.git (synced 2025-11-10 15:59:57 +00:00)
* Update test data for `tests.integration.trino`

  This creates tables with complex data types, using raw SQL because creating tables with pandas didn't produce the right types for the structs.

* Update tests to reproduce the issue

  Also included the new tables in the other tests to make sure complex data types do not break anything else.

  Reference: [issue 16983](https://github.com/open-metadata/OpenMetadata/issues/16983)

* Added `TypeDecorator`s to handle `trino.types.NamedRowTuple`

  Pydantic couldn't figure out how to create Python objects when receiving `NamedRowTuple`s, which broke the sampling process. This makes sure the data we receive from the Trino interface is compatible with Pydantic.
parent cc265f956b
commit bb50514a00
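Why the sampler broke, in isolation: Trino's client returns `ROW` values as `trino.types.NamedRowTuple`, a tuple subclass that Pydantic cannot validate against dict-shaped fields. A minimal stand-alone sketch of the failure and the fix's core idea (`FakeNamedRowTuple` and `SampleRow` are illustrative stand-ins, not from this commit):

```python
from typing import Any, Dict

from pydantic import BaseModel


class SampleRow(BaseModel):
    payload: Dict[str, Any]


class FakeNamedRowTuple(tuple):
    """Stand-in for trino.types.NamedRowTuple: a tuple carrying field names."""

    def __new__(cls, values, names):
        row = super().__new__(cls, values)
        row.__annotations__ = {"names": names}
        return row


raw = FakeNamedRowTuple(("txn_001", 99.99), ["id", "amount"])
# SampleRow(payload=raw) raises a validation error: a tuple is not dict-shaped.
# Converting it to a dict first, as the new TypeDecorators do, validates fine:
converted = dict(zip(raw.__annotations__["names"], raw))
print(SampleRow(payload=converted))  # payload={'id': 'txn_001', 'amount': 99.99}
```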
@@ -24,6 +24,7 @@ from metadata.profiler.orm.converter.mariadb.converter import MariaDBMapTypes
 from metadata.profiler.orm.converter.mssql.converter import MssqlMapTypes
 from metadata.profiler.orm.converter.redshift.converter import RedshiftMapTypes
 from metadata.profiler.orm.converter.snowflake.converter import SnowflakeMapTypes
+from metadata.profiler.orm.converter.trino import TrinoMapTypes

 converter_registry = defaultdict(lambda: CommonMapTypes)
 converter_registry[DatabaseServiceType.BigQuery] = BigqueryMapTypes
@@ -32,3 +33,4 @@ converter_registry[DatabaseServiceType.Redshift] = RedshiftMapTypes
 converter_registry[DatabaseServiceType.Mssql] = MssqlMapTypes
 converter_registry[DatabaseServiceType.AzureSQL] = AzureSqlMapTypes
 converter_registry[DatabaseServiceType.MariaDB] = MariaDBMapTypes
+converter_registry[DatabaseServiceType.Trino] = TrinoMapTypes
ingestion/src/metadata/profiler/orm/converter/trino/__init__.py (new file, 14 lines)
@@ -0,0 +1,14 @@
+# Copyright 2025 Collate
+# Licensed under the Collate Community License, Version 1.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# https://github.com/open-metadata/OpenMetadata/blob/main/ingestion/LICENSE
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .converter import TrinoMapTypes
+
+__all__ = ("TrinoMapTypes",)
ingestion/src/metadata/profiler/orm/converter/trino/converter.py (new file, 39 lines)
@@ -0,0 +1,39 @@
+# Copyright 2025 Collate
+# Licensed under the Collate Community License, Version 1.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# https://github.com/open-metadata/OpenMetadata/blob/main/ingestion/LICENSE
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Dict, Set
+
+from sqlalchemy.sql.type_api import TypeEngine
+
+from metadata.generated.schema.entity.data.table import DataType
+from metadata.profiler.orm.converter.common import CommonMapTypes
+from metadata.profiler.orm.types.trino import TrinoArray, TrinoMap, TrinoStruct
+
+
+class TrinoMapTypes(CommonMapTypes):
+    _TYPE_MAP_OVERRIDE = {
+        DataType.ARRAY: TrinoArray,
+        DataType.MAP: TrinoMap,
+        DataType.STRUCT: TrinoStruct,
+    }
+    _TYPE_MAP = {
+        **CommonMapTypes._TYPE_MAP,
+        **_TYPE_MAP_OVERRIDE,
+    }
+
+    @classmethod
+    def map_sqa_to_om_types(cls) -> Dict[TypeEngine, Set[DataType]]:
+        """returns an ORM type"""
+        # pylint: disable=import-outside-toplevel
+
+        return {
+            **CommonMapTypes.map_sqa_to_om_types(),
+            **{v: {k} for k, v in cls._TYPE_MAP_OVERRIDE.items()},
+        }
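For context, the registry these hunks extend is a plain `defaultdict`, so every lookup resolves and unregistered services fall back to `CommonMapTypes`. A hedged sketch of that dispatch pattern with stand-in classes (not the real converters):

```python
from collections import defaultdict
from enum import Enum


class DatabaseServiceType(Enum):  # stand-in for the generated schema enum
    Trino = "Trino"
    Postgres = "Postgres"


class CommonMapTypes: ...  # stand-in default converter


class TrinoMapTypes(CommonMapTypes): ...  # stand-in override


converter_registry = defaultdict(lambda: CommonMapTypes)
converter_registry[DatabaseServiceType.Trino] = TrinoMapTypes

# Services with an override get their subclass; everything else falls back.
assert converter_registry[DatabaseServiceType.Trino] is TrinoMapTypes
assert converter_registry[DatabaseServiceType.Postgres] is CommonMapTypes
```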
ingestion/src/metadata/profiler/orm/types/__init__.py (new file, 10 lines)
@@ -0,0 +1,10 @@
+# Copyright 2025 Collate
+# Licensed under the Collate Community License, Version 1.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# https://github.com/open-metadata/OpenMetadata/blob/main/ingestion/LICENSE
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
ingestion/src/metadata/profiler/orm/types/trino.py (new file, 70 lines)
@@ -0,0 +1,70 @@
+# Copyright 2025 Collate
+# Licensed under the Collate Community License, Version 1.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# https://github.com/open-metadata/OpenMetadata/blob/main/ingestion/LICENSE
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Type adapter for Trino to handle NamedRowTuple serialization
+"""
+from typing import Any
+
+from sqlalchemy import ARRAY
+from sqlalchemy.engine import Dialect
+from sqlalchemy.sql.sqltypes import String, TypeDecorator
+
+from metadata.utils.logger import ingestion_logger
+
+logger = ingestion_logger()
+
+
+class TrinoTypesMixin:
+    def process_result_value(self, value: Any, dialect: Dialect) -> Any:
+        # pylint: disable=import-outside-toplevel
+        from trino.types import NamedRowTuple
+
+        def _convert_value(obj: Any) -> Any:
+            if isinstance(obj, NamedRowTuple):
+                return {
+                    k: _convert_value(getattr(obj, k))
+                    for k in obj.__annotations__["names"]
+                }
+            elif isinstance(obj, (list, tuple)):
+                return type(obj)(_convert_value(v) for v in obj)
+            elif isinstance(obj, dict):
+                return {k: _convert_value(v) for k, v in obj.items()}
+            return obj
+
+        return _convert_value(value)
+
+
+class TrinoArray(TrinoTypesMixin, TypeDecorator):
+    impl = ARRAY
+    cache_ok = True
+
+    @property
+    def python_type(self):
+        return list
+
+
+class TrinoMap(TrinoTypesMixin, TypeDecorator):
+    impl = String
+    cache_ok = True
+
+    @property
+    def python_type(self):
+        return dict
+
+
+class TrinoStruct(TrinoTypesMixin, TypeDecorator):
+    impl = String
+    cache_ok = True
+
+    @property
+    def python_type(self):
+        return dict
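The mixin's inner helper is a straightforward recursive normalizer. Below is a hedged, self-contained walk-through of the same logic using a stand-in for `trino.types.NamedRowTuple` (the real class also carries type metadata; only the `names` annotation and attribute access matter here):

```python
from typing import Any


class StandInNamedRowTuple(tuple):
    """Mimics the two behaviors the mixin relies on: tuple contents
    plus attribute access by field name via __annotations__["names"]."""

    def __new__(cls, values, names):
        row = super().__new__(cls, values)
        row.__annotations__ = {"names": names}
        for name, value in zip(names, values):
            setattr(row, name, value)
        return row


def _convert_value(obj: Any) -> Any:
    # Same shape as the inner helper in TrinoTypesMixin.process_result_value.
    if isinstance(obj, StandInNamedRowTuple):
        return {
            k: _convert_value(getattr(obj, k)) for k in obj.__annotations__["names"]
        }
    if isinstance(obj, (list, tuple)):
        return type(obj)(_convert_value(v) for v in obj)
    if isinstance(obj, dict):
        return {k: _convert_value(v) for k, v in obj.items()}
    return obj


row = StandInNamedRowTuple(
    ("txn_001", [StandInNamedRowTuple(("item_001", 2), ("itemId", "quantity"))]),
    ("id", "items"),
)
# Nested rows become nested dicts, which Pydantic can validate.
assert _convert_value(row) == {
    "id": "txn_001",
    "items": [{"itemId": "item_001", "quantity": 2}],
}
```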
@@ -1,5 +1,6 @@
 import os.path
 import random
+from pathlib import Path
 from time import sleep

 import docker
@@ -7,7 +8,7 @@ import pandas as pd
 import pytest
 import testcontainers.core.network
 from sqlalchemy import create_engine, insert
-from sqlalchemy.engine import make_url
+from sqlalchemy.engine import Engine, make_url
 from tenacity import retry, stop_after_delay, wait_fixed
 from testcontainers.core.container import DockerContainer
 from testcontainers.core.generic import DbContainer
@@ -192,26 +193,49 @@ def create_test_data(trino_container):
     )
     data_dir = os.path.dirname(__file__) + "/data"
     for file in os.listdir(data_dir):
-        df = pd.read_parquet(f"{data_dir}/{file}")
+        file_path = Path(os.path.join(data_dir, file))
+
+        if file_path.suffix == ".sql":
+            # Creating test data with complex fields with pandas breaks
+            create_test_data_from_sql(engine, file_path)
+        else:
+            create_test_data_from_parquet(engine, file_path)
+
+        sleep(1)
+        engine.execute("ANALYZE " + f'minio."my_schema"."{file_path.stem}"')
+    engine.execute(
+        "CALL system.drop_stats(schema_name => 'my_schema', table_name => 'empty')"
+    )
+    return
+
+
+def create_test_data_from_parquet(engine: Engine, file_path: Path):
+    df = pd.read_parquet(file_path)

     # Convert data types
     for col in df.columns:
         if pd.api.types.is_datetime64tz_dtype(df[col]):
             df[col] = df[col].dt.tz_convert(None)

     df.to_sql(
-        file.replace(".parquet", ""),
+        Path(file_path).stem,
         engine,
         schema="my_schema",
         if_exists="fail",
         index=False,
         method=custom_insert,
     )
-    sleep(1)
-    engine.execute(
-        "ANALYZE " + f'minio."my_schema"."{file.replace(".parquet", "")}"'
-    )
-    engine.execute(
-        "CALL system.drop_stats(schema_name => 'my_schema', table_name => 'empty')"
-    )
-    return
+
+
+def create_test_data_from_sql(engine: Engine, file_path: Path):
+    with open(file_path, "r") as f:
+        sql = f.read()
+
+    sql = sql.format(catalog="minio", schema="my_schema", table_name=file_path.stem)
+    for statement in sql.split(";"):
+        if statement.strip() == "":
+            continue
+        engine.execute(statement)


 def custom_insert(self, conn, keys: list[str], data_iter):
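The `.sql` fixtures are plain string templates; a small sketch of the templating and statement-splitting convention `create_test_data_from_sql` relies on, presumably because the driver executes one statement per call (no database required; the template text here is illustrative):

```python
sql_template = """
CREATE TABLE IF NOT EXISTS {catalog}.{schema}.{table_name} (id BIGINT);
INSERT INTO {catalog}.{schema}.{table_name} VALUES (1);
"""

sql = sql_template.format(catalog="minio", schema="my_schema", table_name="example")

# Split on ";" and skip the empty trailing chunk, one statement at a time.
statements = [s.strip() for s in sql.split(";") if s.strip()]
for statement in statements:
    print(statement)  # the fixture does engine.execute(statement) here
```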
ingestion/tests/integration/trino/data/complex_and_simple.sql (new file, 67 lines)
@@ -0,0 +1,67 @@
+CREATE TABLE IF NOT EXISTS {catalog}.{schema}.{table_name} (
+    promotiontransactionid BIGINT,
+    validto TIMESTAMP,
+    vouchercode VARCHAR,
+    payload ROW(
+        id VARCHAR,
+        amount DOUBLE,
+        currency VARCHAR,
+        metadata MAP(VARCHAR, VARCHAR),
+        items ARRAY(ROW(
+            itemId VARCHAR,
+            quantity INTEGER,
+            price DOUBLE
+        ))
+    ),
+    adjustmenthistory ARRAY(ROW(
+        adjustmentId VARCHAR,
+        timestamp TIMESTAMP,
+        amount DOUBLE,
+        reason VARCHAR
+    )),
+    simplemap MAP(VARCHAR, VARCHAR),
+    simplearray ARRAY(VARCHAR)
+);
+
+INSERT INTO {catalog}.{schema}.{table_name} VALUES
+(
+    1001,
+    TIMESTAMP '2024-12-31 23:59:59',
+    'PROMO2024',
+    ROW(
+        'txn_001',
+        99.99,
+        'USD',
+        MAP(ARRAY['store', 'region'], ARRAY['Store123', 'US-West']),
+        ARRAY[
+            ROW('item_001', 2, 29.99),
+            ROW('item_002', 1, 40.01)
+        ]
+    ),
+    ARRAY[
+        ROW('adj_001', TIMESTAMP '2024-01-15 10:30:00', -5.00, 'Discount'),
+        ROW('adj_002', TIMESTAMP '2024-01-15 11:00:00', 2.50, 'Tax adjustment')
+    ],
+    MAP(ARRAY['status', 'type'], ARRAY['active', 'promotion']),
+    ARRAY['tag1', 'tag2', 'tag3']
+),
+(
+    1002,
+    TIMESTAMP '2024-06-30 23:59:59',
+    'SUMMER2024',
+    ROW(
+        'txn_002',
+        150.75,
+        'EUR',
+        MAP(ARRAY['campaign', 'channel'], ARRAY['Summer Sale', 'Online']),
+        ARRAY[
+            ROW('item_003', 3, 45.25),
+            ROW('item_004', 1, 15.00)
+        ]
+    ),
+    ARRAY[
+        ROW('adj_003', TIMESTAMP '2024-02-20 14:15:00', -10.00, 'Coupon')
+    ],
+    MAP(ARRAY['status', 'priority'], ARRAY['completed', 'high']),
+    ARRAY['summer', 'sale', 'online']
+)
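Assuming this fixture backs the `complex_and_simple` table the tests reference, selecting its `ROW` columns is what hands the sampler `NamedRowTuple`s. A hedged sketch of observing that raw value through the Trino SQLAlchemy dialect (the connection URL and host are assumptions):

```python
# Assumes a reachable Trino server and the fixture table already created.
from sqlalchemy import create_engine, text

engine = create_engine("trino://user@localhost:8080/minio")
with engine.connect() as conn:
    row = conn.execute(
        text('SELECT payload FROM "my_schema"."complex_and_simple" LIMIT 1')
    ).fetchone()

payload = row[0]
# Without the new TypeDecorators, payload is a trino.types.NamedRowTuple:
# a tuple that also exposes payload.id, payload.amount, payload.currency, ...
# which is the shape Pydantic could not validate during sampling.
print(type(payload), payload)
```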
ingestion/tests/integration/trino/data/only_complex.sql (new file, 26 lines)
@@ -0,0 +1,26 @@
+CREATE TABLE IF NOT EXISTS {catalog}.{schema}.{table_name} (
+    payload ROW(
+        foobar VARCHAR,
+        foobaz DOUBLE,
+        foos ARRAY(ROW(bars VARCHAR))
+    ),
+    foobars ARRAY(VARCHAR)
+);
+
+INSERT INTO {catalog}.{schema}.{table_name} VALUES
+(
+    ROW(
+        'test_value',
+        123.45,
+        ARRAY[ROW('bar1'), ROW('bar2')]
+    ),
+    ARRAY['foo1', 'foo2', 'foo3']
+),
+(
+    ROW(
+        'another_value',
+        678.90,
+        ARRAY[ROW('bar3')]
+    ),
+    ARRAY['foo4']
+);
ingestion/tests/integration/trino/test_classifier.py (new file, 63 lines)
@@ -0,0 +1,63 @@
+from copy import deepcopy
+
+import pytest
+
+from metadata.generated.schema.entity.data.table import Table
+from metadata.generated.schema.metadataIngestion.databaseServiceAutoClassificationPipeline import (
+    DatabaseServiceAutoClassificationPipeline,
+)
+from metadata.ingestion.lineage.sql_lineage import search_cache
+from metadata.ingestion.ometa.ometa_api import OpenMetadata
+from metadata.workflow.classification import AutoClassificationWorkflow
+from metadata.workflow.metadata import MetadataWorkflow
+
+
+@pytest.fixture(scope="module")
+def sampling_only_classifier_config(
+    db_service, sink_config, workflow_config, classifier_config
+):
+    config = deepcopy(classifier_config)
+    config["source"]["sourceConfig"]["config"]["enableAutoClassification"] = False
+    return config
+
+
+@pytest.fixture(
+    scope="module",
+)
+def run_classifier(
+    patch_passwords_for_db_services,
+    run_workflow,
+    ingestion_config,
+    sampling_only_classifier_config,
+    create_test_data,
+    request,
+):
+    search_cache.clear()
+    run_workflow(MetadataWorkflow, ingestion_config)
+    run_workflow(AutoClassificationWorkflow, sampling_only_classifier_config)
+    return ingestion_config
+
+
+@pytest.mark.parametrize(
+    "table_name",
+    (
+        "{database_service}.minio.my_schema.table",
+        "{database_service}.minio.my_schema.titanic",
+        "{database_service}.minio.my_schema.iris",
+        "{database_service}.minio.my_schema.userdata",
+        "{database_service}.minio.my_schema.empty",
+        "{database_service}.minio.my_schema.complex_and_simple",
+        "{database_service}.minio.my_schema.only_complex",
+    ),
+)
+def test_auto_classification_workflow(
+    run_classifier,
+    metadata: OpenMetadata,
+    table_name: str,
+    db_service: DatabaseServiceAutoClassificationPipeline,
+):
+    table = metadata.get_by_name(
+        Table, table_name.format(database_service=db_service.fullyQualifiedName.root)
+    )
+
+    assert metadata.get_sample_data(table) is not None
@@ -18,6 +18,8 @@ def run_workflow(run_workflow, ingestion_config, create_test_data):
         "{database_service}.minio.my_schema.iris",
         "{database_service}.minio.my_schema.userdata",
         "{database_service}.minio.my_schema.empty",
+        "{database_service}.minio.my_schema.complex_and_simple",
+        "{database_service}.minio.my_schema.only_complex",
     ],
     ids=lambda x: x.split(".")[-1],
 )
@@ -132,6 +132,32 @@ class ProfilerTestParameters:
         ],
         lambda x: x.useStatistics == True,
     ),
+    ProfilerTestParameters(
+        "{database_service}.minio.my_schema.complex_and_simple",  # complex types ignored
+        TableProfile(timestamp=Timestamp(0), rowCount=2),
+        [
+            ColumnProfile(
+                name="promotiontransactionid",
+                timestamp=Timestamp(0),
+                valuesCount=2,
+                nullCount=0,
+            ),
+            ColumnProfile(
+                name="validto", timestamp=Timestamp(0), valuesCount=2, nullCount=0
+            ),
+            ColumnProfile(
+                name="vouchercode",
+                timestamp=Timestamp(0),
+                valuesCount=2,
+                nullCount=0,
+            ),
+        ],
+    ),
+    ProfilerTestParameters(
+        "{database_service}.minio.my_schema.only_complex",  # complex types ignored
+        TableProfile(timestamp=Timestamp(0), rowCount=2),
+        [],
+    ),
     ],
     ids=lambda x: x.table_fqn.split(".")[-1],
 )
@@ -144,7 +170,7 @@ def test_profiler(
         )
     ):
         pytest.skip(
-            "Skipping test becuase its not supported for this profiler configuation"
+            "Skipping test because it's not supported for this profiler configuration"
         )
     table: Table = metadata.get_latest_table_profile(
         parameters.table_fqn.format(database_service=db_service.fullyQualifiedName.root)