2025-04-03 10:39:47 +05:30
|
|
|
# Copyright 2025 Collate
|
|
|
|
# Licensed under the Collate Community License, Version 1.0 (the "License");
|
2022-07-20 17:54:10 +02:00
|
|
|
# you may not use this file except in compliance with the License.
|
|
|
|
# You may obtain a copy of the License at
|
2025-04-03 10:39:47 +05:30
|
|
|
# https://github.com/open-metadata/OpenMetadata/blob/main/ingestion/LICENSE
|
2022-07-20 17:54:10 +02:00
|
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
# See the License for the specific language governing permissions and
|
|
|
|
# limitations under the License.
|
|
|
|
|
|
|
|
"""
|
|
|
|
Test SQA Interface
|
|
|
|
"""
|
|
|
|
|
2022-07-29 10:41:53 +02:00
|
|
|
import os
|
2024-06-07 04:36:17 +02:00
|
|
|
from datetime import datetime
|
2024-11-27 08:50:54 +01:00
|
|
|
from unittest.mock import patch
|
2022-08-19 10:52:08 +02:00
|
|
|
from uuid import uuid4
|
2022-07-20 17:54:10 +02:00
|
|
|
|
2025-06-17 19:01:00 +02:00
|
|
|
import pytest
|
2022-07-29 10:41:53 +02:00
|
|
|
from sqlalchemy import TEXT, Column, Integer, String, inspect
|
2022-07-20 17:54:10 +02:00
|
|
|
from sqlalchemy.orm import declarative_base
|
|
|
|
from sqlalchemy.orm.session import Session
|
|
|
|
|
2022-08-22 09:01:24 -07:00
|
|
|
from metadata.generated.schema.api.data.createTableProfile import (
|
|
|
|
CreateTableProfileRequest,
|
|
|
|
)
|
2022-08-19 10:52:08 +02:00
|
|
|
from metadata.generated.schema.entity.data.table import Column as EntityColumn
|
|
|
|
from metadata.generated.schema.entity.data.table import (
|
|
|
|
ColumnName,
|
|
|
|
ColumnProfile,
|
|
|
|
DataType,
|
|
|
|
Table,
|
|
|
|
TableProfile,
|
|
|
|
)
|
2022-07-20 17:54:10 +02:00
|
|
|
from metadata.generated.schema.entity.services.connections.database.sqliteConnection import (
|
|
|
|
SQLiteConnection,
|
|
|
|
SQLiteScheme,
|
|
|
|
)
|
2024-06-05 21:18:37 +02:00
|
|
|
from metadata.generated.schema.type.basic import Timestamp
|
2023-11-17 17:51:39 +01:00
|
|
|
from metadata.profiler.api.models import ThreadPoolMetrics
|
2023-06-22 12:51:56 +05:30
|
|
|
from metadata.profiler.interface.sqlalchemy.profiler_interface import (
|
2023-04-04 17:16:44 +02:00
|
|
|
SQAProfilerInterface,
|
|
|
|
)
|
2023-03-01 08:20:38 +01:00
|
|
|
from metadata.profiler.metrics.core import (
|
2022-07-29 10:41:53 +02:00
|
|
|
ComposedMetric,
|
|
|
|
MetricTypes,
|
|
|
|
QueryMetric,
|
|
|
|
StaticMetric,
|
|
|
|
)
|
2025-06-17 19:01:00 +02:00
|
|
|
from metadata.profiler.metrics.registry import Metrics
|
2023-03-01 08:20:38 +01:00
|
|
|
from metadata.profiler.metrics.static.row_count import RowCount
|
2023-04-04 17:16:44 +02:00
|
|
|
from metadata.profiler.processor.default import get_default_metrics
|
2024-11-19 08:10:45 +01:00
|
|
|
from metadata.sampler.sqlalchemy.sampler import SQASampler
|
2022-07-20 17:54:10 +02:00
|
|
|
|
|
|
|
|
|
|
|
class User(declarative_base()):
    """ORM model mapped to the ``users`` table that the tests in this module profile."""

    __tablename__ = "users"
    # Six columns are declared; test_get_all_metrics asserts columnCount == 6.
    # Integer primary key.
    id = Column(Integer, primary_key=True)
    # String(256) columns.
    name = Column(String(256))
    fullname = Column(String(256))
    nickname = Column(String(256))
    # TEXT column; the rows seeded in setup_database do not set it.
    comments = Column(TEXT)
    # Plain integer column.
    age = Column(Integer)
|
|
|
|
|
|
|
|
|
2025-06-17 19:01:00 +02:00
|
|
|
@pytest.fixture
def table_entity():
    """Provide a ``Table`` entity named "user" that has a single INT ``id`` column."""
    id_column = EntityColumn(
        name=ColumnName("id"),
        dataType=DataType.INT,
    )
    entity = Table(id=uuid4(), name="user", columns=[id_column])
    return entity
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.fixture
def sqlite_conn():
    """Provide a ``SQLiteConnection`` config with only the pysqlite scheme set."""
    connection_config = SQLiteConnection(scheme=SQLiteScheme.sqlite_pysqlite)
    return connection_config
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.fixture
def sqa_profiler_interface(table_entity, sqlite_conn):
    """Build a function-scoped ``SQAProfilerInterface`` backed by a patched sampler.

    ``SQASampler.build_table_orm`` is patched to return ``User`` while the
    sampler is created and, in a second independent patch scope, while the
    interface is created.
    """

    def _orm_patch():
        # A fresh patcher per scope, so the two scopes stay independent.
        return patch.object(SQASampler, "build_table_orm", return_value=User)

    with _orm_patch():
        orm_sampler = SQASampler(
            service_connection_config=sqlite_conn,
            ometa_client=None,
            entity=None,
        )

    with _orm_patch():
        # NOTE(review): the trailing positional 5 and 43200 are presumably the
        # thread count and a timeout -- confirm against SQAProfilerInterface.
        return SQAProfilerInterface(
            sqlite_conn, None, table_entity, None, orm_sampler, 5, 43200
        )
|
2024-11-27 08:50:54 +01:00
|
|
|
|
|
|
|
|
2025-06-17 19:01:00 +02:00
|
|
|
def test_init_interface(sqa_profiler_interface):
    """The function-scoped interface exposes a SQLAlchemy ``Session``."""
    session = sqa_profiler_interface.session
    assert isinstance(session, Session)
|
2022-07-20 17:54:10 +02:00
|
|
|
|
|
|
|
|
2025-06-17 19:01:00 +02:00
|
|
|
@pytest.fixture(scope="class")
def db_path():
    """Return the path of ``test.db`` located next to this test module."""
    module_dir = os.path.dirname(__file__)
    database_file = os.path.join(module_dir, "test.db")
    return database_file
|
2022-07-20 17:54:10 +02:00
|
|
|
|
2025-06-17 19:01:00 +02:00
|
|
|
|
|
|
|
@pytest.fixture(scope="class")
def class_sqlite_conn(db_path):
    """Provide a ``SQLiteConnection`` config that points at the ``db_path`` file.

    ``check_same_thread=False`` is appended to the path as a query parameter.
    """
    database_mode = f"{db_path}?check_same_thread=False"
    connection_config = SQLiteConnection(
        scheme=SQLiteScheme.sqlite_pysqlite,
        databaseMode=database_mode,
    )
    return connection_config
|
2022-07-29 10:41:53 +02:00
|
|
|
|
|
|
|
|
2025-06-17 19:01:00 +02:00
|
|
|
@pytest.fixture(scope="class")
def class_table_entity():
    """Provide a class-scoped ``Table`` entity named "user" with one INT ``id`` column."""
    id_column = EntityColumn(
        name=ColumnName("id"),
        dataType=DataType.INT,
    )
    entity = Table(id=uuid4(), name="user", columns=[id_column])
    return entity
|
2024-11-27 08:50:54 +01:00
|
|
|
|
2025-06-17 19:01:00 +02:00
|
|
|
|
|
|
|
@pytest.fixture(scope="class")
def class_sqa_profiler_interface(class_sqlite_conn, class_table_entity):
    """Build a class-scoped ``SQAProfilerInterface`` on the file-backed SQLite config.

    ``SQASampler.build_table_orm`` is patched to return ``User`` while the
    sampler is created.
    """
    orm_patch = patch.object(SQASampler, "build_table_orm", return_value=User)
    with orm_patch:
        file_sampler = SQASampler(
            service_connection_config=class_sqlite_conn,
            ometa_client=None,
            entity=None,
        )

    # NOTE(review): the trailing positional 5 and 43200 are presumably the
    # thread count and a timeout -- confirm against SQAProfilerInterface.
    interface_args = (
        class_sqlite_conn,
        None,
        class_table_entity,
        None,
        file_sampler,
        5,
        43200,
    )
    return SQAProfilerInterface(*interface_args)
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.fixture(scope="class", autouse=True)
def setup_database(class_sqa_profiler_interface):
    """Create and seed the ``users`` table, then drop it once the tests are done.

    Setup drops any existing ``users`` table, recreates it from ``User`` and
    commits two rows through the interface's session. After the ``yield`` the
    table is dropped again and the session is closed.

    Args:
        class_sqa_profiler_interface: interface whose ``session`` (and its
            bind) is used for all DDL and inserts.

    Raises:
        Exception: any error from setup or cleanup is printed and re-raised
            unchanged.
    """
    try:
        # Drop the table if it exists
        User.__table__.drop(
            bind=class_sqa_profiler_interface.session.get_bind(), checkfirst=True
        )
        # Create the table
        User.__table__.create(bind=class_sqa_profiler_interface.session.get_bind())
    except Exception as e:
        print(f"Error during table setup: {str(e)}")
        # Bare raise keeps the original traceback intact.
        raise

    data = [
        User(name="John", fullname="John Doe", nickname="johnny b goode", age=30),
        User(name="Jane", fullname="Jone Doe", nickname=None, age=31),
    ]
    class_sqa_profiler_interface.session.add_all(data)
    class_sqa_profiler_interface.session.commit()

    yield

    # Cleanup
    try:
        try:
            User.__table__.drop(
                bind=class_sqa_profiler_interface.session.get_bind(),
                checkfirst=True,
            )
        finally:
            # Always release the session, even when dropping the table fails;
            # previously a failed drop skipped the close and leaked the session.
            class_sqa_profiler_interface.session.close()
    except Exception as e:
        print(f"Error during cleanup: {str(e)}")
        raise
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.fixture(scope="class")
def metrics(class_sqa_profiler_interface):
    """Group the default metrics for ``User`` into the buckets used by the tests.

    Returns a dict with the keys ``all``, ``static``, ``composed``, ``window``
    and ``query``. ``class_sqa_profiler_interface`` is requested but not used
    in the body (presumably to make sure the interface exists first -- confirm).
    """
    default_metrics = get_default_metrics(Metrics, User)

    static_bucket = []
    composed_bucket = []
    window_bucket = []
    query_bucket = []
    for candidate in default_metrics:
        if issubclass(candidate, StaticMetric):
            static_bucket.append(candidate)
            # Window metrics are the static metrics that flag themselves as such.
            if candidate.is_window_metric():
                window_bucket.append(candidate)
        if issubclass(candidate, ComposedMetric):
            composed_bucket.append(candidate)
        if issubclass(candidate, QueryMetric) and candidate.is_col_metric():
            query_bucket.append(candidate)

    return {
        "all": default_metrics,
        "static": static_bucket,
        "composed": composed_bucket,
        "window": window_bucket,
        "query": query_bucket,
    }
|
2022-07-29 10:41:53 +02:00
|
|
|
|
|
|
|
|
2025-06-17 19:01:00 +02:00
|
|
|
def test_init_interface_multi_thread(class_sqa_profiler_interface):
    """The class-scoped, file-backed interface exposes a SQLAlchemy ``Session``."""
    session = class_sqa_profiler_interface.session
    assert isinstance(session, Session)
|
2022-07-29 10:41:53 +02:00
|
|
|
|
2025-06-17 19:01:00 +02:00
|
|
|
|
|
|
|
def test_get_all_metrics(class_sqa_profiler_interface, metrics):
    """Run table, column, query and window metrics through ``get_all_metrics``.

    The results are folded into a ``CreateTableProfileRequest`` and checked
    against the two rows seeded into the six-column ``users`` table.
    """
    orm_columns = inspect(User).c

    # One table-level request: everything that is neither a column metric
    # nor a system metric.
    table_level = ThreadPoolMetrics(
        metrics=[
            candidate
            for candidate in metrics["all"]
            if not (candidate.is_col_metric() or candidate.is_system_metrics())
        ],
        metric_type=MetricTypes.Table,
        column=None,
        table=User,
    )

    per_column = []
    per_query = []
    per_window = []
    for orm_col in orm_columns:
        # Static column metrics, excluding the window ones handled below.
        per_column.append(
            ThreadPoolMetrics(
                metrics=[
                    candidate
                    for candidate in metrics["static"]
                    if candidate.is_col_metric() and not candidate.is_window_metric()
                ],
                metric_type=MetricTypes.Static,
                column=orm_col,
                table=User,
            )
        )
        # One request per query metric for this column.
        per_query.extend(
            ThreadPoolMetrics(
                metrics=query_metric,
                metric_type=MetricTypes.Query,
                column=orm_col,
                table=User,
            )
            for query_metric in metrics["query"]
        )
        per_window.append(
            ThreadPoolMetrics(
                metrics=[
                    candidate
                    for candidate in metrics["window"]
                    if candidate.is_window_metric()
                ],
                metric_type=MetricTypes.Window,
                column=orm_col,
                table=User,
            )
        )

    requested = [table_level] + per_column + per_query + per_window
    results = class_sqa_profiler_interface.get_all_metrics(requested)

    # Only columns that produced a non-empty result get a profile.
    column_profiles = []
    for orm_col in orm_columns:
        column_result = results["columns"].get(orm_col.name)
        if column_result:
            column_profiles.append(ColumnProfile(**column_result))

    # NOTE(review): the timestamp is built from epoch seconds -- confirm the
    # unit that Timestamp expects.
    request = CreateTableProfileRequest(
        tableProfile=TableProfile(
            columnCount=results["table"].get("columnCount"),
            rowCount=results["table"].get(RowCount.name()),
            timestamp=Timestamp(int(datetime.now().timestamp())),
        ),
        columnProfile=column_profiles,
    )

    def _first_profile(column_name):
        # First column profile whose name matches; IndexError if none does.
        return [
            profile
            for profile in request.columnProfile
            if profile.name == column_name
        ][0]

    assert request.tableProfile.columnCount == 6
    assert request.tableProfile.rowCount == 2
    name_profile = _first_profile("name")
    id_profile = _first_profile("id")
    assert name_profile.nullCount == 0
    assert id_profile.median == 1.0
|