2022-07-20 17:54:10 +02:00
|
|
|
# Copyright 2021 Collate
|
|
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
# you may not use this file except in compliance with the License.
|
|
|
|
# You may obtain a copy of the License at
|
|
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
# See the License for the specific language governing permissions and
|
|
|
|
# limitations under the License.
|
|
|
|
|
|
|
|
"""
|
|
|
|
Test SQA Interface
|
|
|
|
"""
|
|
|
|
|
2022-07-29 10:41:53 +02:00
|
|
|
import os
|
2024-06-07 04:36:17 +02:00
|
|
|
from datetime import datetime
|
2023-07-12 17:02:32 +02:00
|
|
|
from unittest import TestCase, mock
|
2022-08-19 10:52:08 +02:00
|
|
|
from uuid import uuid4
|
2022-07-20 17:54:10 +02:00
|
|
|
|
2022-07-29 10:41:53 +02:00
|
|
|
from sqlalchemy import TEXT, Column, Integer, String, inspect
|
2022-07-20 17:54:10 +02:00
|
|
|
from sqlalchemy.orm import declarative_base
|
|
|
|
|
2022-08-22 09:01:24 -07:00
|
|
|
from metadata.generated.schema.api.data.createTableProfile import (
|
|
|
|
CreateTableProfileRequest,
|
|
|
|
)
|
2022-08-19 10:52:08 +02:00
|
|
|
from metadata.generated.schema.entity.data.table import Column as EntityColumn
|
|
|
|
from metadata.generated.schema.entity.data.table import (
|
|
|
|
ColumnName,
|
|
|
|
ColumnProfile,
|
|
|
|
DataType,
|
|
|
|
Table,
|
|
|
|
TableProfile,
|
|
|
|
)
|
2024-04-03 15:51:19 +05:30
|
|
|
from metadata.generated.schema.entity.services.connections.database.datalakeConnection import (
|
|
|
|
DatalakeConnection,
|
|
|
|
)
|
2024-06-05 21:18:37 +02:00
|
|
|
from metadata.generated.schema.type.basic import Timestamp
|
2024-04-03 15:51:19 +05:30
|
|
|
from metadata.generated.schema.type.entityReference import EntityReference
|
2023-11-17 17:51:39 +01:00
|
|
|
from metadata.profiler.api.models import ThreadPoolMetrics
|
2023-07-12 17:02:32 +02:00
|
|
|
from metadata.profiler.interface.pandas.profiler_interface import (
|
|
|
|
PandasProfilerInterface,
|
2023-04-04 17:16:44 +02:00
|
|
|
)
|
2023-03-01 08:20:38 +01:00
|
|
|
from metadata.profiler.metrics.core import (
|
2022-07-29 10:41:53 +02:00
|
|
|
ComposedMetric,
|
|
|
|
MetricTypes,
|
|
|
|
QueryMetric,
|
|
|
|
StaticMetric,
|
|
|
|
)
|
2023-03-01 08:20:38 +01:00
|
|
|
from metadata.profiler.metrics.static.row_count import RowCount
|
2023-04-04 17:16:44 +02:00
|
|
|
from metadata.profiler.processor.default import get_default_metrics
|
2022-07-20 17:54:10 +02:00
|
|
|
|
|
|
|
|
|
|
|
class User(declarative_base()):
    """Declarative SQLAlchemy model used as the metric-definition source.

    The profiler tests below iterate ``inspect(User).c`` to build per-column
    metric requests; only the column metadata matters — no rows are ever
    inserted through this model.
    """

    __tablename__ = "users"
    # "id" is skipped explicitly in test_get_all_metrics, so metrics are only
    # computed for the remaining columns.
    id = Column(Integer, primary_key=True)
    name = Column(String(256))
    fullname = Column(String(256))
    nickname = Column(String(256))
    comments = Column(TEXT)
    age = Column(Integer)
2023-07-12 17:02:32 +02:00
|
|
|
class FakeConnection:
    """Minimal stand-in for a service connection object.

    Patched in as the return value of ``get_ssl_connection`` so the profiler
    interface can be constructed without touching a real datalake client.
    """

    def client(self):
        """Return ``None``: no underlying client is ever needed in these tests."""
        return None
2023-07-12 17:02:32 +02:00
|
|
|
class PandasInterfaceTest(TestCase):
    """End-to-end test of ``PandasProfilerInterface.get_all_metrics`` against
    two in-memory dataframes loaded from the test CSV fixtures."""

    # Imported at class level so the class-body fixtures below can use it.
    import pandas as pd

    # Column names applied to the header-less test CSV files.
    col_names = [
        "name",
        "fullname",
        "nickname",
        "comments",
        "age",
        "dob",
        "tob",
        "doe",
        "json",
        "array",
    ]
    root_dir = os.path.dirname(os.path.abspath(__file__))
    csv_dir = "../custom_csv"
    df1 = pd.read_csv(
        os.path.join(root_dir, csv_dir, "test_datalake_metrics_1.csv"), names=col_names
    )
    df2 = pd.read_csv(
        os.path.join(root_dir, csv_dir, "test_datalake_metrics_2.csv"), names=col_names
    )

    # Table entity whose columns mirror ``col_names``; it is what the profiler
    # interface profiles (fileFormat="csv" marks it as a datalake table).
    table_entity = Table(
        id=uuid4(),
        name="user",
        databaseSchema=EntityReference(id=uuid4(), type="databaseSchema", name="name"),
        fileFormat="csv",
        columns=[
            EntityColumn(
                name=ColumnName("name"),
                dataType=DataType.STRING,
            ),
            EntityColumn(
                name=ColumnName("fullname"),
                dataType=DataType.STRING,
            ),
            EntityColumn(
                name=ColumnName("nickname"),
                dataType=DataType.STRING,
            ),
            EntityColumn(
                name=ColumnName("comments"),
                dataType=DataType.STRING,
            ),
            EntityColumn(
                name=ColumnName("age"),
                dataType=DataType.INT,
            ),
            EntityColumn(
                name=ColumnName("dob"),
                dataType=DataType.DATETIME,
            ),
            EntityColumn(
                name=ColumnName("tob"),
                dataType=DataType.DATE,
            ),
            EntityColumn(
                name=ColumnName("doe"),
                dataType=DataType.DATE,
            ),
            EntityColumn(
                name=ColumnName("json"),
                dataType=DataType.JSON,
            ),
            EntityColumn(
                name=ColumnName("array"),
                dataType=DataType.ARRAY,
            ),
        ],
    )

    # NOTE(review): ``setUp`` is declared as a classmethod (unconventional for
    # unittest) so the interface is stored on the class itself. The two patches
    # replace the real connection with FakeConnection and feed the interface
    # the two fixture dataframes; df2 is concatenated with an empty frame
    # indexed like df1 — presumably to pad it with NaN rows, verify against
    # the fixture CSVs.
    @classmethod
    @mock.patch(
        "metadata.profiler.interface.profiler_interface.get_ssl_connection",
        return_value=FakeConnection,
    )
    @mock.patch(
        "metadata.mixins.pandas.pandas_mixin.fetch_dataframe",
        return_value=[df1, pd.concat([df2, pd.DataFrame(index=df1.index)])],
    )
    def setUp(cls, mock_get_connection, mocked_dfs) -> None:
        cls.datalake_profiler_interface = PandasProfilerInterface(
            entity=cls.table_entity,
            service_connection_config=DatalakeConnection(configSource={}),
            storage_config=None,
            ometa_client=None,
            thread_count=None,
            profile_sample_config=None,
            source_config=None,
            sample_query=None,
            table_partition_config=None,
        )

    @classmethod
    def setUpClass(cls) -> None:
        """
        Prepare Ingredients
        """
        cls.table = User
        # Partition the default metric set by metric kind; these buckets drive
        # the ThreadPoolMetrics requests built in test_get_all_metrics.
        cls.metrics = get_default_metrics(cls.table)
        cls.static_metrics = [
            metric for metric in cls.metrics if issubclass(metric, StaticMetric)
        ]
        cls.composed_metrics = [
            metric for metric in cls.metrics if issubclass(metric, ComposedMetric)
        ]
        cls.window_metrics = [
            metric
            for metric in cls.metrics
            if issubclass(metric, StaticMetric) and metric.is_window_metric()
        ]
        cls.query_metrics = [
            metric
            for metric in cls.metrics
            if issubclass(metric, QueryMetric) and metric.is_col_metric()
        ]

    def test_get_all_metrics(self):
        # Table-level metrics: everything that is neither a column metric nor
        # a system metric.
        table_metrics = [
            ThreadPoolMetrics(
                metrics=[
                    metric
                    for metric in self.metrics
                    if (not metric.is_col_metric() and not metric.is_system_metrics())
                ],
                metric_type=MetricTypes.Table,
                column=None,
                table=self.table_entity,
            )
        ]
        column_metrics = []
        query_metrics = []
        window_metrics = []
        # Build one static, one window, and N query metric requests per
        # column of the SQLAlchemy model (the primary key is skipped).
        for col in inspect(User).c:
            if col.name == "id":
                continue
            column_metrics.append(
                ThreadPoolMetrics(
                    metrics=[
                        metric
                        for metric in self.static_metrics
                        if metric.is_col_metric() and not metric.is_window_metric()
                    ],
                    metric_type=MetricTypes.Static,
                    column=col,
                    table=self.table_entity,
                )
            )
            for query_metric in self.query_metrics:
                query_metrics.append(
                    ThreadPoolMetrics(
                        metrics=query_metric,
                        metric_type=MetricTypes.Query,
                        column=col,
                        table=self.table_entity,
                    )
                )
            window_metrics.append(
                ThreadPoolMetrics(
                    metrics=[
                        metric
                        for metric in self.window_metrics
                        if metric.is_window_metric()
                    ],
                    metric_type=MetricTypes.Window,
                    column=col,
                    table=self.table_entity,
                )
            )

        all_metrics = [*table_metrics, *column_metrics, *query_metrics, *window_metrics]

        profile_results = self.datalake_profiler_interface.get_all_metrics(
            all_metrics,
        )

        # Keep only columns for which the interface produced a profile dict.
        column_profile = [
            ColumnProfile(**profile_results["columns"].get(col.name))
            for col in inspect(User).c
            if profile_results["columns"].get(col.name)
        ]

        table_profile = TableProfile(
            columnCount=profile_results["table"].get("columnCount"),
            rowCount=profile_results["table"].get(RowCount.name()),
            timestamp=Timestamp(int(datetime.now().timestamp())),
        )

        profile_request = CreateTableProfileRequest(
            tableProfile=table_profile, columnProfile=column_profile
        )

        # Expected values come from the two fixture CSVs (10 columns, 6 rows
        # total across both dataframes).
        assert profile_request.tableProfile.columnCount == 10
        assert profile_request.tableProfile.rowCount == 6
        name_column_profile = [
            profile
            for profile in profile_request.columnProfile
            if profile.name == "name"
        ][0]
        age_column_profile = [
            profile
            for profile in profile_request.columnProfile
            if profile.name == "age"
        ][0]
        assert name_column_profile.nullCount == 2.0
        assert age_column_profile.median == 31.0
|