#  Copyright 2021 Collate
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#  http://www.apache.org/licenses/LICENSE-2.0
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
"""
Test Profiler behavior
"""
2022-07-29 10:41:53 +02:00
|
|
|
import os
|
2023-03-08 18:01:25 +01:00
|
|
|
from datetime import datetime
|
2023-07-12 17:02:32 +02:00
|
|
|
from unittest import TestCase, mock
|
2022-08-04 07:22:47 -07:00
|
|
|
from uuid import uuid4
|
2022-02-18 07:48:38 +01:00
|
|
|
|
2022-02-24 07:08:39 +01:00
|
|
|
import pytest
|
2022-05-30 06:53:16 +02:00
|
|
|
import sqlalchemy.types
|
2022-07-20 17:54:10 +02:00
|
|
|
from sqlalchemy import Column, Integer, String
|
2022-02-18 07:48:38 +01:00
|
|
|
from sqlalchemy.orm import declarative_base
|
|
|
|
|
2022-09-15 17:39:29 +02:00
|
|
|
from metadata.generated.schema.api.data.createTableProfile import (
|
|
|
|
CreateTableProfileRequest,
|
|
|
|
)
|
2022-08-04 07:22:47 -07:00
|
|
|
from metadata.generated.schema.entity.data.table import Column as EntityColumn
|
|
|
|
from metadata.generated.schema.entity.data.table import (
|
|
|
|
ColumnName,
|
|
|
|
ColumnProfile,
|
2023-04-19 16:09:13 +02:00
|
|
|
ColumnProfilerConfig,
|
2022-08-04 07:22:47 -07:00
|
|
|
DataType,
|
2023-03-13 19:43:51 +05:30
|
|
|
Histogram,
|
2022-08-04 07:22:47 -07:00
|
|
|
Table,
|
2022-09-15 17:39:29 +02:00
|
|
|
TableProfile,
|
2023-04-19 16:09:13 +02:00
|
|
|
TableProfilerConfig,
|
2022-08-04 07:22:47 -07:00
|
|
|
)
|
2024-04-03 15:51:19 +05:30
|
|
|
from metadata.generated.schema.entity.services.connections.database.datalakeConnection import (
|
|
|
|
DatalakeConnection,
|
|
|
|
)
|
2024-06-05 21:18:37 +02:00
|
|
|
from metadata.generated.schema.type.basic import Timestamp
|
2024-04-03 15:51:19 +05:30
|
|
|
from metadata.generated.schema.type.entityReference import EntityReference
|
2022-05-30 06:53:16 +02:00
|
|
|
from metadata.ingestion.source import sqa_types
|
2023-07-12 17:02:32 +02:00
|
|
|
from metadata.profiler.interface.pandas.profiler_interface import (
|
|
|
|
PandasProfilerInterface,
|
|
|
|
)
|
2023-04-19 16:09:13 +02:00
|
|
|
from metadata.profiler.metrics.core import MetricTypes, add_props
|
2023-04-04 17:16:44 +02:00
|
|
|
from metadata.profiler.metrics.registry import Metrics
|
|
|
|
from metadata.profiler.processor.core import MissingMetricException, Profiler
|
|
|
|
from metadata.profiler.processor.default import DefaultProfiler
|
2022-02-18 07:48:38 +01:00
|
|
|
|
|
|
|
# Shared declarative base for the SQLAlchemy models declared in this module.
Base = declarative_base()


class User(Base):
    """SQLAlchemy model used as a table fixture for profiler tests."""

    __tablename__ = "users"
    id = Column(Integer, primary_key=True)
    name = Column(String(256))
    fullname = Column(String(256))
    nickname = Column(String(256))
    age = Column(Integer)
|
|
|
|
2024-06-20 08:38:21 +02:00
|
|
|
class FakeClient:
    """Minimal stand-in for a datalake client wrapper.

    Exposes only the private ``_client`` attribute that the profiler
    interface expects to find on a real client object.
    """

    def __init__(self):
        # No real connection is ever established during these tests.
        self._client = None
|
2023-07-12 17:02:32 +02:00
|
|
|
class FakeConnection:
    """Stub connection returned by the patched ``get_ssl_connection``.

    Mirrors the shape of a real connection object: a ``client``
    attribute wrapping the underlying (fake) client.
    """

    def __init__(self):
        # Delegate to the FakeClient stub defined above.
        self.client = FakeClient()
|
2022-02-18 07:48:38 +01:00
|
|
|
class ProfilerTest(TestCase):
    """
    Run checks on different metrics
    """

    # Class-level import: pandas is needed while the class body executes,
    # since the CSV fixtures below are loaded at class-definition time.
    import pandas as pd

    # Column names for the headerless CSV fixture files.
    col_names = [
        "name",
        "fullname",
        "nickname",
        "comments",
        "age",
        "dob",
        "tob",
        "doe",
        "json",
        "array",
    ]

    root_dir = os.path.dirname(os.path.abspath(__file__))
    csv_dir = "../custom_csv"
    # Two fixture dataframes; both are handed to the mocked fetch_dataframe
    # in setUpClass so the profiler sees a two-chunk datalake table.
    df1 = pd.read_csv(
        os.path.join(root_dir, csv_dir, "test_datalake_metrics_1.csv"), names=col_names
    )
    df2 = pd.read_csv(
        os.path.join(root_dir, csv_dir, "test_datalake_metrics_2.csv"), names=col_names
    )
    # Table entity describing the fixture schema (10 columns, CSV format).
    table_entity = Table(
        id=uuid4(),
        name="user",
        databaseSchema=EntityReference(id=uuid4(), type="databaseSchema", name="name"),
        fileFormat="csv",
        columns=[
            EntityColumn(
                name=ColumnName("name"),
                dataType=DataType.STRING,
            ),
            EntityColumn(
                name=ColumnName("fullname"),
                dataType=DataType.STRING,
            ),
            EntityColumn(
                name=ColumnName("nickname"),
                dataType=DataType.STRING,
            ),
            EntityColumn(
                name=ColumnName("comments"),
                dataType=DataType.STRING,
            ),
            EntityColumn(
                name=ColumnName("age"),
                dataType=DataType.INT,
            ),
            EntityColumn(
                name=ColumnName("dob"),
                dataType=DataType.DATETIME,
            ),
            EntityColumn(
                name=ColumnName("tob"),
                dataType=DataType.DATE,
            ),
            EntityColumn(
                name=ColumnName("doe"),
                dataType=DataType.DATE,
            ),
            EntityColumn(
                name=ColumnName("json"),
                dataType=DataType.JSON,
            ),
            EntityColumn(
                name=ColumnName("array"),
                dataType=DataType.ARRAY,
            ),
        ],
    )

    @classmethod
    # Patch the connection factory so no real connection is ever opened.
    @mock.patch(
        "metadata.profiler.interface.profiler_interface.get_ssl_connection",
        return_value=FakeConnection(),
    )
    # Patch dataframe fetching to return the CSV fixtures; df2 is padded
    # with empty rows re-indexed on df1's index.
    @mock.patch(
        "metadata.mixins.pandas.pandas_mixin.fetch_dataframe",
        return_value=[df1, pd.concat([df2, pd.DataFrame(index=df1.index)])],
    )
    def setUpClass(cls, mock_get_connection, mocked_dfs):
        """Build the shared PandasProfilerInterface used by every test."""
        cls.datalake_profiler_interface = PandasProfilerInterface(
            entity=cls.table_entity,
            service_connection_config=DatalakeConnection(configSource={}),
            storage_config=None,
            ometa_client=None,
            thread_count=None,
            profile_sample_config=None,
            source_config=None,
            sample_query=None,
            table_partition_config=None,
        )

    def test_default_profiler(self):
        """
        Check our pre-cooked profiler
        """
        simple = DefaultProfiler(
            profiler_interface=self.datalake_profiler_interface,
        )
        simple.compute_metrics()

        profile = simple.get_profile()

        # Row/column counts over the two CSV fixture chunks combined.
        assert profile.tableProfile.rowCount == 6
        assert profile.tableProfile.columnCount == 10

        # Pull the computed profile for the "age" column (None if absent).
        age_profile = next(
            (
                col_profile
                for col_profile in profile.columnProfile
                if col_profile.name == "age"
            ),
            None,
        )

        # Full expected metric set for "age"; the timestamp is copied from
        # the actual result since it is generated at run time.
        assert age_profile == ColumnProfile(
            name="age",
            timestamp=age_profile.timestamp,
            valuesCount=4.0,
            valuesPercentage=None,
            validCount=None,
            duplicateCount=None,
            nullCount=2.0,
            nullProportion=0.3333333333333333,
            missingPercentage=None,
            missingCount=None,
            uniqueCount=2.0,
            uniqueProportion=0.5,
            distinctCount=3.0,
            distinctProportion=0.75,
            min=30.0,
            max=32.0,
            minLength=None,
            maxLength=None,
            mean=31.0,
            sum=124.0,
            stddev=0.816496580927726,
            variance=None,
            median=31.0,
            firstQuartile=30.5,
            thirdQuartile=31.5,
            interQuartileRange=1.0,
            nonParametricSkew=0.0,
            histogram=Histogram(
                boundaries=["30.000 to 31.260", "31.260 and up"], frequencies=[3, 1]
            ),
        )

    def test_required_metrics(self):
        """
        Check that we raise properly MissingMetricException
        when not building the profiler with all the
        required ingredients
        """
        like = add_props(expression="J%")(Metrics.LIKE_COUNT.value)
        count = Metrics.COUNT.value
        like_ratio = Metrics.LIKE_RATIO.value

        # This should run properly
        Profiler(
            like,
            count,
            like_ratio,
            profiler_interface=self.datalake_profiler_interface,
        )

        with pytest.raises(MissingMetricException):
            # We are missing ingredients here
            Profiler(
                like,
                like_ratio,
                profiler_interface=self.datalake_profiler_interface,
            )

    def test_skipped_types(self):
        """
        Check that we are properly skipping computations for
        not supported types
        """

        # Model whose columns are all unsupported (NULL/ARRAY/JSON/MAP/STRUCT).
        # NOTE(review): this model is declared but not passed to the Profiler
        # below — confirm the profiler picks it up via the shared Base registry.
        class NotCompute(Base):
            __tablename__ = "not_compute"
            id = Column(Integer, primary_key=True)
            null_col = Column(sqlalchemy.types.NULLTYPE)
            array_col = Column(sqlalchemy.ARRAY(Integer, dimensions=2))
            json_col = Column(sqlalchemy.JSON)
            map_col = Column(sqa_types.SQAMap)
            struct_col = Column(sqa_types.SQAStruct)

        profiler = Profiler(
            Metrics.COUNT.value,
            profiler_interface=self.datalake_profiler_interface,
        )

        # No metrics have been computed, so no column results should exist.
        assert not profiler.column_results

    def test__check_profile_and_handle(self):
        """test _check_profile_and_handle returns as expected"""
        profiler = Profiler(
            Metrics.COUNT.value,
            profiler_interface=self.datalake_profiler_interface,
        )

        # A profile carrying a columnCount is accepted without raising.
        profiler._check_profile_and_handle(
            CreateTableProfileRequest(
                tableProfile=TableProfile(
                    timestamp=Timestamp(int(datetime.now().timestamp())), columnCount=10
                )
            )
        )

        # A profile with only a profileSample (no counts) must be rejected.
        with pytest.raises(Exception):
            profiler._check_profile_and_handle(
                CreateTableProfileRequest(
                    tableProfile=TableProfile(
                        timestamp=Timestamp(int(datetime.now().timestamp())),
                        profileSample=100,
                    )
                )
            )

    def test_profiler_get_col_metrics(self):
        """Check column metric filtering through the includeColumns config."""
        metric_filter = ["mean", "min", "max", "firstQuartile"]
        # Restrict profiling of the "age" column to the metrics above.
        self.datalake_profiler_interface.table_entity.tableProfilerConfig = (
            TableProfilerConfig(
                includeColumns=[
                    ColumnProfilerConfig(columnName="age", metrics=metric_filter)
                ]
            )
        )  # type: ignore

        default_profiler = DefaultProfiler(
            profiler_interface=self.datalake_profiler_interface,
        )
        column_metrics = default_profiler._prepare_column_metrics()
        for metric in column_metrics:
            # NOTE(review): the filter is configured for column "age" but this
            # guard checks column "id" — the assertion may never execute;
            # confirm the intended column name.
            if (
                metric.metric_type is not MetricTypes.Table
                and metric.column.name == "id"
            ):
                assert all(metric_filter.count(m.name()) for m in metric.metrics)