OpenMetadata/ingestion/tests/unit/profiler/pandas/test_custom_metrics.py

# Copyright 2021 Collate
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Test Metrics behavior
"""
import os
from unittest import TestCase
from unittest.mock import patch
from uuid import uuid4

import boto3
import botocore
import pandas as pd
from moto import mock_s3

from metadata.generated.schema.entity.data.table import Column as EntityColumn
from metadata.generated.schema.entity.data.table import ColumnName, DataType, Table
from metadata.generated.schema.entity.services.connections.database.datalake.s3Config import (
S3Config,
)
from metadata.generated.schema.entity.services.connections.database.datalakeConnection import (
DatalakeConnection,
)
from metadata.generated.schema.security.credentials.awsCredentials import AWSCredentials
from metadata.generated.schema.tests.customMetric import CustomMetric
from metadata.profiler.interface.pandas.profiler_interface import (
PandasProfilerInterface,
)
from metadata.profiler.processor.core import Profiler
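
# Bucket name for the moto-mocked S3 backend; no real AWS resources are used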
BUCKET_NAME = "MyBucket"


@mock_s3
class MetricsTest(TestCase):
"""
    Run checks on custom metric computation
"""
current_dir = os.path.dirname(__file__)
resources_dir = os.path.join(current_dir, "resources")
datalake_conn = DatalakeConnection(
configSource=S3Config(
securityConfig=AWSCredentials(
awsAccessKeyId="fake_access_key",
awsSecretAccessKey="fake_secret_key",
awsRegion="us-west-1",
)
)
)
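
    # Test data: the resources CSV loaded as a single dataframe,
    # with column 5 parsed as datetimes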
dfs = [
pd.read_csv(os.path.join(resources_dir, "profiler_test_.csv"), parse_dates=[5])
]
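
    # Minimal table entity (one INT column) profiled by default in setUp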
table_entity = Table(
id=uuid4(),
name="user",
columns=[
EntityColumn(
name=ColumnName(__root__="id"),
dataType=DataType.INT,
)
],
)

    def setUp(self):
# Mock our S3 bucket and ingest a file
boto3.DEFAULT_SESSION = None
self.client = boto3.client(
"s3",
region_name="us-weat-1",
)
# check that we are not running our test against a real bucket
try:
s3 = boto3.resource(
"s3",
region_name="us-west-1",
aws_access_key_id="fake_access_key",
aws_secret_access_key="fake_secret_key",
)
s3.meta.client.head_bucket(Bucket=BUCKET_NAME)
except botocore.exceptions.ClientError:
pass
else:
err = f"{BUCKET_NAME} should not exist."
raise EnvironmentError(err)
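
        # Create the mocked bucket and upload every file under resources/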
self.client.create_bucket(
Bucket=BUCKET_NAME,
CreateBucketConfiguration={"LocationConstraint": "us-west-1"},
)
resources_paths = [
os.path.join(path, filename)
for path, _, files in os.walk(self.resources_dir)
for filename in files
]
self.s3_keys = []
for path in resources_paths:
key = os.path.relpath(path, self.resources_dir)
self.s3_keys.append(key)
self.client.upload_file(Filename=path, Bucket=BUCKET_NAME, Key=key)
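
        # Patch the dataframe loader so the interface uses the in-memory
        # test dataframes rather than reading from the mocked bucket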
with patch.object(
PandasProfilerInterface,
"_convert_table_to_list_of_dataframe_objects",
return_value=self.dfs,
):
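            # Positional arguments not exercised by these tests are passed as None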
            self.profiler_interface = PandasProfilerInterface(
self.datalake_conn,
None,
self.table_entity,
None,
None,
None,
None,
None,
thread_count=1,
)

    def test_table_custom_metric(self):
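        """Table-level custom metrics should be computed by the profiler"""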
table_entity = Table(
id=uuid4(),
name="user",
columns=[
EntityColumn(
name=ColumnName(__root__="id"),
dataType=DataType.INT,
)
],
customMetrics=[
CustomMetric(
name="LastNameFilter",
expression="'last_name' != Doe",
),
CustomMetric(
name="notUS",
expression="'country == US'",
),
],
)
with patch.object(
PandasProfilerInterface,
"_convert_table_to_list_of_dataframe_objects",
return_value=self.dfs,
):
            self.profiler_interface = PandasProfilerInterface(
self.datalake_conn,
None,
table_entity,
None,
None,
None,
None,
None,
thread_count=1,
)
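
        # Compute all metrics; custom metrics are picked up from the table entity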
profiler = Profiler(
            profiler_interface=self.profiler_interface,
)
metrics = profiler.compute_metrics()
        for metric_list in metrics._table_results.values():
            for metric in metric_list:
if metric.name == "LastNameFilter":
assert metric.value == 1
if metric.name == "notUS":
assert metric.value == 2

    def test_column_custom_metric(self):
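        """Column-level custom metrics should be computed by the profiler"""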
table_entity = Table(
id=uuid4(),
name="user",
columns=[
EntityColumn(
name=ColumnName(__root__="id"),
dataType=DataType.INT,
customMetrics=[
CustomMetric(
name="LastNameFilter",
columnName="id",
expression="'last_name' != Doe",
),
CustomMetric(
name="notUS",
columnName="id",
expression="'country == US'",
),
],
)
],
)
with patch.object(
PandasProfilerInterface,
"_convert_table_to_list_of_dataframe_objects",
return_value=self.dfs,
):
            self.profiler_interface = PandasProfilerInterface(
self.datalake_conn,
None,
table_entity,
None,
None,
None,
None,
None,
thread_count=1,
)
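
        # Compute all metrics; custom metrics are picked up from the column entity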
profiler = Profiler(
            profiler_interface=self.profiler_interface,
)
metrics = profiler.compute_metrics()
        for column_result in metrics._column_results.values():
            for metric in column_result.get("customMetrics", []):
if metric.name == "CustomerBornedAfter1991":
assert metric.value == 1
if metric.name == "AverageAge":
assert metric.value == 2