mirror of
				https://github.com/open-metadata/OpenMetadata.git
				synced 2025-10-31 02:29:03 +00:00 
			
		
		
		
	 c7ac28f2c2
			
		
	
	
		c7ac28f2c2
		
			
		
	
	
	
	
		
			
			* feat: add backend support for custom metrics * feat: fix python test * feat: support custom metrics computation * feat: updated tests for custom metrics * feat: added dl support for min max of datetime * feat: added is safe query check for query sampler * feat: added support for custom metric computation in dl * feat: added explicit addProper for pydantic model import fo Extra * feat: added custom metric to returned obj * feat: wrapped trino import in __init__ * feat: fix python linting * feat: fix typing in 3.8
		
			
				
	
	
		
			234 lines
		
	
	
		
			7.2 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			234 lines
		
	
	
		
			7.2 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| #  Copyright 2021 Collate
 | |
| #  Licensed under the Apache License, Version 2.0 (the "License");
 | |
| #  you may not use this file except in compliance with the License.
 | |
| #  You may obtain a copy of the License at
 | |
| #  http://www.apache.org/licenses/LICENSE-2.0
 | |
| #  Unless required by applicable law or agreed to in writing, software
 | |
| #  distributed under the License is distributed on an "AS IS" BASIS,
 | |
| #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | |
| #  See the License for the specific language governing permissions and
 | |
| #  limitations under the License.
 | |
| 
 | |
| """
 | |
| Test Metrics behavior
 | |
| """
 | |
| # import datetime
 | |
| import os
 | |
| from unittest import TestCase
 | |
| from unittest.mock import patch
 | |
| from uuid import uuid4
 | |
| 
 | |
| import boto3
 | |
| import botocore
 | |
| import pandas as pd
 | |
| from moto import mock_s3
 | |
| 
 | |
| from metadata.generated.schema.entity.data.table import Column as EntityColumn
 | |
| from metadata.generated.schema.entity.data.table import ColumnName, DataType, Table
 | |
| from metadata.generated.schema.entity.services.connections.database.datalake.s3Config import (
 | |
|     S3Config,
 | |
| )
 | |
| from metadata.generated.schema.entity.services.connections.database.datalakeConnection import (
 | |
|     DatalakeConnection,
 | |
| )
 | |
| from metadata.generated.schema.security.credentials.awsCredentials import AWSCredentials
 | |
| from metadata.generated.schema.tests.customMetric import CustomMetric
 | |
| from metadata.profiler.interface.pandas.profiler_interface import (
 | |
|     PandasProfilerInterface,
 | |
| )
 | |
| from metadata.profiler.processor.core import Profiler
 | |
| 
 | |
| BUCKET_NAME = "MyBucket"
 | |
| 
 | |
| 
 | |
| @mock_s3
 | |
| class MetricsTest(TestCase):
 | |
|     """
 | |
|     Run checks on different metrics
 | |
|     """
 | |
| 
 | |
|     current_dir = os.path.dirname(__file__)
 | |
|     resources_dir = os.path.join(current_dir, "resources")
 | |
| 
 | |
|     datalake_conn = DatalakeConnection(
 | |
|         configSource=S3Config(
 | |
|             securityConfig=AWSCredentials(
 | |
|                 awsAccessKeyId="fake_access_key",
 | |
|                 awsSecretAccessKey="fake_secret_key",
 | |
|                 awsRegion="us-west-1",
 | |
|             )
 | |
|         )
 | |
|     )
 | |
| 
 | |
|     dfs = [
 | |
|         pd.read_csv(os.path.join(resources_dir, "profiler_test_.csv"), parse_dates=[5])
 | |
|     ]
 | |
| 
 | |
|     table_entity = Table(
 | |
|         id=uuid4(),
 | |
|         name="user",
 | |
|         columns=[
 | |
|             EntityColumn(
 | |
|                 name=ColumnName(__root__="id"),
 | |
|                 dataType=DataType.INT,
 | |
|             )
 | |
|         ],
 | |
|     )
 | |
| 
 | |
|     def setUp(self):
 | |
|         # Mock our S3 bucket and ingest a file
 | |
|         boto3.DEFAULT_SESSION = None
 | |
|         self.client = boto3.client(
 | |
|             "s3",
 | |
|             region_name="us-weat-1",
 | |
|         )
 | |
| 
 | |
|         # check that we are not running our test against a real bucket
 | |
|         try:
 | |
|             s3 = boto3.resource(
 | |
|                 "s3",
 | |
|                 region_name="us-west-1",
 | |
|                 aws_access_key_id="fake_access_key",
 | |
|                 aws_secret_access_key="fake_secret_key",
 | |
|             )
 | |
|             s3.meta.client.head_bucket(Bucket=BUCKET_NAME)
 | |
|         except botocore.exceptions.ClientError:
 | |
|             pass
 | |
|         else:
 | |
|             err = f"{BUCKET_NAME} should not exist."
 | |
|             raise EnvironmentError(err)
 | |
|         self.client.create_bucket(
 | |
|             Bucket=BUCKET_NAME,
 | |
|             CreateBucketConfiguration={"LocationConstraint": "us-west-1"},
 | |
|         )
 | |
| 
 | |
|         resources_paths = [
 | |
|             os.path.join(path, filename)
 | |
|             for path, _, files in os.walk(self.resources_dir)
 | |
|             for filename in files
 | |
|         ]
 | |
| 
 | |
|         self.s3_keys = []
 | |
| 
 | |
|         for path in resources_paths:
 | |
|             key = os.path.relpath(path, self.resources_dir)
 | |
|             self.s3_keys.append(key)
 | |
|             self.client.upload_file(Filename=path, Bucket=BUCKET_NAME, Key=key)
 | |
| 
 | |
|         with patch.object(
 | |
|             PandasProfilerInterface,
 | |
|             "_convert_table_to_list_of_dataframe_objects",
 | |
|             return_value=self.dfs,
 | |
|         ):
 | |
|             self.sqa_profiler_interface = PandasProfilerInterface(
 | |
|                 self.datalake_conn,
 | |
|                 None,
 | |
|                 self.table_entity,
 | |
|                 None,
 | |
|                 None,
 | |
|                 None,
 | |
|                 None,
 | |
|                 None,
 | |
|                 thread_count=1,
 | |
|             )
 | |
| 
 | |
|     def test_table_custom_metric(self):
 | |
|         table_entity = Table(
 | |
|             id=uuid4(),
 | |
|             name="user",
 | |
|             columns=[
 | |
|                 EntityColumn(
 | |
|                     name=ColumnName(__root__="id"),
 | |
|                     dataType=DataType.INT,
 | |
|                 )
 | |
|             ],
 | |
|             customMetrics=[
 | |
|                 CustomMetric(
 | |
|                     name="LastNameFilter",
 | |
|                     expression="'last_name' != Doe",
 | |
|                 ),
 | |
|                 CustomMetric(
 | |
|                     name="notUS",
 | |
|                     expression="'country == US'",
 | |
|                 ),
 | |
|             ],
 | |
|         )
 | |
|         with patch.object(
 | |
|             PandasProfilerInterface,
 | |
|             "_convert_table_to_list_of_dataframe_objects",
 | |
|             return_value=self.dfs,
 | |
|         ):
 | |
|             self.sqa_profiler_interface = PandasProfilerInterface(
 | |
|                 self.datalake_conn,
 | |
|                 None,
 | |
|                 table_entity,
 | |
|                 None,
 | |
|                 None,
 | |
|                 None,
 | |
|                 None,
 | |
|                 None,
 | |
|                 thread_count=1,
 | |
|             )
 | |
| 
 | |
|         profiler = Profiler(
 | |
|             profiler_interface=self.sqa_profiler_interface,
 | |
|         )
 | |
|         metrics = profiler.compute_metrics()
 | |
|         for k, v in metrics._table_results.items():
 | |
|             for metric in v:
 | |
|                 if metric.name == "LastNameFilter":
 | |
|                     assert metric.value == 1
 | |
|                 if metric.name == "notUS":
 | |
|                     assert metric.value == 2
 | |
| 
 | |
|     def test_column_custom_metric(self):
 | |
|         table_entity = Table(
 | |
|             id=uuid4(),
 | |
|             name="user",
 | |
|             columns=[
 | |
|                 EntityColumn(
 | |
|                     name=ColumnName(__root__="id"),
 | |
|                     dataType=DataType.INT,
 | |
|                     customMetrics=[
 | |
|                         CustomMetric(
 | |
|                             name="LastNameFilter",
 | |
|                             columnName="id",
 | |
|                             expression="'last_name' != Doe",
 | |
|                         ),
 | |
|                         CustomMetric(
 | |
|                             name="notUS",
 | |
|                             columnName="id",
 | |
|                             expression="'country == US'",
 | |
|                         ),
 | |
|                     ],
 | |
|                 )
 | |
|             ],
 | |
|         )
 | |
|         with patch.object(
 | |
|             PandasProfilerInterface,
 | |
|             "_convert_table_to_list_of_dataframe_objects",
 | |
|             return_value=self.dfs,
 | |
|         ):
 | |
|             self.sqa_profiler_interface = PandasProfilerInterface(
 | |
|                 self.datalake_conn,
 | |
|                 None,
 | |
|                 table_entity,
 | |
|                 None,
 | |
|                 None,
 | |
|                 None,
 | |
|                 None,
 | |
|                 None,
 | |
|                 thread_count=1,
 | |
|             )
 | |
| 
 | |
|         profiler = Profiler(
 | |
|             profiler_interface=self.sqa_profiler_interface,
 | |
|         )
 | |
|         metrics = profiler.compute_metrics()
 | |
|         for k, v in metrics._column_results.items():
 | |
|             for metric in v.get("customMetrics", []):
 | |
|                 if metric.name == "CustomerBornedAfter1991":
 | |
|                     assert metric.value == 1
 | |
|                 if metric.name == "AverageAge":
 | |
|                     assert metric.value == 2
 |