Mirror of https://github.com/open-metadata/OpenMetadata.git (synced 2025-10-04 13:26:30 +00:00)

Commit 328658ebea: [WIP] profiler
Parent: 4fd383e5bf

@@ -2,9 +2,9 @@
     "source": {
         "type": "redshift",
         "config": {
-            "host_port": "cluster.name.region.redshift.amazonaws.com:5439",
-            "username": "username",
-            "password": "strong_password",
+            "host_port": "redshift-cluster-1.clot5cqn1cnb.us-west-2.redshift.amazonaws.com:5439",
+            "username": "awsuser",
+            "password": "focguC-kaqqe5-nepsok",
            "database": "warehouse",
            "service_name": "aws_redshift",
            "filter_pattern": {
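
Reviewer note: the updated Redshift example replaces the placeholder endpoint and credentials with what looks like a real cluster hostname and password committed in plain text; worth scrubbing these back to placeholders before this leaves WIP.
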
@@ -4,9 +4,10 @@
        "config": {
            "username": "openmetadata_user",
            "password": "openmetadata_password",
+           "database": "openmetadata_db",
            "service_name": "local_mysql",
            "filter_pattern": {
-               "excludes": ["mysql.*", "information_schema.*"]
+               "excludes": ["mysql.*", "information_schema.*", "performance_schema.*", "sys.*"]
            }
        }
    },
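
Reviewer note: the excludes entries are regular expressions, presumably applied through the same IncludeFilterPattern machinery used in the source changes below; adding performance_schema.* and sys.* keeps the remaining MySQL internal schemas out of ingestion and, by extension, out of the new profiler run.
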
@@ -17,3 +17,4 @@ fastavro>=1.2.0
 google~=3.0.0
 okta~=2.0.0
 PyMySQL~=1.0.2
+great-expectations>=0.13.31
@@ -64,6 +64,7 @@ base_requirements = {
     "sql-metadata~=2.0.0",
     "spacy==3.0.5",
     "requests~=2.25.1",
+    "great-expectations>=0.13.31",
     "en_core_web_sm@https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0.tar.gz#egg=en_core_web"
 }
 base_plugins = {
@@ -258,3 +258,50 @@ class Dashboard(BaseModel):
     charts: List[str]
     service: EntityReference
     lastModified: int = None
+
+
+class ValueFrequency(BaseModel):
+    """Profiler value frequency"""
+    value: str
+    frequency: int
+
+
+class Histogram(BaseModel):
+    """Histogram"""
+    boundaries: List[str]
+    heights: List[str]
+
+
+class Quantile(BaseModel):
+    """Quantile"""
+    quantile: str
+    value: str
+
+
+class DatasetColumnProfile(BaseModel):
+    """Dataset column profile stats"""
+    fqdn: str
+    unique_count: int = None
+    unique_proportion: float = None
+    null_count: int = None
+    null_proportion: float = None
+    min: str = None
+    max: str = None
+    mean: str = None
+    median: str = None
+    stddev: str = None
+    quantiles: List[Quantile] = None
+    distinct_value_frequencies: List[ValueFrequency] = None
+    histogram: List[Histogram] = None
+    sample_values: List[str] = None
+
+
+class DatasetProfile(BaseModel):
+    """Dataset (table) stats"""
+    timestamp: int
+    table_name: str
+    row_count: int = None
+    col_count: int = None
+    col_profiles: List[DatasetColumnProfile] = None
+
+
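
Reviewer note: the proportion fields are typed float here rather than int, since the converter below assigns 0-1 ratios (for example unexpected_percent / 100). The new pydantic models nest as DatasetProfile -> DatasetColumnProfile -> Quantile / ValueFrequency / Histogram; a minimal construction sketch (field values invented for illustration; the epoch-millisecond timestamp mirrors what DataProfiler below produces):

import time

from metadata.ingestion.models.table_metadata import (
    DatasetColumnProfile,
    DatasetProfile,
    Quantile,
)

# One profiled column: proportions are 0-1 floats, numeric stats travel as strings.
col = DatasetColumnProfile(
    fqdn="public.orders.amount",
    unique_count=9876,
    unique_proportion=0.08,
    null_count=240,
    null_proportion=0.002,
    min="0.00",
    max="1999.99",
    quantiles=[Quantile(quantile="0.5", value="49.90")],
)
profile = DatasetProfile(
    timestamp=round(time.time() * 1000),  # epoch milliseconds
    table_name="public.orders",
    row_count=120000,
    col_count=12,
    col_profiles=[col],
)
print(profile.json())
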
@@ -29,7 +29,7 @@ from metadata.generated.schema.type.entityReference import EntityReference
 from metadata.generated.schema.entity.data.database import Database

 from metadata.generated.schema.entity.data.table import Table, Column, ColumnConstraint, TableType, TableData
-from sqlalchemy import create_engine, inspect
+from sqlalchemy import create_engine
 from sqlalchemy.engine.reflection import Inspector
 from sqlalchemy.sql import sqltypes as types
 from sqlalchemy.inspection import inspect
@@ -37,8 +37,10 @@ from sqlalchemy.inspection import inspect
 from metadata.ingestion.api.common import IncludeFilterPattern, ConfigModel, Record
 from metadata.ingestion.api.common import WorkflowContext
 from metadata.ingestion.api.source import Source, SourceStatus
+from metadata.ingestion.models.table_metadata import DatasetProfile
 from metadata.ingestion.ometa.openmetadata_rest import MetadataServerConfig
 from metadata.utils.helpers import get_database_service_or_create
+from metadata.utils.dataprofiler import DataProfiler

 logger: logging.Logger = logging.getLogger(__name__)

@@ -72,6 +74,7 @@ class SQLConnectionConfig(ConfigModel):
     include_views: Optional[bool] = True
     include_tables: Optional[bool] = True
     generate_sample_data: Optional[bool] = True
+    data_profiler_enabled: Optional[bool] = True
     filter_pattern: IncludeFilterPattern = IncludeFilterPattern.allow_all()

     @abstractmethod
@@ -152,8 +155,10 @@ def get_column_type(status: SQLSourceStatus, dataset_name: str, column_type: Any
     return type_class


+
+
 class SQLSource(Source):
     def __init__(self, config: SQLConnectionConfig, metadata_config: MetadataServerConfig,
                  ctx: WorkflowContext):
         super().__init__(ctx)
@@ -172,6 +177,9 @@ class SQLSource(Source):
     def create(cls, config_dict: dict, metadata_config_dict: dict, ctx: WorkflowContext):
         pass

+    def _get_profiler_instance(self, inspector: Inspector) -> DataProfiler:
+        return DataProfiler(conn=inspector.bind, status=self.status)
+
     def standardize_schema_table_names(
             self, schema: str, table: str
     ) -> Tuple[str, str]:
@@ -211,7 +219,7 @@ class SQLSource(Source):
             schema, table_name = self.standardize_schema_table_names(schema, table_name)
             if not self.sql_config.filter_pattern.included(table_name):
                 self.status.filter('{}.{}'.format(self.config.get_service_name(), table_name),
                                    "Table pattern not allowed")
                 continue
             self.status.scanned('{}.{}'.format(self.config.get_service_name(), table_name))

@@ -227,6 +235,11 @@ class SQLSource(Source):
                     table_data = self.fetch_sample_data(schema, table_name)
                     table_entity.sampleData = table_data

+                if self.config.data_profiler_enabled:
+                    data_profiler = self._get_profiler_instance(inspector)
+                    profile = self.run_data_profiler(data_profiler, table_name, schema)
+                    logger.info(profile.json())
+
                 table_and_db = OMetaDatabaseAndTable(table=table_entity, database=self._get_database(schema))
                 yield table_and_db
             except ValidationError as err:
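
Reviewer note: as wired above, the generated profile is only logged (logger.info(profile.json())) and is not yet attached to table_entity, consistent with the [WIP] title. Also, _get_profiler_instance is called inside the per-table loop, so a fresh great_expectations context is built for every table; hoisting it out of the loop would avoid repeating that setup.
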
@@ -241,7 +254,7 @@ class SQLSource(Source):
             try:
                 if not self.sql_config.filter_pattern.included(view_name):
                     self.status.filter('{}.{}'.format(self.config.get_service_name(), view_name),
                                        "View pattern not allowed")
                     continue
                 try:
                     view_definition = inspector.get_view_definition(view_name, schema)
@@ -324,6 +337,24 @@ class SQLSource(Source):
             description = table_info["text"]
         return description

+    def run_data_profiler(
+            self,
+            profiler: DataProfiler,
+            table: str,
+            schema: str
+    ) -> DatasetProfile:
+        dataset_name = f"{schema}.{table}"
+        self.status.scanned(f"profile of {dataset_name}")
+        logger.info(f"Profiling {dataset_name} (this may take a while)")
+        profile = profiler.generate_profile(
+            pretty_name=dataset_name,
+            schema=schema,
+            table=table,
+            limit=50000,
+            offset=0)
+        logger.debug(f"Finished profiling {dataset_name}")
+        return profile
+
     def close(self):
         if self.connection is not None:
             self.connection.close()
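
Reviewer note: run_data_profiler hard-codes limit=50000 and offset=0, so great_expectations profiles at most the first 50,000 rows of a table; on larger tables every reported statistic, including row_count, describes that leading slice rather than the full table. Making the limit configurable alongside data_profiler_enabled may be worth considering.
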

ingestion/src/metadata/utils/dataprofiler.py (new file, 270 lines)
@@ -0,0 +1,270 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import collections
+import contextlib
+import dataclasses
+import logging
+import unittest.mock
+import time
+from typing import Any, Iterable, Optional, Callable, Tuple, TypeVar
+
+from great_expectations.core.expectation_validation_result import (
+    ExpectationSuiteValidationResult,
+    ExpectationValidationResult,
+)
+from great_expectations.data_context import BaseDataContext
+from great_expectations.data_context.types.base import (
+    DataContextConfig,
+    DatasourceConfig,
+    InMemoryStoreBackendDefaults,
+)
+from great_expectations.datasource.sqlalchemy_datasource import SqlAlchemyDatasource
+
+from metadata.ingestion.api.source import SourceStatus
+from metadata.ingestion.models.table_metadata import DatasetProfile, DatasetColumnProfile, Quantile, \
+    Histogram, ValueFrequency
+
+logger: logging.Logger = logging.getLogger(__name__)
+
+T = TypeVar("T")
+K = TypeVar("K")
+
+
+def groupby_unsorted(
+    iterable: Iterable[T], key: Callable[[T], K]
+) -> Iterable[Tuple[K, Iterable[T]]]:
+    """The default itertools.groupby() requires that the iterable is already sorted by the key.
+    This method is similar to groupby() but without the pre-sorted requirement."""
+
+    values = collections.defaultdict(list)
+    for v in iterable:
+        values[key(v)].append(v)
+    return values.items()
+
+
+@contextlib.contextmanager
+def _properly_init_datasource(conn):
+    underlying_datasource_init = SqlAlchemyDatasource.__init__
+
+    def sqlalchemy_datasource_init(
+        self: SqlAlchemyDatasource, *args: Any, **kwargs: Any
+    ) -> None:
+        underlying_datasource_init(self, *args, **kwargs, engine=conn)
+        self.drivername = conn.dialect.name
+        del self._datasource_config["engine"]
+
+    with unittest.mock.patch(
+        "great_expectations.datasource.sqlalchemy_datasource.SqlAlchemyDatasource.__init__",
+        sqlalchemy_datasource_init,
+    ), unittest.mock.patch(
+        "great_expectations.data_context.store.validations_store.ValidationsStore.set"
+    ):
+        yield
+
+
+@dataclasses.dataclass
+class DataProfiler:
+    data_context: BaseDataContext
+    status: SourceStatus
+    datasource_name: str = "om_sqlalchemy_datasource"
+
+    def __init__(self, conn, status):
+        self.conn = conn
+        self.status = status
+
+        data_context_config = DataContextConfig(
+            datasources={
+                self.datasource_name: DatasourceConfig(
+                    class_name="SqlAlchemyDatasource",
+                    credentials={
+                        "url": self.conn.engine.url,
+                    },
+                )
+            },
+            store_backend_defaults=InMemoryStoreBackendDefaults(),
+            anonymous_usage_statistics={
+                "enabled": False,
+            },
+        )
+
+        with _properly_init_datasource(self.conn):
+            self.data_context = BaseDataContext(project_config=data_context_config)
+
+    def generate_profile(
+        self,
+        pretty_name: str,
+        schema: str = None,
+        table: str = None,
+        limit: int = None,
+        offset: int = None,
+        **kwargs: Any,
+    ) -> DatasetProfile:
+        with _properly_init_datasource(self.conn):
+            evrs = self._profile_data_asset(
+                {
+                    "schema": schema,
+                    "table": table,
+                    "limit": limit,
+                    "offset": offset,
+                    **kwargs,
+                },
+                pretty_name=pretty_name,
+            )
+        profile = self._convert_evrs_to_profile(evrs, pretty_name=pretty_name)
+        return profile
+
+    def _profile_data_asset(
+        self,
+        batch_kwargs: dict,
+        pretty_name: str,
+    ) -> ExpectationSuiteValidationResult:
+        # Internally, this uses the GE dataset profiler:
+        # great_expectations.profile.basic_dataset_profiler.BasicDatasetProfiler
+
+        profile_results = self.data_context.profile_data_asset(
+            self.datasource_name,
+            batch_kwargs={
+                "datasource": self.datasource_name,
+                **batch_kwargs,
+            },
+        )
+        assert profile_results["success"]
+
+        assert len(profile_results["results"]) == 1
+        _suite, evrs = profile_results["results"][0]
+        return evrs
+
+    @staticmethod
+    def _get_column_from_evr(evr: ExpectationValidationResult) -> Optional[str]:
+        return evr.expectation_config.kwargs.get("column")
+
+    def _convert_evrs_to_profile(
+        self, evrs: ExpectationSuiteValidationResult, pretty_name: str
+    ) -> DatasetProfile:
+        profile = None
+        column_profiles = []
+        for col, evrs_for_col in groupby_unsorted(
+            evrs.results, key=self._get_column_from_evr
+        ):
+            if col is None:
+                profile = self._handle_convert_table_evrs(evrs_for_col, pretty_name=pretty_name)
+            else:
+                column_profile = self._handle_convert_column_evrs(col, evrs_for_col, pretty_name=pretty_name)
+                column_profiles.append(column_profile)
+
+        if profile is not None:
+            profile.col_profiles = column_profiles
+        return profile
+
+    def _handle_convert_table_evrs(
+        self,
+        table_evrs: Iterable[ExpectationValidationResult],
+        pretty_name: str,
+    ) -> DatasetProfile:
+        logger.info("generating table stats")
+        profile = DatasetProfile(timestamp=round(time.time() * 1000), table_name=pretty_name)
+        for evr in table_evrs:
+            exp: str = evr.expectation_config.expectation_type
+            res: dict = evr.result
+            if exp == "expect_table_row_count_to_be_between":
+                profile.row_count = res["observed_value"]
+            elif exp == "expect_table_columns_to_match_ordered_list":
+                profile.col_count = len(res["observed_value"])
+            else:
+                self.status.warning(
+                    f"profile of {pretty_name}", f"unknown table mapper {exp}"
+                )
+        return profile
+
+    def _handle_convert_column_evrs(
+        self,
+        column: str,
+        col_evrs: Iterable[ExpectationValidationResult],
+        pretty_name: str,
+    ) -> DatasetColumnProfile:
+        logger.info(f"Generating Column Stats for {column}")
+        column_profile = DatasetColumnProfile(fqdn=column)
+        for evr in col_evrs:
+            exp: str = evr.expectation_config.expectation_type
+            res: dict = evr.result
+            if not res:
+                self.status.warning(
+                    f"profile of {pretty_name}", f"{exp} did not yield any results"
+                )
+                continue
+
+            if exp == "expect_column_unique_value_count_to_be_between":
+                column_profile.unique_count = res["observed_value"]
+            elif exp == "expect_column_proportion_of_unique_values_to_be_between":
+                column_profile.unique_proportion = res["observed_value"]
+            elif exp == "expect_column_values_to_not_be_null":
+                column_profile.null_count = res["unexpected_count"]
+                if (
+                    "unexpected_percent" in res
+                    and res["unexpected_percent"] is not None
+                ):
+                    column_profile.null_proportion = res["unexpected_percent"] / 100
+            elif exp == "expect_column_values_to_not_match_regex":
+                pass
+            elif exp == "expect_column_mean_to_be_between":
+                column_profile.mean = str(res["observed_value"])
+            elif exp == "expect_column_min_to_be_between":
+                column_profile.min = str(res["observed_value"])
+            elif exp == "expect_column_max_to_be_between":
+                column_profile.max = str(res["observed_value"])
+            elif exp == "expect_column_median_to_be_between":
+                column_profile.median = str(res["observed_value"])
+            elif exp == "expect_column_stdev_to_be_between":
+                column_profile.stddev = str(res["observed_value"])
+            elif exp == "expect_column_quantile_values_to_be_between":
+                if "observed_value" in res:
+                    column_profile.quantiles = [
+                        Quantile(quantile=str(quantile), value=str(value))
+                        for quantile, value in zip(
+                            res["observed_value"]["quantiles"],
+                            res["observed_value"]["values"],
+                        )
+                    ]
+            elif exp == "expect_column_values_to_be_in_set":
+                column_profile.sample_values = [
+                    str(v) for v in res["partial_unexpected_list"]
+                ]
+            elif exp == "expect_column_kl_divergence_to_be_less_than":
+                if "details" in res and "observed_partition" in res["details"]:
+                    partition = res["details"]["observed_partition"]
+                    # Pydantic models accept keyword arguments only, and the
+                    # histogram field is declared List[Histogram], so build a
+                    # one-element list.
+                    column_profile.histogram = [Histogram(
+                        boundaries=[str(v) for v in partition["bins"]],
+                        heights=[
+                            partition["tail_weights"][0],
+                            *partition["weights"],
+                            partition["tail_weights"][1],
+                        ],
+                    )]
+            elif exp == "expect_column_distinct_values_to_be_in_set":
+                if "details" in res and "value_counts" in res["details"]:
+                    column_profile.distinct_value_frequencies = [
+                        ValueFrequency(value=str(value), frequency=count)
+                        for value, count in res["details"]["value_counts"].items()
+                    ]
+            elif exp == "expect_column_values_to_be_in_type_list":
+                pass
+            elif exp == "expect_column_values_to_be_unique":
+                pass
+            else:
+                self.status.warning(
+                    f"profile of {pretty_name}",
+                    f"warning: unknown column mapper {exp} in col {column}",
+                )
+        return column_profile
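
Reviewer note: a minimal standalone sketch of driving the new DataProfiler outside the ingestion workflow. The connection URL and table names are invented, and it assumes a plain SourceStatus can stand in for the status subclass that SQLSource passes in:

from sqlalchemy import create_engine, inspect

from metadata.ingestion.api.source import SourceStatus
from metadata.utils.dataprofiler import DataProfiler

engine = create_engine(
    "mysql+pymysql://openmetadata_user:openmetadata_password@localhost:3306/openmetadata_db"
)
inspector = inspect(engine)

# Mirrors SQLSource._get_profiler_instance: the profiler wraps the live engine.
profiler = DataProfiler(conn=inspector.bind, status=SourceStatus())

profile = profiler.generate_profile(
    pretty_name="openmetadata_db.some_table",
    schema="openmetadata_db",
    table="some_table",
    limit=50000,
    offset=0,
)
print(profile.json())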