Mirror of https://github.com/datahub-project/datahub.git, synced 2025-10-07 15:05:08 +00:00

feat(ingest): user configurable dataset profiling. (#3453)

This commit is contained in:
parent 4b6860efd7
commit a1bf95307b
setup.py:

@@ -58,6 +58,7 @@ kafka_common = {
 sql_common = {
     # Required for all SQL sources.
     "sqlalchemy==1.3.24",
+    "great-expectations",
 }

 aws_common = {

@@ -122,7 +123,6 @@ plugins: Dict[str, Set[str]] = {
     "snowflake": sql_common | {"snowflake-sqlalchemy<=1.2.4"},
     "snowflake-usage": sql_common | {"snowflake-sqlalchemy<=1.2.4"},
     "sqlalchemy": sql_common,
-    "sql-profiles": sql_common | {"great-expectations"},
     "superset": {"requests"},
     "trino": sql_common
     | {

@@ -233,7 +233,6 @@ full_test_dev_requirements = {
             "mysql",
             "mariadb",
             "snowflake",
-            "sql-profiles",
             "redash",
         ]
         for dependency in plugins[plugin]
SQL profiles documentation:

@@ -4,7 +4,8 @@ For context on getting started with ingestion, check out our [metadata ingestion

 ## Setup

-To install this plugin, run `pip install 'acryl-datahub[sql-profiles]'`.
+To install this plugin, run `pip install 'acryl-datahub[sql-profiles]'` (prior to datahub version `0.8.16.0`).
+In versions after `0.8.16.0`, it is installed along with the SQL-based source itself.

 The SQL-based profiler does not run on its own; rather, it can be enabled for other SQL-based sources.
 Enabling profiling will slow down ingestion runs.

@@ -68,16 +69,28 @@ sink:

 Note that a `.` is used to denote nested fields in the YAML recipe.

-| Field                        | Required | Default            | Description                                                              |
-| ---------------------------- | -------- | ------------------ | ------------------------------------------------------------------------ |
-| `profiling.enabled`          |          | `False`            | Whether profiling should be done.                                        |
-| `profiling.limit`            |          |                    | Max number of documents to profile. By default, profiles all documents.  |
-| `profiling.offset`           |          |                    | Offset in documents to profile. By default, uses no offset.              |
-| `profiling.max_workers`      |          | `5*os.cpu_count()` | Number of worker threads to use for profiling. Set to 1 to disable.      |
-| `profile_pattern.allow`      |          |                    | List of regex patterns for tables to profile.                            |
-| `profile_pattern.deny`       |          |                    | List of regex patterns for tables to not profile.                        |
-| `profile_pattern.ignoreCase` |          | `True`             | Whether to ignore case during pattern matching.                          |
-| `profile.send_sample_values` |          | `True`             | Whether to send sample values or not.                                    |
+| Field | Required | Default | Description |
+| ----- | -------- | ------- | ----------- |
+| `profiling.enabled` | | `False` | Whether profiling should be done. |
+| `profiling.limit` | | | Max number of documents to profile. By default, profiles all documents. |
+| `profiling.offset` | | | Offset in documents to profile. By default, uses no offset. |
+| `profiling.max_workers` | | `5 * (os.cpu_count() or 4)` | Number of worker threads to use for profiling. Set to 1 to disable. |
+| `profile_pattern.allow` | | `*` | List of regex patterns for tables or table columns to profile. Defaults to all. |
+| `profile_pattern.deny` | | | List of regex patterns for tables or table columns to not profile. Defaults to none. |
+| `profile_pattern.ignoreCase` | | `True` | Whether to ignore case during pattern matching. |
+| `profile.turn_off_expensive_profiling_metrics` | | `False` | Whether to turn off expensive profiling metrics. This disables profiling of quantiles, distinct_value_frequencies, histogram, and sample_values, and caps the number of profiled fields at 10. |
+| `profile.max_number_of_fields_to_profile` | | `None` | A positive integer specifying the maximum number of columns to profile for any table. `None` implies all columns. The cost of profiling goes up significantly as the number of columns grows. |
+| `profile.profile_table_level_only` | | `False` | Whether to perform profiling at table level only, or to include column-level profiling as well. |
+| `profile.include_field_null_count` | | `True` | Whether to profile the number of nulls in each column. |
+| `profile.include_field_min_value` | | `True` | Whether to profile the min value of numeric columns. |
+| `profile.include_field_max_value` | | `True` | Whether to profile the max value of numeric columns. |
+| `profile.include_field_mean_value` | | `True` | Whether to profile the mean value of numeric columns. |
+| `profile.include_field_median_value` | | `True` | Whether to profile the median value of numeric columns. |
+| `profile.include_field_stddev_value` | | `True` | Whether to profile the standard deviation of numeric columns. |
+| `profile.include_field_quantiles` | | `True` | Whether to profile the quantiles of numeric columns. |
+| `profile.include_field_distinct_value_frequencies` | | `True` | Whether to profile distinct value frequencies. |
+| `profile.include_field_histogram` | | `True` | Whether to profile the histogram of numeric fields. |
+| `profile.include_field_sample_values` | | `True` | Whether to profile sample values for all columns. |
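A recipe fragment exercising the new options could look like the following sketch; the `postgres` source type and connection values here are illustrative assumptions, not part of this change:

```yml
source:
  type: postgres
  config:
    host_port: localhost:5432
    database: my_db
    profiling:
      enabled: true
      max_number_of_fields_to_profile: 20
      include_field_sample_values: false
    profile_pattern:
      allow:
        - "my_db\\.public\\..*"
sink:
  type: datahub-rest
  config:
    server: http://localhost:8080
```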

## Compatibility
datahub/ingestion/source/datahub_custom_ge_profiler.py (new file):

@@ -0,0 +1,278 @@
import logging
from typing import Any, Dict

from great_expectations.core.expectation_suite import ExpectationSuite
from great_expectations.dataset.dataset import Dataset
from great_expectations.profile.base import ProfilerCardinality, ProfilerDataType
from great_expectations.profile.basic_dataset_profiler import BasicDatasetProfilerBase

logger = logging.getLogger(__name__)

class DatahubGECustomProfiler(BasicDatasetProfilerBase):
    """DatahubGECustomProfiler is a customizable version of the BasicDatasetProfiler.

    The reason for going this route instead of using the UserConfigurableProfiler is that
    the UserConfigurableProfiler does not compute all of the expectations that the
    BasicDatasetProfiler does, such as the sample values.
    """

    # flake8: noqa: C901
    @classmethod
    def _profile(
        cls,
        dataset: Dataset,
        configuration: Dict[str, Any],
    ) -> ExpectationSuite:
        columns_to_profile = configuration["columns_to_profile"]
        excluded_expectations = configuration["excluded_expectations"]

        df = dataset

        df.set_default_expectation_argument("catch_exceptions", True)

        df.expect_table_row_count_to_be_between(min_value=0, max_value=None)
        df.expect_table_columns_to_match_ordered_list(None)
        df.set_config_value("interactive_evaluation", False)

        meta_columns = {}
        for column in columns_to_profile:
            meta_columns[column] = {"description": ""}

        number_of_columns = len(columns_to_profile)
        for i, column in enumerate(columns_to_profile):
            logger.info(
                " Preparing column {} of {}: {}".format(
                    i + 1, number_of_columns, column
                )
            )

            type_ = cls._get_column_type(df, column)
            cardinality = cls._get_column_cardinality(df, column)
            if "expect_column_values_to_not_be_null" not in excluded_expectations:
                df.expect_column_values_to_not_be_null(column, mostly=0.5)

            if "expect_column_values_to_be_in_set" not in excluded_expectations:
                df.expect_column_values_to_be_in_set(
                    column, [], result_format="SUMMARY"
                )

            if type_ == ProfilerDataType.INT:
                if cardinality == ProfilerCardinality.UNIQUE:
                    df.expect_column_values_to_be_unique(column)
                elif cardinality in [
                    ProfilerCardinality.ONE,
                    ProfilerCardinality.TWO,
                    ProfilerCardinality.VERY_FEW,
                    ProfilerCardinality.FEW,
                ] and (
                    "expect_column_distinct_values_to_be_in_set"
                    not in excluded_expectations
                ):
                    df.expect_column_distinct_values_to_be_in_set(
                        column, value_set=None, result_format="SUMMARY"
                    )
                elif cardinality in [
                    ProfilerCardinality.MANY,
                    ProfilerCardinality.VERY_MANY,
                    ProfilerCardinality.UNIQUE,
                ]:
                    if "expect_column_min_to_be_between" not in excluded_expectations:
                        df.expect_column_min_to_be_between(
                            column, min_value=None, max_value=None
                        )
                    if "expect_column_max_to_be_between" not in excluded_expectations:
                        df.expect_column_max_to_be_between(
                            column, min_value=None, max_value=None
                        )
                    if "expect_column_mean_to_be_between" not in excluded_expectations:
                        df.expect_column_mean_to_be_between(
                            column, min_value=None, max_value=None
                        )
                    if (
                        "expect_column_median_to_be_between"
                        not in excluded_expectations
                    ):
                        df.expect_column_median_to_be_between(
                            column, min_value=None, max_value=None
                        )
                    if "expect_column_stdev_to_be_between" not in excluded_expectations:
                        df.expect_column_stdev_to_be_between(
                            column, min_value=None, max_value=None
                        )
                    if (
                        "expect_column_quantile_values_to_be_between"
                        not in excluded_expectations
                    ):
                        df.expect_column_quantile_values_to_be_between(
                            column,
                            quantile_ranges={
                                "quantiles": [0.05, 0.25, 0.5, 0.75, 0.95],
                                "value_ranges": [
                                    [None, None],
                                    [None, None],
                                    [None, None],
                                    [None, None],
                                    [None, None],
                                ],
                            },
                        )
                    if (
                        "expect_column_kl_divergence_to_be_less_than"
                        not in excluded_expectations
                    ):
                        df.expect_column_kl_divergence_to_be_less_than(
                            column,
                            partition_object=None,
                            threshold=None,
                            result_format="COMPLETE",
                        )
                else:  # unknown cardinality - skip
                    pass
            elif type_ == ProfilerDataType.FLOAT:
                if cardinality == ProfilerCardinality.UNIQUE:
                    df.expect_column_values_to_be_unique(column)

                elif cardinality in [
                    ProfilerCardinality.ONE,
                    ProfilerCardinality.TWO,
                    ProfilerCardinality.VERY_FEW,
                    ProfilerCardinality.FEW,
                ] and (
                    "expect_column_distinct_values_to_be_in_set"
                    not in excluded_expectations
                ):
                    df.expect_column_distinct_values_to_be_in_set(
                        column, value_set=None, result_format="SUMMARY"
                    )

                elif cardinality in [
                    ProfilerCardinality.MANY,
                    ProfilerCardinality.VERY_MANY,
                    ProfilerCardinality.UNIQUE,
                ]:
                    if "expect_column_min_to_be_between" not in excluded_expectations:
                        df.expect_column_min_to_be_between(
                            column, min_value=None, max_value=None
                        )
                    if "expect_column_max_to_be_between" not in excluded_expectations:
                        df.expect_column_max_to_be_between(
                            column, min_value=None, max_value=None
                        )
                    if "expect_column_mean_to_be_between" not in excluded_expectations:
                        df.expect_column_mean_to_be_between(
                            column, min_value=None, max_value=None
                        )
                    if (
                        "expect_column_median_to_be_between"
                        not in excluded_expectations
                    ):
                        df.expect_column_median_to_be_between(
                            column, min_value=None, max_value=None
                        )
                    if (
                        "expect_column_quantile_values_to_be_between"
                        not in excluded_expectations
                    ):
                        df.expect_column_quantile_values_to_be_between(
                            column,
                            quantile_ranges={
                                "quantiles": [0.05, 0.25, 0.5, 0.75, 0.95],
                                "value_ranges": [
                                    [None, None],
                                    [None, None],
                                    [None, None],
                                    [None, None],
                                    [None, None],
                                ],
                            },
                        )
                    if (
                        "expect_column_kl_divergence_to_be_less_than"
                        not in excluded_expectations
                    ):
                        df.expect_column_kl_divergence_to_be_less_than(
                            column,
                            partition_object=None,
                            threshold=None,
                            result_format="COMPLETE",
                        )
                else:  # unknown cardinality - skip
                    pass

            elif type_ == ProfilerDataType.STRING:
                # Check for leading and trailing whitespace.
                df.expect_column_values_to_not_match_regex(column, r"^\s+|\s+$")

                if cardinality == ProfilerCardinality.UNIQUE:
                    df.expect_column_values_to_be_unique(column)

                elif cardinality in [
                    ProfilerCardinality.ONE,
                    ProfilerCardinality.TWO,
                    ProfilerCardinality.VERY_FEW,
                    ProfilerCardinality.FEW,
                ] and (
                    "expect_column_distinct_values_to_be_in_set"
                    not in excluded_expectations
                ):
                    df.expect_column_distinct_values_to_be_in_set(
                        column, value_set=None, result_format="SUMMARY"
                    )
                else:
                    pass

            elif type_ == ProfilerDataType.DATETIME:
                if "expect_column_min_to_be_between" not in excluded_expectations:
                    df.expect_column_min_to_be_between(
                        column, min_value=None, max_value=None
                    )

                if "expect_column_max_to_be_between" not in excluded_expectations:
                    df.expect_column_max_to_be_between(
                        column, min_value=None, max_value=None
                    )

                # Re-add once kl_divergence has been modified to support datetimes
                # df.expect_column_kl_divergence_to_be_less_than(column, partition_object=None,
                #                                                threshold=None, result_format='COMPLETE')

                if cardinality in [
                    ProfilerCardinality.ONE,
                    ProfilerCardinality.TWO,
                    ProfilerCardinality.VERY_FEW,
                    ProfilerCardinality.FEW,
                ] and (
                    "expect_column_distinct_values_to_be_in_set"
                    not in excluded_expectations
                ):
                    df.expect_column_distinct_values_to_be_in_set(
                        column, value_set=None, result_format="SUMMARY"
                    )

            else:
                if cardinality == ProfilerCardinality.UNIQUE:
                    df.expect_column_values_to_be_unique(column)

                elif cardinality in [
                    ProfilerCardinality.ONE,
                    ProfilerCardinality.TWO,
                    ProfilerCardinality.VERY_FEW,
                    ProfilerCardinality.FEW,
                ] and (
                    "expect_column_distinct_values_to_be_in_set"
                    not in excluded_expectations
                ):
                    df.expect_column_distinct_values_to_be_in_set(
                        column, value_set=None, result_format="SUMMARY"
                    )
                else:
                    pass

        df.set_config_value("interactive_evaluation", True)
        expectation_suite = df.get_expectation_suite(
            suppress_warnings=True, discard_failed_expectations=False
        )
        expectation_suite.meta["columns"] = meta_columns

        return expectation_suite
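To make the expected `configuration` shape concrete, a minimal sketch of driving this profiler directly against an in-memory pandas dataset follows; the data and column names are illustrative, and in this commit the class is normally invoked via `DatahubConfigurableProfiler` in ge_data_profiler.py:

```python
import pandas as pd
from great_expectations.dataset.pandas_dataset import PandasDataset

# Wrap a small DataFrame in GE's v2-style Dataset interface.
ds = PandasDataset(pd.DataFrame({"id": [1, 2, 3], "name": ["a", "b", None]}))
suite = DatahubGECustomProfiler._profile(
    ds,
    configuration={
        "columns_to_profile": ["id", "name"],
        # Example of excluding a specific expectation family.
        "excluded_expectations": ["expect_column_values_to_not_be_null"],
    },
)
print(len(suite.expectations))
```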
datahub/ingestion/source/ge_data_profiler.py:

@@ -1,13 +1,17 @@
 import concurrent.futures
 import contextlib
 import dataclasses
+import itertools
 import logging
+import os
 import threading
 import time
 import unittest.mock
 import uuid
-from typing import Any, Iterable, Iterator, List, Optional, Tuple, Union
+from typing import Any, Dict, Iterable, Iterator, List, Optional, Tuple, Union

+import pydantic
+from great_expectations.core import ExpectationSuite
 from great_expectations.core.expectation_validation_result import (
     ExpectationSuiteValidationResult,
     ExpectationValidationResult,

@@ -19,11 +23,15 @@ from great_expectations.data_context.types.base import (
     InMemoryStoreBackendDefaults,
     datasourceConfigSchema,
 )
+from great_expectations.dataset.dataset import Dataset
 from great_expectations.datasource.sqlalchemy_datasource import SqlAlchemyDatasource
+from great_expectations.profile.base import DatasetProfiler
 from sqlalchemy.engine import Connection, Engine

 from datahub.configuration.common import AllowDenyPattern, ConfigModel
 from datahub.emitter.mce_builder import get_sys_time
 from datahub.ingestion.api.source import SourceReport
+from datahub.ingestion.source.datahub_custom_ge_profiler import DatahubGECustomProfiler
+from datahub.ingestion.source.sql.sql_common import SQLSourceReport
 from datahub.metadata.schema_classes import (
     DatasetFieldProfileClass,
     DatasetProfileClass,
@@ -77,7 +85,186 @@ class GEProfilerRequest:
     pretty_name: str
     batch_kwargs: dict

-    send_sample_values: bool
+
+class GEProfilingConfig(ConfigModel):
+    enabled: bool = False
+    limit: Optional[int] = None
+    offset: Optional[int] = None
+    turn_off_expensive_profiling_metrics: bool = False
+    profile_table_level_only: bool = False
+    include_field_null_count: bool = True
+    include_field_min_value: bool = True
+    include_field_max_value: bool = True
+    include_field_mean_value: bool = True
+    include_field_median_value: bool = True
+    include_field_stddev_value: bool = True
+    include_field_quantiles: bool = True
+    include_field_distinct_value_frequencies: bool = True
+    include_field_histogram: bool = True
+    include_field_sample_values: bool = True
+    allow_deny_patterns: AllowDenyPattern = AllowDenyPattern.allow_all()
+    max_number_of_fields_to_profile: Optional[pydantic.PositiveInt] = None
+
+    # The default of (5 * cpu_count) is adopted from the default max_workers
+    # parameter of ThreadPoolExecutor. Given that profiling is often an I/O-bound
+    # task, it may make sense to increase this default value in the future.
+    # https://docs.python.org/3/library/concurrent.futures.html#concurrent.futures.ThreadPoolExecutor
+    max_workers: int = 5 * (os.cpu_count() or 4)
+
+    @pydantic.root_validator()
+    def ensure_field_level_settings_are_normalized(
+        cls: "GEProfilingConfig", values: Dict[str, Any]
+    ) -> Dict[str, Any]:
+        max_num_fields_to_profile_key = "max_number_of_fields_to_profile"
+        table_level_profiling_only_key = "profile_table_level_only"
+        max_num_fields_to_profile = values.get(max_num_fields_to_profile_key)
+        if values.get(table_level_profiling_only_key):
+            all_field_level_metrics: List[str] = [
+                "include_field_null_count",
+                "include_field_min_value",
+                "include_field_max_value",
+                "include_field_mean_value",
+                "include_field_median_value",
+                "include_field_stddev_value",
+                "include_field_quantiles",
+                "include_field_distinct_value_frequencies",
+                "include_field_histogram",
+                "include_field_sample_values",
+            ]
+            # Suppress all field-level metrics.
+            for field_level_metric in all_field_level_metrics:
+                values[field_level_metric] = False
+            assert (
+                max_num_fields_to_profile is None
+            ), f"{max_num_fields_to_profile_key} should be set to None"
+
+        if values.get("turn_off_expensive_profiling_metrics"):
+            if not values.get(table_level_profiling_only_key):
+                expensive_field_level_metrics: List[str] = [
+                    "include_field_quantiles",
+                    "include_field_distinct_value_frequencies",
+                    "include_field_histogram",
+                    "include_field_sample_values",
+                ]
+                for expensive_field_metric in expensive_field_level_metrics:
+                    values[expensive_field_metric] = False
+            if max_num_fields_to_profile is None:
+                # We currently profile up to 10 non-filtered columns in this mode by default.
+                values[max_num_fields_to_profile_key] = 10
+
+        return values
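As a quick illustration of the normalization above (a sketch; it assumes `GEProfilingConfig` from this module is in scope):

```python
cfg = GEProfilingConfig(
    enabled=True,
    turn_off_expensive_profiling_metrics=True,
)
# The root validator has suppressed the expensive field-level metrics...
assert cfg.include_field_quantiles is False
assert cfg.include_field_histogram is False
assert cfg.include_field_sample_values is False
# ...and capped the number of profiled fields at 10.
assert cfg.max_number_of_fields_to_profile == 10
```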
+
+
+class DatahubConfigurableProfiler(DatasetProfiler):
+    """
+    DatahubConfigurableProfiler is a wrapper on top of DatahubGECustomProfiler that essentially
+    translates the GEProfilingConfig into the GE profiler's interface and delegates the actual
+    profiling to DatahubGECustomProfiler. Column filtering based on our allow/deny patterns
+    requires us to intercept the _profile call and compute the list of columns to profile.
+    """
+
+    @staticmethod
+    def _get_excluded_expectations(config: GEProfilingConfig) -> List[str]:
+        # Compute excluded expectations
+        excluded_expectations: List[str] = []
+        if not config.include_field_null_count:
+            excluded_expectations.append("expect_column_values_to_not_be_null")
+        if not config.include_field_min_value:
+            excluded_expectations.append("expect_column_min_to_be_between")
+        if not config.include_field_max_value:
+            excluded_expectations.append("expect_column_max_to_be_between")
+        if not config.include_field_mean_value:
+            excluded_expectations.append("expect_column_mean_to_be_between")
+        if not config.include_field_median_value:
+            excluded_expectations.append("expect_column_median_to_be_between")
+        if not config.include_field_stddev_value:
+            excluded_expectations.append("expect_column_stdev_to_be_between")
+        if not config.include_field_quantiles:
+            excluded_expectations.append("expect_column_quantile_values_to_be_between")
+        if not config.include_field_distinct_value_frequencies:
+            excluded_expectations.append("expect_column_distinct_values_to_be_in_set")
+        if not config.include_field_histogram:
+            excluded_expectations.append("expect_column_kl_divergence_to_be_less_than")
+        if not config.include_field_sample_values:
+            excluded_expectations.append("expect_column_values_to_be_in_set")
+        return excluded_expectations
+
+    @staticmethod
+    def _get_columns_to_profile(
+        dataset: Dataset,
+        dataset_name: str,
+        config: GEProfilingConfig,
+        report: SQLSourceReport,
+    ) -> List[str]:
+        if config.profile_table_level_only:
+            return []
+
+        # Compute columns to profile
+        columns_to_profile: List[str] = []
+        # Compute ignored columns
+        ignored_columns: List[str] = []
+        for col in dataset.get_table_columns():
+            # We expect the allow/deny patterns to specify '<table_pattern>.<column_pattern>'
+            if not config.allow_deny_patterns.allowed(f"{dataset_name}.{col}"):
+                ignored_columns.append(col)
+            else:
+                columns_to_profile.append(col)
+        if ignored_columns:
+            report.report_dropped(
+                f"The profile of columns by pattern {dataset_name}({', '.join(sorted(ignored_columns))})"
+            )
+
+        if config.max_number_of_fields_to_profile is not None:
+            columns_being_dropped: List[str] = list(
+                itertools.islice(
+                    columns_to_profile, config.max_number_of_fields_to_profile, None
+                )
+            )
+            columns_to_profile = list(
+                itertools.islice(
+                    columns_to_profile, config.max_number_of_fields_to_profile
+                )
+            )
+            if columns_being_dropped:
+                report.report_dropped(
+                    f"The max_number_of_fields_to_profile={config.max_number_of_fields_to_profile} reached. Profile of columns {dataset_name}({', '.join(sorted(columns_being_dropped))})"
+                )
+        return columns_to_profile
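The `'<table_pattern>.<column_pattern>'` convention above means a single pattern can target specific columns. A sketch with illustrative table and column names:

```python
from datahub.configuration.common import AllowDenyPattern

pattern = AllowDenyPattern(deny=["my_db\\.my_table\\.ssn"])
columns = ["id", "name", "ssn"]
# Mirrors _get_columns_to_profile: keep only columns whose
# "<table>.<column>" string passes the allow/deny check.
to_profile = [c for c in columns if pattern.allowed(f"my_db.my_table.{c}")]
assert to_profile == ["id", "name"]
```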
+
+    @staticmethod
+    def datahub_config_to_ge_config(
+        dataset: Dataset,
+        dataset_name: str,
+        config: GEProfilingConfig,
+        report: SQLSourceReport,
+    ) -> Dict[str, Any]:
+        excluded_expectations: List[
+            str
+        ] = DatahubConfigurableProfiler._get_excluded_expectations(config)
+        columns_to_profile: List[
+            str
+        ] = DatahubConfigurableProfiler._get_columns_to_profile(
+            dataset, dataset_name, config, report
+        )
+        return {
+            "excluded_expectations": excluded_expectations,
+            "columns_to_profile": columns_to_profile,
+        }
+
+    @classmethod
+    def _profile(
+        cls, dataset: Dataset, configuration: Dict[str, Any]
+    ) -> ExpectationSuite:
+        """
+        Override that computes the GE profiler configuration from the DataHub
+        config and delegates the actual profiling to DatahubGECustomProfiler.
+        """
+        profiler_configuration = cls.datahub_config_to_ge_config(
+            dataset,
+            configuration["dataset_name"],
+            configuration["config"],
+            configuration["report"],
+        )
+        return DatahubGECustomProfiler._profile(dataset, profiler_configuration)


 @dataclasses.dataclass
@@ -88,14 +275,21 @@ class GEContext:

 @dataclasses.dataclass
 class DatahubGEProfiler:
-    report: SourceReport
+    report: SQLSourceReport
+    config: GEProfilingConfig

     # The actual value doesn't matter, it just matters that we use it consistently throughout.
     _datasource_name_base: str = "my_sqlalchemy_datasource"

-    def __init__(self, conn: Union[Engine, Connection], report: SourceReport):
+    def __init__(
+        self,
+        conn: Union[Engine, Connection],
+        report: SQLSourceReport,
+        config: GEProfilingConfig,
+    ):
         self.base_engine = conn
         self.report = report
+        self.config = config

     @contextlib.contextmanager
     def _ge_context(self) -> Iterator[GEContext]:
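With the new constructor, a SQL source hands the profiler its report and profiling config together. A minimal sketch, assuming an in-memory SQLite engine and the names from this module:

```python
from sqlalchemy import create_engine

engine = create_engine("sqlite:///:memory:")
profiler = DatahubGEProfiler(
    conn=engine,
    report=SQLSourceReport(),
    config=GEProfilingConfig(enabled=True, max_workers=4),
)
```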
@@ -140,7 +334,7 @@ class DatahubGEProfiler:

     def generate_profiles(
         self, requests: List[GEProfilerRequest], max_workers: int
-    ) -> Iterable[Tuple[GEProfilerRequest, DatasetProfileClass]]:
+    ) -> Iterable[Tuple[GEProfilerRequest, Optional[DatasetProfileClass]]]:
         start_time = time.perf_counter()

         max_workers = min(max_workers, len(requests))

@@ -168,11 +362,10 @@ class DatahubGEProfiler:

     def generate_profile_from_request(
         self, request: GEProfilerRequest
-    ) -> Tuple[GEProfilerRequest, DatasetProfileClass]:
+    ) -> Tuple[GEProfilerRequest, Optional[DatasetProfileClass]]:
         return request, self.generate_profile(
             request.pretty_name,
             **request.batch_kwargs,
-            send_sample_values=request.send_sample_values,
         )

     def generate_profile(

@@ -180,11 +373,8 @@ class DatahubGEProfiler:
         pretty_name: str,
         schema: str = None,
         table: str = None,
-        limit: int = None,
-        offset: int = None,
-        send_sample_values: bool = True,
         **kwargs: Any,
-    ) -> DatasetProfileClass:
+    ) -> Optional[DatasetProfileClass]:
         with self._ge_context() as ge_context:
             logger.info(f"Profiling {pretty_name} (this may take a while)")

@@ -193,15 +383,17 @@ class DatahubGEProfiler:
             {
                 "schema": schema,
                 "table": table,
-                "limit": limit,
-                "offset": offset,
+                "limit": self.config.limit,
+                "offset": self.config.offset,
                 **kwargs,
             },
             pretty_name=pretty_name,
         )

-        profile = self._convert_evrs_to_profile(
-            evrs, pretty_name=pretty_name, send_sample_values=send_sample_values
-        )
+        profile = (
+            self._convert_evrs_to_profile(evrs, pretty_name=pretty_name)
+            if evrs is not None
+            else None
+        )
         logger.debug(f"Finished profiling {pretty_name}")
@@ -213,21 +405,36 @@ class DatahubGEProfiler:
         batch_kwargs: dict,
         pretty_name: str,
     ) -> ExpectationSuiteValidationResult:
-        # Internally, this uses the GE dataset profiler:
-        # great_expectations.profile.basic_dataset_profiler.BasicDatasetProfiler
-        profile_results = ge_context.data_context.profile_data_asset(
-            ge_context.datasource_name,
-            batch_kwargs={
-                "datasource": ge_context.datasource_name,
-                **batch_kwargs,
-            },
-        )
-        assert profile_results["success"]
-        assert len(profile_results["results"]) == 1
-        _suite, evrs = profile_results["results"][0]
-        return evrs
+        try:
+            start_time = time.perf_counter()
+            profile_results = ge_context.data_context.profile_data_asset(
+                ge_context.datasource_name,
+                profiler=DatahubConfigurableProfiler,
+                profiler_configuration={
+                    "config": self.config,
+                    "dataset_name": pretty_name,
+                    "report": self.report,
+                },
+                batch_kwargs={
+                    "datasource": ge_context.datasource_name,
+                    **batch_kwargs,
+                },
+            )
+            end_time = time.perf_counter()
+            logger.info(
+                f"Profiling for {pretty_name} took {end_time - start_time} seconds."
+            )
+
+            assert profile_results["success"]
+            assert len(profile_results["results"]) == 1
+            _suite, evrs = profile_results["results"][0]
+            return evrs
+        except Exception as e:
+            logger.warning(
+                f"Encountered exception {e}\nwhile profiling {pretty_name}, {batch_kwargs}"
+            )
+            self.report.report_warning(pretty_name, f"Exception {e}")
+            return None

     @staticmethod
     def _get_column_from_evr(evr: ExpectationValidationResult) -> Optional[str]:
@@ -242,7 +449,6 @@ class DatahubGEProfiler:
         self,
         evrs: ExpectationSuiteValidationResult,
         pretty_name: str,
-        send_sample_values: bool,
     ) -> DatasetProfileClass:
         profile = DatasetProfileClass(timestampMillis=get_sys_time())

@@ -259,7 +465,6 @@ class DatahubGEProfiler:
                 col,
                 evrs_for_col,
                 pretty_name=pretty_name,
-                send_sample_values=send_sample_values,
             )

         return profile

@@ -291,7 +496,6 @@ class DatahubGEProfiler:
         column: str,
         col_evrs: Iterable[ExpectationValidationResult],
         pretty_name: str,
-        send_sample_values: bool,
     ) -> None:
         # TRICKY: This method mutates the profile directly.

@@ -346,8 +550,6 @@ class DatahubGEProfiler:
             column_profile.sampleValues = [
                 str(v) for v in res["partial_unexpected_list"]
             ]
-            if not send_sample_values:
-                column_profile.sampleValues = []
         elif exp == "expect_column_kl_divergence_to_be_less_than":
             if "details" in res and "observed_partition" in res["details"]:
                 partition = res["details"]["observed_partition"]

@@ -368,8 +570,6 @@ class DatahubGEProfiler:
                 ValueFrequencyClass(value=str(value), frequency=count)
                 for value, count in res["details"]["value_counts"].items()
             ]
-            if not send_sample_values:
-                column_profile.distinctValueFrequencies = []
         elif exp == "expect_column_values_to_be_in_type_list":
             # ignore; we already know the types for each column via ingestion
             pass
BigQuery source:

@@ -273,8 +273,6 @@ class BigQuerySource(SQLAlchemySource):
         return dict(
             schema=self.config.project_id,
             table=f"{schema}.{table}",
-            limit=self.config.profiling.limit,
-            offset=self.config.profiling.offset,
         )

     @staticmethod
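The BigQuery override now returns only the batch location; limit and offset are merged in later by `DatahubGEProfiler.generate_profile` from `self.config.limit` and `self.config.offset`. Illustratively (the project id below is an assumed value):

```python
# What prepare_profiler_args now produces for a BigQuery table:
batch_kwargs = dict(
    schema="my-gcp-project",  # self.config.project_id
    table="dataset1.events",  # f"{schema}.{table}"
)
```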
datahub/ingestion/source/sql/sql_common.py:

@@ -1,5 +1,4 @@
 import logging
-import os
 from abc import abstractmethod
 from dataclasses import dataclass, field
 from typing import (

@@ -21,12 +20,7 @@ from sqlalchemy import create_engine, inspect
 from sqlalchemy.engine.reflection import Inspector
 from sqlalchemy.sql import sqltypes as types

-from datahub import __package_name__
-from datahub.configuration.common import (
-    AllowDenyPattern,
-    ConfigModel,
-    ConfigurationError,
-)
+from datahub.configuration.common import AllowDenyPattern, ConfigModel
 from datahub.emitter.mce_builder import DEFAULT_ENV
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext

@@ -144,19 +138,6 @@ class SQLSourceReport(SourceReport):
         self.filtered.append(ent_name)


-class GEProfilingConfig(ConfigModel):
-    enabled: bool = False
-    limit: Optional[int] = None
-    offset: Optional[int] = None
-    send_sample_values: bool = True
-
-    # The default of (5 * cpu_count) is adopted from the default max_workers
-    # parameter of ThreadPoolExecutor. Given that profiling is often an I/O-bound
-    # task, it may make sense to increase this default value in the future.
-    # https://docs.python.org/3/library/concurrent.futures.html#concurrent.futures.ThreadPoolExecutor
-    max_workers: int = 5 * (os.cpu_count() or 4)
-
-
 class SQLAlchemyConfig(ConfigModel):
     env: str = DEFAULT_ENV
     options: dict = {}

@@ -172,8 +153,19 @@ class SQLAlchemyConfig(ConfigModel):
     include_views: Optional[bool] = True
     include_tables: Optional[bool] = True

+    from datahub.ingestion.source.ge_data_profiler import GEProfilingConfig
+
     profiling: GEProfilingConfig = GEProfilingConfig()

+    @pydantic.root_validator()
+    def ensure_profiling_pattern_is_passed_to_profiling(
+        cls, values: Dict[str, Any]
+    ) -> Dict[str, Any]:
+        profiling = values.get("profiling")
+        if profiling is not None and profiling.enabled:
+            profiling.allow_deny_patterns = values["profile_pattern"]
+        return values
+
     @abstractmethod
     def get_sql_alchemy_url(self):
         pass

@@ -316,12 +308,6 @@ class SQLAlchemySource(Source):
         self.platform = platform
         self.report = SQLSourceReport()

-        if self.config.profiling.enabled and not self._can_run_profiler():
-            raise ConfigurationError(
-                "Table profiles requested but profiler plugin is not enabled. "
-                f"Try running: pip install '{__package_name__}[sql-profiles]'"
-            )
-
     def get_inspectors(self) -> Iterable[Inspector]:
         # This method can be overridden in the case that you want to dynamically
         # run on multiple databases.

@@ -608,20 +594,12 @@ class SQLAlchemySource(Source):
             self.report.report_workunit(wu)
             yield wu

-    def _can_run_profiler(self) -> bool:
-        try:
-            from datahub.ingestion.source.ge_data_profiler import (  # noqa: F401
-                DatahubGEProfiler,
-            )
-
-            return True
-        except Exception:
-            return False
-
     def _get_profiler_instance(self, inspector: Inspector) -> "DatahubGEProfiler":
         from datahub.ingestion.source.ge_data_profiler import DatahubGEProfiler

-        return DatahubGEProfiler(conn=inspector.bind, report=self.report)
+        return DatahubGEProfiler(
+            conn=inspector.bind, report=self.report, config=self.config.profiling
+        )

     def loop_profiler_requests(
         self,

@@ -647,7 +625,6 @@ class SQLAlchemySource(Source):
             yield GEProfilerRequest(
                 pretty_name=dataset_name,
                 batch_kwargs=self.prepare_profiler_args(schema=schema, table=table),
-                send_sample_values=self.config.profiling.send_sample_values,
             )

     def loop_profiler(

@@ -656,6 +633,8 @@ class SQLAlchemySource(Source):
         for request, profile in profiler.generate_profiles(
             profile_requests, self.config.profiling.max_workers
         ):
+            if profile is None:
+                continue
             dataset_name = request.pretty_name
             mcp = MetadataChangeProposalWrapper(
                 entityType="dataset",

@@ -672,8 +651,6 @@ class SQLAlchemySource(Source):
         return dict(
             schema=schema,
             table=table,
-            limit=self.config.profiling.limit,
-            offset=self.config.profiling.offset,
         )

     def get_report(self):