mirror of
https://github.com/datahub-project/datahub.git
synced 2025-11-03 04:10:43 +00:00
fix(ingest): profiling - disable expensive profilers by default (#3759)
This commit is contained in:
parent
926b6eed4e
commit
599edd22ae
@ -69,30 +69,29 @@ sink:
|
||||
|
||||
Note that a `.` is used to denote nested fields in the YAML recipe.
|
||||
|
||||
| Field | Required | Default | Description |
|
||||
| --------------------------------------------------- | -------- | --------------------------- | ------------------------------------------------------------------------------------ |
|
||||
| `profiling.enabled` | | `False` | Whether profiling should be done. |
|
||||
| `profiling.limit` | | | Max number of documents to profile. By default, profiles all documents. |
|
||||
| `profiling.offset` | | | Offset in documents to profile. By default, uses no offset. |
|
||||
| Field | Required | Default | Description |
|
||||
| --------------------------------------------------- | -------- |----------------------| ------------------------------------------------------------------------------------ |
|
||||
| `profiling.enabled` | | `False` | Whether profiling should be done. |
|
||||
| `profiling.limit` | | | Max number of documents to profile. By default, profiles all documents. |
|
||||
| `profiling.offset` | | | Offset in documents to profile. By default, uses no offset. |
|
||||
| `profiling.max_workers` | | `5 * os.cpu_count()` | Number of worker threads to use for profiling. Set to 1 to disable. |
|
||||
| `profiling.query_combiner_enabled` | | `True` | *This feature is still experimental and can be disabled if it causes issues.* Reduces the total number of queries issued and speeds up profiling by dynamically combining SQL queries where possible. |
|
||||
| `profile_pattern.allow` | | `*` | List of regex patterns for tables or table columns to profile. Defaults to all. |
|
||||
| `profile_pattern.deny` | | | List of regex patterns for tables or table columns to not profile. Defaults to none. |
|
||||
| `profile_pattern.ignoreCase` | | `True` | Whether to ignore case sensitivity during pattern matching. |
|
||||
| `profiling.turn_off_expensive_profiling_metrics` | | False | Whether to turn off expensive profiling or not. This turns off profiling for quantiles, distinct_value_frequencies, histogram & sample_values. This also limits maximum number of fields being profiled to 10.|
|
||||
| `profiling.max_number_of_fields_to_profile` | | `None` | A positive integer that specifies the maximum number of columns to profile for any table. `None` implies all columns. The cost of profiling goes up significantly as the number of columns to profile goes up.|
|
||||
| `profiling.profile_table_level_only` | | False | Whether to perform profiling at table-level only, or include column-level profiling as well.|
|
||||
| `profiling.include_field_null_count` | | `True` | Whether to profile for the number of nulls for each column. |
|
||||
| `profiling.include_field_min_value` | | `True` | Whether to profile for the min value of numeric columns. |
|
||||
| `profiling.include_field_max_value` | | `True` | Whether to profile for the max value of numeric columns. |
|
||||
| `profiling.include_field_mean_value` | | `True` | Whether to profile for the mean value of numeric columns. |
|
||||
| `profiling.include_field_median_value` | | `True` | Whether to profile for the median value of numeric columns. |
|
||||
| `profiling.include_field_stddev_value` | | `True` | Whether to profile for the standard deviation of numeric columns. |
|
||||
| `profiling.include_field_quantiles` | | `True` | Whether to profile for the quantiles of numeric columns. |
|
||||
| `profiling.include_field_distinct_value_frequencies` | | `True` | Whether to profile for distinct value frequencies. |
|
||||
| `profiling.include_field_histogram` | | `True` | Whether to profile for the histogram for numeric fields. |
|
||||
| `profiling.include_field_sample_values` | | `True` | Whether to profile for the sample values for all columns. |
|
||||
|
||||
| `profiling.query_combiner_enabled` | | `True` | *This feature is still experimental and can be disabled if it causes issues.* Reduces the total number of queries issued and speeds up profiling by dynamically combining SQL queries where possible. |
|
||||
| `profile_pattern.allow` | | `*` | List of regex patterns for tables or table columns to profile. Defaults to all. |
|
||||
| `profile_pattern.deny` | | | List of regex patterns for tables or table columns to not profile. Defaults to none. |
|
||||
| `profile_pattern.ignoreCase` | | `True` | Whether to ignore case sensitivity during pattern matching. |
|
||||
| `profiling.turn_off_expensive_profiling_metrics` | | `False` | Whether to turn off expensive profiling metrics. This turns off profiling for quantiles, distinct_value_frequencies, histogram & sample_values. This also limits the maximum number of fields being profiled to 10. |
|
||||
| `profiling.max_number_of_fields_to_profile` | | `None` | A positive integer that specifies the maximum number of columns to profile for any table. `None` implies all columns. The cost of profiling goes up significantly as the number of columns to profile goes up.|
|
||||
| `profiling.profile_table_level_only` | | `False` | Whether to perform profiling at table-level only, or include column-level profiling as well. |
|
||||
| `profiling.include_field_null_count` | | `True` | Whether to profile for the number of nulls for each column. |
|
||||
| `profiling.include_field_min_value` | | `True` | Whether to profile for the min value of numeric columns. |
|
||||
| `profiling.include_field_max_value` | | `True` | Whether to profile for the max value of numeric columns. |
|
||||
| `profiling.include_field_mean_value` | | `True` | Whether to profile for the mean value of numeric columns. |
|
||||
| `profiling.include_field_median_value` | | `True` | Whether to profile for the median value of numeric columns. |
|
||||
| `profiling.include_field_stddev_value` | | `True` | Whether to profile for the standard deviation of numeric columns. |
|
||||
| `profiling.include_field_quantiles` | | `False` | Whether to profile for the quantiles of numeric columns. |
|
||||
| `profiling.include_field_distinct_value_frequencies` | | `False` | Whether to profile for distinct value frequencies. |
|
||||
| `profiling.include_field_histogram` | | `False` | Whether to profile for the histogram for numeric fields. |
|
||||
| `profiling.include_field_sample_values` | | `True` | Whether to profile for the sample values for all columns. |
|
||||
## Compatibility
|
||||
|
||||
Coming soon!
|
||||
|
||||
@ -112,9 +112,9 @@ class GEProfilingConfig(ConfigModel):
|
||||
include_field_mean_value: bool = True
|
||||
include_field_median_value: bool = True
|
||||
include_field_stddev_value: bool = True
|
||||
include_field_quantiles: bool = True
|
||||
include_field_distinct_value_frequencies: bool = True
|
||||
include_field_histogram: bool = True
|
||||
include_field_quantiles: bool = False
|
||||
include_field_distinct_value_frequencies: bool = False
|
||||
include_field_histogram: bool = False
|
||||
include_field_sample_values: bool = True
|
||||
|
||||
allow_deny_patterns: AllowDenyPattern = AllowDenyPattern.allow_all()
|
||||
|
||||
@ -20,7 +20,16 @@ source:
|
||||
- "^test_cases"
|
||||
profiling:
|
||||
enabled: True
|
||||
|
||||
include_field_null_count: true
|
||||
include_field_min_value: true
|
||||
include_field_max_value: true
|
||||
include_field_mean_value: true
|
||||
include_field_median_value: true
|
||||
include_field_stddev_value: true
|
||||
include_field_quantiles: true
|
||||
include_field_distinct_value_frequencies: true
|
||||
include_field_histogram: true
|
||||
include_field_sample_values: true
|
||||
sink:
|
||||
type: file
|
||||
config:
|
||||
|
||||
@ -20,6 +20,16 @@ source:
|
||||
- "library_catalog.librarydb.*"
|
||||
profiling:
|
||||
enabled: True
|
||||
include_field_null_count: true
|
||||
include_field_min_value: true
|
||||
include_field_max_value: true
|
||||
include_field_mean_value: true
|
||||
include_field_median_value: true
|
||||
include_field_stddev_value: true
|
||||
include_field_quantiles: true
|
||||
include_field_distinct_value_frequencies: true
|
||||
include_field_histogram: true
|
||||
include_field_sample_values: true
|
||||
|
||||
sink:
|
||||
type: file
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user