diff --git a/metadata-ingestion/source_docs/sql_profiles.md b/metadata-ingestion/source_docs/sql_profiles.md index 0640bb0213..d22db2c723 100644 --- a/metadata-ingestion/source_docs/sql_profiles.md +++ b/metadata-ingestion/source_docs/sql_profiles.md @@ -69,30 +69,29 @@ sink: Note that a `.` is used to denote nested fields in the YAML recipe. -| Field | Required | Default | Description | -| --------------------------------------------------- | -------- | --------------------------- | ------------------------------------------------------------------------------------ | -| `profiling.enabled` | | `False` | Whether profiling should be done. | -| `profiling.limit` | | | Max number of documents to profile. By default, profiles all documents. | -| `profiling.offset` | | | Offset in documents to profile. By default, uses no offset. | +| Field | Required | Default | Description | +| --------------------------------------------------- | -------- |----------------------| ------------------------------------------------------------------------------------ | +| `profiling.enabled` | | `False` | Whether profiling should be done. | +| `profiling.limit` | | | Max number of documents to profile. By default, profiles all documents. | +| `profiling.offset` | | | Offset in documents to profile. By default, uses no offset. | | `profiling.max_workers` | | `5 * os.cpu_count()` | Number of worker threads to use for profiling. Set to 1 to disable. | -| `profiling.query_combiner_enabled` | | `True` | *This feature is still experimental and can be disabled if it causes issues.* Reduces the total number of queries issued and speeds up profiling by dynamically combining SQL queries where possible. | -| `profile_pattern.allow` | | `*` | List of regex patterns for tables or table columns to profile. Defaults to all. | -| `profile_pattern.deny` | | | List of regex patterns for tables or table columns to not profile. Defaults to none. 
| -| `profile_pattern.ignoreCase` | | `True` | Whether to ignore case sensitivity during pattern matching. | -| `profiling.turn_off_expensive_profiling_metrics` | | False | Whether to turn off expensive profiling or not. This turns off profiling for quantiles, distinct_value_frequencies, histogram & sample_values. This also limits maximum number of fields being profiled to 10.| -| `profiling.max_number_of_fields_to_profile` | | `None` | A positive integer that specifies the maximum number of columns to profile for any table. `None` implies all columns. The cost of profiling goes up significantly as the number of columns to profile goes up.| -| `profiling.profile_table_level_only` | | False | Whether to perform profiling at table-level only, or include column-level profiling as well.| -| `profiling.include_field_null_count` | | `True` | Whether to profile for the number of nulls for each column. | -| `profiling.include_field_min_value` | | `True` | Whether to profile for the min value of numeric columns. | -| `profiling.include_field_max_value` | | `True` | Whether to profile for the max value of numeric columns. | -| `profiling.include_field_mean_value` | | `True` | Whether to profile for the mean value of numeric columns. | -| `profiling.include_field_median_value` | | `True` | Whether to profile for the median value of numeric columns. | -| `profiling.include_field_stddev_value` | | `True` | Whether to profile for the standard deviation of numeric columns. | -| `profiling.include_field_quantiles` | | `True` | Whether to profile for the quantiles of numeric columns. | -| `profiling.include_field_distinct_value_frequencies` | | `True` | Whether to profile for distinct value frequencies. | -| `profiling.include_field_histogram` | | `True` | Whether to profile for the histogram for numeric fields. | -| `profiling.include_field_sample_values` | | `True` | Whether to profile for the sample values for all columns. 
| - +| `profiling.query_combiner_enabled` | | `True` | *This feature is still experimental and can be disabled if it causes issues.* Reduces the total number of queries issued and speeds up profiling by dynamically combining SQL queries where possible. | +| `profile_pattern.allow` | | `*` | List of regex patterns for tables or table columns to profile. Defaults to all. | +| `profile_pattern.deny` | | | List of regex patterns for tables or table columns to not profile. Defaults to none. | +| `profile_pattern.ignoreCase` | | `True` | Whether to ignore case sensitivity during pattern matching. | +| `profiling.turn_off_expensive_profiling_metrics` | | `False` | Whether to turn off expensive profiling or not. This turns off profiling for quantiles, distinct_value_frequencies, histogram & sample_values. This also limits the maximum number of fields being profiled to 10. | +| `profiling.max_number_of_fields_to_profile` | | `None` | A positive integer that specifies the maximum number of columns to profile for any table. `None` implies all columns. The cost of profiling goes up significantly as the number of columns to profile goes up. | +| `profiling.profile_table_level_only` | | `False` | Whether to perform profiling at table-level only, or include column-level profiling as well. | +| `profiling.include_field_null_count` | | `True` | Whether to profile for the number of nulls for each column. | +| `profiling.include_field_min_value` | | `True` | Whether to profile for the min value of numeric columns. | +| `profiling.include_field_max_value` | | `True` | Whether to profile for the max value of numeric columns. | +| `profiling.include_field_mean_value` | | `True` | Whether to profile for the mean value of numeric columns. | +| `profiling.include_field_median_value` | | `True` | Whether to profile for the median value of numeric columns. | +| `profiling.include_field_stddev_value` | | `True` | Whether to profile for the standard deviation of numeric columns. 
| +| `profiling.include_field_quantiles` | | `False` | Whether to profile for the quantiles of numeric columns. | +| `profiling.include_field_distinct_value_frequencies` | | `False` | Whether to profile for distinct value frequencies. | +| `profiling.include_field_histogram` | | `False` | Whether to profile for the histogram for numeric fields. | +| `profiling.include_field_sample_values` | | `True` | Whether to profile for the sample values for all columns. | ## Compatibility Coming soon! diff --git a/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py b/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py index d3b3c6fe90..fffc60cad6 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py +++ b/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py @@ -112,9 +112,9 @@ class GEProfilingConfig(ConfigModel): include_field_mean_value: bool = True include_field_median_value: bool = True include_field_stddev_value: bool = True - include_field_quantiles: bool = True - include_field_distinct_value_frequencies: bool = True - include_field_histogram: bool = True + include_field_quantiles: bool = False + include_field_distinct_value_frequencies: bool = False + include_field_histogram: bool = False include_field_sample_values: bool = True allow_deny_patterns: AllowDenyPattern = AllowDenyPattern.allow_all() diff --git a/metadata-ingestion/tests/integration/mysql/mysql_to_file.yml b/metadata-ingestion/tests/integration/mysql/mysql_to_file.yml index 4f4fb80c3c..e3705ce5ea 100644 --- a/metadata-ingestion/tests/integration/mysql/mysql_to_file.yml +++ b/metadata-ingestion/tests/integration/mysql/mysql_to_file.yml @@ -20,7 +20,16 @@ source: - "^test_cases" profiling: enabled: True - + include_field_null_count: true + include_field_min_value: true + include_field_max_value: true + include_field_mean_value: true + include_field_median_value: true + include_field_stddev_value: true + include_field_quantiles: true + 
include_field_distinct_value_frequencies: true + include_field_histogram: true + include_field_sample_values: true sink: type: file config: diff --git a/metadata-ingestion/tests/integration/trino/trino_to_file.yml b/metadata-ingestion/tests/integration/trino/trino_to_file.yml index 98a19e4c76..b09324f9cd 100644 --- a/metadata-ingestion/tests/integration/trino/trino_to_file.yml +++ b/metadata-ingestion/tests/integration/trino/trino_to_file.yml @@ -20,6 +20,16 @@ source: - "library_catalog.librarydb.*" profiling: enabled: True + include_field_null_count: true + include_field_min_value: true + include_field_max_value: true + include_field_mean_value: true + include_field_median_value: true + include_field_stddev_value: true + include_field_quantiles: true + include_field_distinct_value_frequencies: true + include_field_histogram: true + include_field_sample_values: true sink: type: file