diff --git a/docs/how/updating-datahub.md b/docs/how/updating-datahub.md index 16947d79d8..09f13624d6 100644 --- a/docs/how/updating-datahub.md +++ b/docs/how/updating-datahub.md @@ -11,6 +11,7 @@ This file documents any backwards-incompatible changes in DataHub and assists pe ### Deprecations ### Other notable Changes +- #4961 Dropped profiles are no longer reported by default, as this caused a lot of spurious logging in some cases. Set `profiling.report_dropped_profiles` to `True` to restore the previous behaviour. ## `v0.8.35` diff --git a/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py b/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py index 6d7e8ea80d..28a8e04413 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py @@ -20,6 +20,10 @@ class GEProfilingConfig(ConfigModel): default=None, description="Offset in documents to profile. By default, uses no offset.", ) + report_dropped_profiles: bool = Field( + default=False, + description="If datasets which were not profiled are reported in source report or not. Set to `True` for debugging purposes.", + ) # These settings will override the ones below. 
turn_off_expensive_profiling_metrics: bool = Field( diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py index 8f540884ef..308a2f8556 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py @@ -1295,7 +1295,8 @@ class SQLAlchemySource(StatefulIngestionSourceBase): schema=schema, entity=table, inspector=inspector ) if not self.is_dataset_eligible_for_profiling(dataset_name, sql_config): - self.report.report_dropped(f"profile of {dataset_name}") + if self.config.profiling.report_dropped_profiles: + self.report.report_dropped(f"profile of {dataset_name}") continue dataset_name = self.normalise_dataset_name(dataset_name)