From dfeced8eee17e0156ae6cd05e289ac6ad26627cb Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Tue, 13 Sep 2022 03:39:10 -0700 Subject: [PATCH] fix(ingest): hide internal profiler.allow_deny_patterns from config (#5619) --- .../src/datahub/ingestion/source/bigquery_v2/bigquery.py | 4 ++-- .../src/datahub/ingestion/source/ge_data_profiler.py | 2 +- .../src/datahub/ingestion/source/ge_profiling_config.py | 3 +-- .../src/datahub/ingestion/source/s3/config.py | 4 ++-- .../src/datahub/ingestion/source/s3/profiling.py | 6 +++--- .../src/datahub/ingestion/source/sql/sql_common.py | 7 +++---- 6 files changed, 12 insertions(+), 14 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py index 479ea01e64..3333842c07 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py @@ -490,7 +490,7 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource): def add_table_to_dataset_container( self, dataset_urn: str, db_name: str, schema: str - ) -> Iterable[Union[MetadataWorkUnit]]: + ) -> Iterable[MetadataWorkUnit]: schema_container_key = self.gen_dataset_key(db_name, schema) container_workunits = add_dataset_to_container( container_key=schema_container_key, @@ -755,7 +755,7 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource): self.report.report_workunit(wu) if isinstance(table, BigqueryTable) and self.config.profiling.enabled: - if self.config.profiling.allow_deny_patterns.allowed( + if self.config.profiling._allow_deny_patterns.allowed( datahub_dataset_name.raw_table_name() ): # Emit the profile work unit diff --git a/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py b/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py index abff32c27e..32f4750bf3 100644 --- 
a/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py +++ b/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py @@ -258,7 +258,7 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase): ignored_columns: List[str] = [] for col in self.dataset.get_table_columns(): # We expect the allow/deny patterns to specify '<table>.<column>' - if not self.config.allow_deny_patterns.allowed( + if not self.config._allow_deny_patterns.allowed( f"{self.dataset_name}.{col}" ): ignored_columns.append(col) diff --git a/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py b/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py index 1e7094df8e..26ab73dbb4 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py @@ -75,9 +75,8 @@ class GEProfilingConfig(ConfigModel): description="Whether to profile for the sample values for all columns.", ) - allow_deny_patterns: AllowDenyPattern = Field( + _allow_deny_patterns: AllowDenyPattern = pydantic.PrivateAttr( default=AllowDenyPattern.allow_all(), - description="regex patterns for filtering of tables or table columns to profile.", ) max_number_of_fields_to_profile: Optional[pydantic.PositiveInt] = Field( default=None, diff --git a/metadata-ingestion/src/datahub/ingestion/source/s3/config.py b/metadata-ingestion/src/datahub/ingestion/source/s3/config.py index d84ef529c4..7a0e873bd9 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/s3/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/s3/config.py @@ -118,7 +118,7 @@ class DataLakeSourceConfig(PlatformSourceConfigBase, EnvBasedSourceConfigBase): def ensure_profiling_pattern_is_passed_to_profiling( cls, values: Dict[str, Any] ) -> Dict[str, Any]: - profiling = values.get("profiling") + profiling: Optional[DataLakeProfilerConfig] = values.get("profiling") if profiling is not None and profiling.enabled: - 
profiling.allow_deny_patterns = values["profile_patterns"] + profiling._allow_deny_patterns = values["profile_patterns"] return values diff --git a/metadata-ingestion/src/datahub/ingestion/source/s3/profiling.py b/metadata-ingestion/src/datahub/ingestion/source/s3/profiling.py index af2cb718e3..27b8773362 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/s3/profiling.py +++ b/metadata-ingestion/src/datahub/ingestion/source/s3/profiling.py @@ -70,8 +70,8 @@ class DataLakeProfilerConfig(ConfigModel): description="Whether to perform profiling at table-level only or include column-level profiling as well.", ) - allow_deny_patterns: AllowDenyPattern = Field( - default=AllowDenyPattern.allow_all(), description="" + _allow_deny_patterns: AllowDenyPattern = pydantic.PrivateAttr( + default=AllowDenyPattern.allow_all(), ) max_number_of_fields_to_profile: Optional[pydantic.PositiveInt] = Field( @@ -209,7 +209,7 @@ class _SingleTableProfiler: # get column distinct counts for column in dataframe.columns: - if not self.profiling_config.allow_deny_patterns.allowed(column): + if not self.profiling_config._allow_deny_patterns.allowed(column): self.ignored_columns.append(column) continue diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py index fee1def3b3..b98ac4f3ba 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py @@ -48,6 +48,7 @@ from datahub.emitter.mcp_builder import ( ) from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.api.workunit import MetadataWorkUnit +from datahub.ingestion.source.ge_profiling_config import GEProfilingConfig from datahub.ingestion.source.state.checkpoint import Checkpoint from datahub.ingestion.source.state.sql_common_state import ( BaseSQLAlchemyCheckpointState, @@ -269,8 +270,6 @@ class 
SQLAlchemyConfig(StatefulIngestionConfigBase): default=True, description="Whether tables should be ingested." ) - from datahub.ingestion.source.ge_data_profiler import GEProfilingConfig - profiling: GEProfilingConfig = GEProfilingConfig() # Custom Stateful Ingestion settings stateful_ingestion: Optional[SQLAlchemyStatefulIngestionConfig] = None @@ -290,9 +289,9 @@ class SQLAlchemyConfig(StatefulIngestionConfigBase): def ensure_profiling_pattern_is_passed_to_profiling( cls, values: Dict[str, Any] ) -> Dict[str, Any]: - profiling = values.get("profiling") + profiling: Optional[GEProfilingConfig] = values.get("profiling") if profiling is not None and profiling.enabled: - profiling.allow_deny_patterns = values["profile_pattern"] + profiling._allow_deny_patterns = values["profile_pattern"] return values @abstractmethod