fix(ingest): hide internal profiler.allow_deny_patterns from config (#5619)

This commit is contained in:
Harshal Sheth 2022-09-13 03:39:10 -07:00 committed by GitHub
parent c5c8e156aa
commit dfeced8eee
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 12 additions and 14 deletions

View File

@ -490,7 +490,7 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
def add_table_to_dataset_container( def add_table_to_dataset_container(
self, dataset_urn: str, db_name: str, schema: str self, dataset_urn: str, db_name: str, schema: str
) -> Iterable[Union[MetadataWorkUnit]]: ) -> Iterable[MetadataWorkUnit]:
schema_container_key = self.gen_dataset_key(db_name, schema) schema_container_key = self.gen_dataset_key(db_name, schema)
container_workunits = add_dataset_to_container( container_workunits = add_dataset_to_container(
container_key=schema_container_key, container_key=schema_container_key,
@ -755,7 +755,7 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
self.report.report_workunit(wu) self.report.report_workunit(wu)
if isinstance(table, BigqueryTable) and self.config.profiling.enabled: if isinstance(table, BigqueryTable) and self.config.profiling.enabled:
if self.config.profiling.allow_deny_patterns.allowed( if self.config.profiling._allow_deny_patterns.allowed(
datahub_dataset_name.raw_table_name() datahub_dataset_name.raw_table_name()
): ):
# Emit the profile work unit # Emit the profile work unit

View File

@ -258,7 +258,7 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
ignored_columns: List[str] = [] ignored_columns: List[str] = []
for col in self.dataset.get_table_columns(): for col in self.dataset.get_table_columns():
# We expect the allow/deny patterns to specify '<table_pattern>.<column_pattern>' # We expect the allow/deny patterns to specify '<table_pattern>.<column_pattern>'
if not self.config.allow_deny_patterns.allowed( if not self.config._allow_deny_patterns.allowed(
f"{self.dataset_name}.{col}" f"{self.dataset_name}.{col}"
): ):
ignored_columns.append(col) ignored_columns.append(col)

View File

@ -75,9 +75,8 @@ class GEProfilingConfig(ConfigModel):
description="Whether to profile for the sample values for all columns.", description="Whether to profile for the sample values for all columns.",
) )
allow_deny_patterns: AllowDenyPattern = Field( _allow_deny_patterns: AllowDenyPattern = pydantic.PrivateAttr(
default=AllowDenyPattern.allow_all(), default=AllowDenyPattern.allow_all(),
description="regex patterns for filtering of tables or table columns to profile.",
) )
max_number_of_fields_to_profile: Optional[pydantic.PositiveInt] = Field( max_number_of_fields_to_profile: Optional[pydantic.PositiveInt] = Field(
default=None, default=None,

View File

@ -118,7 +118,7 @@ class DataLakeSourceConfig(PlatformSourceConfigBase, EnvBasedSourceConfigBase):
def ensure_profiling_pattern_is_passed_to_profiling( def ensure_profiling_pattern_is_passed_to_profiling(
cls, values: Dict[str, Any] cls, values: Dict[str, Any]
) -> Dict[str, Any]: ) -> Dict[str, Any]:
profiling = values.get("profiling") profiling: Optional[DataLakeProfilerConfig] = values.get("profiling")
if profiling is not None and profiling.enabled: if profiling is not None and profiling.enabled:
profiling.allow_deny_patterns = values["profile_patterns"] profiling._allow_deny_patterns = values["profile_patterns"]
return values return values

View File

@ -70,8 +70,8 @@ class DataLakeProfilerConfig(ConfigModel):
description="Whether to perform profiling at table-level only or include column-level profiling as well.", description="Whether to perform profiling at table-level only or include column-level profiling as well.",
) )
allow_deny_patterns: AllowDenyPattern = Field( _allow_deny_patterns: AllowDenyPattern = pydantic.PrivateAttr(
default=AllowDenyPattern.allow_all(), description="" default=AllowDenyPattern.allow_all(),
) )
max_number_of_fields_to_profile: Optional[pydantic.PositiveInt] = Field( max_number_of_fields_to_profile: Optional[pydantic.PositiveInt] = Field(
@ -209,7 +209,7 @@ class _SingleTableProfiler:
# get column distinct counts # get column distinct counts
for column in dataframe.columns: for column in dataframe.columns:
if not self.profiling_config.allow_deny_patterns.allowed(column): if not self.profiling_config._allow_deny_patterns.allowed(column):
self.ignored_columns.append(column) self.ignored_columns.append(column)
continue continue

View File

@ -48,6 +48,7 @@ from datahub.emitter.mcp_builder import (
) )
from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.api.common import PipelineContext
from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.api.workunit import MetadataWorkUnit
from datahub.ingestion.source.ge_profiling_config import GEProfilingConfig
from datahub.ingestion.source.state.checkpoint import Checkpoint from datahub.ingestion.source.state.checkpoint import Checkpoint
from datahub.ingestion.source.state.sql_common_state import ( from datahub.ingestion.source.state.sql_common_state import (
BaseSQLAlchemyCheckpointState, BaseSQLAlchemyCheckpointState,
@ -269,8 +270,6 @@ class SQLAlchemyConfig(StatefulIngestionConfigBase):
default=True, description="Whether tables should be ingested." default=True, description="Whether tables should be ingested."
) )
from datahub.ingestion.source.ge_data_profiler import GEProfilingConfig
profiling: GEProfilingConfig = GEProfilingConfig() profiling: GEProfilingConfig = GEProfilingConfig()
# Custom Stateful Ingestion settings # Custom Stateful Ingestion settings
stateful_ingestion: Optional[SQLAlchemyStatefulIngestionConfig] = None stateful_ingestion: Optional[SQLAlchemyStatefulIngestionConfig] = None
@ -290,9 +289,9 @@ class SQLAlchemyConfig(StatefulIngestionConfigBase):
def ensure_profiling_pattern_is_passed_to_profiling( def ensure_profiling_pattern_is_passed_to_profiling(
cls, values: Dict[str, Any] cls, values: Dict[str, Any]
) -> Dict[str, Any]: ) -> Dict[str, Any]:
profiling = values.get("profiling") profiling: Optional[GEProfilingConfig] = values.get("profiling")
if profiling is not None and profiling.enabled: if profiling is not None and profiling.enabled:
profiling.allow_deny_patterns = values["profile_pattern"] profiling._allow_deny_patterns = values["profile_pattern"]
return values return values
@abstractmethod @abstractmethod