fix(ingest): hide internal profiler.allow_deny_patterns from config (#5619)

This commit is contained in:
Harshal Sheth 2022-09-13 03:39:10 -07:00 committed by GitHub
parent c5c8e156aa
commit dfeced8eee
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 12 additions and 14 deletions

View File

@ -490,7 +490,7 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
def add_table_to_dataset_container(
self, dataset_urn: str, db_name: str, schema: str
) -> Iterable[Union[MetadataWorkUnit]]:
) -> Iterable[MetadataWorkUnit]:
schema_container_key = self.gen_dataset_key(db_name, schema)
container_workunits = add_dataset_to_container(
container_key=schema_container_key,
@ -755,7 +755,7 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
self.report.report_workunit(wu)
if isinstance(table, BigqueryTable) and self.config.profiling.enabled:
if self.config.profiling.allow_deny_patterns.allowed(
if self.config.profiling._allow_deny_patterns.allowed(
datahub_dataset_name.raw_table_name()
):
# Emit the profile work unit

View File

@ -258,7 +258,7 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
ignored_columns: List[str] = []
for col in self.dataset.get_table_columns():
# We expect the allow/deny patterns to specify '<table_pattern>.<column_pattern>'
if not self.config.allow_deny_patterns.allowed(
if not self.config._allow_deny_patterns.allowed(
f"{self.dataset_name}.{col}"
):
ignored_columns.append(col)

View File

@ -75,9 +75,8 @@ class GEProfilingConfig(ConfigModel):
description="Whether to profile for the sample values for all columns.",
)
allow_deny_patterns: AllowDenyPattern = Field(
_allow_deny_patterns: AllowDenyPattern = pydantic.PrivateAttr(
default=AllowDenyPattern.allow_all(),
description="regex patterns for filtering of tables or table columns to profile.",
)
max_number_of_fields_to_profile: Optional[pydantic.PositiveInt] = Field(
default=None,

View File

@ -118,7 +118,7 @@ class DataLakeSourceConfig(PlatformSourceConfigBase, EnvBasedSourceConfigBase):
def ensure_profiling_pattern_is_passed_to_profiling(
cls, values: Dict[str, Any]
) -> Dict[str, Any]:
profiling = values.get("profiling")
profiling: Optional[DataLakeProfilerConfig] = values.get("profiling")
if profiling is not None and profiling.enabled:
profiling.allow_deny_patterns = values["profile_patterns"]
profiling._allow_deny_patterns = values["profile_patterns"]
return values

View File

@ -70,8 +70,8 @@ class DataLakeProfilerConfig(ConfigModel):
description="Whether to perform profiling at table-level only or include column-level profiling as well.",
)
allow_deny_patterns: AllowDenyPattern = Field(
default=AllowDenyPattern.allow_all(), description=""
_allow_deny_patterns: AllowDenyPattern = pydantic.PrivateAttr(
default=AllowDenyPattern.allow_all(),
)
max_number_of_fields_to_profile: Optional[pydantic.PositiveInt] = Field(
@ -209,7 +209,7 @@ class _SingleTableProfiler:
# get column distinct counts
for column in dataframe.columns:
if not self.profiling_config.allow_deny_patterns.allowed(column):
if not self.profiling_config._allow_deny_patterns.allowed(column):
self.ignored_columns.append(column)
continue

View File

@ -48,6 +48,7 @@ from datahub.emitter.mcp_builder import (
)
from datahub.ingestion.api.common import PipelineContext
from datahub.ingestion.api.workunit import MetadataWorkUnit
from datahub.ingestion.source.ge_profiling_config import GEProfilingConfig
from datahub.ingestion.source.state.checkpoint import Checkpoint
from datahub.ingestion.source.state.sql_common_state import (
BaseSQLAlchemyCheckpointState,
@ -269,8 +270,6 @@ class SQLAlchemyConfig(StatefulIngestionConfigBase):
default=True, description="Whether tables should be ingested."
)
from datahub.ingestion.source.ge_data_profiler import GEProfilingConfig
profiling: GEProfilingConfig = GEProfilingConfig()
# Custom Stateful Ingestion settings
stateful_ingestion: Optional[SQLAlchemyStatefulIngestionConfig] = None
@ -290,9 +289,9 @@ class SQLAlchemyConfig(StatefulIngestionConfigBase):
def ensure_profiling_pattern_is_passed_to_profiling(
cls, values: Dict[str, Any]
) -> Dict[str, Any]:
profiling = values.get("profiling")
profiling: Optional[GEProfilingConfig] = values.get("profiling")
if profiling is not None and profiling.enabled:
profiling.allow_deny_patterns = values["profile_pattern"]
profiling._allow_deny_patterns = values["profile_pattern"]
return values
@abstractmethod