mirror of
https://github.com/datahub-project/datahub.git
synced 2025-09-25 09:00:50 +00:00
fix(ingest): hide internal profiler.allow_deny_patterns from config (#5619)
This commit is contained in:
parent
c5c8e156aa
commit
dfeced8eee
@ -490,7 +490,7 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
|
||||
|
||||
def add_table_to_dataset_container(
|
||||
self, dataset_urn: str, db_name: str, schema: str
|
||||
) -> Iterable[Union[MetadataWorkUnit]]:
|
||||
) -> Iterable[MetadataWorkUnit]:
|
||||
schema_container_key = self.gen_dataset_key(db_name, schema)
|
||||
container_workunits = add_dataset_to_container(
|
||||
container_key=schema_container_key,
|
||||
@ -755,7 +755,7 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
|
||||
self.report.report_workunit(wu)
|
||||
|
||||
if isinstance(table, BigqueryTable) and self.config.profiling.enabled:
|
||||
if self.config.profiling.allow_deny_patterns.allowed(
|
||||
if self.config.profiling._allow_deny_patterns.allowed(
|
||||
datahub_dataset_name.raw_table_name()
|
||||
):
|
||||
# Emit the profile work unit
|
||||
|
@ -258,7 +258,7 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
|
||||
ignored_columns: List[str] = []
|
||||
for col in self.dataset.get_table_columns():
|
||||
# We expect the allow/deny patterns to specify '<table_pattern>.<column_pattern>'
|
||||
if not self.config.allow_deny_patterns.allowed(
|
||||
if not self.config._allow_deny_patterns.allowed(
|
||||
f"{self.dataset_name}.{col}"
|
||||
):
|
||||
ignored_columns.append(col)
|
||||
|
@ -75,9 +75,8 @@ class GEProfilingConfig(ConfigModel):
|
||||
description="Whether to profile for the sample values for all columns.",
|
||||
)
|
||||
|
||||
allow_deny_patterns: AllowDenyPattern = Field(
|
||||
_allow_deny_patterns: AllowDenyPattern = pydantic.PrivateAttr(
|
||||
default=AllowDenyPattern.allow_all(),
|
||||
description="regex patterns for filtering of tables or table columns to profile.",
|
||||
)
|
||||
max_number_of_fields_to_profile: Optional[pydantic.PositiveInt] = Field(
|
||||
default=None,
|
||||
|
@ -118,7 +118,7 @@ class DataLakeSourceConfig(PlatformSourceConfigBase, EnvBasedSourceConfigBase):
|
||||
def ensure_profiling_pattern_is_passed_to_profiling(
|
||||
cls, values: Dict[str, Any]
|
||||
) -> Dict[str, Any]:
|
||||
profiling = values.get("profiling")
|
||||
profiling: Optional[DataLakeProfilerConfig] = values.get("profiling")
|
||||
if profiling is not None and profiling.enabled:
|
||||
profiling.allow_deny_patterns = values["profile_patterns"]
|
||||
profiling._allow_deny_patterns = values["profile_patterns"]
|
||||
return values
|
||||
|
@ -70,8 +70,8 @@ class DataLakeProfilerConfig(ConfigModel):
|
||||
description="Whether to perform profiling at table-level only or include column-level profiling as well.",
|
||||
)
|
||||
|
||||
allow_deny_patterns: AllowDenyPattern = Field(
|
||||
default=AllowDenyPattern.allow_all(), description=""
|
||||
_allow_deny_patterns: AllowDenyPattern = pydantic.PrivateAttr(
|
||||
default=AllowDenyPattern.allow_all(),
|
||||
)
|
||||
|
||||
max_number_of_fields_to_profile: Optional[pydantic.PositiveInt] = Field(
|
||||
@ -209,7 +209,7 @@ class _SingleTableProfiler:
|
||||
# get column distinct counts
|
||||
for column in dataframe.columns:
|
||||
|
||||
if not self.profiling_config.allow_deny_patterns.allowed(column):
|
||||
if not self.profiling_config._allow_deny_patterns.allowed(column):
|
||||
self.ignored_columns.append(column)
|
||||
continue
|
||||
|
||||
|
@ -48,6 +48,7 @@ from datahub.emitter.mcp_builder import (
|
||||
)
|
||||
from datahub.ingestion.api.common import PipelineContext
|
||||
from datahub.ingestion.api.workunit import MetadataWorkUnit
|
||||
from datahub.ingestion.source.ge_profiling_config import GEProfilingConfig
|
||||
from datahub.ingestion.source.state.checkpoint import Checkpoint
|
||||
from datahub.ingestion.source.state.sql_common_state import (
|
||||
BaseSQLAlchemyCheckpointState,
|
||||
@ -269,8 +270,6 @@ class SQLAlchemyConfig(StatefulIngestionConfigBase):
|
||||
default=True, description="Whether tables should be ingested."
|
||||
)
|
||||
|
||||
from datahub.ingestion.source.ge_data_profiler import GEProfilingConfig
|
||||
|
||||
profiling: GEProfilingConfig = GEProfilingConfig()
|
||||
# Custom Stateful Ingestion settings
|
||||
stateful_ingestion: Optional[SQLAlchemyStatefulIngestionConfig] = None
|
||||
@ -290,9 +289,9 @@ class SQLAlchemyConfig(StatefulIngestionConfigBase):
|
||||
def ensure_profiling_pattern_is_passed_to_profiling(
|
||||
cls, values: Dict[str, Any]
|
||||
) -> Dict[str, Any]:
|
||||
profiling = values.get("profiling")
|
||||
profiling: Optional[GEProfilingConfig] = values.get("profiling")
|
||||
if profiling is not None and profiling.enabled:
|
||||
profiling.allow_deny_patterns = values["profile_pattern"]
|
||||
profiling._allow_deny_patterns = values["profile_pattern"]
|
||||
return values
|
||||
|
||||
@abstractmethod
|
||||
|
Loading…
x
Reference in New Issue
Block a user