mirror of
https://github.com/datahub-project/datahub.git
synced 2025-09-01 13:23:09 +00:00
feat(ingest): auto-fix duplicate schema fieldPaths (#10526)
This commit is contained in:
parent
2b6c78b776
commit
f2b5875632
@ -30,6 +30,7 @@ from datahub.ingestion.api.common import PipelineContext, RecordEnvelope, WorkUn
|
||||
from datahub.ingestion.api.report import Report
|
||||
from datahub.ingestion.api.source_helpers import (
|
||||
auto_browse_path_v2,
|
||||
auto_fix_duplicate_schema_field_paths,
|
||||
auto_lowercase_urns,
|
||||
auto_materialize_referenced_tags_terms,
|
||||
auto_status_aspect,
|
||||
@ -243,6 +244,9 @@ class Source(Closeable, metaclass=ABCMeta):
|
||||
auto_lowercase_dataset_urns,
|
||||
auto_status_aspect,
|
||||
auto_materialize_referenced_tags_terms,
|
||||
partial(
|
||||
auto_fix_duplicate_schema_field_paths, platform=self._infer_platform()
|
||||
),
|
||||
browse_path_processor,
|
||||
partial(auto_workunit_reporter, self.get_report()),
|
||||
]
|
||||
@ -286,14 +290,18 @@ class Source(Closeable, metaclass=ABCMeta):
|
||||
def close(self) -> None:
|
||||
pass
|
||||
|
||||
def _get_browse_path_processor(self, dry_run: bool) -> MetadataWorkUnitProcessor:
|
||||
def _infer_platform(self) -> Optional[str]:
|
||||
config = self.get_config()
|
||||
|
||||
platform = (
|
||||
return (
|
||||
getattr(config, "platform_name", None)
|
||||
or getattr(self, "platform", None)
|
||||
or getattr(config, "platform", None)
|
||||
)
|
||||
|
||||
def _get_browse_path_processor(self, dry_run: bool) -> MetadataWorkUnitProcessor:
|
||||
config = self.get_config()
|
||||
|
||||
platform = self._infer_platform()
|
||||
env = getattr(config, "env", None)
|
||||
browse_path_drop_dirs = [
|
||||
platform,
|
||||
|
@ -30,6 +30,8 @@ from datahub.metadata.schema_classes import (
|
||||
MetadataChangeEventClass,
|
||||
MetadataChangeProposalClass,
|
||||
OwnershipClass as Ownership,
|
||||
SchemaFieldClass,
|
||||
SchemaMetadataClass,
|
||||
StatusClass,
|
||||
TimeWindowSizeClass,
|
||||
)
|
||||
@ -378,6 +380,54 @@ def auto_browse_path_v2(
|
||||
telemetry.telemetry_instance.ping("incorrect_browse_path_v2", properties)
|
||||
|
||||
|
||||
def auto_fix_duplicate_schema_field_paths(
|
||||
stream: Iterable[MetadataWorkUnit],
|
||||
*,
|
||||
platform: Optional[str] = None,
|
||||
) -> Iterable[MetadataWorkUnit]:
|
||||
"""Count schema metadata aspects with duplicate field paths and emit telemetry."""
|
||||
|
||||
total_schema_aspects = 0
|
||||
schemas_with_duplicates = 0
|
||||
duplicated_field_paths = 0
|
||||
|
||||
for wu in stream:
|
||||
schema_metadata = wu.get_aspect_of_type(SchemaMetadataClass)
|
||||
if schema_metadata:
|
||||
total_schema_aspects += 1
|
||||
|
||||
seen_fields = set()
|
||||
dropped_fields = []
|
||||
updated_fields: List[SchemaFieldClass] = []
|
||||
for field in schema_metadata.fields:
|
||||
if field.fieldPath in seen_fields:
|
||||
dropped_fields.append(field.fieldPath)
|
||||
else:
|
||||
seen_fields.add(field.fieldPath)
|
||||
updated_fields.append(field)
|
||||
|
||||
if dropped_fields:
|
||||
logger.info(
|
||||
f"Fixing duplicate field paths in schema aspect for {wu.get_urn()} by dropping fields: {dropped_fields}"
|
||||
)
|
||||
schema_metadata.fields = updated_fields
|
||||
schemas_with_duplicates += 1
|
||||
duplicated_field_paths += len(dropped_fields)
|
||||
|
||||
yield wu
|
||||
|
||||
if schemas_with_duplicates:
|
||||
properties = {
|
||||
"platform": platform,
|
||||
"total_schema_aspects": total_schema_aspects,
|
||||
"schemas_with_duplicates": schemas_with_duplicates,
|
||||
"duplicated_field_paths": duplicated_field_paths,
|
||||
}
|
||||
telemetry.telemetry_instance.ping(
|
||||
"ingestion_duplicate_schema_field_paths", properties
|
||||
)
|
||||
|
||||
|
||||
def auto_empty_dataset_usage_statistics(
|
||||
stream: Iterable[MetadataWorkUnit],
|
||||
*,
|
||||
|
Loading…
x
Reference in New Issue
Block a user