Mirror of https://github.com/datahub-project/datahub.git, synced 2025-11-03 20:27:50 +00:00

fix(ingest): switch various sources to auto_stale_entity_removal helper (#7158)

This commit is contained in:
parent 727050a8f3
commit 7ace79c153
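This commit switches the Glue, Kafka, Pulsar, and Vertica ingestion sources from hand-rolled stale-entity tracking (an add_entity_to_state() call for every table or topic, plus a trailing yield of gen_removed_entity_workunits()) to the shared auto_stale_entity_removal and auto_status_aspect helpers in datahub.utilities.source_helpers. Each source keeps its original logic in get_workunits_internal() and adds a thin get_workunits() wrapper, so the per-entity checkpoint code removed in the hunks below becomes redundant. The helpers themselves are not part of this diff; the following is only a rough sketch of what a wrapper like auto_stale_entity_removal does, with the get_urn() accessor and the generic "dataset" entity type assumed purely for illustration.

from typing import Any, Iterable, Iterator

def auto_stale_entity_removal(handler: Any, stream: Iterable) -> Iterator:
    # Sketch only, not the real datahub.utilities.source_helpers implementation.
    # Pass every workunit through unchanged while recording its URN in the
    # checkpoint state, then emit soft-delete workunits for entities that were
    # present in the previous run but never showed up in this one.
    for wu in stream:
        urn = wu.get_urn()  # assumed accessor; the real workunit may expose the URN differently
        handler.add_entity_to_state(type="dataset", urn=urn)  # "dataset" is a placeholder type
        yield wu
    # The wrapped generator is exhausted, so the handler now knows what is stale.
    yield from handler.gen_removed_entity_workunits()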
					
@@ -97,6 +97,10 @@ from datahub.metadata.schema_classes import (
     UpstreamLineageClass,
 )
 from datahub.utilities.hive_schema_to_avro import get_schema_fields_for_hive_column
+from datahub.utilities.source_helpers import (
+    auto_stale_entity_removal,
+    auto_status_aspect,
+)

 logger = logging.getLogger(__name__)

@@ -105,9 +109,7 @@ DEFAULT_PLATFORM = "glue"
 VALID_PLATFORMS = [DEFAULT_PLATFORM, "athena"]


-class GlueSourceConfig(
-    AwsSourceConfig, GlueProfilingConfig, StatefulIngestionConfigBase
-):
+class GlueSourceConfig(AwsSourceConfig, StatefulIngestionConfigBase):
     extract_owners: Optional[bool] = Field(
         default=True,
         description="When enabled, extracts ownership from Glue directly and overwrites existing owners. When disabled, ownership is left empty for datasets.",
@@ -943,6 +945,12 @@ class GlueSource(StatefulIngestionSourceBase):
                 yield wu

     def get_workunits(self) -> Iterable[MetadataWorkUnit]:
+        return auto_stale_entity_removal(
+            self.stale_entity_removal_handler,
+            auto_status_aspect(self.get_workunits_internal()),
+        )
+
+    def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         database_seen = set()
         databases, tables = self.get_all_tables_and_databases()

@@ -989,9 +997,6 @@ class GlueSource(StatefulIngestionSourceBase):
                 dataset_urn=dataset_urn, db_name=database_name
             )

-            # Add table to the checkpoint state.
-            self.stale_entity_removal_handler.add_entity_to_state("table", dataset_urn)
-
             mcp = self.get_lineage_if_enabled(mce)
             if mcp:
                 mcp_wu = MetadataWorkUnit(
@@ -1013,9 +1018,6 @@ class GlueSource(StatefulIngestionSourceBase):
         if self.extract_transforms:
             yield from self._transform_extraction()

-        # Clean up stale entities.
-        yield from self.stale_entity_removal_handler.gen_removed_entity_workunits()
-
     def _transform_extraction(self) -> Iterable[MetadataWorkUnit]:
         dags: Dict[str, Optional[Dict[str, Any]]] = {}
         flow_names: Dict[str, str] = {}
@@ -56,6 +56,10 @@ from datahub.metadata.schema_classes import (
     SubTypesClass,
 )
 from datahub.utilities.registries.domain_registry import DomainRegistry
+from datahub.utilities.source_helpers import (
+    auto_stale_entity_removal,
+    auto_status_aspect,
+)

 logger = logging.getLogger(__name__)

@@ -195,6 +199,12 @@ class KafkaSource(StatefulIngestionSourceBase):
         return cls(config, ctx)

     def get_workunits(self) -> Iterable[MetadataWorkUnit]:
+        return auto_stale_entity_removal(
+            self.stale_entity_removal_handler,
+            auto_status_aspect(self.get_workunits_internal()),
+        )
+
+    def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         topics = self.consumer.list_topics(
             timeout=self.source_config.connection.client_timeout_seconds
         ).topics
@@ -204,20 +214,8 @@ class KafkaSource(StatefulIngestionSourceBase):
             self.report.report_topic_scanned(t)
             if self.source_config.topic_patterns.allowed(t):
                 yield from self._extract_record(t, t_detail, extra_topic_details.get(t))
-                # add topic to checkpoint
-                topic_urn = make_dataset_urn_with_platform_instance(
-                    platform=self.platform,
-                    name=t,
-                    platform_instance=self.source_config.platform_instance,
-                    env=self.source_config.env,
-                )
-                self.stale_entity_removal_handler.add_entity_to_state(
-                    type="topic", urn=topic_urn
-                )
             else:
                 self.report.report_dropped(t)
-        # Clean up stale entities.
-        yield from self.stale_entity_removal_handler.gen_removed_entity_workunits()

     def _extract_record(
         self,
@@ -48,6 +48,10 @@ from datahub.metadata.schema_classes import (
     DatasetPropertiesClass,
     SubTypesClass,
 )
+from datahub.utilities.source_helpers import (
+    auto_stale_entity_removal,
+    auto_status_aspect,
+)

 logger = logging.getLogger(__name__)

@@ -234,6 +238,12 @@ class PulsarSource(StatefulIngestionSourceBase):
         return cls(config, ctx)

     def get_workunits(self) -> Iterable[MetadataWorkUnit]:
+        return auto_stale_entity_removal(
+            self.stale_entity_removal_handler,
+            auto_status_aspect(self.get_workunits_internal()),
+        )
+
+    def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         """
         Interacts with the Pulsar Admin Api and loops over tenants, namespaces and topics. For every topic
         the schema information is retrieved if available.
@@ -302,24 +312,12 @@ class PulsarSource(StatefulIngestionSourceBase):
                             self.report.topics_scanned += 1
                             if self.config.topic_patterns.allowed(topic):
                                 yield from self._extract_record(topic, is_partitioned)
-                                # Add topic to checkpoint if stateful ingestion is enabled
-                                topic_urn = make_dataset_urn_with_platform_instance(
-                                    platform=self.platform,
-                                    name=topic,
-                                    platform_instance=self.config.platform_instance,
-                                    env=self.config.env,
-                                )
-                                self.stale_entity_removal_handler.add_entity_to_state(
-                                    type="topic", urn=topic_urn
-                                )
                             else:
                                 self.report.report_topics_dropped(topic)
                     else:
                         self.report.report_namespaces_dropped(namespace)
             else:
                 self.report.report_tenants_dropped(tenant)
-        # Clean up stale entities.
-        yield from self.stale_entity_removal_handler.gen_removed_entity_workunits()

     def _is_token_authentication_configured(self) -> bool:
         return self.config.token is not None
@@ -194,9 +194,6 @@ class VerticaSource(SQLAlchemySource):
             if sql_config.include_oauth:
                 yield from self.loop_oauth(inspector, oauth_schema, sql_config)

-        # Clean up stale entities.
-        yield from self.stale_entity_removal_handler.gen_removed_entity_workunits()
-
     def get_database_properties(
         self, inspector: Inspector, database: str
     ) -> Optional[Dict[str, str]]:
@@ -491,8 +488,6 @@ class VerticaSource(SQLAlchemySource):
             urn=dataset_urn,
             aspects=[StatusClass(removed=False)],
         )
-        # Add table to the checkpoint state
-        self.stale_entity_removal_handler.add_entity_to_state("table", dataset_urn)
         description, properties, location_urn = self.get_projection_properties(
             inspector, schema, projection
         )
@@ -718,8 +713,6 @@ class VerticaSource(SQLAlchemySource):
             urn=dataset_urn,
             aspects=[StatusClass(removed=False)],
         )
-        # Add table to the checkpoint state
-        self.stale_entity_removal_handler.add_entity_to_state("model", dataset_urn)
         description, properties, location = self.get_model_properties(
             inspector, schema, table
         )
@@ -904,8 +897,6 @@ class VerticaSource(SQLAlchemySource):
             urn=dataset_urn,
             aspects=[StatusClass(removed=False)],
         )
-        # Add table to the checkpoint state
-        self.stale_entity_removal_handler.add_entity_to_state("oauth", dataset_urn)
         description, properties, location_urn = self.get_oauth_properties(
             inspector, schema, oauth
         )
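auto_status_aspect, the inner helper each new get_workunits() wrapper applies, is also not shown in this diff. Below is a minimal sketch of that kind of helper, again assuming a workunit exposes its URN via get_urn(); the FakeStatusWorkUnit dataclass is a stand-in for the real MetadataWorkUnit/StatusClass plumbing, and a real implementation would presumably skip entities that already emitted a status aspect of their own.

from dataclasses import dataclass
from typing import Iterable, Iterator

@dataclass
class FakeStatusWorkUnit:
    # Stand-in for a workunit carrying a Status(removed=False) aspect.
    urn: str
    removed: bool = False

def auto_status_aspect(stream: Iterable) -> Iterator:
    # Sketch only, not the real datahub.utilities.source_helpers implementation.
    # Forward every workunit unchanged, remembering each entity URN, then emit
    # one explicit "not removed" status marker per entity at the end.
    seen_urns = set()
    for wu in stream:
        seen_urns.add(wu.get_urn())  # assumed accessor
        yield wu
    for urn in sorted(seen_urns):
        yield FakeStatusWorkUnit(urn=urn, removed=False)

In the hunks above the two helpers compose as auto_stale_entity_removal(handler, auto_status_aspect(get_workunits_internal())), so in these sketches the status workunits produced by the inner helper also pass through the stale-entity bookkeeping of the outer one.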