fix(ingest): switch various sources to auto_stale_entity_removal helper (#7158)

Harshal Sheth authored 2023-01-30 14:45:12 -05:00; committed by GitHub
parent 727050a8f3
commit 7ace79c153
4 changed files with 31 additions and 42 deletions
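
All four diffs below apply the same refactor: each source's get_workunits becomes a thin wrapper that pipes get_workunits_internal() through the shared auto_status_aspect and auto_stale_entity_removal helpers, so the hand-written checkpoint bookkeeping and end-of-run cleanup can be deleted. As a rough sketch of what the removal helper does -- simplified, and not the exact implementation in datahub.utilities.source_helpers, whose internals are an assumption here:

    from typing import Iterable

    from datahub.ingestion.api.workunit import MetadataWorkUnit
    from datahub.ingestion.source.state.stale_entity_removal_handler import (
        StaleEntityRemovalHandler,
    )


    def auto_stale_entity_removal(
        stale_entity_removal_handler: StaleEntityRemovalHandler,
        stream: Iterable[MetadataWorkUnit],
    ) -> Iterable[MetadataWorkUnit]:
        """Simplified sketch: register every entity seen in the stream, then
        emit removal workunits for entities seen last run but not this one."""
        for wu in stream:
            urn = wu.get_urn()  # assumes the workunit exposes its entity urn
            if urn is not None:
                # Replaces the per-source add_entity_to_state() calls deleted below.
                stale_entity_removal_handler.add_entity_to_state(type="", urn=urn)
            yield wu

        # Replaces the per-source "# Clean up stale entities." blocks deleted below.
        yield from stale_entity_removal_handler.gen_removed_entity_workunits()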

View File

@@ -97,6 +97,10 @@ from datahub.metadata.schema_classes import (
     UpstreamLineageClass,
 )
 from datahub.utilities.hive_schema_to_avro import get_schema_fields_for_hive_column
+from datahub.utilities.source_helpers import (
+    auto_stale_entity_removal,
+    auto_status_aspect,
+)
 
 logger = logging.getLogger(__name__)
@@ -105,9 +109,7 @@ DEFAULT_PLATFORM = "glue"
 VALID_PLATFORMS = [DEFAULT_PLATFORM, "athena"]
 
-class GlueSourceConfig(
-    AwsSourceConfig, GlueProfilingConfig, StatefulIngestionConfigBase
-):
+class GlueSourceConfig(AwsSourceConfig, StatefulIngestionConfigBase):
     extract_owners: Optional[bool] = Field(
         default=True,
         description="When enabled, extracts ownership from Glue directly and overwrites existing owners. When disabled, ownership is left empty for datasets.",
@@ -943,6 +945,12 @@ class GlueSource(StatefulIngestionSourceBase):
             yield wu
 
     def get_workunits(self) -> Iterable[MetadataWorkUnit]:
+        return auto_stale_entity_removal(
+            self.stale_entity_removal_handler,
+            auto_status_aspect(self.get_workunits_internal()),
+        )
+
+    def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         database_seen = set()
         databases, tables = self.get_all_tables_and_databases()
@@ -989,9 +997,6 @@ class GlueSource(StatefulIngestionSourceBase):
                 dataset_urn=dataset_urn, db_name=database_name
             )
 
-            # Add table to the checkpoint state.
-            self.stale_entity_removal_handler.add_entity_to_state("table", dataset_urn)
-
             mcp = self.get_lineage_if_enabled(mce)
             if mcp:
                 mcp_wu = MetadataWorkUnit(
@@ -1013,9 +1018,6 @@ class GlueSource(StatefulIngestionSourceBase):
         if self.extract_transforms:
             yield from self._transform_extraction()
 
-        # Clean up stale entities.
-        yield from self.stale_entity_removal_handler.gen_removed_entity_workunits()
-
     def _transform_extraction(self) -> Iterable[MetadataWorkUnit]:
         dags: Dict[str, Optional[Dict[str, Any]]] = {}
         flow_names: Dict[str, str] = {}
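
The companion helper in that wrapper, auto_status_aspect, covers the status side: conceptually it lets workunits pass through and appends a Status(removed=False) aspect for any entity that never emitted one, which stale-entity removal relies on. A minimal sketch under the same assumptions as above (the real helper also handles MCE-shaped workunits and other edge cases):

    from typing import Iterable

    from datahub.emitter.mcp import MetadataChangeProposalWrapper
    from datahub.ingestion.api.workunit import MetadataWorkUnit
    from datahub.metadata.schema_classes import StatusClass


    def auto_status_aspect(
        stream: Iterable[MetadataWorkUnit],
    ) -> Iterable[MetadataWorkUnit]:
        """Simplified sketch: pass workunits through, then add a
        Status(removed=False) aspect for every entity that lacked one."""
        urns_seen = set()
        urns_with_status = set()
        for wu in stream:
            urn = wu.get_urn()
            urns_seen.add(urn)
            if isinstance(wu.metadata, MetadataChangeProposalWrapper) and isinstance(
                wu.metadata.aspect, StatusClass
            ):
                urns_with_status.add(urn)
            yield wu

        for urn in sorted(urns_seen - urns_with_status):
            yield MetadataWorkUnit(
                id=f"{urn}-status",
                mcp=MetadataChangeProposalWrapper(
                    entityUrn=urn, aspect=StatusClass(removed=False)
                ),
            )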

View File

@@ -56,6 +56,10 @@ from datahub.metadata.schema_classes import (
     SubTypesClass,
 )
 from datahub.utilities.registries.domain_registry import DomainRegistry
+from datahub.utilities.source_helpers import (
+    auto_stale_entity_removal,
+    auto_status_aspect,
+)
 
 logger = logging.getLogger(__name__)
@@ -195,6 +199,12 @@ class KafkaSource(StatefulIngestionSourceBase):
         return cls(config, ctx)
 
     def get_workunits(self) -> Iterable[MetadataWorkUnit]:
+        return auto_stale_entity_removal(
+            self.stale_entity_removal_handler,
+            auto_status_aspect(self.get_workunits_internal()),
+        )
+
+    def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         topics = self.consumer.list_topics(
             timeout=self.source_config.connection.client_timeout_seconds
         ).topics
@@ -204,20 +214,8 @@ class KafkaSource(StatefulIngestionSourceBase):
             self.report.report_topic_scanned(t)
             if self.source_config.topic_patterns.allowed(t):
                 yield from self._extract_record(t, t_detail, extra_topic_details.get(t))
-                # add topic to checkpoint
-                topic_urn = make_dataset_urn_with_platform_instance(
-                    platform=self.platform,
-                    name=t,
-                    platform_instance=self.source_config.platform_instance,
-                    env=self.source_config.env,
-                )
-                self.stale_entity_removal_handler.add_entity_to_state(
-                    type="topic", urn=topic_urn
-                )
             else:
                 self.report.report_dropped(t)
 
-        # Clean up stale entities.
-        yield from self.stale_entity_removal_handler.gen_removed_entity_workunits()
-
     def _extract_record(
         self,
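
What gen_removed_entity_workunits() ultimately emits is worth spelling out, since the Kafka and Pulsar hunks delete the manual call to it: the handler compares the last committed checkpoint against the urns registered this run and soft-deletes the difference. A hypothetical stand-in (the function name, parameters, and structure here are illustrative assumptions, not the handler's real code):

    from typing import Iterable, Set

    from datahub.emitter.mcp import MetadataChangeProposalWrapper
    from datahub.ingestion.api.workunit import MetadataWorkUnit
    from datahub.metadata.schema_classes import StatusClass


    def gen_removed_entity_workunits_sketch(
        last_run_urns: Set[str], this_run_urns: Set[str]
    ) -> Iterable[MetadataWorkUnit]:
        # "Stale entity removal" is a soft delete: entities present in the last
        # committed checkpoint but absent from this run get Status(removed=True).
        for urn in last_run_urns - this_run_urns:
            yield MetadataWorkUnit(
                id=f"soft-delete-{urn}",
                mcp=MetadataChangeProposalWrapper(
                    entityUrn=urn, aspect=StatusClass(removed=True)
                ),
            )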

View File

@@ -48,6 +48,10 @@ from datahub.metadata.schema_classes import (
     DatasetPropertiesClass,
     SubTypesClass,
 )
+from datahub.utilities.source_helpers import (
+    auto_stale_entity_removal,
+    auto_status_aspect,
+)
 
 logger = logging.getLogger(__name__)
@@ -234,6 +238,12 @@ class PulsarSource(StatefulIngestionSourceBase):
         return cls(config, ctx)
 
     def get_workunits(self) -> Iterable[MetadataWorkUnit]:
+        return auto_stale_entity_removal(
+            self.stale_entity_removal_handler,
+            auto_status_aspect(self.get_workunits_internal()),
+        )
+
+    def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         """
         Interacts with the Pulsar Admin Api and loops over tenants, namespaces and topics. For every topic
         the schema information is retrieved if available.
@@ -302,24 +312,12 @@ class PulsarSource(StatefulIngestionSourceBase):
                         self.report.topics_scanned += 1
                         if self.config.topic_patterns.allowed(topic):
                             yield from self._extract_record(topic, is_partitioned)
-                            # Add topic to checkpoint if stateful ingestion is enabled
-                            topic_urn = make_dataset_urn_with_platform_instance(
-                                platform=self.platform,
-                                name=topic,
-                                platform_instance=self.config.platform_instance,
-                                env=self.config.env,
-                            )
-                            self.stale_entity_removal_handler.add_entity_to_state(
-                                type="topic", urn=topic_urn
-                            )
                         else:
                             self.report.report_topics_dropped(topic)
                 else:
                     self.report.report_namespaces_dropped(namespace)
         else:
             self.report.report_tenants_dropped(tenant)
 
-        # Clean up stale entities.
-        yield from self.stale_entity_removal_handler.gen_removed_entity_workunits()
-
     def _is_token_authentication_configured(self) -> bool:
         return self.config.token is not None
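
The Kafka and Pulsar hunks delete the same urn-building boilerplate. The helper makes it redundant because the workunit being yielded already carries the entity urn; for reference, the deleted call built dataset urns of this shape (example values, not from the diff):

    from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance

    topic_urn = make_dataset_urn_with_platform_instance(
        platform="pulsar",
        name="persistent://public/default/events",
        platform_instance=None,
        env="PROD",
    )
    # topic_urn == "urn:li:dataset:(urn:li:dataPlatform:pulsar,persistent://public/default/events,PROD)"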

View File

@@ -194,9 +194,6 @@ class VerticaSource(SQLAlchemySource):
         if sql_config.include_oauth:
             yield from self.loop_oauth(inspector, oauth_schema, sql_config)
 
-        # Clean up stale entities.
-        yield from self.stale_entity_removal_handler.gen_removed_entity_workunits()
-
     def get_database_properties(
         self, inspector: Inspector, database: str
     ) -> Optional[Dict[str, str]]:
@@ -491,8 +488,6 @@ class VerticaSource(SQLAlchemySource):
             urn=dataset_urn,
             aspects=[StatusClass(removed=False)],
         )
-        # Add table to the checkpoint state
-        self.stale_entity_removal_handler.add_entity_to_state("table", dataset_urn)
 
         description, properties, location_urn = self.get_projection_properties(
             inspector, schema, projection
         )
@@ -718,8 +713,6 @@ class VerticaSource(SQLAlchemySource):
             urn=dataset_urn,
             aspects=[StatusClass(removed=False)],
         )
-        # Add table to the checkpoint state
-        self.stale_entity_removal_handler.add_entity_to_state("model", dataset_urn)
 
         description, properties, location = self.get_model_properties(
             inspector, schema, table
         )
@@ -904,8 +897,6 @@ class VerticaSource(SQLAlchemySource):
             urn=dataset_urn,
             aspects=[StatusClass(removed=False)],
         )
-        # Add table to the checkpoint state
-        self.stale_entity_removal_handler.add_entity_to_state("oauth", dataset_urn)
 
         description, properties, location_urn = self.get_oauth_properties(
             inspector, schema, oauth
         )
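
Vertica is the odd one out: its hunks only delete the manual bookkeeping and add no wrapper. That fits VerticaSource extending SQLAlchemySource if the base class already routes its stream through the helpers -- an assumption here, since this commit does not show the base class. A sketch of that arrangement, with stand-in class names:

    from typing import Iterable

    from datahub.ingestion.api.workunit import MetadataWorkUnit
    from datahub.ingestion.source.state.stateful_ingestion_base import (
        StatefulIngestionSourceBase,
    )
    from datahub.utilities.source_helpers import (
        auto_stale_entity_removal,
        auto_status_aspect,
    )


    class SQLAlchemySourceSketch(StatefulIngestionSourceBase):
        # Hypothetical stand-in for the real SQLAlchemySource: the base class
        # owns the wrapping once, for all SQL-based sources.
        def get_workunits(self) -> Iterable[MetadataWorkUnit]:
            return auto_stale_entity_removal(
                self.stale_entity_removal_handler,
                auto_status_aspect(self.get_workunits_internal()),
            )


    class VerticaSourceSketch(SQLAlchemySourceSketch):
        # Subclasses then only implement get_workunits_internal, which is why
        # the manual add_entity_to_state()/gen_removed_entity_workunits() calls
        # deleted above become redundant.
        def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
            ...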