Mirror of https://github.com/datahub-project/datahub.git (synced 2025-11-03 12:16:10 +00:00)
fix(ingest): switch various sources to auto_stale_entity_removal helper (#7158)

commit 7ace79c153 (parent 727050a8f3)
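The change is the same in every touched source: the hand-written stale-entity bookkeeping (per-entity add_entity_to_state() calls plus a trailing gen_removed_entity_workunits()) is deleted, and get_workunits() becomes a thin wrapper that pipes get_workunits_internal() through two generator helpers from datahub.utilities.source_helpers. Only the helpers' call signatures are visible in this diff, so the body below is a hedged, illustrative sketch of auto_stale_entity_removal, not its actual implementation:

    # Hedged sketch: body inferred from the call sites in this diff.
    from typing import Iterable

    from datahub.ingestion.api.workunit import MetadataWorkUnit
    from datahub.ingestion.source.state.stale_entity_removal_handler import (
        StaleEntityRemovalHandler,
    )

    def auto_stale_entity_removal(
        stale_entity_removal_handler: StaleEntityRemovalHandler,
        stream: Iterable[MetadataWorkUnit],
    ) -> Iterable[MetadataWorkUnit]:
        # Record every entity the source emits into the checkpoint state,
        # replacing the per-source add_entity_to_state() calls removed below.
        # (The type label is illustrative; state is keyed by URN.)
        for wu in stream:
            stale_entity_removal_handler.add_entity_to_state(type="", urn=wu.get_urn())
            yield wu
        # Once the stream is exhausted, soft-delete whatever was in the last
        # checkpoint but never seen this run, replacing the per-source
        # gen_removed_entity_workunits() calls removed below.
        yield from stale_entity_removal_handler.gen_removed_entity_workunits()

Because both helpers are plain generator wrappers, each source's diff reduces to: add the import, rename the old method to get_workunits_internal, and delete the manual bookkeeping. The Glue source hunks come first.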
@@ -97,6 +97,10 @@ from datahub.metadata.schema_classes import (
     UpstreamLineageClass,
 )
 from datahub.utilities.hive_schema_to_avro import get_schema_fields_for_hive_column
+from datahub.utilities.source_helpers import (
+    auto_stale_entity_removal,
+    auto_status_aspect,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -105,9 +109,7 @@ DEFAULT_PLATFORM = "glue"
 VALID_PLATFORMS = [DEFAULT_PLATFORM, "athena"]
 
 
-class GlueSourceConfig(
-    AwsSourceConfig, GlueProfilingConfig, StatefulIngestionConfigBase
-):
+class GlueSourceConfig(AwsSourceConfig, StatefulIngestionConfigBase):
     extract_owners: Optional[bool] = Field(
         default=True,
         description="When enabled, extracts ownership from Glue directly and overwrites existing owners. When disabled, ownership is left empty for datasets.",
@@ -943,6 +945,12 @@ class GlueSource(StatefulIngestionSourceBase):
         yield wu
 
-    def get_workunits(self) -> Iterable[MetadataWorkUnit]:
+    def get_workunits(self) -> Iterable[MetadataWorkUnit]:
+        return auto_stale_entity_removal(
+            self.stale_entity_removal_handler,
+            auto_status_aspect(self.get_workunits_internal()),
+        )
+
+    def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         database_seen = set()
         databases, tables = self.get_all_tables_and_databases()
 
@@ -989,9 +997,6 @@ class GlueSource(StatefulIngestionSourceBase):
             dataset_urn=dataset_urn, db_name=database_name
         )
 
-        # Add table to the checkpoint state.
-        self.stale_entity_removal_handler.add_entity_to_state("table", dataset_urn)
-
         mcp = self.get_lineage_if_enabled(mce)
         if mcp:
             mcp_wu = MetadataWorkUnit(
@@ -1013,9 +1018,6 @@ class GlueSource(StatefulIngestionSourceBase):
         if self.extract_transforms:
             yield from self._transform_extraction()
 
-        # Clean up stale entities.
-        yield from self.stale_entity_removal_handler.gen_removed_entity_workunits()
-
     def _transform_extraction(self) -> Iterable[MetadataWorkUnit]:
         dags: Dict[str, Optional[Dict[str, Any]]] = {}
         flow_names: Dict[str, str] = {}
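The next hunks apply the same treatment to the Kafka source. The second helper in the composition, auto_status_aspect, is likewise only visible here by its call signature; a hedged sketch of its role, under the assumption that it back-fills a Status(removed=False) aspect for entities that never emitted one (the real helper may handle more cases, such as MCE-based workunits):

    # Hedged sketch of auto_status_aspect; bodies and id scheme are assumptions.
    from typing import Iterable, Set

    from datahub.emitter.mcp import MetadataChangeProposalWrapper
    from datahub.ingestion.api.workunit import MetadataWorkUnit
    from datahub.metadata.schema_classes import StatusClass

    def auto_status_aspect(stream: Iterable[MetadataWorkUnit]) -> Iterable[MetadataWorkUnit]:
        urns_seen: Set[str] = set()
        urns_with_status: Set[str] = set()
        for wu in stream:
            urn = wu.get_urn()
            urns_seen.add(urn)
            # Track entities that already carry an explicit status aspect,
            # like the StatusClass(removed=False) aspects in the Vertica
            # hunks further below.
            if isinstance(wu.metadata, MetadataChangeProposalWrapper) and isinstance(
                wu.metadata.aspect, StatusClass
            ):
                urns_with_status.add(urn)
            yield wu
        # Back-fill Status(removed=False) so downstream consumers (including
        # the stale-entity logic) know these entities are live.
        for urn in sorted(urns_seen - urns_with_status):
            yield MetadataWorkUnit(
                id=f"{urn}-status",
                mcp=MetadataChangeProposalWrapper(
                    entityUrn=urn, aspect=StatusClass(removed=False)
                ),
            )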
@@ -56,6 +56,10 @@ from datahub.metadata.schema_classes import (
     SubTypesClass,
 )
 from datahub.utilities.registries.domain_registry import DomainRegistry
+from datahub.utilities.source_helpers import (
+    auto_stale_entity_removal,
+    auto_status_aspect,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -195,6 +199,12 @@ class KafkaSource(StatefulIngestionSourceBase):
         return cls(config, ctx)
 
-    def get_workunits(self) -> Iterable[MetadataWorkUnit]:
+    def get_workunits(self) -> Iterable[MetadataWorkUnit]:
+        return auto_stale_entity_removal(
+            self.stale_entity_removal_handler,
+            auto_status_aspect(self.get_workunits_internal()),
+        )
+
+    def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         topics = self.consumer.list_topics(
             timeout=self.source_config.connection.client_timeout_seconds
         ).topics
@@ -204,20 +214,8 @@ class KafkaSource(StatefulIngestionSourceBase):
             self.report.report_topic_scanned(t)
             if self.source_config.topic_patterns.allowed(t):
                 yield from self._extract_record(t, t_detail, extra_topic_details.get(t))
-                # add topic to checkpoint
-                topic_urn = make_dataset_urn_with_platform_instance(
-                    platform=self.platform,
-                    name=t,
-                    platform_instance=self.source_config.platform_instance,
-                    env=self.source_config.env,
-                )
-                self.stale_entity_removal_handler.add_entity_to_state(
-                    type="topic", urn=topic_urn
-                )
             else:
                 self.report.report_dropped(t)
-        # Clean up stale entities.
-        yield from self.stale_entity_removal_handler.gen_removed_entity_workunits()
 
     def _extract_record(
         self,
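Note what the Kafka hunk above deletes: the source no longer builds the checkpoint URN by hand, because the emitted workunit already carries the same URN and the helper can read it from the stream. Purely for illustration, with made-up values:

    # Illustrative only: the URN the deleted code constructed by hand.
    from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance

    topic_urn = make_dataset_urn_with_platform_instance(
        platform="kafka",
        name="purchases",
        platform_instance=None,
        env="PROD",
    )
    print(topic_urn)
    # urn:li:dataset:(urn:li:dataPlatform:kafka,purchases,PROD)

The Pulsar source follows, with an identical import, wrapper, and loop cleanup.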
@@ -48,6 +48,10 @@ from datahub.metadata.schema_classes import (
     DatasetPropertiesClass,
     SubTypesClass,
 )
+from datahub.utilities.source_helpers import (
+    auto_stale_entity_removal,
+    auto_status_aspect,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -234,6 +238,12 @@ class PulsarSource(StatefulIngestionSourceBase):
         return cls(config, ctx)
 
-    def get_workunits(self) -> Iterable[MetadataWorkUnit]:
+    def get_workunits(self) -> Iterable[MetadataWorkUnit]:
+        return auto_stale_entity_removal(
+            self.stale_entity_removal_handler,
+            auto_status_aspect(self.get_workunits_internal()),
+        )
+
+    def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         """
         Interacts with the Pulsar Admin Api and loops over tenants, namespaces and topics. For every topic
         the schema information is retrieved if available.
@@ -302,24 +312,12 @@ class PulsarSource(StatefulIngestionSourceBase):
                     self.report.topics_scanned += 1
                     if self.config.topic_patterns.allowed(topic):
                         yield from self._extract_record(topic, is_partitioned)
-                        # Add topic to checkpoint if stateful ingestion is enabled
-                        topic_urn = make_dataset_urn_with_platform_instance(
-                            platform=self.platform,
-                            name=topic,
-                            platform_instance=self.config.platform_instance,
-                            env=self.config.env,
-                        )
-                        self.stale_entity_removal_handler.add_entity_to_state(
-                            type="topic", urn=topic_urn
-                        )
                     else:
                         self.report.report_topics_dropped(topic)
                 else:
                     self.report.report_namespaces_dropped(namespace)
             else:
                 self.report.report_tenants_dropped(tenant)
-        # Clean up stale entities.
-        yield from self.stale_entity_removal_handler.gen_removed_entity_workunits()
 
     def _is_token_authentication_configured(self) -> bool:
         return self.config.token is not None
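What the cleanup ultimately emits is a soft delete: for each URN that was in the previous checkpoint but absent from the current run, gen_removed_entity_workunits() yields a workunit along these lines (a hedged sketch; the real handler's output may differ in id scheme and detail):

    # Hedged sketch of a single stale-entity cleanup workunit.
    from datahub.emitter.mcp import MetadataChangeProposalWrapper
    from datahub.ingestion.api.workunit import MetadataWorkUnit
    from datahub.metadata.schema_classes import StatusClass

    def soft_delete_workunit(urn: str) -> MetadataWorkUnit:
        # Status(removed=True) soft-deletes the entity in DataHub rather
        # than hard-deleting its metadata.
        mcp = MetadataChangeProposalWrapper(entityUrn=urn, aspect=StatusClass(removed=True))
        return MetadataWorkUnit(id=f"soft-delete-{urn}", mcp=mcp)

The Vertica hunks below only delete the manual bookkeeping; the wrapper itself presumably arrives through the SQLAlchemySource base class, which is not shown in this diff.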
@@ -194,9 +194,6 @@ class VerticaSource(SQLAlchemySource):
         if sql_config.include_oauth:
             yield from self.loop_oauth(inspector, oauth_schema, sql_config)
 
-        # Clean up stale entities.
-        yield from self.stale_entity_removal_handler.gen_removed_entity_workunits()
-
     def get_database_properties(
         self, inspector: Inspector, database: str
     ) -> Optional[Dict[str, str]]:
@@ -491,8 +488,6 @@ class VerticaSource(SQLAlchemySource):
             urn=dataset_urn,
             aspects=[StatusClass(removed=False)],
         )
-        # Add table to the checkpoint state
-        self.stale_entity_removal_handler.add_entity_to_state("table", dataset_urn)
         description, properties, location_urn = self.get_projection_properties(
             inspector, schema, projection
         )
@@ -718,8 +713,6 @@ class VerticaSource(SQLAlchemySource):
             urn=dataset_urn,
             aspects=[StatusClass(removed=False)],
         )
-        # Add table to the checkpoint state
-        self.stale_entity_removal_handler.add_entity_to_state("model", dataset_urn)
         description, properties, location = self.get_model_properties(
             inspector, schema, table
         )
@@ -904,8 +897,6 @@ class VerticaSource(SQLAlchemySource):
             urn=dataset_urn,
             aspects=[StatusClass(removed=False)],
         )
-        # Add table to the checkpoint state
-        self.stale_entity_removal_handler.add_entity_to_state("oauth", dataset_urn)
         description, properties, location_urn = self.get_oauth_properties(
             inspector, schema, oauth
         )
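None of this runs unless stateful ingestion is enabled in the recipe. A minimal sketch, written as the Python dict form of a recipe; the connection and server values are illustrative:

    # Hedged recipe sketch: stateful ingestion needs a stable pipeline_name,
    # and remove_stale_metadata turns on the cleanup shown in this commit.
    recipe = {
        "pipeline_name": "my-kafka-pipeline",  # required for checkpointing
        "source": {
            "type": "kafka",
            "config": {
                "connection": {"bootstrap": "localhost:9092"},
                "stateful_ingestion": {
                    "enabled": True,
                    "remove_stale_metadata": True,
                },
            },
        },
        "sink": {"type": "datahub-rest", "config": {"server": "http://localhost:8080"}},
    }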