mirror of
https://github.com/datahub-project/datahub.git
synced 2025-10-11 00:48:39 +00:00
fix(ingest): add status aspect to dataProcessInstance (#10757)
This commit is contained in:
parent
d63f25faa8
commit
fa2ab1bcee
@ -2,7 +2,6 @@ import logging
|
|||||||
from datetime import datetime, timezone
|
from datetime import datetime, timezone
|
||||||
from typing import (
|
from typing import (
|
||||||
TYPE_CHECKING,
|
TYPE_CHECKING,
|
||||||
Callable,
|
|
||||||
Dict,
|
Dict,
|
||||||
Iterable,
|
Iterable,
|
||||||
List,
|
List,
|
||||||
@ -43,9 +42,6 @@ from datahub.utilities.urns.urn_iter import list_urns, lowercase_dataset_urns
|
|||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from datahub.ingestion.api.source import SourceReport
|
from datahub.ingestion.api.source import SourceReport
|
||||||
from datahub.ingestion.source.state.stale_entity_removal_handler import (
|
|
||||||
StaleEntityRemovalHandler,
|
|
||||||
)
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@ -139,39 +135,6 @@ def auto_status_aspect(
|
|||||||
).as_workunit()
|
).as_workunit()
|
||||||
|
|
||||||
|
|
||||||
def _default_entity_type_fn(wu: MetadataWorkUnit) -> Optional[str]:
|
|
||||||
urn = wu.get_urn()
|
|
||||||
entity_type = guess_entity_type(urn)
|
|
||||||
return entity_type
|
|
||||||
|
|
||||||
|
|
||||||
def auto_stale_entity_removal(
|
|
||||||
stale_entity_removal_handler: "StaleEntityRemovalHandler",
|
|
||||||
stream: Iterable[MetadataWorkUnit],
|
|
||||||
entity_type_fn: Callable[
|
|
||||||
[MetadataWorkUnit], Optional[str]
|
|
||||||
] = _default_entity_type_fn,
|
|
||||||
) -> Iterable[MetadataWorkUnit]:
|
|
||||||
"""
|
|
||||||
Record all entities that are found, and emit removals for any that disappeared in this run.
|
|
||||||
"""
|
|
||||||
|
|
||||||
for wu in stream:
|
|
||||||
urn = wu.get_urn()
|
|
||||||
|
|
||||||
if wu.is_primary_source:
|
|
||||||
entity_type = entity_type_fn(wu)
|
|
||||||
if entity_type is not None:
|
|
||||||
stale_entity_removal_handler.add_entity_to_state(entity_type, urn)
|
|
||||||
else:
|
|
||||||
stale_entity_removal_handler.add_urn_to_skip(urn)
|
|
||||||
|
|
||||||
yield wu
|
|
||||||
|
|
||||||
# Clean up stale entities.
|
|
||||||
yield from stale_entity_removal_handler.gen_removed_entity_workunits()
|
|
||||||
|
|
||||||
|
|
||||||
T = TypeVar("T", bound=MetadataWorkUnit)
|
T = TypeVar("T", bound=MetadataWorkUnit)
|
||||||
|
|
||||||
|
|
||||||
|
@ -9,7 +9,6 @@ from datahub.emitter.mcp import MetadataChangeProposalWrapper
|
|||||||
from datahub.emitter.mcp_builder import entity_supports_aspect
|
from datahub.emitter.mcp_builder import entity_supports_aspect
|
||||||
from datahub.ingestion.api.common import PipelineContext
|
from datahub.ingestion.api.common import PipelineContext
|
||||||
from datahub.ingestion.api.ingestion_job_checkpointing_provider_base import JobId
|
from datahub.ingestion.api.ingestion_job_checkpointing_provider_base import JobId
|
||||||
from datahub.ingestion.api.source_helpers import auto_stale_entity_removal
|
|
||||||
from datahub.ingestion.api.workunit import MetadataWorkUnit
|
from datahub.ingestion.api.workunit import MetadataWorkUnit
|
||||||
from datahub.ingestion.source.state.checkpoint import Checkpoint
|
from datahub.ingestion.source.state.checkpoint import Checkpoint
|
||||||
from datahub.ingestion.source.state.entity_removal_state import GenericCheckpointState
|
from datahub.ingestion.source.state.entity_removal_state import GenericCheckpointState
|
||||||
@ -28,6 +27,10 @@ from datahub.utilities.urns.urn import guess_entity_type
|
|||||||
|
|
||||||
logger: logging.Logger = logging.getLogger(__name__)
|
logger: logging.Logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
STATEFUL_INGESTION_IGNORED_ENTITY_TYPES = {
|
||||||
|
"dataProcessInstance",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
class StatefulStaleMetadataRemovalConfig(StatefulIngestionConfig):
|
class StatefulStaleMetadataRemovalConfig(StatefulIngestionConfig):
|
||||||
"""
|
"""
|
||||||
@ -59,6 +62,30 @@ class StaleEntityRemovalSourceReport(StatefulIngestionReport):
|
|||||||
self.last_state_non_deletable_entities.append(urn)
|
self.last_state_non_deletable_entities.append(urn)
|
||||||
|
|
||||||
|
|
||||||
|
def auto_stale_entity_removal(
|
||||||
|
stale_entity_removal_handler: "StaleEntityRemovalHandler",
|
||||||
|
stream: Iterable[MetadataWorkUnit],
|
||||||
|
) -> Iterable[MetadataWorkUnit]:
|
||||||
|
"""
|
||||||
|
Record all entities that are found, and emit removals for any that disappeared in this run.
|
||||||
|
"""
|
||||||
|
|
||||||
|
for wu in stream:
|
||||||
|
urn = wu.get_urn()
|
||||||
|
|
||||||
|
if wu.is_primary_source:
|
||||||
|
entity_type = guess_entity_type(urn)
|
||||||
|
if entity_type is not None:
|
||||||
|
stale_entity_removal_handler.add_entity_to_state(entity_type, urn)
|
||||||
|
else:
|
||||||
|
stale_entity_removal_handler.add_urn_to_skip(urn)
|
||||||
|
|
||||||
|
yield wu
|
||||||
|
|
||||||
|
# Clean up stale entities.
|
||||||
|
yield from stale_entity_removal_handler.gen_removed_entity_workunits()
|
||||||
|
|
||||||
|
|
||||||
class StaleEntityRemovalHandler(
|
class StaleEntityRemovalHandler(
|
||||||
StatefulIngestionUsecaseHandlerBase["GenericCheckpointState"]
|
StatefulIngestionUsecaseHandlerBase["GenericCheckpointState"]
|
||||||
):
|
):
|
||||||
@ -285,7 +312,11 @@ class StaleEntityRemovalHandler(
|
|||||||
for urn in last_checkpoint_state.get_urns_not_in(
|
for urn in last_checkpoint_state.get_urns_not_in(
|
||||||
type="*", other_checkpoint_state=cur_checkpoint_state
|
type="*", other_checkpoint_state=cur_checkpoint_state
|
||||||
):
|
):
|
||||||
if not entity_supports_aspect(guess_entity_type(urn), StatusClass):
|
entity_type = guess_entity_type(urn)
|
||||||
|
if (
|
||||||
|
entity_type in STATEFUL_INGESTION_IGNORED_ENTITY_TYPES
|
||||||
|
or not entity_supports_aspect(entity_type, StatusClass)
|
||||||
|
):
|
||||||
# If any entity does not support aspect 'status' then skip that entity urn
|
# If any entity does not support aspect 'status' then skip that entity urn
|
||||||
report.report_last_state_non_deletable_entities(urn)
|
report.report_last_state_non_deletable_entities(urn)
|
||||||
continue
|
continue
|
||||||
|
@ -6010,6 +6010,70 @@
|
|||||||
"lastRunId": "no-run-id-provided"
|
"lastRunId": "no-run-id-provided"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"entityType": "dataProcessInstance",
|
||||||
|
"entityUrn": "urn:li:dataProcessInstance:36225e795a4597b2376996774a803b0d",
|
||||||
|
"changeType": "UPSERT",
|
||||||
|
"aspectName": "status",
|
||||||
|
"aspect": {
|
||||||
|
"json": {
|
||||||
|
"removed": false
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"systemMetadata": {
|
||||||
|
"lastObserved": 1643871600000,
|
||||||
|
"runId": "dbt-model-performance",
|
||||||
|
"lastRunId": "no-run-id-provided"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"entityType": "dataProcessInstance",
|
||||||
|
"entityUrn": "urn:li:dataProcessInstance:57aa623f096cf3a28af70fe94b713907",
|
||||||
|
"changeType": "UPSERT",
|
||||||
|
"aspectName": "status",
|
||||||
|
"aspect": {
|
||||||
|
"json": {
|
||||||
|
"removed": false
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"systemMetadata": {
|
||||||
|
"lastObserved": 1643871600000,
|
||||||
|
"runId": "dbt-model-performance",
|
||||||
|
"lastRunId": "no-run-id-provided"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"entityType": "dataProcessInstance",
|
||||||
|
"entityUrn": "urn:li:dataProcessInstance:a42a5b1bee156e45972e12d4156fb7a2",
|
||||||
|
"changeType": "UPSERT",
|
||||||
|
"aspectName": "status",
|
||||||
|
"aspect": {
|
||||||
|
"json": {
|
||||||
|
"removed": false
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"systemMetadata": {
|
||||||
|
"lastObserved": 1643871600000,
|
||||||
|
"runId": "dbt-model-performance",
|
||||||
|
"lastRunId": "no-run-id-provided"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"entityType": "dataProcessInstance",
|
||||||
|
"entityUrn": "urn:li:dataProcessInstance:fc6268f0be68fd04c310705b65efd6fe",
|
||||||
|
"changeType": "UPSERT",
|
||||||
|
"aspectName": "status",
|
||||||
|
"aspect": {
|
||||||
|
"json": {
|
||||||
|
"removed": false
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"systemMetadata": {
|
||||||
|
"lastObserved": 1643871600000,
|
||||||
|
"runId": "dbt-model-performance",
|
||||||
|
"lastRunId": "no-run-id-provided"
|
||||||
|
}
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"entityType": "tag",
|
"entityType": "tag",
|
||||||
"entityUrn": "urn:li:tag:dbt:tag_from_dbt",
|
"entityUrn": "urn:li:tag:dbt:tag_from_dbt",
|
||||||
|
@ -24,6 +24,7 @@
|
|||||||
"aspect": {
|
"aspect": {
|
||||||
"json": {
|
"json": {
|
||||||
"owners": [],
|
"owners": [],
|
||||||
|
"ownerTypes": {},
|
||||||
"lastModified": {
|
"lastModified": {
|
||||||
"time": 0,
|
"time": 0,
|
||||||
"actor": "urn:li:corpuser:fivetran"
|
"actor": "urn:li:corpuser:fivetran"
|
||||||
@ -186,6 +187,7 @@
|
|||||||
"aspect": {
|
"aspect": {
|
||||||
"json": {
|
"json": {
|
||||||
"owners": [],
|
"owners": [],
|
||||||
|
"ownerTypes": {},
|
||||||
"lastModified": {
|
"lastModified": {
|
||||||
"time": 0,
|
"time": 0,
|
||||||
"actor": "urn:li:corpuser:fivetran"
|
"actor": "urn:li:corpuser:fivetran"
|
||||||
@ -614,5 +616,53 @@
|
|||||||
"runId": "powerbi-test",
|
"runId": "powerbi-test",
|
||||||
"lastRunId": "no-run-id-provided"
|
"lastRunId": "no-run-id-provided"
|
||||||
}
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"entityType": "dataProcessInstance",
|
||||||
|
"entityUrn": "urn:li:dataProcessInstance:be36f55c13ec4e313c7510770e50784a",
|
||||||
|
"changeType": "UPSERT",
|
||||||
|
"aspectName": "status",
|
||||||
|
"aspect": {
|
||||||
|
"json": {
|
||||||
|
"removed": false
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"systemMetadata": {
|
||||||
|
"lastObserved": 1654621200000,
|
||||||
|
"runId": "powerbi-test",
|
||||||
|
"lastRunId": "no-run-id-provided"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"entityType": "dataProcessInstance",
|
||||||
|
"entityUrn": "urn:li:dataProcessInstance:d8f100271d2dc3fa905717f82d083c8d",
|
||||||
|
"changeType": "UPSERT",
|
||||||
|
"aspectName": "status",
|
||||||
|
"aspect": {
|
||||||
|
"json": {
|
||||||
|
"removed": false
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"systemMetadata": {
|
||||||
|
"lastObserved": 1654621200000,
|
||||||
|
"runId": "powerbi-test",
|
||||||
|
"lastRunId": "no-run-id-provided"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"entityType": "dataProcessInstance",
|
||||||
|
"entityUrn": "urn:li:dataProcessInstance:ee88d32dbe3133a23a9023c097050190",
|
||||||
|
"changeType": "UPSERT",
|
||||||
|
"aspectName": "status",
|
||||||
|
"aspect": {
|
||||||
|
"json": {
|
||||||
|
"removed": false
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"systemMetadata": {
|
||||||
|
"lastObserved": 1654621200000,
|
||||||
|
"runId": "powerbi-test",
|
||||||
|
"lastRunId": "no-run-id-provided"
|
||||||
|
}
|
||||||
}
|
}
|
||||||
]
|
]
|
@ -624,5 +624,53 @@
|
|||||||
"runId": "powerbi-test",
|
"runId": "powerbi-test",
|
||||||
"lastRunId": "no-run-id-provided"
|
"lastRunId": "no-run-id-provided"
|
||||||
}
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"entityType": "dataProcessInstance",
|
||||||
|
"entityUrn": "urn:li:dataProcessInstance:be36f55c13ec4e313c7510770e50784a",
|
||||||
|
"changeType": "UPSERT",
|
||||||
|
"aspectName": "status",
|
||||||
|
"aspect": {
|
||||||
|
"json": {
|
||||||
|
"removed": false
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"systemMetadata": {
|
||||||
|
"lastObserved": 1654621200000,
|
||||||
|
"runId": "powerbi-test",
|
||||||
|
"lastRunId": "no-run-id-provided"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"entityType": "dataProcessInstance",
|
||||||
|
"entityUrn": "urn:li:dataProcessInstance:d8f100271d2dc3fa905717f82d083c8d",
|
||||||
|
"changeType": "UPSERT",
|
||||||
|
"aspectName": "status",
|
||||||
|
"aspect": {
|
||||||
|
"json": {
|
||||||
|
"removed": false
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"systemMetadata": {
|
||||||
|
"lastObserved": 1654621200000,
|
||||||
|
"runId": "powerbi-test",
|
||||||
|
"lastRunId": "no-run-id-provided"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"entityType": "dataProcessInstance",
|
||||||
|
"entityUrn": "urn:li:dataProcessInstance:ee88d32dbe3133a23a9023c097050190",
|
||||||
|
"changeType": "UPSERT",
|
||||||
|
"aspectName": "status",
|
||||||
|
"aspect": {
|
||||||
|
"json": {
|
||||||
|
"removed": false
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"systemMetadata": {
|
||||||
|
"lastObserved": 1654621200000,
|
||||||
|
"runId": "powerbi-test",
|
||||||
|
"lastRunId": "no-run-id-provided"
|
||||||
|
}
|
||||||
}
|
}
|
||||||
]
|
]
|
@ -1,9 +1,6 @@
|
|||||||
# Adapted from https://github.com/big-data-europe/docker-hive.
|
# Adapted from https://github.com/big-data-europe/docker-hive.
|
||||||
|
|
||||||
version: "3"
|
|
||||||
|
|
||||||
services:
|
services:
|
||||||
|
|
||||||
presto:
|
presto:
|
||||||
image: starburstdata/presto:350-e.15
|
image: starburstdata/presto:350-e.15
|
||||||
container_name: "presto"
|
container_name: "presto"
|
||||||
|
@ -1,5 +1,4 @@
|
|||||||
---
|
---
|
||||||
version: '3.8'
|
|
||||||
services:
|
services:
|
||||||
connect:
|
connect:
|
||||||
image: confluentinc/cp-kafka-connect:7.4.0
|
image: confluentinc/cp-kafka-connect:7.4.0
|
||||||
|
@ -1,5 +1,4 @@
|
|||||||
---
|
---
|
||||||
version: "3.8"
|
|
||||||
services:
|
services:
|
||||||
zookeeper:
|
zookeeper:
|
||||||
image: confluentinc/cp-zookeeper:7.2.2
|
image: confluentinc/cp-zookeeper:7.2.2
|
||||||
|
@ -1,9 +1,6 @@
|
|||||||
# Adapted from https://github.com/big-data-europe/docker-hive.
|
# Adapted from https://github.com/big-data-europe/docker-hive.
|
||||||
|
|
||||||
version: "3"
|
|
||||||
|
|
||||||
services:
|
services:
|
||||||
|
|
||||||
testtrino:
|
testtrino:
|
||||||
image: trinodb/trino:369
|
image: trinodb/trino:369
|
||||||
container_name: "testtrino"
|
container_name: "testtrino"
|
||||||
|
@ -68,5 +68,21 @@
|
|||||||
"runId": "dummy-test-stateful-ingestion",
|
"runId": "dummy-test-stateful-ingestion",
|
||||||
"lastRunId": "no-run-id-provided"
|
"lastRunId": "no-run-id-provided"
|
||||||
}
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"entityType": "dataProcessInstance",
|
||||||
|
"entityUrn": "urn:li:dataProcessInstance:478810e859f870a54f72c681f41af619",
|
||||||
|
"changeType": "UPSERT",
|
||||||
|
"aspectName": "status",
|
||||||
|
"aspect": {
|
||||||
|
"json": {
|
||||||
|
"removed": false
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"systemMetadata": {
|
||||||
|
"lastObserved": 1586847600000,
|
||||||
|
"runId": "dummy-test-stateful-ingestion",
|
||||||
|
"lastRunId": "no-run-id-provided"
|
||||||
|
}
|
||||||
}
|
}
|
||||||
]
|
]
|
@ -53,6 +53,22 @@
|
|||||||
"lastRunId": "no-run-id-provided"
|
"lastRunId": "no-run-id-provided"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"entityType": "dataProcessInstance",
|
||||||
|
"entityUrn": "urn:li:dataProcessInstance:7f26c3b4d2d82ace47f4b9dd0c9dea26",
|
||||||
|
"changeType": "UPSERT",
|
||||||
|
"aspectName": "status",
|
||||||
|
"aspect": {
|
||||||
|
"json": {
|
||||||
|
"removed": false
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"systemMetadata": {
|
||||||
|
"lastObserved": 1586847600000,
|
||||||
|
"runId": "dummy-test-stateful-ingestion",
|
||||||
|
"lastRunId": "no-run-id-provided"
|
||||||
|
}
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"entityType": "dataset",
|
"entityType": "dataset",
|
||||||
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,dummy_dataset3,PROD)",
|
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,dummy_dataset3,PROD)",
|
||||||
|
@ -110,6 +110,7 @@ entities:
|
|||||||
- dataProcessInstanceProperties
|
- dataProcessInstanceProperties
|
||||||
- dataProcessInstanceRelationships
|
- dataProcessInstanceRelationships
|
||||||
- dataProcessInstanceRunEvent
|
- dataProcessInstanceRunEvent
|
||||||
|
- status
|
||||||
- name: chart
|
- name: chart
|
||||||
category: core
|
category: core
|
||||||
keyAspect: chartKey
|
keyAspect: chartKey
|
||||||
|
Loading…
x
Reference in New Issue
Block a user