feat(ingest): add more fail-safes to stateful ingestion (#8111)

2025-11-11 17:03:33 +00:00 · 2023-05-31 18:49:48 -07:00 · 2023-05-31 18:49:48 -07:00 · 690ed083d9
commit 690ed083d9
parent 60dd9ef187
7 changed files with 74 additions and 54 deletions
--- a/metadata-ingestion/docs/dev_guides/reporting_telemetry.md
+++ b/metadata-ingestion/docs/dev_guides/reporting_telemetry.md
@ -70,14 +70,15 @@ An ingestion reporting state provider is responsible for saving and retrieving t
 associated with the ingestion runs of various jobs inside the source connector of the ingestion pipeline. 
 The data model used for capturing the telemetry is [DatahubIngestionRunSummary](https://github.com/datahub-project/datahub/blob/master/metadata-models/src/main/pegasus/com/linkedin/datajob/datahub/DatahubIngestionRunSummary.pdl). 
 A reporting ingestion state provider needs to implement the [IngestionReportingProviderBase](https://github.com/datahub-project/datahub/blob/master/metadata-ingestion/src/datahub/ingestion/api/ingestion_job_reporting_provider_base.py)
-interface and register itself with datahub by adding an entry under `datahub.ingestion.checkpointing_provider.plugins` 
+interface and register itself with datahub by adding an entry under `datahub.ingestion.reporting_provider.plugins` 
 key of the entry_points section in [setup.py](https://github.com/datahub-project/datahub/blob/master/metadata-ingestion/setup.py) 
 with its type and implementation class as shown below. 
 ```python
 entry_points = {
    # <snip other keys>
-    "datahub.ingestion.checkpointing_provider.plugins": [
+    "datahub.ingestion.reporting_provider.plugins": [
-        "datahub = datahub.ingestion.source.state_provider.datahub_ingestion_checkpointing_provider:DatahubIngestionCheckpointingProvider",
+        "datahub = datahub.ingestion.reporting.datahub_ingestion_run_summary_provider:DatahubIngestionRunSummaryProvider",
        "file = datahub.ingestion.reporting.file_reporter:FileReporter",
    ],
 }
 ```
--- a/metadata-ingestion/src/datahub/ingestion/run/pipeline.py
+++ b/metadata-ingestion/src/datahub/ingestion/run/pipeline.py
@ -308,8 +308,7 @@ class Pipeline:
                    status="CANCELLED"
                    if self.final_status == "cancelled"
                    else "FAILURE"
-                    if self.source.get_report().failures
+                    if self.has_failures()
                    or self.sink.get_report().failures
                    else "SUCCESS"
                    if self.final_status == "completed"
                    else "UNKNOWN",
@ -385,6 +384,7 @@ class Pipeline:
                except SystemExit:
                    raise
                except Exception as e:
                    # TODO: Transformer errors should cause the pipeline to fail.
                    logger.error(
                        "Failed to process some records. Continuing.", exc_info=e
                    )
@ -410,7 +410,7 @@ class Pipeline:
            self.sink.close()
            self.process_commits()
            self.final_status = "completed"
-        except (SystemExit, RuntimeError) as e:
+        except (SystemExit, RuntimeError, KeyboardInterrupt) as e:
            self.final_status = "cancelled"
            logger.error("Caught error", exc_info=e)
            raise
@ -534,6 +534,11 @@ class Pipeline:
            else:
                return "bright_green"
    def has_failures(self) -> bool:
        """Report whether the source or the sink has recorded any failures.

        Returns:
            True if the source report's ``failures`` is truthy or, failing
            that, the sink report's ``failures`` is truthy; False otherwise.
        """
        # The sink report is only consulted when the source report is clean.
        if self.source.get_report().failures:
            return True
        return bool(self.sink.get_report().failures)
    def pretty_print_summary(
        self, warnings_as_failure: bool = False, currently_running: bool = False
    ) -> int:
--- a/metadata-ingestion/src/datahub/ingestion/source/state/entity_removal_state.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/state/entity_removal_state.py
@ -128,27 +128,23 @@ class GenericCheckpointState(CheckpointStateBase):
        :return: (1-|intersection(self, old_checkpoint_state)| / |old_checkpoint_state|) * 100.0
        """
        return compute_percent_entities_changed(
-            [(self.urns, old_checkpoint_state.urns)]
+            new_entities=self.urns, old_entities=old_checkpoint_state.urns
        )
 def compute_percent_entities_changed(
-    new_old_entity_list: List[Tuple[List[str], List[str]]]
+    new_entities: List[str], old_entities: List[str]
 ) -> float:
-    old_count_all = 0
+    (overlap_count, old_count, _,) = _get_entity_overlap_and_cardinalities(
-    overlap_count_all = 0
+        new_entities=new_entities, old_entities=old_entities
-    for new_entities, old_entities in new_old_entity_list:
+    )
-        (overlap_count, old_count, _,) = get_entity_overlap_and_cardinalities(
+
-            new_entities=new_entities, old_entities=old_entities
+    if old_count:
-        )
+        return (1 - overlap_count / old_count) * 100.0
        overlap_count_all += overlap_count
        old_count_all += old_count
    if old_count_all:
        return (1 - overlap_count_all / old_count_all) * 100.0
    return 0.0
-def get_entity_overlap_and_cardinalities(
+def _get_entity_overlap_and_cardinalities(
    new_entities: List[str], old_entities: List[str]
 ) -> Tuple[int, int, int]:
    new_set = set(new_entities)
--- a/metadata-ingestion/src/datahub/ingestion/source/state/stale_entity_removal_handler.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/state/stale_entity_removal_handler.py
@ -208,7 +208,7 @@ class StaleEntityRemovalHandler(
        if not self.is_checkpointing_enabled() or self._ignore_old_state():
            return
        logger.debug("Checking for stale entity removal.")
-        last_checkpoint: Optional[Checkpoint] = self.state_provider.get_last_checkpoint(
+        last_checkpoint = self.state_provider.get_last_checkpoint(
            self.job_id, self.state_type_class
        )
        if not last_checkpoint:
@ -216,14 +216,15 @@ class StaleEntityRemovalHandler(
        cur_checkpoint = self.state_provider.get_current_checkpoint(self.job_id)
        assert cur_checkpoint is not None
        # Get the underlying states
-        last_checkpoint_state = cast(GenericCheckpointState, last_checkpoint.state)
+        last_checkpoint_state: GenericCheckpointState = last_checkpoint.state
        cur_checkpoint_state = cast(GenericCheckpointState, cur_checkpoint.state)
        assert self.stateful_ingestion_config
        # Check if the entity delta is below the fail-safe threshold.
        entity_difference_percent = cur_checkpoint_state.get_percent_entities_changed(
            last_checkpoint_state
        )
        assert self.stateful_ingestion_config
        if (
            entity_difference_percent
            > self.stateful_ingestion_config.fail_safe_threshold
@ -234,11 +235,30 @@ class StaleEntityRemovalHandler(
        ):
            # Log the failure. This would prevent the current state from getting committed.
            self.source.get_report().report_failure(
-                "Stateful Ingestion",
+                "stale-entity-removal",
                f"Will not soft-delete entities, since we'd be deleting {entity_difference_percent:.1f}% of the existing entities. "
                f"To force a deletion, increase the value of 'stateful_ingestion.fail_safe_threshold' (currently {self.stateful_ingestion_config.fail_safe_threshold})",
            )
-            # Bail so that we don't emit the stale entity removal workunits.
+            return
        if self.source.get_report().events_produced == 0:
            # SUBTLE: By reporting this as a failure here, we also ensure that the
            # new (empty) state doesn't get committed.
            # TODO: Move back to using fail_safe_threshold once we're confident that we've squashed all the bugs.
            self.source.get_report().report_failure(
                "stale-entity-removal",
                "Skipping stale entity soft-deletion because the source produced no events. "
                "This is a fail-safe mechanism to prevent accidental deletion of all entities.",
            )
            return
        # If the source already had a failure, skip soft-deletion.
        # TODO: Eventually, switch this to check if anything in the pipeline had a failure so far.
        if self.source.get_report().failures:
            self.source.get_report().report_warning(
                "stale-entity-removal",
                "Skipping stale entity soft-deletion since source already had failures.",
            )
            return
        # Everything looks good, emit the soft-deletion workunits
--- a/metadata-ingestion/src/datahub/ingestion/source/state/stateful_ingestion_base.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/state/stateful_ingestion_base.py
@ -321,7 +321,7 @@ class StateProviderWrapper:
    # Base-class implementations for common state management tasks.
    def get_last_checkpoint(
        self, job_id: JobId, checkpoint_state_class: Type[StateType]
-    ) -> Optional[Checkpoint]:
+    ) -> Optional[Checkpoint[StateType]]:
        if not self.is_stateful_ingestion_configured() or (
            self.stateful_ingestion_config
            and self.stateful_ingestion_config.ignore_old_state
--- a/metadata-ingestion/src/datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py
@ -11,7 +11,7 @@ from datahub.ingestion.api.ingestion_job_checkpointing_provider_base import (
    JobId,
 )
 from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph
-from datahub.metadata.schema_classes import DatahubIngestionCheckpointClass, StatusClass
+from datahub.metadata.schema_classes import DatahubIngestionCheckpointClass
 logger = logging.getLogger(__name__)
@ -115,14 +115,11 @@ class DatahubIngestionCheckpointingProvider(IngestionCheckpointingProviderBase):
                job_name,
            )
-            self.graph.emit_mcp(
+            # We don't want the state payloads to show up in search. As such, we emit the
-                # We don't want the state payloads to show up in search. As such, we emit the
+            # dataJob aspects as soft-deleted. This doesn't affect the ability to query
-                # dataJob aspects as soft-deleted. This doesn't affect the ability to query
+            # them using the timeseries API.
-                # them using the timeseries API.
+            self.graph.soft_delete_entity(
-                MetadataChangeProposalWrapper(
+                urn=datajob_urn,
                    entityUrn=datajob_urn,
                    aspect=StatusClass(removed=True),
                )
            )
            self.graph.emit_mcp(
                MetadataChangeProposalWrapper(
--- a/metadata-ingestion/tests/unit/stateful_ingestion/state/test_stale_entity_removal_handler.py
+++ b/metadata-ingestion/tests/unit/stateful_ingestion/state/test_stale_entity_removal_handler.py
@ -6,42 +6,43 @@ from datahub.ingestion.source.state.entity_removal_state import (
    compute_percent_entities_changed,
 )
 EntList = List[str]
 OldNewEntLists = List[Tuple[List[str], List[str]]]
-old_new_ent_tests: Dict[str, Tuple[OldNewEntLists, float]] = {
+new_old_ent_tests: Dict[str, Tuple[EntList, EntList, float]] = {
-    "no_change_empty_old_and_new": ([([], [])], 0.0),
+    "no_change_empty_old_and_new": ([], [], 0.0),
-    "no_change_empty_old_and_non_empty_new": ([(["a"], [])], 0.0),
+    "no_change_empty_old_and_non_empty_new": (["a"], [], 0.0),
    "no_change_non_empty_old_new_equals_old": (
-        [(["a", "b", "c"], ["c", "b", "a"])],
+        ["a", "b", "c"],
        ["c", "b", "a"],
        0.0,
    ),
    "no_change_non_empty_old_new_superset_old": (
-        [(["a", "b", "c", "d"], ["c", "b", "a"])],
+        ["a", "b", "c", "d"],
        ["c", "b", "a"],
        0.0,
    ),
-    "change_25_percent_delta": ([(["a", "b", "c"], ["d", "c", "b", "a"])], 25.0),
+    "change_25_percent_delta": (["a", "b", "c"], ["d", "c", "b", "a"], 25.0),
    "change_50_percent_delta": (
-        [
+        ["b", "a"],
-            (
+        ["a", "b", "c", "d"],
                ["b", "a"],
                ["a", "b", "c", "d"],
            )
        ],
        50.0,
    ),
-    "change_75_percent_delta": ([(["a"], ["a", "b", "c", "d"])], 75.0),
+    "change_75_percent_delta": (["a"], ["a", "b", "c", "d"], 75.0),
-    "change_100_percent_delta_empty_new": ([([], ["a", "b", "c", "d"])], 100.0),
+    "change_100_percent_delta_empty_new": ([], ["a", "b", "c", "d"], 100.0),
-    "change_100_percent_delta_non_empty_new": ([(["e"], ["a", "b", "c", "d"])], 100.0),
+    "change_100_percent_delta_non_empty_new": (["e"], ["a", "b", "c", "d"], 100.0),
 }
@pytest.mark.parametrize(
-    "new_old_entity_list, expected_percent_change",
+    "new_entities, old_entities, expected_percent_change",
-    old_new_ent_tests.values(),
+    new_old_ent_tests.values(),
-    ids=old_new_ent_tests.keys(),
+    ids=new_old_ent_tests.keys(),
 )
 def test_change_percent(
-    new_old_entity_list: OldNewEntLists, expected_percent_change: float
+    new_entities: EntList, old_entities: EntList, expected_percent_change: float
 ) -> None:
-    actual_percent_change = compute_percent_entities_changed(new_old_entity_list)
+    actual_percent_change = compute_percent_entities_changed(
        new_entities=new_entities, old_entities=old_entities
    )
    assert actual_percent_change == expected_percent_change