feat(ingest): add more fail-safes to stateful ingestion (#8111)

This commit is contained in:
Harshal Sheth 2023-05-31 18:49:48 -07:00 committed by GitHub
parent 60dd9ef187
commit 690ed083d9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 74 additions and 54 deletions

View File

@ -70,14 +70,15 @@ An ingestion reporting state provider is responsible for saving and retrieving t
associated with the ingestion runs of various jobs inside the source connector of the ingestion pipeline. associated with the ingestion runs of various jobs inside the source connector of the ingestion pipeline.
The data model used for capturing the telemetry is [DatahubIngestionRunSummary](https://github.com/datahub-project/datahub/blob/master/metadata-models/src/main/pegasus/com/linkedin/datajob/datahub/DatahubIngestionRunSummary.pdl). The data model used for capturing the telemetry is [DatahubIngestionRunSummary](https://github.com/datahub-project/datahub/blob/master/metadata-models/src/main/pegasus/com/linkedin/datajob/datahub/DatahubIngestionRunSummary.pdl).
A reporting ingestion state provider needs to implement the [IngestionReportingProviderBase](https://github.com/datahub-project/datahub/blob/master/metadata-ingestion/src/datahub/ingestion/api/ingestion_job_reporting_provider_base.py) A reporting ingestion state provider needs to implement the [IngestionReportingProviderBase](https://github.com/datahub-project/datahub/blob/master/metadata-ingestion/src/datahub/ingestion/api/ingestion_job_reporting_provider_base.py)
interface and register itself with datahub by adding an entry under `datahub.ingestion.checkpointing_provider.plugins` interface and register itself with datahub by adding an entry under `datahub.ingestion.reporting_provider.plugins`
key of the entry_points section in [setup.py](https://github.com/datahub-project/datahub/blob/master/metadata-ingestion/setup.py) key of the entry_points section in [setup.py](https://github.com/datahub-project/datahub/blob/master/metadata-ingestion/setup.py)
with its type and implementation class as shown below. with its type and implementation class as shown below.
```python ```python
entry_points = { entry_points = {
# <snip other keys>" # <snip other keys>"
"datahub.ingestion.checkpointing_provider.plugins": [ "datahub.ingestion.reporting_provider.plugins": [
"datahub = datahub.ingestion.source.state_provider.datahub_ingestion_checkpointing_provider:DatahubIngestionCheckpointingProvider", "datahub = datahub.ingestion.reporting.datahub_ingestion_run_summary_provider:DatahubIngestionRunSummaryProvider",
"file = datahub.ingestion.reporting.file_reporter:FileReporter",
], ],
} }
``` ```

View File

@ -308,8 +308,7 @@ class Pipeline:
status="CANCELLED" status="CANCELLED"
if self.final_status == "cancelled" if self.final_status == "cancelled"
else "FAILURE" else "FAILURE"
if self.source.get_report().failures if self.has_failures()
or self.sink.get_report().failures
else "SUCCESS" else "SUCCESS"
if self.final_status == "completed" if self.final_status == "completed"
else "UNKNOWN", else "UNKNOWN",
@ -385,6 +384,7 @@ class Pipeline:
except SystemExit: except SystemExit:
raise raise
except Exception as e: except Exception as e:
# TODO: Transformer errors should cause the pipeline to fail.
logger.error( logger.error(
"Failed to process some records. Continuing.", exc_info=e "Failed to process some records. Continuing.", exc_info=e
) )
@ -410,7 +410,7 @@ class Pipeline:
self.sink.close() self.sink.close()
self.process_commits() self.process_commits()
self.final_status = "completed" self.final_status = "completed"
except (SystemExit, RuntimeError) as e: except (SystemExit, RuntimeError, KeyboardInterrupt) as e:
self.final_status = "cancelled" self.final_status = "cancelled"
logger.error("Caught error", exc_info=e) logger.error("Caught error", exc_info=e)
raise raise
@ -534,6 +534,11 @@ class Pipeline:
else: else:
return "bright_green" return "bright_green"
def has_failures(self) -> bool:
return bool(
self.source.get_report().failures or self.sink.get_report().failures
)
def pretty_print_summary( def pretty_print_summary(
self, warnings_as_failure: bool = False, currently_running: bool = False self, warnings_as_failure: bool = False, currently_running: bool = False
) -> int: ) -> int:

View File

@ -128,27 +128,23 @@ class GenericCheckpointState(CheckpointStateBase):
:return: (1-|intersection(self, old_checkpoint_state)| / |old_checkpoint_state|) * 100.0 :return: (1-|intersection(self, old_checkpoint_state)| / |old_checkpoint_state|) * 100.0
""" """
return compute_percent_entities_changed( return compute_percent_entities_changed(
[(self.urns, old_checkpoint_state.urns)] new_entities=self.urns, old_entities=old_checkpoint_state.urns
) )
def compute_percent_entities_changed( def compute_percent_entities_changed(
new_old_entity_list: List[Tuple[List[str], List[str]]] new_entities: List[str], old_entities: List[str]
) -> float: ) -> float:
old_count_all = 0 (overlap_count, old_count, _,) = _get_entity_overlap_and_cardinalities(
overlap_count_all = 0 new_entities=new_entities, old_entities=old_entities
for new_entities, old_entities in new_old_entity_list: )
(overlap_count, old_count, _,) = get_entity_overlap_and_cardinalities(
new_entities=new_entities, old_entities=old_entities if old_count:
) return (1 - overlap_count / old_count) * 100.0
overlap_count_all += overlap_count
old_count_all += old_count
if old_count_all:
return (1 - overlap_count_all / old_count_all) * 100.0
return 0.0 return 0.0
def get_entity_overlap_and_cardinalities( def _get_entity_overlap_and_cardinalities(
new_entities: List[str], old_entities: List[str] new_entities: List[str], old_entities: List[str]
) -> Tuple[int, int, int]: ) -> Tuple[int, int, int]:
new_set = set(new_entities) new_set = set(new_entities)

View File

@ -208,7 +208,7 @@ class StaleEntityRemovalHandler(
if not self.is_checkpointing_enabled() or self._ignore_old_state(): if not self.is_checkpointing_enabled() or self._ignore_old_state():
return return
logger.debug("Checking for stale entity removal.") logger.debug("Checking for stale entity removal.")
last_checkpoint: Optional[Checkpoint] = self.state_provider.get_last_checkpoint( last_checkpoint = self.state_provider.get_last_checkpoint(
self.job_id, self.state_type_class self.job_id, self.state_type_class
) )
if not last_checkpoint: if not last_checkpoint:
@ -216,14 +216,15 @@ class StaleEntityRemovalHandler(
cur_checkpoint = self.state_provider.get_current_checkpoint(self.job_id) cur_checkpoint = self.state_provider.get_current_checkpoint(self.job_id)
assert cur_checkpoint is not None assert cur_checkpoint is not None
# Get the underlying states # Get the underlying states
last_checkpoint_state = cast(GenericCheckpointState, last_checkpoint.state) last_checkpoint_state: GenericCheckpointState = last_checkpoint.state
cur_checkpoint_state = cast(GenericCheckpointState, cur_checkpoint.state) cur_checkpoint_state = cast(GenericCheckpointState, cur_checkpoint.state)
assert self.stateful_ingestion_config
# Check if the entity delta is below the fail-safe threshold. # Check if the entity delta is below the fail-safe threshold.
entity_difference_percent = cur_checkpoint_state.get_percent_entities_changed( entity_difference_percent = cur_checkpoint_state.get_percent_entities_changed(
last_checkpoint_state last_checkpoint_state
) )
assert self.stateful_ingestion_config
if ( if (
entity_difference_percent entity_difference_percent
> self.stateful_ingestion_config.fail_safe_threshold > self.stateful_ingestion_config.fail_safe_threshold
@ -234,11 +235,30 @@ class StaleEntityRemovalHandler(
): ):
# Log the failure. This would prevent the current state from getting committed. # Log the failure. This would prevent the current state from getting committed.
self.source.get_report().report_failure( self.source.get_report().report_failure(
"Stateful Ingestion", "stale-entity-removal",
f"Will not soft-delete entities, since we'd be deleting {entity_difference_percent:.1f}% of the existing entities. " f"Will not soft-delete entities, since we'd be deleting {entity_difference_percent:.1f}% of the existing entities. "
f"To force a deletion, increase the value of 'stateful_ingestion.fail_safe_threshold' (currently {self.stateful_ingestion_config.fail_safe_threshold})", f"To force a deletion, increase the value of 'stateful_ingestion.fail_safe_threshold' (currently {self.stateful_ingestion_config.fail_safe_threshold})",
) )
# Bail so that we don't emit the stale entity removal workunits. return
if self.source.get_report().events_produced == 0:
# SUBTLE: By reporting this as a failure here, we also ensure that the
# new (empty) state doesn't get committed.
# TODO: Move back to using fail_safe_threshold once we're confident that we've squashed all the bugs.
self.source.get_report().report_failure(
"stale-entity-removal",
"Skipping stale entity soft-deletion because the source produced no events. "
"This is a fail-safe mechanism to prevent accidental deletion of all entities.",
)
return
# If the source already had a failure, skip soft-deletion.
# TODO: Eventually, switch this to check if anything in the pipeline had a failure so far.
if self.source.get_report().failures:
self.source.get_report().report_warning(
"stale-entity-removal",
"Skipping stale entity soft-deletion since source already had failures.",
)
return return
# Everything looks good, emit the soft-deletion workunits # Everything looks good, emit the soft-deletion workunits

View File

@ -321,7 +321,7 @@ class StateProviderWrapper:
# Base-class implementations for common state management tasks. # Base-class implementations for common state management tasks.
def get_last_checkpoint( def get_last_checkpoint(
self, job_id: JobId, checkpoint_state_class: Type[StateType] self, job_id: JobId, checkpoint_state_class: Type[StateType]
) -> Optional[Checkpoint]: ) -> Optional[Checkpoint[StateType]]:
if not self.is_stateful_ingestion_configured() or ( if not self.is_stateful_ingestion_configured() or (
self.stateful_ingestion_config self.stateful_ingestion_config
and self.stateful_ingestion_config.ignore_old_state and self.stateful_ingestion_config.ignore_old_state

View File

@ -11,7 +11,7 @@ from datahub.ingestion.api.ingestion_job_checkpointing_provider_base import (
JobId, JobId,
) )
from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph
from datahub.metadata.schema_classes import DatahubIngestionCheckpointClass, StatusClass from datahub.metadata.schema_classes import DatahubIngestionCheckpointClass
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -115,14 +115,11 @@ class DatahubIngestionCheckpointingProvider(IngestionCheckpointingProviderBase):
job_name, job_name,
) )
self.graph.emit_mcp( # We don't want the state payloads to show up in search. As such, we emit the
# We don't want the state payloads to show up in search. As such, we emit the # dataJob aspects as soft-deleted. This doesn't affect the ability to query
# dataJob aspects as soft-deleted. This doesn't affect the ability to query # them using the timeseries API.
# them using the timeseries API. self.graph.soft_delete_entity(
MetadataChangeProposalWrapper( urn=datajob_urn,
entityUrn=datajob_urn,
aspect=StatusClass(removed=True),
)
) )
self.graph.emit_mcp( self.graph.emit_mcp(
MetadataChangeProposalWrapper( MetadataChangeProposalWrapper(

View File

@ -6,42 +6,43 @@ from datahub.ingestion.source.state.entity_removal_state import (
compute_percent_entities_changed, compute_percent_entities_changed,
) )
EntList = List[str]
OldNewEntLists = List[Tuple[List[str], List[str]]] OldNewEntLists = List[Tuple[List[str], List[str]]]
old_new_ent_tests: Dict[str, Tuple[OldNewEntLists, float]] = { new_old_ent_tests: Dict[str, Tuple[EntList, EntList, float]] = {
"no_change_empty_old_and_new": ([([], [])], 0.0), "no_change_empty_old_and_new": ([], [], 0.0),
"no_change_empty_old_and_non_empty_new": ([(["a"], [])], 0.0), "no_change_empty_old_and_non_empty_new": (["a"], [], 0.0),
"no_change_non_empty_old_new_equals_old": ( "no_change_non_empty_old_new_equals_old": (
[(["a", "b", "c"], ["c", "b", "a"])], ["a", "b", "c"],
["c", "b", "a"],
0.0, 0.0,
), ),
"no_change_non_empty_old_new_superset_old": ( "no_change_non_empty_old_new_superset_old": (
[(["a", "b", "c", "d"], ["c", "b", "a"])], ["a", "b", "c", "d"],
["c", "b", "a"],
0.0, 0.0,
), ),
"change_25_percent_delta": ([(["a", "b", "c"], ["d", "c", "b", "a"])], 25.0), "change_25_percent_delta": (["a", "b", "c"], ["d", "c", "b", "a"], 25.0),
"change_50_percent_delta": ( "change_50_percent_delta": (
[ ["b", "a"],
( ["a", "b", "c", "d"],
["b", "a"],
["a", "b", "c", "d"],
)
],
50.0, 50.0,
), ),
"change_75_percent_delta": ([(["a"], ["a", "b", "c", "d"])], 75.0), "change_75_percent_delta": (["a"], ["a", "b", "c", "d"], 75.0),
"change_100_percent_delta_empty_new": ([([], ["a", "b", "c", "d"])], 100.0), "change_100_percent_delta_empty_new": ([], ["a", "b", "c", "d"], 100.0),
"change_100_percent_delta_non_empty_new": ([(["e"], ["a", "b", "c", "d"])], 100.0), "change_100_percent_delta_non_empty_new": (["e"], ["a", "b", "c", "d"], 100.0),
} }
@pytest.mark.parametrize( @pytest.mark.parametrize(
"new_old_entity_list, expected_percent_change", "new_entities, old_entities, expected_percent_change",
old_new_ent_tests.values(), new_old_ent_tests.values(),
ids=old_new_ent_tests.keys(), ids=new_old_ent_tests.keys(),
) )
def test_change_percent( def test_change_percent(
new_old_entity_list: OldNewEntLists, expected_percent_change: float new_entities: EntList, old_entities: EntList, expected_percent_change: float
) -> None: ) -> None:
actual_percent_change = compute_percent_entities_changed(new_old_entity_list) actual_percent_change = compute_percent_entities_changed(
new_entities=new_entities, old_entities=old_entities
)
assert actual_percent_change == expected_percent_change assert actual_percent_change == expected_percent_change