	feat(ingestion): Copy urns from previous checkpoint state on ingestion failure (#10347)
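In short: when a source reports failures, the stale-entity-removal handler now copies any urns that were present in the previous checkpoint but missing from the current run into the new checkpoint, so a partial run can no longer cause entities to be soft-deleted as stale. A minimal sketch of the idea, using an illustrative stand-in type rather than DataHub's actual classes:

# Minimal sketch of the copy-forward behavior; CheckpointState here is an
# illustrative stand-in, not DataHub's GenericCheckpointState.
from dataclasses import dataclass, field
from typing import Set


@dataclass
class CheckpointState:
    urns: Set[str] = field(default_factory=set)

    def get_urns_not_in(self, other: "CheckpointState") -> Set[str]:
        return self.urns - other.urns


def finalize_state(
    last: CheckpointState, current: CheckpointState, had_failures: bool
) -> CheckpointState:
    # On failure, carry forward urns the run never saw so they are not
    # mistaken for stale entities and soft-deleted on the next run.
    if had_failures:
        current.urns |= last.get_urns_not_in(current)
    return current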
parent d08f36f14b
commit ae3f0fd5ee
@@ -26,9 +26,7 @@ class IngestionCheckpointingProviderBase(StatefulCommittable[CheckpointJobStates
     The base class for all checkpointing state provider implementations.
     """
 
-    def __init__(
-        self, name: str, commit_policy: CommitPolicy = CommitPolicy.ON_NO_ERRORS
-    ):
+    def __init__(self, name: str, commit_policy: CommitPolicy = CommitPolicy.ALWAYS):
         # Set the initial state to an empty dict.
         super().__init__(name, commit_policy, {})
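Changing the default from CommitPolicy.ON_NO_ERRORS to CommitPolicy.ALWAYS means the checkpoint provider now persists state even for failed runs; the handler changes below decide what that persisted state should contain. A rough sketch of how such a policy gate behaves (simplified, covering only the two policies named in this diff; the real decision logic lives in DataHub's committable classes):

# Simplified sketch of a commit-policy gate, not DataHub's actual code.
from enum import Enum


class CommitPolicy(Enum):
    ALWAYS = "always"
    ON_NO_ERRORS = "on_no_errors"


def should_commit(policy: CommitPolicy, error_count: int) -> bool:
    if policy is CommitPolicy.ALWAYS:
        return True  # persist checkpoint state even when the run failed
    return error_count == 0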
@@ -164,6 +164,9 @@ class StaleEntityRemovalHandler(
     def is_checkpointing_enabled(self) -> bool:
         return self.checkpointing_enabled
 
+    def _get_state_obj(self):
+        return self.state_type_class()
+
     def create_checkpoint(self) -> Optional[Checkpoint]:
         if self.is_checkpointing_enabled() and not self._ignore_new_state():
             assert self.stateful_ingestion_config is not None
@@ -172,7 +175,7 @@ class StaleEntityRemovalHandler(
                 job_name=self.job_id,
                 pipeline_name=self.pipeline_name,
                 run_id=self.run_id,
-                state=self.state_type_class(),
+                state=self._get_state_obj(),
             )
         return None
@@ -255,9 +258,13 @@ class StaleEntityRemovalHandler(
         # If the source already had a failure, skip soft-deletion.
         # TODO: Eventually, switch this to check if anything in the pipeline had a failure so far.
         if self.source.get_report().failures:
+            for urn in last_checkpoint_state.get_urns_not_in(
+                type="*", other_checkpoint_state=cur_checkpoint_state
+            ):
+                self.add_entity_to_state("", urn)
             self.source.get_report().report_warning(
                 "stale-entity-removal",
-                "Skipping stale entity soft-deletion since source already had failures.",
+                "Skipping stale entity soft-deletion and copying urns from last state since source already had failures.",
             )
             return
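The type="*" wildcard selects urns of every entity type that exist in the last checkpoint but not in the current (partial) one; each such urn is re-added to the current state. At the set level the operation is simply:

# Set-level view of get_urns_not_in(type="*", ...) followed by
# add_entity_to_state for each missing urn (illustrative values).
last_urns = {"urn:li:dataset:a", "urn:li:dataset:b", "urn:li:dataset:c"}
cur_urns = {"urn:li:dataset:a"}  # the failed run only saw one dataset

cur_urns |= last_urns - cur_urns
assert cur_urns == last_urns  # nothing gets marked stale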
@@ -16,8 +16,8 @@
             "config": "",
             "state": {
                 "formatVersion": "1.0",
-                "serde": "base85-bz2-json",
-                "payload": "LRx4!F+o`-Q(1w>5G4QrYoCBnWH=B60MH7jr`{?c0BA?5L)2-AGyu>6y;V<9hz%Mv0Bt1*)lOMzr>a0|Iq-4VtTsYONQsFPLn1EpdQS;HIy|&CvSAlRvAJwmtCEM+Rx(v_)~sVvkx3V@WX4O`=losC6yZWb2OL0@"
+                "serde": "utf-8",
+                "payload": "{\"urns\": [\"urn:li:dataset:(urn:li:dataPlatform:postgres,dummy_dataset1,PROD)\", \"urn:li:dataset:(urn:li:dataPlatform:postgres,dummy_dataset2,PROD)\", \"urn:li:dataset:(urn:li:dataPlatform:postgres,dummy_dataset3,PROD)\"]}"
             },
             "runId": "dummy-test-stateful-ingestion"
         }
@@ -16,8 +16,8 @@
             "config": "",
             "state": {
                 "formatVersion": "1.0",
-                "serde": "base85-bz2-json",
-                "payload": "LRx4!F+o`-Q(317h`0a%NgsevWH1l}0MH7jr`{?c0B9vdZ9%mLfYG4P6;f$2G%+v`9z&~6n|e(JEPC2_Iix~CA_im)jR-zsjEK*yo|HQz#IUUHtf@DYVEme-lUW9{Xmmt~y^2jCdyY95az!{$kf#WUxB"
+                "serde": "utf-8",
+                "payload": "{\"urns\": [\"urn:li:dataset:(urn:li:dataPlatform:postgres,dummy_dataset1,PROD)\", \"urn:li:dataset:(urn:li:dataPlatform:postgres,dummy_dataset2,PROD)\"]}"
             },
             "runId": "dummy-test-stateful-ingestion"
         }
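For context, a "base85-bz2-json" payload like the ones removed above is JSON that has been bz2-compressed and base85-encoded; the golden files switch to a plain utf-8 JSON payload so diffs stay human-readable. A sketch of round-tripping such a payload, assuming that encoding order:

# Sketch of encoding/decoding a "base85-bz2-json" payload, assuming the
# pipeline is json -> bz2 -> base85 (and the reverse on the way in).
import base64
import bz2
import json


def encode_payload(state: dict) -> str:
    raw = json.dumps(state).encode("utf-8")
    return base64.b85encode(bz2.compress(raw)).decode("ascii")


def decode_payload(payload: str) -> dict:
    return json.loads(bz2.decompress(base64.b85decode(payload)))


state = {"urns": ["urn:li:dataset:(urn:li:dataPlatform:postgres,dummy_dataset1,PROD)"]}
assert decode_payload(encode_payload(state)) == state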
@@ -0,0 +1,26 @@
+[
+{
+    "entityType": "dataJob",
+    "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(file,dummy_stateful,prod),default_stale_entity_removal)",
+    "changeType": "UPSERT",
+    "aspectName": "datahubIngestionCheckpoint",
+    "aspect": {
+        "json": {
+            "timestampMillis": 1586847600000,
+            "partitionSpec": {
+                "type": "FULL_TABLE",
+                "partition": "FULL_TABLE_SNAPSHOT"
+            },
+            "pipelineName": "dummy_stateful",
+            "platformInstanceId": "",
+            "config": "",
+            "state": {
+                "formatVersion": "1.0",
+                "serde": "utf-8",
+                "payload": "{\"urns\": [\"urn:li:dataset:(urn:li:dataPlatform:postgres,dummy_dataset1,PROD)\", \"urn:li:dataset:(urn:li:dataPlatform:postgres,dummy_dataset2,PROD)\", \"urn:li:dataset:(urn:li:dataPlatform:postgres,dummy_dataset3,PROD)\"]}"
+            },
+            "runId": "dummy-test-stateful-ingestion"
+        }
+    }
+}
+]
@@ -0,0 +1,26 @@
+[
+{
+    "entityType": "dataJob",
+    "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(file,dummy_stateful,prod),default_stale_entity_removal)",
+    "changeType": "UPSERT",
+    "aspectName": "datahubIngestionCheckpoint",
+    "aspect": {
+        "json": {
+            "timestampMillis": 1586847600000,
+            "partitionSpec": {
+                "type": "FULL_TABLE",
+                "partition": "FULL_TABLE_SNAPSHOT"
+            },
+            "pipelineName": "dummy_stateful",
+            "platformInstanceId": "",
+            "config": "",
+            "state": {
+                "formatVersion": "1.0",
+                "serde": "utf-8",
+                "payload": "{\"urns\": [\"urn:li:dataset:(urn:li:dataPlatform:postgres,dummy_dataset1,PROD)\", \"urn:li:dataset:(urn:li:dataPlatform:postgres,dummy_dataset2,PROD)\", \"urn:li:dataset:(urn:li:dataPlatform:postgres,dummy_dataset3,PROD)\"]}"
+            },
+            "runId": "dummy-test-stateful-ingestion"
+        }
+    }
+}
+]
@@ -0,0 +1,34 @@
+[
+{
+    "entityType": "dataset",
+    "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,dummy_dataset1,PROD)",
+    "changeType": "UPSERT",
+    "aspectName": "status",
+    "aspect": {
+        "json": {
+            "removed": false
+        }
+    },
+    "systemMetadata": {
+        "lastObserved": 1586847600000,
+        "runId": "dummy-test-stateful-ingestion",
+        "lastRunId": "no-run-id-provided"
+    }
+},
+{
+    "entityType": "dataset",
+    "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,dummy_dataset2,PROD)",
+    "changeType": "UPSERT",
+    "aspectName": "status",
+    "aspect": {
+        "json": {
+            "removed": false
+        }
+    },
+    "systemMetadata": {
+        "lastObserved": 1586847600000,
+        "runId": "dummy-test-stateful-ingestion",
+        "lastRunId": "no-run-id-provided"
+    }
+}
+]
@@ -0,0 +1,50 @@
+[
+{
+    "entityType": "dataset",
+    "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,dummy_dataset1,PROD)",
+    "changeType": "UPSERT",
+    "aspectName": "status",
+    "aspect": {
+        "json": {
+            "removed": false
+        }
+    },
+    "systemMetadata": {
+        "lastObserved": 1586847600000,
+        "runId": "dummy-test-stateful-ingestion",
+        "lastRunId": "no-run-id-provided"
+    }
+},
+{
+    "entityType": "dataset",
+    "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,dummy_dataset2,PROD)",
+    "changeType": "UPSERT",
+    "aspectName": "status",
+    "aspect": {
+        "json": {
+            "removed": false
+        }
+    },
+    "systemMetadata": {
+        "lastObserved": 1586847600000,
+        "runId": "dummy-test-stateful-ingestion",
+        "lastRunId": "no-run-id-provided"
+    }
+},
+{
+    "entityType": "dataset",
+    "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,dummy_dataset3,PROD)",
+    "changeType": "UPSERT",
+    "aspectName": "status",
+    "aspect": {
+        "json": {
+            "removed": false
+        }
+    },
+    "systemMetadata": {
+        "lastObserved": 1586847600000,
+        "runId": "dummy-test-stateful-ingestion",
+        "lastRunId": "no-run-id-provided"
+    }
+}
+]
@@ -1,7 +1,9 @@
 from dataclasses import dataclass, field as dataclass_field
 from typing import Any, Dict, Iterable, List, Optional, cast
+from unittest import mock
 
 import pydantic
 import pytest
 from freezegun import freeze_time
+from pydantic import Field
 
@@ -56,6 +58,10 @@ class DummySourceConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin):
     stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = pydantic.Field(
         default=None, description="Dummy source Ingestion Config."
     )
+    report_failure: bool = Field(
+        default=False,
+        description="Should this dummy source report a failure.",
+    )
 
 
 class DummySource(StatefulIngestionSourceBase):
@@ -103,10 +109,23 @@ class DummySource(StatefulIngestionSourceBase):
                 aspect=StatusClass(removed=False),
             ).as_workunit()
 
+        if self.source_config.report_failure:
+            self.reporter.report_failure("Dummy error", "Error")
+
     def get_report(self) -> SourceReport:
         return self.reporter
 
 
+@pytest.fixture(scope="module")
+def mock_generic_checkpoint_state():
+    with mock.patch(
+        "datahub.ingestion.source.state.entity_removal_state.GenericCheckpointState"
+    ) as mock_checkpoint_state:
+        checkpoint_state = mock_checkpoint_state.return_value
+        checkpoint_state.serde.return_value = "utf-8"
+        yield mock_checkpoint_state
+
+
 @freeze_time(FROZEN_TIME)
 def test_stateful_ingestion(pytestconfig, tmp_path, mock_time):
     # test stateful ingestion using dummy source
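Both tests pin the checkpoint state to serde="utf-8" by patching the _get_state_obj hook introduced above, which keeps the golden state files readable JSON instead of compressed payloads. Condensed from the test code below:

# Condensed from the tests: patch the new _get_state_obj hook so any
# checkpoint created during the run serializes its state as plain JSON.
from unittest import mock

from datahub.ingestion.source.state.entity_removal_state import (
    GenericCheckpointState,
)

with mock.patch(
    "datahub.ingestion.source.state.stale_entity_removal_handler."
    "StaleEntityRemovalHandler._get_state_obj"
) as mock_state:
    mock_state.return_value = GenericCheckpointState(serde="utf-8")
    # ... build and run the Pipeline here; the committed state payload
    # will be utf-8 JSON rather than base85-bz2-json.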
@@ -148,80 +167,209 @@ def test_stateful_ingestion(pytestconfig, tmp_path, mock_time):
         },
     }
 
-    pipeline_run1 = None
-    pipeline_run1_config: Dict[str, Dict[str, Dict[str, Any]]] = dict(  # type: ignore
-        base_pipeline_config  # type: ignore
-    )
-    pipeline_run1_config["sink"]["config"][
-        "filename"
-    ] = f"{tmp_path}/{output_file_name}"
-    pipeline_run1 = Pipeline.create(pipeline_run1_config)
-    pipeline_run1.run()
-    pipeline_run1.raise_from_status()
-    pipeline_run1.pretty_print_summary()
+    with mock.patch(
+        "datahub.ingestion.source.state.stale_entity_removal_handler.StaleEntityRemovalHandler._get_state_obj"
+    ) as mock_state:
+        mock_state.return_value = GenericCheckpointState(serde="utf-8")
+        pipeline_run1 = None
+        pipeline_run1_config: Dict[str, Dict[str, Dict[str, Any]]] = dict(  # type: ignore
+            base_pipeline_config  # type: ignore
+        )
+        pipeline_run1_config["sink"]["config"][
+            "filename"
+        ] = f"{tmp_path}/{output_file_name}"
+        pipeline_run1 = Pipeline.create(pipeline_run1_config)
+        pipeline_run1.run()
+        pipeline_run1.raise_from_status()
+        pipeline_run1.pretty_print_summary()
 
-    # validate both dummy source mces and checkpoint state mces files
-    mce_helpers.check_golden_file(
-        pytestconfig,
-        output_path=tmp_path / output_file_name,
-        golden_path=f"{test_resources_dir}/{golden_file_name}",
-    )
-    mce_helpers.check_golden_file(
-        pytestconfig,
-        output_path=tmp_path / state_file_name,
-        golden_path=f"{test_resources_dir}/{golden_state_file_name}",
-    )
-    checkpoint1 = get_current_checkpoint_from_pipeline(pipeline_run1)
-    assert checkpoint1
-    assert checkpoint1.state
+        # validate both dummy source mces and checkpoint state mces files
+        mce_helpers.check_golden_file(
+            pytestconfig,
+            output_path=tmp_path / output_file_name,
+            golden_path=f"{test_resources_dir}/{golden_file_name}",
+        )
+        mce_helpers.check_golden_file(
+            pytestconfig,
+            output_path=tmp_path / state_file_name,
+            golden_path=f"{test_resources_dir}/{golden_state_file_name}",
+        )
+        checkpoint1 = get_current_checkpoint_from_pipeline(pipeline_run1)
+        assert checkpoint1
+        assert checkpoint1.state
 
-    pipeline_run2 = None
-    pipeline_run2_config: Dict[str, Dict[str, Dict[str, Any]]] = dict(base_pipeline_config)  # type: ignore
-    pipeline_run2_config["source"]["config"]["dataset_patterns"] = {
-        "allow": ["dummy_dataset1", "dummy_dataset2"],
+    with mock.patch(
+        "datahub.ingestion.source.state.stale_entity_removal_handler.StaleEntityRemovalHandler._get_state_obj"
+    ) as mock_state:
+        mock_state.return_value = GenericCheckpointState(serde="utf-8")
+        pipeline_run2 = None
+        pipeline_run2_config: Dict[str, Dict[str, Dict[str, Any]]] = dict(base_pipeline_config)  # type: ignore
+        pipeline_run2_config["source"]["config"]["dataset_patterns"] = {
+            "allow": ["dummy_dataset1", "dummy_dataset2"],
+        }
+        pipeline_run2_config["sink"]["config"][
+            "filename"
+        ] = f"{tmp_path}/{output_file_name_after_deleted}"
+        pipeline_run2 = Pipeline.create(pipeline_run2_config)
+        pipeline_run2.run()
+        pipeline_run2.raise_from_status()
+        pipeline_run2.pretty_print_summary()
+
+        # validate both updated dummy source mces and checkpoint state mces files after deleting dataset
+        mce_helpers.check_golden_file(
+            pytestconfig,
+            output_path=tmp_path / output_file_name_after_deleted,
+            golden_path=f"{test_resources_dir}/{golden_file_name_after_deleted}",
+        )
+        mce_helpers.check_golden_file(
+            pytestconfig,
+            output_path=tmp_path / state_file_name,
+            golden_path=f"{test_resources_dir}/{golden_state_file_name_after_deleted}",
+        )
+        checkpoint2 = get_current_checkpoint_from_pipeline(pipeline_run2)
+        assert checkpoint2
+        assert checkpoint2.state
+
+        # Validate that all providers have committed successfully.
+        validate_all_providers_have_committed_successfully(
+            pipeline=pipeline_run1, expected_providers=1
+        )
+        validate_all_providers_have_committed_successfully(
+            pipeline=pipeline_run2, expected_providers=1
+        )
+
+        # Perform all assertions on the states. The deleted table should not be
+        # part of the second state
+        state1 = cast(GenericCheckpointState, checkpoint1.state)
+        state2 = cast(GenericCheckpointState, checkpoint2.state)
+
+        difference_dataset_urns = list(
+            state1.get_urns_not_in(type="dataset", other_checkpoint_state=state2)
+        )
+        # the difference in dataset urns is the dataset which is not allowed to ingest
+        assert len(difference_dataset_urns) == 1
+        deleted_dataset_urns: List[str] = [
+            "urn:li:dataset:(urn:li:dataPlatform:postgres,dummy_dataset3,PROD)",
+        ]
+        assert sorted(deleted_dataset_urns) == sorted(difference_dataset_urns)
+
+
+@freeze_time(FROZEN_TIME)
+def test_stateful_ingestion_failure(pytestconfig, tmp_path, mock_time):
+    # test stateful ingestion using dummy source with pipeline execution failed in second ingestion
+    state_file_name: str = "checkpoint_state_mces_failure.json"
+    golden_state_file_name: str = "golden_test_checkpoint_state_failure.json"
+    golden_state_file_name_after_deleted: str = (
+        "golden_test_checkpoint_state_after_deleted_failure.json"
+    )
+    output_file_name: str = "dummy_mces_failure.json"
+    golden_file_name: str = "golden_test_stateful_ingestion_failure.json"
+    output_file_name_after_deleted: str = (
+        "dummy_mces_stateful_after_deleted_failure.json"
+    )
+    golden_file_name_after_deleted: str = (
+        "golden_test_stateful_ingestion_after_deleted_failure.json"
+    )
+
+    test_resources_dir = pytestconfig.rootpath / "tests/unit/stateful_ingestion/state"
+
+    base_pipeline_config = {
+        "run_id": "dummy-test-stateful-ingestion",
+        "pipeline_name": "dummy_stateful",
+        "source": {
+            "type": "tests.unit.stateful_ingestion.state.test_stateful_ingestion.DummySource",
+            "config": {
+                "stateful_ingestion": {
+                    "enabled": True,
+                    "remove_stale_metadata": True,
+                    "state_provider": {
+                        "type": "file",
+                        "config": {
+                            "filename": f"{tmp_path}/{state_file_name}",
+                        },
+                    },
+                },
+            },
+        },
+        "sink": {
+            "type": "file",
+            "config": {},
+        },
     }
-    pipeline_run2_config["sink"]["config"][
-        "filename"
-    ] = f"{tmp_path}/{output_file_name_after_deleted}"
-    pipeline_run2 = Pipeline.create(pipeline_run2_config)
-    pipeline_run2.run()
-    pipeline_run2.raise_from_status()
-    pipeline_run2.pretty_print_summary()
-
-    # validate both updated dummy source mces and checkpoint state mces files after deleting dataset
-    mce_helpers.check_golden_file(
-        pytestconfig,
-        output_path=tmp_path / output_file_name_after_deleted,
-        golden_path=f"{test_resources_dir}/{golden_file_name_after_deleted}",
-    )
-    mce_helpers.check_golden_file(
-        pytestconfig,
-        output_path=tmp_path / state_file_name,
-        golden_path=f"{test_resources_dir}/{golden_state_file_name_after_deleted}",
-    )
-    checkpoint2 = get_current_checkpoint_from_pipeline(pipeline_run2)
-    assert checkpoint2
-    assert checkpoint2.state
+    with mock.patch(
+        "datahub.ingestion.source.state.stale_entity_removal_handler.StaleEntityRemovalHandler._get_state_obj"
+    ) as mock_state:
+        mock_state.return_value = GenericCheckpointState(serde="utf-8")
+        pipeline_run1 = None
+        pipeline_run1_config: Dict[str, Dict[str, Dict[str, Any]]] = dict(  # type: ignore
+            base_pipeline_config  # type: ignore
+        )
+        pipeline_run1_config["sink"]["config"][
+            "filename"
+        ] = f"{tmp_path}/{output_file_name}"
+        pipeline_run1 = Pipeline.create(pipeline_run1_config)
+        pipeline_run1.run()
+        pipeline_run1.raise_from_status()
+        pipeline_run1.pretty_print_summary()
 
-    # Validate that all providers have committed successfully.
-    validate_all_providers_have_committed_successfully(
-        pipeline=pipeline_run1, expected_providers=1
-    )
-    validate_all_providers_have_committed_successfully(
-        pipeline=pipeline_run2, expected_providers=1
-    )
+        # validate both dummy source mces and checkpoint state mces files
+        mce_helpers.check_golden_file(
+            pytestconfig,
+            output_path=tmp_path / output_file_name,
+            golden_path=f"{test_resources_dir}/{golden_file_name}",
+        )
+        mce_helpers.check_golden_file(
+            pytestconfig,
+            output_path=tmp_path / state_file_name,
+            golden_path=f"{test_resources_dir}/{golden_state_file_name}",
+        )
+        checkpoint1 = get_current_checkpoint_from_pipeline(pipeline_run1)
+        assert checkpoint1
+        assert checkpoint1.state
 
-    # Perform all assertions on the states. The deleted table should not be
-    # part of the second state
-    state1 = cast(GenericCheckpointState, checkpoint1.state)
-    state2 = cast(GenericCheckpointState, checkpoint2.state)
+    with mock.patch(
+        "datahub.ingestion.source.state.stale_entity_removal_handler.StaleEntityRemovalHandler._get_state_obj"
+    ) as mock_state:
+        mock_state.return_value = GenericCheckpointState(serde="utf-8")
+        pipeline_run2 = None
+        pipeline_run2_config: Dict[str, Dict[str, Dict[str, Any]]] = dict(base_pipeline_config)  # type: ignore
+        pipeline_run2_config["source"]["config"]["dataset_patterns"] = {
+            "allow": ["dummy_dataset1", "dummy_dataset2"],
+        }
+        pipeline_run2_config["source"]["config"]["report_failure"] = True
+        pipeline_run2_config["sink"]["config"][
+            "filename"
+        ] = f"{tmp_path}/{output_file_name_after_deleted}"
+        pipeline_run2 = Pipeline.create(pipeline_run2_config)
+        pipeline_run2.run()
+        pipeline_run2.pretty_print_summary()
 
-    difference_dataset_urns = list(
-        state1.get_urns_not_in(type="dataset", other_checkpoint_state=state2)
-    )
-    # the difference in dataset urns is the dataset which is not allowed to ingest
-    assert len(difference_dataset_urns) == 1
-    deleted_dataset_urns: List[str] = [
-        "urn:li:dataset:(urn:li:dataPlatform:postgres,dummy_dataset3,PROD)",
-    ]
-    assert sorted(deleted_dataset_urns) == sorted(difference_dataset_urns)
+        # validate both updated dummy source mces and checkpoint state mces files after deleting dataset
+        mce_helpers.check_golden_file(
+            pytestconfig,
+            output_path=tmp_path / output_file_name_after_deleted,
+            golden_path=f"{test_resources_dir}/{golden_file_name_after_deleted}",
+        )
+        mce_helpers.check_golden_file(
+            pytestconfig,
+            output_path=tmp_path / state_file_name,
+            golden_path=f"{test_resources_dir}/{golden_state_file_name_after_deleted}",
+        )
+        checkpoint2 = get_current_checkpoint_from_pipeline(pipeline_run2)
+        assert checkpoint2
+        assert checkpoint2.state
+
+        # Validate that all providers have committed successfully.
+        validate_all_providers_have_committed_successfully(
+            pipeline=pipeline_run1, expected_providers=1
+        )
+        validate_all_providers_have_committed_successfully(
+            pipeline=pipeline_run2, expected_providers=1
+        )
+
+        # Perform assertions on the states. The deleted table should be
+        # still part of the second state as pipeline run failed
+        state1 = cast(GenericCheckpointState, checkpoint1.state)
+        state2 = cast(GenericCheckpointState, checkpoint2.state)
+        assert state1 == state2
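The decisive assertion in test_stateful_ingestion_failure is state1 == state2: run 2 only ingested two of the three datasets and reported a failure, so the handler copies dummy_dataset3's urn forward from run 1's checkpoint and nothing is soft-deleted. The expectation, condensed:

# Condensed expectation from test_stateful_ingestion_failure.
run1_state = {"dummy_dataset1", "dummy_dataset2", "dummy_dataset3"}
run2_seen = {"dummy_dataset1", "dummy_dataset2"}  # run 2 failed part-way

run2_state = run2_seen | (run1_state - run2_seen)  # copy-forward on failure
assert run2_state == run1_state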