perf(ingestion/fivetran): Connector performance optimization (#10556)

Shubham Jagtap 2024-06-12 08:49:57 +05:30 committed by GitHub
parent 52ac3143a4
commit 05aee03f3f
GPG Key ID: B5690EEEBB952194 (no known key found for this signature in database)
13 changed files with 201 additions and 60 deletions

View File

@@ -102,7 +102,6 @@ def auto_status_aspect(
     """
     all_urns: Set[str] = set()
     status_urns: Set[str] = set()
-    skip_urns: Set[str] = set()
     for wu in stream:
         urn = wu.get_urn()
         all_urns.add(urn)
@@ -127,14 +126,13 @@ def auto_status_aspect(
             yield wu

-    for urn in sorted(all_urns - status_urns - skip_urns):
+    for urn in sorted(all_urns - status_urns):
         entity_type = guess_entity_type(urn)
         if not entity_supports_aspect(entity_type, StatusClass):
             # If any entity does not support aspect 'status' then skip that entity from adding status aspect.
             # Example like dataProcessInstance doesn't suppport status aspect.
             # If not skipped gives error: java.lang.RuntimeException: Unknown aspect status for entity dataProcessInstance
             continue
         yield MetadataChangeProposalWrapper(
             entityUrn=urn,
             aspect=StatusClass(removed=False),

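With skip_urns gone, auto_status_aspect decides inline whether a urn's entity type can carry a status aspect at all. A minimal sketch of the resulting behaviour, assuming the datahub package is installed (the helper function name below is made up for illustration; the calls it uses are the ones visible in this commit):

from typing import Iterable, Set

from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.emitter.mcp_builder import entity_supports_aspect
from datahub.metadata.schema_classes import StatusClass
from datahub.utilities.urns.urn import guess_entity_type


def emit_missing_status_aspects(
    all_urns: Set[str], status_urns: Set[str]
) -> Iterable[MetadataChangeProposalWrapper]:
    # Emit Status(removed=False) only for urns that did not already get a status
    # aspect, and only for entity types that support it (dataProcessInstance does not).
    for urn in sorted(all_urns - status_urns):
        if not entity_supports_aspect(guess_entity_type(urn), StatusClass):
            continue
        yield MetadataChangeProposalWrapper(
            entityUrn=urn, aspect=StatusClass(removed=False)
        )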
View File

@@ -9,6 +9,7 @@ from typing_extensions import Literal
 from datahub.configuration.common import AllowDenyPattern, ConfigModel
 from datahub.configuration.source_common import DEFAULT_ENV, DatasetSourceConfigMixin
 from datahub.configuration.validate_field_rename import pydantic_renamed_field
+from datahub.ingestion.api.report import Report
 from datahub.ingestion.source.bigquery_v2.bigquery_config import (
     BigQueryConnectionConfig,
 )
@@ -20,6 +21,7 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionConfigBase,
 )
 from datahub.ingestion.source_config.sql.snowflake import BaseSnowflakeConfig
+from datahub.utilities.perf_timer import PerfTimer

 logger = logging.getLogger(__name__)
@@ -110,10 +112,26 @@ class FivetranLogConfig(ConfigModel):
         return values


+@dataclass
+class MetadataExtractionPerfReport(Report):
+    connectors_metadata_extraction_sec: PerfTimer = dataclass_field(
+        default_factory=PerfTimer
+    )
+    connectors_lineage_extraction_sec: PerfTimer = dataclass_field(
+        default_factory=PerfTimer
+    )
+    connectors_jobs_extraction_sec: PerfTimer = dataclass_field(
+        default_factory=PerfTimer
+    )
+
+
 @dataclass
 class FivetranSourceReport(StaleEntityRemovalSourceReport):
     connectors_scanned: int = 0
     filtered_connectors: List[str] = dataclass_field(default_factory=list)
+    metadata_extraction_perf: MetadataExtractionPerfReport = dataclass_field(
+        default_factory=MetadataExtractionPerfReport
+    )

     def report_connectors_scanned(self, count: int = 1) -> None:
         self.connectors_scanned += count
@@ -163,3 +181,7 @@ class FivetranSourceConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin
         default={},
         description="A mapping of destination dataset to platform instance. Use destination id as key.",
     )
+    history_sync_lookback_period: int = pydantic.Field(
+        7,
+        description="The number of days to look back when extracting connectors' sync history.",
+    )

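The three PerfTimer fields above are entered as context managers around the metadata, lineage, and job extraction phases later in this commit, so the ingestion report shows where the time went. A self-contained sketch of that timing pattern, using a hypothetical PhaseTimer stand-in rather than datahub's PerfTimer:

import time
from dataclasses import dataclass, field


class PhaseTimer:
    # Illustrative stand-in for a context-manager timer; not datahub's PerfTimer.
    def __init__(self) -> None:
        self.elapsed: float = 0.0

    def __enter__(self) -> "PhaseTimer":
        self._start = time.monotonic()
        return self

    def __exit__(self, *exc: object) -> None:
        self.elapsed += time.monotonic() - self._start


@dataclass
class ExtractionPerf:
    metadata_sec: PhaseTimer = field(default_factory=PhaseTimer)
    lineage_sec: PhaseTimer = field(default_factory=PhaseTimer)


perf = ExtractionPerf()
with perf.metadata_sec:
    time.sleep(0.01)  # stand-in for the connector metadata queries
with perf.lineage_sec:
    time.sleep(0.02)  # stand-in for the lineage queries
print(f"metadata: {perf.metadata_sec.elapsed:.3f}s, lineage: {perf.lineage_sec.elapsed:.3f}s")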
View File

@@ -1,5 +1,5 @@
 from dataclasses import dataclass
-from typing import List, Optional
+from typing import List


 @dataclass
@@ -23,7 +23,7 @@ class Connector:
     paused: bool
     sync_frequency: int
     destination_id: str
-    user_email: Optional[str]
+    user_id: str
     table_lineage: List[TableLineage]
     jobs: List["Job"]

View File

@@ -173,11 +173,12 @@ class FivetranSource(StatefulIngestionSourceBase):
             env=self.config.env,
             platform_instance=self.config.platform_instance,
         )
+        owner_email = self.audit_log.get_user_email(connector.user_id)
         datajob = DataJob(
             id=connector.connector_id,
             flow_urn=dataflow_urn,
             name=connector.connector_name,
-            owners={connector.user_email} if connector.user_email else set(),
+            owners={owner_email} if owner_email else set(),
         )

         job_property_bag: Dict[str, str] = {}
@@ -281,7 +282,9 @@ class FivetranSource(StatefulIngestionSourceBase):
         """
         logger.info("Fivetran plugin execution is started")
         connectors = self.audit_log.get_allowed_connectors_list(
-            self.config.connector_patterns, self.report
+            self.config.connector_patterns,
+            self.report,
+            self.config.history_sync_lookback_period,
         )
         for connector in connectors:
             logger.info(f"Processing connector id: {connector.connector_id}")

View File

@@ -1,3 +1,4 @@
+import functools
 import json
 import logging
 from typing import Any, Dict, List, Optional, Tuple
@@ -151,9 +152,14 @@ class FivetranLogAPI:
         return table_lineage_list

-    def _get_all_connector_sync_logs(self) -> Dict[str, Dict]:
+    def _get_all_connector_sync_logs(self, syncs_interval: int) -> Dict[str, Dict]:
         sync_logs = {}
-        for row in self._query(self.fivetran_log_query.get_sync_logs_query()):
+        for row in self._query(
+            self.fivetran_log_query.get_sync_logs_query().format(
+                db_clause=self.fivetran_log_query.db_clause,
+                syncs_interval=syncs_interval,
+            )
+        ):
             if row[Constant.CONNECTOR_ID] not in sync_logs:
                 sync_logs[row[Constant.CONNECTOR_ID]] = {
                     row[Constant.SYNC_ID]: {
@@ -208,50 +214,62 @@ class FivetranLogAPI:
         )
         return jobs

-    def _get_user_email(self, user_id: Optional[str]) -> Optional[str]:
-        if not user_id:
-            return None
-        user_details = self._query(
-            self.fivetran_log_query.get_user_query(user_id=user_id)
-        )
-        if not user_details:
-            return None
-        return f"{user_details[0][Constant.EMAIL]}"
-
-    def get_allowed_connectors_list(
-        self, connector_patterns: AllowDenyPattern, report: FivetranSourceReport
-    ) -> List[Connector]:
-        connectors: List[Connector] = []
-        sync_logs = self._get_all_connector_sync_logs()
-        table_lineage_metadata = self._get_connectors_table_lineage_metadata()
-        column_lineage_metadata = self._get_column_lineage_metadata()
-        connector_list = self._query(self.fivetran_log_query.get_connectors_query())
-        for connector in connector_list:
-            if not connector_patterns.allowed(connector[Constant.CONNECTOR_NAME]):
-                report.report_connectors_dropped(connector[Constant.CONNECTOR_NAME])
-                continue
-            connectors.append(
-                Connector(
-                    connector_id=connector[Constant.CONNECTOR_ID],
-                    connector_name=connector[Constant.CONNECTOR_NAME],
-                    connector_type=connector[Constant.CONNECTOR_TYPE_ID],
-                    paused=connector[Constant.PAUSED],
-                    sync_frequency=connector[Constant.SYNC_FREQUENCY],
-                    destination_id=connector[Constant.DESTINATION_ID],
-                    user_email=self._get_user_email(
-                        connector[Constant.CONNECTING_USER_ID]
-                    ),
-                    table_lineage=self._get_table_lineage(
-                        column_lineage_metadata=column_lineage_metadata,
-                        table_lineage_result=table_lineage_metadata.get(
-                            connector[Constant.CONNECTOR_ID]
-                        ),
-                    ),
-                    jobs=self._get_jobs_list(
-                        sync_logs.get(connector[Constant.CONNECTOR_ID])
-                    ),
-                )
-            )
+    @functools.lru_cache()
+    def _get_users(self) -> Dict[str, str]:
+        users = self._query(self.fivetran_log_query.get_users_query())
+        if not users:
+            return {}
+        return {user[Constant.USER_ID]: user[Constant.EMAIL] for user in users}
+
+    def get_user_email(self, user_id: str) -> Optional[str]:
+        if not user_id:
+            return None
+        return self._get_users().get(user_id)
+
+    def _fill_connectors_table_lineage(self, connectors: List[Connector]) -> None:
+        table_lineage_metadata = self._get_connectors_table_lineage_metadata()
+        column_lineage_metadata = self._get_column_lineage_metadata()
+        for connector in connectors:
+            connector.table_lineage = self._get_table_lineage(
+                column_lineage_metadata=column_lineage_metadata,
+                table_lineage_result=table_lineage_metadata.get(connector.connector_id),
+            )
+
+    def _fill_connectors_jobs(
+        self, connectors: List[Connector], syncs_interval: int
+    ) -> None:
+        sync_logs = self._get_all_connector_sync_logs(syncs_interval)
+        for connector in connectors:
+            connector.jobs = self._get_jobs_list(sync_logs.get(connector.connector_id))
+
+    def get_allowed_connectors_list(
+        self,
+        connector_patterns: AllowDenyPattern,
+        report: FivetranSourceReport,
+        syncs_interval: int,
+    ) -> List[Connector]:
+        connectors: List[Connector] = []
+        with report.metadata_extraction_perf.connectors_metadata_extraction_sec:
+            connector_list = self._query(self.fivetran_log_query.get_connectors_query())
+            for connector in connector_list:
+                if not connector_patterns.allowed(connector[Constant.CONNECTOR_NAME]):
+                    report.report_connectors_dropped(connector[Constant.CONNECTOR_NAME])
+                    continue
+                connectors.append(
+                    Connector(
+                        connector_id=connector[Constant.CONNECTOR_ID],
+                        connector_name=connector[Constant.CONNECTOR_NAME],
+                        connector_type=connector[Constant.CONNECTOR_TYPE_ID],
+                        paused=connector[Constant.PAUSED],
+                        sync_frequency=connector[Constant.SYNC_FREQUENCY],
+                        destination_id=connector[Constant.DESTINATION_ID],
+                        user_id=connector[Constant.CONNECTING_USER_ID],
+                        table_lineage=[],
+                        jobs=[],
+                    )
+                )
+        with report.metadata_extraction_perf.connectors_lineage_extraction_sec:
+            self._fill_connectors_table_lineage(connectors)
+        with report.metadata_extraction_perf.connectors_jobs_extraction_sec:
+            self._fill_connectors_jobs(connectors, syncs_interval)
         return connectors

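The core optimization in FivetranLogAPI is replacing one user query per connector with a single bulk query whose result is memoized via functools.lru_cache, plus batching lineage and job extraction into separate fill passes. A self-contained sketch of the memoized-lookup part, using an illustrative UserLookup stand-in rather than the real class:

import functools
from typing import Dict, Optional


class UserLookup:
    # Illustrative stand-in: fetch the whole user table once, answer lookups from memory.
    def __init__(self, rows: Dict[str, str]) -> None:
        self._rows = rows  # pretend this is the user table in the Fivetran log schema
        self.queries_run = 0

    @functools.lru_cache()  # memoizes the single bulk fetch (keyed by self)
    def _get_users(self) -> Dict[str, str]:
        self.queries_run += 1
        return dict(self._rows)

    def get_user_email(self, user_id: str) -> Optional[str]:
        if not user_id:
            return None
        return self._get_users().get(user_id)


lookup = UserLookup({"reapply_phone": "abc.xyz@email.com"})
for _ in range(100):
    lookup.get_user_email("reapply_phone")
assert lookup.queries_run == 1  # one query no matter how many connectors are processed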
View File

@@ -21,24 +21,24 @@ class FivetranLogQuery:
         FROM {self.db_clause}connector
         WHERE _fivetran_deleted = FALSE"""

-    def get_user_query(self, user_id: str) -> str:
+    def get_users_query(self) -> str:
         return f"""
         SELECT id as user_id,
         given_name,
         family_name,
         email
-        FROM {self.db_clause}user
-        WHERE id = '{user_id}'"""
+        FROM {self.db_clause}user"""

     def get_sync_logs_query(self) -> str:
-        return f"""
+        return """
         SELECT connector_id,
         sync_id,
         message_event,
         message_data,
         time_stamp
-        FROM {self.db_clause}log
-        WHERE message_event in ('sync_start', 'sync_end')"""
+        FROM {db_clause}log
+        WHERE message_event in ('sync_start', 'sync_end')
+        and time_stamp > CURRENT_TIMESTAMP - INTERVAL '{syncs_interval} days'"""

     def get_table_lineage_query(self) -> str:
         return f"""

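Because get_sync_logs_query() now returns a plain (non f-string) template, the caller substitutes both the schema prefix and the lookback window before executing it, so only recent sync events are scanned. A small sketch of how the template expands; the db_clause value 'fivetran_log.' is illustrative, and 7 mirrors the default of history_sync_lookback_period:

SYNC_LOGS_TEMPLATE = """
SELECT connector_id,
       sync_id,
       message_event,
       message_data,
       time_stamp
FROM {db_clause}log
WHERE message_event in ('sync_start', 'sync_end')
and time_stamp > CURRENT_TIMESTAMP - INTERVAL '{syncs_interval} days'"""

# Expand the template the same way the connector does before running the query.
sql = SYNC_LOGS_TEMPLATE.format(db_clause="fivetran_log.", syncs_interval=7)
print(sql)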
View File

@@ -6,6 +6,7 @@ from typing import Dict, Iterable, Optional, Set, Type, cast
 import pydantic

 from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.emitter.mcp_builder import entity_supports_aspect
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.ingestion_job_checkpointing_provider_base import JobId
 from datahub.ingestion.api.source_helpers import auto_stale_entity_removal
@@ -23,6 +24,7 @@ from datahub.ingestion.source.state.use_case_handler import (
 )
 from datahub.metadata.schema_classes import StatusClass
 from datahub.utilities.lossy_collections import LossyList
+from datahub.utilities.urns.urn import guess_entity_type

 logger: logging.Logger = logging.getLogger(__name__)
@@ -48,10 +50,14 @@ class StatefulStaleMetadataRemovalConfig(StatefulIngestionConfig):
 @dataclass
 class StaleEntityRemovalSourceReport(StatefulIngestionReport):
     soft_deleted_stale_entities: LossyList[str] = field(default_factory=LossyList)
+    last_state_non_deletable_entities: LossyList[str] = field(default_factory=LossyList)

     def report_stale_entity_soft_deleted(self, urn: str) -> None:
         self.soft_deleted_stale_entities.append(urn)

+    def report_last_state_non_deletable_entities(self, urn: str) -> None:
+        self.last_state_non_deletable_entities.append(urn)
+

 class StaleEntityRemovalHandler(
     StatefulIngestionUsecaseHandlerBase["GenericCheckpointState"]
@@ -272,11 +278,19 @@ class StaleEntityRemovalHandler(
                 self.add_entity_to_state("", urn)
             return

+        report = self.source.get_report()
+        assert isinstance(report, StaleEntityRemovalSourceReport)
+
         # Everything looks good, emit the soft-deletion workunits
         for urn in last_checkpoint_state.get_urns_not_in(
             type="*", other_checkpoint_state=cur_checkpoint_state
         ):
+            if not entity_supports_aspect(guess_entity_type(urn), StatusClass):
+                # If any entity does not support aspect 'status' then skip that entity urn
+                report.report_last_state_non_deletable_entities(urn)
+                continue
             if urn in self._urns_to_skip:
+                report.report_last_state_non_deletable_entities(urn)
                 logger.debug(
                     f"Not soft-deleting entity {urn} since it is in urns_to_skip"
                 )

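The soft-deletion loop now partitions stale urns: entity types that cannot carry a status aspect, and urns explicitly marked to skip, are recorded on the report as non-deletable instead of being soft-deleted. A rough sketch of that guard, assuming the datahub package is installed (the helper function below is illustrative, not the handler's method):

from typing import Iterable, Iterator, List, Set

from datahub.emitter.mcp_builder import entity_supports_aspect
from datahub.metadata.schema_classes import StatusClass
from datahub.utilities.urns.urn import guess_entity_type


def urns_to_soft_delete(
    stale_urns: Iterable[str],
    urns_to_skip: Set[str],
    non_deletable_report: List[str],
) -> Iterator[str]:
    for urn in stale_urns:
        if not entity_supports_aspect(guess_entity_type(urn), StatusClass):
            # e.g. dataProcessInstance has no status aspect, so it cannot be soft-deleted
            non_deletable_report.append(urn)
            continue
        if urn in urns_to_skip:
            non_deletable_report.append(urn)
            continue
        yield urn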
View File

@@ -89,7 +89,7 @@ def default_query_results(
                 "destination_column_name": "name",
             },
         ]
-    elif query == fivetran_log_query.get_user_query("reapply_phone"):
+    elif query == fivetran_log_query.get_users_query():
        return [
            {
                "user_id": "reapply_phone",
@@ -98,7 +98,9 @@ def default_query_results(
                "email": "abc.xyz@email.com",
            }
        ]
-    elif query == fivetran_log_query.get_sync_logs_query():
+    elif query == fivetran_log_query.get_sync_logs_query().format(
+        db_clause=fivetran_log_query.db_clause, syncs_interval=7
+    ):
        return [
            {
                "connector_id": "calendar_elected",

View File

@@ -17,7 +17,7 @@
   "state": {
     "formatVersion": "1.0",
     "serde": "utf-8",
-    "payload": "{\"urns\": [\"urn:li:dataset:(urn:li:dataPlatform:postgres,dummy_dataset1,PROD)\", \"urn:li:dataset:(urn:li:dataPlatform:postgres,dummy_dataset2,PROD)\", \"urn:li:dataset:(urn:li:dataPlatform:postgres,dummy_dataset3,PROD)\"]}"
+    "payload": "{\"urns\": [\"urn:li:dataset:(urn:li:dataPlatform:postgres,dummy_dataset1,PROD)\", \"urn:li:dataset:(urn:li:dataPlatform:postgres,dummy_dataset2,PROD)\", \"urn:li:dataset:(urn:li:dataPlatform:postgres,dummy_dataset3,PROD)\", \"urn:li:dataProcessInstance:478810e859f870a54f72c681f41af619\"]}"
   },
   "runId": "dummy-test-stateful-ingestion"
 }

View File

@@ -17,7 +17,7 @@
   "state": {
     "formatVersion": "1.0",
     "serde": "utf-8",
-    "payload": "{\"urns\": [\"urn:li:dataset:(urn:li:dataPlatform:postgres,dummy_dataset1,PROD)\", \"urn:li:dataset:(urn:li:dataPlatform:postgres,dummy_dataset2,PROD)\"]}"
+    "payload": "{\"urns\": [\"urn:li:dataset:(urn:li:dataPlatform:postgres,dummy_dataset1,PROD)\", \"urn:li:dataset:(urn:li:dataPlatform:postgres,dummy_dataset2,PROD)\", \"urn:li:dataProcessInstance:7f26c3b4d2d82ace47f4b9dd0c9dea26\"]}"
   },
   "runId": "dummy-test-stateful-ingestion"
 }

View File

@@ -46,5 +46,27 @@
     "runId": "dummy-test-stateful-ingestion",
     "lastRunId": "no-run-id-provided"
   }
-}
+},
+{
+  "entityType": "dataProcessInstance",
+  "entityUrn": "urn:li:dataProcessInstance:478810e859f870a54f72c681f41af619",
+  "changeType": "UPSERT",
+  "aspectName": "dataProcessInstanceProperties",
+  "aspect": {
+    "json": {
+      "customProperties": {},
+      "name": "job1",
+      "type": "BATCH_SCHEDULED",
+      "created": {
+        "time": 1586847600000,
+        "actor": "urn:li:corpuser:datahub"
+      }
+    }
+  },
+  "systemMetadata": {
+    "lastObserved": 1586847600000,
+    "runId": "dummy-test-stateful-ingestion",
+    "lastRunId": "no-run-id-provided"
+  }
+}
 ]

View File

@@ -31,6 +31,28 @@
     "lastRunId": "no-run-id-provided"
   }
 },
+{
+  "entityType": "dataProcessInstance",
+  "entityUrn": "urn:li:dataProcessInstance:7f26c3b4d2d82ace47f4b9dd0c9dea26",
+  "changeType": "UPSERT",
+  "aspectName": "dataProcessInstanceProperties",
+  "aspect": {
+    "json": {
+      "customProperties": {},
+      "name": "job2",
+      "type": "BATCH_SCHEDULED",
+      "created": {
+        "time": 1586847600000,
+        "actor": "urn:li:corpuser:datahub"
+      }
+    }
+  },
+  "systemMetadata": {
+    "lastObserved": 1586847600000,
+    "runId": "dummy-test-stateful-ingestion",
+    "lastRunId": "no-run-id-provided"
+  }
+},
 {
   "entityType": "dataset",
   "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,dummy_dataset3,PROD)",

View File

@@ -1,3 +1,4 @@
+import time
 from dataclasses import dataclass, field as dataclass_field
 from typing import Any, Dict, Iterable, List, Optional, cast
 from unittest import mock
@@ -7,6 +8,7 @@ import pytest
 from freezegun import freeze_time
 from pydantic import Field

+from datahub.api.entities.dataprocess.dataprocess_instance import DataProcessInstance
 from datahub.configuration.common import AllowDenyPattern
 from datahub.configuration.source_common import DEFAULT_ENV, DatasetSourceConfigMixin
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
@@ -24,7 +26,10 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionConfigBase,
     StatefulIngestionSourceBase,
 )
-from datahub.metadata.schema_classes import StatusClass
+from datahub.metadata.com.linkedin.pegasus2avro.dataprocess import (
+    DataProcessInstanceProperties,
+)
+from datahub.metadata.schema_classes import AuditStampClass, StatusClass
 from datahub.utilities.urns.dataset_urn import DatasetUrn
 from tests.test_helpers import mce_helpers
 from tests.test_helpers.state_helpers import (
@@ -62,6 +67,10 @@ class DummySourceConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin):
         default=False,
         description="Should this dummy source report a failure.",
     )
+    dpi_id_to_ingest: Optional[str] = Field(
+        default=None,
+        description="Data process instance id to ingest.",
+    )


 class DummySource(StatefulIngestionSourceBase):
@@ -109,6 +118,24 @@ class DummySource(StatefulIngestionSourceBase):
                 aspect=StatusClass(removed=False),
             ).as_workunit()

+        if self.source_config.dpi_id_to_ingest:
+            dpi = DataProcessInstance(
+                id=self.source_config.dpi_id_to_ingest,
+                orchestrator="dummy",
+            )
+            yield MetadataChangeProposalWrapper(
+                entityUrn=str(dpi.urn),
+                aspect=DataProcessInstanceProperties(
+                    name=dpi.id,
+                    created=AuditStampClass(
+                        time=int(time.time() * 1000),
+                        actor="urn:li:corpuser:datahub",
+                    ),
+                    type=dpi.type,
+                ),
+            ).as_workunit()
+
         if self.source_config.report_failure:
             self.reporter.report_failure("Dummy error", "Error")
@@ -152,6 +179,7 @@ def test_stateful_ingestion(pytestconfig, tmp_path, mock_time):
                 "stateful_ingestion": {
                     "enabled": True,
                     "remove_stale_metadata": True,
+                    "fail_safe_threshold": 100,
                     "state_provider": {
                         "type": "file",
                         "config": {
@@ -159,6 +187,7 @@ def test_stateful_ingestion(pytestconfig, tmp_path, mock_time):
                         },
                     },
                 },
+                "dpi_id_to_ingest": "job1",
             },
         },
         "sink": {
@@ -207,6 +236,7 @@ def test_stateful_ingestion(pytestconfig, tmp_path, mock_time):
     pipeline_run2_config["source"]["config"]["dataset_patterns"] = {
         "allow": ["dummy_dataset1", "dummy_dataset2"],
     }
+    pipeline_run2_config["source"]["config"]["dpi_id_to_ingest"] = "job2"
     pipeline_run2_config["sink"]["config"][
         "filename"
     ] = f"{tmp_path}/{output_file_name_after_deleted}"
@@ -253,6 +283,16 @@ def test_stateful_ingestion(pytestconfig, tmp_path, mock_time):
     ]
     assert sorted(deleted_dataset_urns) == sorted(difference_dataset_urns)

+    report = pipeline_run2.source.get_report()
+    assert isinstance(report, StaleEntityRemovalSourceReport)
+    # assert report last ingestion state non_deletable entity urns
+    non_deletable_urns: List[str] = [
+        "urn:li:dataProcessInstance:478810e859f870a54f72c681f41af619",
+    ]
+    assert sorted(non_deletable_urns) == sorted(
+        report.last_state_non_deletable_entities
+    )
+

 @freeze_time(FROZEN_TIME)
 def test_stateful_ingestion_failure(pytestconfig, tmp_path, mock_time):