# datahub/metadata-ingestion/examples/library/data_process_instance_create_with_retry.py
#
# 91 lines
# 2.4 KiB
# Python

import os
import time
from datahub.api.entities.dataprocess.dataprocess_instance import (
DataProcessInstance,
InstanceRunResult,
)
from datahub.emitter.rest_emitter import DatahubRestEmitter
from datahub.metadata.schema_classes import DataProcessTypeClass
from datahub.utilities.urns.data_job_urn import DataJobUrn
from datahub.utilities.urns.dataset_urn import DatasetUrn
# REST emitter shared by every emit call in this script. The server URL comes
# from DATAHUB_GMS_URL (default: http://localhost:8080) and the token from
# DATAHUB_GMS_TOKEN (None when that variable is unset).
gms_url = os.getenv("DATAHUB_GMS_URL", "http://localhost:8080")
gms_token = os.getenv("DATAHUB_GMS_TOKEN")
emitter = DatahubRestEmitter(gms_server=gms_url, token=gms_token)
# URN of the DataJob that the process instance is created from (it is passed
# as `template_urn` when the instance is constructed).
job_urn_text = "urn:li:dataJob:(urn:li:dataFlow:(airflow,etl_pipeline,prod),load_customer_data)"
parent_job_urn = DataJobUrn.create_from_string(job_urn_text)
# Datasets attached to the run: one inlet (S3) and one outlet (Snowflake).
source_dataset = DatasetUrn.create_from_string(
    "urn:li:dataset:(urn:li:dataPlatform:s3,customer_exports,PROD)"
)
target_dataset = DatasetUrn.create_from_string(
    "urn:li:dataset:(urn:li:dataPlatform:snowflake,analytics.customers,PROD)"
)

# A single scheduled batch run of the parent job. The same instance object is
# reused for both attempts below; only the `attempt` number changes.
instance = DataProcessInstance(
    id="scheduled__2024-01-15T14:30:00+00:00",
    orchestrator="airflow",
    cluster="prod",
    template_urn=parent_job_urn,
    type=DataProcessTypeClass.BATCH_SCHEDULED,
    inlets=[source_dataset],
    outlets=[target_dataset],
)
# --- Attempt 1: start -------------------------------------------------------
# Timestamps are wall-clock epoch milliseconds.
attempt1_started_ms = int(time.time() * 1000)
instance.emit_process_start(
    emitter=emitter,
    start_timestamp_millis=attempt1_started_ms,
    attempt=1,
    # NOTE(review): presumably these also emit the parent job template and the
    # inlet/outlet dataset entities on the first attempt - confirm against the
    # DataProcessInstance documentation.
    emit_template=True,
    materialize_iolets=True,
)
print("Attempt 1 started...")

# Stand-in for the real work performed by the run.
time.sleep(1)

# --- Attempt 1: end, reported as UP_FOR_RETRY rather than a final result ----
attempt1_ended_ms = int(time.time() * 1000)
instance.emit_process_end(
    emitter=emitter,
    end_timestamp_millis=attempt1_ended_ms,
    result=InstanceRunResult.UP_FOR_RETRY,
    result_type="airflow",
    attempt=1,
    start_timestamp_millis=attempt1_started_ms,
)
print("Attempt 1 failed, will retry...")

# Pause before the retry begins.
time.sleep(2)
# --- Attempt 2 (retry): start -----------------------------------------------
attempt2_started_ms = int(time.time() * 1000)
instance.emit_process_start(
    emitter=emitter,
    start_timestamp_millis=attempt2_started_ms,
    attempt=2,
    # Both flags are switched off for the retry.
    # NOTE(review): presumably because attempt 1 already emitted the template
    # and the inlets/outlets - confirm.
    emit_template=False,
    materialize_iolets=False,
)
print("Attempt 2 started (retry)...")

# Stand-in for the real work performed by the run.
time.sleep(1)

# --- Attempt 2: end, reported as SUCCESS ------------------------------------
attempt2_ended_ms = int(time.time() * 1000)
instance.emit_process_end(
    emitter=emitter,
    end_timestamp_millis=attempt2_ended_ms,
    result=InstanceRunResult.SUCCESS,
    result_type="airflow",
    attempt=2,
    start_timestamp_millis=attempt2_started_ms,
)
print("Attempt 2 succeeded!")