datahub/metadata-ingestion/examples/library/data_process_instance_create_from_dataflow.py

import os
import time
from datahub.api.entities.datajob import DataFlow
from datahub.api.entities.dataprocess.dataprocess_instance import (
    DataProcessInstance,
    InstanceRunResult,
)
from datahub.emitter.rest_emitter import DatahubRestEmitter

emitter = DatahubRestEmitter(
    gms_server=os.getenv("DATAHUB_GMS_URL", "http://localhost:8080"),
    token=os.getenv("DATAHUB_GMS_TOKEN"),
)
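
# Optional sanity check (not in the original example): test_connection()
# raises if the GMS endpoint is unreachable or the token is rejected.
emitter.test_connection()
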
# Define the DataFlow (Airflow DAG)
dataflow = DataFlow(
    orchestrator="airflow",
    id="daily_reporting_pipeline",
    env="prod",
    description="Daily reporting pipeline that aggregates metrics",
)
# Create a DataProcessInstance for a specific DAG run
dag_run_instance = DataProcessInstance.from_dataflow(
    dataflow=dataflow, id="scheduled__2024-01-15T00:00:00+00:00"
)
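# Note: from_dataflow links this instance to the DataFlow (as its template)
# and derives a deterministic URN from the orchestrator, cluster, and run id,
# so re-emitting the same run id updates the existing instance instead of
# creating a duplicate.
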
# Set properties specific to this DAG run
dag_run_instance.properties = {
    "execution_date": "2024-01-15",
    "run_type": "scheduled",
    "external_trigger": "false",
}
dag_run_instance.url = "https://airflow.company.com/dags/daily_reporting_pipeline/grid?dag_run_id=scheduled__2024-01-15T00:00:00+00:00"
# Track DAG run start
start_time = int(time.time() * 1000)
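# emit_process_start records the instance and a "started" run event; with
# emit_template=True it also (re)emits the parent DataFlow, and
# materialize_iolets=True ensures any inlet/outlet datasets referenced by
# the instance exist in DataHub.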
dag_run_instance.emit_process_start(
    emitter=emitter,
    start_timestamp_millis=start_time,
    attempt=1,
    emit_template=True,
    materialize_iolets=True,
)
print(f"DAG run started: {dag_run_instance.urn}")
# Simulate DAG execution
time.sleep(3)
# Track DAG run completion
end_time = int(time.time() * 1000)
dag_run_instance.emit_process_end(
    emitter=emitter,
    end_timestamp_millis=end_time,
    result=InstanceRunResult.SUCCESS,
    result_type="airflow",
    attempt=1,
    start_timestamp_millis=start_time,
)
print(f"DAG run completed successfully in {end_time - start_time}ms")