From 5e45db9e60254b1d68c4d924a41085ee8498616c Mon Sep 17 00:00:00 2001 From: matthew-coudert-cko <113023884+matthew-coudert-cko@users.noreply.github.com> Date: Thu, 15 Jun 2023 08:16:25 +0100 Subject: [PATCH] docs(airflow): add docs on custom operators (#7913) Co-authored-by: John Joyce --- docs/lineage/airflow.md | 34 +++++++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/docs/lineage/airflow.md b/docs/lineage/airflow.md index da7d4a625c..ef4071f89c 100644 --- a/docs/lineage/airflow.md +++ b/docs/lineage/airflow.md @@ -69,14 +69,41 @@ lazy_load_plugins = False ### How to validate installation -1. Go and check in Airflow at Admin -> Plugins menu if you can see the Datahub plugin +1. Go and check in Airflow at Admin -> Plugins menu if you can see the DataHub plugin 2. Run an Airflow DAG. In the task logs, you should see Datahub related log messages like: ``` -Emitting Datahub ... +Emitting DataHub ... ``` -## Using Datahub's Airflow lineage backend (deprecated) +### Emitting lineage via a custom operator to the Airflow Plugin + +If you have created a custom Airflow operator [docs](https://airflow.apache.org/docs/apache-airflow/stable/howto/custom-operator.html) that inherits from the BaseOperator class, +when overriding the `execute` function, set inlets and outlets via `context['ti'].task.inlets` and `context['ti'].task.outlets`. +The DataHub Airflow plugin will then pick up those inlets and outlets after the task runs. + + + +```python +class DbtOperator(BaseOperator): + ... + + def execute(self, context): + # do something + inlets, outlets = self._get_lineage() + # inlets/outlets are lists of either datahub_provider.entities.Dataset or datahub_provider.entities.Urn + context['ti'].task.inlets = self.inlets + context['ti'].task.outlets = self.outlets + + def _get_lineage(self): + # Do some processing to get inlets/outlets + + return inlets, outlets +``` + +If you override the `pre_execute` and `post_execute` function, ensure they include the `@prepare_lineage` and `@apply_lineage` decorators respectively. [source](https://airflow.apache.org/docs/apache-airflow/stable/administration-and-deployment/lineage.html#lineage) + +## Using DataHub's Airflow lineage backend (deprecated) :::caution @@ -145,6 +172,7 @@ Take a look at this sample DAG: In order to use this example, you must first configure the Datahub hook. Like in ingestion, we support a Datahub REST hook and a Kafka-based hook. See step 1 above for details. + ## Debugging ### Incorrect URLs