#11626 & #14131 - Lineage with other Entities & attr-based xlets (#14191)

* Add OMEntity model

* Test OMEntity

* Update repr

* Fix __str__

* Add entity ref map

* Test serializer for backend

* Fix tests

* Fix serializer

* Test runner

* Add runner tests

* Update docs

* Format
Pere Miquel Brull 2023-12-01 06:29:44 +01:00 committed by GitHub
parent f57e429eb5
commit 7fcdf08ca4
14 changed files with 932 additions and 136 deletions

View File

@ -358,6 +358,13 @@ services:
DB_SCHEME: ${AIRFLOW_DB_SCHEME:-mysql+pymysql}
DB_USER: ${AIRFLOW_DB_USER:-airflow_user}
DB_PASSWORD: ${AIRFLOW_DB_PASSWORD:-airflow_pass}
# To test the lineage backend
# AIRFLOW__LINEAGE__BACKEND: airflow_provider_openmetadata.lineage.backend.OpenMetadataLineageBackend
# AIRFLOW__LINEAGE__AIRFLOW_SERVICE_NAME: local_airflow
# AIRFLOW__LINEAGE__OPENMETADATA_API_ENDPOINT: http://openmetadata-server:8585/api
# AIRFLOW__LINEAGE__JWT_TOKEN: ...
entrypoint: /bin/bash
command:
- "/opt/airflow/ingestion_dependency.sh"

View File

@ -24,6 +24,10 @@ from datetime import timedelta
from airflow.decorators import dag, task
from airflow.utils.dates import days_ago
from metadata.generated.schema.entity.data.container import Container
from metadata.generated.schema.entity.data.table import Table
from metadata.ingestion.source.pipeline.airflow.lineage_parser import OMEntity
default_args = {
"owner": "openmetadata_airflow_example",
"depends_on_past": False,
@ -45,7 +49,6 @@ def openmetadata_airflow_lineage_example():
inlets={
"tables": [
"sample_data.ecommerce_db.shopify.raw_order",
"sample_data.ecommerce_db.shopify.raw_customer",
],
},
outlets={"tables": ["sample_data.ecommerce_db.shopify.fact_order"]},
@ -53,7 +56,23 @@ def openmetadata_airflow_lineage_example():
def generate_data():
pass
@task(
inlets=[
OMEntity(entity=Container, fqn="s3_storage_sample.transactions", key="test")
],
outlets=[
OMEntity(
entity=Table,
fqn="sample_data.ecommerce_db.shopify.raw_order",
key="test",
)
],
)
def generate_data2():
pass
generate_data()
generate_data2()
openmetadata_airflow_lineage_example_dag = openmetadata_airflow_lineage_example()

View File

@ -114,14 +114,17 @@ base_requirements = {
"sqlalchemy>=1.4.0,<2",
"collate-sqllineage>=1.0.4",
"tabulate==0.9.0",
"typing_extensions<=4.5.0", # We need to have this fixed due to a yanked release 4.6.0
"typing_extensions>=4.8.0",
"typing-inspect",
"wheel~=0.38.4",
}
plugins: Dict[str, Set[str]] = {
"airflow": {VERSIONS["airflow"]}, # Same as ingestion container. For development.
"airflow": {
VERSIONS["airflow"],
"attrs",
}, # Same as ingestion container. For development.
"amundsen": {VERSIONS["neo4j"]},
"athena": {"pyathena==3.0.8"},
"atlas": {},

View File

@ -64,14 +64,17 @@ class OpenMetadataLineageBackend(LineageBackend):
"""
try:
dag = context["dag"]
dag.log.info("Executing OpenMetadata Lineage Backend...")
config: AirflowLineageConfig = get_lineage_config()
xlet_list: List[XLets] = get_xlets_from_dag(dag)
metadata = OpenMetadata(config.metadata_config)
xlet_list: List[XLets] = get_xlets_from_dag(context["dag"])
runner = AirflowLineageRunner(
metadata=metadata,
service_name=config.airflow_service_name,
dag=context["dag"],
dag=dag,
xlets=xlet_list,
only_keep_dag_lineage=config.only_keep_dag_lineage,
max_status=config.max_status,

View File

@ -47,6 +47,7 @@ from metadata.generated.schema.type.entityLineage import EntitiesEdge, LineageDe
from metadata.generated.schema.type.entityReference import EntityReference
from metadata.ingestion.ometa.ometa_api import OpenMetadata
from metadata.ingestion.source.pipeline.airflow.lineage_parser import XLets
from metadata.utils.constants import ENTITY_REFERENCE_TYPE_MAP
from metadata.utils.helpers import clean_uri, datetime_to_ts
@ -251,37 +252,47 @@ class AirflowLineageRunner:
"""
lineage_details = LineageDetails(
pipeline=EntityReference(id=pipeline.id, type="pipeline")
pipeline=EntityReference(
id=pipeline.id, type=ENTITY_REFERENCE_TYPE_MAP[Pipeline.__name__]
)
)
for from_fqn in xlets.inlets or []:
for from_xlet in xlets.inlets or []:
from_entity: Optional[Table] = self.metadata.get_by_name(
entity=Table, fqn=from_fqn
entity=from_xlet.entity, fqn=from_xlet.fqn
)
if from_entity:
for to_fqn in xlets.outlets or []:
for to_xlet in xlets.outlets or []:
to_entity: Optional[Table] = self.metadata.get_by_name(
entity=Table, fqn=to_fqn
entity=to_xlet.entity, fqn=to_xlet.fqn
)
if to_entity:
lineage = AddLineageRequest(
edge=EntitiesEdge(
fromEntity=EntityReference(
id=from_entity.id, type="table"
id=from_entity.id,
type=ENTITY_REFERENCE_TYPE_MAP[
from_xlet.entity.__name__
],
),
toEntity=EntityReference(
id=to_entity.id,
type=ENTITY_REFERENCE_TYPE_MAP[
to_xlet.entity.__name__
],
),
toEntity=EntityReference(id=to_entity.id, type="table"),
lineageDetails=lineage_details,
)
)
self.metadata.add_lineage(lineage)
else:
self.dag.log.warning(
f"Could not find Table [{to_fqn}] from "
f"Could not find [{to_xlet.entity.__name__}] [{to_xlet.fqn}] from "
f"[{pipeline.fullyQualifiedName.__root__}] outlets"
)
else:
self.dag.log.warning(
f"Could not find Table [{from_fqn}] from "
f"Could not find [{from_xlet.entity.__name__}] [{from_xlet.fqn}] from "
f"[{pipeline.fullyQualifiedName.__root__}] inlets"
)
@ -305,7 +316,8 @@ class AirflowLineageRunner:
for node in lineage_data.get("nodes") or []
if node["id"] == upstream_edge["fromEntity"]
and node["type"] == "table"
)
),
None,
)
for upstream_edge in lineage_data.get("upstreamEdges") or []
]
@ -316,26 +328,37 @@ class AirflowLineageRunner:
for node in lineage_data.get("nodes") or []
if node["id"] == downstream_edge["toEntity"]
and node["type"] == "table"
)
),
None,
)
for downstream_edge in lineage_data.get("downstreamEdges") or []
]
for edge in upstream_edges:
if edge.fqn not in xlets.inlets:
for edge in upstream_edges or []:
if edge.fqn not in (inlet.fqn for inlet in xlets.inlets):
self.dag.log.info(f"Removing upstream edge with {edge.fqn}")
edge_to_remove = EntitiesEdge(
fromEntity=EntityReference(id=edge.id, type="table"),
toEntity=EntityReference(id=pipeline.id, type="pipeline"),
fromEntity=EntityReference(
id=edge.id, type=ENTITY_REFERENCE_TYPE_MAP[Table.__name__]
),
toEntity=EntityReference(
id=pipeline.id,
type=ENTITY_REFERENCE_TYPE_MAP[Pipeline.__name__],
),
)
self.metadata.delete_lineage_edge(edge=edge_to_remove)
for edge in downstream_edges:
if edge.fqn not in xlets.outlets:
for edge in downstream_edges or []:
if edge.fqn not in (outlet.fqn for outlet in xlets.outlets):
self.dag.log.info(f"Removing downstream edge with {edge.fqn}")
edge_to_remove = EntitiesEdge(
fromEntity=EntityReference(id=pipeline.id, type="pipeline"),
toEntity=EntityReference(id=edge.id, type="table"),
fromEntity=EntityReference(
id=pipeline.id,
type=ENTITY_REFERENCE_TYPE_MAP[Pipeline.__name__],
),
toEntity=EntityReference(
id=edge.id, type=ENTITY_REFERENCE_TYPE_MAP[Table.__name__]
),
)
self.metadata.delete_lineage_edge(edge=edge_to_remove)

View File

@ -62,13 +62,23 @@ we'll join the keys and get [
]
and we'll treat this as independent sets of lineage
"""
import json
import logging
import traceback
from collections import defaultdict
from copy import deepcopy
from enum import Enum
from typing import Dict, List, Optional, Set
from functools import singledispatch
from typing import Any, DefaultDict, Dict, List, Optional, Type
import attr
from pydantic import BaseModel
from metadata.generated.schema.entity.data.table import Table
from metadata.ingestion.ometa.models import T
from metadata.utils.deprecation import deprecated
from metadata.utils.importer import import_from_module
logger = logging.getLogger("airflow.task")
@ -85,41 +95,207 @@ class XLetsAttr(Enum):
PRIVATE_OUTLETS = "_outlets"
@attr.s(auto_attribs=True, kw_only=True)
class OMEntity:
"""
Identifies one entity in OpenMetadata.
We use attr annotated object similar to https://github.com/apache/airflow/blob/main/airflow/lineage/entities.py
based on https://airflow.apache.org/docs/apache-airflow/stable/administration-and-deployment/lineage.html
"""
# Entity Type, such as Table, Container or Dashboard.
entity: Type[T] = attr.ib()
# Entity Fully Qualified Name, e.g., service.database.schema.table
fqn: str = attr.ib()
# We will use the key in case we need to group different lineages from the same DAG
key: str = "default"
def __str__(self):
"""Custom serialization"""
_dict = deepcopy(self.__dict__)
_dict["entity"] = f"{self.entity.__module__}.{self.entity.__name__}"
return json.dumps(_dict)
def serialize(self) -> str:
"""Custom serialization to be called in airflow internals"""
return str(self)
class XLets(BaseModel):
"""
Group inlets and outlets from all tasks in a DAG
"""
inlets: Set[str]
outlets: Set[str]
inlets: List[OMEntity]
outlets: List[OMEntity]
class Config:
arbitrary_types_allowed = True
def parse_xlets(xlet: List[dict]) -> Optional[Dict[str, List[str]]]:
def concat_dict_values(
d1: DefaultDict[str, List[Any]], d2: Optional[Dict[str, List[Any]]]
) -> DefaultDict[str, List[Any]]:
"""
Update d1 with the values of d2, concatenating the lists for each key.
"""
if d2:
for key, value in d2.items():
d1[key] = d1[key] + value
return d1
def parse_xlets(xlet: List[Any]) -> Optional[Dict[str, List[OMEntity]]]:
"""
Parse airflow xlets for V1
:param xlet: airflow v2 xlet dict
:return: dictionary of xlet list or None
[{'__var': {'tables': ['sample_data.ecommerce_db.shopify.fact_order']},
'__type': 'dict'}]
If our operators are like
```
BashOperator(
task_id="print_date",
bash_command="date",
inlets={"tables": ["A"]},
)
```
the inlets/outlets will still be processed in airflow as a `List`.
Note that when picking them up from Serialized DAGs, the shape is:
```
[{'__var': {'tables': ['sample_data.ecommerce_db.shopify.fact_order']}, '__type': 'dict'}]
```
If using Datasets, we get something like:
```
[Dataset(uri='s3://dataset-bucket/input.csv', extra=None)]
```
We need to figure out how we want to handle information coming in this format.
"""
# This branch is for lineage parser op
if isinstance(xlet, list) and len(xlet) and isinstance(xlet[0], dict):
xlet_dict = xlet[0]
# This is how the Serialized DAG is giving us the info from _inlets & _outlets
if isinstance(xlet_dict, dict) and xlet_dict.get("__var"):
xlet_dict = xlet_dict["__var"]
return {
key: value for key, value in xlet_dict.items() if isinstance(value, list)
}
if isinstance(xlet, list) and len(xlet):
_parsed_xlets = defaultdict(list)
for element in xlet:
parsed_element = _parse_xlets(element) or {}
# Update our xlet dict based on each parsed element
# Since we can get a list of elements, concatenate the results from multiple xlets
_parsed_xlets = concat_dict_values(_parsed_xlets, parsed_element)
return _parsed_xlets
return None
@singledispatch
def _parse_xlets(xlet: Any) -> None:
"""
Default behavior to handle lineage.
We can use this function to register further inlets/outlets
representations, e.g., https://github.com/open-metadata/OpenMetadata/issues/11626
"""
logger.warning(f"Inlet/Outlet type {type(xlet)} is not supported.")
@_parse_xlets.register
@deprecated(
message="Please update your inlets/outlets to follow <TODO DOCS>",
release="1.4.0",
)
def dictionary_lineage_annotation(xlet: dict) -> Dict[str, List[OMEntity]]:
"""
Handle OM specific inlet/outlet information. E.g.,
```
BashOperator(
task_id="print_date",
bash_command="date",
inlets={
"tables": ["A", "A"],
"more_tables": ["X", "Y"],
"this is a bit random": "foo",
},
)
```
"""
xlet_dict = xlet
# This is how the Serialized DAG is giving us the info from _inlets & _outlets
if isinstance(xlet_dict, dict) and xlet_dict.get("__var"):
xlet_dict = xlet_dict["__var"]
return {
key: [
# We will convert the old dict lineage method into Tables
OMEntity(entity=Table, fqn=fqn)
for fqn in set(value) # Remove duplicates
]
for key, value in xlet_dict.items()
if isinstance(value, list)
}
@_parse_xlets.register
def _(xlet: OMEntity) -> Optional[Dict[str, List[OMEntity]]]:
"""
Handle OM specific inlet/outlet information. E.g.,
```
BashOperator(
task_id="sleep",
bash_command=SLEEP,
outlets=[OMEntity(entity=Table, fqn="B")],
)
```
"""
return {xlet.key: [xlet]}
@_parse_xlets.register
def _(xlet: str) -> Optional[Dict[str, List[OMEntity]]]:
"""
Handle OM specific inlet/outlet information. E.g.,
```
BashOperator(
task_id="sleep",
bash_command=SLEEP,
outlets=[OMEntity(entity=Table, fqn="B")],
)
```
Once a DAG is serialized, the xlet info will be stored as:
```
['{"entity": "metadata.generated.schema.entity.data.table.Table", "fqn": "FQN", "key": "test"}']
```
based on our custom serialization logic.
In this method, we need to revert this back to the actual instance of OMEntity.
Note that we need to properly validate that the string is following the constraints of:
- Being a JSON representation
- Following the structure of an OMEntity
Otherwise, it could be any other attr-based xlet native to Airflow.
"""
try:
body = json.loads(xlet)
om_entity = OMEntity(
entity=import_from_module(body.get("entity")),
fqn=body.get("fqn"),
key=body.get("key"),
)
return {om_entity.key: [om_entity]}
except Exception as exc:
logger.error(
f"We could not parse the inlet/outlet information from [{xlet}] due to [{exc}]"
)
return None
def get_xlets_from_operator(
operator: "BaseOperator", xlet_mode: XLetsMode
) -> Optional[Dict[str, List[str]]]:
) -> Optional[Dict[str, List[OMEntity]]]:
"""
Given an Airflow DAG Task, obtain the tables
set in inlets or outlets.
@ -166,25 +342,26 @@ def get_xlets_from_dag(dag: "DAG") -> List[XLets]:
Fill the inlets and outlets of the Pipeline by iterating
over all its tasks
"""
_inlets = {}
_outlets = {}
_inlets = defaultdict(list)
_outlets = defaultdict(list)
# First, grab all the inlets and outlets from all tasks grouped by keys
for task in dag.tasks:
try:
_inlets.update(
_inlets = concat_dict_values(
_inlets,
get_xlets_from_operator(
operator=task,
xlet_mode=XLetsMode.INLETS,
)
or []
),
)
_outlets.update(
_outlets = concat_dict_values(
_outlets,
get_xlets_from_operator(
operator=task,
xlet_mode=XLetsMode.OUTLETS,
)
or []
),
)
except Exception as exc:
@ -197,7 +374,7 @@ def get_xlets_from_dag(dag: "DAG") -> List[XLets]:
# We expect to have the same keys in both inlets and outlets dicts
# We will then iterate over the inlet keys to build the list of XLets
return [
XLets(inlets=set(value), outlets=set(_outlets[key]))
XLets(inlets=value, outlets=_outlets[key])
for key, value in _inlets.items()
if value and _outlets.get(key)
]

View File

@ -30,7 +30,6 @@ from metadata.generated.schema.entity.data.pipeline import (
Task,
TaskStatus,
)
from metadata.generated.schema.entity.data.table import Table
from metadata.generated.schema.entity.services.connections.pipeline.airflowConnection import (
AirflowConnection,
)
@ -45,7 +44,10 @@ from metadata.ingestion.api.steps import InvalidSourceException
from metadata.ingestion.connections.session import create_and_bind_session
from metadata.ingestion.models.pipeline_status import OMetaPipelineStatus
from metadata.ingestion.ometa.ometa_api import OpenMetadata
from metadata.ingestion.source.pipeline.airflow.lineage_parser import get_xlets_from_dag
from metadata.ingestion.source.pipeline.airflow.lineage_parser import (
XLets,
get_xlets_from_dag,
)
from metadata.ingestion.source.pipeline.airflow.models import (
AirflowDag,
AirflowDagDetails,
@ -53,6 +55,7 @@ from metadata.ingestion.source.pipeline.airflow.models import (
from metadata.ingestion.source.pipeline.airflow.utils import get_schedule_interval
from metadata.ingestion.source.pipeline.pipeline_service import PipelineServiceSource
from metadata.utils import fqn
from metadata.utils.constants import ENTITY_REFERENCE_TYPE_MAP
from metadata.utils.helpers import clean_uri, datetime_to_ts
from metadata.utils.logger import ingestion_logger
@ -264,7 +267,7 @@ class AirflowSource(PipelineServiceSource):
SerializedDagModel.dag_id,
json_data_column,
SerializedDagModel.fileloc,
).all():
).yield_per(100):
try:
data = serialized_dag[1]["dag"]
dag = AirflowDagDetails(
@ -429,25 +432,40 @@ class AirflowSource(PipelineServiceSource):
return
lineage_details = LineageDetails(
pipeline=EntityReference(id=pipeline_entity.id.__root__, type="pipeline"),
pipeline=EntityReference(
id=pipeline_entity.id.__root__,
type=ENTITY_REFERENCE_TYPE_MAP[Pipeline.__name__],
),
source=LineageSource.PipelineLineage,
)
xlets = get_xlets_from_dag(dag=pipeline_details) if pipeline_details else []
xlets: List[XLets] = (
get_xlets_from_dag(dag=pipeline_details) if pipeline_details else []
)
for xlet in xlets:
for from_fqn in xlet.inlets or []:
from_entity = self.metadata.get_by_name(entity=Table, fqn=from_fqn)
for from_xlet in xlet.inlets or []:
from_entity = self.metadata.get_by_name(
entity=from_xlet.entity, fqn=from_xlet.fqn
)
if from_entity:
for to_fqn in xlet.outlets or []:
to_entity = self.metadata.get_by_name(entity=Table, fqn=to_fqn)
for to_xlet in xlet.outlets or []:
to_entity = self.metadata.get_by_name(
entity=to_xlet.entity, fqn=to_xlet.fqn
)
if to_entity:
lineage = AddLineageRequest(
edge=EntitiesEdge(
fromEntity=EntityReference(
id=from_entity.id, type="table"
id=from_entity.id,
type=ENTITY_REFERENCE_TYPE_MAP[
from_xlet.entity.__name__
],
),
toEntity=EntityReference(
id=to_entity.id, type="table"
id=to_entity.id,
type=ENTITY_REFERENCE_TYPE_MAP[
to_xlet.entity.__name__
],
),
lineageDetails=lineage_details,
)
@ -455,12 +473,12 @@ class AirflowSource(PipelineServiceSource):
yield Either(right=lineage)
else:
logger.warning(
f"Could not find Table [{to_fqn}] from "
f"Could not find [{to_xlet.entity.__name__}] [{to_xlet.fqn}] from "
f"[{pipeline_entity.fullyQualifiedName.__root__}] outlets"
)
else:
logger.warning(
f"Could not find Table [{from_fqn}] from "
f"Could not find [{from_xlet.entity.__name__}] [{from_xlet.fqn}] from "
f"[{pipeline_entity.fullyQualifiedName.__root__}] inlets"
)

View File

@ -12,6 +12,26 @@
"""
Define constants useful for the metadata ingestion
"""
from metadata.generated.schema.entity.data.chart import Chart
from metadata.generated.schema.entity.data.container import Container
from metadata.generated.schema.entity.data.dashboard import Dashboard
from metadata.generated.schema.entity.data.dashboardDataModel import DashboardDataModel
from metadata.generated.schema.entity.data.database import Database
from metadata.generated.schema.entity.data.databaseSchema import DatabaseSchema
from metadata.generated.schema.entity.data.mlmodel import MlModel
from metadata.generated.schema.entity.data.pipeline import Pipeline
from metadata.generated.schema.entity.data.searchIndex import SearchIndex
from metadata.generated.schema.entity.data.storedProcedure import StoredProcedure
from metadata.generated.schema.entity.data.table import Table
from metadata.generated.schema.entity.data.topic import Topic
from metadata.generated.schema.entity.services.dashboardService import DashboardService
from metadata.generated.schema.entity.services.databaseService import DatabaseService
from metadata.generated.schema.entity.services.messagingService import MessagingService
from metadata.generated.schema.entity.services.metadataService import MetadataService
from metadata.generated.schema.entity.services.mlmodelService import MlModelService
from metadata.generated.schema.entity.services.pipelineService import PipelineService
from metadata.generated.schema.entity.services.searchService import SearchService
from metadata.generated.schema.entity.services.storageService import StorageService
DOT = "_DOT_"
TEN_MIN = 10 * 60
@ -47,3 +67,29 @@ AUTHORIZATION_HEADER = "Authorization"
NO_ACCESS_TOKEN = "no_token"
SAMPLE_DATA_DEFAULT_COUNT = 50
# Mainly used for lineage
ENTITY_REFERENCE_TYPE_MAP = {
# Service Entities
DatabaseService.__name__: "databaseService",
MessagingService.__name__: "messagingService",
DashboardService.__name__: "dashboardService",
PipelineService.__name__: "pipelineService",
StorageService.__name__: "storageService",
MlModelService.__name__: "mlmodelService",
MetadataService.__name__: "metadataService",
SearchService.__name__: "searchService",
# Data Asset Entities
Table.__name__: "table",
StoredProcedure.__name__: "storedProcedure",
Database.__name__: "database",
DatabaseSchema.__name__: "databaseSchema",
Dashboard.__name__: "dashboard",
DashboardDataModel.__name__: "dashboardDataModel",
Pipeline.__name__: "pipeline",
Chart.__name__: "chart",
Topic.__name__: "topic",
SearchIndex.__name__: "searchIndex",
MlModel.__name__: "mlmodel",
Container.__name__: "container",
}

View File

@ -187,7 +187,9 @@ def log_ansi_encoded_string(
@singledispatch
def get_log_name(record: Entity) -> Optional[str]:
try:
return f"{type(record).__name__} [{getattr(record, 'name', record.entity.name).__root__}]"
if hasattr(record, "name"):
return f"{type(record).__name__} [{getattr(record, 'name').__root__}]"
return f"{type(record).__name__} [{record.entity.name.__root__}]"
except Exception:
return str(record)

View File

@ -0,0 +1,218 @@
# Copyright 2021 Collate
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Test lineage parser to get inlets and outlets information
"""
from datetime import datetime
from unittest import TestCase
from unittest.mock import patch
from airflow import DAG
from airflow.operators.bash import BashOperator
from airflow_provider_openmetadata.lineage.runner import AirflowLineageRunner
from metadata.generated.schema.api.data.createDatabase import CreateDatabaseRequest
from metadata.generated.schema.api.data.createDatabaseSchema import (
CreateDatabaseSchemaRequest,
)
from metadata.generated.schema.api.data.createTable import CreateTableRequest
from metadata.generated.schema.api.services.createDatabaseService import (
CreateDatabaseServiceRequest,
)
from metadata.generated.schema.entity.data.table import Column, DataType, Table
from metadata.generated.schema.entity.services.connections.database.common.basicAuth import (
BasicAuth,
)
from metadata.generated.schema.entity.services.connections.database.mysqlConnection import (
MysqlConnection,
)
from metadata.generated.schema.entity.services.connections.metadata.openMetadataConnection import (
OpenMetadataConnection,
)
from metadata.generated.schema.entity.services.databaseService import (
DatabaseConnection,
DatabaseService,
DatabaseServiceType,
)
from metadata.generated.schema.entity.services.pipelineService import PipelineService
from metadata.generated.schema.security.client.openMetadataJWTClientConfig import (
OpenMetadataJWTClientConfig,
)
from metadata.ingestion.ometa.ometa_api import OpenMetadata
from metadata.ingestion.source.pipeline.airflow.lineage_parser import (
OMEntity,
get_xlets_from_dag,
)
SLEEP = "sleep 1"
PIPELINE_SERVICE_NAME = "test-lineage-runner"
DB_SERVICE_NAME = "test-service-lineage-runner"
OM_JWT = "eyJraWQiOiJHYjM4OWEtOWY3Ni1nZGpzLWE5MmotMDI0MmJrOTQzNTYiLCJ0eXAiOiJKV1QiLCJhbGciOiJSUzI1NiJ9.eyJzdWIiOiJhZG1pbiIsImlzQm90IjpmYWxzZSwiaXNzIjoib3Blbi1tZXRhZGF0YS5vcmciLCJpYXQiOjE2NjM5Mzg0NjIsImVtYWlsIjoiYWRtaW5Ab3Blbm1ldGFkYXRhLm9yZyJ9.tS8um_5DKu7HgzGBzS1VTA5uUjKWOCU0B_j08WXBiEC0mr0zNREkqVfwFDD-d24HlNEbrqioLsBuFRiwIWKc1m_ZlVQbG7P36RUxhuv2vbSp80FKyNM-Tj93FDzq91jsyNmsQhyNv_fNr3TXfzzSPjHt8Go0FMMP66weoKMgW2PbXlhVKwEuXUHyakLLzewm9UMeQaEiRzhiTMU3UkLXcKbYEJJvfNFcLwSl9W8JCO_l0Yj3ud-qt_nQYEZwqW6u5nfdQllN133iikV4fM5QZsMCnm8Rq1mvLR0y9bmJiD7fwM1tmJ791TUWqmKaTnP49U493VanKpUAfzIiOiIbhg"
class TestAirflowLineageRunner(TestCase):
"""
Validate AirflowLineageRunner
"""
server_config = OpenMetadataConnection(
hostPort="http://localhost:8585/api",
authProvider="openmetadata",
securityConfig=OpenMetadataJWTClientConfig(jwtToken=OM_JWT),
)
metadata = OpenMetadata(server_config)
assert metadata.health_check()
service = CreateDatabaseServiceRequest(
name=DB_SERVICE_NAME,
serviceType=DatabaseServiceType.Mysql,
connection=DatabaseConnection(
config=MysqlConnection(
username="username",
authType=BasicAuth(password="password"),
hostPort="http://localhost:1234",
)
),
)
service_type = "databaseService"
@classmethod
def setUpClass(cls) -> None:
"""
Prepare ingredients: Table Entity + DAG
"""
service_entity = cls.metadata.create_or_update(data=cls.service)
create_db = CreateDatabaseRequest(
name="test-db",
service=service_entity.fullyQualifiedName,
)
create_db_entity = cls.metadata.create_or_update(data=create_db)
create_schema = CreateDatabaseSchemaRequest(
name="test-schema",
database=create_db_entity.fullyQualifiedName,
)
create_schema_entity = cls.metadata.create_or_update(data=create_schema)
create_inlet = CreateTableRequest(
name="lineage-test-inlet",
databaseSchema=create_schema_entity.fullyQualifiedName,
columns=[Column(name="id", dataType=DataType.BIGINT)],
)
create_inlet_2 = CreateTableRequest(
name="lineage-test-inlet2",
databaseSchema=create_schema_entity.fullyQualifiedName,
columns=[Column(name="id", dataType=DataType.BIGINT)],
)
create_outlet = CreateTableRequest(
name="lineage-test-outlet",
databaseSchema=create_schema_entity.fullyQualifiedName,
columns=[Column(name="id", dataType=DataType.BIGINT)],
)
cls.table_inlet1: Table = cls.metadata.create_or_update(data=create_inlet)
cls.table_inlet2: Table = cls.metadata.create_or_update(data=create_inlet_2)
cls.table_outlet: Table = cls.metadata.create_or_update(data=create_outlet)
@classmethod
def tearDownClass(cls) -> None:
"""
Clean up
"""
service_id = str(
cls.metadata.get_by_name(
entity=DatabaseService, fqn=DB_SERVICE_NAME
).id.__root__
)
cls.metadata.delete(
entity=DatabaseService,
entity_id=service_id,
recursive=True,
hard_delete=True,
)
# Service ID created from the Airflow Lineage Operator in the
# example DAG
pipeline_service_id = str(
cls.metadata.get_by_name(
entity=PipelineService, fqn=PIPELINE_SERVICE_NAME
).id.__root__
)
cls.metadata.delete(
entity=PipelineService,
entity_id=pipeline_service_id,
recursive=True,
hard_delete=True,
)
def test_lineage_runner(self):
with DAG("test_runner", start_date=datetime(2021, 1, 1)) as dag:
BashOperator(
task_id="print_date",
bash_command="date",
inlets=[
OMEntity(
entity=Table,
fqn="test-service-lineage-runner.test-db.test-schema.lineage-test-inlet",
),
OMEntity(
entity=Table,
fqn="test-service-lineage-runner.test-db.test-schema.lineage-test-inlet2",
),
],
)
BashOperator(
task_id="sleep",
bash_command=SLEEP,
outlets=[
OMEntity(
entity=Table,
fqn="test-service-lineage-runner.test-db.test-schema.lineage-test-outlet",
)
],
)
# skip the statuses since they require getting data from airflow's db
with patch.object(
AirflowLineageRunner, "add_all_pipeline_status", return_value=None
):
runner = AirflowLineageRunner(
metadata=self.metadata,
service_name=PIPELINE_SERVICE_NAME,
dag=dag,
xlets=get_xlets_from_dag(dag),
only_keep_dag_lineage=True,
)
runner.execute()
lineage_data = self.metadata.get_lineage_by_name(
entity=Table,
fqn=self.table_outlet.fullyQualifiedName.__root__,
up_depth=1,
down_depth=1,
)
upstream_ids = [edge["fromEntity"] for edge in lineage_data["upstreamEdges"]]
self.assertIn(str(self.table_inlet1.id.__root__), upstream_ids)
self.assertIn(str(self.table_inlet2.id.__root__), upstream_ids)

View File

@ -12,25 +12,68 @@
Test lineage parser to get inlets and outlets information
"""
from datetime import datetime
from typing import List, Set
from unittest import TestCase
from airflow import DAG
from airflow.operators.bash import BashOperator
from airflow.serialization.serde import serialize
from metadata.generated.schema.entity.data.container import Container
from metadata.generated.schema.entity.data.dashboard import Dashboard
from metadata.generated.schema.entity.data.table import Table
from metadata.ingestion.source.pipeline.airflow.lineage_parser import (
OMEntity,
XLets,
XLetsMode,
_parse_xlets,
get_xlets_from_dag,
get_xlets_from_operator,
parse_xlets,
)
SLEEP = "sleep 1"
def xlet_fqns(xlet: XLets, xlet_mode: XLetsMode) -> Set[str]:
"""Helper method to get a set of FQNs out of the xlet"""
return set(elem.fqn for elem in getattr(xlet, xlet_mode.value))
class TestAirflowLineageParser(TestCase):
"""
Handle airflow lineage parser validations
"""
def assertXLetsEquals(self, first: List[XLets], second: List[XLets]):
"""
Check that both XLet lists are the same
Even if they are lists, we don't care about the order.
Note that we cannot use sets since `OMEntity` is not hashable.
For this test, we will assume that by having the same FQN, the
entity type will also be the same.
"""
self.assertEquals(len(first), len(second))
for xlet1 in first:
match = False
first_inlets = xlet_fqns(xlet1, XLetsMode.INLETS)
first_outlets = xlet_fqns(xlet1, XLetsMode.OUTLETS)
for xlet2 in second:
second_inlets = xlet_fqns(xlet2, XLetsMode.INLETS)
second_outlets = xlet_fqns(xlet2, XLetsMode.OUTLETS)
if first_inlets == second_inlets and first_outlets == second_outlets:
match = True
break
self.assertTrue(match)
def test_parse_xlets(self):
"""
Handle the shape validation of inlets and outlets, e.g.,
@ -40,13 +83,19 @@ class TestAirflowLineageParser(TestCase):
}],
"""
raw_xlet = [{"tables": ["A"], "more_tables": ["X"]}]
self.assertEqual(parse_xlets(raw_xlet), {"tables": ["A"], "more_tables": ["X"]})
self.assertEqual(
parse_xlets(raw_xlet),
{
"tables": [OMEntity(entity=Table, fqn="A")],
"more_tables": [OMEntity(entity=Table, fqn="X")],
},
)
raw_xlet_without_list = [{"tables": ["A"], "more_tables": "random"}]
self.assertEqual(
parse_xlets(raw_xlet_without_list),
{
"tables": ["A"],
"tables": [OMEntity(entity=Table, fqn="A")],
},
)
@ -67,7 +116,7 @@ class TestAirflowLineageParser(TestCase):
# But the outlets are parsed correctly
self.assertEqual(
get_xlets_from_operator(operator, xlet_mode=XLetsMode.OUTLETS),
{"tables": ["A"]},
{"tables": [OMEntity(entity=Table, fqn="A")]},
)
operator = BashOperator(
@ -78,20 +127,21 @@ class TestAirflowLineageParser(TestCase):
self.assertEqual(
get_xlets_from_operator(operator, xlet_mode=XLetsMode.INLETS),
{"tables": ["A"], "more_tables": ["X"]},
{
"tables": [OMEntity(entity=Table, fqn="A")],
"more_tables": [OMEntity(entity=Table, fqn="X")],
},
)
self.assertIsNone(
get_xlets_from_operator(operator, xlet_mode=XLetsMode.OUTLETS)
)
def test_get_xlets_from_dag(self):
def test_get_string_xlets_from_dag(self):
"""
Check that we can properly join the xlet information from
all operators in the DAG
"""
sleep_1 = "sleep 1"
with DAG("test_dag", start_date=datetime(2021, 1, 1)) as dag:
BashOperator(
task_id="print_date",
@ -101,12 +151,18 @@ class TestAirflowLineageParser(TestCase):
BashOperator(
task_id="sleep",
bash_command=sleep_1,
bash_command=SLEEP,
outlets={"tables": ["B"]},
)
self.assertEqual(
get_xlets_from_dag(dag), [XLets(inlets={"A"}, outlets={"B"})]
self.assertXLetsEquals(
get_xlets_from_dag(dag),
[
XLets(
inlets=[OMEntity(entity=Table, fqn="A")],
outlets=[OMEntity(entity=Table, fqn="B")],
)
],
)
with DAG("test_dag", start_date=datetime(2021, 1, 1)) as dag:
@ -118,12 +174,18 @@ class TestAirflowLineageParser(TestCase):
BashOperator(
task_id="sleep",
bash_command=sleep_1,
bash_command=SLEEP,
outlets={"tables": ["B"]},
)
self.assertEqual(
get_xlets_from_dag(dag), [XLets(inlets={"A"}, outlets={"B"})]
self.assertXLetsEquals(
get_xlets_from_dag(dag),
[
XLets(
inlets=[OMEntity(entity=Table, fqn="A")],
outlets=[OMEntity(entity=Table, fqn="B")],
)
],
)
with DAG("test_dag", start_date=datetime(2021, 1, 1)) as dag:
@ -139,18 +201,27 @@ class TestAirflowLineageParser(TestCase):
BashOperator(
task_id="sleep",
bash_command=sleep_1,
bash_command=SLEEP,
outlets={
"tables": ["B"],
"more_tables": ["Z"],
},
)
self.assertEqual(
self.assertXLetsEquals(
get_xlets_from_dag(dag),
[
XLets(inlets={"A"}, outlets={"B"}),
XLets(inlets={"X", "Y"}, outlets={"Z"}),
XLets(
inlets=[OMEntity(entity=Table, fqn="A")],
outlets=[OMEntity(entity=Table, fqn="B")],
),
XLets(
inlets=[
OMEntity(entity=Table, fqn="X"),
OMEntity(entity=Table, fqn="Y"),
],
outlets=[OMEntity(entity=Table, fqn="Z")],
),
],
)
@ -165,15 +236,156 @@ class TestAirflowLineageParser(TestCase):
BashOperator(
task_id="sleep",
bash_command=sleep_1,
bash_command=SLEEP,
outlets={
"tables": ["B"],
},
)
self.assertEqual(
self.assertXLetsEquals(
get_xlets_from_dag(dag),
[
XLets(inlets={"A", "B"}, outlets={"B"}),
XLets(
inlets=[
OMEntity(entity=Table, fqn="A"),
OMEntity(entity=Table, fqn="B"),
],
outlets=[OMEntity(entity=Table, fqn="B")],
),
],
)
def test_get_attrs_xlets_from_dag(self):
"""
Check that we can properly join the xlet information from
all operators in the DAG
"""
with DAG("test_dag", start_date=datetime(2021, 1, 1)) as dag:
BashOperator(
task_id="print_date",
bash_command="date",
inlets=[
OMEntity(entity=Table, fqn="A"),
OMEntity(entity=Table, fqn="B"),
],
)
BashOperator(
task_id="sleep",
bash_command=SLEEP,
outlets=[OMEntity(entity=Table, fqn="C")],
)
BashOperator(
task_id="sleep2",
bash_command=SLEEP,
outlets=[OMEntity(entity=Container, fqn="D")],
)
self.assertXLetsEquals(
get_xlets_from_dag(dag),
[
XLets(
inlets=[
OMEntity(entity=Table, fqn="A"),
OMEntity(entity=Table, fqn="B"),
],
outlets=[
OMEntity(entity=Table, fqn="C"),
OMEntity(entity=Container, fqn="D"),
],
)
],
)
def test_om_entity_serializer(self):
"""To ensure the serialized DAGs will have the right shape"""
om_entity = OMEntity(
entity=Table,
fqn="FQN",
key="test",
)
self.assertEquals(
str(om_entity),
'{"entity": "metadata.generated.schema.entity.data.table.Table", "fqn": "FQN", "key": "test"}',
)
om_entity = OMEntity(
entity=Container,
fqn="FQN",
key="test",
)
self.assertEquals(
str(om_entity),
'{"entity": "metadata.generated.schema.entity.data.container.Container", "fqn": "FQN", "key": "test"}',
)
def test_str_deserializer(self):
"""
Once a DAG is serialized, the xlet info will be stored as:
```
['{"entity": "metadata.generated.schema.entity.data.table.Table", "fqn": "FQN", "key": "test"}']
```
based on our custom serialization logic.
Validate the deserialization process.
"""
self.assertIsNone(_parse_xlets("random"))
self.assertEquals(
_parse_xlets(
'{"entity": "metadata.generated.schema.entity.data.table.Table", "fqn": "FQN", "key": "test"}'
),
{
"test": [
OMEntity(
entity=Table,
fqn="FQN",
key="test",
)
]
},
)
self.assertEquals(
_parse_xlets(
'{"entity": "metadata.generated.schema.entity.data.container.Container", "fqn": "FQN", "key": "test"}'
),
{
"test": [
OMEntity(
entity=Container,
fqn="FQN",
key="test",
)
]
},
)
self.assertEquals(
_parse_xlets(
'{"entity": "metadata.generated.schema.entity.data.dashboard.Dashboard", "fqn": "FQN", "key": "test"}'
),
{
"test": [
OMEntity(
entity=Dashboard,
fqn="FQN",
key="test",
)
]
},
)
def test_airflow_serializer(self):
"""It should be able to serialize our models"""
om_entity = OMEntity(
entity=Table,
fqn="FQN",
key="test",
)
self.assertEquals(
serialize(om_entity).get("__data__"),
'{"entity": "metadata.generated.schema.entity.data.table.Table", "fqn": "FQN", "key": "test"}',
)

View File

@ -9,6 +9,116 @@ Regardless of the Airflow ingestion process you follow ([Workflow](/connectors/p
[Lineage Backend](/connectors/pipeline/airflow/lineage-backend) or [Lineage Operator](/connectors/pipeline/airflow/lineage-operator)),
OpenMetadata will try to extract the lineage information based on the tasks' `inlets` and `outlets`.
What is important to consider here is that when we ingest Airflow lineage, we are actually building a graph:
```
Table A (node) -> DAG (edge) -> Table B (node)
```
Here, tables are nodes and DAGs (Pipelines) are edges. This means that the correct way of setting these
parameters is to specify both `inlets` and `outlets`, so that we have the nodes needed to build
the relationship.
## Configuring Lineage
{% note %}
This lineage configuration method is available for OpenMetadata release 1.2.3 or higher.
{% /note %}
Let's take a look at the following example:
```python
from datetime import timedelta
from airflow import DAG
from airflow.operators.dummy import DummyOperator
from airflow.utils.dates import days_ago
from metadata.generated.schema.entity.data.container import Container
from metadata.generated.schema.entity.data.table import Table
from metadata.ingestion.source.pipeline.airflow.lineage_parser import OMEntity
default_args = {
'owner': 'airflow',
'depends_on_past': False,
'email': ['airflow@example.com'],
'email_on_failure': False,
'email_on_retry': False,
'retries': 1,
'retry_delay': timedelta(seconds=1),
}
with DAG(
"test-lineage",
default_args=default_args,
description="An example DAG which runs a lineage test",
start_date=days_ago(1),
is_paused_upon_creation=False,
catchup=False,
) as dag:
t0 = DummyOperator(
task_id='task0',
inlets=[
OMEntity(entity=Container, fqn="Container A", key="group_A"),
OMEntity(entity=Table, fqn="Table X", key="group_B"),
]
)
t1 = DummyOperator(
task_id='task10',
outlets=[
OMEntity(entity=Table, fqn="Table B", key="group_A"),
OMEntity(entity=Table, fqn="Table Y", key="group_B"),
]
)
t0 >> t1
```
We pass inlets and outlets as a list of `OMEntity` objects, which lets us specify:
1. The type of the asset we are using: Table, Container,... following our SDK
2. The FQN of the asset, which is the unique name of each asset in OpenMetadata, e.g., `serviceName.databaseName.schemaName.tableName`.
3. The key to group the lineage if needed.
This `OMEntity` class is defined following the example of Airflow's internal lineage
[models](https://github.com/apache/airflow/blob/main/airflow/lineage/entities.py).
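When Airflow serializes the DAG, each `OMEntity` is stored as a plain JSON string. The minimal sketch below shows the shape you can expect (it assumes the `openmetadata-ingestion` package is installed; the FQN is illustrative):

```python
from metadata.generated.schema.entity.data.table import Table
from metadata.ingestion.source.pipeline.airflow.lineage_parser import OMEntity

om_entity = OMEntity(entity=Table, fqn="service.db.schema.table_a", key="group_A")

# The custom serializer replaces the entity class with its import path:
# {"entity": "metadata.generated.schema.entity.data.table.Table",
#  "fqn": "service.db.schema.table_a", "key": "group_A"}
print(om_entity)
```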
## Keys
We can define lineage dependencies among different groups of tables. In the example above, we are not building the
lineage from all inlets to all outlets, but rather grouping the tables by key (`group_A` and `group_B`).
This means that after this lineage is processed, the relationship will be:
```
Container A (node) -> DAG (edge) -> Table B (node)
```
and
```
Table X (node) -> DAG (edge) -> Table Y (node)
```
It does not matter in which task of the DAG this inlet/outlet information is specified. During the ingestion process, we
group all these details at the DAG level.
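Conceptually, the ingestion groups the inlets and outlets by key into independent `XLets` sets before building lineage. A minimal sketch of the grouping for the example above (the FQNs are the illustrative names from that example, not real assets):

```python
from metadata.generated.schema.entity.data.container import Container
from metadata.generated.schema.entity.data.table import Table
from metadata.ingestion.source.pipeline.airflow.lineage_parser import OMEntity, XLets

# One XLets entry per key: lineage is built within each group, never across groups.
grouped = [
    XLets(
        inlets=[OMEntity(entity=Container, fqn="Container A", key="group_A")],
        outlets=[OMEntity(entity=Table, fqn="Table B", key="group_A")],
    ),
    XLets(
        inlets=[OMEntity(entity=Table, fqn="Table X", key="group_B")],
        outlets=[OMEntity(entity=Table, fqn="Table Y", key="group_B")],
    ),
]
```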
## Configuring Lineage between Tables
{% note %}
Note that this method only allows lineage between Tables.
We will deprecate it in OpenMetadata 1.4
{% /note %}
Let's take a look at the following example:
```python
@ -33,7 +143,7 @@ default_args = {
with DAG(
"test-multiple-inlet-keys",
default_args=default_args,
description="An example DAG which runs a a task group lineage test",
description="An example DAG which runs a lineage test",
start_date=days_ago(1),
is_paused_upon_creation=False,
catchup=False,
@ -43,25 +153,22 @@ with DAG(
t0 = DummyOperator(
task_id='task0',
inlets={
"tables": ["Table A"],
"more_tables": ["Table X"]
"group_A": ["Table A"],
"group_B": ["Table X"]
}
)
t1 = DummyOperator(
task_id='task10',
outlets={
"tables": ["Table B"],
"more_tables": ["Table Y"]
"group_A": ["Table B"],
"group_B": ["Table Y"]
}
)
t0 >> t1
```
Note how we have two tasks:
- `t0`: Specifying the `inlets`, with keys `group_A` and `group_B`.
- `t1`: Specifying the `outlets` with the same `group_A` and `group_B` keys.
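Internally, this dictionary shape is parsed into `Table` entities grouped by each dictionary key, mirroring the attr-based approach. A small sketch based on the `parse_xlets` helper (the table names are illustrative; the expected result is shown as a comment):

```python
from metadata.generated.schema.entity.data.table import Table
from metadata.ingestion.source.pipeline.airflow.lineage_parser import OMEntity, parse_xlets

parsed = parse_xlets([{"group_A": ["Table A"], "group_B": ["Table X"]}])
# parsed == {
#     "group_A": [OMEntity(entity=Table, fqn="Table A")],
#     "group_B": [OMEntity(entity=Table, fqn="Table X")],
# }
```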
{% note %}
@ -70,32 +177,3 @@ Make sure to add the table Fully Qualified Name (FQN), which is the unique name
This name is composed as `serviceName.databaseName.schemaName.tableName`.
{% /note %}
What it's important to consider here is that when we are ingesting Airflow lineage, we are actually building a graph:
```
Table A (node) -> DAG (edge) -> Table B (node)
```
Where tables are nodes and DAGs (Pipelines) are considered edges. This means that the correct way of setting this
parameters is by making sure that we are informing both `inlets` and `outlets`, so that we have the nodes to build
the relationship.
## Keys
We can inform the lineage dependencies among different groups of tables. In the example above, we are not building the
lineage from all inlets to all outlets, but rather grouping the tables by the dictionary key (`tables` and `more_tables`).
This means that after this lineage is processed, the relationship will be:
```
Table A (node) -> DAG (edge) -> Table B (node)
```
and
```
Table X (node) -> DAG (edge) -> Table Y (node)
```
It does not matter in which task of the DAG these inlet/outlet information is specified. During the ingestion process we
group all these details at the DAG level.

View File

@ -32,20 +32,11 @@ distribution:
pip3 install "openmetadata-ingestion==x.y.z"
```
Where `x.y.z` is the version of your OpenMetadata server, e.g., 0.13.0. It is important that server and client
versions match.
Where `x.y.z` is the version of your OpenMetadata server, e.g., 1.2.2. **It is important that server and client
versions match**.
### Adding Lineage Config
<Note>
If using OpenMetadata version 0.13.0 or lower, the import for the lineage backend is
`airflow_provider_openmetadata.lineage.openmetadata.OpenMetadataLineageBackend`.
For 0.13.1 or higher, the import has been renamed to `airflow_provider_openmetadata.lineage.backend.OpenMetadataLineageBackend`.
</Note>
After the installation, we need to update the Airflow configuration. This can be done following this example on
`airflow.cfg`:
@ -81,11 +72,10 @@ max_status = 10
```
- `only_keep_dag_lineage` will remove any table lineage not present in the inlets or outlets. This will ensure
that any lineage in OpenMetadata comes from your code.
that any lineage in OpenMetadata comes only from your code.
- `max_status` controls the number of statuses to ingest in each run. By default, we'll pick the last 10.
In the following sections, we'll show how to adapt our pipelines to help us build the lineage information.
## Lineage Backend
@ -139,13 +129,13 @@ and downstream for outlets) between the Pipeline and Table Entities.
It is important to get the naming right, as we will fetch the Table Entity by its FQN. If no lineage information is
specified, we will just ingest the Pipeline Entity without further details.
<Note>
{% note %}
While we are showing here how to parse the lineage using the Lineage Backend, the setup of `inlets` and `outlets`
is supported as well through external metadata ingestion from Airflow, be it via the UI, CLI or directly running
an extraction DAG from Airflow itself.
</Note>
{% /note %}
## Example
@ -246,7 +236,7 @@ backend = airflow_provider_openmetadata.lineage.backend.OpenMetadataLineageBacke
airflow_service_name = local_airflow
openmetadata_api_endpoint = http://localhost:8585/api
auth_provider_type = openmetadata
jwt_token = eyJraWQiOiJHYjM4OWEtOWY3Ni1nZGpzLWE5MmotMDI0MmJrOTQzNTYiLCJ0eXAiOiJKV1QiLCJhbGciOiJSUzI1NiJ9.eyJzdWIiOiJhZG1pbiIsImlzQm90IjpmYWxzZSwiaXNzIjoib3Blbi1tZXRhZGF0YS5vcmciLCJpYXQiOjE2NjM5Mzg0NjIsImVtYWlsIjoiYWRtaW5Ab3Blbm1ldGFkYXRhLm9yZyJ9.tS8um_5DKu7HgzGBzS1VTA5uUjKWOCU0B_j08WXBiEC0mr0zNREkqVfwFDD-d24HlNEbrqioLsBuFRiwIWKc1m_ZlVQbG7P36RUxhuv2vbSp80FKyNM-Tj93FDzq91jsyNmsQhyNv_fNr3TXfzzSPjHt8Go0FMMP66weoKMgW2PbXlhVKwEuXUHyakLLzewm9UMeQaEiRzhiTMU3UkLXcKbYEJJvfNFcLwSl9W8JCO_l0Yj3ud-qt_nQYEZwqW6u5nfdQllN133iikV4fM5QZsMCnm8Rq1mvLR0y9bmJiD7fwM1tmJ791TUWqmKaTnP49U493VanKpUAfzIiOiIbhg
jwt_token = ...
```
After running the DAG, you should be able to see the following information in the ingested Pipeline: