OpenMetadata/ingestion/tests/unit/airflow/test_lineage_parser.py

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

418 lines
12 KiB
Python
Raw Normal View History

# Copyright 2021 Collate
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Test lineage parser to get inlets and outlets information
"""
from datetime import datetime
from typing import List, Set
from airflow import DAG
from airflow.operators.bash import BashOperator
from airflow.serialization.serde import serialize
from metadata.generated.schema.entity.data.container import Container
from metadata.generated.schema.entity.data.dashboard import Dashboard
from metadata.generated.schema.entity.data.table import Table
from metadata.ingestion.source.pipeline.airflow.lineage_parser import (
OMEntity,
XLets,
XLetsMode,
_parse_xlets,
get_xlets_from_dag,
get_xlets_from_operator,
parse_xlets,
)
SLEEP = "sleep 1"
def xlet_fqns(xlet: XLets, xlet_mode: XLetsMode) -> Set[str]:
"""Helper method to get a set of FQNs out of the xlet"""
return set(elem.fqn for elem in getattr(xlet, xlet_mode.value))
def assert_xlets_equals(first: List[XLets], second: List[XLets]):
"""
Check that both XLet lists are the same
Even if they are lists, we don't care about the order.
Note that we cannot use sets since `OMEntity` is not hashable.
For this test, we will assume that by having the same FQN, the
entity type will also be the same.
"""
assert len(first) == len(second)
for xlet1 in first:
match = False
first_inlets = xlet_fqns(xlet1, XLetsMode.INLETS)
first_outlets = xlet_fqns(xlet1, XLetsMode.OUTLETS)
for xlet2 in second:
second_inlets = xlet_fqns(xlet2, XLetsMode.INLETS)
second_outlets = xlet_fqns(xlet2, XLetsMode.OUTLETS)
if first_inlets == second_inlets and first_outlets == second_outlets:
match = True
break
assert match
def test_parse_xlets():
"""
Handle the shape validation of inlets and outlets, e.g.,
[{
"tables": ["A"],
"more_tables": ["X"]
}],
"""
raw_xlet = [{"tables": ["A"], "more_tables": ["X"]}]
assert parse_xlets(raw_xlet) == (
{
"tables": [OMEntity(entity=Table, fqn="A")],
"more_tables": [OMEntity(entity=Table, fqn="X")],
}
)
raw_xlet_without_list = [{"tables": ["A"], "more_tables": "random"}]
assert parse_xlets(raw_xlet_without_list) == (
{
"tables": [OMEntity(entity=Table, fqn="A")],
}
)
not_an_xlet_list = {"tables": ["A"]}
assert parse_xlets(not_an_xlet_list) is None
def test_get_xlets_from_operator():
"""
Check how to get xlet data from operators
"""
operator = BashOperator(
task_id="print_date",
bash_command="date",
outlets={"tables": ["A"]},
)
assert get_xlets_from_operator(operator, XLetsMode.INLETS) is None
# But the outlets are parsed correctly
assert get_xlets_from_operator(operator, xlet_mode=XLetsMode.OUTLETS) == (
{"tables": [OMEntity(entity=Table, fqn="A")]}
)
operator = BashOperator(
task_id="print_date",
bash_command="date",
inlets={"tables": ["A"], "more_tables": ["X"]},
)
assert get_xlets_from_operator(operator, xlet_mode=XLetsMode.INLETS) == (
{
"tables": [OMEntity(entity=Table, fqn="A")],
"more_tables": [OMEntity(entity=Table, fqn="X")],
}
)
assert get_xlets_from_operator(operator, xlet_mode=XLetsMode.OUTLETS) is None
def test_get_string_xlets_from_dag():
"""
Check that we can properly join the xlet information from
all operators in the DAG
"""
with DAG("test_dag", start_date=datetime(2021, 1, 1)) as dag:
BashOperator(
task_id="print_date",
bash_command="date",
inlets={"tables": ["A"]},
)
BashOperator(
task_id="sleep",
bash_command=SLEEP,
outlets={"tables": ["B"]},
)
assert_xlets_equals(
get_xlets_from_dag(dag),
[
XLets(
inlets=[OMEntity(entity=Table, fqn="A")],
outlets=[OMEntity(entity=Table, fqn="B")],
)
],
)
with DAG("test_dag", start_date=datetime(2021, 1, 1)) as dag:
BashOperator(
task_id="print_date",
bash_command="date",
inlets={"tables": ["A", "A"], "this is a bit random": "foo"},
)
BashOperator(
task_id="sleep",
bash_command=SLEEP,
outlets={"tables": ["B"]},
)
assert_xlets_equals(
get_xlets_from_dag(dag),
[
XLets(
inlets=[OMEntity(entity=Table, fqn="A")],
outlets=[OMEntity(entity=Table, fqn="B")],
)
],
)
with DAG("test_dag", start_date=datetime(2021, 1, 1)) as dag:
BashOperator(
task_id="print_date",
bash_command="date",
inlets={
"tables": ["A", "A"],
"more_tables": ["X", "Y"],
"this is a bit random": "foo",
},
)
BashOperator(
task_id="sleep",
bash_command=SLEEP,
outlets={
"tables": ["B"],
"more_tables": ["Z"],
},
)
assert_xlets_equals(
get_xlets_from_dag(dag),
[
XLets(
inlets=[OMEntity(entity=Table, fqn="A")],
outlets=[OMEntity(entity=Table, fqn="B")],
),
XLets(
inlets=[
OMEntity(entity=Table, fqn="X"),
OMEntity(entity=Table, fqn="Y"),
],
outlets=[OMEntity(entity=Table, fqn="Z")],
),
],
)
with DAG("test_dag_cycle", start_date=datetime(2021, 1, 1)) as dag:
BashOperator(
task_id="print_date",
bash_command="date",
inlets={
"tables": ["A", "B"],
},
)
BashOperator(
task_id="sleep",
bash_command=SLEEP,
outlets={
"tables": ["B"],
},
)
assert_xlets_equals(
get_xlets_from_dag(dag),
[
XLets(
inlets=[
OMEntity(entity=Table, fqn="A"),
OMEntity(entity=Table, fqn="B"),
],
outlets=[OMEntity(entity=Table, fqn="B")],
),
],
)
def test_get_dict_xlets_from_dag():
"""
We can get the dict-based xlets for any entity without
annotating directly with OMEntity
"""
with DAG("test_dag_cycle", start_date=datetime(2021, 1, 1)) as dag:
BashOperator(
task_id="print_date",
bash_command="date",
inlets=[
{"entity": "container", "fqn": "C1", "key": "test"},
{"entity": "container", "fqn": "C2", "key": "test"},
],
)
BashOperator(
task_id="sleep",
bash_command=SLEEP,
outlets=[
{"entity": "table", "fqn": "T", "key": "test"},
],
)
assert_xlets_equals(
get_xlets_from_dag(dag),
[
XLets(
inlets=[
OMEntity(entity=Container, fqn="C1", key="test"),
OMEntity(entity=Container, fqn="C2", key="test"),
],
outlets=[OMEntity(entity=Table, fqn="T", key="test")],
),
],
)
def test_get_attrs_xlets_from_dag():
"""
Check that we can properly join the xlet information from
all operators in the DAG
"""
with DAG("test_dag", start_date=datetime(2021, 1, 1)) as dag:
BashOperator(
task_id="print_date",
bash_command="date",
inlets=[
OMEntity(entity=Table, fqn="A"),
OMEntity(entity=Table, fqn="B"),
],
)
BashOperator(
task_id="sleep",
bash_command=SLEEP,
outlets=[OMEntity(entity=Table, fqn="C")],
)
BashOperator(
task_id="sleep2",
bash_command=SLEEP,
outlets=[OMEntity(entity=Container, fqn="D")],
)
assert_xlets_equals(
get_xlets_from_dag(dag),
[
XLets(
inlets=[
OMEntity(entity=Table, fqn="A"),
OMEntity(entity=Table, fqn="B"),
],
outlets=[
OMEntity(entity=Table, fqn="C"),
OMEntity(entity=Container, fqn="D"),
],
)
],
)
def test_om_entity_serializer():
"""To ensure the serialized DAGs will have the right shape"""
om_entity = OMEntity(
entity=Table,
fqn="FQN",
key="test",
)
assert str(om_entity) == (
'{"entity": "metadata.generated.schema.entity.data.table.Table", "fqn": "FQN", "key": "test"}'
)
om_entity = OMEntity(
entity=Container,
fqn="FQN",
key="test",
)
assert str(om_entity) == (
'{"entity": "metadata.generated.schema.entity.data.container.Container", "fqn": "FQN", "key": "test"}'
)
def test_str_deserializer():
"""
Once a DAG is serialized, the xlet info will be stored as:
```
['{"entity": "metadata.generated.schema.entity.data.table.Table", "fqn": "FQN", "key": "test"}']
```
based on our custom serialization logic.
Validate the deserialization process.
"""
assert _parse_xlets("random") is None
assert _parse_xlets(
'{"entity": "metadata.generated.schema.entity.data.table.Table", "fqn": "FQN", "key": "test"}'
) == (
{
"test": [
OMEntity(
entity=Table,
fqn="FQN",
key="test",
)
]
}
)
assert _parse_xlets(
'{"entity": "metadata.generated.schema.entity.data.container.Container", "fqn": "FQN", "key": "test"}'
) == (
{
"test": [
OMEntity(
entity=Container,
fqn="FQN",
key="test",
)
]
}
)
assert _parse_xlets(
'{"entity": "metadata.generated.schema.entity.data.dashboard.Dashboard", "fqn": "FQN", "key": "test"}'
) == (
{
"test": [
OMEntity(
entity=Dashboard,
fqn="FQN",
key="test",
)
]
}
)
def test_airflow_serializer():
"""It should be able to serialize our models"""
om_entity = OMEntity(
entity=Table,
fqn="FQN",
key="test",
)
assert serialize(om_entity).get("__data__") == (
'{"entity": "metadata.generated.schema.entity.data.table.Table", "fqn": "FQN", "key": "test"}'
)