Fix #2153 - Lineage date parsing (#2387)

* Check for start_date

* Fix date keys and remove repr

* Add date management tests
This commit is contained in:
Pere Miquel Brull 2022-01-24 21:29:16 +01:00 committed by GitHub
parent 6a8fc9e0fb
commit 05c8d1ebb6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 81 additions and 21 deletions

View File

@ -13,9 +13,8 @@
OpenMetadata Airflow Lineage Backend
"""
import ast
import traceback
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Set, Union
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Set, Tuple, Union
from airflow.configuration import conf
@ -98,13 +97,11 @@ def get_properties(
:return: properties dict
"""
props: Dict[str, str] = {
key: repr(value) for (key, value) in serializer(obj).items()
}
props: Dict[str, str] = {key: value for (key, value) in serializer(obj).items()}
for key in obj.get_serialized_fields():
if key not in props:
props[key] = repr(getattr(obj, key))
props[key] = getattr(obj, key)
return {key: value for (key, value) in props.items() if key in allowed_keys}
@ -136,6 +133,46 @@ def get_xlets(
return None
def iso_dag_start_date(props: Dict[str, Any]) -> Optional[str]:
"""
Given a properties dict, return the start_date
as an iso string if start_date is informed
:param props: properties dict
:return: iso start_date or None
"""
# DAG start date comes as `float`
if props.get("start_date"):
return convert_epoch_to_iso(int(float(props["start_date"])))
return None
def iso_task_start_end_date(
props: Dict[str, Any]
) -> Tuple[Optional[str], Optional[str]]:
"""
Given the attributes of a Task Instance, return
the task start date and task end date as
ISO format
:param props: task instance attributes
:return: task start and end date
"""
task_start_date = (
convert_epoch_to_iso(int(props["start_date"].timestamp()))
if props.get("start_date")
else None
)
task_end_date = (
convert_epoch_to_iso(int(props["end_date"].timestamp()))
if props.get("end_date")
else None
)
return task_start_date, task_end_date
def create_pipeline_entity(
dag_properties: Dict[str, str],
task_properties: Dict[str, str],
@ -161,25 +198,15 @@ def create_pipeline_entity(
f"{pipeline_service_url}/taskinstance/list/"
+ f"?flt1_dag_id_equals={dag.dag_id}&_flt_3_task_id={operator.task_id}"
)
dag_start_date = convert_epoch_to_iso(int(float(dag_properties["start_date"])))
dag_start_date = iso_dag_start_date(dag_properties)
task_start_date, task_end_date = iso_task_start_end_date(task_properties)
downstream_tasks = []
if "_downstream_task_ids" in task_properties:
downstream_tasks = ast.literal_eval(task_properties["_downstream_task_ids"])
if task_properties.get("_downstream_task_ids"):
downstream_tasks = task_properties["_downstream_task_ids"]
operator.log.info(f"downstream tasks {downstream_tasks}")
task_start_date = (
task_properties["start_date"].isoformat()
if "start_time" in task_properties
else None
)
task_end_date = (
task_properties["end_date"].isoformat()
if "end_time" in task_properties
else None
)
task = Task(
name=task_properties["task_id"],
displayName=task_properties.get("label"), # v1.10.15 does not have label

View File

@ -14,7 +14,6 @@ Test airflow lineage backend
"""
from datetime import datetime, timedelta
from textwrap import dedent
from unittest import TestCase
# The DAG object; we'll need this to instantiate a DAG
@ -32,6 +31,10 @@ from airflow_provider_openmetadata.lineage.openmetadata import (
get_properties,
get_xlets,
)
from airflow_provider_openmetadata.lineage.utils import (
iso_dag_start_date,
iso_task_start_end_date,
)
from metadata.generated.schema.api.data.createDatabase import (
CreateDatabaseEntityRequest,
)
@ -169,6 +172,36 @@ class AirflowLineageTest(TestCase):
)
self.assertTrue(set(task3_props.keys()).issubset(ALLOWED_TASK_KEYS))
def test_times(self):
"""
Check the ISO date extraction for DAG and Tasks instances
"""
dag_props = get_properties(
self.dag, SerializedDAG.serialize_dag, ALLOWED_FLOW_KEYS
)
dag_date = iso_dag_start_date(dag_props)
self.assertEqual("2021-01-01T00:00:00Z", dag_date)
# Remove the start_time
dag_props.pop("start_date")
dag_none_date = iso_dag_start_date(dag_props)
self.assertIsNone(dag_none_date)
# By default we'll get the start_date for the task,
# so we can check its value, but the end date
# might not come as in this case.
# Check that we can have those values as None
task1_props = get_properties(
self.dag.get_task("task1"),
SerializedBaseOperator.serialize_operator,
ALLOWED_TASK_KEYS,
)
task_start_date, task_end_date = iso_task_start_end_date(task1_props)
self.assertEqual("2021-01-01T00:00:00Z", task_start_date)
self.assertIsNone(task_end_date)
def test_lineage(self):
"""
Test end to end