mirror of
https://github.com/open-metadata/OpenMetadata.git
synced 2025-07-25 18:30:00 +00:00
* fix(airflow): correctly extract owners from serialized Airflow DAGs Airflow serialization format wraps tasks under `__var` and `__type`. Previously, the OpenMetadata Airflow connector failed to extract task owners properly in this format. This patch: - Flattens `__var` when parsing task owners - Fallbacks to `default_args["owner"]` if no task-level owner is explicitly present - Ensures correct DAG owner is picked as the most common task owner - Handles compatibility with older Airflow versions Fixes: #21106 * test(airflow): add tests for owner extraction from serialized Airflow DAGs Adds new test cases to validate owner extraction logic: - Owners from serialized task format (`__var`) - Fallback to `default_args['owner']` if task owners are missing - Resolution of most common owner - Compatibility with unstructured or missing owners * remove test version specific comment * simplify comments and warnings * fix return statement * fixing formatting * adding handling of default args * fixing and adding more tests
This commit is contained in:
parent
93cef8d868
commit
b0e1a136cf
@ -414,18 +414,41 @@ class AirflowSource(PipelineServiceSource):
|
|||||||
- `owners`: Applied at the tasks. In Airflow's source code, DAG ownership is then a
|
- `owners`: Applied at the tasks. In Airflow's source code, DAG ownership is then a
|
||||||
list joined with the owners of all the tasks.
|
list joined with the owners of all the tasks.
|
||||||
|
|
||||||
We will pick the owner from the tasks that appears in most tasks.
|
We will pick the owner from the tasks that appears in most tasks,
|
||||||
|
or fall back to the default_args owner if available.
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
if self.source_config.includeOwners:
|
if not self.source_config.includeOwners:
|
||||||
task_owners = [
|
return None
|
||||||
task.get("owner")
|
|
||||||
for task in data.get("tasks", [])
|
tasks = data.get("tasks", [])
|
||||||
if task.get("owner") is not None
|
task_owners = []
|
||||||
]
|
|
||||||
if task_owners:
|
# Handle default_args.owner (wrapped or not)
|
||||||
most_common_owner, _ = Counter(task_owners).most_common(1)[0]
|
default_args = data.get("default_args", {})
|
||||||
return most_common_owner
|
if isinstance(default_args, dict) and "__var" in default_args:
|
||||||
|
default_args = default_args["__var"]
|
||||||
|
default_owner = default_args.get("owner")
|
||||||
|
|
||||||
|
for task in tasks:
|
||||||
|
# Flatten serialized task
|
||||||
|
task_data = (
|
||||||
|
task.get("__var")
|
||||||
|
if isinstance(task, dict) and "__var" in task
|
||||||
|
else task
|
||||||
|
)
|
||||||
|
|
||||||
|
owner = task_data.get("owner") or default_owner
|
||||||
|
|
||||||
|
if owner:
|
||||||
|
task_owners.append(owner)
|
||||||
|
|
||||||
|
if task_owners:
|
||||||
|
most_common_owner, _ = Counter(task_owners).most_common(1)[0]
|
||||||
|
return most_common_owner
|
||||||
|
|
||||||
|
return default_owner
|
||||||
|
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
self.status.warning(
|
self.status.warning(
|
||||||
data.get("dag_id"), f"Could not extract owner information due to {exc}"
|
data.get("dag_id"), f"Could not extract owner information due to {exc}"
|
||||||
|
@ -272,3 +272,51 @@ class TestAirflow(TestCase):
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
self.assertEqual(get_schedule_interval(pipeline_data), "*/2 * * * *")
|
self.assertEqual(get_schedule_interval(pipeline_data), "*/2 * * * *")
|
||||||
|
|
||||||
|
def test_get_dag_owners_with_serialized_tasks(self):
|
||||||
|
# Case 1: All tasks have no explicit owner → fallback to default_args
|
||||||
|
data = {
|
||||||
|
"default_args": {"__var": {"owner": "default_owner"}},
|
||||||
|
"tasks": [
|
||||||
|
{"__var": {"task_id": "t1"}, "__type": "EmptyOperator"},
|
||||||
|
{"__var": {"task_id": "t2"}, "__type": "EmptyOperator"},
|
||||||
|
],
|
||||||
|
}
|
||||||
|
self.assertEqual("default_owner", self.airflow.fetch_dag_owners(data))
|
||||||
|
|
||||||
|
# Case 2: One task explicitly overrides the owner → tie between two owners
|
||||||
|
data = {
|
||||||
|
"default_args": {"__var": {"owner": "default_owner"}},
|
||||||
|
"tasks": [
|
||||||
|
{
|
||||||
|
"__var": {"task_id": "t1"},
|
||||||
|
"__type": "EmptyOperator",
|
||||||
|
}, # uses default_owner
|
||||||
|
{
|
||||||
|
"__var": {"task_id": "t2", "owner": "overridden_owner"},
|
||||||
|
"__type": "EmptyOperator",
|
||||||
|
},
|
||||||
|
],
|
||||||
|
}
|
||||||
|
result = self.airflow.fetch_dag_owners(data)
|
||||||
|
self.assertIn(result, {"default_owner", "overridden_owner"})
|
||||||
|
|
||||||
|
# Case 3: One owner is majority -> must return that owner
|
||||||
|
data = {
|
||||||
|
"default_args": {"__var": {"owner": "default_owner"}},
|
||||||
|
"tasks": [
|
||||||
|
{
|
||||||
|
"__var": {"task_id": "t1", "owner": "overridden_owner"},
|
||||||
|
"__type": "EmptyOperator",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"__var": {"task_id": "t2", "owner": "overridden_owner"},
|
||||||
|
"__type": "EmptyOperator",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"__var": {"task_id": "t3", "owner": "another_owner"},
|
||||||
|
"__type": "EmptyOperator",
|
||||||
|
},
|
||||||
|
],
|
||||||
|
}
|
||||||
|
self.assertEqual("overridden_owner", self.airflow.fetch_dag_owners(data))
|
||||||
|
Loading…
x
Reference in New Issue
Block a user