feat(airflow): add example docker setup for airflow (#3176)
168
docker/airflow/docker-compose.yaml
Normal file
@@ -0,0 +1,168 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
#

# Basic Airflow cluster configuration for CeleryExecutor with Redis and PostgreSQL.
#
# WARNING: This configuration is for local development. Do not use it in a production deployment.
#
# This configuration supports basic configuration using environment variables or an .env file
# The following variables are supported:
#
# AIRFLOW_IMAGE_NAME           - Docker image name used to run Airflow.
#                                Default: apache/airflow:master-python3.8
# AIRFLOW_UID                  - User ID in Airflow containers
#                                Default: 50000
# AIRFLOW_GID                  - Group ID in Airflow containers
#                                Default: 50000
#
# Those configurations are useful mostly in case of standalone testing/running Airflow in test/try-out mode
#
# _AIRFLOW_WWW_USER_USERNAME   - Username for the administrator account (if requested).
#                                Default: airflow
# _AIRFLOW_WWW_USER_PASSWORD   - Password for the administrator account (if requested).
#                                Default: airflow
# _PIP_ADDITIONAL_REQUIREMENTS - Additional PIP requirements to add when starting all containers.
#                                Default: ''
#
# Feel free to modify this file to suit your needs.
---
version: '3'
x-airflow-common:
  &airflow-common
  image: ${AIRFLOW_IMAGE_NAME:-acryldata/airflow-datahub:latest}
  environment:
    &airflow-common-env
    AIRFLOW__CORE__EXECUTOR: CeleryExecutor
    AIRFLOW__CORE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow
    AIRFLOW__CELERY__RESULT_BACKEND: db+postgresql://airflow:airflow@postgres/airflow
    AIRFLOW__CELERY__BROKER_URL: redis://:@redis:6379/0
    AIRFLOW__CORE__FERNET_KEY: ''
    AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'true'
    AIRFLOW__CORE__LOAD_EXAMPLES: 'true'
    AIRFLOW__API__AUTH_BACKEND: 'airflow.api.auth.backend.basic_auth'
    AIRFLOW__LINEAGE__BACKEND: 'datahub_provider.lineage.datahub.DatahubLineageBackend'
    AIRFLOW__LINEAGE__DATAHUB_KWARGS: '{ "datahub_conn_id": "datahub_rest_default", "capture_ownership_info": true, "capture_tags_info": true, "graceful_exceptions": false }'
    _PIP_ADDITIONAL_REQUIREMENTS: ${_PIP_ADDITIONAL_REQUIREMENTS:-}
  volumes:
    - ./dags:/opt/airflow/dags
    - ./logs:/opt/airflow/logs
    - ./plugins:/opt/airflow/plugins
  user: "${AIRFLOW_UID:-50000}:${AIRFLOW_GID:-50000}"
  depends_on:
    redis:
      condition: service_healthy
    postgres:
      condition: service_healthy
  networks:
    - datahub_network

services:
  postgres:
    image: postgres:13
    hostname: postgres
    environment:
      POSTGRES_USER: airflow
      POSTGRES_PASSWORD: airflow
      POSTGRES_DB: airflow
    volumes:
      - postgres-db-volume:/var/lib/postgresql/data
    networks:
      - datahub_network
    healthcheck:
      test: ["CMD", "pg_isready", "-U", "airflow"]
      interval: 5s
      retries: 5
    restart: always

  redis:
    image: redis:latest
    hostname: redis
    ports:
      - 6379:6379
    networks:
      - datahub_network
    healthcheck:
      test: ["CMD", "redis-cli", "ping"]
      interval: 5s
      timeout: 30s
      retries: 50
    restart: always

  airflow-webserver:
    <<: *airflow-common
    command: webserver
    ports:
      - 58080:8080
    healthcheck:
      # The healthcheck runs inside the container, where the webserver still listens on 8080.
      test: ["CMD", "curl", "--fail", "http://localhost:8080/health"]
      interval: 10s
      timeout: 10s
      retries: 5
    restart: always

  airflow-scheduler:
    <<: *airflow-common
    command: scheduler
    healthcheck:
      test: ["CMD-SHELL", 'airflow jobs check --job-type SchedulerJob --hostname "$${HOSTNAME}"']
      interval: 10s
      timeout: 10s
      retries: 5
    restart: always

  airflow-worker:
    <<: *airflow-common
    command: celery worker
    healthcheck:
      test:
        - "CMD-SHELL"
        - 'celery --app airflow.executors.celery_executor.app inspect ping -d "celery@$${HOSTNAME}"'
      interval: 10s
      timeout: 10s
      retries: 5
    restart: always

  airflow-init:
    <<: *airflow-common
    command: version
    environment:
      <<: *airflow-common-env
      _AIRFLOW_DB_UPGRADE: 'true'
      _AIRFLOW_WWW_USER_CREATE: 'true'
      _AIRFLOW_WWW_USER_USERNAME: ${_AIRFLOW_WWW_USER_USERNAME:-airflow}
      _AIRFLOW_WWW_USER_PASSWORD: ${_AIRFLOW_WWW_USER_PASSWORD:-airflow}

  flower:
    <<: *airflow-common
    command: celery flower
    ports:
      - 5555:5555
    healthcheck:
      test: ["CMD", "curl", "--fail", "http://localhost:5555/"]
      interval: 10s
      timeout: 10s
      retries: 5
    restart: always

volumes:
  postgres-db-volume:

networks:
  datahub_network:
    external: true
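
The variables documented in the header of this file can be supplied via an `.env` file placed next to `docker-compose.yaml`. A minimal sketch, using the defaults shown above as example values (none of these are required):

```
# Optional overrides for docker-compose.yaml (example values, all optional)
AIRFLOW_IMAGE_NAME=acryldata/airflow-datahub:latest
AIRFLOW_UID=50000
AIRFLOW_GID=50000
_AIRFLOW_WWW_USER_USERNAME=airflow
_AIRFLOW_WWW_USER_PASSWORD=airflow
_PIP_ADDITIONAL_REQUIREMENTS=
```
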
158
docker/airflow/local_airflow.md
Normal file
@@ -0,0 +1,158 @@
# Running Airflow locally with DataHub

## Introduction

This document describes how you can run Airflow side-by-side with DataHub's docker images to test out Airflow lineage with DataHub.
This is a much easier way to try out Airflow with DataHub than configuring the containers by hand and setting up configuration and network connectivity between the two systems yourself.

## Prerequisites

- Docker: ensure that you have a working Docker installation and at least 8GB of memory to allocate to Airflow and DataHub combined.

```
docker info | grep Memory

> Total Memory: 7.775GiB
```

## Step 1: Set up your Airflow area

- Create a directory to host your Airflow installation.
- Download the docker-compose file hosted in DataHub's repo into that directory.
- Download a sample DAG to use for testing Airflow lineage (a sketch of what such a DAG contains is shown after the commands below).

```
mkdir -p airflow_install
cd airflow_install
# Download docker-compose
curl -L 'https://raw.githubusercontent.com/acryldata/datahub-fork/airflow-local-docker/docker/airflow/docker-compose.yaml?token=AAG5J3NA2ZJRVLS3XB3C3RTBG7BAM' -o docker-compose.yaml
# Create dags directory
mkdir -p dags
# Download a sample DAG
curl -L 'https://raw.githubusercontent.com/linkedin/datahub/master/metadata-ingestion/src/datahub_provider/example_dags/lineage_backend_demo.py' -o dags/lineage_backend_demo.py
```
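
If you want a sense of what a lineage-enabled DAG looks like without opening the downloaded file, here is a minimal sketch in the same spirit. It is not a copy of `lineage_backend_demo.py`; the dataset names and platform are made up for illustration, but the `datahub_provider.entities.Dataset` helper and the `inlets`/`outlets` task arguments are what the lineage backend picks up.

```python
# Illustrative sketch only -- dataset names below are hypothetical.
from datetime import datetime, timedelta

from airflow import DAG
from airflow.operators.bash import BashOperator

# Entity helper shipped with the datahub_provider package pre-installed in the image.
from datahub_provider.entities import Dataset

with DAG(
    "lineage_sketch",
    start_date=datetime(2021, 1, 1),
    schedule_interval=timedelta(days=1),
    catchup=False,
) as dag:
    # The DataHub lineage backend reads the inlets/outlets declared on each task.
    transform = BashOperator(
        task_id="transform",
        bash_command="echo 'pretend to transform data'",
        inlets=[Dataset("postgres", "exampledb.public.raw_table")],     # hypothetical upstream table
        outlets=[Dataset("postgres", "exampledb.public.clean_table")],  # hypothetical downstream table
    )
```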

### What is different between this docker-compose file and the official Apache Airflow docker-compose file?

- This docker-compose file is derived from the [official Airflow docker-compose file](https://airflow.apache.org/docs/apache-airflow/stable/start/docker.html#docker-compose-yaml) but makes a few critical changes to make interoperability with DataHub seamless.
- The Airflow image in this docker-compose file extends the [base Apache Airflow docker image](https://airflow.apache.org/docs/docker-stack/index.html) and is published [here](https://hub.docker.com/r/acryldata/airflow-datahub). It includes the latest `acryl-datahub` pip package installed by default, so you don't need to install it yourself.
- This docker-compose file changes the networking so that:
  - the Airflow containers can talk to the DataHub containers through the `datahub_network` bridge interface.
  - the Airflow webserver's port `8080` is mapped to port `58080` on localhost (to avoid conflicting with DataHub's metadata service, which is mapped to `8080` by default).
- This docker-compose file also sets the environment variables that configure Airflow's Lineage Backend to talk to DataHub. (Look for the `AIRFLOW__LINEAGE__BACKEND` and `AIRFLOW__LINEAGE__DATAHUB_KWARGS` variables; their `airflow.cfg` equivalent is shown below for reference.)
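
As a side note, these two variables follow Airflow's standard `AIRFLOW__<SECTION>__<KEY>` convention, so the equivalent `airflow.cfg` snippet would look roughly like this. This is shown only for reference; the docker-compose file already sets it for you.

```
[lineage]
backend = datahub_provider.lineage.datahub.DatahubLineageBackend
datahub_kwargs = {"datahub_conn_id": "datahub_rest_default", "capture_ownership_info": true, "capture_tags_info": true, "graceful_exceptions": false}
```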

## Step 2: Bring up Airflow

```
docker-compose up
```
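
If you prefer to keep your terminal free, you can also start the stack in the background and check on it. This is plain docker-compose usage, not anything specific to this setup:

```shell
# Start in detached mode
docker-compose up -d

# Check that the containers are up (look for "healthy" in the status column)
docker-compose ps

# Tail logs for a specific service, e.g. the scheduler
docker-compose logs -f airflow-scheduler
```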

You should see a host of messages as Airflow starts up.

```
Container airflow_deploy_airflow-scheduler_1  Started  15.7s
Attaching to airflow-init_1, airflow-scheduler_1, airflow-webserver_1, airflow-worker_1, flower_1, postgres_1, redis_1
airflow-worker_1     | BACKEND=redis
airflow-worker_1     | DB_HOST=redis
airflow-worker_1     | DB_PORT=6379
airflow-worker_1     |
airflow-webserver_1  |
airflow-init_1       | DB: postgresql+psycopg2://airflow:***@postgres/airflow
airflow-init_1       | [2021-08-31 20:02:07,534] {db.py:702} INFO - Creating tables
airflow-init_1       | INFO  [alembic.runtime.migration] Context impl PostgresqlImpl.
airflow-init_1       | INFO  [alembic.runtime.migration] Will assume transactional DDL.
airflow-scheduler_1  |   ____________       _____________
airflow-scheduler_1  |  ____    |__( )_________  __/__  /________      __
airflow-scheduler_1  | ____  /| |_  /__  ___/_  /_ __  /_  __ \_ | /| / /
airflow-scheduler_1  | ___  ___ |  / _  /   _  __/ _  / / /_/ /_ |/ |/ /
airflow-scheduler_1  |  _/_/  |_/_/  /_/    /_/    /_/  \____/____/|__/
airflow-scheduler_1  | [2021-08-31 20:02:07,736] {scheduler_job.py:661} INFO - Starting the scheduler
airflow-scheduler_1  | [2021-08-31 20:02:07,736] {scheduler_job.py:666} INFO - Processing each file at most -1 times
airflow-scheduler_1  | [2021-08-31 20:02:07,915] {manager.py:254} INFO - Launched DagFileProcessorManager with pid: 25
airflow-scheduler_1  | [2021-08-31 20:02:07,918] {scheduler_job.py:1197} INFO - Resetting orphaned tasks for active dag runs
airflow-scheduler_1  | [2021-08-31 20:02:07,923] {settings.py:51} INFO - Configured default timezone Timezone('UTC')
flower_1             |
airflow-worker_1     |  * Serving Flask app "airflow.utils.serve_logs" (lazy loading)
airflow-worker_1     |  * Environment: production
airflow-worker_1     |    WARNING: This is a development server. Do not use it in a production deployment.
airflow-worker_1     |    Use a production WSGI server instead.
airflow-worker_1     |  * Debug mode: off
airflow-worker_1     | [2021-08-31 20:02:09,283] {_internal.py:113} INFO -  * Running on http://0.0.0.0:8793/ (Press CTRL+C to quit)
flower_1             | BACKEND=redis
flower_1             | DB_HOST=redis
flower_1             | DB_PORT=6379
flower_1             |
```

Finally, Airflow should be healthy and up on port 58080.

```
airflow-webserver_1  | 172.22.0.1 - - [31/Aug/2021:20:30:52 +0000] "GET /static/appbuilder/fonts/fontawesome-webfont.woff2?v=4.7.0 HTTP/1.1" 304 0 "http://localhost:58080/static/appbuilder/css/font-awesome.min.css" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36"
airflow-init_1       | Admin user airflow created
airflow-init_1       | 2.1.3
airflow_install_airflow-init_1 exited with code 0
```

Navigate to http://localhost:58080 to confirm that your Airflow webserver is up.
The default username and password are:

```
airflow:airflow
```

## Step 3: Register a DataHub connection (hook) with Airflow

```
docker exec -it `docker ps | grep webserver | cut -d " " -f 1` airflow connections add --conn-type 'datahub_rest' 'datahub_rest_default' --conn-host 'http://datahub-gms:8080'
```

### Result

```
Successfully added `conn_id`=datahub_rest_default : datahub_rest://:@http://datahub-gms:8080:
```
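
To double-check that the connection was stored, you can inspect it with the standard Airflow CLI inside the same container. Output formatting varies a bit across Airflow versions; if `connections get` is not available in yours, `connections list` is enough.

```shell
# List all registered connections
docker exec -it `docker ps | grep webserver | cut -d " " -f 1` airflow connections list

# Or look at just the DataHub connection
docker exec -it `docker ps | grep webserver | cut -d " " -f 1` airflow connections get datahub_rest_default
```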

### What is the above command doing?

- It finds the container running the Airflow webserver: `docker ps | grep webserver | cut -d " " -f 1`.
- It runs the `airflow connections add ...` command inside that container to register a connection of type `datahub_rest`, named `datahub_rest_default`, pointing at the `datahub-gms` host on port 8080.
- Note: this requires Airflow to be able to reach the `datahub-gms` host (the container running the datahub-gms image), which is why we connected the Airflow containers to the `datahub_network` in our custom docker-compose file.

## Step 4: Find the DAG and run it

Navigate the Airflow UI to find the sample DAG we just downloaded.

![Find the DAG](../../docs/imgs/airflow/find_the_dag.png)

By default, Airflow loads all DAGs in a paused state. Unpause the sample DAG to use it.

![Paused DAG](../../docs/imgs/airflow/paused_dag.png)
![Unpaused DAG](../../docs/imgs/airflow/unpaused_dag.png)

Then trigger the DAG to run.

![Trigger the DAG](../../docs/imgs/airflow/trigger_dag.png)
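
If you prefer the command line over the UI, the same unpause-and-trigger flow can be done with the Airflow CLI from inside the webserver container. The DAG id used below is an assumption; run `airflow dags list` first and substitute the id it reports for the sample DAG.

```shell
WEBSERVER=`docker ps | grep webserver | cut -d " " -f 1`

# Find the exact DAG id of the sample DAG
docker exec -it $WEBSERVER airflow dags list

# Unpause and trigger it (replace the id with the one shown by `dags list`)
docker exec -it $WEBSERVER airflow dags unpause datahub_lineage_backend_demo
docker exec -it $WEBSERVER airflow dags trigger datahub_lineage_backend_demo
```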

After the DAG runs successfully, go over to your DataHub instance to see the Pipeline and navigate its lineage.

![DataHub Pipeline View](../../docs/imgs/airflow/datahub_pipeline_view.png)

![DataHub Pipeline Entity](../../docs/imgs/airflow/datahub_pipeline_entity.png)

![DataHub Task View](../../docs/imgs/airflow/datahub_task_view.png)

![DataHub Lineage View](../../docs/imgs/airflow/datahub_lineage_view.png)

## Troubleshooting

Most issues are related to connectivity between Airflow and DataHub. Here is how you can debug them.

![Find the Task Log](../../docs/imgs/airflow/finding_failed_log.png)

![Inspect the Log](../../docs/imgs/airflow/connection_error.png)

In this case, the connection `datahub-rest` has clearly not been registered. Looks like we forgot to register the connection with Airflow!
Let's execute Step 3 to register the datahub connection with Airflow.
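
Before re-running the DAG, a couple of quick checks can confirm that the pieces are wired up. These assume the default container and network names from this guide, and that the GMS container answers on `/health`; any HTTP response at all already shows that the network path from Airflow to `datahub-gms` works.

```shell
WEBSERVER=`docker ps | grep webserver | cut -d " " -f 1`

# Is the DataHub connection registered in Airflow?
docker exec -it $WEBSERVER airflow connections list | grep datahub

# Can the Airflow webserver resolve and reach datahub-gms over the shared network?
docker exec -it $WEBSERVER curl -s -o /dev/null -w "%{http_code}\n" http://datahub-gms:8080/health

# Are the Airflow containers actually attached to the datahub_network?
docker network inspect datahub_network | grep -i airflow
```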

After re-running the DAG, we see success!

![Rerun DAG](../../docs/imgs/airflow/successful_run.png)

@@ -74,7 +74,7 @@ function list_markdown_files(): string[] {
     /^datahub-kubernetes\//,
     /^datahub-web\//,
     /^metadata-ingestion-examples\//,
-    /^docker\/(?!README|datahub-upgrade)/, // Drop all but a few docker docs.
+    /^docker\/(?!README|datahub-upgrade|airflow\/local_airflow)/, // Drop all but a few docker docs.
     /^docs\/rfc\/templates\/000-template\.md$/,
     /^docs\/docker\/README\.md/, // This one is just a pointer to another file.
     /^docs\/README\.md/, // This one is just a pointer to the hosted docs site.

@@ -122,6 +122,7 @@ module.exports = {
       "docs/how/delete-metadata",
       "datahub-web-react/src/app/analytics/README",
       "metadata-ingestion/developing",
+      "docker/airflow/local_airflow",
     ],
     Components: [
       "datahub-web-react/README",

BIN  docs/imgs/airflow/connection_error.png         Normal file  (After: 258 KiB)
BIN  docs/imgs/airflow/datahub_lineage_view.png     Normal file  (After: 72 KiB)
BIN  docs/imgs/airflow/datahub_pipeline_entity.png  Normal file  (After: 89 KiB)
BIN  docs/imgs/airflow/datahub_pipeline_view.png    Normal file  (After: 59 KiB)
BIN  docs/imgs/airflow/datahub_task_view.png        Normal file  (After: 88 KiB)
BIN  (Before: 157 KiB, After: 157 KiB)
BIN  docs/imgs/airflow/find_the_dag.png             Normal file  (After: 17 KiB)
BIN  docs/imgs/airflow/finding_failed_log.png       Normal file  (After: 107 KiB)
BIN  docs/imgs/airflow/paused_dag.png               Normal file  (After: 57 KiB)
BIN  docs/imgs/airflow/successful_run.png           Normal file  (After: 7.5 KiB)
BIN  docs/imgs/airflow/trigger_dag.png              Normal file  (After: 26 KiB)
BIN  docs/imgs/airflow/unpaused_dag.png             Normal file  (After: 57 KiB)
@@ -183,6 +183,12 @@ The Airflow lineage backend is only supported in Airflow 1.10.15+ and 2.0.2+.

:::

### Running on Docker locally

If you are looking to run Airflow and DataHub using docker locally, follow the guide [here](../docker/airflow/local_airflow.md). Otherwise, proceed with the instructions below.

### Setting up Airflow to use DataHub as Lineage Backend

1. You need to install the required dependency in your Airflow environment. See https://registry.astronomer.io/providers/datahub/modules/datahublineagebackend

```shell