feat(airflow): add example docker setup for airflow (#3176)

This commit is contained in:
Dexter Lee 2021-09-01 11:51:14 -07:00 committed by GitHub
parent d340288575
commit f63d9205e6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
17 changed files with 334 additions and 1 deletion

View File

@@ -0,0 +1,168 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
# Basic Airflow cluster configuration for CeleryExecutor with Redis and PostgreSQL.
#
# WARNING: This configuration is for local development. Do not use it in a production deployment.
#
# Basic configuration can be supplied through environment variables or an .env file.
# The following variables are supported:
#
# AIRFLOW_IMAGE_NAME - Docker image name used to run Airflow.
# Default: acryldata/airflow-datahub:latest
# AIRFLOW_UID - User ID in Airflow containers
# Default: 50000
# AIRFLOW_GID - Group ID in Airflow containers
# Default: 50000
#
# The variables below are mostly useful when testing or trying out Airflow standalone.
#
# _AIRFLOW_WWW_USER_USERNAME - Username for the administrator account (if requested).
# Default: airflow
# _AIRFLOW_WWW_USER_PASSWORD - Password for the administrator account (if requested).
# Default: airflow
# _PIP_ADDITIONAL_REQUIREMENTS - Additional PIP requirements to add when starting all containers.
# Default: ''
#
# Feel free to modify this file to suit your needs.
---
version: '3'
x-airflow-common:
  &airflow-common
  image: ${AIRFLOW_IMAGE_NAME:-acryldata/airflow-datahub:latest}
  environment:
    &airflow-common-env
    AIRFLOW__CORE__EXECUTOR: CeleryExecutor
    AIRFLOW__CORE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow
    AIRFLOW__CELERY__RESULT_BACKEND: db+postgresql://airflow:airflow@postgres/airflow
    AIRFLOW__CELERY__BROKER_URL: redis://:@redis:6379/0
    AIRFLOW__CORE__FERNET_KEY: ''
    AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'true'
    AIRFLOW__CORE__LOAD_EXAMPLES: 'true'
    AIRFLOW__API__AUTH_BACKEND: 'airflow.api.auth.backend.basic_auth'
    AIRFLOW__LINEAGE__BACKEND: 'datahub_provider.lineage.datahub.DatahubLineageBackend'
    AIRFLOW__LINEAGE__DATAHUB_KWARGS: '{ "datahub_conn_id": "datahub_rest_default", "capture_ownership_info": true, "capture_tags_info": true, "graceful_exceptions": false }'
    _PIP_ADDITIONAL_REQUIREMENTS: ${_PIP_ADDITIONAL_REQUIREMENTS:-}
  volumes:
    - ./dags:/opt/airflow/dags
    - ./logs:/opt/airflow/logs
    - ./plugins:/opt/airflow/plugins
  user: "${AIRFLOW_UID:-50000}:${AIRFLOW_GID:-50000}"
  depends_on:
    redis:
      condition: service_healthy
    postgres:
      condition: service_healthy
  networks:
    - datahub_network
services:
  postgres:
    image: postgres:13
    hostname: postgres
    environment:
      POSTGRES_USER: airflow
      POSTGRES_PASSWORD: airflow
      POSTGRES_DB: airflow
    volumes:
      - postgres-db-volume:/var/lib/postgresql/data
    networks:
      - datahub_network
    healthcheck:
      test: ["CMD", "pg_isready", "-U", "airflow"]
      interval: 5s
      retries: 5
    restart: always
  redis:
    image: redis:latest
    hostname: redis
    ports:
      - 6379:6379
    networks:
      - datahub_network
    healthcheck:
      test: ["CMD", "redis-cli", "ping"]
      interval: 5s
      timeout: 30s
      retries: 50
    restart: always
  airflow-webserver:
    <<: *airflow-common
    command: webserver
    ports:
      - 58080:8080
    healthcheck:
      # The webserver listens on 8080 inside the container; 58080 is only the host-side port mapping.
      test: ["CMD", "curl", "--fail", "http://localhost:8080/health"]
      interval: 10s
      timeout: 10s
      retries: 5
    restart: always
  airflow-scheduler:
    <<: *airflow-common
    command: scheduler
    healthcheck:
      test: ["CMD-SHELL", 'airflow jobs check --job-type SchedulerJob --hostname "$${HOSTNAME}"']
      interval: 10s
      timeout: 10s
      retries: 5
    restart: always
  airflow-worker:
    <<: *airflow-common
    command: celery worker
    healthcheck:
      test:
        - "CMD-SHELL"
        - 'celery --app airflow.executors.celery_executor.app inspect ping -d "celery@$${HOSTNAME}"'
      interval: 10s
      timeout: 10s
      retries: 5
    restart: always
  airflow-init:
    <<: *airflow-common
    command: version
    environment:
      <<: *airflow-common-env
      _AIRFLOW_DB_UPGRADE: 'true'
      _AIRFLOW_WWW_USER_CREATE: 'true'
      _AIRFLOW_WWW_USER_USERNAME: ${_AIRFLOW_WWW_USER_USERNAME:-airflow}
      _AIRFLOW_WWW_USER_PASSWORD: ${_AIRFLOW_WWW_USER_PASSWORD:-airflow}
  flower:
    <<: *airflow-common
    command: celery flower
    ports:
      - 5555:5555
    healthcheck:
      test: ["CMD", "curl", "--fail", "http://localhost:5555/"]
      interval: 10s
      timeout: 10s
      retries: 5
    restart: always
volumes:
  postgres-db-volume:
networks:
  datahub_network:
    external: true

View File

@@ -0,0 +1,158 @@
# Running Airflow locally with DataHub
## Introduction
This document describes how you can run Airflow side-by-side with DataHub's docker images to test out Airflow lineage with DataHub.
This is a much easier way to try out Airflow with DataHub than configuring the containers by hand and setting up configuration and network connectivity between the two systems yourself.
## Prerequisites
- Docker: ensure that you have a working Docker installation and you have at least 8GB of memory to allocate to both Airflow and DataHub combined.
```
docker info | grep Memory
> Total Memory: 7.775GiB
```
## Step 1: Set up your Airflow area
- Create a directory to host your Airflow installation
- Download the docker-compose file hosted in DataHub's repo into that directory
- Download a sample dag to use for testing Airflow lineage
```
mkdir -p airflow_install
cd airflow_install
# Download docker-compose
curl -L 'https://raw.githubusercontent.com/acryldata/datahub-fork/airflow-local-docker/docker/airflow/docker-compose.yaml?token=AAG5J3NA2ZJRVLS3XB3C3RTBG7BAM' -o docker-compose.yaml
# Create dags directory
mkdir -p dags
# Download a sample DAG
curl -L 'https://raw.githubusercontent.com/linkedin/datahub/master/metadata-ingestion/src/datahub_provider/example_dags/lineage_backend_demo.py' -o dags/lineage_backend_demo.py
```
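Optionally, you can also create an `.env` file in this directory to override the defaults documented at the top of the docker-compose file. The snippet below is a minimal sketch for Linux hosts, where matching the container user to your host user avoids permission issues on the mounted `dags`, `logs` and `plugins` folders; if the default UID of 50000 works for you, skip this.
```
# Optional: run the Airflow containers as your host user instead of the default UID 50000
echo "AIRFLOW_UID=$(id -u)" > .env
```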
### What is different between this docker-compose file and the official Apache Airflow docker compose file?
- This docker-compose file is derived from the [official Airflow docker-compose file](https://airflow.apache.org/docs/apache-airflow/stable/start/docker.html#docker-compose-yaml) but makes a few critical changes to make interoperability with DataHub seamless.
- The Airflow image in this docker compose file extends the [base Apache Airflow docker image](https://airflow.apache.org/docs/docker-stack/index.html) and is published [here](https://hub.docker.com/r/acryldata/airflow-datahub). It includes the latest `acryl-datahub` pip package installed by default so you don't need to install it yourself.
- This docker-compose file sets up the networking so that the Airflow containers can talk to the DataHub containers through the `datahub_network` bridge interface.
- It modifies the port-forwarding to map the Airflow webserver's container port `8080` to port `58080` on localhost (to avoid conflicts with DataHub's metadata service, which is mapped to port 8080 by default).
- It also sets the environment variables that configure Airflow's lineage backend to talk to DataHub (look for the `AIRFLOW__LINEAGE__BACKEND` and `AIRFLOW__LINEAGE__DATAHUB_KWARGS` variables); a quick way to confirm this is shown right after this list.
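To confirm that the file you downloaded carries these lineage settings, a quick grep (assuming the file name used in Step 1) should show both variables:
```
grep 'AIRFLOW__LINEAGE' docker-compose.yaml
> AIRFLOW__LINEAGE__BACKEND: 'datahub_provider.lineage.datahub.DatahubLineageBackend'
> AIRFLOW__LINEAGE__DATAHUB_KWARGS: '{ "datahub_conn_id": "datahub_rest_default", "capture_ownership_info": true, "capture_tags_info": true, "graceful_exceptions": false }'
```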
## Step 2: Bring up Airflow
```
docker-compose up
```
You should see a host of messages as Airflow starts up.
```
Container airflow_deploy_airflow-scheduler_1 Started 15.7s
Attaching to airflow-init_1, airflow-scheduler_1, airflow-webserver_1, airflow-worker_1, flower_1, postgres_1, redis_1
airflow-worker_1 | BACKEND=redis
airflow-worker_1 | DB_HOST=redis
airflow-worker_1 | DB_PORT=6379
airflow-worker_1 |
airflow-webserver_1 |
airflow-init_1 | DB: postgresql+psycopg2://airflow:***@postgres/airflow
airflow-init_1 | [2021-08-31 20:02:07,534] {db.py:702} INFO - Creating tables
airflow-init_1 | INFO [alembic.runtime.migration] Context impl PostgresqlImpl.
airflow-init_1 | INFO [alembic.runtime.migration] Will assume transactional DDL.
airflow-scheduler_1 | ____________ _____________
airflow-scheduler_1 | ____ |__( )_________ __/__ /________ __
airflow-scheduler_1 | ____ /| |_ /__ ___/_ /_ __ /_ __ \_ | /| / /
airflow-scheduler_1 | ___ ___ | / _ / _ __/ _ / / /_/ /_ |/ |/ /
airflow-scheduler_1 | _/_/ |_/_/ /_/ /_/ /_/ \____/____/|__/
airflow-scheduler_1 | [2021-08-31 20:02:07,736] {scheduler_job.py:661} INFO - Starting the scheduler
airflow-scheduler_1 | [2021-08-31 20:02:07,736] {scheduler_job.py:666} INFO - Processing each file at most -1 times
airflow-scheduler_1 | [2021-08-31 20:02:07,915] {manager.py:254} INFO - Launched DagFileProcessorManager with pid: 25
airflow-scheduler_1 | [2021-08-31 20:02:07,918] {scheduler_job.py:1197} INFO - Resetting orphaned tasks for active dag runs
airflow-scheduler_1 | [2021-08-31 20:02:07,923] {settings.py:51} INFO - Configured default timezone Timezone('UTC')
flower_1 |
airflow-worker_1 | * Serving Flask app "airflow.utils.serve_logs" (lazy loading)
airflow-worker_1 | * Environment: production
airflow-worker_1 | WARNING: This is a development server. Do not use it in a production deployment.
airflow-worker_1 | Use a production WSGI server instead.
airflow-worker_1 | * Debug mode: off
airflow-worker_1 | [2021-08-31 20:02:09,283] {_internal.py:113} INFO - * Running on http://0.0.0.0:8793/ (Press CTRL+C to quit)
flower_1 | BACKEND=redis
flower_1 | DB_HOST=redis
flower_1 | DB_PORT=6379
flower_1 |
```
Finally, Airflow should be healthy and up on port 58080.
```
airflow-webserver_1 | 172.22.0.1 - - [31/Aug/2021:20:30:52 +0000] "GET /static/appbuilder/fonts/fontawesome-webfont.woff2?v=4.7.0 HTTP/1.1" 304 0 "http://localhost:58080/static/appbuilder/css/font-awesome.min.css" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36"
airflow-init_1 | Admin user airflow created
airflow-init_1 | 2.1.3
airflow_install_airflow-init_1 exited with code 0
```
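To confirm that everything is up, you can also list the containers and their health status (plain `docker ps` output, filtered to the Airflow containers):
```
docker ps --filter name=airflow --format 'table {{.Names}}\t{{.Status}}'
```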
Navigate to http://localhost:58080 to find your Airflow webserver.
The default username and password are:
```
airflow:airflow
```
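If you prefer the command line, the webserver's unauthenticated `/health` endpoint should now also respond on the remapped host port (the exact JSON varies slightly between Airflow versions):
```
curl http://localhost:58080/health
> {"metadatabase": {"status": "healthy"}, "scheduler": {"status": "healthy", ...}}
```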
## Step 3: Register the DataHub connection (hook) with Airflow
```
docker exec -it `docker ps | grep webserver | cut -d " " -f 1` airflow connections add --conn-type 'datahub_rest' 'datahub_rest_default' --conn-host 'http://datahub-gms:8080'
```
### Result
```
Successfully added `conn_id`=datahub_rest_default : datahub_rest://:@http://datahub-gms:8080:
```
### What is the above command doing?
- It finds the container running the Airflow webserver: `docker ps | grep webserver | cut -d " " -f 1`
- It runs the `airflow connections add ...` command inside that container to register a connection of type `datahub_rest` that points at the `datahub-gms` host on port 8080 (you can verify the result just below).
- Note: this requires Airflow to be able to reach the `datahub-gms` host (the container running the datahub-gms image), which is why we connected the Airflow containers to the `datahub_network` using our custom docker-compose file.
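To double-check that the connection was stored, you can list the registered connections the same way, using the standard `airflow connections list` sub-command:
```
docker exec -it `docker ps | grep webserver | cut -d " " -f 1` airflow connections list
```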
## Step 4: Find the DAG and run it
Navigate to the Airflow UI to find the sample DAG we just downloaded.
![Find the DAG](../../docs/imgs/airflow/find_the_dag.png)
By default, Airflow loads all DAGs in a paused state. Unpause the sample DAG to use it.
![Paused DAG](../../docs/imgs/airflow/paused_dag.png)
![Unpaused DAG](../../docs/imgs/airflow/unpaused_dag.png)
Then trigger the DAG to run.
![Trigger the DAG](../../docs/imgs/airflow/trigger_dag.png)
After the DAG runs successfully, go over to your DataHub instance to see the Pipeline and navigate its lineage.
![DataHub Pipeline View](../../docs/imgs/airflow/datahub_pipeline_view.png)
![DataHub Pipeline Entity](../../docs/imgs/airflow/datahub_pipeline_entity.png)
![DataHub Task View](../../docs/imgs/airflow/datahub_task_view.png)
![DataHub Lineage View](../../docs/imgs/airflow/datahub_lineage_view.png)
## Troubleshooting
Most issues are related to connectivity between Airflow and DataHub.
Here is how you can debug them.
![Find the Task Log](../../docs/imgs/airflow/finding_failed_log.png)
![Inspect the Log](../../docs/imgs/airflow/connection_error.png)
In this case, the `datahub_rest_default` connection has clearly not been registered. Looks like we forgot to register the connection with Airflow!
Let's go back to Step 3 and register the DataHub connection with Airflow.
After re-running the DAG, we see success!
![Pipeline Success](../../docs/imgs/airflow/successful_run.png)
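If lineage events still do not reach DataHub, the next thing to verify is that the Airflow containers can actually talk to the `datahub-gms` container over the shared `datahub_network`. A quick check (this assumes DataHub's default `datahub-gms` host name and port; `curl` is available in the Airflow image since the healthchecks rely on it):
```
# Should print DataHub's GMS configuration JSON if the network is wired up correctly
docker exec -it `docker ps | grep webserver | cut -d " " -f 1` curl -sS http://datahub-gms:8080/config
```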

View File

@@ -74,7 +74,7 @@ function list_markdown_files(): string[] {
/^datahub-kubernetes\//,
/^datahub-web\//,
/^metadata-ingestion-examples\//,
/^docker\/(?!README|datahub-upgrade)/, // Drop all but a few docker docs.
/^docker\/(?!README|datahub-upgrade|airflow\/local_airflow)/, // Drop all but a few docker docs.
/^docs\/rfc\/templates\/000-template\.md$/,
/^docs\/docker\/README\.md/, // This one is just a pointer to another file.
/^docs\/README\.md/, // This one is just a pointer to the hosted docs site.

View File

@@ -122,6 +122,7 @@ module.exports = {
"docs/how/delete-metadata",
"datahub-web-react/src/app/analytics/README",
"metadata-ingestion/developing",
"docker/airflow/local_airflow",
],
Components: [
"datahub-web-react/README",

(Binary image files not shown: eleven new screenshots for this guide, referenced above under docs/imgs/airflow/, and one updated existing image.)

View File

@@ -183,6 +183,12 @@ The Airflow lineage backend is only supported in Airflow 1.10.15+ and 2.0.2+.
:::
### Running on Docker locally
If you are looking to run Airflow and DataHub using docker locally, follow the guide [here](../docker/airflow/local_airflow.md). Otherwise, follow the instructions below.
### Setting up Airflow to use DataHub as Lineage Backend
1. You need to install the required dependency in your Airflow environment. See https://registry.astronomer.io/providers/datahub/modules/datahublineagebackend
```shell