mirror of
https://github.com/datahub-project/datahub.git
synced 2025-12-26 17:37:33 +00:00
feat(datahub cli): DataHub CLI Quickstart (#2689)
This commit is contained in:
parent
1c364b52f6
commit
7591c8994a
15
.github/workflows/build-and-test.yml
vendored
15
.github/workflows/build-and-test.yml
vendored
@ -65,10 +65,7 @@ jobs:
|
||||
- name: Gradle build
|
||||
run: ./gradlew build -x check -x docs-website:build
|
||||
- name: Smoke test
|
||||
run: |
|
||||
./docker/dev.sh -d
|
||||
sleep 30
|
||||
./smoke-test/smoke.sh
|
||||
run: ./smoke-test/smoke.sh
|
||||
- name: Slack failure notification
|
||||
if: failure() && github.event_name == 'push'
|
||||
uses: kpritam/slack-job-status-action@v1
|
||||
@ -76,3 +73,13 @@ jobs:
|
||||
job-status: ${{ job.status }}
|
||||
slack-bot-token: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
channel: github-activities
|
||||
|
||||
quickstart-compose-validation:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
- uses: actions/setup-python@v2
|
||||
with:
|
||||
python-version: "3.6"
|
||||
- name: Quickstart Compose Validation
|
||||
run: ./docker/quickstart/generate_and_compare.sh
|
||||
@ -14,6 +14,17 @@ services:
|
||||
- ./mysql/init.sql:/docker-entrypoint-initdb.d/init.sql
|
||||
- mysqldata:/var/lib/mysql
|
||||
|
||||
mysql-setup:
|
||||
build:
|
||||
context: ../
|
||||
dockerfile: docker/mysql-setup/Dockerfile
|
||||
image: acryldata/datahub-mysql-setup:head
|
||||
env_file: mysql-setup/env/docker.env
|
||||
hostname: mysql-setup
|
||||
container_name: mysql-setup
|
||||
depends_on:
|
||||
- mysql
|
||||
|
||||
datahub-gms:
|
||||
env_file: datahub-gms/env/docker.env
|
||||
depends_on:
|
||||
|
||||
2
docker/elasticsearch/env/docker.env
vendored
2
docker/elasticsearch/env/docker.env
vendored
@ -1,3 +1,3 @@
|
||||
discovery.type=single-node
|
||||
xpack.security.enabled=false
|
||||
ES_JAVA_OPTS=-Xms1g -Xmx1g
|
||||
ES_JAVA_OPTS=-Xms512m -Xmx512m
|
||||
|
||||
5
docker/mysql-setup/env/docker.env
vendored
Normal file
5
docker/mysql-setup/env/docker.env
vendored
Normal file
@ -0,0 +1,5 @@
|
||||
MYSQL_HOST=mysql
|
||||
MYSQL_PORT=3306
|
||||
MYSQL_USERNAME=datahub
|
||||
MYSQL_PASSWORD=datahub
|
||||
DATAHUB_DB_NAME=datahub
|
||||
208
docker/quickstart/docker-compose.quickstart.yml
Normal file
208
docker/quickstart/docker-compose.quickstart.yml
Normal file
@ -0,0 +1,208 @@
|
||||
networks:
|
||||
default:
|
||||
name: datahub_network
|
||||
services:
|
||||
broker:
|
||||
container_name: broker
|
||||
depends_on:
|
||||
- zookeeper
|
||||
environment:
|
||||
- KAFKA_BROKER_ID=1
|
||||
- KAFKA_ZOOKEEPER_CONNECT=zookeeper:2181
|
||||
- KAFKA_LISTENER_SECURITY_PROTOCOL_MAP=PLAINTEXT:PLAINTEXT,PLAINTEXT_HOST:PLAINTEXT
|
||||
- KAFKA_ADVERTISED_LISTENERS=PLAINTEXT://broker:29092,PLAINTEXT_HOST://localhost:9092
|
||||
- KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR=1
|
||||
- KAFKA_GROUP_INITIAL_REBALANCE_DELAY_MS=0
|
||||
hostname: broker
|
||||
image: confluentinc/cp-kafka:5.4.0
|
||||
ports:
|
||||
- 29092:29092
|
||||
- 9092:9092
|
||||
datahub-frontend-react:
|
||||
container_name: datahub-frontend-react
|
||||
depends_on:
|
||||
- datahub-gms
|
||||
environment:
|
||||
- DATAHUB_GMS_HOST=datahub-gms
|
||||
- DATAHUB_GMS_PORT=8080
|
||||
- DATAHUB_SECRET=YouKnowNothing
|
||||
- DATAHUB_APP_VERSION=1.0
|
||||
- DATAHUB_PLAY_MEM_BUFFER_SIZE=10MB
|
||||
- KAFKA_BOOTSTRAP_SERVER=broker:29092
|
||||
- DATAHUB_TRACKING_TOPIC=DataHubUsageEvent_v1
|
||||
- ELASTIC_CLIENT_HOST=elasticsearch
|
||||
- ELASTIC_CLIENT_PORT=9200
|
||||
hostname: datahub-frontend-react
|
||||
image: linkedin/datahub-frontend-react:${DATAHUB_VERSION:-latest}
|
||||
ports:
|
||||
- 9002:9002
|
||||
datahub-gms:
|
||||
container_name: datahub-gms
|
||||
depends_on:
|
||||
- mysql
|
||||
environment:
|
||||
- DATASET_ENABLE_SCSI=false
|
||||
- EBEAN_DATASOURCE_USERNAME=datahub
|
||||
- EBEAN_DATASOURCE_PASSWORD=datahub
|
||||
- EBEAN_DATASOURCE_HOST=mysql:3306
|
||||
- EBEAN_DATASOURCE_URL=jdbc:mysql://mysql:3306/datahub?verifyServerCertificate=false&useSSL=true&useUnicode=yes&characterEncoding=UTF-8
|
||||
- EBEAN_DATASOURCE_DRIVER=com.mysql.jdbc.Driver
|
||||
- KAFKA_BOOTSTRAP_SERVER=broker:29092
|
||||
- KAFKA_SCHEMAREGISTRY_URL=http://schema-registry:8081
|
||||
- ELASTICSEARCH_HOST=elasticsearch
|
||||
- ELASTICSEARCH_PORT=9200
|
||||
- NEO4J_HOST=http://neo4j:7474
|
||||
- NEO4J_URI=bolt://neo4j
|
||||
- NEO4J_USERNAME=neo4j
|
||||
- NEO4J_PASSWORD=datahub
|
||||
hostname: datahub-gms
|
||||
image: linkedin/datahub-gms:${DATAHUB_VERSION:-latest}
|
||||
mem_limit: 850m
|
||||
ports:
|
||||
- 8080:8080
|
||||
datahub-mae-consumer:
|
||||
container_name: datahub-mae-consumer
|
||||
depends_on:
|
||||
- kafka-setup
|
||||
- elasticsearch-setup
|
||||
- neo4j
|
||||
environment:
|
||||
- KAFKA_BOOTSTRAP_SERVER=broker:29092
|
||||
- KAFKA_SCHEMAREGISTRY_URL=http://schema-registry:8081
|
||||
- ELASTICSEARCH_HOST=elasticsearch
|
||||
- ELASTICSEARCH_PORT=9200
|
||||
- NEO4J_HOST=http://neo4j:7474
|
||||
- NEO4J_URI=bolt://neo4j
|
||||
- NEO4J_USERNAME=neo4j
|
||||
- NEO4J_PASSWORD=datahub
|
||||
- GMS_HOST=datahub-gms
|
||||
- GMS_PORT=8080
|
||||
hostname: datahub-mae-consumer
|
||||
image: linkedin/datahub-mae-consumer:${DATAHUB_VERSION:-latest}
|
||||
mem_limit: 256m
|
||||
ports:
|
||||
- 9091:9091
|
||||
datahub-mce-consumer:
|
||||
container_name: datahub-mce-consumer
|
||||
depends_on:
|
||||
- kafka-setup
|
||||
- datahub-gms
|
||||
environment:
|
||||
- KAFKA_BOOTSTRAP_SERVER=broker:29092
|
||||
- KAFKA_SCHEMAREGISTRY_URL=http://schema-registry:8081
|
||||
- GMS_HOST=datahub-gms
|
||||
- GMS_PORT=8080
|
||||
hostname: datahub-mce-consumer
|
||||
image: linkedin/datahub-mce-consumer:${DATAHUB_VERSION:-latest}
|
||||
mem_limit: 384m
|
||||
ports:
|
||||
- 9090:9090
|
||||
elasticsearch:
|
||||
container_name: elasticsearch
|
||||
environment:
|
||||
- discovery.type=single-node
|
||||
- xpack.security.enabled=false
|
||||
- ES_JAVA_OPTS=-Xms512m -Xmx512m
|
||||
healthcheck:
|
||||
retries: 4
|
||||
start_period: 2m
|
||||
test:
|
||||
- CMD-SHELL
|
||||
- curl -sS --fail 'http://localhost:9200/_cluster/health?wait_for_status=yellow&timeout=0s'
|
||||
|| exit 1
|
||||
hostname: elasticsearch
|
||||
image: elasticsearch:7.9.3
|
||||
mem_limit: 1g
|
||||
ports:
|
||||
- 9200:9200
|
||||
volumes:
|
||||
- esdata:/usr/share/elasticsearch/data
|
||||
elasticsearch-setup:
|
||||
container_name: elasticsearch-setup
|
||||
depends_on:
|
||||
- elasticsearch
|
||||
environment:
|
||||
- ELASTICSEARCH_HOST=elasticsearch
|
||||
- ELASTICSEARCH_PORT=9200
|
||||
- ELASTICSEARCH_PROTOCOL=http
|
||||
hostname: elasticsearch-setup
|
||||
image: linkedin/datahub-elasticsearch-setup:${DATAHUB_VERSION:-latest}
|
||||
kafka-setup:
|
||||
container_name: kafka-setup
|
||||
depends_on:
|
||||
- broker
|
||||
- schema-registry
|
||||
environment:
|
||||
- KAFKA_ZOOKEEPER_CONNECT=zookeeper:2181
|
||||
- KAFKA_BOOTSTRAP_SERVER=broker:29092
|
||||
hostname: kafka-setup
|
||||
image: linkedin/datahub-kafka-setup:${DATAHUB_VERSION:-latest}
|
||||
mysql:
|
||||
command: --character-set-server=utf8mb4 --collation-server=utf8mb4_unicode_ci
|
||||
container_name: mysql
|
||||
environment:
|
||||
- MYSQL_DATABASE=datahub
|
||||
- MYSQL_USER=datahub
|
||||
- MYSQL_PASSWORD=datahub
|
||||
- MYSQL_ROOT_PASSWORD=datahub
|
||||
hostname: mysql
|
||||
image: mysql:5.7
|
||||
ports:
|
||||
- 3306:3306
|
||||
volumes:
|
||||
- ./mysql/init.sql:/docker-entrypoint-initdb.d/init.sql
|
||||
- mysqldata:/var/lib/mysql
|
||||
mysql-setup:
|
||||
container_name: mysql-setup
|
||||
depends_on:
|
||||
- mysql
|
||||
environment:
|
||||
- MYSQL_HOST=mysql
|
||||
- MYSQL_PORT=3306
|
||||
- MYSQL_USERNAME=datahub
|
||||
- MYSQL_PASSWORD=datahub
|
||||
- DATAHUB_DB_NAME=datahub
|
||||
hostname: mysql-setup
|
||||
image: acryldata/datahub-mysql-setup:head
|
||||
neo4j:
|
||||
container_name: neo4j
|
||||
environment:
|
||||
- NEO4J_AUTH=neo4j/datahub
|
||||
- NEO4J_dbms_default__database=graph.db
|
||||
- NEO4J_dbms_allow__upgrade=true
|
||||
hostname: neo4j
|
||||
image: neo4j:4.0.6
|
||||
ports:
|
||||
- 7474:7474
|
||||
- 7687:7687
|
||||
volumes:
|
||||
- neo4jdata:/data
|
||||
schema-registry:
|
||||
container_name: schema-registry
|
||||
depends_on:
|
||||
- zookeeper
|
||||
- broker
|
||||
environment:
|
||||
- SCHEMA_REGISTRY_HOST_NAME=schemaregistry
|
||||
- SCHEMA_REGISTRY_KAFKASTORE_CONNECTION_URL=zookeeper:2181
|
||||
hostname: schema-registry
|
||||
image: confluentinc/cp-schema-registry:5.4.0
|
||||
ports:
|
||||
- 8081:8081
|
||||
zookeeper:
|
||||
container_name: zookeeper
|
||||
environment:
|
||||
- ZOOKEEPER_CLIENT_PORT=2181
|
||||
- ZOOKEEPER_TICK_TIME=2000
|
||||
hostname: zookeeper
|
||||
image: confluentinc/cp-zookeeper:5.4.0
|
||||
ports:
|
||||
- 2181:2181
|
||||
volumes:
|
||||
- zkdata:/var/opt/zookeeper
|
||||
version: '2'
|
||||
volumes:
|
||||
esdata: null
|
||||
mysqldata: null
|
||||
neo4jdata: null
|
||||
zkdata: null
|
||||
20
docker/quickstart/generate_and_compare.sh
Executable file
20
docker/quickstart/generate_and_compare.sh
Executable file
@ -0,0 +1,20 @@
|
||||
#!/bin/bash
|
||||
|
||||
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
|
||||
cd "$DIR"
|
||||
|
||||
set -euxo pipefail
|
||||
|
||||
python3 -m venv venv
|
||||
source venv/bin/activate
|
||||
|
||||
pip install -r requirements.txt
|
||||
python generate_docker_quickstart.py ../docker-compose.yml ../docker-compose.override.yml temp.quickstart.yml
|
||||
|
||||
if cmp docker-compose.quickstart.yml temp.quickstart.yml; then
|
||||
printf 'docker-compose.quickstart.yml is up to date.'
|
||||
exit 0
|
||||
else
|
||||
printf 'docker-compose.quickstart.yml is out of date.'
|
||||
exit 1
|
||||
fi
|
||||
113
docker/quickstart/generate_docker_quickstart.py
Normal file
113
docker/quickstart/generate_docker_quickstart.py
Normal file
@ -0,0 +1,113 @@
|
||||
import os
|
||||
from collections.abc import Mapping
|
||||
|
||||
import click
|
||||
import yaml
|
||||
from dotenv import dotenv_values
|
||||
from yaml import Loader
|
||||
|
||||
# Generates a merged docker-compose file with env variables inlined.
|
||||
# Usage: python3 docker_compose_cli_gen.py ../docker-compose.yml ../docker-compose.override.yml ../docker-compose-gen.yml
|
||||
|
||||
omitted_services = [
|
||||
"kafka-rest-proxy",
|
||||
"kafka-topics-ui",
|
||||
"schema-registry-ui",
|
||||
"kibana",
|
||||
]
|
||||
mem_limits = {
|
||||
"datahub-gms": "850m",
|
||||
"datahub-mae-consumer": "256m",
|
||||
"datahub-mce-consumer": "384m",
|
||||
"elasticsearch": "1g",
|
||||
}
|
||||
|
||||
|
||||
def dict_merge(dct, merge_dct):
|
||||
for k, v in merge_dct.items():
|
||||
if k in dct and isinstance(dct[k], dict) and isinstance(merge_dct[k], Mapping):
|
||||
dict_merge(dct[k], merge_dct[k])
|
||||
else:
|
||||
dct[k] = merge_dct[k]
|
||||
|
||||
|
||||
def modify_docker_config(base_path, docker_yaml_config):
|
||||
# 0. Filter out services to be omitted.
|
||||
for key in list(docker_yaml_config["services"]):
|
||||
if key in omitted_services:
|
||||
del docker_yaml_config["services"][key]
|
||||
|
||||
for name, service in docker_yaml_config["services"].items():
|
||||
# 1. Extract the env file pointer
|
||||
env_file = service.get("env_file")
|
||||
|
||||
if env_file is not None:
|
||||
# 2. Construct full .env path
|
||||
env_file_path = os.path.join(base_path, env_file)
|
||||
|
||||
# 3. Resolve the .env values
|
||||
env_vars = dotenv_values(env_file_path)
|
||||
|
||||
# 4. Add an "environment" block to YAML
|
||||
service["environment"] = list(
|
||||
f"{key}={value}" for key, value in env_vars.items()
|
||||
)
|
||||
|
||||
# 5. Delete the "env_file" value
|
||||
del service["env_file"]
|
||||
|
||||
# 6. Delete build instructions
|
||||
if "build" in service:
|
||||
del service["build"]
|
||||
|
||||
# 7. Set memory limits
|
||||
if name in mem_limits:
|
||||
service["mem_limit"] = mem_limits[name]
|
||||
|
||||
# 8. Set docker compose version to 2.
|
||||
docker_yaml_config["version"] = "2"
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.argument(
|
||||
"compose-files",
|
||||
nargs=-1,
|
||||
type=click.Path(
|
||||
exists=True,
|
||||
dir_okay=False,
|
||||
),
|
||||
)
|
||||
@click.argument("output-file", type=click.Path())
|
||||
def generate(compose_files, output_file) -> None:
|
||||
|
||||
# Resolve .env files to inlined vars
|
||||
modified_files = []
|
||||
for compose_file in compose_files:
|
||||
with open(compose_file, "r") as orig_conf:
|
||||
docker_config = yaml.load(orig_conf, Loader=Loader)
|
||||
|
||||
base_path = os.path.dirname(compose_file)
|
||||
modify_docker_config(base_path, docker_config)
|
||||
modified_files.append(docker_config)
|
||||
|
||||
# Merge services, networks, and volumes maps
|
||||
merged_docker_config = modified_files[0]
|
||||
for modified_file in modified_files:
|
||||
dict_merge(merged_docker_config, modified_file)
|
||||
|
||||
# Write output file
|
||||
output_dir = os.path.dirname(output_file)
|
||||
if len(output_dir) and not os.path.exists(output_dir):
|
||||
os.makedirs(output_dir)
|
||||
with open(output_file, "w") as new_conf_file:
|
||||
yaml.dump(
|
||||
merged_docker_config,
|
||||
new_conf_file,
|
||||
default_flow_style=False,
|
||||
)
|
||||
|
||||
print(f"Successfully generated {output_file}.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
generate()
|
||||
13
docker/quickstart/generate_docker_quickstart.sh
Executable file
13
docker/quickstart/generate_docker_quickstart.sh
Executable file
@ -0,0 +1,13 @@
|
||||
#!/bin/sh
|
||||
|
||||
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
|
||||
cd "$DIR"
|
||||
|
||||
set -euxo pipefail
|
||||
|
||||
python3 -m venv venv
|
||||
source venv/bin/activate
|
||||
|
||||
pip install -r requirements.txt
|
||||
python generate_docker_quickstart.py ../docker-compose.yml ../docker-compose.override.yml docker-compose.quickstart.yml
|
||||
|
||||
3
docker/quickstart/requirements.txt
Normal file
3
docker/quickstart/requirements.txt
Normal file
@ -0,0 +1,3 @@
|
||||
PyYAML==5.4.1
|
||||
python-dotenv==0.17.0
|
||||
click
|
||||
@ -1,19 +1,59 @@
|
||||
# DataHub Quickstart Guide
|
||||
|
||||
1. Install [docker](https://docs.docker.com/install/) and [docker-compose](https://docs.docker.com/compose/install/) (if using Linux). Make sure to allocate enough hardware resources for Docker engine. Tested & confirmed config: 2 CPUs, 8GB RAM, 2GB Swap area.
|
||||
2. Open Docker either from the command line or the desktop app and ensure it is up and running.
|
||||
3. Clone this repo and `cd` into the root directory of the cloned repository.
|
||||
4. Run the following command to download and run all Docker containers locally:
|
||||
```
|
||||
./docker/quickstart.sh
|
||||
```
|
||||
This step takes a while to run the first time, and it may be difficult to tell if DataHub is fully up and running from the combined log. Please use [this guide](debugging.md#how-can-i-confirm-if-all-docker-containers-are-running-as-expected-after-a-quickstart) to verify that each container is running correctly.
|
||||
5. At this point, you should be able to start DataHub by opening [http://localhost:9002](http://localhost:9002) in your browser. You can sign in using `datahub` as username and any password (no password validation by default). However, you'll notice that no data has been ingested yet.
|
||||
6. To ingest provided [sample data](https://github.com/linkedin/datahub/blob/master/metadata-ingestion/mce-cli/bootstrap_mce.dat) to DataHub, switch to a new terminal window, `cd` into the cloned `datahub` repo, and run the following command:
|
||||
```
|
||||
./docker/ingestion/ingestion.sh
|
||||
```
|
||||
After running this, you should be able to see and search sample datasets in DataHub.
|
||||
7. That's it! To get some real data into DataHub, take a look at the [ingestion framework](../metadata-ingestion/README.md).
|
||||
## Deploying DataHub
|
||||
|
||||
Please refer to the [debugging guide](debugging.md) if you encounter any issues during the quickstart.
|
||||
To deploy a new instance of DataHub, perform the following steps.
|
||||
|
||||
1. Install [docker](https://docs.docker.com/install/) and [docker-compose](https://docs.docker.com/compose/install/) (if using Linux). Make sure to allocate enough hardware resources for Docker engine. Tested & confirmed config: 2 CPUs, 8GB RAM, 2GB Swap area.
|
||||
|
||||
|
||||
2. Launch the Docker Engine from command line or the desktop app.
|
||||
|
||||
|
||||
3. Install the DataHub CLI
|
||||
|
||||
a. Ensure you have Python 3.6+ installed & configured. (Check using `python3 --version`)
|
||||
|
||||
b. Run the following commands in your terminal
|
||||
```
|
||||
python3 -m pip install --upgrade pip wheel setuptools
|
||||
python3 -m pip uninstall datahub acryl-datahub || true # sanity check - ok if it fails
|
||||
python3 -m pip install --upgrade acryl-datahub
|
||||
datahub version
|
||||
```
|
||||
If you see "command not found", try running cli commands with the prefix 'python3 -m' instead: `python3 -m datahub version`
|
||||
|
||||
|
||||
4. To deploy DataHub, run the following CLI command from your terminal
|
||||
```
|
||||
datahub docker quickstart
|
||||
```
|
||||
Upon completion of this step, you should be able to navigate to the DataHub UI at [http://localhost:9002](http://localhost:9002) in your browser. You can sign in using `datahub` as username and any password (no password validation by default).
|
||||
|
||||
|
||||
5. To ingest the sample metadata, run the following CLI command from your terminal
|
||||
```
|
||||
datahub docker ingest-sample-data
|
||||
```
|
||||
|
||||
That's it! To start pushing your company's metadata into DataHub, take a look at the [Metadata Ingestion Framework](../metadata-ingestion/README.md).
|
||||
|
||||
|
||||
## Resetting DataHub
|
||||
|
||||
To cleanse DataHub of all of it's state (e.g. before ingesting your own), you can use the CLI `nuke` command.
|
||||
|
||||
```
|
||||
datahub docker nuke
|
||||
```
|
||||
|
||||
|
||||
## FAQ
|
||||
|
||||
### Command not found: datahub
|
||||
|
||||
If running the datahub cli produces "command not found" errors inside your terminal, your system may be defaulting to an older
|
||||
version of Python. Try prefixing your `datahub` commands with `python3 -m`:
|
||||
```
|
||||
python3 -m datahub docker quickstart
|
||||
```
|
||||
@ -45,7 +45,7 @@ testpaths =
|
||||
tests/integration
|
||||
|
||||
[coverage:report]
|
||||
fail_under = 75
|
||||
fail_under = 70
|
||||
show_missing = true
|
||||
exclude_lines =
|
||||
pragma: no cover
|
||||
|
||||
@ -1,23 +1,23 @@
|
||||
import sys
|
||||
|
||||
import click
|
||||
|
||||
from datahub import __package_name__
|
||||
from datahub.check.docker import check_local_docker_containers
|
||||
from datahub.check.json_file import check_mce_file
|
||||
from datahub.cli.docker import docker_check_impl
|
||||
from datahub.cli.json_file import check_mce_file
|
||||
from datahub.ingestion.sink.sink_registry import sink_registry
|
||||
from datahub.ingestion.source.source_registry import source_registry
|
||||
from datahub.ingestion.transformer.transform_registry import transform_registry
|
||||
|
||||
|
||||
@click.group()
|
||||
def check() -> None:
|
||||
"""Helper commands for checking various aspects of DataHub."""
|
||||
pass
|
||||
|
||||
|
||||
@check.command()
|
||||
@click.argument("json-file", type=click.Path(exists=True, dir_okay=False))
|
||||
def mce_file(json_file: str) -> None:
|
||||
"""Check the schema of a MCE JSON file"""
|
||||
"""Check the schema of a MCE JSON file."""
|
||||
|
||||
report = check_mce_file(json_file)
|
||||
click.echo(report)
|
||||
@ -25,16 +25,9 @@ def mce_file(json_file: str) -> None:
|
||||
|
||||
@check.command()
|
||||
def local_docker() -> None:
|
||||
"""Check that the local Docker containers are healthy"""
|
||||
|
||||
issues = check_local_docker_containers()
|
||||
if not issues:
|
||||
click.secho("✔ No issues detected", fg="green")
|
||||
else:
|
||||
click.secho("The following issues were detected:", fg="bright_red")
|
||||
for issue in issues:
|
||||
click.echo(f"- {issue}")
|
||||
sys.exit(1)
|
||||
"""Check that the local Docker containers are healthy. (deprecated)"""
|
||||
click.secho("DeprecationWarning: use `datahub docker check` instead", fg="yellow")
|
||||
docker_check_impl()
|
||||
|
||||
|
||||
@check.command()
|
||||
@ -44,10 +37,10 @@ def local_docker() -> None:
|
||||
type=bool,
|
||||
is_flag=True,
|
||||
default=False,
|
||||
help="Include extra information for each plugin",
|
||||
help="Include extra information for each plugin.",
|
||||
)
|
||||
def plugins(verbose: bool) -> None:
|
||||
"""Check the enabled ingestion plugins"""
|
||||
"""List the enabled ingestion plugins."""
|
||||
|
||||
click.secho("Sources:", bold=True)
|
||||
click.echo(source_registry.summary(verbose=verbose))
|
||||
@ -55,6 +48,9 @@ def plugins(verbose: bool) -> None:
|
||||
click.secho("Sinks:", bold=True)
|
||||
click.echo(sink_registry.summary(verbose=verbose))
|
||||
click.echo()
|
||||
click.secho("Transformers:", bold=True)
|
||||
click.echo(transform_registry.summary(verbose=verbose, col_width=30))
|
||||
click.echo()
|
||||
if not verbose:
|
||||
click.echo("For details on why a plugin is disabled, rerun with '--verbose'")
|
||||
click.echo(
|
||||
290
metadata-ingestion/src/datahub/cli/docker.py
Normal file
290
metadata-ingestion/src/datahub/cli/docker.py
Normal file
@ -0,0 +1,290 @@
|
||||
import datetime
|
||||
import itertools
|
||||
import os
|
||||
import pathlib
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
import time
|
||||
from typing import List, NoReturn, Optional
|
||||
|
||||
import click
|
||||
import requests
|
||||
|
||||
from datahub.cli.docker_check import (
|
||||
check_local_docker_containers,
|
||||
get_client_with_error,
|
||||
)
|
||||
from datahub.ingestion.run.pipeline import Pipeline
|
||||
|
||||
SIMPLE_QUICKSTART_COMPOSE_FILE = "docker/quickstart/docker-compose.quickstart.yml"
|
||||
BOOTSTRAP_MCES_FILE = "metadata-ingestion/examples/mce_files/bootstrap_mce.json"
|
||||
|
||||
GITHUB_BASE_URL = "https://raw.githubusercontent.com/linkedin/datahub/master"
|
||||
GITHUB_QUICKSTART_COMPOSE_URL = f"{GITHUB_BASE_URL}/{SIMPLE_QUICKSTART_COMPOSE_FILE}"
|
||||
GITHUB_BOOTSTRAP_MCES_URL = f"{GITHUB_BASE_URL}/{BOOTSTRAP_MCES_FILE}"
|
||||
|
||||
|
||||
@click.group()
|
||||
def docker() -> None:
|
||||
"""Helper commands for setting up and interacting with a local
|
||||
DataHub instance using Docker."""
|
||||
pass
|
||||
|
||||
|
||||
def _print_issue_list_and_exit(
|
||||
issues: List[str], header: str, footer: Optional[str] = None
|
||||
) -> NoReturn:
|
||||
click.secho(header, fg="bright_red")
|
||||
for issue in issues:
|
||||
click.echo(f"- {issue}")
|
||||
if footer:
|
||||
click.echo()
|
||||
click.echo(footer)
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
def docker_check_impl() -> None:
|
||||
issues = check_local_docker_containers()
|
||||
if not issues:
|
||||
click.secho("✔ No issues detected", fg="green")
|
||||
else:
|
||||
_print_issue_list_and_exit(issues, "The following issues were detected:")
|
||||
|
||||
|
||||
@docker.command()
|
||||
def check() -> None:
|
||||
"""Check that the Docker containers are healthy"""
|
||||
docker_check_impl()
|
||||
|
||||
|
||||
@docker.command()
|
||||
@click.option(
|
||||
"--version",
|
||||
type=str,
|
||||
default="head",
|
||||
help="Datahub version to be deployed. If not set, deploy latest",
|
||||
)
|
||||
@click.option(
|
||||
"--build-locally",
|
||||
type=bool,
|
||||
is_flag=True,
|
||||
default=False,
|
||||
help="Attempt to build the containers locally before starting",
|
||||
)
|
||||
@click.option(
|
||||
"--quickstart-compose-file",
|
||||
type=click.Path(exists=True, dir_okay=False, readable=True),
|
||||
default=[],
|
||||
multiple=True,
|
||||
help="Use a local docker-compose file instead of pulling from GitHub",
|
||||
)
|
||||
@click.option(
|
||||
"--dump-logs-on-failure",
|
||||
type=bool,
|
||||
is_flag=True,
|
||||
default=False,
|
||||
help="If true, the docker-compose logs will be printed to console if something fails",
|
||||
)
|
||||
def quickstart(
|
||||
version: str,
|
||||
build_locally: bool,
|
||||
quickstart_compose_file: List[pathlib.Path],
|
||||
dump_logs_on_failure: bool,
|
||||
) -> None:
|
||||
"""Start an instance of DataHub locally using docker-compose.
|
||||
|
||||
This command will automatically download the latest docker-compose configuration
|
||||
from GitHub, pull the latest images, and bring up the DataHub system.
|
||||
There are options to override the docker-compose config file, build the containers
|
||||
locally, and dump logs to the console or to a file if something goes wrong.
|
||||
"""
|
||||
|
||||
# Run pre-flight checks.
|
||||
issues = check_local_docker_containers(preflight_only=True)
|
||||
if issues:
|
||||
_print_issue_list_and_exit(issues, "Unable to run quickstart:")
|
||||
|
||||
quickstart_compose_file = list(
|
||||
quickstart_compose_file
|
||||
) # convert to list from tuple
|
||||
if not quickstart_compose_file:
|
||||
click.echo("Fetching docker-compose file from GitHub")
|
||||
with tempfile.NamedTemporaryFile(suffix=".yml", delete=False) as tmp_file:
|
||||
path = pathlib.Path(tmp_file.name)
|
||||
quickstart_compose_file.append(path)
|
||||
|
||||
# Download the quickstart docker-compose file from GitHub.
|
||||
quickstart_download_response = requests.get(GITHUB_QUICKSTART_COMPOSE_URL)
|
||||
quickstart_download_response.raise_for_status()
|
||||
tmp_file.write(quickstart_download_response.content)
|
||||
|
||||
# set version
|
||||
os.environ["DATAHUB_VERSION"] = version
|
||||
|
||||
base_command: List[str] = [
|
||||
"docker-compose",
|
||||
*itertools.chain.from_iterable(
|
||||
("-f", f"{path}") for path in quickstart_compose_file
|
||||
),
|
||||
"-p",
|
||||
"datahub",
|
||||
]
|
||||
|
||||
# Pull and possibly build the latest containers.
|
||||
subprocess.run(
|
||||
[
|
||||
*base_command,
|
||||
"pull",
|
||||
],
|
||||
check=True,
|
||||
)
|
||||
if build_locally:
|
||||
subprocess.run(
|
||||
[
|
||||
*base_command,
|
||||
"build",
|
||||
"--pull",
|
||||
],
|
||||
check=True,
|
||||
env={
|
||||
**os.environ,
|
||||
"DOCKER_BUILDKIT": "1",
|
||||
},
|
||||
)
|
||||
|
||||
# Start it up! (with retries)
|
||||
max_wait_time = datetime.timedelta(minutes=6)
|
||||
start_time = datetime.datetime.now()
|
||||
sleep_interval = datetime.timedelta(seconds=2)
|
||||
up_interval = datetime.timedelta(seconds=30)
|
||||
up_attempts = 0
|
||||
while (datetime.datetime.now() - start_time) < max_wait_time:
|
||||
# Attempt to run docker-compose up every minute.
|
||||
if (datetime.datetime.now() - start_time) > up_attempts * up_interval:
|
||||
click.echo()
|
||||
subprocess.run(base_command + ["up", "-d"])
|
||||
up_attempts += 1
|
||||
|
||||
# Check docker health every few seconds.
|
||||
issues = check_local_docker_containers()
|
||||
if not issues:
|
||||
break
|
||||
|
||||
# Wait until next iteration.
|
||||
click.echo(".", nl=False)
|
||||
time.sleep(sleep_interval.total_seconds())
|
||||
else:
|
||||
# Falls through if the while loop doesn't exit via break.
|
||||
click.echo()
|
||||
with tempfile.NamedTemporaryFile(suffix=".log", delete=False) as log_file:
|
||||
ret = subprocess.run(
|
||||
base_command + ["logs"],
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.STDOUT,
|
||||
check=True,
|
||||
)
|
||||
log_file.write(ret.stdout)
|
||||
|
||||
if dump_logs_on_failure:
|
||||
with open(log_file.name, "r") as logs:
|
||||
click.echo("Dumping docker-compose logs:")
|
||||
click.echo(logs.read())
|
||||
click.echo()
|
||||
|
||||
_print_issue_list_and_exit(
|
||||
issues,
|
||||
header="Unable to run quickstart - the following issues were detected:",
|
||||
footer="If you think something went wrong, please file an issue at https://github.com/linkedin/datahub/issues\n"
|
||||
"or send a message in our Slack https://slack.datahubproject.io/\n"
|
||||
f"Be sure to attach the logs from {log_file.name}",
|
||||
)
|
||||
|
||||
# Handle success condition.
|
||||
click.echo()
|
||||
click.secho("✔ DataHub is now running", fg="green")
|
||||
click.secho(
|
||||
"Ingest some demo data using `datahub docker ingest-sample-data`,\n"
|
||||
"or head to http://localhost:9002 (username: datahub, password: datahub) to play around with the frontend.",
|
||||
fg="green",
|
||||
)
|
||||
|
||||
|
||||
@docker.command()
|
||||
@click.option(
|
||||
"--path",
|
||||
type=click.Path(exists=True, dir_okay=False),
|
||||
help=f"The MCE json file to ingest. Defaults to downloading {BOOTSTRAP_MCES_FILE} from GitHub",
|
||||
)
|
||||
def ingest_sample_data(path: Optional[str]) -> None:
|
||||
"""Ingest sample data into a running DataHub instance."""
|
||||
|
||||
if path is None:
|
||||
click.echo("Downloading sample data...")
|
||||
with tempfile.NamedTemporaryFile(suffix=".json", delete=False) as tmp_file:
|
||||
path = str(pathlib.Path(tmp_file.name))
|
||||
|
||||
# Download the bootstrap MCE file from GitHub.
|
||||
mce_json_download_response = requests.get(GITHUB_BOOTSTRAP_MCES_URL)
|
||||
mce_json_download_response.raise_for_status()
|
||||
tmp_file.write(mce_json_download_response.content)
|
||||
click.echo(f"Downloaded to {path}")
|
||||
|
||||
# Verify that docker is up.
|
||||
issues = check_local_docker_containers()
|
||||
if issues:
|
||||
_print_issue_list_and_exit(
|
||||
issues,
|
||||
header="Docker is not ready:",
|
||||
footer="Try running `datahub docker quickstart` first",
|
||||
)
|
||||
|
||||
# Run ingestion.
|
||||
click.echo("Starting ingestion...")
|
||||
pipeline = Pipeline.create(
|
||||
{
|
||||
"source": {
|
||||
"type": "file",
|
||||
"config": {
|
||||
"filename": path,
|
||||
},
|
||||
},
|
||||
"sink": {
|
||||
"type": "datahub-rest",
|
||||
"config": {"server": "http://localhost:8080"},
|
||||
},
|
||||
}
|
||||
)
|
||||
pipeline.run()
|
||||
ret = pipeline.pretty_print_summary()
|
||||
sys.exit(ret)
|
||||
|
||||
|
||||
@docker.command()
|
||||
def nuke() -> None:
|
||||
"""Remove all Docker containers, networks, and volumes associated with DataHub."""
|
||||
|
||||
with get_client_with_error() as (client, error):
|
||||
if error:
|
||||
click.secho(
|
||||
"Docker doesn't seem to be running. Did you start it?", fg="red"
|
||||
)
|
||||
return
|
||||
|
||||
click.echo("Removing containers in the datahub project")
|
||||
for container in client.containers.list(
|
||||
filters={"label": "com.docker.compose.project=datahub"}
|
||||
):
|
||||
container.remove(v=True, force=True)
|
||||
|
||||
click.echo("Removing volumes in the datahub project")
|
||||
for volume in client.volumes.list(
|
||||
filters={"label": "com.docker.compose.project=datahub"}
|
||||
):
|
||||
volume.remove(force=True)
|
||||
|
||||
click.echo("Removing networks in the datahub project")
|
||||
for network in client.networks.list(
|
||||
filters={"label": "com.docker.compose.project=datahub"}
|
||||
):
|
||||
network.remove()
|
||||
@ -3,11 +3,6 @@ from typing import List
|
||||
|
||||
import docker
|
||||
|
||||
ENSURE_EXIT_SUCCESS = [
|
||||
"kafka-setup",
|
||||
"elasticsearch-setup",
|
||||
]
|
||||
|
||||
REQUIRED_CONTAINERS = [
|
||||
"elasticsearch-setup",
|
||||
"elasticsearch",
|
||||
@ -28,8 +23,20 @@ REQUIRED_CONTAINERS = [
|
||||
# "kafka-rest-proxy",
|
||||
]
|
||||
|
||||
ENSURE_EXIT_SUCCESS = [
|
||||
"kafka-setup",
|
||||
"elasticsearch-setup",
|
||||
"mysql-setup",
|
||||
]
|
||||
|
||||
CONTAINERS_TO_CHECK_IF_PRESENT = [
|
||||
# We only add this container in some cases, but if it's present, we
|
||||
# definitely want to check that it exits properly.
|
||||
"mysql-setup",
|
||||
]
|
||||
|
||||
# Docker seems to under-report memory allocated, so we also need a bit of buffer to account for it.
|
||||
MIN_MEMORY_NEEDED = 6.75 # GB
|
||||
MIN_MEMORY_NEEDED = 3.8 # GB
|
||||
|
||||
|
||||
@contextmanager
|
||||
@ -49,7 +56,7 @@ def memory_in_gb(mem_bytes: int) -> float:
|
||||
return mem_bytes / (1024 * 1024 * 1000)
|
||||
|
||||
|
||||
def check_local_docker_containers() -> List[str]:
|
||||
def check_local_docker_containers(preflight_only: bool = False) -> List[str]:
|
||||
issues: List[str] = []
|
||||
with get_client_with_error() as (client, error):
|
||||
if error:
|
||||
@ -63,6 +70,9 @@ def check_local_docker_containers() -> List[str]:
|
||||
f"Total Docker memory configured {memory_in_gb(total_mem_configured):.2f}GB is below the minimum threshold {MIN_MEMORY_NEEDED}GB"
|
||||
)
|
||||
|
||||
if preflight_only:
|
||||
return issues
|
||||
|
||||
containers = client.containers.list(
|
||||
all=True,
|
||||
filters={
|
||||
@ -81,7 +91,9 @@ def check_local_docker_containers() -> List[str]:
|
||||
|
||||
# Check that the containers are running and healthy.
|
||||
for container in containers:
|
||||
if container.name not in REQUIRED_CONTAINERS:
|
||||
if container.name not in (
|
||||
REQUIRED_CONTAINERS + CONTAINERS_TO_CHECK_IF_PRESENT
|
||||
):
|
||||
# Ignores things like "datahub-frontend" which are no longer used.
|
||||
# This way, we only check required containers like "datahub-frontend-react"
|
||||
# even if there are some old containers lying around.
|
||||
@ -7,7 +7,8 @@ import click
|
||||
from pydantic import ValidationError
|
||||
|
||||
import datahub as datahub_package
|
||||
from datahub.check.check_cli import check
|
||||
from datahub.cli.check_cli import check
|
||||
from datahub.cli.docker import docker
|
||||
from datahub.configuration.config_loader import load_config_file
|
||||
from datahub.ingestion.run.pipeline import Pipeline
|
||||
|
||||
@ -25,7 +26,13 @@ BASE_LOGGING_FORMAT = (
|
||||
logging.basicConfig(format=BASE_LOGGING_FORMAT)
|
||||
|
||||
|
||||
@click.group()
|
||||
@click.group(
|
||||
context_settings=dict(
|
||||
# Avoid truncation of help text.
|
||||
# See https://github.com/pallets/click/issues/486.
|
||||
max_content_width=120,
|
||||
)
|
||||
)
|
||||
@click.option("--debug/--no-debug", default=False)
|
||||
@click.version_option(
|
||||
version=datahub_package.nice_version_name(),
|
||||
@ -45,7 +52,7 @@ def datahub(debug: bool) -> None:
|
||||
|
||||
@datahub.command()
|
||||
def version() -> None:
|
||||
"""Print version number and exit"""
|
||||
"""Print version number and exit."""
|
||||
click.echo(f"DataHub CLI version: {datahub_package.nice_version_name()}")
|
||||
click.echo(f"Python version: {sys.version}")
|
||||
|
||||
@ -55,11 +62,11 @@ def version() -> None:
|
||||
"-c",
|
||||
"--config",
|
||||
type=click.Path(exists=True, dir_okay=False),
|
||||
help="Config file in .toml or .yaml format",
|
||||
help="Config file in .toml or .yaml format.",
|
||||
required=True,
|
||||
)
|
||||
def ingest(config: str) -> None:
|
||||
"""Main command for ingesting metadata into DataHub"""
|
||||
"""Ingest metadata into DataHub."""
|
||||
|
||||
config_file = pathlib.Path(config)
|
||||
pipeline_config = load_config_file(config_file)
|
||||
@ -77,12 +84,16 @@ def ingest(config: str) -> None:
|
||||
|
||||
|
||||
datahub.add_command(check)
|
||||
datahub.add_command(docker)
|
||||
|
||||
|
||||
def main(**kwargs):
|
||||
# This wrapper prevents click from suppressing errors.
|
||||
try:
|
||||
sys.exit(datahub(standalone_mode=False, **kwargs))
|
||||
except click.exceptions.Abort:
|
||||
# Click already automatically prints an abort message, so we can just exit.
|
||||
sys.exit(1)
|
||||
except click.ClickException as error:
|
||||
error.show()
|
||||
sys.exit(1)
|
||||
|
||||
@ -84,10 +84,7 @@ class Registry(Generic[T]):
|
||||
# If it's not an exception, then it's a registered type.
|
||||
return tp
|
||||
|
||||
def summary(self, verbose=True):
|
||||
col_width = 15
|
||||
verbose_col_width = 20
|
||||
|
||||
def summary(self, verbose=True, col_width=15, verbose_col_width=20):
|
||||
lines = []
|
||||
for key in sorted(self._mapping.keys()):
|
||||
line = f"{key}"
|
||||
|
||||
@ -1,26 +1,26 @@
|
||||
#!/bin/bash
|
||||
set -euxo pipefail
|
||||
|
||||
# Runs a basic e2e test. It is not meant to be fully comprehensive,
|
||||
# but rather should catch obvious bugs before they make it into prod.
|
||||
#
|
||||
# Script assumptions:
|
||||
# - The gradle build has already been run.
|
||||
# - Python 3.6+ is installed.
|
||||
# - The metadata-ingestion codegen script has been run.
|
||||
# - A full DataHub setup is running on localhost with standard ports.
|
||||
# The easiest way to do this is by using the quickstart or dev
|
||||
# quickstart scripts.
|
||||
# - Python 3.6+ is installed and in the PATH.
|
||||
|
||||
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
|
||||
cd "$DIR"
|
||||
|
||||
set -euxo pipefail
|
||||
|
||||
python3 -m venv venv
|
||||
source venv/bin/activate
|
||||
pip install --upgrade pip wheel setuptools
|
||||
pip install -r requirements.txt
|
||||
|
||||
(cd ../metadata-ingestion && ./scripts/codegen.sh)
|
||||
datahub docker quickstart \
|
||||
--build-locally \
|
||||
--quickstart-compose-file ../docker/docker-compose.yml \
|
||||
--quickstart-compose-file ../docker/docker-compose.override.yml \
|
||||
--quickstart-compose-file ../docker/docker-compose.dev.yml \
|
||||
--dump-logs-on-failure
|
||||
|
||||
pytest -vv
|
||||
|
||||
@ -1,9 +1,9 @@
|
||||
import time
|
||||
|
||||
import pytest
|
||||
import requests
|
||||
|
||||
from datahub.cli.docker import check_local_docker_containers
|
||||
from datahub.ingestion.run.pipeline import Pipeline
|
||||
from datahub.check.docker import check_local_docker_containers
|
||||
|
||||
GMS_ENDPOINT = "http://localhost:8080"
|
||||
FRONTEND_ENDPOINT = "http://localhost:9002"
|
||||
@ -15,26 +15,13 @@ restli_default_headers = {
|
||||
"X-RestLi-Protocol-Version": "2.0.0",
|
||||
}
|
||||
kafka_post_ingestion_wait_sec = 60
|
||||
healthcheck_wait_retries = 20
|
||||
healthcheck_wait_interval_sec = 15
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def wait_for_healthchecks():
|
||||
tries = 0
|
||||
while tries < healthcheck_wait_retries:
|
||||
if tries > 0:
|
||||
time.sleep(healthcheck_wait_interval_sec)
|
||||
tries += 1
|
||||
|
||||
issues = check_local_docker_containers()
|
||||
if not issues:
|
||||
print(f"finished waiting for healthchecks after {tries} tries")
|
||||
yield
|
||||
return
|
||||
|
||||
issues_str = '\n'.join(f"- {issue}" for issue in issues)
|
||||
raise RuntimeError(f"retry limit exceeded while waiting for docker healthchecks\n{issues_str}")
|
||||
# Simply assert that everything is healthy, but don't wait.
|
||||
assert not check_local_docker_containers()
|
||||
yield
|
||||
|
||||
|
||||
@pytest.mark.dependency()
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user