feat(datahub cli): DataHub CLI Quickstart (#2689)

John Joyce 2021-06-14 17:15:24 -07:00 committed by GitHub
parent 1c364b52f6
commit 7591c8994a
20 changed files with 795 additions and 82 deletions


@@ -65,10 +65,7 @@ jobs:
- name: Gradle build
run: ./gradlew build -x check -x docs-website:build
- name: Smoke test
run: |
./docker/dev.sh -d
sleep 30
./smoke-test/smoke.sh
run: ./smoke-test/smoke.sh
- name: Slack failure notification
if: failure() && github.event_name == 'push'
uses: kpritam/slack-job-status-action@v1
@@ -76,3 +73,13 @@ jobs:
job-status: ${{ job.status }}
slack-bot-token: ${{ secrets.SLACK_BOT_TOKEN }}
channel: github-activities
quickstart-compose-validation:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- uses: actions/setup-python@v2
with:
python-version: "3.6"
- name: Quickstart Compose Validation
run: ./docker/quickstart/generate_and_compare.sh


@@ -14,6 +14,17 @@ services:
- ./mysql/init.sql:/docker-entrypoint-initdb.d/init.sql
- mysqldata:/var/lib/mysql
mysql-setup:
build:
context: ../
dockerfile: docker/mysql-setup/Dockerfile
image: acryldata/datahub-mysql-setup:head
env_file: mysql-setup/env/docker.env
hostname: mysql-setup
container_name: mysql-setup
depends_on:
- mysql
datahub-gms:
env_file: datahub-gms/env/docker.env
depends_on:


@@ -1,3 +1,3 @@
discovery.type=single-node
xpack.security.enabled=false
ES_JAVA_OPTS=-Xms1g -Xmx1g
ES_JAVA_OPTS=-Xms512m -Xmx512m

docker/mysql-setup/env/docker.env

@@ -0,0 +1,5 @@
MYSQL_HOST=mysql
MYSQL_PORT=3306
MYSQL_USERNAME=datahub
MYSQL_PASSWORD=datahub
DATAHUB_DB_NAME=datahub


@@ -0,0 +1,208 @@
networks:
default:
name: datahub_network
services:
broker:
container_name: broker
depends_on:
- zookeeper
environment:
- KAFKA_BROKER_ID=1
- KAFKA_ZOOKEEPER_CONNECT=zookeeper:2181
- KAFKA_LISTENER_SECURITY_PROTOCOL_MAP=PLAINTEXT:PLAINTEXT,PLAINTEXT_HOST:PLAINTEXT
- KAFKA_ADVERTISED_LISTENERS=PLAINTEXT://broker:29092,PLAINTEXT_HOST://localhost:9092
- KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR=1
- KAFKA_GROUP_INITIAL_REBALANCE_DELAY_MS=0
hostname: broker
image: confluentinc/cp-kafka:5.4.0
ports:
- 29092:29092
- 9092:9092
datahub-frontend-react:
container_name: datahub-frontend-react
depends_on:
- datahub-gms
environment:
- DATAHUB_GMS_HOST=datahub-gms
- DATAHUB_GMS_PORT=8080
- DATAHUB_SECRET=YouKnowNothing
- DATAHUB_APP_VERSION=1.0
- DATAHUB_PLAY_MEM_BUFFER_SIZE=10MB
- KAFKA_BOOTSTRAP_SERVER=broker:29092
- DATAHUB_TRACKING_TOPIC=DataHubUsageEvent_v1
- ELASTIC_CLIENT_HOST=elasticsearch
- ELASTIC_CLIENT_PORT=9200
hostname: datahub-frontend-react
image: linkedin/datahub-frontend-react:${DATAHUB_VERSION:-latest}
ports:
- 9002:9002
datahub-gms:
container_name: datahub-gms
depends_on:
- mysql
environment:
- DATASET_ENABLE_SCSI=false
- EBEAN_DATASOURCE_USERNAME=datahub
- EBEAN_DATASOURCE_PASSWORD=datahub
- EBEAN_DATASOURCE_HOST=mysql:3306
- EBEAN_DATASOURCE_URL=jdbc:mysql://mysql:3306/datahub?verifyServerCertificate=false&useSSL=true&useUnicode=yes&characterEncoding=UTF-8
- EBEAN_DATASOURCE_DRIVER=com.mysql.jdbc.Driver
- KAFKA_BOOTSTRAP_SERVER=broker:29092
- KAFKA_SCHEMAREGISTRY_URL=http://schema-registry:8081
- ELASTICSEARCH_HOST=elasticsearch
- ELASTICSEARCH_PORT=9200
- NEO4J_HOST=http://neo4j:7474
- NEO4J_URI=bolt://neo4j
- NEO4J_USERNAME=neo4j
- NEO4J_PASSWORD=datahub
hostname: datahub-gms
image: linkedin/datahub-gms:${DATAHUB_VERSION:-latest}
mem_limit: 850m
ports:
- 8080:8080
datahub-mae-consumer:
container_name: datahub-mae-consumer
depends_on:
- kafka-setup
- elasticsearch-setup
- neo4j
environment:
- KAFKA_BOOTSTRAP_SERVER=broker:29092
- KAFKA_SCHEMAREGISTRY_URL=http://schema-registry:8081
- ELASTICSEARCH_HOST=elasticsearch
- ELASTICSEARCH_PORT=9200
- NEO4J_HOST=http://neo4j:7474
- NEO4J_URI=bolt://neo4j
- NEO4J_USERNAME=neo4j
- NEO4J_PASSWORD=datahub
- GMS_HOST=datahub-gms
- GMS_PORT=8080
hostname: datahub-mae-consumer
image: linkedin/datahub-mae-consumer:${DATAHUB_VERSION:-latest}
mem_limit: 256m
ports:
- 9091:9091
datahub-mce-consumer:
container_name: datahub-mce-consumer
depends_on:
- kafka-setup
- datahub-gms
environment:
- KAFKA_BOOTSTRAP_SERVER=broker:29092
- KAFKA_SCHEMAREGISTRY_URL=http://schema-registry:8081
- GMS_HOST=datahub-gms
- GMS_PORT=8080
hostname: datahub-mce-consumer
image: linkedin/datahub-mce-consumer:${DATAHUB_VERSION:-latest}
mem_limit: 384m
ports:
- 9090:9090
elasticsearch:
container_name: elasticsearch
environment:
- discovery.type=single-node
- xpack.security.enabled=false
- ES_JAVA_OPTS=-Xms512m -Xmx512m
healthcheck:
retries: 4
start_period: 2m
test:
- CMD-SHELL
- curl -sS --fail 'http://localhost:9200/_cluster/health?wait_for_status=yellow&timeout=0s'
|| exit 1
hostname: elasticsearch
image: elasticsearch:7.9.3
mem_limit: 1g
ports:
- 9200:9200
volumes:
- esdata:/usr/share/elasticsearch/data
elasticsearch-setup:
container_name: elasticsearch-setup
depends_on:
- elasticsearch
environment:
- ELASTICSEARCH_HOST=elasticsearch
- ELASTICSEARCH_PORT=9200
- ELASTICSEARCH_PROTOCOL=http
hostname: elasticsearch-setup
image: linkedin/datahub-elasticsearch-setup:${DATAHUB_VERSION:-latest}
kafka-setup:
container_name: kafka-setup
depends_on:
- broker
- schema-registry
environment:
- KAFKA_ZOOKEEPER_CONNECT=zookeeper:2181
- KAFKA_BOOTSTRAP_SERVER=broker:29092
hostname: kafka-setup
image: linkedin/datahub-kafka-setup:${DATAHUB_VERSION:-latest}
mysql:
command: --character-set-server=utf8mb4 --collation-server=utf8mb4_unicode_ci
container_name: mysql
environment:
- MYSQL_DATABASE=datahub
- MYSQL_USER=datahub
- MYSQL_PASSWORD=datahub
- MYSQL_ROOT_PASSWORD=datahub
hostname: mysql
image: mysql:5.7
ports:
- 3306:3306
volumes:
- ./mysql/init.sql:/docker-entrypoint-initdb.d/init.sql
- mysqldata:/var/lib/mysql
mysql-setup:
container_name: mysql-setup
depends_on:
- mysql
environment:
- MYSQL_HOST=mysql
- MYSQL_PORT=3306
- MYSQL_USERNAME=datahub
- MYSQL_PASSWORD=datahub
- DATAHUB_DB_NAME=datahub
hostname: mysql-setup
image: acryldata/datahub-mysql-setup:head
neo4j:
container_name: neo4j
environment:
- NEO4J_AUTH=neo4j/datahub
- NEO4J_dbms_default__database=graph.db
- NEO4J_dbms_allow__upgrade=true
hostname: neo4j
image: neo4j:4.0.6
ports:
- 7474:7474
- 7687:7687
volumes:
- neo4jdata:/data
schema-registry:
container_name: schema-registry
depends_on:
- zookeeper
- broker
environment:
- SCHEMA_REGISTRY_HOST_NAME=schemaregistry
- SCHEMA_REGISTRY_KAFKASTORE_CONNECTION_URL=zookeeper:2181
hostname: schema-registry
image: confluentinc/cp-schema-registry:5.4.0
ports:
- 8081:8081
zookeeper:
container_name: zookeeper
environment:
- ZOOKEEPER_CLIENT_PORT=2181
- ZOOKEEPER_TICK_TIME=2000
hostname: zookeeper
image: confluentinc/cp-zookeeper:5.4.0
ports:
- 2181:2181
volumes:
- zkdata:/var/opt/zookeeper
version: '2'
volumes:
esdata: null
mysqldata: null
neo4jdata: null
zkdata: null


@@ -0,0 +1,20 @@
#!/bin/bash
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
cd "$DIR"
set -euxo pipefail
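# Regenerate the quickstart compose file into a temp file and fail if it differs from the checked-in docker-compose.quickstart.yml.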
python3 -m venv venv
source venv/bin/activate
pip install -r requirements.txt
python generate_docker_quickstart.py ../docker-compose.yml ../docker-compose.override.yml temp.quickstart.yml
if cmp docker-compose.quickstart.yml temp.quickstart.yml; then
printf 'docker-compose.quickstart.yml is up to date.'
exit 0
else
printf 'docker-compose.quickstart.yml is out of date.'
exit 1
fi


@@ -0,0 +1,113 @@
import os
from collections.abc import Mapping
import click
import yaml
from dotenv import dotenv_values
from yaml import Loader
# Generates a merged docker-compose file with env variables inlined.
# Usage: python3 generate_docker_quickstart.py ../docker-compose.yml ../docker-compose.override.yml ../docker-compose-gen.yml
omitted_services = [
"kafka-rest-proxy",
"kafka-topics-ui",
"schema-registry-ui",
"kibana",
]
mem_limits = {
"datahub-gms": "850m",
"datahub-mae-consumer": "256m",
"datahub-mce-consumer": "384m",
"elasticsearch": "1g",
}
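# Recursively merge merge_dct into dct in place: nested mappings are merged key by key, all other values in dct are overwritten.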
def dict_merge(dct, merge_dct):
for k, v in merge_dct.items():
if k in dct and isinstance(dct[k], dict) and isinstance(merge_dct[k], Mapping):
dict_merge(dct[k], merge_dct[k])
else:
dct[k] = merge_dct[k]
def modify_docker_config(base_path, docker_yaml_config):
# 0. Filter out services to be omitted.
for key in list(docker_yaml_config["services"]):
if key in omitted_services:
del docker_yaml_config["services"][key]
for name, service in docker_yaml_config["services"].items():
# 1. Extract the env file pointer
env_file = service.get("env_file")
if env_file is not None:
# 2. Construct full .env path
env_file_path = os.path.join(base_path, env_file)
# 3. Resolve the .env values
env_vars = dotenv_values(env_file_path)
# 4. Add an "environment" block to YAML
service["environment"] = list(
f"{key}={value}" for key, value in env_vars.items()
)
# 5. Delete the "env_file" value
del service["env_file"]
# 6. Delete build instructions
if "build" in service:
del service["build"]
# 7. Set memory limits
if name in mem_limits:
service["mem_limit"] = mem_limits[name]
# 8. Set docker compose version to 2.
docker_yaml_config["version"] = "2"
@click.command()
@click.argument(
"compose-files",
nargs=-1,
type=click.Path(
exists=True,
dir_okay=False,
),
)
@click.argument("output-file", type=click.Path())
def generate(compose_files, output_file) -> None:
# Resolve .env files to inlined vars
modified_files = []
for compose_file in compose_files:
with open(compose_file, "r") as orig_conf:
docker_config = yaml.load(orig_conf, Loader=Loader)
base_path = os.path.dirname(compose_file)
modify_docker_config(base_path, docker_config)
modified_files.append(docker_config)
# Merge services, networks, and volumes maps
merged_docker_config = modified_files[0]
for modified_file in modified_files:
dict_merge(merged_docker_config, modified_file)
# Write output file
output_dir = os.path.dirname(output_file)
if len(output_dir) and not os.path.exists(output_dir):
os.makedirs(output_dir)
with open(output_file, "w") as new_conf_file:
yaml.dump(
merged_docker_config,
new_conf_file,
default_flow_style=False,
)
print(f"Successfully generated {output_file}.")
if __name__ == "__main__":
generate()


@@ -0,0 +1,13 @@
#!/bin/bash
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
cd "$DIR"
set -euxo pipefail
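# Regenerate docker-compose.quickstart.yml by merging the base and override compose files with their env vars inlined.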
python3 -m venv venv
source venv/bin/activate
pip install -r requirements.txt
python generate_docker_quickstart.py ../docker-compose.yml ../docker-compose.override.yml docker-compose.quickstart.yml


@@ -0,0 +1,3 @@
PyYAML==5.4.1
python-dotenv==0.17.0
click


@@ -1,19 +1,59 @@
# DataHub Quickstart Guide
1. Install [docker](https://docs.docker.com/install/) and [docker-compose](https://docs.docker.com/compose/install/) (if using Linux). Make sure to allocate enough hardware resources for the Docker engine. Tested & confirmed config: 2 CPUs, 8GB RAM, 2GB swap area.
2. Open Docker either from the command line or the desktop app and ensure it is up and running.
3. Clone this repo and `cd` into the root directory of the cloned repository.
4. Run the following command to download and run all Docker containers locally:
```
./docker/quickstart.sh
```
This step takes a while the first time it runs, and it can be difficult to tell from the combined log whether DataHub is fully up and running. Please use [this guide](debugging.md#how-can-i-confirm-if-all-docker-containers-are-running-as-expected-after-a-quickstart) to verify that each container is running correctly.
5. At this point, you should be able to open DataHub by navigating to [http://localhost:9002](http://localhost:9002) in your browser. You can sign in using `datahub` as the username and any password (no password validation by default). However, you'll notice that no data has been ingested yet.
6. To ingest provided [sample data](https://github.com/linkedin/datahub/blob/master/metadata-ingestion/mce-cli/bootstrap_mce.dat) to DataHub, switch to a new terminal window, `cd` into the cloned `datahub` repo, and run the following command:
```
./docker/ingestion/ingestion.sh
```
After running this, you should be able to see and search sample datasets in DataHub.
7. That's it! To get some real data into DataHub, take a look at the [ingestion framework](../metadata-ingestion/README.md).
## Deploying DataHub
Please refer to the [debugging guide](debugging.md) if you encounter any issues during the quickstart.
To deploy a new instance of DataHub, perform the following steps.
1. Install [docker](https://docs.docker.com/install/) and [docker-compose](https://docs.docker.com/compose/install/) (if using Linux). Make sure to allocate enough hardware resources for the Docker engine. Tested & confirmed config: 2 CPUs, 8GB RAM, 2GB swap area.
2. Launch the Docker Engine from the command line or via the desktop app.
3. Install the DataHub CLI
a. Ensure you have Python 3.6+ installed & configured. (Check using `python3 --version`)
b. Run the following commands in your terminal
```
python3 -m pip install --upgrade pip wheel setuptools
python3 -m pip uninstall datahub acryl-datahub || true # sanity check - ok if it fails
python3 -m pip install --upgrade acryl-datahub
datahub version
```
If you see "command not found", try running CLI commands with the `python3 -m` prefix instead: `python3 -m datahub version`
4. To deploy DataHub, run the following CLI command from your terminal
```
datahub docker quickstart
```
Upon completion of this step, you should be able to navigate to the DataHub UI at [http://localhost:9002](http://localhost:9002) in your browser. You can sign in using `datahub` as the username and any password (no password validation by default).
5. To ingest the sample metadata, run the following CLI command from your terminal
```
datahub docker ingest-sample-data
```
That's it! To start pushing your company's metadata into DataHub, take a look at the [Metadata Ingestion Framework](../metadata-ingestion/README.md).
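At any point, you can confirm that all of the DataHub containers are up and healthy using the CLI's `docker check` subcommand (introduced alongside `quickstart` in this commit):
```
datahub docker check
```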
## Resetting DataHub
To wipe all of DataHub's state (e.g. before ingesting your own metadata), you can use the CLI `nuke` command.
```
datahub docker nuke
```
## FAQ
### Command not found: datahub
If running the `datahub` CLI produces "command not found" errors in your terminal, your system may be defaulting to an older
version of Python. Try prefixing your `datahub` commands with `python3 -m`:
```
python3 -m datahub docker quickstart
```


@@ -45,7 +45,7 @@ testpaths =
tests/integration
[coverage:report]
fail_under = 75
fail_under = 70
show_missing = true
exclude_lines =
pragma: no cover


@@ -1,23 +1,23 @@
import sys
import click
from datahub import __package_name__
from datahub.check.docker import check_local_docker_containers
from datahub.check.json_file import check_mce_file
from datahub.cli.docker import docker_check_impl
from datahub.cli.json_file import check_mce_file
from datahub.ingestion.sink.sink_registry import sink_registry
from datahub.ingestion.source.source_registry import source_registry
from datahub.ingestion.transformer.transform_registry import transform_registry
@click.group()
def check() -> None:
"""Helper commands for checking various aspects of DataHub."""
pass
@check.command()
@click.argument("json-file", type=click.Path(exists=True, dir_okay=False))
def mce_file(json_file: str) -> None:
"""Check the schema of a MCE JSON file"""
"""Check the schema of a MCE JSON file."""
report = check_mce_file(json_file)
click.echo(report)
@@ -25,16 +25,9 @@ def mce_file(json_file: str) -> None:
@check.command()
def local_docker() -> None:
"""Check that the local Docker containers are healthy"""
issues = check_local_docker_containers()
if not issues:
click.secho("✔ No issues detected", fg="green")
else:
click.secho("The following issues were detected:", fg="bright_red")
for issue in issues:
click.echo(f"- {issue}")
sys.exit(1)
"""Check that the local Docker containers are healthy. (deprecated)"""
click.secho("DeprecationWarning: use `datahub docker check` instead", fg="yellow")
docker_check_impl()
@check.command()
@@ -44,10 +37,10 @@ def local_docker() -> None:
type=bool,
is_flag=True,
default=False,
help="Include extra information for each plugin",
help="Include extra information for each plugin.",
)
def plugins(verbose: bool) -> None:
"""Check the enabled ingestion plugins"""
"""List the enabled ingestion plugins."""
click.secho("Sources:", bold=True)
click.echo(source_registry.summary(verbose=verbose))
@@ -55,6 +48,9 @@ def plugins(verbose: bool) -> None:
click.secho("Sinks:", bold=True)
click.echo(sink_registry.summary(verbose=verbose))
click.echo()
click.secho("Transformers:", bold=True)
click.echo(transform_registry.summary(verbose=verbose, col_width=30))
click.echo()
if not verbose:
click.echo("For details on why a plugin is disabled, rerun with '--verbose'")
click.echo(


@@ -0,0 +1,290 @@
import datetime
import itertools
import os
import pathlib
import subprocess
import sys
import tempfile
import time
from typing import List, NoReturn, Optional
import click
import requests
from datahub.cli.docker_check import (
check_local_docker_containers,
get_client_with_error,
)
from datahub.ingestion.run.pipeline import Pipeline
SIMPLE_QUICKSTART_COMPOSE_FILE = "docker/quickstart/docker-compose.quickstart.yml"
BOOTSTRAP_MCES_FILE = "metadata-ingestion/examples/mce_files/bootstrap_mce.json"
GITHUB_BASE_URL = "https://raw.githubusercontent.com/linkedin/datahub/master"
GITHUB_QUICKSTART_COMPOSE_URL = f"{GITHUB_BASE_URL}/{SIMPLE_QUICKSTART_COMPOSE_FILE}"
GITHUB_BOOTSTRAP_MCES_URL = f"{GITHUB_BASE_URL}/{BOOTSTRAP_MCES_FILE}"
@click.group()
def docker() -> None:
"""Helper commands for setting up and interacting with a local
DataHub instance using Docker."""
pass
def _print_issue_list_and_exit(
issues: List[str], header: str, footer: Optional[str] = None
) -> NoReturn:
click.secho(header, fg="bright_red")
for issue in issues:
click.echo(f"- {issue}")
if footer:
click.echo()
click.echo(footer)
sys.exit(1)
def docker_check_impl() -> None:
issues = check_local_docker_containers()
if not issues:
click.secho("✔ No issues detected", fg="green")
else:
_print_issue_list_and_exit(issues, "The following issues were detected:")
@docker.command()
def check() -> None:
"""Check that the Docker containers are healthy"""
docker_check_impl()
@docker.command()
@click.option(
"--version",
type=str,
default="head",
help="Datahub version to be deployed. If not set, deploy latest",
)
@click.option(
"--build-locally",
type=bool,
is_flag=True,
default=False,
help="Attempt to build the containers locally before starting",
)
@click.option(
"--quickstart-compose-file",
type=click.Path(exists=True, dir_okay=False, readable=True),
default=[],
multiple=True,
help="Use a local docker-compose file instead of pulling from GitHub",
)
@click.option(
"--dump-logs-on-failure",
type=bool,
is_flag=True,
default=False,
help="If true, the docker-compose logs will be printed to console if something fails",
)
def quickstart(
version: str,
build_locally: bool,
quickstart_compose_file: List[pathlib.Path],
dump_logs_on_failure: bool,
) -> None:
"""Start an instance of DataHub locally using docker-compose.
This command will automatically download the latest docker-compose configuration
from GitHub, pull the latest images, and bring up the DataHub system.
There are options to override the docker-compose config file, build the containers
locally, and dump logs to the console or to a file if something goes wrong.
"""
# Run pre-flight checks.
issues = check_local_docker_containers(preflight_only=True)
if issues:
_print_issue_list_and_exit(issues, "Unable to run quickstart:")
quickstart_compose_file = list(
quickstart_compose_file
) # convert to list from tuple
if not quickstart_compose_file:
click.echo("Fetching docker-compose file from GitHub")
with tempfile.NamedTemporaryFile(suffix=".yml", delete=False) as tmp_file:
path = pathlib.Path(tmp_file.name)
quickstart_compose_file.append(path)
# Download the quickstart docker-compose file from GitHub.
quickstart_download_response = requests.get(GITHUB_QUICKSTART_COMPOSE_URL)
quickstart_download_response.raise_for_status()
tmp_file.write(quickstart_download_response.content)
# set version
os.environ["DATAHUB_VERSION"] = version
base_command: List[str] = [
"docker-compose",
*itertools.chain.from_iterable(
("-f", f"{path}") for path in quickstart_compose_file
),
"-p",
"datahub",
]
# Pull and possibly build the latest containers.
subprocess.run(
[
*base_command,
"pull",
],
check=True,
)
if build_locally:
subprocess.run(
[
*base_command,
"build",
"--pull",
],
check=True,
env={
**os.environ,
"DOCKER_BUILDKIT": "1",
},
)
# Start it up! (with retries)
max_wait_time = datetime.timedelta(minutes=6)
start_time = datetime.datetime.now()
sleep_interval = datetime.timedelta(seconds=2)
up_interval = datetime.timedelta(seconds=30)
up_attempts = 0
while (datetime.datetime.now() - start_time) < max_wait_time:
# Attempt to run docker-compose up once per up_interval (30 seconds).
if (datetime.datetime.now() - start_time) > up_attempts * up_interval:
click.echo()
subprocess.run(base_command + ["up", "-d"])
up_attempts += 1
# Check docker health every few seconds.
issues = check_local_docker_containers()
if not issues:
break
# Wait until next iteration.
click.echo(".", nl=False)
time.sleep(sleep_interval.total_seconds())
else:
# Falls through if the while loop doesn't exit via break.
click.echo()
with tempfile.NamedTemporaryFile(suffix=".log", delete=False) as log_file:
ret = subprocess.run(
base_command + ["logs"],
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
check=True,
)
log_file.write(ret.stdout)
if dump_logs_on_failure:
with open(log_file.name, "r") as logs:
click.echo("Dumping docker-compose logs:")
click.echo(logs.read())
click.echo()
_print_issue_list_and_exit(
issues,
header="Unable to run quickstart - the following issues were detected:",
footer="If you think something went wrong, please file an issue at https://github.com/linkedin/datahub/issues\n"
"or send a message in our Slack https://slack.datahubproject.io/\n"
f"Be sure to attach the logs from {log_file.name}",
)
# Handle success condition.
click.echo()
click.secho("✔ DataHub is now running", fg="green")
click.secho(
"Ingest some demo data using `datahub docker ingest-sample-data`,\n"
"or head to http://localhost:9002 (username: datahub, password: datahub) to play around with the frontend.",
fg="green",
)
@docker.command()
@click.option(
"--path",
type=click.Path(exists=True, dir_okay=False),
help=f"The MCE json file to ingest. Defaults to downloading {BOOTSTRAP_MCES_FILE} from GitHub",
)
def ingest_sample_data(path: Optional[str]) -> None:
"""Ingest sample data into a running DataHub instance."""
if path is None:
click.echo("Downloading sample data...")
with tempfile.NamedTemporaryFile(suffix=".json", delete=False) as tmp_file:
path = str(pathlib.Path(tmp_file.name))
# Download the bootstrap MCE file from GitHub.
mce_json_download_response = requests.get(GITHUB_BOOTSTRAP_MCES_URL)
mce_json_download_response.raise_for_status()
tmp_file.write(mce_json_download_response.content)
click.echo(f"Downloaded to {path}")
# Verify that docker is up.
issues = check_local_docker_containers()
if issues:
_print_issue_list_and_exit(
issues,
header="Docker is not ready:",
footer="Try running `datahub docker quickstart` first",
)
# Run ingestion.
click.echo("Starting ingestion...")
pipeline = Pipeline.create(
{
"source": {
"type": "file",
"config": {
"filename": path,
},
},
"sink": {
"type": "datahub-rest",
"config": {"server": "http://localhost:8080"},
},
}
)
pipeline.run()
ret = pipeline.pretty_print_summary()
sys.exit(ret)
@docker.command()
def nuke() -> None:
"""Remove all Docker containers, networks, and volumes associated with DataHub."""
with get_client_with_error() as (client, error):
if error:
click.secho(
"Docker doesn't seem to be running. Did you start it?", fg="red"
)
return
click.echo("Removing containers in the datahub project")
for container in client.containers.list(
filters={"label": "com.docker.compose.project=datahub"}
):
container.remove(v=True, force=True)
click.echo("Removing volumes in the datahub project")
for volume in client.volumes.list(
filters={"label": "com.docker.compose.project=datahub"}
):
volume.remove(force=True)
click.echo("Removing networks in the datahub project")
for network in client.networks.list(
filters={"label": "com.docker.compose.project=datahub"}
):
network.remove()


@@ -3,11 +3,6 @@ from typing import List
import docker
ENSURE_EXIT_SUCCESS = [
"kafka-setup",
"elasticsearch-setup",
]
REQUIRED_CONTAINERS = [
"elasticsearch-setup",
"elasticsearch",
@@ -28,8 +23,20 @@ REQUIRED_CONTAINERS = [
# "kafka-rest-proxy",
]
ENSURE_EXIT_SUCCESS = [
"kafka-setup",
"elasticsearch-setup",
"mysql-setup",
]
CONTAINERS_TO_CHECK_IF_PRESENT = [
# We only add this container in some cases, but if it's present, we
# definitely want to check that it exits properly.
"mysql-setup",
]
# Docker seems to under-report memory allocated, so we also need a bit of buffer to account for it.
MIN_MEMORY_NEEDED = 6.75 # GB
MIN_MEMORY_NEEDED = 3.8 # GB
@contextmanager
@@ -49,7 +56,7 @@ def memory_in_gb(mem_bytes: int) -> float:
return mem_bytes / (1024 * 1024 * 1000)
def check_local_docker_containers() -> List[str]:
def check_local_docker_containers(preflight_only: bool = False) -> List[str]:
issues: List[str] = []
with get_client_with_error() as (client, error):
if error:
@@ -63,6 +70,9 @@ def check_local_docker_containers() -> List[str]:
f"Total Docker memory configured {memory_in_gb(total_mem_configured):.2f}GB is below the minimum threshold {MIN_MEMORY_NEEDED}GB"
)
if preflight_only:
return issues
containers = client.containers.list(
all=True,
filters={
@@ -81,7 +91,9 @@ def check_local_docker_containers() -> List[str]:
# Check that the containers are running and healthy.
for container in containers:
if container.name not in REQUIRED_CONTAINERS:
if container.name not in (
REQUIRED_CONTAINERS + CONTAINERS_TO_CHECK_IF_PRESENT
):
# Ignores things like "datahub-frontend" which are no longer used.
# This way, we only check required containers like "datahub-frontend-react"
# even if there are some old containers lying around.


@@ -7,7 +7,8 @@ import click
from pydantic import ValidationError
import datahub as datahub_package
from datahub.check.check_cli import check
from datahub.cli.check_cli import check
from datahub.cli.docker import docker
from datahub.configuration.config_loader import load_config_file
from datahub.ingestion.run.pipeline import Pipeline
@@ -25,7 +26,13 @@ BASE_LOGGING_FORMAT = (
logging.basicConfig(format=BASE_LOGGING_FORMAT)
@click.group()
@click.group(
context_settings=dict(
# Avoid truncation of help text.
# See https://github.com/pallets/click/issues/486.
max_content_width=120,
)
)
@click.option("--debug/--no-debug", default=False)
@click.version_option(
version=datahub_package.nice_version_name(),
@@ -45,7 +52,7 @@ def datahub(debug: bool) -> None:
@datahub.command()
def version() -> None:
"""Print version number and exit"""
"""Print version number and exit."""
click.echo(f"DataHub CLI version: {datahub_package.nice_version_name()}")
click.echo(f"Python version: {sys.version}")
@@ -55,11 +62,11 @@ def version() -> None:
"-c",
"--config",
type=click.Path(exists=True, dir_okay=False),
help="Config file in .toml or .yaml format",
help="Config file in .toml or .yaml format.",
required=True,
)
def ingest(config: str) -> None:
"""Main command for ingesting metadata into DataHub"""
"""Ingest metadata into DataHub."""
config_file = pathlib.Path(config)
pipeline_config = load_config_file(config_file)
@@ -77,12 +84,16 @@ def ingest(config: str) -> None:
datahub.add_command(check)
datahub.add_command(docker)
def main(**kwargs):
# This wrapper prevents click from suppressing errors.
try:
sys.exit(datahub(standalone_mode=False, **kwargs))
except click.exceptions.Abort:
# Click already automatically prints an abort message, so we can just exit.
sys.exit(1)
except click.ClickException as error:
error.show()
sys.exit(1)


@@ -84,10 +84,7 @@ class Registry(Generic[T]):
# If it's not an exception, then it's a registered type.
return tp
def summary(self, verbose=True):
col_width = 15
verbose_col_width = 20
def summary(self, verbose=True, col_width=15, verbose_col_width=20):
lines = []
for key in sorted(self._mapping.keys()):
line = f"{key}"


@@ -1,26 +1,26 @@
#!/bin/bash
set -euxo pipefail
# Runs a basic e2e test. It is not meant to be fully comprehensive,
# but rather should catch obvious bugs before they make it into prod.
#
# Script assumptions:
# - The gradle build has already been run.
# - Python 3.6+ is installed.
# - The metadata-ingestion codegen script has been run.
# - A full DataHub setup is running on localhost with standard ports.
# The easiest way to do this is by using the quickstart or dev
# quickstart scripts.
# - Python 3.6+ is installed and in the PATH.
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
cd "$DIR"
set -euxo pipefail
python3 -m venv venv
source venv/bin/activate
pip install --upgrade pip wheel setuptools
pip install -r requirements.txt
(cd ../metadata-ingestion && ./scripts/codegen.sh)
datahub docker quickstart \
--build-locally \
--quickstart-compose-file ../docker/docker-compose.yml \
--quickstart-compose-file ../docker/docker-compose.override.yml \
--quickstart-compose-file ../docker/docker-compose.dev.yml \
--dump-logs-on-failure
pytest -vv


@@ -1,9 +1,9 @@
import time
import pytest
import requests
from datahub.cli.docker import check_local_docker_containers
from datahub.ingestion.run.pipeline import Pipeline
from datahub.check.docker import check_local_docker_containers
GMS_ENDPOINT = "http://localhost:8080"
FRONTEND_ENDPOINT = "http://localhost:9002"
@@ -15,26 +15,13 @@ restli_default_headers = {
"X-RestLi-Protocol-Version": "2.0.0",
}
kafka_post_ingestion_wait_sec = 60
healthcheck_wait_retries = 20
healthcheck_wait_interval_sec = 15
@pytest.fixture(scope="session")
def wait_for_healthchecks():
tries = 0
while tries < healthcheck_wait_retries:
if tries > 0:
time.sleep(healthcheck_wait_interval_sec)
tries += 1
issues = check_local_docker_containers()
if not issues:
print(f"finished waiting for healthchecks after {tries} tries")
yield
return
issues_str = '\n'.join(f"- {issue}" for issue in issues)
raise RuntimeError(f"retry limit exceeded while waiting for docker healthchecks\n{issues_str}")
# Simply assert that everything is healthy, but don't wait.
assert not check_local_docker_containers()
yield
@pytest.mark.dependency()