feat(elastic-as-graph): defaulting to elastic in quickstart (#2753)

This commit is contained in:
Gabe Lyons 2021-06-24 15:44:03 -07:00 committed by GitHub
parent 82468016ae
commit 62ba937bb7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 62 additions and 10 deletions

View File

@ -34,12 +34,15 @@ The main components are powered by 4 external dependencies:
- Kafka
- Local DB (MySQL, Postgres, MariaDB)
- Search Index (Elasticsearch)
- Graph Index (Supports only Neo4j)
- Graph Index (Supports either Neo4j or Elasticsearch)
The dependencies must be deployed before deploying DataHub. We created a separate
[chart](https://github.com/linkedin/datahub/tree/master/datahub-kubernetes/prerequisites)
for deploying the dependencies with example configuration. They could also be deployed
separately on-prem or leveraged as managed services.
separately on-prem or leveraged as managed services. To remove your dependency on Neo4j,
set Neo4j's `enabled` flag to `false` in the `datahub-kubernetes/prerequisites/values.yaml` file.
Then, override the `graph_service_impl` field in `datahub-kubernetes/datahub/values.yaml` so
that its value is `elasticsearch` instead of `neo4j`.
## Quickstart
Assuming the kubectl context points to the correct Kubernetes cluster, first create Kubernetes secrets that contain the MySQL and Neo4j passwords.
@ -130,5 +133,3 @@ to expose the 9002 port to the public.
| helm uninstall datahub | Remove DataHub |
| helm ls | List of Helm charts |
| helm history | Fetch a release history |

View File

@ -32,7 +32,7 @@ Dependencies:
* [Kafka, Zookeeper, and Schema Registry](kafka-setup)
* [Elasticsearch](elasticsearch-setup)
* [MySQL](mysql)
* [Neo4j](neo4j)
* [(Optional) Neo4j](neo4j)
### Ingesting demo data.

View File

@ -15,4 +15,17 @@ DEFAULT_VERSION=$(echo $TAG_VERSION | sed 's/undefined/head/')
export DATAHUB_VERSION=${DATAHUB_VERSION:-${DEFAULT_VERSION}}
echo "Quickstarting DataHub: version ${DATAHUB_VERSION}"
cd $DIR && docker-compose pull && docker-compose -p datahub up
# Choose the compose file based on whether a Docker volume whose name contains
# "datahub_neo4jdata" is already listed: if so, keep using the default compose
# file (neo4j as graph service); otherwise use the compose file without neo4j
# (elasticsearch as graph service).
# `grep -q` alone is enough: only the exit status is used, so `-c` was redundant.
if docker volume ls | grep -q datahub_neo4jdata
then
  echo "Datahub Neo4j volume found, starting with neo4j as graph service"
  # "$DIR" is quoted so a checkout path containing spaces still works.
  cd "$DIR" && docker-compose pull && docker-compose -p datahub up
else
  echo "No Datahub Neo4j volume found, starting with elasticsearch as graph service"
  cd "$DIR" && \
    docker-compose \
      -f quickstart/docker-compose-without-neo4j.quickstart.yml \
      pull && \
    docker-compose -p datahub \
      -f quickstart/docker-compose-without-neo4j.quickstart.yml \
      up
fi

View File

@ -17,11 +17,21 @@ from datahub.cli.docker_check import (
)
from datahub.ingestion.run.pipeline import Pipeline
# Repo-relative path of the default quickstart compose file.
SIMPLE_QUICKSTART_COMPOSE_FILE = "docker/quickstart/docker-compose.quickstart.yml"
# Repo-relative path of the compose file used when neo4j is kept as the graph
# service; this is the same path as the default quickstart compose file.
NEO4J_AND_ELASTIC_QUICKSTART_COMPOSE_FILE = (
"docker/quickstart/docker-compose.quickstart.yml"
)
# Repo-relative path of the compose file that runs without neo4j.
ELASTIC_QUICKSTART_COMPOSE_FILE = (
"docker/quickstart/docker-compose-without-neo4j.quickstart.yml"
)
# Repo-relative path of the bootstrap MCE JSON file.
BOOTSTRAP_MCES_FILE = "metadata-ingestion/examples/mce_files/bootstrap_mce.json"
# Base URL for raw file downloads from the master branch on GitHub; the URLs
# below are formed by joining this base with the repo-relative paths above.
GITHUB_BASE_URL = "https://raw.githubusercontent.com/linkedin/datahub/master"
GITHUB_QUICKSTART_COMPOSE_URL = f"{GITHUB_BASE_URL}/{SIMPLE_QUICKSTART_COMPOSE_FILE}"
GITHUB_NEO4J_AND_ELASTIC_QUICKSTART_COMPOSE_URL = (
f"{GITHUB_BASE_URL}/{NEO4J_AND_ELASTIC_QUICKSTART_COMPOSE_FILE}"
)
GITHUB_ELASTIC_QUICKSTART_COMPOSE_URL = (
f"{GITHUB_BASE_URL}/{ELASTIC_QUICKSTART_COMPOSE_FILE}"
)
GITHUB_BOOTSTRAP_MCES_URL = f"{GITHUB_BASE_URL}/{BOOTSTRAP_MCES_FILE}"
@ -58,6 +68,30 @@ def check() -> None:
docker_check_impl()
def check_neo4j_volume_exists() -> bool:
    """Report whether a Docker volume matching ``datahub_neo4jdata`` is listed.

    Prints a message telling the user which graph service the quickstart
    will start with, and how to switch to the other one.

    Returns:
        True when the Docker client lists at least one volume for the
        ``datahub_neo4jdata`` name filter. False when no such volume is
        listed, or when the Docker client reported an error (in which case
        a red warning is printed instead of the graph-service message).
    """
    with get_client_with_error() as (client, error):
        if error:
            click.secho(
                "Docker doesn't seem to be running. Did you start it?", fg="red"
            )
            # Previously a bare ``return`` (None); made an explicit False so
            # every path returns a bool. Callers that only test truthiness
            # see no change.
            return False

        if client.volumes.list(filters={"name": "datahub_neo4jdata"}):
            click.echo(
                "Datahub Neo4j volume found, starting with neo4j as graph service.\n"
                "If you want to run using elastic, run `datahub docker nuke` and re-ingest your data.\n"
            )
            return True

        click.echo(
            "No Datahub Neo4j volume found, starting with elasticsearch as graph service.\n"
            "To use neo4j as a graph backend, run \n"
            "`datahub docker quickstart --quickstart-compose-file ./docker/quickstart/docker-compose.quickstart.yml`"
            "\nfrom the root of the datahub repo\n"
        )
        return False
@docker.command()
@click.option(
"--version",
@ -115,7 +149,11 @@ def quickstart(
quickstart_compose_file.append(path)
# Download the quickstart docker-compose file from GitHub.
quickstart_download_response = requests.get(GITHUB_QUICKSTART_COMPOSE_URL)
quickstart_download_response = requests.get(
GITHUB_NEO4J_AND_ELASTIC_QUICKSTART_COMPOSE_URL
if check_neo4j_volume_exists()
else GITHUB_ELASTIC_QUICKSTART_COMPOSE_URL
)
quickstart_download_response.raise_for_status()
tmp_file.write(quickstart_download_response.content)

View File

@ -12,7 +12,6 @@ REQUIRED_CONTAINERS = [
"schema-registry",
"broker",
"mysql",
"neo4j",
"zookeeper",
# These two containers are not necessary - only helpful in debugging.
# "kafka-topics-ui",
@ -33,6 +32,7 @@ CONTAINERS_TO_CHECK_IF_PRESENT = [
# We only add this container in some cases, but if it's present, we
# definitely want to check that it exits properly.
"mysql-setup",
"neo4j",
]
# Docker seems to under-report memory allocated, so we also need a bit of buffer to account for it.