feat(quickstart): Adding env variables and cli options for customizing mapped ports in quickstart (#5353)

Co-authored-by: Shirshanka Das <shirshanka@apache.org>
This commit is contained in:
Navin Sharma 2022-07-12 11:03:24 +05:30 committed by GitHub
parent 489b5bb5b4
commit f3e5afdba9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 261 additions and 81 deletions

View File

@ -9,7 +9,7 @@ services:
env_file: mysql/env/docker.env
command: --character-set-server=utf8mb4 --collation-server=utf8mb4_bin
ports:
- "3306:3306"
- ${DATAHUB_MAPPED_MYSQL_PORT:-3306}:3306
volumes:
- ./mysql/init.sql:/docker-entrypoint-initdb.d/init.sql
- mysqldata:/var/lib/mysql

View File

@ -12,7 +12,7 @@ services:
hostname: zookeeper
container_name: zookeeper
ports:
- "2181:2181"
- ${DATAHUB_MAPPED_ZK_PORT:-2181}:2181
volumes:
- zkdata:/var/opt/zookeeper
@ -24,8 +24,7 @@ services:
depends_on:
- zookeeper
ports:
- "29092:29092"
- "9092:9092"
- ${DATAHUB_MAPPED_KAFKA_BROKER_PORT:-9092}:9092
volumes:
- broker:/var/lib/kafka/data/
@ -50,7 +49,7 @@ services:
- zookeeper
- broker
ports:
- "8081:8081"
- ${DATAHUB_MAPPED_SCHEMA_REGISTRY_PORT:-8081}:8081
elasticsearch:
image: elasticsearch:7.9.3
@ -58,7 +57,7 @@ services:
container_name: elasticsearch
hostname: elasticsearch
ports:
- "9200:9200"
- ${DATAHUB_MAPPED_ELASTIC_PORT:-9200}:9200
environment:
- discovery.type=single-node
- xpack.security.enabled=false
@ -75,8 +74,8 @@ services:
hostname: neo4j
container_name: neo4j
ports:
- "7474:7474"
- "7687:7687"
- ${DATAHUB_MAPPED_NEO4J_HTTP_PORT:-7474}:7474
- ${DATAHUB_MAPPED_NEO4J_BOLT_PORT:-7687}:7687
volumes:
- neo4jdata:/data
@ -100,7 +99,7 @@ services:
hostname: datahub-gms
container_name: datahub-gms
ports:
- "8080:8080"
- ${DATAHUB_MAPPED_GMS_PORT:-8080}:8080
depends_on:
- elasticsearch-setup
- kafka-setup
@ -116,7 +115,7 @@ services:
hostname: datahub-frontend-react
container_name: datahub-frontend-react
ports:
- "9002:9002"
- ${DATAHUB_MAPPED_FRONTEND_PORT:-9002}:9002
depends_on:
- datahub-gms
volumes:

View File

@ -17,8 +17,7 @@ services:
hostname: broker
image: kymeric/cp-kafka:latest
ports:
- 29092:29092
- 9092:9092
- ${DATAHUB_MAPPED_KAFKA_BROKER_PORT:-9092}:9092
datahub-actions:
depends_on:
- datahub-gms
@ -57,7 +56,7 @@ services:
hostname: datahub-frontend-react
image: linkedin/datahub-frontend-react:${DATAHUB_VERSION:-head}
ports:
- 9002:9002
- ${DATAHUB_MAPPED_FRONTEND_PORT:-9002}:9002
volumes:
- ${HOME}/.datahub/plugins:/etc/datahub/plugins
datahub-gms:
@ -86,7 +85,7 @@ services:
hostname: datahub-gms
image: linkedin/datahub-gms:${DATAHUB_VERSION:-head}
ports:
- 8080:8080
- ${DATAHUB_MAPPED_GMS_PORT:-8080}:8080
volumes:
- ${HOME}/.datahub/plugins:/etc/datahub/plugins
elasticsearch:
@ -106,7 +105,7 @@ services:
image: elasticsearch:7.9.3
mem_limit: 1g
ports:
- 9200:9200
- ${DATAHUB_MAPPED_ELASTIC_PORT:-9200}:9200
volumes:
- esdata:/usr/share/elasticsearch/data
elasticsearch-setup:
@ -140,7 +139,7 @@ services:
hostname: mysql
image: mariadb:10.5.8
ports:
- 3306:3306
- ${DATAHUB_MAPPED_MYSQL_PORT:-3306}:3306
volumes:
- ../mysql/init.sql:/docker-entrypoint-initdb.d/init.sql
- mysqldata:/var/lib/mysql
@ -167,7 +166,7 @@ services:
hostname: schema-registry
image: eugenetea/schema-registry-arm64:latest
ports:
- 8081:8081
- ${DATAHUB_MAPPED_SCHEMA_REGISTRY_PORT:-8081}:8081
zookeeper:
container_name: zookeeper
environment:
@ -176,7 +175,7 @@ services:
hostname: zookeeper
image: kymeric/cp-zookeeper:latest
ports:
- 2181:2181
- ${DATAHUB_MAPPED_ZK_PORT:-2181}:2181
volumes:
- zkdata:/var/opt/zookeeper
version: '2.3'

View File

@ -17,8 +17,7 @@ services:
hostname: broker
image: confluentinc/cp-kafka:5.4.0
ports:
- 29092:29092
- 9092:9092
- ${DATAHUB_MAPPED_KAFKA_BROKER_PORT:-9092}:9092
volumes:
- broker:/var/lib/kafka/data/
datahub-actions:
@ -59,7 +58,7 @@ services:
hostname: datahub-frontend-react
image: linkedin/datahub-frontend-react:${DATAHUB_VERSION:-head}
ports:
- 9002:9002
- ${DATAHUB_MAPPED_FRONTEND_PORT:-9002}:9002
volumes:
- ${HOME}/.datahub/plugins:/etc/datahub/plugins
datahub-gms:
@ -94,7 +93,7 @@ services:
hostname: datahub-gms
image: linkedin/datahub-gms:${DATAHUB_VERSION:-head}
ports:
- 8080:8080
- ${DATAHUB_MAPPED_GMS_PORT:-8080}:8080
volumes:
- ${HOME}/.datahub/plugins/:/etc/datahub/plugins
elasticsearch:
@ -114,7 +113,7 @@ services:
image: elasticsearch:7.9.3
mem_limit: 1g
ports:
- 9200:9200
- ${DATAHUB_MAPPED_ELASTIC_PORT:-9200}:9200
volumes:
- esdata:/usr/share/elasticsearch/data
elasticsearch-setup:
@ -148,7 +147,7 @@ services:
hostname: mysql
image: mysql:5.7
ports:
- 3306:3306
- ${DATAHUB_MAPPED_MYSQL_PORT:-3306}:3306
volumes:
- ../mysql/init.sql:/docker-entrypoint-initdb.d/init.sql
- mysqldata:/var/lib/mysql
@ -173,8 +172,8 @@ services:
hostname: neo4j
image: neo4j:4.0.6
ports:
- 7474:7474
- 7687:7687
- ${DATAHUB_MAPPED_NEO4J_HTTP_PORT:-7474}:7474
- ${DATAHUB_MAPPED_NEO4J_BOLT_PORT:-7687}:7687
volumes:
- neo4jdata:/data
schema-registry:
@ -188,7 +187,7 @@ services:
hostname: schema-registry
image: confluentinc/cp-schema-registry:5.4.0
ports:
- 8081:8081
- ${DATAHUB_MAPPED_SCHEMA_REGISTRY_PORT:-8081}:8081
zookeeper:
container_name: zookeeper
environment:
@ -197,7 +196,7 @@ services:
hostname: zookeeper
image: confluentinc/cp-zookeeper:5.4.0
ports:
- 2181:2181
- ${DATAHUB_MAPPED_ZK_PORT:-2181}:2181
volumes:
- zkdata:/var/opt/zookeeper
version: '2.3'

View File

@ -42,10 +42,6 @@ To deploy a new instance of DataHub, perform the following steps.
at [http://localhost:9002](http://localhost:9002) in your browser. You can sign in using `datahub` as both the
username and password.
If you would like to modify/configure the DataHub installation in some way, please download the [docker-compose.yaml](https://raw.githubusercontent.com/datahub-project/datahub/master/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml) used by the cli tool, modify it as necessary and deploy DataHub by passing the downloaded docker-compose file:
```
datahub docker quickstart --quickstart-compose-file <path to compose file>
```
5. To ingest the sample metadata, run the following CLI command from your terminal
@ -62,48 +58,11 @@ using the `--token <token>` parameter in the command.
That's it! Now feel free to play around with DataHub!
## Next Steps
## Troubleshooting Issues
### Ingest Metadata
To start pushing your company's metadata into DataHub, take a look at the [Metadata Ingestion Framework](../metadata-ingestion/README.md).
### Invite Users
To add users to your deployment to share with your team check out our [Adding Users to DataHub](authentication/guides/add-users.md)
### Enable Authentication
To enable SSO, check out [Configuring OIDC Authentication](authentication/guides/sso/configure-oidc-react.md) or [Configuring JaaS Authentication](authentication/guides/jaas.md).
To enable backend Authentication, check out [authentication in DataHub's backend](authentication/introducing-metadata-service-authentication.md#configuring-metadata-service-authentication).
### Move to Production
We recommend deploying DataHub to production using Kubernetes. We provide helpful [Helm Charts](https://artifacthub.io/packages/helm/datahub/datahub) to help you quickly get up and running. Check out [Deploying DataHub to Kubernetes](./deploy/kubernetes.md) for a step-by-step walkthrough.
## Resetting DataHub
To cleanse DataHub of all of its state (e.g. before ingesting your own), you can use the CLI `nuke` command.
```
datahub docker nuke
```
## Updating DataHub locally
If you have been testing DataHub locally, a new version of DataHub got released and you want to try the new version then you can use below commands.
```
datahub docker nuke --keep-data
datahub docker quickstart
```
This will keep the data that you have ingested so far in DataHub and start a new quickstart with the latest version of DataHub.
## Troubleshooting
### Command not found: datahub
<details><summary>
Command not found: datahub
</summary>
If running the datahub cli produces "command not found" errors inside your terminal, your system may be defaulting to an
older version of Python. Try prefixing your `datahub` commands with `python3 -m`:
@ -119,9 +78,31 @@ if [ -d "$HOME/.local/bin" ] ; then
PATH="$HOME/.local/bin:$PATH"
fi
```
</details>
### Miscellaneous Docker issues
<details>
<summary>
Port Conflicts
</summary>
By default the quickstart deploy will require the following ports to be free on your local machine:
- 3306 for MySQL
- 9200 for Elasticsearch
- 9092 for the Kafka broker
- 8081 for Schema Registry
- 2181 for ZooKeeper
- 9002 for the DataHub Web Application (datahub-frontend)
- 8080 for the DataHub Metadata Service (datahub-gms)
In case the default ports conflict with software you are already running on your machine, you can override these ports by passing additional flags to the `datahub docker quickstart` command.
e.g. To override the MySQL port with 53306 (instead of the default 3306), you can say: `datahub docker quickstart --mysql-port 53306`. Use `datahub docker quickstart --help` to see all the supported options.
</details>
<details>
<summary>
Miscellaneous Docker issues
</summary>
There can be misc issues with Docker, like conflicting containers and dangling volumes, that can often be resolved by
pruning your Docker state with the following command. Note that this command removes all unused containers, networks,
images (both dangling and unreferenced), and optionally, volumes.
@ -129,3 +110,66 @@ images (both dangling and unreferenced), and optionally, volumes.
```
docker system prune
```
</details>
<details>
<summary>
Still stuck?
</summary>
Hop over to our [Slack community](https://slack.datahubproject.io) and ask for help in the [#troubleshoot](https://datahubspace.slack.com/archives/C029A3M079U) channel!
</details>
## Next Steps
### Ingest Metadata
To start pushing your company's metadata into DataHub, take a look at [UI-based Ingestion Guide](./ui-ingestion.md), or to run ingestion using the cli, look at the [Metadata Ingestion Guide](../metadata-ingestion/README.md).
### Invite Users
To add users to your deployment to share with your team check out our [Adding Users to DataHub](authentication/guides/add-users.md)
### Enable Authentication
To enable SSO, check out [Configuring OIDC Authentication](authentication/guides/sso/configure-oidc-react.md) or [Configuring JaaS Authentication](authentication/guides/jaas.md).
To enable backend Authentication, check out [authentication in DataHub's backend](authentication/introducing-metadata-service-authentication.md#configuring-metadata-service-authentication).
### Move to Production
We recommend deploying DataHub to production using Kubernetes. We provide helpful [Helm Charts](https://artifacthub.io/packages/helm/datahub/datahub) to help you quickly get up and running. Check out [Deploying DataHub to Kubernetes](./deploy/kubernetes.md) for a step-by-step walkthrough.
## Other Common Operations
### Stopping DataHub
To stop DataHub's quickstart, you can issue the following command.
```
datahub docker quickstart --stop
```
### Resetting DataHub
To cleanse DataHub of all of its state (e.g. before ingesting your own), you can use the CLI `nuke` command.
```
datahub docker nuke
```
### Upgrading your local DataHub
If you have been testing DataHub locally, a new version of DataHub got released and you want to try the new version then you can just issue the quickstart command again. It will pull down newer images and restart your instance without losing any data.
```
datahub docker quickstart
```
### Customization
If you would like to customize the DataHub installation further, please download the [docker-compose.yaml](https://raw.githubusercontent.com/datahub-project/datahub/master/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml) used by the cli tool, modify it as necessary and deploy DataHub by passing the downloaded docker-compose file:
```
datahub docker quickstart --quickstart-compose-file <path to compose file>
```

View File

@ -68,7 +68,7 @@ kafka_common = {
# At the same time, we use Kafka's AvroSerializer, which internally relies on
# fastavro for serialization. We do not use confluent_kafka[avro], since it
# is incompatible with its own dep on avro-python3.
"confluent_kafka>=1.5.0,<1.9.0",
"confluent_kafka>=1.5.0",
"fastavro>=1.2.0",
}

View File

@ -11,6 +11,7 @@ import time
from typing import List, NoReturn, Optional
import click
import pydantic
import requests
from datahub.cli.docker_check import (
@ -131,6 +132,80 @@ def should_use_neo4j_for_graph_service(graph_service_override: Optional[str]) ->
return False
def _set_environment_variables(
    version: Optional[str],
    mysql_port: Optional[pydantic.PositiveInt],
    zk_port: Optional[pydantic.PositiveInt],
    kafka_broker_port: Optional[pydantic.PositiveInt],
    schema_registry_port: Optional[pydantic.PositiveInt],
    elastic_port: Optional[pydantic.PositiveInt],
) -> None:
    """Export the DATAHUB_* environment variables consumed by the quickstart
    docker-compose files.

    Each argument that is not None is written into ``os.environ`` under the
    corresponding ``DATAHUB_MAPPED_*_PORT`` (or ``DATAHUB_VERSION``) key so
    that docker-compose variable substitution picks it up. Arguments left as
    None are skipped, letting the compose files fall back to their defaults.
    """
    # (env var name, requested value) pairs; only non-None values are exported.
    overrides = (
        ("DATAHUB_VERSION", version),
        ("DATAHUB_MAPPED_MYSQL_PORT", mysql_port),
        ("DATAHUB_MAPPED_ZK_PORT", zk_port),
        ("DATAHUB_MAPPED_KAFKA_BROKER_PORT", kafka_broker_port),
        ("DATAHUB_MAPPED_SCHEMA_REGISTRY_PORT", schema_registry_port),
        ("DATAHUB_MAPPED_ELASTIC_PORT", elastic_port),
    )
    for env_name, value in overrides:
        if value is not None:
            os.environ[env_name] = str(value)
def _get_default_quickstart_compose_file() -> Optional[str]:
home = os.environ["HOME"]
if home:
try:
os.makedirs(f"{home}/.datahub/quickstart", exist_ok=True)
return f"{home}/.datahub/quickstart/docker-compose.yml"
except Exception as e:
logger.debug(
f"Failed to identify a default quickstart compose file due to {e}"
)
return None
def _attempt_stop(quickstart_compose_file: List[pathlib.Path]) -> None:
    """Stop a running quickstart deployment via ``docker-compose stop``.

    Prefers the explicitly supplied compose files; otherwise falls back to
    the default quickstart compose file if one can be determined. When
    neither is available, this is a no-op.
    """
    fallback = _get_default_quickstart_compose_file()
    if quickstart_compose_file:
        compose_files_for_stopping = quickstart_compose_file
    elif fallback:
        compose_files_for_stopping = [pathlib.Path(fallback)]
    else:
        compose_files_for_stopping = None

    if compose_files_for_stopping:
        # docker-compose stop
        base_command: List[str] = ["docker-compose"]
        for path in compose_files_for_stopping:
            base_command.extend(("-f", f"{path}"))
        base_command.extend(("-p", "datahub"))
        try:
            logger.debug(f"Executing {base_command} stop")
            subprocess.run(
                [*base_command, "stop"],
                check=True,
            )
            click.secho("Stopped datahub successfully.", fg="green")
        except subprocess.CalledProcessError:
            click.secho(
                "Error while stopping.",
                fg="red",
            )
        return
@docker.command()
@click.option(
"--version",
@ -166,6 +241,48 @@ def should_use_neo4j_for_graph_service(graph_service_override: Optional[str]) ->
default=None,
help="If set, forces docker-compose to use that graph service implementation",
)
@click.option(
"--mysql-port",
type=pydantic.PositiveInt,
is_flag=False,
default=None,
help="If there is an existing mysql instance running on port 3306, set this to a free port to avoid port conflicts on startup",
)
@click.option(
"--zk-port",
type=pydantic.PositiveInt,
is_flag=False,
default=None,
help="If there is an existing zookeeper instance running on port 2181, set this to a free port to avoid port conflicts on startup",
)
@click.option(
"--kafka-broker-port",
type=pydantic.PositiveInt,
is_flag=False,
default=None,
help="If there is an existing Kafka broker running on port 9092, set this to a free port to avoid port conflicts on startup",
)
@click.option(
"--schema-registry-port",
type=pydantic.PositiveInt,
is_flag=False,
default=None,
help="If there is an existing process running on port 8081, set this to a free port to avoid port conflicts with Kafka schema registry on startup",
)
@click.option(
"--elastic-port",
type=pydantic.PositiveInt,
is_flag=False,
default=None,
help="If there is an existing Elasticsearch instance running on port 9200, set this to a free port to avoid port conflicts on startup",
)
@click.option(
"--stop",
type=bool,
is_flag=True,
default=False,
help="Use this flag to stop the running containers",
)
@upgrade.check_upgrade
@telemetry.with_telemetry
def quickstart(
@ -174,6 +291,12 @@ def quickstart(
quickstart_compose_file: List[pathlib.Path],
dump_logs_on_failure: bool,
graph_service_impl: Optional[str],
mysql_port: Optional[pydantic.PositiveInt],
zk_port: Optional[pydantic.PositiveInt],
kafka_broker_port: Optional[pydantic.PositiveInt],
schema_registry_port: Optional[pydantic.PositiveInt],
elastic_port: Optional[pydantic.PositiveInt],
stop: bool,
) -> None:
"""Start an instance of DataHub locally using docker-compose.
@ -185,7 +308,7 @@ def quickstart(
running_on_m1 = is_m1()
if running_on_m1:
click.echo("Detected M1 machine")
click.secho("Detected M1 machine", fg="yellow")
# Run pre-flight checks.
issues = check_local_docker_containers(preflight_only=True)
@ -195,7 +318,13 @@ def quickstart(
quickstart_compose_file = list(
quickstart_compose_file
) # convert to list from tuple
if not quickstart_compose_file:
default_quickstart_compose_file = _get_default_quickstart_compose_file()
if stop:
_attempt_stop(quickstart_compose_file)
return
elif not quickstart_compose_file:
# download appropriate quickstart file
should_use_neo4j = should_use_neo4j_for_graph_service(graph_service_impl)
if should_use_neo4j and running_on_m1:
click.secho(
@ -210,7 +339,11 @@ def quickstart(
else GITHUB_M1_QUICKSTART_COMPOSE_URL
)
with tempfile.NamedTemporaryFile(suffix=".yml", delete=False) as tmp_file:
with open(
default_quickstart_compose_file, "wb"
) if default_quickstart_compose_file else tempfile.NamedTemporaryFile(
suffix=".yml", delete=False
) as tmp_file:
path = pathlib.Path(tmp_file.name)
quickstart_compose_file.append(path)
click.echo(f"Fetching docker-compose file {github_file} from GitHub")
@ -221,8 +354,14 @@ def quickstart(
logger.debug(f"Copied to {path}")
# set version
if version is not None:
os.environ["DATAHUB_VERSION"] = version
_set_environment_variables(
version=version,
mysql_port=mysql_port,
zk_port=zk_port,
kafka_broker_port=kafka_broker_port,
schema_registry_port=schema_registry_port,
elastic_port=elastic_port,
)
base_command: List[str] = [
"docker-compose",

View File

@ -26,7 +26,7 @@ from datahub.utilities.server_config_util import get_gms_config
logger = logging.getLogger(__name__)
# Configure some loggers.
logging.getLogger("urllib3").setLevel(logging.WARNING)
logging.getLogger("urllib3").setLevel(logging.ERROR)
logging.getLogger("snowflake").setLevel(level=logging.WARNING)
# logging.getLogger("botocore").setLevel(logging.INFO)
# logging.getLogger("google").setLevel(logging.INFO)