fix(cli): correct handling of env variables (#5203)

This commit is contained in:
Aseem Bansal 2022-06-20 20:53:47 +05:30 committed by GitHub
parent 0ee2569d5c
commit d518b5a085
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 104 additions and 16 deletions

View File

@ -129,7 +129,10 @@ datahub check plugins
## Environment variables supported
The environment variables take precedence over what is in the DataHub CLI config created through the `init` command. The supported environment variables are as follows:
- `DATAHUB_SKIP_CONFIG` (default `false`) - Set to `true` to skip creating the configuration file.
- `DATAHUB_GMS_HOST` (default `http://localhost:8080`) - Set to a URL of GMS instance.
- `DATAHUB_GMS_URL` (default `http://localhost:8080`) - Set to the URL of the GMS instance.
- `DATAHUB_GMS_HOST` (default `localhost`) - Set to a host of GMS instance. Prefer using `DATAHUB_GMS_URL` to set the URL.
- `DATAHUB_GMS_PORT` (default `8080`) - Set to a port of GMS instance. Prefer using `DATAHUB_GMS_URL` to set the URL.
- `DATAHUB_GMS_PROTOCOL` (default `http`) - Set to a protocol like `http` or `https`. Prefer using `DATAHUB_GMS_URL` to set the URL.
- `DATAHUB_GMS_TOKEN` (default `None`) - Used for communicating with DataHub Cloud.
- `DATAHUB_TELEMETRY_ENABLED` (default `true`) - Set to `false` to disable telemetry. If CLI is being run in an environment with no access to public internet then this should be disabled.
- `DATAHUB_TELEMETRY_TIMEOUT` (default `10`) - Set to a custom integer value to specify timeout in secs when sending telemetry.
@ -139,7 +142,7 @@ The env variables take precedence over what is in the DataHub CLI config created
```shell
DATAHUB_SKIP_CONFIG=false
DATAHUB_GMS_HOST=http://localhost:8080
DATAHUB_GMS_URL=http://localhost:8080
DATAHUB_GMS_TOKEN=
DATAHUB_TELEMETRY_ENABLED=true
DATAHUB_TELEMETRY_TIMEOUT=10

View File

@ -54,7 +54,7 @@ Make sure yaml plugin is installed for your editor:
:::
Since `acryl-datahub` version `>=0.8.33.2`, the default sink is assumed to be a DataHub REST endpoint:
- Hosted at "http://localhost:8080" or the environment variable `${DATAHUB_GMS_HOST}` if present
- Hosted at "http://localhost:8080" or the environment variable `${DATAHUB_GMS_URL}` if present
- With an empty auth token or the environment variable `${DATAHUB_GMS_TOKEN}` if present.
Here's a simple recipe that pulls metadata from MSSQL (source) and puts it into the default sink (datahub rest).
@ -79,7 +79,7 @@ datahub ingest -c recipe.dhub.yaml
or if you want to override the default endpoints, you can provide the environment variables as part of the command like below:
```shell
DATAHUB_GMS_HOST="https://my-datahub-server:8080" DATAHUB_GMS_TOKEN="my-datahub-token" datahub ingest -c recipe.dhub.yaml
DATAHUB_GMS_URL="https://my-datahub-server:8080" DATAHUB_GMS_TOKEN="my-datahub-token" datahub ingest -c recipe.dhub.yaml
```
A number of recipes are included in the [examples/recipes](./examples/recipes) directory. For full info and context on each source and sink, see the pages described in the [table of plugins](../docs/cli.md#installing-plugins).

View File

@ -64,7 +64,12 @@ CONDENSED_DATAHUB_CONFIG_PATH = "~/.datahubenv"
DATAHUB_CONFIG_PATH = os.path.expanduser(CONDENSED_DATAHUB_CONFIG_PATH)
ENV_SKIP_CONFIG = "DATAHUB_SKIP_CONFIG"
ENV_METADATA_HOST_URL = "DATAHUB_GMS_URL"
ENV_METADATA_HOST = "DATAHUB_GMS_HOST"
ENV_METADATA_PORT = "DATAHUB_GMS_PORT"
ENV_METADATA_PROTOCOL = "DATAHUB_GMS_PROTOCOL"
ENV_METADATA_HOST_DEPRECATED = "GMS_HOST"
ENV_METADATA_PORT_DEPRECATED = "GMS_PORT"
ENV_METADATA_TOKEN = "DATAHUB_GMS_TOKEN"
config_override: Dict = {}
@ -79,9 +84,9 @@ class DatahubConfig(BaseModel):
gms: GmsConfig
def set_env_variables_override_config(host: str, token: Optional[str]) -> None:
def set_env_variables_override_config(url: str, token: Optional[str]) -> None:
"""Should be used to override the config when using rest emitter"""
config_override[ENV_METADATA_HOST] = host
config_override[ENV_METADATA_HOST_URL] = url
if token is not None:
config_override[ENV_METADATA_TOKEN] = token
@ -135,7 +140,25 @@ def get_details_from_config():
def get_details_from_env() -> Tuple[Optional[str], Optional[str]]:
    """Resolve the GMS URL and access token from environment variables.

    Resolution order for the URL:

    1. If a port variable is set (``ENV_METADATA_PORT`` or its deprecated
       alias), the URL is assembled as ``<protocol>://<host>:<port>``. The
       protocol defaults to ``http`` and the host to ``localhost`` when the
       corresponding variables are missing or empty.
    2. Otherwise ``ENV_METADATA_HOST_URL`` is used when set.
    3. Otherwise the host variable is returned as-is, because older setups
       stored a full URL in it; a warning is logged in that case.

    Returns:
        A ``(url, token)`` tuple; either element is ``None`` when it cannot
        be determined from the environment.
    """
    host = os.environ.get(ENV_METADATA_HOST) or os.environ.get(
        ENV_METADATA_HOST_DEPRECATED
    )
    port = os.environ.get(ENV_METADATA_PORT) or os.environ.get(
        ENV_METADATA_PORT_DEPRECATED
    )
    token = os.environ.get(ENV_METADATA_TOKEN)
    protocol = os.environ.get(ENV_METADATA_PROTOCOL, "http")
    url = os.environ.get(ENV_METADATA_HOST_URL)

    if port is not None:
        # Without this fallback a port-only configuration would be formatted
        # as "http://None:<port>", which is never a usable endpoint.
        return f"{protocol}://{host or 'localhost'}:{port}", token

    # The reason for using host as URL is backward compatibility
    # If port is not being used we assume someone is using host env var as URL
    if url is None and host is not None:
        log.warning(
            f"Do not use {ENV_METADATA_HOST} as URL. Use {ENV_METADATA_HOST_URL} instead"
        )
    return url or host, token
def first_non_null(ls: List[Optional[str]]) -> Optional[str]:
@ -147,10 +170,10 @@ def guess_entity_type(urn: str) -> str:
return urn.split(":")[2]
def get_host_and_token():
def get_url_and_token():
gms_host_env, gms_token_env = get_details_from_env()
if len(config_override.keys()) > 0:
gms_host = config_override.get(ENV_METADATA_HOST)
gms_host = config_override.get(ENV_METADATA_HOST_URL)
gms_token = config_override.get(ENV_METADATA_TOKEN)
elif should_skip_config():
gms_host = gms_host_env
@ -164,17 +187,17 @@ def get_host_and_token():
def get_token():
    """Return the token element of the pair produced by get_url_and_token()."""
    _url, token = get_url_and_token()
    return token
def get_session_and_host():
session = requests.Session()
gms_host, gms_token = get_host_and_token()
gms_host, gms_token = get_url_and_token()
if gms_host is None or gms_host.strip() == "":
log.error(
f"GMS Host is not set. Use datahub init command or set {ENV_METADATA_HOST} env var"
f"GMS Host is not set. Use datahub init command or set {ENV_METADATA_HOST_URL} env var"
)
return None, None

View File

@ -7,6 +7,7 @@ from typing import Any, Dict, Iterable, List, Optional
import click
from pydantic import root_validator, validator
from datahub.cli.cli_utils import get_url_and_token
from datahub.configuration import config_loader
from datahub.configuration.common import (
ConfigModel,
@ -67,11 +68,12 @@ class PipelineConfig(ConfigModel):
@root_validator(pre=True)
def default_sink_is_datahub_rest(cls, values: Dict[str, Any]) -> Any:
if "sink" not in values:
gms_host, gms_token = get_url_and_token()
default_sink_config = {
"type": "datahub-rest",
"config": {
"server": "${DATAHUB_GMS_HOST:-http://localhost:8080}",
"token": "${DATAHUB_GMS_TOKEN:-}",
"server": gms_host,
"token": gms_token,
},
}
# resolve env variables if present

View File

@ -104,7 +104,7 @@ def retrieve_versions( # noqa: C901
if not server:
try:
# let's get the server from the cli config
host, token = cli_utils.get_host_and_token()
host, token = cli_utils.get_url_and_token()
server = DataHubGraph(DatahubClientConfig(server=host, token=token))
except Exception as e:
log.debug("Failed to get a valid server", e)

View File

@ -1,3 +1,6 @@
import os
from unittest import mock
from datahub.cli import cli_utils
@ -9,3 +12,60 @@ def test_first_non_null():
assert cli_utils.first_non_null(["3", "1", "2"]) == "3"
assert cli_utils.first_non_null(["", "1", "2"]) == "1"
assert cli_utils.first_non_null([" ", "1", "2"]) == "1"
@mock.patch.dict(
    os.environ, {"DATAHUB_GMS_HOST": "http://localhost:9092"}, clear=True
)
def test_correct_url_when_gms_host_in_old_format():
    """A full URL stored in DATAHUB_GMS_HOST (legacy usage) is returned as-is."""
    # clear=True keeps variables from the developer/CI environment (token,
    # port, deprecated GMS_* names) from changing the resolved tuple.
    assert cli_utils.get_details_from_env() == ("http://localhost:9092", None)
@mock.patch.dict(
    os.environ,
    {"DATAHUB_GMS_HOST": "localhost", "DATAHUB_GMS_PORT": "8080"},
    clear=True,
)
def test_correct_url_when_gms_host_and_port_set():
    """Host and port are combined into a URL using the default http protocol."""
    # clear=True isolates the test from ambient DATAHUB_GMS_* variables.
    assert cli_utils.get_details_from_env() == ("http://localhost:8080", None)
@mock.patch.dict(
    os.environ,
    {
        "DATAHUB_GMS_URL": "https://example.com",
        "DATAHUB_GMS_HOST": "localhost",
        "DATAHUB_GMS_PORT": "8080",
    },
    clear=True,
)
def test_correct_url_when_gms_host_port_url_set():
    """When a port is set, host and port take precedence over DATAHUB_GMS_URL."""
    # clear=True isolates the test from ambient variables such as
    # DATAHUB_GMS_PROTOCOL or DATAHUB_GMS_TOKEN.
    assert cli_utils.get_details_from_env() == ("http://localhost:8080", None)
@mock.patch.dict(
    os.environ,
    {
        "DATAHUB_GMS_URL": "https://example.com",
        "DATAHUB_GMS_HOST": "localhost",
        "DATAHUB_GMS_PORT": "8080",
        "DATAHUB_GMS_PROTOCOL": "https",
    },
    clear=True,
)
def test_correct_url_when_gms_host_port_url_protocol_set():
    """DATAHUB_GMS_PROTOCOL selects the scheme of the host/port URL."""
    # clear=True isolates the test from an ambient DATAHUB_GMS_TOKEN.
    assert cli_utils.get_details_from_env() == ("https://localhost:8080", None)
@mock.patch.dict(
    os.environ,
    {
        "DATAHUB_GMS_URL": "https://example.com",
    },
    clear=True,
)
def test_correct_url_when_url_set():
    """DATAHUB_GMS_URL alone is returned unchanged."""
    # clear=True matters here: an ambient port variable would otherwise
    # switch resolution to the host/port branch.
    assert cli_utils.get_details_from_env() == ("https://example.com", None)
@mock.patch.dict(
    os.environ,
    {
        "GMS_HOST": "https://example.com",
    },
    clear=True,
)
def test_correct_url_when_deprecated_host_env_set():
    """The deprecated GMS_HOST variable is still honoured as a URL."""
    # clear=True keeps an ambient DATAHUB_GMS_HOST/URL/PORT from taking
    # precedence over the deprecated variable under test.
    assert cli_utils.get_details_from_env() == ("https://example.com", None)

View File

@ -66,7 +66,7 @@ class TestPipeline(object):
assert pipeline.config.sink.type == "datahub-rest"
assert pipeline.config.sink.config == {
"server": "http://localhost:8080",
"token": "",
"token": None,
}
@freeze_time(FROZEN_TIME)