mirror of https://github.com/datahub-project/datahub.git (synced 2025-11-11 00:42:29 +00:00)

feat(ingest): support environment variables in recipes (#2306)
parent 7afe038a5c · commit f57c954fc6

@@ -156,7 +156,7 @@ source:
   type: mssql
   config:
     username: sa
-    password: test!Password
+    password: ${MSSQL_PASSWORD}
     database: DemoData
 
 sink:
@@ -165,6 +165,10 @@ sink:
   server: "http://localhost:8080"
 ```
 
+We automatically expand environment variables in the config,
+similar to variable substitution in GNU bash or in docker-compose files. For details, see
+https://docs.docker.com/compose/compose-file/compose-file-v2/#variable-substitution.
+
 Running a recipe is quite easy.
 
 ```sh
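
For a concrete feel of the substitution rules, here is a minimal sketch (not part of the commit) using the `expandvars` package that this change adds as a dependency; the three forms shown are exactly the ones exercised by the new test fixtures further down.

```python
import os

from expandvars import expandvars  # added to framework_common in this commit

os.environ["MSSQL_PASSWORD"] = "test!Password"

# $VAR and ${VAR} expand from the environment.
print(expandvars("password: ${MSSQL_PASSWORD}"))  # -> password: test!Password

# ${VAR:-default} falls back to the default when the variable is unset.
print(expandvars("server: ${UNSET:-http://localhost:8080}"))  # -> server: http://localhost:8080

# ${VAR?message} raises expandvars.ParameterNullOrNotSet when unset, as the
# bad_variable_expansion.yml fixture below demonstrates.
```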
@@ -208,19 +212,20 @@ source:
     database: dbname
     host_port: localhost:3306
     table_pattern:
+      deny:
+        # Note that the deny patterns take precedence over the allow patterns.
+        - "performance_schema"
       allow:
         - "schema1.table2"
-      deny:
-        - "performance_schema"
     # Although the 'table_pattern' enables you to skip everything from certain schemas,
     # having another option to allow/deny on schema level is an optimization for the case when there is a large number
     # of schemas that one wants to skip and you want to avoid the time to needlessly fetch those tables only to filter
     # them out afterwards via the table_pattern.
     schema_pattern:
-      allow:
-        - "schema1"
       deny:
         - "garbage_schema"
+      allow:
+        - "schema1"
 ```
 
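To make the precedence comment concrete, here is a minimal sketch (not part of the commit) of deny-over-allow filtering; the function name and regex-matching semantics are illustrative, not DataHub's actual pattern classes.

```python
import re
from typing import List


def is_allowed(name: str, allow: List[str], deny: List[str]) -> bool:
    # Deny patterns take precedence: a name matching any deny pattern is
    # filtered out even if it also matches an allow pattern.
    if any(re.match(pattern, name) for pattern in deny):
        return False
    return any(re.match(pattern, name) for pattern in allow)


# "performance_schema" stays denied even under a broad allow pattern.
assert not is_allowed("performance_schema", allow=[".*"], deny=["performance_schema"])
assert is_allowed("schema1.table2", allow=["schema1.table2"], deny=["performance_schema"])
```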

### Microsoft SQL Server Metadata `mssql`

@@ -239,11 +244,11 @@ source:
     host_port: localhost:1433
     database: DemoDatabase
     table_pattern:
+      deny:
+        - "^.*\\.sys_.*" # deny all tables that start with sys_
       allow:
         - "schema1.table1"
         - "schema1.table2"
-      deny:
-        - "^.*\\.sys_.*" # deny all tables that start with sys_
     options:
       # Any options specified here will be passed to SQLAlchemy's create_engine as kwargs.
       # See https://docs.sqlalchemy.org/en/14/core/engines.html for details.
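
As the comment says, the `options` block is forwarded to SQLAlchemy's `create_engine` as keyword arguments. A sketch of the equivalence (not part of the commit; the URL and option values are illustrative, and the `mssql+pytds` dialect assumes the sqlalchemy-pytds driver is installed):

```python
from sqlalchemy import create_engine

# Roughly what an `options` block like this amounts to:
#   options:
#     echo: true
#     connect_args:
#       timeout: 30
options = {"echo": True, "connect_args": {"timeout": 30}}
engine = create_engine(
    "mssql+pytds://sa:password@localhost:1433/DemoDatabase", **options
)
```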
@@ -20,36 +20,13 @@ mypy_path = src
 plugins =
     sqlmypy,
     pydantic.mypy
+ignore_missing_imports = yes
 namespace_packages = true
 strict_optional = yes
+check_untyped_defs = yes
+# eventually we'd like to enable these
 disallow_untyped_defs = no
 disallow_incomplete_defs = no
-check_untyped_defs = yes
-
-[mypy-confluent_kafka.*]
-ignore_missing_imports = yes
-[mypy-avro.*]
-ignore_missing_imports = yes
-[mypy-ldap.*]
-ignore_missing_imports = yes
-[mypy-sqlalchemy_pytds.*]
-ignore_missing_imports = yes
-[mypy-pyhive]
-ignore_missing_imports = yes
-[mypy-pybigquery]
-ignore_missing_imports = yes
-[mypy-psycopg2]
-ignore_missing_imports = yes
-[mypy-geoalchemy2.*]
-ignore_missing_imports = yes
-[mypy-snowflake.*]
-ignore_missing_imports = yes
-[mypy-pydruid.*]
-ignore_missing_imports = yes
-[mypy-pymongo.*]
-ignore_missing_imports = yes
-[mypy-docker.*]
-ignore_missing_imports = yes
 
 [isort]
 profile = black

@@ -38,6 +38,7 @@ framework_common = {
     "pyyaml>=5.4.1",
     "toml>=0.10.0",
     "docker>=4.4",
+    "expandvars>=0.6.5",
     "avro-gen3==0.3.8",
     "avro-python3>=1.8.2",
 }

datahub/configuration/config_loader.py (new file):

@@ -0,0 +1,34 @@
+import io
+import pathlib
+
+from expandvars import expandvars
+
+from datahub.configuration.common import ConfigurationError, ConfigurationMechanism
+from datahub.configuration.toml import TomlConfigurationMechanism
+from datahub.configuration.yaml import YamlConfigurationMechanism
+
+
+def load_config_file(config_file: pathlib.Path) -> dict:
+    if not config_file.is_file():
+        raise ConfigurationError(f"Cannot open config file {config_file}")
+
+    config_mech: ConfigurationMechanism
+    if config_file.suffix in [".yaml", ".yml"]:
+        config_mech = YamlConfigurationMechanism()
+    elif config_file.suffix == ".toml":
+        config_mech = TomlConfigurationMechanism()
+    else:
+        raise ConfigurationError(
+            "Only .toml and .yml are supported. Cannot process file type {}".format(
+                config_file.suffix
+            )
+        )
+
+    with config_file.open() as raw_config_fp:
+        raw_config_file = raw_config_fp.read()
+
+    expanded_config_file = expandvars(raw_config_file, nounset=True)
+    config_fp = io.StringIO(expanded_config_file)
+    config = config_mech.load_config(config_fp)
+
+    return config
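
A usage sketch for the new loader (not part of the commit; the file name and variable are illustrative, and `recipe.yml` is assumed to exist): the raw recipe text is read, environment variables are expanded with `nounset=True`, and only then is the YAML or TOML parsed.

```python
import os
import pathlib

from datahub.configuration.config_loader import load_config_file

os.environ["MSSQL_PASSWORD"] = "test!Password"

# Assuming recipe.yml contains `password: ${MSSQL_PASSWORD}`, the loaded dict
# holds the expanded value. An unset variable without a ${VAR:-default}
# fallback raises instead, because the loader passes nounset=True.
config = load_config_file(pathlib.Path("recipe.yml"))
```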
@@ -7,9 +7,7 @@ import click
 from pydantic import ValidationError
 
 from datahub.check.check_cli import check
-from datahub.configuration.common import ConfigurationError, ConfigurationMechanism
-from datahub.configuration.toml import TomlConfigurationMechanism
-from datahub.configuration.yaml import YamlConfigurationMechanism
+from datahub.configuration.config_loader import load_config_file
 from datahub.ingestion.run.pipeline import Pipeline
 from datahub.ingestion.sink.sink_registry import sink_registry
 from datahub.ingestion.source.source_registry import source_registry
@@ -54,23 +52,7 @@ def ingest(config: str) -> None:
     """Main command for ingesting metadata into DataHub"""
 
     config_file = pathlib.Path(config)
-    if not config_file.is_file():
-        raise ConfigurationError(f"Cannot open config file {config}")
-
-    config_mech: ConfigurationMechanism
-    if config_file.suffix in [".yaml", ".yml"]:
-        config_mech = YamlConfigurationMechanism()
-    elif config_file.suffix == ".toml":
-        config_mech = TomlConfigurationMechanism()
-    else:
-        raise ConfigurationError(
-            "Only .toml and .yml are supported. Cannot process file type {}".format(
-                config_file.suffix
-            )
-        )
-
-    with config_file.open() as fp:
-        pipeline_config = config_mech.load_config(fp)
+    pipeline_config = load_config_file(config_file)
 
     try:
         logger.info(f"Using config: {pipeline_config}")

metadata-ingestion/tests/unit/config/bad_variable_expansion.yml (new file):

@@ -0,0 +1,2 @@
+stuff:
+  filename: ${THIS_IS_UNSET?with an error message}

metadata-ingestion/tests/unit/config/basic.toml (new file, 5 lines):

@@ -0,0 +1,5 @@
+foo = "bar"
+
+[nested]
+hi = "hello"
+array = [ "one", "two" ]

metadata-ingestion/tests/unit/config/basic.yml (new file, 7 lines):

@@ -0,0 +1,7 @@
+---
+foo: bar
+nested:
+  hi: hello
+  array:
+    - one
+    - two

metadata-ingestion/tests/unit/config/simple_variable_expansion.yml (new file):

@@ -0,0 +1,5 @@
+---
+normal: sa
+working_dir: $VAR1
+path: ${VAR2}
+server: ${UNSET_VAR3:-http://localhost:8080}

metadata-ingestion/tests/unit/config/test_config_loader.py (new file, 75 lines):

@@ -0,0 +1,75 @@
+import os
+from unittest import mock
+
+import expandvars
+import pytest
+
+from datahub.configuration.common import ConfigurationError
+from datahub.configuration.config_loader import load_config_file
+
+
+@pytest.mark.parametrize(
+    "filename,golden_config,env,error_type",
+    [
+        (
+            # Basic YAML load
+            "tests/unit/config/basic.yml",
+            {"foo": "bar", "nested": {"array": ["one", "two"], "hi": "hello"}},
+            {},
+            None,
+        ),
+        (
+            # Basic TOML load
+            "tests/unit/config/basic.toml",
+            {"foo": "bar", "nested": {"array": ["one", "two"], "hi": "hello"}},
+            {},
+            None,
+        ),
+        # Variable expansion load
+        (
+            "tests/unit/config/simple_variable_expansion.yml",
+            {
+                "normal": "sa",
+                "path": "stuff2",
+                "server": "http://localhost:8080",
+                "working_dir": "stuff1",
+            },
+            {
+                "VAR1": "stuff1",
+                "VAR2": "stuff2",
+            },
+            None,
+        ),
+        (
+            # Variable expansion error
+            "tests/unit/config/bad_variable_expansion.yml",
+            None,
+            {},
+            expandvars.ParameterNullOrNotSet,
+        ),
+        (
+            # Missing file
+            "tests/unit/config/this_file_does_not_exist.yml",
+            None,
+            {},
+            ConfigurationError,
+        ),
+        (
+            # Unknown extension
+            "tests/unit/config/bad_extension.whatevenisthis",
+            None,
+            {},
+            ConfigurationError,
+        ),
+    ],
+)
+def test_load(pytestconfig, filename, golden_config, env, error_type):
+    filepath = pytestconfig.rootpath / filename
+
+    with mock.patch.dict(os.environ, env):
+        if error_type:
+            with pytest.raises(error_type):
+                _ = load_config_file(filepath)
+        else:
+            loaded_config = load_config_file(filepath)
+            assert loaded_config == golden_config