feat(docs): Updating example files with the new ingestion recipe suffix (#5103)

Tamas Nemeth 2022-06-08 00:52:26 +02:00 committed by GitHub
parent 5c501fba2c
commit 039e98e8cf
46 changed files with 35 additions and 17 deletions

View File

@@ -107,6 +107,7 @@ def get_base() -> Any:
             "bootstrap": {
                 "type": "string",
                 "description": "Kafka bootstrap URL.",
+                "default": "localhost:9092"
             },
             "producer_config": {
                 "type": "object",
@@ -115,6 +116,7 @@ def get_base() -> Any:
             "schema_registry_url": {
                 "type": "string",
                 "description": "URL of schema registry being used.",
+                "default": "http://localhost:8081"
             },
             "schema_registry_config": {
                 "type": "object",
@@ -174,7 +176,7 @@ def get_base() -> Any:
             ],
         },
     },
-    "required": ["source", "sink"],
+    "required": ["source"],
 }

View File

@@ -292,6 +292,7 @@ function new_url(original: string, filepath: string): string {
     ".py",
     ".ts",
     ".yml",
+    ".yaml",
     ".sh",
     ".env",
     ".sql",

View File

@@ -42,6 +42,17 @@ The default sink that most of the ingestion systems and guides assume is the `da
 A recipe is the main configuration file that puts it all together. It tells our ingestion scripts where to pull data from (source) and where to put it (sink).
+:::tip
+Name your recipe with the **.dhub.yaml** extension, like *myrecipe.dhub.yaml*, to use VS Code or IntelliJ as a recipe editor with autocomplete
+and syntax validation.
+Make sure the YAML plugin is installed for your editor:
+- For VS Code, install [Red Hat's YAML plugin](https://marketplace.visualstudio.com/items?itemName=redhat.vscode-yaml)
+- For IntelliJ, install the [official YAML plugin](https://plugins.jetbrains.com/plugin/13126-yaml)
+:::
 Since `acryl-datahub` version `>=0.8.33.2`, the default sink is assumed to be a DataHub REST endpoint:
 - Hosted at "http://localhost:8080", or the environment variable `${DATAHUB_GMS_HOST}` if present
 - With an empty auth token, or the environment variable `${DATAHUB_GMS_TOKEN}` if present.
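With those defaults in place, a recipe can omit the `sink` section entirely. A minimal sketch, using an illustrative `mssql` source with made-up connection values (none of this appears in the commit itself):

```yaml
# Sketch only: relies on the default datahub-rest sink described above.
source:
  type: mssql
  config:
    username: user              # illustrative credentials
    password: pass
    database: DemoDatabase
# No sink section: events go to ${DATAHUB_GMS_HOST}, defaulting to http://localhost:8080.
```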
@@ -63,12 +74,12 @@ source:
 Running this recipe is as simple as:
 ```shell
-datahub ingest -c recipe.yaml
+datahub ingest -c recipe.dhub.yaml
 ```
 Or, if you want to override the default endpoints, you can provide the environment variables as part of the command, like below:
 ```shell
-DATAHUB_GMS_HOST="https://my-datahub-server:8080" DATAHUB_GMS_TOKEN="my-datahub-token" datahub ingest -c recipe.yaml
+DATAHUB_GMS_HOST="https://my-datahub-server:8080" DATAHUB_GMS_TOKEN="my-datahub-token" datahub ingest -c recipe.dhub.yaml
 ```
 A number of recipes are included in the [examples/recipes](./examples/recipes) directory. For full info and context on each source and sink, see the pages described in the [table of plugins](../docs/cli.md#installing-plugins).
@@ -85,7 +96,7 @@ https://docs.docker.com/compose/compose-file/compose-file-v2/#variable-substitut
 ```shell
 pip install 'acryl-datahub[datahub-rest]'  # install the required plugin
-datahub ingest -c ./examples/recipes/mssql_to_datahub.yml
+datahub ingest -c ./examples/recipes/mssql_to_datahub.dhub.yml
 ```
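The hunk header above references docker-compose-style variable substitution: recipe values can pull from environment variables. A minimal sketch, again with illustrative `mssql` values (the `MSSQL_PASSWORD` variable name is an assumption for illustration):

```yaml
source:
  type: mssql
  config:
    username: user
    password: "${MSSQL_PASSWORD}"  # resolved from the environment when the recipe is loaded
    database: DemoDatabase
```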
 The `--dry-run` option of the `ingest` command performs all of the ingestion steps, except writing to the sink. This is useful to validate that the
@@ -93,9 +104,9 @@ ingestion recipe is producing the desired metadata events before ingesting them
 ```shell
 # Dry run
-datahub ingest -c ./examples/recipes/example_to_datahub_rest.yml --dry-run
+datahub ingest -c ./examples/recipes/example_to_datahub_rest.dhub.yml --dry-run
 # Short-form
-datahub ingest -c ./examples/recipes/example_to_datahub_rest.yml -n
+datahub ingest -c ./examples/recipes/example_to_datahub_rest.dhub.yml -n
 ```
 The `--preview` option of the `ingest` command performs all of the ingestion steps, but limits the processing to only the first 10 workunits produced by the source.
@@ -103,23 +114,23 @@ This option helps with quick end-to-end smoke testing of the ingestion recipe.
 ```shell
 # Preview
-datahub ingest -c ./examples/recipes/example_to_datahub_rest.yml --preview
+datahub ingest -c ./examples/recipes/example_to_datahub_rest.dhub.yml --preview
 # Preview with dry-run
-datahub ingest -c ./examples/recipes/example_to_datahub_rest.yml -n --preview
+datahub ingest -c ./examples/recipes/example_to_datahub_rest.dhub.yml -n --preview
 ```
 By default, `--preview` creates 10 workunits, but if you wish to produce more you can use the `--preview-workunits` option:
 ```shell
 # Preview 20 workunits without sending anything to the sink
-datahub ingest -c ./examples/recipes/example_to_datahub_rest.yml -n --preview --preview-workunits=20
+datahub ingest -c ./examples/recipes/example_to_datahub_rest.dhub.yml -n --preview --preview-workunits=20
 ```
 Sometimes, while running the ingestion pipeline, unexpected exceptions may occur. This can cause `stackprinter` to print all variables to the logs, which may lead to credentials being written to logfiles. To prevent this in case of unexpected errors, a `--suppress-error-logs` option can be added to the `ingest` CLI command. It is off by default; when enabled, it prevents variables from being printed to the logs, mitigating the risk of writing credentials there. The option takes effect while the ingestion pipeline is actually running.
 ```shell
 # Running ingestion with the --suppress-error-logs option
-datahub ingest -c ./examples/recipes/example_to_datahub_rest.yml --suppress-error-logs
+datahub ingest -c ./examples/recipes/example_to_datahub_rest.dhub.yml --suppress-error-logs
 ```
 ## Transformations

View File

@@ -195,7 +195,7 @@ Note that a `.` is used to denote nested fields in the YAML recipe.
 The options in the consumer config and schema registry config are passed to the Kafka DeserializingConsumer and SchemaRegistryClient respectively.
-For a full example with a number of security options, see this [example recipe](../examples/recipes/secured_kafka.yml).
+For a full example with a number of security options, see this [example recipe](../examples/recipes/secured_kafka.dhub.yml).
 ## Compatibility
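To make that pass-through concrete, a hedged sketch of a secured Kafka source: the broker address, SASL settings, and environment variable names below are illustrative assumptions, not taken from the linked recipe.

```yaml
source:
  type: kafka
  config:
    connection:
      bootstrap: "broker:9092"
      consumer_config:                   # forwarded verbatim to the Kafka DeserializingConsumer
        security.protocol: "SASL_SSL"
        sasl.mechanism: "PLAIN"
        sasl.username: "${KAFKA_KEY_ID}"       # hypothetical environment variables
        sasl.password: "${KAFKA_KEY_SECRET}"
      schema_registry_url: "https://registry:8081"
      schema_registry_config:            # forwarded verbatim to the SchemaRegistryClient
        basic.auth.user.info: "${REGISTRY_KEY_ID}:${REGISTRY_KEY_SECRET}"
```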

View File

@@ -1,5 +1,5 @@
 source:
-  type: "feast-repository"
+  type: "feast"
   config:
     path: "/path/to/repository/"
     environment: "PROD"

View File

@@ -1,6 +1,6 @@
 # see https://datahubproject.io/docs/generated/ingestion/sources/feast for complete documentation
 source:
-  type: feast
+  type: feast-legacy
   config:
     core_url: localhost:6565 # default
     env: "PROD" # Optional, default is "PROD"

View File

@@ -7,7 +7,8 @@ source:
 # see https://datahubproject.io/docs/metadata-ingestion/sink_docs/datahub for complete documentation
 sink:
-  type: "datahub-kafka"
+  type: datahub-kafka
   config:
     connection:
+      schema_registry_url: http://localhost:8081
       bootstrap: localhost:9092

View File

@@ -12,7 +12,8 @@ source:
     ca_file: ./ssl_files/server_certfile.pem
     provenance_days: 30
     process_group_pattern:
-      deny: "^WIP"
+      deny:
+        - "^WIP"
     site_url_to_site_name:
       https://localhost:9080/nifi/ : site2
       https://localhost:9081/nifi/ : site2

View File

@@ -1,6 +1,8 @@
 ---
 source:
-  type: "okta"
+  type: okta
+  config:
 sink:
   type: "datahub-rest"
   config:

View File

@@ -114,7 +114,7 @@ Note that a `.` is used to denote nested fields in the YAML recipe.
 The options in the producer config and schema registry config are passed to the Kafka SerializingProducer and SchemaRegistryClient respectively.
-For a full example with a number of security options, see this [example recipe](../examples/recipes/secured_kafka.yml).
+For a full example with a number of security options, see this [example recipe](../examples/recipes/secured_kafka.dhub.yaml).
 ## Questions
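The sink side mirrors the source-side sketch given earlier: options under `producer_config` flow to the SerializingProducer the same way. Roughly, with the same caveat that the broker address and credentials are illustrative:

```yaml
sink:
  type: datahub-kafka
  config:
    connection:
      bootstrap: "broker:9092"
      producer_config:                 # forwarded verbatim to the Kafka SerializingProducer
        security.protocol: "SASL_SSL"
        sasl.mechanism: "PLAIN"
      schema_registry_url: "https://registry:8081"
      schema_registry_config:          # forwarded verbatim to the SchemaRegistryClient
        basic.auth.user.info: "user:pass"   # illustrative credentials
```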