feat(docs): Updating example files with the new ingestion recipe suffix (#5103)

Tamas Nemeth 2022-06-08 00:52:26 +02:00 committed by GitHub
parent 5c501fba2c
commit 039e98e8cf
46 changed files with 35 additions and 17 deletions

View File

@@ -107,6 +107,7 @@ def get_base() -> Any:
"bootstrap": {
"type": "string",
"description": "Kafka bootstrap URL.",
"default": "localhost:9092"
},
"producer_config": {
"type": "object",
@@ -115,6 +116,7 @@ def get_base() -> Any:
"schema_registry_url": {
"type": "string",
"description": "URL of schema registry being used.",
"default": "http://localhost:8081"
},
"schema_registry_config": {
"type": "object",
@@ -174,7 +176,7 @@ def get_base() -> Any:
],
},
},
"required": ["source", "sink"],
"required": ["source"],
}

View File

@@ -292,6 +292,7 @@ function new_url(original: string, filepath: string): string {
".py",
".ts",
".yml",
".yaml",
".sh",
".env",
".sql",

View File

@@ -42,6 +42,17 @@ The default sink that most of the ingestion systems and guides assume is the `da
A recipe is the main configuration file that puts it all together. It tells our ingestion scripts where to pull data from (source) and where to put it (sink).
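For orientation, here is a minimal sketch of what a recipe might look like; the MySQL source and every connection value below are hypothetical placeholders, not part of this commit:

```yaml
# Hypothetical recipe sketch: pull metadata from a MySQL instance (source)
# and push it to a DataHub instance over REST (sink).
source:
  type: mysql
  config:
    host_port: localhost:3306
    username: datahub_reader
    password: example_password

sink:
  type: datahub-rest
  config:
    server: http://localhost:8080
```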
:::tip
Name your recipe with the **.dhub.yaml** extension, like *myrecipe.dhub.yaml*, to use VS Code or IntelliJ as a recipe editor with autocomplete
and syntax validation.
Make sure the YAML plugin is installed for your editor:
- For VS Code, install [Red Hat's YAML plugin](https://marketplace.visualstudio.com/items?itemName=redhat.vscode-yaml)
- For IntelliJ, install the [official YAML plugin](https://plugins.jetbrains.com/plugin/13126-yaml)
:::
Since `acryl-datahub` version `>=0.8.33.2`, the default sink is assumed to be a DataHub REST endpoint (see the sketch after this list):
- Hosted at "http://localhost:8080" or the environment variable `${DATAHUB_GMS_HOST}` if present
- With an empty auth token or the environment variable `${DATAHUB_GMS_TOKEN}` if present.
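As a hedged illustration, a recipe that relies on these defaults can omit the `sink` section entirely; the Postgres source and its values in this sketch are assumptions, not taken from this commit:

```yaml
# Hypothetical sink-less recipe: with acryl-datahub >=0.8.33.2, emitted metadata
# goes to the default DataHub REST sink (http://localhost:8080, or
# ${DATAHUB_GMS_HOST} / ${DATAHUB_GMS_TOKEN} when those variables are set).
source:
  type: postgres
  config:
    host_port: localhost:5432
    database: example_db
    username: datahub_reader
    password: example_password
```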
@@ -63,12 +74,12 @@ source:
Running this recipe is as simple as:
```shell
datahub ingest -c recipe.yaml
datahub ingest -c recipe.dhub.yaml
```
or, if you want to override the default endpoints, provide the environment variables as part of the command, like below:
```shell
DATAHUB_GMS_HOST="https://my-datahub-server:8080" DATAHUB_GMS_TOKEN="my-datahub-token" datahub ingest -c recipe.yaml
DATAHUB_GMS_HOST="https://my-datahub-server:8080" DATAHUB_GMS_TOKEN="my-datahub-token" datahub ingest -c recipe.dhub.yaml
```
A number of recipes are included in the [examples/recipes](./examples/recipes) directory. For full info and context on each source and sink, see the pages described in the [table of plugins](../docs/cli.md#installing-plugins).
@@ -85,7 +96,7 @@ https://docs.docker.com/compose/compose-file/compose-file-v2/#variable-substitut
```shell
pip install 'acryl-datahub[datahub-rest]' # install the required plugin
datahub ingest -c ./examples/recipes/mssql_to_datahub.yml
datahub ingest -c ./examples/recipes/mssql_to_datahub.dhub.yml
```
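To illustrate the variable substitution referenced above, here is a hypothetical fragment of such a recipe; the field names follow the `mssql` source, and all values are placeholders:

```yaml
# Hypothetical recipe fragment: MSSQL_PASSWORD is resolved from the environment
# at ingestion time, so the credential never has to live in the recipe file.
source:
  type: mssql
  config:
    host_port: localhost:1433
    database: example_db
    username: example_user
    password: ${MSSQL_PASSWORD}
```

Running it as `MSSQL_PASSWORD=secret datahub ingest -c ./examples/recipes/mssql_to_datahub.dhub.yml` supplies the value without writing it to disk.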
The `--dry-run` option of the `ingest` command performs all of the ingestion steps, except writing to the sink. This is useful to validate that the
@@ -93,9 +104,9 @@ ingestion recipe is producing the desired metadata events before ingesting them
```shell
# Dry run
datahub ingest -c ./examples/recipes/example_to_datahub_rest.yml --dry-run
datahub ingest -c ./examples/recipes/example_to_datahub_rest.dhub.yml --dry-run
# Short-form
datahub ingest -c ./examples/recipes/example_to_datahub_rest.yml -n
datahub ingest -c ./examples/recipes/example_to_datahub_rest.dhub.yml -n
```
The `--preview` option of the `ingest` command performs all of the ingestion steps, but limits the processing to only the first 10 workunits produced by the source.
@@ -103,23 +114,23 @@ This option helps with quick end-to-end smoke testing of the ingestion recipe.
```shell
# Preview
datahub ingest -c ./examples/recipes/example_to_datahub_rest.yml --preview
datahub ingest -c ./examples/recipes/example_to_datahub_rest.dhub.yml --preview
# Preview with dry-run
datahub ingest -c ./examples/recipes/example_to_datahub_rest.yml -n --preview
datahub ingest -c ./examples/recipes/example_to_datahub_rest.dhub.yml -n --preview
```
By default, `--preview` creates 10 workunits. If you wish to produce more, use the `--preview-workunits` option:
```shell
# Preview 20 workunits without sending anything to sink
datahub ingest -c ./examples/recipes/example_to_datahub_rest.yml -n --preview --preview-workunits=20
datahub ingest -c ./examples/recipes/example_to_datahub_rest.dhub.yml -n --preview --preview-workunits=20
```
Sometimes, while running the ingestion pipeline, unexpected exceptions may occur. This can cause `stackprinter` to print all variables to the logs, which may lead to credentials being written to logfiles. To prevent this behavior, add the `--suppress-error-logs` option to the `ingest` CLI command. This option is disabled by default; when enabled, it prevents printing all variables to the logs, mitigating the risk of leaking credentials. The `--suppress-error-logs` option is applied when the ingestion pipeline is actually running.
```shell
# Running ingestion with --suppress-error-logs option
datahub ingest -c ./examples/recipes/example_to_datahub_rest.yml --suppress-error-logs
datahub ingest -c ./examples/recipes/example_to_datahub_rest.dhub.yml --suppress-error-logs
```
## Transformations

View File

@@ -195,7 +195,7 @@ Note that a `.` is used to denote nested fields in the YAML recipe.
The options in the consumer config and schema registry config are passed to the Kafka DeserializingConsumer and SchemaRegistryClient respectively.
For a full example with a number of security options, see this [example recipe](../examples/recipes/secured_kafka.yml).
For a full example with a number of security options, see this [example recipe](../examples/recipes/secured_kafka.dhub.yml).
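As a hedged sketch of how such nested options can look in a recipe (the security keys below are standard librdkafka and Schema Registry client options, chosen for illustration rather than taken from this commit):

```yaml
source:
  type: kafka
  config:
    connection:
      bootstrap: broker:9092
      schema_registry_url: http://schema-registry:8081
      # Passed through verbatim to the Kafka DeserializingConsumer.
      consumer_config:
        security.protocol: SASL_SSL
        sasl.mechanism: PLAIN
        sasl.username: ${KAFKA_API_KEY}
        sasl.password: ${KAFKA_API_SECRET}
      # Passed through verbatim to the SchemaRegistryClient.
      schema_registry_config:
        basic.auth.user.info: "${SR_API_KEY}:${SR_API_SECRET}"
```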
## Compatibility

View File

@@ -1,5 +1,5 @@
source:
type: "feast-repository"
type: "feast"
config:
path: "/path/to/repository/"
environment: "PROD"

View File

@@ -1,6 +1,6 @@
# see https://datahubproject.io/docs/generated/ingestion/sources/feast for complete documentation
source:
type: feast
type: feast-legacy
config:
core_url: localhost:6565 # default
env: "PROD" # Optional, default is "PROD"

View File

@@ -7,7 +7,8 @@ source:
# see https://datahubproject.io/docs/metadata-ingestion/sink_docs/datahub for complete documentation
sink:
type: "datahub-kafka"
type: datahub-kafka
config:
connection:
schema_registry_url: http://localhost:8081
bootstrap: localhost:9092

View File

@@ -12,7 +12,8 @@ source:
ca_file: ./ssl_files/server_certfile.pem
provenance_days: 30
process_group_pattern:
deny: "^WIP"
deny:
- "^WIP"
site_url_to_site_name:
https://localhost:9080/nifi/ : site2
https://localhost:9081/nifi/ : site2

View File

@@ -1,6 +1,8 @@
---
source:
type: "okta"
type: okta
config:
sink:
type: "datahub-rest"
config:

View File

@@ -114,7 +114,7 @@ Note that a `.` is used to denote nested fields in the YAML recipe.
The options in the producer config and schema registry config are passed to the Kafka SerializingProducer and SchemaRegistryClient respectively.
For a full example with a number of security options, see this [example recipe](../examples/recipes/secured_kafka.yml).
For a full example with a number of security options, see this [example recipe](../examples/recipes/secured_kafka.dhub.yaml).
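A similarly hedged sink-side sketch (the SSL keys are standard librdkafka producer options, used here only for illustration):

```yaml
sink:
  type: datahub-kafka
  config:
    connection:
      bootstrap: broker:9092
      schema_registry_url: http://schema-registry:8081
      # Passed through verbatim to the Kafka SerializingProducer.
      producer_config:
        security.protocol: SSL
        ssl.ca.location: /path/to/ca.pem
```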
## Questions