Mirror of https://github.com/datahub-project/datahub.git (synced 2025-11-13 17:59:48 +00:00)

feat(ingestion): Adds --dry-run and --preview options to datahub ingest command. (#3584)

parent 806b28e697
commit 3a9ef61147
@@ -96,6 +96,26 @@ pip install 'acryl-datahub[datahub-rest]' # install the required plugin
 datahub ingest -c ./examples/recipes/example_to_datahub_rest.yml
 ```
 
+The `--dry-run` option of the `ingest` command performs all of the ingestion steps, except writing to the sink. This is useful to ensure that the
+ingestion recipe is producing the desired workunits before ingesting them into datahub.
+
+```shell
+# Dry run
+datahub ingest -c ./examples/recipes/example_to_datahub_rest.yml --dry-run
+# Short-form
+datahub ingest -c ./examples/recipes/example_to_datahub_rest.yml -n
+```
+
+The `--preview` option of the `ingest` command performs all of the ingestion steps, but limits the processing to only the first 10 workunits produced by the source.
+This option helps with quick end-to-end smoke testing of the ingestion recipe.
+
+```shell
+# Preview
+datahub ingest -c ./examples/recipes/example_to_datahub_rest.yml --preview
+# Preview with dry-run
+datahub ingest -c ./examples/recipes/example_to_datahub_rest.yml -n --preview
+```
+
 ### Install using Docker
 
 [](https://hub.docker.com/r/linkedin/datahub-ingestion)
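For readers who drive ingestion from Python rather than the CLI, the same switches are exposed on `Pipeline.create` (see the `Pipeline` hunks later in this diff). A minimal sketch, assuming the import path and the recipe contents below, which are illustrative placeholders rather than part of this commit:

```python
# Sketch only: the recipe values are placeholders, not taken from this commit.
from datahub.ingestion.run.pipeline import Pipeline  # assumed module path

recipe = {
    "source": {"type": "file", "config": {"filename": "./bootstrap_mce.json"}},
    "sink": {"type": "datahub-rest", "config": {"server": "http://localhost:8080"}},
}

# dry_run skips all sink writes; preview_mode caps the source at its first 10 workunits.
pipeline = Pipeline.create(recipe, dry_run=True, preview_mode=True)
pipeline.run()
```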
@@ -40,7 +40,22 @@ def ingest() -> None:
     help="Config file in .toml or .yaml format.",
     required=True,
 )
-def run(config: str) -> None:
+@click.option(
+    "-n",
+    "--dry-run",
+    type=bool,
+    is_flag=True,
+    default=False,
+    help="Perform a dry run of the ingestion, essentially skipping writing to sink.",
+)
+@click.option(
+    "--preview",
+    type=bool,
+    is_flag=True,
+    default=False,
+    help="Perform limited ingestion from the source to the sink to get a quick preview.",
+)
+def run(config: str, dry_run: bool, preview: bool) -> None:
     """Ingest metadata into DataHub."""
     logger.debug("DataHub CLI version: %s", datahub_package.nice_version_name())
 
@@ -49,7 +64,7 @@ def run(config: str) -> None:
 
     try:
         logger.debug(f"Using config: {pipeline_config}")
-        pipeline = Pipeline.create(pipeline_config)
+        pipeline = Pipeline.create(pipeline_config, dry_run, preview)
     except ValidationError as e:
         click.echo(e, err=True)
         sys.exit(1)
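The new options use click's `is_flag=True`, so `-n`/`--dry-run` and `--preview` arrive in the command function as plain booleans rather than option values. A small standalone sketch of the same pattern; the command name and echoed output are illustrative only:

```python
import click


@click.command()
@click.option("-n", "--dry-run", type=bool, is_flag=True, default=False)
@click.option("--preview", type=bool, is_flag=True, default=False)
def demo(dry_run: bool, preview: bool) -> None:
    # click maps --dry-run/-n to dry_run=True and --preview to preview=True;
    # omitting a flag leaves its parameter at the default of False.
    click.echo(f"dry_run={dry_run} preview={preview}")


if __name__ == "__main__":
    demo()  # e.g. `python demo.py -n --preview` prints dry_run=True preview=True
```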
@@ -1,4 +1,5 @@
 import datetime
+import itertools
 import logging
 import uuid
 from typing import Any, Dict, Iterable, List, Optional
@@ -93,8 +94,12 @@ class Pipeline:
     sink: Sink
     transformers: List[Transformer]
 
-    def __init__(self, config: PipelineConfig):
+    def __init__(
+        self, config: PipelineConfig, dry_run: bool = False, preview_mode: bool = False
+    ):
         self.config = config
+        self.dry_run = dry_run
+        self.preview_mode = preview_mode
         self.ctx = PipelineContext(
             run_id=self.config.run_id, datahub_api=self.config.datahub_api
         )
@@ -131,24 +136,31 @@ class Pipeline:
         )
 
     @classmethod
-    def create(cls, config_dict: dict) -> "Pipeline":
+    def create(
+        cls, config_dict: dict, dry_run: bool = False, preview_mode: bool = False
+    ) -> "Pipeline":
         config = PipelineConfig.parse_obj(config_dict)
-        return cls(config)
+        return cls(config, dry_run=dry_run, preview_mode=preview_mode)
 
     def run(self) -> None:
         callback = LoggingCallback()
         extractor: Extractor = self.extractor_class()
-        for wu in self.source.get_workunits():
+        for wu in itertools.islice(
+            self.source.get_workunits(), 10 if self.preview_mode else None
+        ):
             # TODO: change extractor interface
             extractor.configure({}, self.ctx)
 
-            self.sink.handle_work_unit_start(wu)
+            if not self.dry_run:
+                self.sink.handle_work_unit_start(wu)
             record_envelopes = extractor.get_records(wu)
             for record_envelope in self.transform(record_envelopes):
-                self.sink.write_record_async(record_envelope, callback)
+                if not self.dry_run:
+                    self.sink.write_record_async(record_envelope, callback)
 
             extractor.close()
-            self.sink.handle_work_unit_end(wu)
+            if not self.dry_run:
+                self.sink.handle_work_unit_end(wu)
         self.source.close()
         self.sink.close()
 
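The preview cap relies on `itertools.islice` accepting `None` as its stop value, which means "no limit"; that is what keeps the non-preview path iterating the full source. A quick illustrative sketch, with a made-up workunit generator standing in for `source.get_workunits()`:

```python
import itertools


def numbered_workunits():
    """Stand-in for source.get_workunits(): an unbounded stream of fake workunits."""
    n = 0
    while True:
        yield f"workunit-{n}"
        n += 1


preview_mode = True
limit = 10 if preview_mode else None  # None => islice imposes no upper bound

# With preview_mode=True this stops after 10 items; with False it would consume the whole stream.
for wu in itertools.islice(numbered_workunits(), limit):
    print(wu)
```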