From dda5963e34f131af24be4e6e7ab3099bcd524c7b Mon Sep 17 00:00:00 2001 From: Milan Bariya <52292922+MilanBariya@users.noreply.github.com> Date: Thu, 13 Apr 2023 19:12:07 +0530 Subject: [PATCH] Data insights with metadata command-line (#11045) * Data insights with metadata command-line * Python Checkstyle * Python Checkstyle --- ingestion/src/metadata/cli/insight.py | 49 ++++++++++++++ ingestion/src/metadata/cmd.py | 11 +++- .../examples/workflows/data_insight.yaml | 20 ++++++ .../metadata/utils/workflow_output_handler.py | 1 + .../openmetadata/data-insight/index.md | 66 +++++++++++++++++++ 5 files changed, 146 insertions(+), 1 deletion(-) create mode 100644 ingestion/src/metadata/cli/insight.py create mode 100644 ingestion/src/metadata/examples/workflows/data_insight.yaml diff --git a/ingestion/src/metadata/cli/insight.py b/ingestion/src/metadata/cli/insight.py new file mode 100644 index 00000000000..3cc2cb30709 --- /dev/null +++ b/ingestion/src/metadata/cli/insight.py @@ -0,0 +1,49 @@ +# Copyright 2021 Collate +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +""" +Data Insigt utility for the metadata CLI +""" +import pathlib +import sys +import traceback + +from metadata.config.common import load_config_file +from metadata.data_insight.api.workflow import DataInsightWorkflow +from metadata.utils.logger import cli_logger +from metadata.utils.workflow_output_handler import WorkflowType, print_init_error + +logger = cli_logger() + + +def run_insight(config_path: str) -> None: + """ + Run the Data Insigt workflow from a config path + to a JSON or YAML file + :param config_path: Path to load JSON config + """ + + config_file = pathlib.Path(config_path) + config_dict = None + try: + config_dict = load_config_file(config_file) + workflow = DataInsightWorkflow.create(config_dict) + logger.debug(f"Using config: {workflow.config}") + except Exception as exc: + logger.debug(traceback.format_exc()) + print_init_error(exc, config_dict, WorkflowType.INSIGHT) + sys.exit(1) + + workflow.execute() + workflow.stop() + workflow.print_status() + ret = workflow.result_status() + sys.exit(ret) diff --git a/ingestion/src/metadata/cmd.py b/ingestion/src/metadata/cmd.py index 34cec218d17..393304dffa9 100644 --- a/ingestion/src/metadata/cmd.py +++ b/ingestion/src/metadata/cmd.py @@ -22,6 +22,7 @@ from metadata.cli.backup import UploadDestinationType, run_backup from metadata.cli.dataquality import run_test from metadata.cli.docker import BACKEND_DATABASES, DockerActions, run_docker from metadata.cli.ingest import run_ingest +from metadata.cli.insight import run_insight from metadata.cli.openmetadata_dag_config_migration import ( run_openmetadata_dag_config_migration, ) @@ -44,6 +45,7 @@ class MetadataCommands(Enum): BACKUP = "backup" RESTORE = "restore" WEBHOOK = "webhook" + INSIGHT = "insight" OPENMETADATA_IMPORTS_MIGRATION = "openmetadata_imports_migration" OPENMETADATA_DAG_CONFIG_MIGRATION = "openmetadata_dag_config_migration" @@ -378,13 +380,18 @@ def get_parser(args=None): help="Simple Webserver to test webhook metadata events", ) ) + 
create_common_config_parser_args( + sub_parser.add_parser( + MetadataCommands.INSIGHT.value, help="Data Insigt Workflow" + ) + ) add_metadata_args(parser) parser.add_argument("--debug", help="Debug Mode", action="store_true") return parser.parse_args(args) -def metadata(args=None): +def metadata(args=None): # pylint: disable=too-many-branches """ This method implements parsing of the arguments passed from CLI """ @@ -400,6 +407,8 @@ def metadata(args=None): if metadata_workflow == MetadataCommands.INGEST.value: run_ingest(config_path=config_file) + if metadata_workflow == MetadataCommands.INSIGHT.value: + run_insight(config_path=config_file) if metadata_workflow == MetadataCommands.PROFILE.value: run_profiler(config_path=config_file) if metadata_workflow == MetadataCommands.TEST.value: diff --git a/ingestion/src/metadata/examples/workflows/data_insight.yaml b/ingestion/src/metadata/examples/workflows/data_insight.yaml new file mode 100644 index 00000000000..363d1db1f2b --- /dev/null +++ b/ingestion/src/metadata/examples/workflows/data_insight.yaml @@ -0,0 +1,20 @@ +source: + type: dataInsight + serviceName: OpenMetadata + sourceConfig: + config: + type: MetadataToElasticSearch +processor: + type: data-insight-processor + config: {} +sink: + type: elasticsearch + config: + es_host: localhost + es_port: 9200 + recreate_indexes: false +workflowConfig: + loggerLevel: DEBUG + openMetadataServerConfig: + hostPort: http://localhost:8585/api + authProvider: no-auth diff --git a/ingestion/src/metadata/utils/workflow_output_handler.py b/ingestion/src/metadata/utils/workflow_output_handler.py index efa67da9fc8..1ad6eb086eb 100644 --- a/ingestion/src/metadata/utils/workflow_output_handler.py +++ b/ingestion/src/metadata/utils/workflow_output_handler.py @@ -74,6 +74,7 @@ class WorkflowType(Enum): TEST = "test" LINEAGE = "lineage" USAGE = "usage" + INSIGHT = "insight" EXAMPLES_WORKFLOW_PATH: Path = Path(__file__).parent / "../examples" / "workflows" diff --git 
a/openmetadata-docs/content/openmetadata/data-insight/index.md b/openmetadata-docs/content/openmetadata/data-insight/index.md index 9a23f7834dc..4b68f174376 100644 --- a/openmetadata-docs/content/openmetadata/data-insight/index.md +++ b/openmetadata-docs/content/openmetadata/data-insight/index.md @@ -276,6 +276,72 @@ with DAG( ) ``` +# Run Data Insights using the metadata CLI + +### 1. Define the YAML Config + +This is a sample config for Data Insights: + +```yaml +source: + type: dataInsight + serviceName: OpenMetadata + sourceConfig: + config: + type: MetadataToElasticSearch +processor: + type: data-insight-processor + config: {} +sink: + type: elasticsearch + config: + es_host: localhost + es_port: 9200 + recreate_indexes: false +workflowConfig: + loggerLevel: DEBUG + openMetadataServerConfig: + hostPort: "<OpenMetadata host and port>" + authProvider: openmetadata + securityConfig: + jwtToken: '{bot_jwt_token}' +``` + +#### Source Configuration - Source Config + +- To send the metadata to OpenMetadata, it needs to be specified as `type: MetadataToElasticSearch`. + + +#### Processor Configuration + +- To send the metadata to OpenMetadata, it needs to be specified as `type: data-insight-processor`. + +#### Workflow Configuration + +The main property here is the `openMetadataServerConfig`, where you can define the host and security provider of your OpenMetadata installation. + +For a simple, local installation using our docker containers, this looks like: + +```yaml +workflowConfig: + openMetadataServerConfig: + hostPort: 'http://localhost:8585/api' + authProvider: openmetadata + securityConfig: + jwtToken: '{bot_jwt_token}' +``` + +We support different security providers. You can find their definitions [here](https://github.com/open-metadata/OpenMetadata/tree/main/openmetadata-spec/src/main/resources/json/schema/security/client). +You can find the different implementations of the ingestion below. + +### 2. Run with the CLI + +First, we will need to save the YAML file. 
Afterward, and with all requirements installed, we can run: + +```bash +metadata insight -c <path-to-yaml> +``` + # Run Elasticsearch Reindex using the Airflow SDK