From 6ec468def72be44139d80a3ada654f79b18b5546 Mon Sep 17 00:00:00 2001
From: Pere Miquel Brull
Date: Fri, 23 Feb 2024 08:53:10 +0100
Subject: [PATCH] Update external docs (#15319)

---
 .../connectors/yaml/workflow-config-def.md    |   8 +
 .../v1.3/connectors/yaml/workflow-config.md   |   5 +
 .../content/v1.3.x/connectors/index.md        |   2 +-
 .../ingestion/{ => external}/airflow.md       |   2 +-
 .../ingestion/{ => external}/credentials.md   |   2 +-
 .../ingestion/{ => external}/gcs-composer.md  |   2 +-
 .../{ => external}/github-actions.md          |   2 +-
 .../deployment/ingestion/external/index.md    | 761 ++++++++++++++++++
 .../ingestion/{ => external}/mwaa.md          |   2 +-
 .../v1.3.x/deployment/ingestion/index.md      | 160 +---
 openmetadata-docs/content/v1.3.x/menu.md      |  24 +-
 11 files changed, 799 insertions(+), 171 deletions(-)
 rename openmetadata-docs/content/v1.3.x/deployment/ingestion/{ => external}/airflow.md (99%)
 rename openmetadata-docs/content/v1.3.x/deployment/ingestion/{ => external}/credentials.md (99%)
 rename openmetadata-docs/content/v1.3.x/deployment/ingestion/{ => external}/gcs-composer.md (99%)
 rename openmetadata-docs/content/v1.3.x/deployment/ingestion/{ => external}/github-actions.md (99%)
 create mode 100644 openmetadata-docs/content/v1.3.x/deployment/ingestion/external/index.md
 rename openmetadata-docs/content/v1.3.x/deployment/ingestion/{ => external}/mwaa.md (99%)

diff --git a/openmetadata-docs/content/partials/v1.3/connectors/yaml/workflow-config-def.md b/openmetadata-docs/content/partials/v1.3/connectors/yaml/workflow-config-def.md
index 97e97599ddd..f0b9c5a814b 100644
--- a/openmetadata-docs/content/partials/v1.3/connectors/yaml/workflow-config-def.md
+++ b/openmetadata-docs/content/partials/v1.3/connectors/yaml/workflow-config-def.md
@@ -17,6 +17,14 @@ To enable JWT Tokens, you will get more details [here](/deployment/security/enab
 
 You can refer to the JWT Troubleshooting section [link](/deployment/security/jwt-troubleshooting) for any issues in
 your JWT configuration.
 
+**Store Service Connection**
+
+If set to `true` (default), we will store the sensitive information either encrypted via the Fernet Key in the database
+or externally, if you have configured any [Secrets Manager](/deployment/secrets-manager).
+
+If set to `false`, the service will be created, but the service connection information will only be used by the Ingestion
+Framework at runtime, and won't be sent to the OpenMetadata server.
+
 **SSL Configuration**
 
 If you have added SSL to the [OpenMetadata server](/deployment/security/enable-ssl), then you will need to handle
diff --git a/openmetadata-docs/content/partials/v1.3/connectors/yaml/workflow-config.md b/openmetadata-docs/content/partials/v1.3/connectors/yaml/workflow-config.md
index c577f415feb..0d9dbb8f0bd 100644
--- a/openmetadata-docs/content/partials/v1.3/connectors/yaml/workflow-config.md
+++ b/openmetadata-docs/content/partials/v1.3/connectors/yaml/workflow-config.md
@@ -6,6 +6,11 @@ workflowConfig:
   authProvider: openmetadata
   securityConfig:
     jwtToken: "{bot_jwt_token}"
+  ## Store the service connection information
+  storeServiceConnection: true # or false
+  ## Secrets Manager Configuration
+  # secretsManagerProvider: aws, azure or noop
+  # secretsManagerLoader: airflow or env
   ## If SSL, fill the following
   # verifySSL: validate # or ignore
   # sslConfig:
diff --git a/openmetadata-docs/content/v1.3.x/connectors/index.md b/openmetadata-docs/content/v1.3.x/connectors/index.md
index cacccd3d9f0..8ee15302f46 100644
--- a/openmetadata-docs/content/v1.3.x/connectors/index.md
+++ b/openmetadata-docs/content/v1.3.x/connectors/index.md
@@ -25,7 +25,7 @@ the following docs to run the Ingestion Framework in any orchestrator externally
 {% tile
   title="External Schedulers"
   description="Get more information about running the Ingestion Framework Externally"
-  link="/deployment/ingestion"
+  link="/deployment/ingestion/external"
 / %}
 {% /tilesContainer %}
diff --git a/openmetadata-docs/content/v1.3.x/deployment/ingestion/airflow.md b/openmetadata-docs/content/v1.3.x/deployment/ingestion/external/airflow.md
similarity index 99%
rename from openmetadata-docs/content/v1.3.x/deployment/ingestion/airflow.md
rename to openmetadata-docs/content/v1.3.x/deployment/ingestion/external/airflow.md
index 2a0a2ef5c5a..3e535a4c0dd 100644
--- a/openmetadata-docs/content/v1.3.x/deployment/ingestion/airflow.md
+++ b/openmetadata-docs/content/v1.3.x/deployment/ingestion/external/airflow.md
@@ -1,6 +1,6 @@
 ---
 title: Run the ingestion from your Airflow
-slug: /deployment/ingestion/airflow
+slug: /deployment/ingestion/external/airflow
 ---
 
 {% partial file="/v1.3/deployment/external-ingestion.md" /%}
diff --git a/openmetadata-docs/content/v1.3.x/deployment/ingestion/credentials.md b/openmetadata-docs/content/v1.3.x/deployment/ingestion/external/credentials.md
similarity index 99%
rename from openmetadata-docs/content/v1.3.x/deployment/ingestion/credentials.md
rename to openmetadata-docs/content/v1.3.x/deployment/ingestion/external/credentials.md
index fde38c79690..afe2c699fd0 100644
--- a/openmetadata-docs/content/v1.3.x/deployment/ingestion/credentials.md
+++ b/openmetadata-docs/content/v1.3.x/deployment/ingestion/external/credentials.md
@@ -1,6 +1,6 @@
 ---
 title: Managing Credentials
-slug: /deployment/ingestion/credentials
+slug: /deployment/ingestion/external/credentials
 ---
 
 # Managing Credentials
diff --git a/openmetadata-docs/content/v1.3.x/deployment/ingestion/gcs-composer.md b/openmetadata-docs/content/v1.3.x/deployment/ingestion/external/gcs-composer.md
similarity index 99%
rename from openmetadata-docs/content/v1.3.x/deployment/ingestion/gcs-composer.md
rename to openmetadata-docs/content/v1.3.x/deployment/ingestion/external/gcs-composer.md
index 313bdc9ab23..3bead257c0a 100644
--- a/openmetadata-docs/content/v1.3.x/deployment/ingestion/gcs-composer.md
+++ b/openmetadata-docs/content/v1.3.x/deployment/ingestion/external/gcs-composer.md
@@ -1,6 +1,6 @@
 ---
 title: Run the ingestion from GCS Composer
-slug: /deployment/ingestion/gcs-composer
+slug: /deployment/ingestion/external/gcs-composer
 ---
 
 {% partial file="/v1.3/deployment/external-ingestion.md" /%}
diff --git a/openmetadata-docs/content/v1.3.x/deployment/ingestion/github-actions.md b/openmetadata-docs/content/v1.3.x/deployment/ingestion/external/github-actions.md
similarity index 99%
rename from openmetadata-docs/content/v1.3.x/deployment/ingestion/github-actions.md
rename to openmetadata-docs/content/v1.3.x/deployment/ingestion/external/github-actions.md
index cf47e21b1a3..cac915c1719 100644
--- a/openmetadata-docs/content/v1.3.x/deployment/ingestion/github-actions.md
+++ b/openmetadata-docs/content/v1.3.x/deployment/ingestion/external/github-actions.md
@@ -1,6 +1,6 @@
 ---
 title: Run the ingestion from GitHub Actions
-slug: /deployment/ingestion/github-actions
+slug: /deployment/ingestion/external/github-actions
 ---
 
 {% partial file="/v1.3/deployment/external-ingestion.md" /%}
diff --git a/openmetadata-docs/content/v1.3.x/deployment/ingestion/external/index.md b/openmetadata-docs/content/v1.3.x/deployment/ingestion/external/index.md
new file mode 100644
index 00000000000..55f399c6cc8
--- /dev/null
+++ b/openmetadata-docs/content/v1.3.x/deployment/ingestion/external/index.md
@@ -0,0 +1,761 @@
+---
+title: Run the Ingestion Framework Externally
+slug: /deployment/ingestion/external
+---
+
+# Ingestion Framework External Deployment
+
+Any tool capable of running Python code can be used to configure the metadata extraction from your sources.
+
+## 1. How does the Ingestion Framework work?
+
+The Ingestion Framework contains all the logic about how to connect to the sources, extract their metadata,
+and send it to the OpenMetadata server. We have built it from scratch with the main idea of making it an independent
+component that can be run from - **literally** - anywhere.
+
+To install it, you just need to get it from [PyPI](https://pypi.org/project/openmetadata-ingestion/).
+
+We will show further examples later, but a piece of code is the best showcase for its simplicity. To run
+a full ingestion process, you just need to execute a single function. For example, if we wanted to run the metadata
+ingestion from within a simple Python script:
+
+```python
+import yaml
+
+from metadata.workflow.metadata import MetadataWorkflow
+from metadata.workflow.workflow_output_handler import print_status
+
+# Specify your YAML configuration
+CONFIG = """
+source:
+  ...
+workflowConfig:
+  openMetadataServerConfig:
+    hostPort: 'http://localhost:8585/api'
+    authProvider: openmetadata
+    securityConfig:
+      jwtToken: ...
+"""
+
+def run():
+    workflow_config = yaml.safe_load(CONFIG)
+    workflow = MetadataWorkflow.create(workflow_config)
+    workflow.execute()
+    workflow.raise_from_status()
+    print_status(workflow)
+    workflow.stop()
+
+
+if __name__ == "__main__":
+    run()
+```
+
+Where this function runs is completely up to you, and you can adapt it to what makes the most sense within your
+organization and engineering context. Below you'll see some examples of different orchestrators you can leverage
+to execute the ingestion process.
+
+## 2. Ingestion Configuration
+
+In the example above, the `Workflow` class got created from a YAML configuration. Any Workflow that you execute (ingestion,
+profiler, lineage,...) will have its own YAML representation.
+
+You can think about this configuration as the recipe you want to execute: where is your source, which pieces do you
+extract, how are they processed, and where are they sent.
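+
+The recipe does not have to live inline in your code, either. As a sketch - `recipe.yaml` and the `OM_JWT`
+environment variable are hypothetical names used only for illustration - you could load the recipe from a file
+and inject the token from the environment:
+
+```python
+import os
+
+import yaml
+
+
+def load_recipe(recipe_path: str) -> dict:
+    # Read the recipe from a file instead of an inline string
+    with open(recipe_path, encoding="utf-8") as recipe_file:
+        workflow_config = yaml.safe_load(recipe_file)
+
+    # Avoid hardcoding secrets in the recipe itself
+    server_config = workflow_config["workflowConfig"]["openMetadataServerConfig"]
+    server_config["securityConfig"]["jwtToken"] = os.environ["OM_JWT"]
+    return workflow_config
+```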
+
+An example YAML config for extracting MySQL metadata looks like this:
+
+```yaml
+source:
+  type: mysql
+  serviceName: mysql
+  serviceConnection:
+    config:
+      type: Mysql
+      username: openmetadata_user
+      authType:
+        password: openmetadata_password
+      hostPort: localhost:3306
+      databaseSchema: openmetadata_db
+  sourceConfig:
+    config:
+      type: DatabaseMetadata
+sink:
+  type: metadata-rest
+  config: {}
+workflowConfig:
+  openMetadataServerConfig:
+    hostPort: 'http://localhost:8585/api'
+    authProvider: openmetadata
+    securityConfig:
+      jwtToken: ...
+```
+
+{% note %}
+You will find examples of every workflow's YAML file on each Connector [page](/connectors).
+{% /note %}
+
+We will now show you how to configure and run every workflow externally, using Snowflake as an example. But
+first, let's digest some information that will be common everywhere: the `workflowConfig`.
+
+### Workflow Config
+
+Here you will define information such as where you are hosting the OpenMetadata server and the JWT token used to authenticate.
+
+{% note noteType="Warning" %}
+
+Review this section carefully to ensure you are properly managing service credentials and other security configurations.
+
+{% /note %}
+
+**Logger Level**
+
+You can specify the `loggerLevel` depending on your needs. If you are trying to troubleshoot an ingestion, running
+with `DEBUG` will give you far more traces for identifying issues.
+
+**JWT Token**
+
+JWT tokens will allow your clients to authenticate against the OpenMetadata server. You can find more details on
+enabling JWT tokens [here](/deployment/security/enable-jwt-tokens).
+
+You can refer to the JWT Troubleshooting section [link](/deployment/security/jwt-troubleshooting) for any issues in
+your JWT configuration.
+
+**Store Service Connection**
+
+If set to `true` (default), we will store the sensitive information either encrypted via the Fernet Key in the database
+or externally, if you have configured any [Secrets Manager](/deployment/secrets-manager).
+
+If set to `false`, the service will be created, but the service connection information will only be used by the Ingestion
+Framework at runtime, and won't be sent to the OpenMetadata server.
+
+**Secrets Manager Configuration**
+
+If you have configured any [Secrets Manager](/deployment/secrets-manager), you need to let the Ingestion Framework know
+how to retrieve the credentials securely.
+
+Follow the [docs](/deployment/secrets-manager) to configure the secret retrieval based on your environment.
+
+**SSL Configuration**
+
+If you have added SSL to the [OpenMetadata server](/deployment/security/enable-ssl), then you will need to handle
+the certificates when running the ingestion too. You can either set `verifySSL` to `ignore`, or keep it as `validate`,
+in which case you must set `sslConfig.certificatePath` to a path, local to where the ingestion runs, that points
+to the server certificate file.
+
+Find more information on how to troubleshoot SSL issues [here](/deployment/security/enable-ssl/ssl-troubleshooting).
+
+```yaml
+workflowConfig:
+  loggerLevel: INFO # DEBUG, INFO, WARNING or ERROR
+  openMetadataServerConfig:
+    hostPort: "http://localhost:8585/api"
+    authProvider: openmetadata
+    securityConfig:
+      jwtToken: "{bot_jwt_token}"
+  ## Store the service connection information
+  # storeServiceConnection: true or false
+  ## Secrets Manager Configuration
+  # secretsManagerProvider: aws, azure or noop
+  # secretsManagerLoader: airflow or env
+  ## If SSL, fill the following
+  # verifySSL: validate # or ignore
+  # sslConfig:
+  #   certificatePath: /local/path/to/certificate
+```
+
+## 3. (Optional) Ingestion Pipeline
+
+Additionally, if you want to see your runs logged in the `Ingestions` tab of the connectors page in the UI as you would
+when running the connectors natively with OpenMetadata, you can add the following configuration to your YAMLs:
+
+```yaml
+source:
+  type: mysql
+  serviceName: mysql
+[...]
+workflowConfig:
+  openMetadataServerConfig:
+    hostPort: 'http://localhost:8585/api'
+    authProvider: openmetadata
+    securityConfig:
+      jwtToken: ...
+ingestionPipelineFQN: <serviceName>.<pipelineName> # E.g., mysql.marketing_metadata
+```
+
+Adding the `ingestionPipelineFQN` - the Ingestion Pipeline Fully Qualified Name - will tell the Ingestion Framework
+to log the executions and update the ingestion status, which will appear in the UI. Note that the action buttons
+will be disabled, since OpenMetadata won't be able to interact with external systems.
+
+## 4. (Optional) Disable the Pipeline Service Client
+
+If you want to run your workflows **ONLY externally**, without relying on OpenMetadata for any workflow management
+or scheduling, you can update the following server configuration:
+
+```yaml
+pipelineServiceClientConfiguration:
+  enabled: ${PIPELINE_SERVICE_CLIENT_ENABLED:-true}
+```
+
+by setting `enabled: false` or setting `PIPELINE_SERVICE_CLIENT_ENABLED=false` as an environment variable.
+
+This will stop certain APIs and monitors related to the Pipeline Service Client (e.g., Airflow) from operating.
+
+## Examples
+
+{% note %}
+
+This is not an exhaustive list, and it will keep growing over time - not because orchestrator X or Y is unsupported,
+but simply because we have not had the time to add it here yet. If you'd like to chip in and help us expand these
+guides and examples, don't hesitate to reach out to us on [Slack](https://slack.open-metadata.org/) or directly open
+a PR in [GitHub](https://github.com/open-metadata/OpenMetadata/tree/main/openmetadata-docs/content).

+{% /note %}
+{% inlineCalloutContainer %}
+  {% inlineCallout
+    color="violet-70"
+    icon="10k"
+    bold="Airflow"
+    href="/deployment/ingestion/external/airflow" %}
+    Run the ingestion process externally from Airflow
+  {% /inlineCallout %}
+  {% inlineCallout
+    color="violet-70"
+    icon="10k"
+    bold="MWAA"
+    href="/deployment/ingestion/external/mwaa" %}
+    Run the ingestion process externally using AWS MWAA
+  {% /inlineCallout %}
+  {% inlineCallout
+    color="violet-70"
+    icon="10k"
+    bold="GCS Composer"
+    href="/deployment/ingestion/external/gcs-composer" %}
+    Run the ingestion process externally from GCS Composer
+  {% /inlineCallout %}
+  {% inlineCallout
+    color="violet-70"
+    icon="10k"
+    bold="GitHub Actions"
+    href="/deployment/ingestion/external/github-actions" %}
+    Run the ingestion process externally from GitHub Actions
+  {% /inlineCallout %}
+{% /inlineCalloutContainer %}
+
+Let's now jump into some examples of how you could create the function to run the different workflows. Note that this
+code can then be executed inside a DAG, a GitHub Action, or a vanilla Python script. It will work in any environment.
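+
+Since every workflow below follows the exact same lifecycle - create, execute, raise on failure, print the status,
+and stop - you can factor the boilerplate into a small helper. This is only a sketch (the `run_workflow` name is
+ours, not part of the package), shown here so the per-workflow examples are easier to read:
+
+```python
+import yaml
+
+from metadata.workflow.workflow_output_handler import print_status
+
+
+def run_workflow(workflow_cls, config_yaml: str) -> None:
+    # The same lifecycle applies to MetadataWorkflow, UsageWorkflow,
+    # ProfilerWorkflow and TestSuiteWorkflow
+    workflow = workflow_cls.create(yaml.safe_load(config_yaml))
+    workflow.execute()
+    workflow.raise_from_status()
+    print_status(workflow)
+    workflow.stop()
+```
+
+With it, running the metadata workflow would be as simple as `run_workflow(MetadataWorkflow, CONFIG)`, using the
+YAML shown in the next section.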
+
+### Metadata Workflow
+
+This is the first workflow you have to configure and run. It will take care of fetching the metadata from your sources,
+be it Database Services, Dashboard Services, Pipelines, etc.
+
+The rest of the workflows (Lineage, Profiler,...) will be executed on top of the metadata already available in the platform.
+
+{% codePreview %}
+
+{% codeInfoContainer %}
+
+{% codeInfo srNumber=1 %}
+**Adding the imports**
+
+The first step is to import the `MetadataWorkflow` class, which will take care of the full ingestion logic. We'll
+also add the import for printing the results at the end.
+{% /codeInfo %}
+
+{% codeInfo srNumber=2 %}
+**Defining the YAML**
+
+Then, we need to pass the YAML configuration. For this simple example we are defining a variable, but you can
+read from a file, parse secrets from your environment, or use any other approach you need. In the end, it's just
+Python code.
+
+{% note %}
+You can find complete YAMLs in each connector's [docs](/connectors), with more information about the available
+configurations.
+{% /note %}
+
+{% /codeInfo %}
+
+{% codeInfo srNumber=3 %}
+**Preparing the Workflow**
+
+Finally, we'll prepare a function that we can execute anywhere.
+
+It will take care of instantiating the workflow, executing it, and giving us the results.
+{% /codeInfo %}
+
+{% /codeInfoContainer %}
+
+{% codeBlock fileName="ingestion.py" %}
+
+```python
+import yaml
+
+```
+
+```python {% srNumber=1 %}
+from metadata.workflow.metadata import MetadataWorkflow
+from metadata.workflow.workflow_output_handler import print_status
+
+```
+
+```python {% srNumber=2 %}
+
+CONFIG = """
+source:
+  type: snowflake
+  serviceName: <serviceName>
+  serviceConnection:
+    config:
+      type: Snowflake
+      ...
+  sourceConfig:
+    config:
+      type: DatabaseMetadata
+      markDeletedTables: true
+      includeTables: true
+      ...
+sink:
+  type: metadata-rest
+  config: {}
+workflowConfig:
+  openMetadataServerConfig:
+    hostPort: "http://localhost:8585/api"
+    authProvider: openmetadata
+    securityConfig:
+      jwtToken: "{bot_jwt_token}"
+"""
+
+```
+
+```python {% srNumber=3 %}
+
+def run():
+    # Parse the YAML string into a dict before creating the workflow
+    workflow_config = yaml.safe_load(CONFIG)
+    workflow = MetadataWorkflow.create(workflow_config)
+    workflow.execute()
+    workflow.raise_from_status()
+    print_status(workflow)
+    workflow.stop()
+```
+
+{% /codeBlock %}
+
+{% /codePreview %}
+
+
+### Lineage Workflow
+
+This workflow will take care of scanning your query history and defining lineage relationships between your tables.
+
+You can find more information about this workflow [here](/connectors/ingestion/lineage).
+
+{% codePreview %}
+
+{% codeInfoContainer %}
+
+{% codeInfo srNumber=1 %}
+**Adding the imports**
+
+The first step is to import the `MetadataWorkflow` class, which will take care of the full ingestion logic. We'll
+also add the import for printing the results at the end.
+
+Note that we are using the same class as in the Metadata Ingestion.
+{% /codeInfo %}
+
+{% codeInfo srNumber=2 %}
+**Defining the YAML**
+
+Then, we need to pass the YAML configuration. For this simple example we are defining a variable, but you can
+read from a file, parse secrets from your environment, or use any other approach you need.
+
+Note how we have not added the `serviceConnection` here. Since the service would have been created during the
+metadata ingestion, we can let the Ingestion Framework dynamically fetch the Service Connection information.
+
+If, however, you are configuring the workflow with `storeServiceConnection: false`, you'll need to explicitly
+define the `serviceConnection`.
+
+{% note %}
+You can find complete YAMLs in each connector's [docs](/connectors), with more information about the available
+configurations.
+{% /note %}
+
+{% /codeInfo %}
+
+{% codeInfo srNumber=3 %}
+**Preparing the Workflow**
+
+Finally, we'll prepare a function that we can execute anywhere.
+
+It will take care of instantiating the workflow, executing it, and giving us the results.
+{% /codeInfo %}
+
+{% /codeInfoContainer %}
+
+{% codeBlock fileName="ingestion.py" %}
+
+```python
+import yaml
+
+```
+
+```python {% srNumber=1 %}
+from metadata.workflow.metadata import MetadataWorkflow
+from metadata.workflow.workflow_output_handler import print_status
+
+```
+
+```python {% srNumber=2 %}
+
+CONFIG = """
+source:
+  type: snowflake-lineage
+  serviceName: <serviceName>
+  sourceConfig:
+    config:
+      type: DatabaseLineage
+      queryLogDuration: 1
+      parsingTimeoutLimit: 300
+      ...
+sink:
+  type: metadata-rest
+  config: {}
+workflowConfig:
+  openMetadataServerConfig:
+    hostPort: "http://localhost:8585/api"
+    authProvider: openmetadata
+    securityConfig:
+      jwtToken: "{bot_jwt_token}"
+"""
+
+```
+
+```python {% srNumber=3 %}
+
+def run():
+    workflow_config = yaml.safe_load(CONFIG)
+    workflow = MetadataWorkflow.create(workflow_config)
+    workflow.execute()
+    workflow.raise_from_status()
+    print_status(workflow)
+    workflow.stop()
+```
+
+{% /codeBlock %}
+
+{% /codePreview %}
+
+
+### Usage Workflow
+
+As with the lineage workflow, we'll scan the query history for any DML statements. The goal is to ingest queries
+into the platform, compute the relevancy of your assets, and identify frequently joined tables.
+
+{% codePreview %}
+
+{% codeInfoContainer %}
+
+{% codeInfo srNumber=1 %}
+**Adding the imports**
+
+The first step is to import the `UsageWorkflow` class, which will take care of the full ingestion logic. We'll
+also add the import for printing the results at the end.
+
+{% /codeInfo %}
+
+{% codeInfo srNumber=2 %}
+**Defining the YAML**
+
+Then, we need to pass the YAML configuration. For this simple example we are defining a variable, but you can
+read from a file, parse secrets from your environment, or use any other approach you need.
+
+Note how we have not added the `serviceConnection` here. Since the service would have been created during the
+metadata ingestion, we can let the Ingestion Framework dynamically fetch the Service Connection information.
+
+If, however, you are configuring the workflow with `storeServiceConnection: false`, you'll need to explicitly
+define the `serviceConnection`.
+
+{% note %}
+You can find complete YAMLs in each connector's [docs](/connectors), with more information about the available
+configurations.
+{% /note %}
+
+{% /codeInfo %}
+
+{% codeInfo srNumber=3 %}
+**Preparing the Workflow**
+
+Finally, we'll prepare a function that we can execute anywhere.
+
+It will take care of instantiating the workflow, executing it, and giving us the results.
+{% /codeInfo %}
+
+{% /codeInfoContainer %}
+
+{% codeBlock fileName="ingestion.py" %}
+
+```python
+import yaml
+
+```
+
+```python {% srNumber=1 %}
+from metadata.workflow.usage import UsageWorkflow
+from metadata.workflow.workflow_output_handler import print_status
+
+```
+
+```python {% srNumber=2 %}
+
+CONFIG = """
+source:
+  type: snowflake-usage
+  serviceName: <serviceName>
+  sourceConfig:
+    config:
+      type: DatabaseUsage
+      queryLogDuration: 1
+      parsingTimeoutLimit: 300
+      ...
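+# Unlike the workflows above, which publish through a single metadata-rest
+# sink, usage is a multi-step pipeline: the query-parser processor parses
+# the query log, the table-usage stage buffers the results into a local
+# file, and the metadata-usage bulkSink reads that same file and pushes
+# everything to the server.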
+processor:
+  type: query-parser
+  config: {}
+stage:
+  type: table-usage
+  config:
+    filename: "/tmp/snowflake_usage"
+bulkSink:
+  type: metadata-usage
+  config:
+    filename: "/tmp/snowflake_usage"
+workflowConfig:
+  openMetadataServerConfig:
+    hostPort: "http://localhost:8585/api"
+    authProvider: openmetadata
+    securityConfig:
+      jwtToken: "{bot_jwt_token}"
+"""
+
+```
+
+```python {% srNumber=3 %}
+
+def run():
+    workflow_config = yaml.safe_load(CONFIG)
+    workflow = UsageWorkflow.create(workflow_config)
+    workflow.execute()
+    workflow.raise_from_status()
+    print_status(workflow)
+    workflow.stop()
+```
+
+{% /codeBlock %}
+
+{% /codePreview %}
+
+### Profiler Workflow
+
+This workflow will execute queries against your database and send the results into OpenMetadata. The goal is to compute
+metrics about your data and give you a high-level view of its shape, together with the sample data.
+
+This is an interesting step to run before creating Data Quality Workflows.
+
+You can find more information about this workflow [here](/connectors/ingestion/workflows/profiler).
+
+{% codePreview %}
+
+{% codeInfoContainer %}
+
+{% codeInfo srNumber=1 %}
+**Adding the imports**
+
+The first step is to import the `ProfilerWorkflow` class, which will take care of the full ingestion logic. We'll
+also add the import for printing the results at the end.
+
+{% /codeInfo %}
+
+{% codeInfo srNumber=2 %}
+**Defining the YAML**
+
+Then, we need to pass the YAML configuration. For this simple example we are defining a variable, but you can
+read from a file, parse secrets from your environment, or use any other approach you need.
+
+Note how we have not added the `serviceConnection` here. Since the service would have been created during the
+metadata ingestion, we can let the Ingestion Framework dynamically fetch the Service Connection information.
+
+If, however, you are configuring the workflow with `storeServiceConnection: false`, you'll need to explicitly
+define the `serviceConnection`.
+
+{% note %}
+You can find complete YAMLs in each connector's [docs](/connectors), with more information about the available
+configurations.
+{% /note %}
+
+{% /codeInfo %}
+
+{% codeInfo srNumber=3 %}
+**Preparing the Workflow**
+
+Finally, we'll prepare a function that we can execute anywhere.
+
+It will take care of instantiating the workflow, executing it, and giving us the results.
+{% /codeInfo %}
+
+{% /codeInfoContainer %}
+
+{% codeBlock fileName="ingestion.py" %}
+
+```python
+import yaml
+
+```
+
+```python {% srNumber=1 %}
+from metadata.workflow.profiler import ProfilerWorkflow
+from metadata.workflow.workflow_output_handler import print_status
+
+```
+
+```python {% srNumber=2 %}
+
+CONFIG = """
+source:
+  type: snowflake
+  serviceName: <serviceName>
+  sourceConfig:
+    config:
+      type: Profiler
+      generateSampleData: true
+      ...
+processor:
+  type: orm-profiler
+  config: {}
+sink:
+  type: metadata-rest
+  config: {}
+workflowConfig:
+  openMetadataServerConfig:
+    hostPort: "http://localhost:8585/api"
+    authProvider: openmetadata
+    securityConfig:
+      jwtToken: "{bot_jwt_token}"
+"""
+
+```
+
+```python {% srNumber=3 %}
+
+def run():
+    workflow_config = yaml.safe_load(CONFIG)
+    workflow = ProfilerWorkflow.create(workflow_config)
+    workflow.execute()
+    workflow.raise_from_status()
+    print_status(workflow)
+    workflow.stop()
+```
+
+{% /codeBlock %}
+
+{% /codePreview %}
+
+
+### Data Quality Workflow
+
+This workflow will execute the Data Quality tests you have configured against your tables and send the results into
+OpenMetadata. The goal is to validate that your data keeps matching the expectations you have defined for it.
+
+You can find more information about this workflow [here](/connectors/ingestion/workflows/data-quality).
+
+{% codePreview %}
+
+{% codeInfoContainer %}
+
+{% codeInfo srNumber=1 %}
+**Adding the imports**
+
+The first step is to import the `TestSuiteWorkflow` class, which will take care of the full ingestion logic. We'll
+also add the import for printing the results at the end.
+
+{% /codeInfo %}
+
+{% codeInfo srNumber=2 %}
+**Defining the YAML**
+
+Then, we need to pass the YAML configuration. For this simple example we are defining a variable, but you can
+read from a file, parse secrets from your environment, or use any other approach you need.
+
+Note how we have not added the `serviceConnection` here. Since the service would have been created during the
+metadata ingestion, we can let the Ingestion Framework dynamically fetch the Service Connection information.
+
+If, however, you are configuring the workflow with `storeServiceConnection: false`, you'll need to explicitly
+define the `serviceConnection`.
+
+Moreover, see how we are not configuring any tests in the `processor`. You can do [that](/connectors/ingestion/workflows/data-quality#full-yaml-config-example),
+but even if nothing gets defined in the YAML, we will execute all the tests configured against the table.
+
+{% note %}
+You can find complete YAMLs in each connector's [docs](/connectors), with more information about the available
+configurations.
+{% /note %}
+
+{% /codeInfo %}
+
+{% codeInfo srNumber=3 %}
+**Preparing the Workflow**
+
+Finally, we'll prepare a function that we can execute anywhere.
+
+It will take care of instantiating the workflow, executing it, and giving us the results.
+{% /codeInfo %}
+
+{% /codeInfoContainer %}
+
+{% codeBlock fileName="ingestion.py" %}
+
+```python
+import yaml
+
+```
+
+```python {% srNumber=1 %}
+from metadata.workflow.data_quality import TestSuiteWorkflow
+from metadata.workflow.workflow_output_handler import print_status
+
+```
+
+```python {% srNumber=2 %}
+
+CONFIG = """
+source:
+  type: TestSuite
+  serviceName: <serviceName>
+  sourceConfig:
+    config:
+      type: TestSuite
+      entityFullyQualifiedName: <entityFqn>
+processor:
+  type: orm-test-runner
+  config: {}
+sink:
+  type: metadata-rest
+  config: {}
+workflowConfig:
+  openMetadataServerConfig:
+    hostPort: "http://localhost:8585/api"
+    authProvider: openmetadata
+    securityConfig:
+      jwtToken: "{bot_jwt_token}"
+"""
+
+```
+
+```python {% srNumber=3 %}
+
+def run():
+    workflow_config = yaml.safe_load(CONFIG)
+    workflow = TestSuiteWorkflow.create(workflow_config)
+    workflow.execute()
+    workflow.raise_from_status()
+    print_status(workflow)
+    workflow.stop()
+```
+
+{% /codeBlock %}
+
+{% /codePreview %}
diff --git a/openmetadata-docs/content/v1.3.x/deployment/ingestion/mwaa.md b/openmetadata-docs/content/v1.3.x/deployment/ingestion/external/mwaa.md
similarity index 99%
rename from openmetadata-docs/content/v1.3.x/deployment/ingestion/mwaa.md
rename to openmetadata-docs/content/v1.3.x/deployment/ingestion/external/mwaa.md
index 6c74f33ba57..be6562f0c57 100644
--- a/openmetadata-docs/content/v1.3.x/deployment/ingestion/mwaa.md
+++ b/openmetadata-docs/content/v1.3.x/deployment/ingestion/external/mwaa.md
@@ -1,6 +1,6 @@
 ---
 title: Run the ingestion from AWS MWAA
-slug: /deployment/ingestion/mwaa
+slug: /deployment/ingestion/external/mwaa
 ---
 
 {% partial file="/v1.3/deployment/external-ingestion.md" /%}
diff --git a/openmetadata-docs/content/v1.3.x/deployment/ingestion/index.md b/openmetadata-docs/content/v1.3.x/deployment/ingestion/index.md
index 6b1bd093e51..c93977597bf 100644
--- a/openmetadata-docs/content/v1.3.x/deployment/ingestion/index.md
+++ b/openmetadata-docs/content/v1.3.x/deployment/ingestion/index.md
@@ -19,7 +19,7 @@ Note that the end result is going to be the same. The only difference is that ru
 OpenMetadata will dynamically generate the processes that will perform the metadata extraction. If configuring
 the ingestion externally, you will be managing this processes directly on your platform of choice.
 
-### Option 1 - From OpenMetadata
+## Option 1 - From OpenMetadata
 
 If you want to learn how to configure your setup to run them from OpenMetadata, follow this guide:
 
@@ -33,169 +33,19 @@ If you want to learn how to configure your setup to run them from OpenMetadata,
 {% /inlineCallout %}
 {% /inlineCalloutContainer %}
 
-### Option 2 - Externally
+## Option 2 - Externally
 
 Any tool capable of running Python code can be used to configure the metadata extraction from your sources.
 
 In this section, we are going to give you some background on how the Ingestion Framework works, how to configure
 the metadata extraction, and some examples on how to host the ingestion in different platforms.
 
-### 1. How does the Ingestion Framework work?
-
-The Ingestion Framework contains all the logic about how to connect to the sources, extract their metadata
-and send it to the OpenMetadata server. We have built it from scratch with the main idea of making it an independent
-component that can be run from - literally - anywhere.
-
-In order to install it, you just need to get it from [PyPI](https://pypi.org/project/openmetadata-ingestion/).
-
-We will show further examples later, but a piece of code is the best showcase for its simplicity. In order to run
-a full ingestion process, you just need to execute a single function. For example, if we wanted to run the ingestion
-from within a simple YAML script:
-
-```python
-from metadata.workflow.metadata import MetadataWorkflow
-from metadata.workflow.workflow_output_handler import print_status
-
-# Specify your YAML configuration
-CONFIG = """
-source:
-  ...
-workflowConfig:
-  openMetadataServerConfig:
-    hostPort: 'http://localhost:8585/api'
-    authProvider: openmetadata
-    securityConfig:
-      jwtToken: ...
-"""
-
-def run():
-    workflow_config = yaml.safe_load(CONFIG)
-    workflow = MetadataWorkflow.create(workflow_config)
-    workflow.execute()
-    workflow.raise_from_status()
-    print_status(workflow)
-    workflow.stop()
-
-
-if __name__ == "__main__":
-    run()
-```
-
-Where this function runs is completely up to you, and you can adapt it to what makes the most sense within your
-organization and engineering context. Below you'll see some examples of different orchestrators you can leverage
-to execute the ingestion process.
-
-### 2. Ingestion Configuration
-
-In the example above, the `Workflow` class got created from a YAML configuration. Any Workflow that you execute (ingestion,
-profiler, lineage,...) will have its own YAML representation.
-
-You can think about this configuration as the recipe you want to execute: where is your source, which pieces do you
-extract, how are they processed and where are they sent.
-
-An example YAML config for extracting MySQL metadata looks like this:
-
-```yaml
-source:
-  type: mysql
-  serviceName: mysql
-  serviceConnection:
-    config:
-      type: Mysql
-      username: openmetadata_user
-      authType:
-        password: openmetadata_password
-      hostPort: localhost:3306
-      databaseSchema: openmetadata_db
-  sourceConfig:
-    config:
-      type: DatabaseMetadata
-sink:
-  type: metadata-rest
-  config: {}
-workflowConfig:
-  openMetadataServerConfig:
-    hostPort: 'http://localhost:8585/api'
-    authProvider: openmetadata
-    securityConfig:
-      jwtToken: ...
-```
-
-If you need to get the YAML shape of any connector, you can pick it up from its doc [page](/connectors).
-
-Additionally, if you want to see your runs logged in the `Ingestions` tab of the connectors page as you would
-when running the connectors natively with OpenMetadata, you can add the following configuration on your YAMLs:
-
-```yaml
-source:
-  type: mysql
-  serviceName: mysql
-[...]
-workflowConfig:
-  openMetadataServerConfig:
-    hostPort: 'http://localhost:8585/api'
-    authProvider: openmetadata
-    securityConfig:
-      jwtToken: ...
-ingestionPipelineFQN: <serviceName>.<pipelineName> # E.g., mysql.marketing_metadata
-```
-
-Adding the `ingestionPipelineFQN` - the Ingestion Pipeline Fully Qualified Name - will tell the Ingestion Framework
-to log the executions and update the ingestion status, which will appear on the UI. Note that the action buttons
-will be disabled, since OpenMetadata won't be able to interact with external systems.
-
-### 3. (Optional) Disable the Pipeline Service Client
-
-If you want to run your workflows **ONLY externally** without relying on OpenMetadata for any workflow management
-or scheduling, you can update the following server configuration:
-
-```yaml
-pipelineServiceClientConfiguration:
-  enabled: ${PIPELINE_SERVICE_CLIENT_ENABLED:-true}
-```
-
-by setting `enabled: false` or setting the `PIPELINE_SERVICE_CLIENT_ENABLED=false` as an environment variable.
-
-This will stop certain APIs and monitors related to the Pipeline Service Client (e.g., Airflow) from being operative.
-
-### Examples
-
-{% note %}
-
-This is not an exhaustive list, and it will keep growing over time. Not because the orchestrators X or Y are not supported,
-but just because we did not have the time yet to add it here. If you'd like to chip in and help us expand these guides and examples,
-don't hesitate to reach to us in [Slack](https://slack.open-metadata.org/) or directly open a PR in
-[GitHub](https://github.com/open-metadata/OpenMetadata/tree/main/openmetadata-docs/content).
-
-{% /note %}
-
 {% inlineCalloutContainer %}
   {% inlineCallout
     color="violet-70"
     icon="10k"
-    bold="Airflow"
-    href="/deployment/ingestion/airflow" %}
-    Run the ingestion process externally from Airflow
-  {% /inlineCallout %}
-  {% inlineCallout
-    color="violet-70"
-    icon="10k"
-    bold="MWAA"
-    href="/deployment/ingestion/mwaa" %}
-    Run the ingestion process externally using AWS MWAA
-  {% /inlineCallout %}
-  {% inlineCallout
-    color="violet-70"
-    icon="10k"
-    bold="GCS Composer"
-    href="/deployment/ingestion/gcs-composer" %}
-    Run the ingestion process externally from GCS Composer
-  {% /inlineCallout %}
-  {% inlineCallout
-    color="violet-70"
-    icon="10k"
-    bold="GitHub Actions"
-    href="/deployment/ingestion/github-actions" %}
-    Run the ingestion process externally from GitHub Actions
+    bold="External Ingestion"
+    href="/deployment/ingestion/external" %}
+    Manage the Ingestion Framework from anywhere!
   {% /inlineCallout %}
 {% /inlineCalloutContainer %}
diff --git a/openmetadata-docs/content/v1.3.x/menu.md b/openmetadata-docs/content/v1.3.x/menu.md
index 94ce020946e..9686f935d25 100644
--- a/openmetadata-docs/content/v1.3.x/menu.md
+++ b/openmetadata-docs/content/v1.3.x/menu.md
@@ -51,16 +51,18 @@ site_menu:
     url: /deployment/ingestion
   - category: Deployment / Ingestion / OpenMetadata
     url: /deployment/ingestion/openmetadata
-  - category: Deployment / Ingestion / Airflow
-    url: /deployment/ingestion/airflow
-  - category: Deployment / Ingestion / MWAA
-    url: /deployment/ingestion/mwaa
-  - category: Deployment / Ingestion / GCS Composer
-    url: /deployment/ingestion/gcs-composer
-  - category: Deployment / Ingestion / GitHub Actions
-    url: /deployment/ingestion/github-actions
-  - category: Deployment / Ingestion / Credentials
-    url: /deployment/ingestion/credentials
+  - category: Deployment / Ingestion / External
+    url: /deployment/ingestion/external
+  - category: Deployment / Ingestion / External / Airflow
+    url: /deployment/ingestion/external/airflow
+  - category: Deployment / Ingestion / External / MWAA
+    url: /deployment/ingestion/external/mwaa
+  - category: Deployment / Ingestion / External / GCS Composer
+    url: /deployment/ingestion/external/gcs-composer
+  - category: Deployment / Ingestion / External / GitHub Actions
+    url: /deployment/ingestion/external/github-actions
+  - category: Deployment / Ingestion / External / Credentials
+    url: /deployment/ingestion/external/credentials
   - category: Deployment / Enable Security
     url: /deployment/security
@@ -165,6 +167,8 @@ site_menu:
     url: /deployment/secrets-manager/supported-implementations/aws-secrets-manager
   - category: Deployment / Enable Secrets Manager / Supported Implementations / AWS SSM Parameter Store
     url: /deployment/secrets-manager/supported-implementations/aws-ssm-parameter-store
+  - category: Deployment / Enable Secrets Manager / Supported Implementations / Azure Key Vault
+    url: /deployment/secrets-manager/supported-implementations/azure-key-vault
   - category: Deployment / Enable Secrets Manager / How to add a new implementation
     url: /deployment/secrets-manager/how-to-add-a-new-implementation