From 68c3e0b8feb98ce2f7af7f632e667e4ba98eda6a Mon Sep 17 00:00:00 2001 From: clueless-bot <82027071+clueless-bot@users.noreply.github.com> Date: Tue, 1 Nov 2022 00:18:05 +0530 Subject: [PATCH] FIX: Add postgres usage and lineage details (#8456) --- .../connectors/database/postgres/airflow.md | 80 +++++++++++++++++ .../connectors/database/postgres/cli.md | 85 +++++++++++++++++++ .../connectors/database/postgres/index.md | 10 +++ openmetadata-docs/content/connectors/index.md | 2 + .../usage/usage-workflow-query-logs.md | 1 + 5 files changed, 178 insertions(+) diff --git a/openmetadata-docs/content/connectors/database/postgres/airflow.md b/openmetadata-docs/content/connectors/database/postgres/airflow.md index 79a5a0cdfe9..0727fb13329 100644 --- a/openmetadata-docs/content/connectors/database/postgres/airflow.md +++ b/openmetadata-docs/content/connectors/database/postgres/airflow.md @@ -10,6 +10,7 @@ In this section, we provide guides and references to use the Postgres connector. Configure and schedule Postgres metadata and profiler workflows from the OpenMetadata UI: - [Requirements](#requirements) - [Metadata Ingestion](#metadata-ingestion) +- [Query Usage and Lineage Ingestion](#query-usage-and-lineage-ingestion) - [Data Profiler](#data-profiler) - [DBT Integration](#dbt-integration) @@ -378,6 +379,85 @@ with DAG( Note that from connector to connector, this recipe will always be the same. By updating the YAML configuration, you will be able to extract metadata from different sources. +## Query Usage and Lineage Ingestion + +To ingest the Query Usage and Lineage information, the `serviceConnection` configuration will remain the same. +However, the `sourceConfig` is now modeled after [this JSON Schema](https://github.com/open-metadata/OpenMetadata/blob/main/openmetadata-spec/src/main/resources/json/schema/metadataIngestion/databaseServiceQueryUsagePipeline.json). + +### 1. 
Define the YAML Config + +This is a sample config for Postgres Usage: + +```yaml +source: + type: postgres + serviceName: local_postgres + serviceConnection: + config: + type: Postgres + username: username + password: password + hostPort: localhost:5432 + # database: database + sourceConfig: + config: + # Number of days to look back + queryLogDuration: 7 + # This is a directory that will be DELETED after the usage runs + stageFileLocation: <path to store the stage file> + # resultLimit: 1000 + # If instead of getting the query logs from the database we want to pass a file with the queries + # queryLogFilePath: path-to-file +processor: + type: query-parser + config: {} +stage: + type: table-usage + config: + filename: /tmp/postgres_usage +bulkSink: + type: metadata-usage + config: + filename: /tmp/postgres_usage +workflowConfig: + # loggerLevel: DEBUG # DEBUG, INFO, WARN or ERROR + openMetadataServerConfig: + hostPort: <OpenMetadata host and port> + authProvider: <OpenMetadata auth provider> +``` + +#### Source Configuration - Service Connection + +You can find all the definitions and types for the `serviceConnection` [here](https://github.com/open-metadata/OpenMetadata/blob/main/openmetadata-spec/src/main/resources/json/schema/entity/services/connections/database/postgresConnection.json). +They are the same as for the metadata ingestion. + +#### Source Configuration - Source Config + +The `sourceConfig` is defined [here](https://github.com/open-metadata/OpenMetadata/blob/main/openmetadata-spec/src/main/resources/json/schema/metadataIngestion/databaseServiceQueryUsagePipeline.json). + +- `queryLogDuration`: Configuration to tune how far back (in days) we want to look in the query logs to process usage data. +- `resultLimit`: Configuration to set the limit for query logs. + +#### Processor, Stage and Bulk Sink + +These sections specify where the staging files will be located. + +Note that the location is a directory that will be cleaned at the end of the ingestion. + +#### Workflow Configuration + +This is the same as for the metadata ingestion. + +### 2. 
Prepare the Ingestion DAG + +There is an extra requirement to run the Usage pipelines. You will need to install: + +```bash +pip3 install --upgrade 'openmetadata-ingestion[postgres]' +``` + +For the usage workflow creation, the Airflow file will look the same as for the metadata ingestion. Updating the YAML configuration will be enough. + ## Data Profiler The Data Profiler workflow will be using the `orm-profiler` processor. diff --git a/openmetadata-docs/content/connectors/database/postgres/cli.md b/openmetadata-docs/content/connectors/database/postgres/cli.md index 21908db9910..6f7a364d415 100644 --- a/openmetadata-docs/content/connectors/database/postgres/cli.md +++ b/openmetadata-docs/content/connectors/database/postgres/cli.md @@ -10,6 +10,7 @@ In this section, we provide guides and references to use the Postgres connector. Configure and schedule Postgres metadata and profiler workflows from the OpenMetadata UI: - [Requirements](#requirements) - [Metadata Ingestion](#metadata-ingestion) +- [Query Usage and Lineage Ingestion](#query-usage-and-lineage-ingestion) - [Data Profiler](#data-profiler) - [DBT Integration](#dbt-integration) @@ -331,6 +332,90 @@ metadata ingest -c Note that from connector to connector, this recipe will always be the same. By updating the YAML configuration, you will be able to extract metadata from different sources. +## Query Usage and Lineage Ingestion + +To ingest the Query Usage and Lineage information, the `serviceConnection` configuration will remain the same. +However, the `sourceConfig` is now modeled after [this JSON Schema](https://github.com/open-metadata/OpenMetadata/blob/main/openmetadata-spec/src/main/resources/json/schema/metadataIngestion/databaseServiceQueryUsagePipeline.json). + +### 1. 
Define the YAML Config + +This is a sample config for Postgres Usage: + +```yaml +source: + type: postgres + serviceName: local_postgres + serviceConnection: + config: + type: Postgres + username: username + password: password + hostPort: localhost:5432 + # database: database + sourceConfig: + config: + # Number of days to look back + queryLogDuration: 7 + # This is a directory that will be DELETED after the usage runs + stageFileLocation: <path to store the stage file> + # resultLimit: 1000 + # If instead of getting the query logs from the database we want to pass a file with the queries + # queryLogFilePath: path-to-file +processor: + type: query-parser + config: {} +stage: + type: table-usage + config: + filename: /tmp/postgres_usage +bulkSink: + type: metadata-usage + config: + filename: /tmp/postgres_usage +workflowConfig: + # loggerLevel: DEBUG # DEBUG, INFO, WARN or ERROR + openMetadataServerConfig: + hostPort: <OpenMetadata host and port> + authProvider: <OpenMetadata auth provider> +``` + +#### Source Configuration - Service Connection + +You can find all the definitions and types for the `serviceConnection` [here](https://github.com/open-metadata/OpenMetadata/blob/main/openmetadata-spec/src/main/resources/json/schema/entity/services/connections/database/postgresConnection.json). +They are the same as for the metadata ingestion. + +#### Source Configuration - Source Config + +The `sourceConfig` is defined [here](https://github.com/open-metadata/OpenMetadata/blob/main/openmetadata-spec/src/main/resources/json/schema/metadataIngestion/databaseServiceQueryUsagePipeline.json). + +- `queryLogDuration`: Configuration to tune how far back (in days) we want to look in the query logs to process usage data. +- `resultLimit`: Configuration to set the limit for query logs. + +#### Processor, Stage and Bulk Sink + +These sections specify where the staging files will be located. + +Note that the location is a directory that will be cleaned at the end of the ingestion. + +#### Workflow Configuration + +This is the same as for the metadata ingestion. + +### 2. 
Run with the CLI + +There is an extra requirement to run the Usage pipelines. You will need to install: + +```bash +pip3 install --upgrade 'openmetadata-ingestion[postgres]' +``` + +After saving the YAML config, we will run the command the same way we did for the metadata ingestion: + +```bash +metadata ingest -c <path-to-yaml> +``` + + ## Data Profiler The Data Profiler workflow will be using the `orm-profiler` processor. diff --git a/openmetadata-docs/content/connectors/database/postgres/index.md b/openmetadata-docs/content/connectors/database/postgres/index.md index c6da3245111..c122e1202ee 100644 --- a/openmetadata-docs/content/connectors/database/postgres/index.md +++ b/openmetadata-docs/content/connectors/database/postgres/index.md @@ -10,6 +10,7 @@ In this section, we provide guides and references to use the PostgreSQL connecto Configure and schedule PostgreSQL metadata and profiler workflows from the OpenMetadata UI: - [Requirements](#requirements) - [Metadata Ingestion](#metadata-ingestion) +- [Query Usage and Lineage Ingestion](#query-usage-and-lineage-ingestion) - [Data Profiler](#data-profiler) - [DBT Integration](#dbt-integration) @@ -228,6 +229,15 @@ caption="Edit and Deploy the Ingestion Pipeline" From the Connection tab, you can also Edit the Service if needed. +## Query Usage and Lineage Ingestion + + + ## Data Profiler