diff --git a/ingestion/src/metadata/ingestion/source/database/vertica/metadata.py b/ingestion/src/metadata/ingestion/source/database/vertica/metadata.py index eb8beb65563..b36d6fc8a7d 100644 --- a/ingestion/src/metadata/ingestion/source/database/vertica/metadata.py +++ b/ingestion/src/metadata/ingestion/source/database/vertica/metadata.py @@ -12,7 +12,9 @@ Vertica source implementation. """ import re +import traceback from textwrap import dedent +from typing import Iterable from sqlalchemy import sql, util from sqlalchemy.engine import reflection @@ -20,6 +22,7 @@ from sqlalchemy.sql import sqltypes from sqlalchemy.sql.sqltypes import String from sqlalchemy_vertica.base import VerticaDialect +from metadata.generated.schema.entity.data.database import Database from metadata.generated.schema.entity.services.connections.database.verticaConnection import ( VerticaConnection, ) @@ -34,8 +37,19 @@ from metadata.ingestion.source.database.common_db_source import CommonDbSourceSe from metadata.ingestion.source.database.vertica.queries import ( VERTICA_GET_COLUMNS, VERTICA_GET_PRIMARY_KEYS, + VERTICA_LIST_DATABASES, + VERTICA_TABLE_COMMENTS, VERTICA_VIEW_DEFINITION, ) +from metadata.utils import fqn +from metadata.utils.filters import filter_by_database +from metadata.utils.logger import ingestion_logger +from metadata.utils.sqlalchemy_utils import ( + get_all_table_comments, + get_table_comment_wrapper, +) + +logger = ingestion_logger() class UUID(String): @@ -219,9 +233,24 @@ def get_view_definition( return None +@reflection.cache +def get_table_comment( # delegates to get_table_comment_wrapper with the VERTICA_TABLE_COMMENTS query + self, connection, table_name, schema=None, **kw # pylint: disable=unused-argument +): + return get_table_comment_wrapper( + self, + connection, + table_name=table_name, + schema=schema, + query=VERTICA_TABLE_COMMENTS, + ) + + VerticaDialect.get_columns = get_columns VerticaDialect._get_column_info = _get_column_info # pylint: disable=protected-access VerticaDialect.get_view_definition = get_view_definition 
+VerticaDialect.get_all_table_comments = get_all_table_comments +VerticaDialect.get_table_comment = get_table_comment class VerticaSource(CommonDbSourceService): @@ -239,3 +268,38 @@ class VerticaSource(CommonDbSourceService): f"Expected VerticaConnection, but got {connection}" ) return cls(config, metadata_config) + + def get_database_names(self) -> Iterable[str]: # yields the configured database when one is set; otherwise each database listed by VERTICA_LIST_DATABASES that passes the database filter + configured_db = self.config.serviceConnection.__root__.config.database + if configured_db: + self.set_inspector(database_name=configured_db) + yield configured_db + else: + results = self.connection.execute(VERTICA_LIST_DATABASES) + for res in results: + row = list(res) + new_database = row[0] + database_fqn = fqn.build( + self.metadata, + entity_type=Database, + service_name=self.context.database_service.name.__root__, + database_name=new_database, + ) + + if filter_by_database( + self.source_config.databaseFilterPattern, + database_fqn + if self.source_config.useFqnForFiltering + else new_database, + ): # a truthy result means the database is excluded: record it in the status and skip it + self.status.filter(database_fqn, "Database Filtered Out") + continue + + try: + self.set_inspector(database_name=new_database) + yield new_database + except Exception as exc: # best effort: log the failure and carry on with the next database + logger.debug(traceback.format_exc()) + logger.error( + f"Error trying to connect to database {new_database}: {exc}" + ) diff --git a/ingestion/src/metadata/ingestion/source/database/vertica/queries.py b/ingestion/src/metadata/ingestion/source/database/vertica/queries.py index e244c22017e..fc2e48acdfe 100644 --- a/ingestion/src/metadata/ingestion/source/database/vertica/queries.py +++ b/ingestion/src/metadata/ingestion/source/database/vertica/queries.py @@ -14,15 +14,37 @@ SQL Queries used during ingestion import textwrap +# Column comments in Vertica can only happen on Projections +# https://forum.vertica.com/discussion/238945/vertica-try-to-create-comment +# And Vertica projections follow this naming: +# https://www.vertica.com/docs/9.2.x/HTML/Content/Authoring/AdministratorsGuide/Projections/WorkingWithProjections.htm +# So 
to fetch column comments we need to concatenate the table_name + projection infix + column name. +# Example: querying `v_catalog.comments`, we find an object_name for a column in the table vendor_dimension as +# `vendor_dimension_super.vendor_name`. Note how this is the `_super` projection. +# Then, our join looks for the match in `vendor_dimension_%.vendor_name`. +# Note: This might not cover all column scenarios, but so far we have not found a better way to join +# v_catalog.comments with v_catalog.columns. Also, `_` is itself a single-character LIKE wildcard, so the match is looser than a literal underscore. VERTICA_GET_COLUMNS = textwrap.dedent( """ - SELECT column_name, data_type, column_default, is_nullable, comment - FROM v_catalog.columns col left join v_catalog.comments com on col.table_id=com.object_id - and com.object_type='COLUMN' and col.column_name=com.child_object + SELECT + column_name, + data_type, + column_default, + is_nullable, + comment + FROM v_catalog.columns col + LEFT JOIN v_catalog.comments com + ON com.object_type = 'COLUMN' + AND com.object_name LIKE CONCAT(CONCAT(col.table_name, '_%.'), col.column_name) WHERE lower(table_name) = '{table}' AND {schema_condition} UNION ALL - SELECT column_name, data_type, '' as column_default, true as is_nullable, '' as comment + SELECT + column_name, + data_type, + '' AS column_default, + true AS is_nullable, + '' AS comment FROM v_catalog.view_columns WHERE lower(table_name) = '{table}' AND {schema_condition} @@ -47,3 +69,16 @@ VERTICA_VIEW_DEFINITION = textwrap.dedent( AND {schema_condition} """ ) + +VERTICA_LIST_DATABASES = "SELECT database_name from v_catalog.databases" + +VERTICA_TABLE_COMMENTS = textwrap.dedent( + """ + SELECT + object_schema as schema, + object_name as table_name, + comment as table_comment + FROM v_catalog.comments + WHERE object_type = 'TABLE'; + """ +) diff --git a/ingestion/src/metadata/orm_profiler/orm/functions/median.py b/ingestion/src/metadata/orm_profiler/orm/functions/median.py index 31f23c97369..64bb51adfc3 100644 --- 
a/ingestion/src/metadata/orm_profiler/orm/functions/median.py +++ b/ingestion/src/metadata/orm_profiler/orm/functions/median.py @@ -59,7 +59,7 @@ def _(elements, compiler, **kwargs): def _(elements, compiler, **kwargs): """Median computation for MSSQL""" col = elements.clauses.clauses[0].name - return "percentile_cont(0.5) WITHIN GROUP (ORDER BY %s ASC) OVER()" % col + return "percentile_cont(0.5) WITHIN GROUP (ORDER BY %s ASC) OVER()" % col @compiles(MedianFn, Dialects.Hive) @@ -70,7 +70,7 @@ def _(elements, compiler, **kwargs): @compiles(MedianFn, Dialects.MySQL) -def _(elemenst, compiler, **kwargs): # pylint: disable=unused-argument +def _(elements, compiler, **kwargs): # pylint: disable=unused-argument """Median computation for MySQL currently not supported Needs to be tackled in https://github.com/open-metadata/OpenMetadata/issues/6340 """ @@ -93,3 +93,11 @@ def _(elements, compiler, **kwargs): # pylint: disable=unused-argument """.format( col=col, table=table.value ) + + +@compiles(MedianFn, Dialects.Vertica) # Vertica median: MEDIAN(col) OVER() wrapped in a subquery, with LIMIT 1 so a single row comes back +def _(elements, compiler, **kwargs): # pylint: disable=unused-argument + col, table = list(elements.clauses) # exactly two clauses are expected: the column and the table + return "(SELECT MEDIAN({col}) OVER() FROM {table} LIMIT 1)".format( + col=col, table=table.value + ) diff --git a/ingestion/src/metadata/orm_profiler/orm/functions/modulo.py b/ingestion/src/metadata/orm_profiler/orm/functions/modulo.py index d7a5c6eec28..a2ec498ea86 100644 --- a/ingestion/src/metadata/orm_profiler/orm/functions/modulo.py +++ b/ingestion/src/metadata/orm_profiler/orm/functions/modulo.py @@ -58,6 +58,7 @@ def _(element, compiler, **kw): @compiles(ModuloFn, Dialects.Trino) @compiles(ModuloFn, Dialects.IbmDbSa) @compiles(ModuloFn, Dialects.Db2) +@compiles(ModuloFn, Dialects.Vertica) def _(element, compiler, **kw): """Modulo function for specific dialect""" value, base = validate_and_compile(element, compiler, **kw) diff --git a/ingestion/src/metadata/orm_profiler/orm/functions/random_num.py 
b/ingestion/src/metadata/orm_profiler/orm/functions/random_num.py index 011909b0e6a..f85b89e80f9 100644 --- a/ingestion/src/metadata/orm_profiler/orm/functions/random_num.py +++ b/ingestion/src/metadata/orm_profiler/orm/functions/random_num.py @@ -101,3 +101,12 @@ def _(*_, **__): from the already sampled results when executing row::MOD(0, 100) < profile_sample. """ return "0" + + +@compiles(RandomNumFn, Dialects.Vertica) +def _(*_, **__): + """ + Vertica RANDOM() returns a float n with 0 <= n < 1. + We scale it by 100 and cast it to integer to perform the modulo + """ + return "(RANDOM() * 100)::INTEGER" diff --git a/openmetadata-docs/content/connectors/database/vertica/airflow.md b/openmetadata-docs/content/connectors/database/vertica/airflow.md index 0e138013829..076fb433317 100644 --- a/openmetadata-docs/content/connectors/database/vertica/airflow.md +++ b/openmetadata-docs/content/connectors/database/vertica/airflow.md @@ -22,6 +22,17 @@ To deploy OpenMetadata, check the Deployment guides. To run the Ingestion via the UI you'll need to use the OpenMetadata Ingestion Container, which comes shipped with custom Airflow plugins to handle the workflow deployment. +### Permissions + +To run the ingestion we need a user with `SELECT` grants on the schemas that you'd like to ingest, as well as to the +`V_CATALOG` schema. 
You can grant those as follows for the schemas in your database: + +```sql +CREATE USER openmetadata IDENTIFIED BY 'password'; +GRANT SELECT ON ALL TABLES IN SCHEMA PUBLIC TO openmetadata; +GRANT SELECT ON ALL TABLES IN SCHEMA V_CATALOG TO openmetadata; +``` + ### Python Requirements To run the Vertica ingestion, you will need to install: diff --git a/openmetadata-docs/content/connectors/database/vertica/cli.md b/openmetadata-docs/content/connectors/database/vertica/cli.md index 5bd16319f0e..f99f6e32cfd 100644 --- a/openmetadata-docs/content/connectors/database/vertica/cli.md +++ b/openmetadata-docs/content/connectors/database/vertica/cli.md @@ -22,6 +22,17 @@ To deploy OpenMetadata, check the Deployment guides. To run the Ingestion via the UI you'll need to use the OpenMetadata Ingestion Container, which comes shipped with custom Airflow plugins to handle the workflow deployment. +### Permissions + +To run the ingestion we need a user with `SELECT` grants on the schemas that you'd like to ingest, as well as to the +`V_CATALOG` schema. You can grant those as follows for the schemas in your database: + +```sql +CREATE USER openmetadata IDENTIFIED BY 'password'; +GRANT SELECT ON ALL TABLES IN SCHEMA PUBLIC TO openmetadata; +GRANT SELECT ON ALL TABLES IN SCHEMA V_CATALOG TO openmetadata; +``` + ### Python Requirements To run the Vertica ingestion, you will need to install: diff --git a/openmetadata-docs/content/connectors/database/vertica/index.md b/openmetadata-docs/content/connectors/database/vertica/index.md index 7145db9c001..b97fb852468 100644 --- a/openmetadata-docs/content/connectors/database/vertica/index.md +++ b/openmetadata-docs/content/connectors/database/vertica/index.md @@ -43,6 +43,17 @@ To deploy OpenMetadata, check the Deployment guides. To run the Ingestion via the UI you'll need to use the OpenMetadata Ingestion Container, which comes shipped with custom Airflow plugins to handle the workflow deployment. 
+### Permissions + +To run the ingestion we need a user with `SELECT` grants on the schemas that you'd like to ingest, as well as to the +`V_CATALOG` schema. You can grant those as follows for the schemas in your database: + +```sql +CREATE USER openmetadata IDENTIFIED BY 'password'; +GRANT SELECT ON ALL TABLES IN SCHEMA PUBLIC TO openmetadata; +GRANT SELECT ON ALL TABLES IN SCHEMA V_CATALOG TO openmetadata; +``` + ## Metadata Ingestion ### 1. Visit the Services Page diff --git a/openmetadata-docs/content/connectors/database/vertica/troubleshooting.md b/openmetadata-docs/content/connectors/database/vertica/troubleshooting.md new file mode 100644 index 00000000000..b40e171441e --- /dev/null +++ b/openmetadata-docs/content/connectors/database/vertica/troubleshooting.md @@ -0,0 +1,16 @@ +--- +title: Vertica Connector Troubleshooting +slug: /connectors/database/vertica/troubleshooting +--- + +# Troubleshooting + +Learn how to resolve the most common problems people encounter in the Vertica connector. + +## Profiler: New session rejected + +If the profiler fails with the error `New session rejected due to limit, already XYZ sessions active`, +it means that the number of threads configured in the profiler workflow exceeds the connection limit of your +Vertica instance. + +Note that by default the profiler runs with 5 threads. In case you see this error, you might need to reduce this number. 
diff --git a/openmetadata-docs/content/menu.md b/openmetadata-docs/content/menu.md index 119160fbc7a..16de12e3a97 100644 --- a/openmetadata-docs/content/menu.md +++ b/openmetadata-docs/content/menu.md @@ -344,6 +344,8 @@ site_menu: url: /connectors/database/vertica/airflow - category: Connectors / Database / Vertica / CLI url: /connectors/database/vertica/cli + - category: Connectors / Database / Vertica / Troubleshooting + url: /connectors/database/vertica/troubleshooting - category: Connectors / Dashboard url: /connectors/dashboard - category: Connectors / Dashboard / Looker