diff --git a/ingestion/src/metadata/examples/workflows/databricks.yaml b/ingestion/src/metadata/examples/workflows/databricks.yaml
index a7c3cfa42e6..f4e334deea9 100644
--- a/ingestion/src/metadata/examples/workflows/databricks.yaml
+++ b/ingestion/src/metadata/examples/workflows/databricks.yaml
@@ -4,6 +4,7 @@ source:
   serviceConnection:
     config:
       catalog: hive_metastore
+      databaseSchema: default
       token:
       hostPort: localhost:443
       connectionArguments:
diff --git a/ingestion/src/metadata/ingestion/source/database/databricks.py b/ingestion/src/metadata/ingestion/source/database/databricks.py
index f1bd54b74b3..efda634c2f9 100644
--- a/ingestion/src/metadata/ingestion/source/database/databricks.py
+++ b/ingestion/src/metadata/ingestion/source/database/databricks.py
@@ -210,31 +210,30 @@ class DatabricksSource(CommonDbSourceService):
         else:
             results = self.connection.execute("SHOW CATALOGS")
             for res in results:
-                new_catalog = res[0]
-                database_fqn = fqn.build(
-                    self.metadata,
-                    entity_type=Database,
-                    service_name=self.context.database_service.name.__root__,
-                    database_name=new_catalog,
-                )
-                if filter_by_database(
-                    self.source_config.databaseFilterPattern,
-                    database_fqn
-                    if self.source_config.useFqnForFiltering
-                    else new_catalog,
-                ):
-                    self.status.filter(database_fqn, "Database Filtered Out")
-                    continue
-
-                try:
-
-                    self.set_inspector(database_name=new_catalog)
-                    yield new_catalog
-                except Exception as exc:
-                    logger.error(traceback.format_exc())
-                    logger.warning(
-                        f"Error trying to process database {new_catalog}: {exc}"
+                if res:
+                    new_catalog = res[0]
+                    database_fqn = fqn.build(
+                        self.metadata,
+                        entity_type=Database,
+                        service_name=self.context.database_service.name.__root__,
+                        database_name=new_catalog,
                     )
+                    if filter_by_database(
+                        self.source_config.databaseFilterPattern,
+                        database_fqn
+                        if self.source_config.useFqnForFiltering
+                        else new_catalog,
+                    ):
+                        self.status.filter(database_fqn, "Database Filtered Out")
+                        continue
+                    try:
+                        self.set_inspector(database_name=new_catalog)
+                        yield new_catalog
+                    except Exception as exc:
+                        logger.error(traceback.format_exc())
+                        logger.warning(
+                            f"Error trying to process database {new_catalog}: {exc}"
+                        )
 
     def get_raw_database_schema_names(self) -> Iterable[str]:
         if self.service_connection.__dict__.get("databaseSchema"):
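The functional change in the hunk above is the new `if res:` guard; the rest is the re-indentation it forces. As a quick illustration of why the guard matters when iterating `SHOW CATALOGS` results, here is a minimal, self-contained sketch (illustration only, with made-up row values; this is not the connector code):

```python
from typing import Iterable, Optional, Tuple


def iter_catalog_names(rows: Iterable[Optional[Tuple[str, ...]]]) -> Iterable[str]:
    """Yield catalog names from SHOW CATALOGS-style rows, skipping empty ones."""
    for row in rows:
        if row:  # guard against None or empty rows before indexing row[0]
            yield row[0]


# An empty row no longer raises an IndexError:
print(list(iter_catalog_names([("hive_metastore",), (), ("samples",)])))
# ['hive_metastore', 'samples']
```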
databaseName="default", # In databricks databaseName is always default ) ) except Exception as err: diff --git a/ingestion/tests/unit/test_databricks_lineage.py b/ingestion/tests/unit/test_databricks_lineage.py index 1cf13eae00c..5260e1370f6 100644 --- a/ingestion/tests/unit/test_databricks_lineage.py +++ b/ingestion/tests/unit/test_databricks_lineage.py @@ -41,7 +41,6 @@ EXPECTED_DATABRICKS_DETAILS = [ analysisDate=datetime.now(), aborted=None, serviceName="local_databricks1", - databaseName="default", databaseSchema=None, ), TableQuery( @@ -52,7 +51,6 @@ EXPECTED_DATABRICKS_DETAILS = [ analysisDate=datetime.now(), aborted=None, serviceName="local_databricks1", - databaseName="default", databaseSchema=None, ), TableQuery( @@ -63,7 +61,6 @@ EXPECTED_DATABRICKS_DETAILS = [ analysisDate=datetime.now(), aborted=None, serviceName="local_databricks1", - databaseName="default", databaseSchema=None, ), TableQuery( @@ -74,7 +71,6 @@ EXPECTED_DATABRICKS_DETAILS = [ analysisDate=datetime.now(), aborted=None, serviceName="local_databricks1", - databaseName="default", databaseSchema=None, ), ] diff --git a/openmetadata-docs/content/connectors/database/databricks/airflow.md b/openmetadata-docs/content/connectors/database/databricks/airflow.md index 0576241ab5b..b7de00b07b9 100644 --- a/openmetadata-docs/content/connectors/database/databricks/airflow.md +++ b/openmetadata-docs/content/connectors/database/databricks/airflow.md @@ -54,6 +54,8 @@ source: serviceName: local_databricks serviceConnection: config: + catalog: hive_metastore + databaseSchema: default type: Databricks token: hostPort: @@ -138,6 +140,8 @@ workflowConfig: - **hostPort**: Enter the fully qualified hostname and port number for your Databricks deployment in the Host and Port field. - **token**: Generated Token to connect to Databricks. - **httpPath**: Databricks compute resources URL. +- **catalog**: Catalog of the data source(Example: hive_metastore). This is optional parameter, if you would like to restrict the metadata reading to a single catalog. When left blank, OpenMetadata Ingestion attempts to scan all the catalog. +- **databaseSchema**: DatabaseSchema of the data source. This is optional parameter, if you would like to restrict the metadata reading to a single databaseSchema. When left blank, OpenMetadata Ingestion attempts to scan all the databaseSchema. - **Connection Options (Optional)**: Enter the details for any additional connection options that can be sent to Databricks during the connection. These details must be added as Key-Value pairs. - **Connection Arguments (Optional)**: Enter the details for any additional connection arguments such as security or protocol configs that can be sent to Databricks during the connection. These details must be added as Key-Value pairs. 
diff --git a/openmetadata-docs/content/connectors/database/databricks/airflow.md b/openmetadata-docs/content/connectors/database/databricks/airflow.md
index 0576241ab5b..b7de00b07b9 100644
--- a/openmetadata-docs/content/connectors/database/databricks/airflow.md
+++ b/openmetadata-docs/content/connectors/database/databricks/airflow.md
@@ -54,6 +54,8 @@ source:
   serviceName: local_databricks
   serviceConnection:
     config:
+      catalog: hive_metastore
+      databaseSchema: default
       type: Databricks
       token:
       hostPort:
@@ -138,6 +140,8 @@ workflowConfig:
 - **hostPort**: Enter the fully qualified hostname and port number for your Databricks deployment in the Host and Port field.
 - **token**: Generated Token to connect to Databricks.
 - **httpPath**: Databricks compute resources URL.
+- **catalog**: Catalog of the data source (for example, `hive_metastore`). This is an optional parameter; use it to restrict metadata reading to a single catalog. When left blank, OpenMetadata Ingestion attempts to scan all catalogs.
+- **databaseSchema**: databaseSchema of the data source. This is an optional parameter; use it to restrict metadata reading to a single databaseSchema. When left blank, OpenMetadata Ingestion attempts to scan all schemas.
 - **Connection Options (Optional)**: Enter the details for any additional connection options that can be sent to Databricks during the connection. These details must be added as Key-Value pairs.
 - **Connection Arguments (Optional)**: Enter the details for any additional connection arguments such as security or protocol configs that can be sent to Databricks during the connection. These details must be added as Key-Value pairs.
   - In case you are using Single-Sign-On (SSO) for authentication, add the `authenticator` details in the Connection Arguments as a Key-Value pair as follows: `"authenticator" : "sso_login_url"`
@@ -384,6 +388,8 @@ source:
   serviceName: local_databricks
   serviceConnection:
     config:
+      catalog: hive_metastore
+      databaseSchema: default
       token:
       hostPort: localhost:443
       connectionArguments:
@@ -463,6 +469,8 @@ source:
   serviceName: local_databricks
   serviceConnection:
     config:
+      catalog: hive_metastore
+      databaseSchema: default
       type: Databricks
       token:
       hostPort:
diff --git a/openmetadata-docs/content/connectors/database/databricks/cli.md b/openmetadata-docs/content/connectors/database/databricks/cli.md
index ab85871a64d..2694920ffb0 100644
--- a/openmetadata-docs/content/connectors/database/databricks/cli.md
+++ b/openmetadata-docs/content/connectors/database/databricks/cli.md
@@ -54,6 +54,8 @@ source:
   serviceName: local_databricks
   serviceConnection:
     config:
+      catalog: hive_metastore
+      databaseSchema: default
       type: Databricks
       token:
       hostPort:
@@ -138,6 +140,8 @@ workflowConfig:
 - **hostPort**: Enter the fully qualified hostname and port number for your Databricks deployment in the Host and Port field.
 - **token**: Generated Token to connect to Databricks.
 - **httpPath**: Databricks compute resources URL.
+- **catalog**: Catalog of the data source (for example, `hive_metastore`). This is an optional parameter; use it to restrict metadata reading to a single catalog. When left blank, OpenMetadata Ingestion attempts to scan all catalogs.
+- **databaseSchema**: databaseSchema of the data source. This is an optional parameter; use it to restrict metadata reading to a single databaseSchema. When left blank, OpenMetadata Ingestion attempts to scan all schemas.
 - **Connection Options (Optional)**: Enter the details for any additional connection options that can be sent to Databricks during the connection. These details must be added as Key-Value pairs.
 - **Connection Arguments (Optional)**: Enter the details for any additional connection arguments such as security or protocol configs that can be sent to Databricks during the connection. These details must be added as Key-Value pairs.
   - In case you are using Single-Sign-On (SSO) for authentication, add the `authenticator` details in the Connection Arguments as a Key-Value pair as follows: `"authenticator" : "sso_login_url"`
@@ -337,6 +341,8 @@ source:
   serviceName: local_databricks
   serviceConnection:
     config:
+      catalog: hive_metastore
+      databaseSchema: default
       token:
       hostPort: localhost:443
       connectionArguments:
@@ -421,6 +427,8 @@ source:
   serviceName: local_databricks
   serviceConnection:
     config:
+      catalog: hive_metastore
+      databaseSchema: default
       type: Databricks
       token:
       hostPort:
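The new `catalog` and `databaseSchema` options documented above share one contract: when set, the scan is restricted to that single name; when left blank, everything is scanned. A minimal sketch of that contract (`schemas_to_scan` is a hypothetical helper, not the connector's implementation):

```python
from typing import Iterable, List, Optional


def schemas_to_scan(
    all_schemas: Iterable[str], configured_schema: Optional[str] = None
) -> List[str]:
    """Restrict the scan to the configured schema, or scan everything."""
    if configured_schema:
        return [s for s in all_schemas if s == configured_schema]
    return list(all_schemas)


print(schemas_to_scan(["default", "sales", "hr"], "default"))  # ['default']
print(schemas_to_scan(["default", "sales", "hr"]))  # ['default', 'sales', 'hr']
```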
diff --git a/openmetadata-docs/content/connectors/database/databricks/index.md b/openmetadata-docs/content/connectors/database/databricks/index.md
index e97a4783bf0..6a9c0182e04 100644
--- a/openmetadata-docs/content/connectors/database/databricks/index.md
+++ b/openmetadata-docs/content/connectors/database/databricks/index.md
@@ -133,6 +133,8 @@ the changes.
 - **Host and Port**: Enter the fully qualified hostname and port number for your Databricks deployment in the Host and Port field.
 - **Token**: Generated Token to connect to Databricks.
 - **HTTP Path**: Databricks compute resources URL.
+- **Catalog**: Catalog of the data source (for example, `hive_metastore`). This is an optional parameter; use it to restrict metadata reading to a single catalog. When left blank, OpenMetadata Ingestion attempts to scan all catalogs.
+- **DatabaseSchema**: databaseSchema of the data source. This is an optional parameter; use it to restrict metadata reading to a single databaseSchema. When left blank, OpenMetadata Ingestion attempts to scan all schemas.
 - **Connection Options (Optional)**: Enter the details for any additional connection options that can be sent to Databricks during the connection. These details must be added as Key-Value pairs.
 - **Connection Arguments (Optional)**: Enter the details for any additional connection arguments such as security or protocol configs that can be sent to Databricks during the connection. These details must be added as Key-Value pairs.
   - In case you are using Single-Sign-On (SSO) for authentication, add the `authenticator` details in the Connection Arguments as a Key-Value pair as follows: `"authenticator" : "sso_login_url"`
diff --git a/openmetadata-docs/content/connectors/database/databricks/troubleshooting.md b/openmetadata-docs/content/connectors/database/databricks/troubleshooting.md
index 04c91b27d3e..e1cec10d0fb 100644
--- a/openmetadata-docs/content/connectors/database/databricks/troubleshooting.md
+++ b/openmetadata-docs/content/connectors/database/databricks/troubleshooting.md
@@ -13,6 +13,8 @@ source:
   serviceName: local_databricks
   serviceConnection:
     config:
+      catalog: hive_metastore
+      databaseSchema: default
       token:
       hostPort: localhost:443
       connectionArguments:
diff --git a/openmetadata-docs/images/openmetadata/connectors/databricks/service-connection.png b/openmetadata-docs/images/openmetadata/connectors/databricks/service-connection.png
index ee007d678c2..b87eb3d78a8 100644
Binary files a/openmetadata-docs/images/openmetadata/connectors/databricks/service-connection.png and b/openmetadata-docs/images/openmetadata/connectors/databricks/service-connection.png differ
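For orientation, a Databricks catalog fills the database slot in OpenMetadata's dotted naming hierarchy, which is why `catalog` and `databaseSchema` narrow ingestion the way the docs above describe. A toy sketch assuming the usual `service.database.schema.table` shape (plain string joining only; the connector itself builds these names with the `fqn.build` helper seen earlier in the diff):

```python
def build_fqn(service: str, catalog: str, schema: str, table: str) -> str:
    """Dotted OpenMetadata-style fully qualified name (simplified)."""
    return ".".join((service, catalog, schema, table))


print(build_fqn("local_databricks", "hive_metastore", "default", "orders"))
# local_databricks.hive_metastore.default.orders
```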