Mirror of https://github.com/datahub-project/datahub.git, synced 2025-10-29 09:52:40 +00:00
feat(ingestion): Added Databricks support to Fivetran source (#14897)
This commit is contained in: parent a043d15193, commit 40b51ac2da
@@ -9,9 +9,10 @@ This source extracts the following:

## Configuration Notes

1. Fivetran supports the fivetran platform connector to dump the log events and the connectors, destinations, users, and roles metadata into your destination.
1. Fivetran supports the [fivetran platform connector](https://fivetran.com/docs/logs/fivetran-platform) to dump the log events and the connectors, destinations, users, and roles metadata into your destination.
2. You need to set up and start the initial sync of the fivetran platform connector before using this source. Refer to the [setup guide](https://fivetran.com/docs/logs/fivetran-platform/setup-guide).
3. Once the initial sync of your fivetran platform connector is done, provide the connector's destination platform and its configuration in the recipe.
4. We expect users to keep automatic schema updates enabled (the default) on the fivetran platform connector configured for DataHub; this ensures the latest schema changes are applied and avoids inconsistent data syncs.
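At a high level, the recipe selects a `destination_platform` and supplies the matching `*_destination_config` block. A minimal skeleton, with placeholder values and the Snowflake block shown purely for illustration (its field names follow the Snowflake destination guide):

```yaml
source:
  type: fivetran
  config:
    fivetran_log_config:
      # One of: snowflake (default), bigquery, databricks
      destination_platform: snowflake
      snowflake_destination_config:
        account_id: "my_account"
        warehouse: "fivetran_warehouse"
        username: "datahub_user"
        password: "datahub_pass"
        role: "fivetran_datahub"
        database: "fivetran_database"
        log_schema: "fivetran_log"
```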
## Concept mapping

@@ -30,6 +31,7 @@ Works only for

- Snowflake destination
- Bigquery destination
- Databricks destination

## Snowflake destination Configuration Guide
@@ -58,6 +60,22 @@ grant role fivetran_datahub to user snowflake_user;

1. If your fivetran platform connector destination is bigquery, you need to set up a service account as per the [BigQuery docs](https://cloud.google.com/iam/docs/creating-managing-service-accounts#iam-service-accounts-create-console) and grant it the BigQuery Data Viewer and BigQuery Job User IAM roles.
2. Create and download a service account JSON key file, and provide the BigQuery connection credentials in the bigquery destination config (see the fragment below).
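A minimal `bigquery_destination_config` fragment might look like the following; the credential keys mirror a standard GCP service account key file, and every value here is a placeholder:

```yaml
fivetran_log_config:
  destination_platform: bigquery
  bigquery_destination_config:
    credential:
      project_id: "my-gcp-project"
      private_key_id: "d0121d0000882411234e11166c6aaa23ed5d74e0"
      private_key: "-----BEGIN PRIVATE KEY-----\nMIIyourkey\n-----END PRIVATE KEY-----\n"
      client_email: "fivetran-datahub@my-gcp-project.iam.gserviceaccount.com"
      client_id: "123456678890"
    dataset: "fivetran_log_dataset"
```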
## Databricks destination Configuration Guide

1. Get your Databricks instance's [workspace url](https://docs.databricks.com/workspace/workspace-details.html#workspace-instance-names-urls-and-ids).
2. Create a [Databricks Service Principal](https://docs.databricks.com/administration-guide/users-groups/service-principals.html#what-is-a-service-principal).
   1. You can skip this step and use your own account to get things running quickly, but we strongly recommend creating a dedicated service principal for production use.
3. Generate a Databricks personal access token by following one of these guides:
   1. [Service Principals](https://docs.databricks.com/administration-guide/users-groups/service-principals.html#personal-access-tokens)
   2. [Personal Access Tokens](https://docs.databricks.com/dev-tools/auth.html#databricks-personal-access-tokens)
4. Provision your service principal: to ingest your workspace's metadata and lineage, it must have all of the following (example grants are shown after this list):
   1. One of: metastore admin role, ownership of, or `USE CATALOG` privilege on any catalogs you want to ingest
   2. One of: metastore admin role, ownership of, or `USE SCHEMA` privilege on any schemas you want to ingest
   3. Ownership of or `SELECT` privilege on any tables and views you want to ingest
   4. [Ownership documentation](https://docs.databricks.com/data-governance/unity-catalog/manage-privileges/ownership.html)
   5. [Privileges documentation](https://docs.databricks.com/data-governance/unity-catalog/manage-privileges/privileges.html)
5. Check the starter recipe below and replace `workspace_url` and `token` with your information from the previous steps.
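For step 4, the grants for a dedicated service principal could look roughly like this in Unity Catalog SQL; the catalog, schema, and principal names below are placeholders for your own:

```sql
-- Let the service principal see and query the fivetran platform connector's log tables
GRANT USE CATALOG ON CATALOG fivetran_catalog TO `fivetran-datahub-sp`;
GRANT USE SCHEMA ON SCHEMA fivetran_catalog.fivetran_log TO `fivetran-datahub-sp`;
GRANT SELECT ON SCHEMA fivetran_catalog.fivetran_log TO `fivetran-datahub-sp`;
```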
## Advanced Configurations

### Working with Platform Instances
@@ -26,7 +26,18 @@ source:

client_id: "client_id"
private_key: "private_key"
dataset: "fivetran_log_dataset"
# Optional - If destination platform is 'databricks', provide databricks configuration.
databricks_destination_config:
# Credentials
credential:
token: "token"
workspace_url: "workspace_url"
warehouse_id: "warehouse_id"

# Coordinates
catalog: "fivetran_catalog"
log_schema: "fivetran_log"

# Optional - filter for certain connector names instead of ingesting everything.
# connector_patterns:
# allow:
@@ -365,6 +365,10 @@ slack = {

"tenacity>=8.0.1",
}

databricks_common = {
"databricks-sqlalchemy~=1.0", # Note: This is pinned to 1.0 for compatibility with SQLAlchemy 1.x which is default for fivetran
}

databricks = {
# 0.1.11 appears to have authentication issues with azure databricks
# 0.22.0 has support for `include_browse` in metadata list apis

@@ -466,7 +470,14 @@ plugins: Dict[str, Set[str]] = {

# https://www.elastic.co/guide/en/elasticsearch/client/python-api/current/release-notes.html#rn-7-14-0
# https://github.com/elastic/elasticsearch-py/issues/1639#issuecomment-883587433
"elasticsearch": {"elasticsearch==7.13.4", *cachetools_lib},
"excel": {"openpyxl>=3.1.5", "pandas", *aws_common, *abs_base, *cachetools_lib, *data_lake_profiling},
"excel": {
"openpyxl>=3.1.5",
"pandas",
*aws_common,
*abs_base,
*cachetools_lib,
*data_lake_profiling,
},
"cassandra": {
"cassandra-driver>=3.28.0",
# We were seeing an error like this `numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject`

@@ -582,7 +593,11 @@ plugins: Dict[str, Set[str]] = {

"unity-catalog": databricks | sql_common,
# databricks is alias for unity-catalog and needs to be kept in sync
"databricks": databricks | sql_common,
"fivetran": snowflake_common | bigquery_common | sqlalchemy_lib | sqlglot_lib,
"fivetran": snowflake_common
| bigquery_common
| databricks_common
| sqlalchemy_lib
| sqlglot_lib,
"snaplogic": set(),
"qlik-sense": sqlglot_lib | {"requests", "websocket-client"},
"sigma": sqlglot_lib | {"requests"},

@@ -737,7 +752,7 @@ base_dev_requirements = {

"cassandra",
"neo4j",
"vertexai",
"mssql-odbc"
"mssql-odbc",
]
if plugin
for dependency in plugins[plugin]
@@ -29,6 +29,9 @@ from datahub.ingestion.source.state.stale_entity_removal_handler import (

from datahub.ingestion.source.state.stateful_ingestion_base import (
StatefulIngestionConfigBase,
)
from datahub.ingestion.source.unity.config import (
UnityCatalogConnectionConfig,
)
from datahub.utilities.lossy_collections import LossyList
from datahub.utilities.perf_timer import PerfTimer

@@ -56,8 +59,8 @@ class Constant:

STATUS = "status"
USER_ID = "user_id"
EMAIL = "email"
CONNECTOR_ID = "connector_id"
CONNECTOR_NAME = "connector_name"
CONNECTOR_ID = "connection_id"
CONNECTOR_NAME = "connection_name"
CONNECTOR_TYPE_ID = "connector_type_id"
PAUSED = "paused"
SYNC_FREQUENCY = "sync_frequency"

@@ -85,10 +88,23 @@ class BigQueryDestinationConfig(BigQueryConnectionConfig):

dataset: str = Field(description="The fivetran connector log dataset.")


class DatabricksDestinationConfig(UnityCatalogConnectionConfig):
catalog: str = Field(description="The fivetran connector log catalog.")
log_schema: str = Field(description="The fivetran connector log schema.")

@pydantic.validator("warehouse_id")
def warehouse_id_should_not_be_empty(cls, warehouse_id: Optional[str]) -> str:
if warehouse_id is None or (warehouse_id and warehouse_id.strip() == ""):
raise ValueError("Fivetran requires warehouse_id to be set")
return warehouse_id


class FivetranLogConfig(ConfigModel):
destination_platform: Literal["snowflake", "bigquery"] = pydantic.Field(
default="snowflake",
description="The destination platform where fivetran connector log tables are dumped.",
destination_platform: Literal["snowflake", "bigquery", "databricks"] = (
pydantic.Field(
default="snowflake",
description="The destination platform where fivetran connector log tables are dumped.",
)
)
snowflake_destination_config: Optional[SnowflakeDestinationConfig] = pydantic.Field(
default=None,

@@ -98,6 +114,12 @@ class FivetranLogConfig(ConfigModel):

default=None,
description="If destination platform is 'bigquery', provide bigquery configuration.",
)
databricks_destination_config: Optional[DatabricksDestinationConfig] = (
pydantic.Field(
default=None,
description="If destination platform is 'databricks', provide databricks configuration.",
)
)
_rename_destination_config = pydantic_renamed_field(
"destination_config", "snowflake_destination_config"
)

@@ -115,6 +137,11 @@ class FivetranLogConfig(ConfigModel):

raise ValueError(
"If destination platform is 'bigquery', user must provide bigquery destination configuration in the recipe."
)
elif destination_platform == "databricks":
if "databricks_destination_config" not in values:
raise ValueError(
"If destination platform is 'databricks', user must provide databricks destination configuration in the recipe."
)
else:
raise ValueError(
f"Destination platform '{destination_platform}' is not yet supported."
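A quick sanity check on the `warehouse_id` validator above: an explicitly blank warehouse ID is rejected, while a real SQL warehouse ID passes. This is a minimal sketch; the import path is assumed and all values are placeholders.

```python
import pydantic

# Assumed import path for the model introduced above.
from datahub.ingestion.source.fivetran.config import DatabricksDestinationConfig

base = dict(
    token="dapi-example-token",
    workspace_url="https://my-workspace.cloud.databricks.com",
    catalog="fivetran_catalog",
    log_schema="fivetran_log",
)

try:
    DatabricksDestinationConfig(**base, warehouse_id="   ")  # blank -> rejected
except pydantic.ValidationError as err:
    print(err)  # expected to mention "Fivetran requires warehouse_id to be set"

config = DatabricksDestinationConfig(**base, warehouse_id="abc123def456")  # accepted
```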
@@ -66,7 +66,6 @@ logger = logging.getLogger(__name__)

class FivetranSource(StatefulIngestionSourceBase):
"""
This plugin extracts fivetran users, connectors, destinations and sync history.
This plugin is in beta and has only been tested on Snowflake connector.
"""

config: FivetranSourceConfig

@@ -73,6 +73,19 @@ class FivetranLogAPI:

if result is None:
raise ValueError("Failed to retrieve BigQuery project ID")
fivetran_log_database = result[0]
elif destination_platform == "databricks":
databricks_destination_config = (
self.fivetran_log_config.databricks_destination_config
)
if databricks_destination_config is not None:
engine = create_engine(
databricks_destination_config.get_sql_alchemy_url(
databricks_destination_config.catalog
),
**databricks_destination_config.get_options(),
)
fivetran_log_query.set_schema(databricks_destination_config.log_schema)
fivetran_log_database = databricks_destination_config.catalog
else:
raise ConfigurationError(
f"Destination platform '{destination_platform}' is not yet supported."
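To make the new Databricks branch concrete: the SQLAlchemy engine is built from the destination config's URL and client options. A minimal sketch, assuming the `databricks` SQLAlchemy dialect is installed and using placeholder values (the exact query-string ordering of the generated URL may differ):

```python
from sqlalchemy import create_engine

# Assumed import path for the config model introduced in this change.
from datahub.ingestion.source.fivetran.config import DatabricksDestinationConfig

cfg = DatabricksDestinationConfig(
    token="dapi-example-token",
    workspace_url="https://my-workspace.cloud.databricks.com",
    warehouse_id="abc123def456",
    catalog="fivetran_catalog",
    log_schema="fivetran_log",
)

url = cfg.get_sql_alchemy_url(cfg.catalog)
# Roughly: databricks://token:dapi-example-token@my-workspace.cloud.databricks.com/fivetran_catalog
#          ?catalog=fivetran_catalog&http_path=/sql/1.0/warehouses/abc123def456
engine = create_engine(url, **cfg.get_options())  # engine creation is lazy; no connection yet
```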
@@ -6,6 +6,21 @@ MAX_COLUMN_LINEAGE_PER_CONNECTOR = 1000

MAX_JOBS_PER_CONNECTOR = 500


"""
------------------------------------------------------------------------------------------------------------
Fivetran Platform Connector Handling
------------------------------------------------------------------------------------------------------------
Current Query Change Log: August 2025 (See: https://fivetran.com/docs/changelog/2025/august-2025)

All queries have to be updated whenever a Fivetran Platform Connector release changes the schema. We expect customers
and Fivetran to keep the platform connector configured for DataHub with auto sync enabled so the latest changes are picked up.

References:
- Fivetran Release Notes: https://fivetran.com/docs/changelog (Look for "Fivetran Platform Connector")
- Latest Platform Connector Schema: https://fivetran.com/docs/logs/fivetran-platform?erdModal=open
"""


class FivetranLogQuery:
# Note: All queries are written in Snowflake SQL.
# They will be transpiled to the target database's SQL dialect at runtime.
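The class comment above says the queries are written in Snowflake SQL and transpiled to the destination's dialect at runtime. A minimal sketch of that idea with sqlglot, which is already a dependency of the `fivetran` plugin per the setup.py change above; whether the source uses exactly this call is an assumption:

```python
import sqlglot

# Snowflake-flavoured query resembling the connection query below (placeholder schema name).
snowflake_sql = """
SELECT connection_id, connection_name
FROM fivetran_log.connection
QUALIFY ROW_NUMBER() OVER (PARTITION BY connection_id ORDER BY _fivetran_synced DESC) = 1
"""

# Transpile to the Databricks SQL dialect.
print(sqlglot.transpile(snowflake_sql, read="snowflake", write="databricks")[0])
```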
@@ -30,17 +45,17 @@

def get_connectors_query(self) -> str:
return f"""\
SELECT
connector_id,
connection_id,
connecting_user_id,
connector_type_id,
connector_name,
connection_name,
paused,
sync_frequency,
destination_id
FROM {self.schema_clause}connector
FROM {self.schema_clause}connection
WHERE
_fivetran_deleted = FALSE
QUALIFY ROW_NUMBER() OVER (PARTITION BY connector_id ORDER BY _fivetran_synced DESC) = 1
QUALIFY ROW_NUMBER() OVER (PARTITION BY connection_id ORDER BY _fivetran_synced DESC) = 1
"""

def get_users_query(self) -> str:

@@ -63,20 +78,20 @@ FROM {self.schema_clause}user

return f"""\
WITH ranked_syncs AS (
SELECT
connector_id,
connection_id,
sync_id,
MAX(CASE WHEN message_event = 'sync_start' THEN time_stamp END) as start_time,
MAX(CASE WHEN message_event = 'sync_end' THEN time_stamp END) as end_time,
MAX(CASE WHEN message_event = 'sync_end' THEN message_data END) as end_message_data,
ROW_NUMBER() OVER (PARTITION BY connector_id ORDER BY MAX(time_stamp) DESC) as rn
ROW_NUMBER() OVER (PARTITION BY connection_id ORDER BY MAX(time_stamp) DESC) as rn
FROM {self.schema_clause}log
WHERE message_event in ('sync_start', 'sync_end')
AND time_stamp > CURRENT_TIMESTAMP - INTERVAL '{syncs_interval} days'
AND connector_id IN ({formatted_connector_ids})
GROUP BY connector_id, sync_id
AND connection_id IN ({formatted_connector_ids})
GROUP BY connection_id, sync_id
)
SELECT
connector_id,
connection_id,
sync_id,
start_time,
end_time,

@@ -85,7 +100,7 @@ FROM ranked_syncs

WHERE rn <= {MAX_JOBS_PER_CONNECTOR}
AND start_time IS NOT NULL
AND end_time IS NOT NULL
ORDER BY connector_id, end_time DESC
ORDER BY connection_id, end_time DESC
"""

def get_table_lineage_query(self, connector_ids: List[str]) -> str:

@@ -97,7 +112,7 @@ SELECT

*
FROM (
SELECT
stm.connector_id as connector_id,
stm.connection_id as connection_id,
stm.id as source_table_id,
stm.name as source_table_name,
ssm.name as source_schema_name,

@@ -105,18 +120,18 @@ FROM (

dtm.name as destination_table_name,
dsm.name as destination_schema_name,
tl.created_at as created_at,
ROW_NUMBER() OVER (PARTITION BY stm.connector_id, stm.id, dtm.id ORDER BY tl.created_at DESC) as table_combo_rn
ROW_NUMBER() OVER (PARTITION BY stm.connection_id, stm.id, dtm.id ORDER BY tl.created_at DESC) as table_combo_rn
FROM {self.schema_clause}table_lineage as tl
JOIN {self.schema_clause}source_table_metadata as stm on tl.source_table_id = stm.id
JOIN {self.schema_clause}destination_table_metadata as dtm on tl.destination_table_id = dtm.id
JOIN {self.schema_clause}source_schema_metadata as ssm on stm.schema_id = ssm.id
JOIN {self.schema_clause}destination_schema_metadata as dsm on dtm.schema_id = dsm.id
WHERE stm.connector_id IN ({formatted_connector_ids})
JOIN {self.schema_clause}source_table as stm on tl.source_table_id = stm.id -- stm: source_table_metadata
JOIN {self.schema_clause}destination_table as dtm on tl.destination_table_id = dtm.id -- dtm: destination_table_metadata
JOIN {self.schema_clause}source_schema as ssm on stm.schema_id = ssm.id -- ssm: source_schema_metadata
JOIN {self.schema_clause}destination_schema as dsm on dtm.schema_id = dsm.id -- dsm: destination_schema_metadata
WHERE stm.connection_id IN ({formatted_connector_ids})
)
-- Ensure that we only get back one entry per source and destination pair.
WHERE table_combo_rn = 1
QUALIFY ROW_NUMBER() OVER (PARTITION BY connector_id ORDER BY created_at DESC) <= {MAX_TABLE_LINEAGE_PER_CONNECTOR}
ORDER BY connector_id, created_at DESC
QUALIFY ROW_NUMBER() OVER (PARTITION BY connection_id ORDER BY created_at DESC) <= {MAX_TABLE_LINEAGE_PER_CONNECTOR}
ORDER BY connection_id, created_at DESC
"""

def get_column_lineage_query(self, connector_ids: List[str]) -> str:

@@ -131,25 +146,25 @@ SELECT

destination_column_name
FROM (
SELECT
stm.connector_id as connector_id,
stm.connection_id as connection_id,
scm.table_id as source_table_id,
dcm.table_id as destination_table_id,
scm.name as source_column_name,
dcm.name as destination_column_name,
cl.created_at as created_at,
ROW_NUMBER() OVER (PARTITION BY stm.connector_id, cl.source_column_id, cl.destination_column_id ORDER BY cl.created_at DESC) as column_combo_rn
ROW_NUMBER() OVER (PARTITION BY stm.connection_id, cl.source_column_id, cl.destination_column_id ORDER BY cl.created_at DESC) as column_combo_rn
FROM {self.schema_clause}column_lineage as cl
JOIN {self.schema_clause}source_column_metadata as scm
JOIN {self.schema_clause}source_column as scm -- scm: source_column_metadata
ON cl.source_column_id = scm.id
JOIN {self.schema_clause}destination_column_metadata as dcm
JOIN {self.schema_clause}destination_column as dcm -- dcm: destination_column_metadata
ON cl.destination_column_id = dcm.id
-- Only joining source_table_metadata to get the connector_id.
JOIN {self.schema_clause}source_table_metadata as stm
-- Only joining source_table to get the connection_id.
JOIN {self.schema_clause}source_table as stm -- stm: source_table_metadata
ON scm.table_id = stm.id
WHERE stm.connector_id IN ({formatted_connector_ids})
WHERE stm.connection_id IN ({formatted_connector_ids})
)
-- Ensure that we only get back one entry per (connector, source column, destination column) pair.
WHERE column_combo_rn = 1
QUALIFY ROW_NUMBER() OVER (PARTITION BY connector_id ORDER BY created_at DESC) <= {MAX_COLUMN_LINEAGE_PER_CONNECTOR}
ORDER BY connector_id, created_at DESC
QUALIFY ROW_NUMBER() OVER (PARTITION BY connection_id ORDER BY created_at DESC) <= {MAX_COLUMN_LINEAGE_PER_CONNECTOR}
ORDER BY connection_id, created_at DESC
"""
@@ -132,14 +132,13 @@ class UnityCatalogGEProfilerConfig(UnityCatalogProfilerConfig, GEProfilingConfig

)


class UnityCatalogSourceConfig(
SQLCommonConfig,
StatefulIngestionConfigBase,
BaseUsageConfig,
DatasetSourceConfigMixin,
StatefulProfilingConfigMixin,
LowerCaseDatasetUrnConfigMixin,
):
class UnityCatalogConnectionConfig(ConfigModel):
"""
Configuration for connecting to Databricks Unity Catalog.
Contains only connection-related fields that can be reused across different sources.
"""

scheme: str = DATABRICKS
token: str = pydantic.Field(description="Databricks personal access token")
workspace_url: str = pydantic.Field(
description="Databricks workspace url. e.g. https://my-workspace.cloud.databricks.com"

@@ -156,15 +155,41 @@ class UnityCatalogSourceConfig(

"When warehouse_id is missing, these features will be automatically disabled (with warnings) to allow ingestion to continue."
),
)
include_hive_metastore: bool = pydantic.Field(
default=INCLUDE_HIVE_METASTORE_DEFAULT,
description="Whether to ingest legacy `hive_metastore` catalog. This requires executing queries on SQL warehouse.",
)
workspace_name: Optional[str] = pydantic.Field(
default=None,
description="Name of the workspace. Default to deployment name present in workspace_url",

extra_client_options: Dict[str, Any] = Field(
default={},
description="Additional options to pass to Databricks SQLAlchemy client.",
)

def __init__(self, **data: Any):
super().__init__(**data)

def get_sql_alchemy_url(self, database: Optional[str] = None) -> str:
uri_opts = {"http_path": f"/sql/1.0/warehouses/{self.warehouse_id}"}
if database:
uri_opts["catalog"] = database
return make_sqlalchemy_uri(
scheme=self.scheme,
username="token",
password=self.token,
at=urlparse(self.workspace_url).netloc,
db=database,
uri_opts=uri_opts,
)

def get_options(self) -> dict:
return self.extra_client_options


class UnityCatalogSourceConfig(
UnityCatalogConnectionConfig,
SQLCommonConfig,
StatefulIngestionConfigBase,
BaseUsageConfig,
DatasetSourceConfigMixin,
StatefulProfilingConfigMixin,
LowerCaseDatasetUrnConfigMixin,
):
include_metastore: bool = pydantic.Field(
default=False,
description=(

@@ -344,7 +369,15 @@ class UnityCatalogSourceConfig(

_forced_disable_tag_extraction: bool = pydantic.PrivateAttr(default=False)
_forced_disable_hive_metastore_extraction = pydantic.PrivateAttr(default=False)

scheme: str = DATABRICKS
include_hive_metastore: bool = pydantic.Field(
default=INCLUDE_HIVE_METASTORE_DEFAULT,
description="Whether to ingest legacy `hive_metastore` catalog. This requires executing queries on SQL warehouse.",
)

workspace_name: Optional[str] = pydantic.Field(
default=None,
description="Name of the workspace. Default to deployment name present in workspace_url",
)

def __init__(self, **data):
# First, let the parent handle the root validators and field processing

@@ -386,19 +419,6 @@ class UnityCatalogSourceConfig(

forced_disable_hive_metastore_extraction
)

def get_sql_alchemy_url(self, database: Optional[str] = None) -> str:
uri_opts = {"http_path": f"/sql/1.0/warehouses/{self.warehouse_id}"}
if database:
uri_opts["catalog"] = database
return make_sqlalchemy_uri(
scheme=self.scheme,
username="token",
password=self.token,
at=urlparse(self.workspace_url).netloc,
db=database,
uri_opts=uri_opts,
)

def is_profiling_enabled(self) -> bool:
return self.profiling.enabled and is_profiling_enabled(
self.profiling.operation_config
@@ -26,19 +26,19 @@ FROZEN_TIME = "2022-06-07 17:00:00"

default_connector_query_results = [
{
"connector_id": "calendar_elected",
"connection_id": "calendar_elected",
"connecting_user_id": "reapply_phone",
"connector_type_id": "postgres",
"connector_name": "postgres",
"connection_name": "postgres",
"paused": False,
"sync_frequency": 1440,
"destination_id": "interval_unconstitutional",
},
{
"connector_id": "my_confluent_cloud_connector_id",
"connection_id": "my_confluent_cloud_connector_id",
"connecting_user_id": "reapply_phone",
"connector_type_id": "confluent_cloud",
"connector_name": "confluent_cloud",
"connection_name": "confluent_cloud",
"paused": False,
"sync_frequency": 1440,
"destination_id": "my_confluent_cloud_connector_id",

@@ -60,7 +60,7 @@ def default_query_results(

):
return [
{
"connector_id": "calendar_elected",
"connection_id": "calendar_elected",
"source_table_id": "10040",
"source_table_name": "employee",
"source_schema_name": "public",

@@ -69,7 +69,7 @@ def default_query_results(

"destination_schema_name": "postgres_public",
},
{
"connector_id": "calendar_elected",
"connection_id": "calendar_elected",
"source_table_id": "10041",
"source_table_name": "company",
"source_schema_name": "public",

@@ -78,7 +78,7 @@ def default_query_results(

"destination_schema_name": "postgres_public",
},
{
"connector_id": "my_confluent_cloud_connector_id",
"connection_id": "my_confluent_cloud_connector_id",
"source_table_id": "10042",
"source_table_name": "my-source-topic",
"source_schema_name": "confluent_cloud",

@@ -131,28 +131,28 @@ def default_query_results(

):
return [
{
"connector_id": "calendar_elected",
"connection_id": "calendar_elected",
"sync_id": "4c9a03d6-eded-4422-a46a-163266e58243",
"start_time": datetime.datetime(2023, 9, 20, 6, 37, 32, 606000),
"end_time": datetime.datetime(2023, 9, 20, 6, 38, 5, 56000),
"end_message_data": '"{\\"status\\":\\"SUCCESSFUL\\"}"',
},
{
"connector_id": "calendar_elected",
"connection_id": "calendar_elected",
"sync_id": "f773d1e9-c791-48f4-894f-8cf9b3dfc834",
"start_time": datetime.datetime(2023, 10, 3, 14, 35, 30, 345000),
"end_time": datetime.datetime(2023, 10, 3, 14, 35, 31, 512000),
"end_message_data": '"{\\"reason\\":\\"Sync has been cancelled because of a user action in the dashboard.Standard Config updated.\\",\\"status\\":\\"CANCELED\\"}"',
},
{
"connector_id": "calendar_elected",
"connection_id": "calendar_elected",
"sync_id": "63c2fc85-600b-455f-9ba0-f576522465be",
"start_time": datetime.datetime(2023, 10, 3, 14, 35, 55, 401000),
"end_time": datetime.datetime(2023, 10, 3, 14, 36, 29, 678000),
"end_message_data": '"{\\"reason\\":\\"java.lang.RuntimeException: FATAL: too many connections for role \\\\\\"hxwraqld\\\\\\"\\",\\"taskType\\":\\"reconnect\\",\\"status\\":\\"FAILURE_WITH_TASK\\"}"',
},
{
"connector_id": "my_confluent_cloud_connector_id",
"connection_id": "my_confluent_cloud_connector_id",
"sync_id": "d9a03d6-eded-4422-a46a-163266e58244",
"start_time": datetime.datetime(2023, 9, 20, 6, 37, 32, 606000),
"end_time": datetime.datetime(2023, 9, 20, 6, 38, 5, 56000),

@@ -360,19 +360,19 @@ def test_fivetran_with_snowflake_dest_and_null_connector_user(pytestconfig, tmp_

connector_query_results = [
{
"connector_id": "calendar_elected",
"connection_id": "calendar_elected",
"connecting_user_id": None,
"connector_type_id": "postgres",
"connector_name": "postgres",
"connection_name": "postgres",
"paused": False,
"sync_frequency": 1440,
"destination_id": "interval_unconstitutional",
},
{
"connector_id": "my_confluent_cloud_connector_id",
"connection_id": "my_confluent_cloud_connector_id",
"connecting_user_id": None,
"connector_type_id": "confluent_cloud",
"connector_name": "confluent_cloud",
"connection_name": "confluent_cloud",
"paused": False,
"sync_frequency": 1440,
"destination_id": "interval_unconstitutional",

@@ -134,6 +134,9 @@ def test_warehouse_id_must_be_set_if_include_hive_metastore_is_true():

assert config.warehouse_id is None


@pytest.mark.skip(
reason="This test is making actual network calls with retries taking ~5 mins, needs to be mocked"
)
def test_warehouse_id_must_be_present_test_connection():
"""Test that connection succeeds when hive_metastore gets auto-disabled."""
config_dict = {