2024-07-12 15:08:51 -07:00
import abc
2024-07-17 10:22:14 -07:00
from functools import cached_property
2024-12-12 05:01:32 -05:00
from typing import ClassVar , List , Literal , Optional , Tuple
2022-08-16 09:24:02 +05:30
2022-12-06 00:57:25 +05:30
from datahub . configuration . pattern_utils import is_schema_allowed
2024-06-14 13:23:07 -07:00
from datahub . emitter . mce_builder import make_dataset_urn_with_platform_instance
2024-07-12 15:08:51 -07:00
from datahub . ingestion . api . source import SourceReport
2022-12-28 21:50:37 +05:30
from datahub . ingestion . source . snowflake . constants import (
SNOWFLAKE_REGION_CLOUD_REGION_MAPPING ,
2023-12-18 19:54:31 +01:00
SnowflakeCloudProvider ,
2022-12-28 21:50:37 +05:30
SnowflakeObjectDomain ,
)
2024-07-12 15:08:51 -07:00
from datahub . ingestion . source . snowflake . snowflake_config import (
SnowflakeFilterConfig ,
SnowflakeIdentifierConfig ,
SnowflakeV2Config ,
)
2022-08-16 09:24:02 +05:30
from datahub . ingestion . source . snowflake . snowflake_report import SnowflakeV2Report
2024-07-12 15:08:51 -07:00
class SnowflakeStructuredReportMixin(abc.ABC):
    """Mixin for classes that emit structured (title/message/context) log entries.

    Subclasses must expose the SourceReport that receives those entries.
    """

    @property
    @abc.abstractmethod
    def structured_reporter(self) -> SourceReport:
        """The SourceReport used for structured reporting."""
class SnowsightUrlBuilder:
    """Builds external Snowsight (Snowflake web UI) URLs for databases, schemas, and tables."""

    # AWS regions whose Snowsight URLs omit the ".aws" cloud suffix. See:
    # https://docs.snowflake.com/en/user-guide/admin-account-identifier#non-vps-account-locator-formats-by-cloud-platform-and-region
    CLOUD_REGION_IDS_WITHOUT_CLOUD_SUFFIX: ClassVar = [
        "us-west-2",
        "us-east-1",
        "eu-west-1",
        "eu-central-1",
        "ap-southeast-1",
        "ap-southeast-2",
    ]

    snowsight_base_url: str

    def __init__(self, account_locator: str, region: str, privatelink: bool = False):
        cloud, cloud_region_id = self.get_cloud_region_from_snowflake_region_id(region)
        self.snowsight_base_url = self.create_snowsight_base_url(
            account_locator, cloud_region_id, cloud, privatelink
        )

    @staticmethod
    def create_snowsight_base_url(
        account_locator: str,
        cloud_region_id: str,
        cloud: str,
        privatelink: bool = False,
    ) -> str:
        """Construct the Snowsight base URL for an account.

        Args:
            account_locator: The Snowflake account locator.
            cloud_region_id: Region id, e.g. "us-west-2".
            cloud: Cloud provider id, e.g. "aws".
            privatelink: Use the PrivateLink URL form instead of app.snowflake.com.

        Returns:
            The base URL, always ending with a trailing slash.
        """
        # Initialize up front so the variable is always bound, even if `cloud`
        # is empty (previously this could raise UnboundLocalError).
        url_cloud_provider_suffix = ""
        if cloud:
            url_cloud_provider_suffix = f".{cloud}"
            if cloud == SnowflakeCloudProvider.AWS:
                # Some AWS regions do not have a cloud suffix. See below the list:
                # https://docs.snowflake.com/en/user-guide/admin-account-identifier#non-vps-account-locator-formats-by-cloud-platform-and-region
                if (
                    cloud_region_id
                    in SnowsightUrlBuilder.CLOUD_REGION_IDS_WITHOUT_CLOUD_SUFFIX
                ):
                    url_cloud_provider_suffix = ""
        if privatelink:
            url = f"https://app.{account_locator}.{cloud_region_id}.privatelink.snowflakecomputing.com/"
        else:
            url = f"https://app.snowflake.com/{cloud_region_id}{url_cloud_provider_suffix}/{account_locator}/"
        return url

    @staticmethod
    def get_cloud_region_from_snowflake_region_id(
        region: str,
    ) -> Tuple[str, str]:
        """Resolve a Snowflake region id into a (cloud, cloud_region_id) pair.

        Raises:
            ValueError: If the region id is not recognized.
        """
        cloud: str
        if region in SNOWFLAKE_REGION_CLOUD_REGION_MAPPING:
            cloud, cloud_region_id = SNOWFLAKE_REGION_CLOUD_REGION_MAPPING[region]
        elif region.startswith(("aws_", "gcp_", "azure_")):
            # e.g. aws_us_west_2, gcp_us_central1, azure_northeurope
            cloud, cloud_region_id = region.split("_", 1)
            cloud_region_id = cloud_region_id.replace("_", "-")
        else:
            # ValueError is backward compatible with the previous bare Exception.
            raise ValueError(f"Unknown snowflake region {region}")
        return cloud, cloud_region_id

    # domain is either "view" or "table"
    def get_external_url_for_table(
        self,
        table_name: str,
        schema_name: str,
        db_name: str,
        domain: Literal[SnowflakeObjectDomain.TABLE, SnowflakeObjectDomain.VIEW],
    ) -> Optional[str]:
        return f"{self.snowsight_base_url}#/data/databases/{db_name}/schemas/{schema_name}/{domain}/{table_name}/"

    def get_external_url_for_schema(
        self, schema_name: str, db_name: str
    ) -> Optional[str]:
        return f"{self.snowsight_base_url}#/data/databases/{db_name}/schemas/{schema_name}/"

    def get_external_url_for_database(self, db_name: str) -> Optional[str]:
        return f"{self.snowsight_base_url}#/data/databases/{db_name}/"
class SnowflakeFilter:
    """Applies the configured database/schema/table/view patterns to Snowflake datasets."""

    def __init__(
        self, filter_config: SnowflakeFilterConfig, structured_reporter: SourceReport
    ) -> None:
        self.filter_config = filter_config
        self.structured_reporter = structured_reporter

    # TODO: Refactor remaining filtering logic into this class.

    def is_dataset_pattern_allowed(
        self,
        dataset_name: Optional[str],
        dataset_type: Optional[str],
    ) -> bool:
        """Return True if the dataset passes all configured allow/deny patterns."""
        # Missing name or type: nothing to filter on.
        if not dataset_type or not dataset_name:
            return True

        domain = dataset_type.lower()
        if domain not in (
            SnowflakeObjectDomain.TABLE,
            SnowflakeObjectDomain.EXTERNAL_TABLE,
            SnowflakeObjectDomain.VIEW,
            SnowflakeObjectDomain.MATERIALIZED_VIEW,
            SnowflakeObjectDomain.ICEBERG_TABLE,
        ):
            return False

        if _is_sys_table(dataset_name):
            return False

        dataset_params = dataset_name.split(".")
        if len(dataset_params) != 3:
            self.structured_reporter.info(
                title="Unexpected dataset pattern",
                message=f"Found a {dataset_type} with an unexpected number of parts. Database and schema filtering will not work as expected, but table filtering will still work.",
                context=dataset_name,
            )
            # Fall through so table/view filtering still works.

        if dataset_params and not self.filter_config.database_pattern.allowed(
            dataset_params[0].strip('"')
        ):
            return False

        if len(dataset_params) >= 2 and not is_schema_allowed(
            self.filter_config.schema_pattern,
            dataset_params[1].strip('"'),
            dataset_params[0].strip('"'),
            self.filter_config.match_fully_qualified_names,
        ):
            return False

        if domain in {
            SnowflakeObjectDomain.TABLE
        } and not self.filter_config.table_pattern.allowed(
            _cleanup_qualified_name(dataset_name, self.structured_reporter)
        ):
            return False

        if domain in {
            SnowflakeObjectDomain.VIEW,
            SnowflakeObjectDomain.MATERIALIZED_VIEW,
        } and not self.filter_config.view_pattern.allowed(
            _cleanup_qualified_name(dataset_name, self.structured_reporter)
        ):
            return False

        return True
def _combine_identifier_parts (
* , table_name : str , schema_name : str , db_name : str
) - > str :
return f " { db_name } . { schema_name } . { table_name } "
2024-08-07 14:04:18 -07:00
def _is_sys_table ( table_name : str ) - > bool :
# Often will look like `SYS$_UNPIVOT_VIEW1737` or `sys$_pivot_view19`.
return table_name . lower ( ) . startswith ( " sys$ " )
2024-12-12 05:01:32 -05:00
def _split_qualified_name ( qualified_name : str ) - > List [ str ] :
"""
Split a qualified name into its constituent parts .
>> > _split_qualified_name ( " db.my_schema.my_table " )
[ ' db ' , ' my_schema ' , ' my_table ' ]
>> > _split_qualified_name ( ' " db " . " my_schema " . " my_table " ' )
[ ' db ' , ' my_schema ' , ' my_table ' ]
>> > _split_qualified_name ( ' TEST_DB.TEST_SCHEMA. " TABLE.WITH.DOTS " ' )
[ ' TEST_DB ' , ' TEST_SCHEMA ' , ' TABLE.WITH.DOTS ' ]
>> > _split_qualified_name ( ' TEST_DB. " SCHEMA.WITH.DOTS " .MY_TABLE ' )
[ ' TEST_DB ' , ' SCHEMA.WITH.DOTS ' , ' MY_TABLE ' ]
"""
# Fast path - no quotes.
if ' " ' not in qualified_name :
return qualified_name . split ( " . " )
# First pass - split on dots that are not inside quotes.
in_quote = False
parts : List [ List [ str ] ] = [ [ ] ]
for char in qualified_name :
if char == ' " ' :
in_quote = not in_quote
elif char == " . " and not in_quote :
parts . append ( [ ] )
else :
parts [ - 1 ] . append ( char )
# Second pass - remove outer pairs of quotes.
result = [ ]
for part in parts :
if len ( part ) > 2 and part [ 0 ] == ' " ' and part [ - 1 ] == ' " ' :
part = part [ 1 : - 1 ]
result . append ( " " . join ( part ) )
return result
2024-07-17 10:22:14 -07:00
# Qualified object names from snowflake audit logs have quotes for snowflake
# quoted identifiers, for example "test-database"."test-schema".test_table,
# whereas we generate urns without quotes even for quoted identifiers for
# backward compatibility and also unavailability of utility function to identify
# whether current table/schema/database name should be quoted in above method
# get_dataset_identifier.
def _cleanup_qualified_name(
    qualified_name: str, structured_reporter: SourceReport
) -> str:
    """Normalize a (possibly quoted) qualified name into unquoted db.schema.table form."""
    name_parts = _split_qualified_name(qualified_name)
    if len(name_parts) == 3:
        return _combine_identifier_parts(
            db_name=name_parts[0],
            schema_name=name_parts[1],
            table_name=name_parts[2],
        )

    # Unexpected shape: report it (system tables are expected noise and are
    # skipped), then fall back to simply stripping quotes.
    if not _is_sys_table(qualified_name):
        structured_reporter.info(
            title="Unexpected dataset pattern",
            message="We failed to parse a Snowflake qualified name into its constituent parts. "
            "DB/schema/table filtering may not work as expected on these entities.",
            context=f"{qualified_name} has {len(name_parts)} parts",
        )
    return qualified_name.replace('"', "")
class SnowflakeIdentifierBuilder:
    """Translates Snowflake object names into DataHub dataset identifiers and URNs."""

    platform = "snowflake"

    def __init__(
        self,
        identifier_config: SnowflakeIdentifierConfig,
        structured_reporter: SourceReport,
    ) -> None:
        self.identifier_config = identifier_config
        self.structured_reporter = structured_reporter

    def snowflake_identifier(self, identifier: str) -> str:
        # To stay in sync with the older connector, names are optionally lowercased.
        if not self.identifier_config.convert_urns_to_lowercase:
            return identifier
        return identifier.lower()

    def get_dataset_identifier(
        self, table_name: str, schema_name: str, db_name: str
    ) -> str:
        full_name = _combine_identifier_parts(
            table_name=table_name, schema_name=schema_name, db_name=db_name
        )
        return self.snowflake_identifier(full_name)

    def gen_dataset_urn(self, dataset_identifier: str) -> str:
        return make_dataset_urn_with_platform_instance(
            platform=self.platform,
            name=dataset_identifier,
            platform_instance=self.identifier_config.platform_instance,
            env=self.identifier_config.env,
        )

    def get_dataset_identifier_from_qualified_name(self, qualified_name: str) -> str:
        cleaned = _cleanup_qualified_name(qualified_name, self.structured_reporter)
        return self.snowflake_identifier(cleaned)

    @staticmethod
    def get_quoted_identifier_for_database(db_name):
        return f'"{db_name}"'

    @staticmethod
    def get_quoted_identifier_for_schema(db_name, schema_name):
        return ".".join(f'"{part}"' for part in (db_name, schema_name))

    @staticmethod
    def get_quoted_identifier_for_table(db_name, schema_name, table_name):
        return ".".join(f'"{part}"' for part in (db_name, schema_name, table_name))
class SnowflakeCommonMixin(SnowflakeStructuredReportMixin):
    """Shared helpers for Snowflake source classes carrying a V2 config and report."""

    platform = "snowflake"

    config: SnowflakeV2Config
    report: SnowflakeV2Report

    @property
    def structured_reporter(self) -> SourceReport:
        return self.report

    @cached_property
    def identifiers(self) -> SnowflakeIdentifierBuilder:
        return SnowflakeIdentifierBuilder(self.config, self.report)

    # Note - decide how to construct user urns.
    # Historically urns were created using part before @ from user's email.
    # Users without email were skipped from both user entries as well as aggregates.
    # However email is not mandatory field in snowflake user, user_name is always present.
    def get_user_identifier(
        self,
        user_name: str,
        user_email: Optional[str],
        email_as_user_identifier: bool,
    ) -> str:
        # No email: fall back to the (always-present) user name.
        if not user_email:
            return self.identifiers.snowflake_identifier(user_name)
        if email_as_user_identifier is True:
            return self.identifiers.snowflake_identifier(user_email)
        # Otherwise use the local part of the email (text before "@").
        return self.identifiers.snowflake_identifier(user_email.split("@")[0])

    # TODO: Revisit this after stateful ingestion can commit checkpoint
    # for failures that do not affect the checkpoint
    # TODO: Add additional parameters to match the signature of the .warning and .failure methods
    def warn_if_stateful_else_error(self, key: str, reason: str) -> None:
        stateful = self.config.stateful_ingestion
        if stateful is not None and stateful.enabled:
            self.structured_reporter.warning(key, reason)
        else:
            self.structured_reporter.failure(key, reason)