2021-04-05 19:11:28 -07:00
""" Convenience functions for creating MCEs """
2021-07-22 21:50:44 -07:00
import logging
2021-10-03 14:54:34 -07:00
import re
2021-04-05 19:11:28 -07:00
import time
2021-11-03 21:39:52 -07:00
from typing import Any , List , Optional , Type , TypeVar , cast , get_type_hints
2021-07-17 20:02:07 -07:00
import typing_inspect
from avrogen . dict_wrapper import DictWrapper
2021-04-05 19:11:28 -07:00
2021-04-30 21:10:12 -07:00
from datahub . metadata . schema_classes import (
2021-10-11 21:54:57 +02:00
DatasetKeyClass ,
2021-04-05 19:11:28 -07:00
DatasetLineageTypeClass ,
DatasetSnapshotClass ,
MetadataChangeEventClass ,
2021-09-02 07:44:03 +02:00
OwnershipTypeClass ,
2021-04-05 19:11:28 -07:00
UpstreamClass ,
UpstreamLineageClass ,
)
2021-04-12 17:40:15 -07:00
# Default environment segment; used as the default `env` when building dataset URNs.
DEFAULT_ENV = "PROD"
# Default cluster segment; used as the default `cluster` for data flow / data job URNs.
DEFAULT_FLOW_CLUSTER = "prod"
# corpuser URN for an unknown user.
# NOTE(review): presumably a fallback actor/owner; its usage is not visible in this file.
UNKNOWN_USER = "urn:li:corpuser:unknown"

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
2021-05-11 17:46:39 -07:00
def get_sys_time() -> int:
    """Return the current wall-clock time as integer milliseconds since the epoch."""
    # TODO deprecate this
    millis = time.time() * 1000
    return int(millis)
2021-04-12 17:40:15 -07:00
2021-07-22 21:50:44 -07:00
def _check_data_platform_name(platform_name: str) -> None:
    """Log a warning if `platform_name` is not purely alphabetic.

    This is advisory only: nothing is raised and nothing is returned. Note that
    `str.isalpha()` is False for the empty string, so an empty name also warns.
    """
    if platform_name.isalpha():
        return
    logger.warning(f"improperly formatted data platform: {platform_name}")
2021-07-22 13:25:07 -07:00
def make_data_platform_urn(platform: str) -> str:
    """Return the data platform URN for `platform`.

    If `platform` already starts with the ``urn:li:dataPlatform:`` prefix it is
    returned unchanged. Otherwise the name is passed through
    `_check_data_platform_name` (which only logs a warning) and then prefixed.
    """
    prefix = "urn:li:dataPlatform:"
    if platform.startswith(prefix):
        # Already a full URN; don't double-prefix.
        return platform

    _check_data_platform_name(platform)
    return f"{prefix}{platform}"
2021-04-20 20:44:38 -07:00
def make_dataset_urn(platform: str, name: str, env: str = DEFAULT_ENV) -> str:
    """Build a dataset URN of the form ``urn:li:dataset:(<platform urn>,<name>,<env>)``.

    `platform` may be a bare platform name or a full data platform URN; it is
    normalized with `make_data_platform_urn`.
    """
    platform_urn = make_data_platform_urn(platform)
    return f"urn:li:dataset:({platform_urn},{name},{env})"
2021-04-05 19:11:28 -07:00
2021-10-11 21:54:57 +02:00
def dataset_urn_to_key(dataset_urn: str) -> Optional[DatasetKeyClass]:
    """Parse a dataset URN into a `DatasetKeyClass`.

    Returns None when `dataset_urn` does not contain a
    ``urn:li:dataset:(urn:li:dataPlatform:<platform>,<name>,<origin>)`` pattern.
    """
    # The platform group excludes commas so that a dataset name which itself
    # contains commas is not partially absorbed into the platform; the name
    # group stays greedy, leaving only the final comma-separated field as origin.
    pattern = r"urn:li:dataset:\(urn:li:dataPlatform:([^,]*),(.*),(.*)\)"
    match = re.search(pattern, dataset_urn)
    if match is None:
        return None

    # NOTE(review): `platform` is the bare name captured after the
    # urn:li:dataPlatform: prefix - confirm DatasetKeyClass expects that
    # rather than the full platform URN.
    platform, name, origin = match.group(1), match.group(2), match.group(3)
    return DatasetKeyClass(platform=platform, name=name, origin=origin)
2021-04-20 20:44:38 -07:00
def make_user_urn(username: str) -> str:
    """Return the corpuser URN for `username`.

    An input that already starts with ``urn:li:corpuser:`` is returned
    unchanged, so the function is safe to call on an existing URN.
    """
    prefix = "urn:li:corpuser:"
    if username.startswith(prefix):
        # Avoid producing urn:li:corpuser:urn:li:corpuser:...
        return username
    return f"{prefix}{username}"
2021-09-01 15:10:12 -07:00
def make_group_urn(groupname: str) -> str:
    """Return the corpGroup URN for `groupname`.

    An input that already starts with ``urn:li:corpGroup:`` is returned
    unchanged, so the function is safe to call on an existing URN.
    """
    prefix = "urn:li:corpGroup:"
    if groupname.startswith(prefix):
        # Avoid double-prefixing an existing group URN.
        return groupname
    return f"{prefix}{groupname}"
2021-05-10 19:26:55 -07:00
def make_tag_urn(tag: str) -> str:
    """Return the tag URN for `tag`.

    An input that already starts with ``urn:li:tag:`` is returned unchanged,
    so the function is safe to call on an existing URN.
    """
    prefix = "urn:li:tag:"
    if tag.startswith(prefix):
        # Avoid double-prefixing an existing tag URN.
        return tag
    return f"{prefix}{tag}"
2021-04-12 17:40:15 -07:00
def make_data_flow_urn(
    orchestrator: str, flow_id: str, cluster: str = DEFAULT_FLOW_CLUSTER
) -> str:
    """Build a data flow URN: ``urn:li:dataFlow:(<orchestrator>,<flow_id>,<cluster>)``.

    The three parts are interpolated as given; no validation is performed.
    """
    parts = ",".join((orchestrator, flow_id, cluster))
    return f"urn:li:dataFlow:({parts})"
2021-04-20 20:44:38 -07:00
def make_data_job_urn_with_flow(flow_urn: str, job_id: str) -> str:
    """Build a data job URN from an existing flow URN: ``urn:li:dataJob:(<flow_urn>,<job_id>)``.

    Both parts are interpolated as given; no validation is performed.
    """
    return "urn:li:dataJob:(" + flow_urn + "," + job_id + ")"
def make_data_job_urn(
    orchestrator: str, flow_id: str, job_id: str, cluster: str = DEFAULT_FLOW_CLUSTER
) -> str:
    """Build a data job URN from the parts of its parent flow plus `job_id`.

    The flow URN is built with `make_data_flow_urn` and then combined with
    `job_id` by `make_data_job_urn_with_flow`.
    """
    flow_urn = make_data_flow_urn(orchestrator, flow_id, cluster)
    return make_data_job_urn_with_flow(flow_urn, job_id)
2021-07-22 21:50:44 -07:00
def make_dashboard_urn(platform: str, name: str) -> str:
    """Build a dashboard URN: ``urn:li:dashboard:(<platform>,<name>)``.

    `platform` is used as a bare name (not a data platform URN). It is passed
    through `_check_data_platform_name`, which only logs a warning on a
    non-alphabetic name.
    """
    # FIXME: dashboards don't currently include data platform urn prefixes.
    _check_data_platform_name(platform)
    return f"urn:li:dashboard:({platform},{name})"
def make_chart_urn(platform: str, name: str) -> str:
    """Build a chart URN: ``urn:li:chart:(<platform>,<name>)``.

    `platform` is used as a bare name (not a data platform URN). It is passed
    through `_check_data_platform_name`, which only logs a warning on a
    non-alphabetic name.
    """
    # FIXME: charts don't currently include data platform urn prefixes.
    _check_data_platform_name(platform)
    return f"urn:li:chart:({platform},{name})"
2021-06-09 15:07:04 -07:00
def make_ml_primary_key_urn(feature_table_name: str, primary_key_name: str) -> str:
    """Build an ML primary key URN: ``urn:li:mlPrimaryKey:(<feature table>,<primary key>)``.

    Both parts are interpolated as given; no validation is performed.
    """
    body = f"{feature_table_name},{primary_key_name}"
    return f"urn:li:mlPrimaryKey:({body})"
def make_ml_feature_urn(
    feature_table_name: str,
    feature_name: str,
) -> str:
    """Build an ML feature URN: ``urn:li:mlFeature:(<feature table>,<feature>)``.

    Both parts are interpolated as given; no validation is performed.
    """
    body = f"{feature_table_name},{feature_name}"
    return f"urn:li:mlFeature:({body})"
def make_ml_feature_table_urn(platform: str, feature_table_name: str) -> str:
    """Build an ML feature table URN: ``urn:li:mlFeatureTable:(<platform urn>,<feature table>)``.

    `platform` may be a bare name or a full data platform URN; it is normalized
    with `make_data_platform_urn`.
    """
    platform_urn = make_data_platform_urn(platform)
    return f"urn:li:mlFeatureTable:({platform_urn},{feature_table_name})"
2021-06-09 15:07:04 -07:00
2021-07-08 16:16:16 -07:00
def make_ml_model_urn(platform: str, model_name: str, env: str) -> str:
    """Build an ML model URN: ``urn:li:mlModel:(<platform urn>,<model name>,<env>)``.

    `platform` may be a bare name or a full data platform URN; it is normalized
    with `make_data_platform_urn`.
    """
    platform_urn = make_data_platform_urn(platform)
    return f"urn:li:mlModel:({platform_urn},{model_name},{env})"
2021-07-08 16:16:16 -07:00
2021-07-19 11:30:43 -07:00
def make_ml_model_deployment_urn(platform: str, deployment_name: str, env: str) -> str:
    """Build an ML model deployment URN.

    The result has the form
    ``urn:li:mlModelDeployment:(<platform urn>,<deployment name>,<env>)``.
    `platform` may be a bare name or a full data platform URN; it is normalized
    with `make_data_platform_urn`.
    """
    platform_urn = make_data_platform_urn(platform)
    return f"urn:li:mlModelDeployment:({platform_urn},{deployment_name},{env})"
2021-07-19 11:30:43 -07:00
def make_ml_model_group_urn(platform: str, group_name: str, env: str) -> str:
    """Build an ML model group URN: ``urn:li:mlModelGroup:(<platform urn>,<group name>,<env>)``.

    `platform` may be a bare name or a full data platform URN; it is normalized
    with `make_data_platform_urn`.
    """
    platform_urn = make_data_platform_urn(platform)
    return f"urn:li:mlModelGroup:({platform_urn},{group_name},{env})"
2021-07-19 11:30:43 -07:00
2021-09-02 07:44:03 +02:00
def is_valid_ownership_type(ownership_type: Optional[str]) -> bool:
    """Return True if `ownership_type` is one of the ownership types accepted here.

    None is never valid. The accepted values are the OwnershipTypeClass members
    DEVELOPER, DATAOWNER, DELEGATE, PRODUCER, CONSUMER and STAKEHOLDER.
    """
    if ownership_type is None:
        return False

    accepted = (
        OwnershipTypeClass.DEVELOPER,
        OwnershipTypeClass.DATAOWNER,
        OwnershipTypeClass.DELEGATE,
        OwnershipTypeClass.PRODUCER,
        OwnershipTypeClass.CONSUMER,
        OwnershipTypeClass.STAKEHOLDER,
    )
    return ownership_type in accepted
def validate_ownership_type(ownership_type: Optional[str]) -> str:
    """Return `ownership_type` unchanged if it is valid.

    Raises:
        ValueError: if `is_valid_ownership_type` rejects the value (including None).
    """
    if not is_valid_ownership_type(ownership_type):
        raise ValueError(f"Unexpected ownership type: {ownership_type}")
    # The validity check rules out None, so narrowing to str is safe.
    return cast(str, ownership_type)
2021-04-05 19:11:28 -07:00
def make_lineage_mce(
    upstream_urns: List[str],
    downstream_urn: str,
    lineage_type: str = DatasetLineageTypeClass.TRANSFORMED,
) -> MetadataChangeEventClass:
    """Create an MCE declaring `upstream_urns` as upstreams of `downstream_urn`.

    The returned event wraps a dataset snapshot for `downstream_urn` whose only
    aspect is an UpstreamLineage with one Upstream entry per upstream URN, each
    tagged with `lineage_type`. An empty `upstream_urns` gives an empty
    upstreams list.
    """
    upstreams = [
        UpstreamClass(dataset=upstream_urn, type=lineage_type)
        for upstream_urn in upstream_urns
    ]
    lineage = UpstreamLineageClass(upstreams=upstreams)
    snapshot = DatasetSnapshotClass(urn=downstream_urn, aspects=[lineage])
    return MetadataChangeEventClass(proposedSnapshot=snapshot)
2021-05-11 17:46:39 -07:00
2021-07-17 20:02:07 -07:00
# Type variable for aspect classes, used to tie an aspect type argument to the
# matching return type in the helpers in this module.
# This bound isn't tight, but it's better than nothing.
Aspect = TypeVar("Aspect", bound=DictWrapper)
def can_add_aspect(mce: MetadataChangeEventClass, AspectType: Type[Aspect]) -> bool:
    """Return True if `AspectType` is allowed in the aspects list of `mce`'s snapshot.

    The allowed aspect types are read from the type hint of the ``aspects``
    parameter of the snapshot class's ``__init__``. The first type argument of
    that hint is taken as the union of supported aspect types.
    """
    snapshot_cls = type(mce.proposedSnapshot)
    init_hints = get_type_hints(snapshot_cls.__init__)

    # First type argument of the `aspects` annotation: the union of aspect types.
    aspect_union = typing_inspect.get_args(init_hints["aspects"])[0]
    if isinstance(aspect_union, tuple):
        # On Python 3.6, the union type is represented as a tuple, where
        # the first item is typing.Union and the subsequent elements are
        # the types within the union.
        supported_aspect_types = aspect_union[1:]
    else:
        supported_aspect_types = typing_inspect.get_args(aspect_union)

    return issubclass(AspectType, supported_aspect_types)
2021-05-11 17:46:39 -07:00
def get_aspect_if_available(
    mce: MetadataChangeEventClass, AspectType: Type[Aspect]
) -> Optional[Aspect]:
    """Return the single aspect of `AspectType` in `mce`'s snapshot, or None if absent.

    Asserts that `AspectType` is supported by the snapshot (see `can_add_aspect`).

    Raises:
        ValueError: if the snapshot holds more than one aspect of that type.
    """
    assert can_add_aspect(mce, AspectType)

    matches: List[Aspect] = [
        candidate
        for candidate in mce.proposedSnapshot.aspects
        if isinstance(candidate, AspectType)
    ]
    if len(matches) > 1:
        raise ValueError(
            f"MCE contains multiple aspects of type {AspectType}: {matches}"
        )
    return matches[0] if matches else None
2021-11-03 21:39:52 -07:00
def remove_aspect_if_available(
    mce: MetadataChangeEventClass, aspect_type: Type[Aspect]
) -> bool:
    """Remove every aspect of `aspect_type` from `mce`'s snapshot.

    Asserts that `aspect_type` is supported by the snapshot (see `can_add_aspect`).
    The snapshot's aspects list is replaced by a new filtered list.

    Returns:
        True if at least one aspect was removed, False otherwise.
    """
    assert can_add_aspect(mce, aspect_type)

    snapshot = mce.proposedSnapshot
    original_count = len(snapshot.aspects)
    # loose type annotations since we checked before
    kept: List[Any] = [
        aspect for aspect in snapshot.aspects if not isinstance(aspect, aspect_type)
    ]
    snapshot.aspects = kept
    return len(kept) != original_count
2021-07-17 20:02:07 -07:00
def get_or_add_aspect(mce: MetadataChangeEventClass, default: Aspect) -> Aspect:
    """Return the existing aspect of `default`'s type, or add and return `default`.

    If the snapshot has no aspect of ``type(default)``, `default` is appended to
    the snapshot's aspects list in place. Errors raised by
    `get_aspect_if_available` (e.g. duplicate aspects) propagate unchanged.
    """
    existing = get_aspect_if_available(mce, type(default))
    if existing is None:
        mce.proposedSnapshot.aspects.append(default)  # type: ignore
        return default
    return existing
2021-11-03 21:39:52 -07:00
def set_aspect(
    mce: MetadataChangeEventClass, aspect: Optional[Aspect], aspect_type: Type[Aspect]
) -> None:
    """Set the aspect of `aspect_type` on `mce` to `aspect`, overwriting any previous value.

    Any existing aspects of `aspect_type` are removed first. If `aspect` is
    None, nothing is added, so the call simply clears the existing value.
    """
    remove_aspect_if_available(mce, aspect_type)
    if aspect is None:
        return
    mce.proposedSnapshot.aspects.append(aspect)  # type: ignore