"""Convenience functions for creating MCEs"""
import json
import logging
import re
import time
from enum import Enum
from hashlib import md5
from typing import Any, List, Optional, Set, Type, TypeVar, Union, cast, get_type_hints

import typing_inspect
from avrogen.dict_wrapper import DictWrapper

from datahub.configuration.source_common import DEFAULT_ENV as DEFAULT_ENV_CONFIGURATION
from datahub.emitter.serialization_helper import pre_json_transform
from datahub.metadata.com.linkedin.pegasus2avro.common import GlossaryTerms
from datahub.metadata.schema_classes import (
    AuditStampClass,
    ContainerKeyClass,
    DatasetKeyClass,
    DatasetLineageTypeClass,
    DatasetSnapshotClass,
    GlobalTagsClass,
    GlossaryTermAssociationClass,
    MetadataChangeEventClass,
    OwnerClass,
    OwnershipClass,
    OwnershipSourceClass,
    OwnershipSourceTypeClass,
    OwnershipTypeClass,
    TagAssociationClass,
    UpstreamClass,
    UpstreamLineageClass,
)

DEFAULT_ENV = DEFAULT_ENV_CONFIGURATION
DEFAULT_FLOW_CLUSTER = "prod"
UNKNOWN_USER = "urn:li:corpuser:unknown"
SQL_STYLE_PLATFORMS: Set[str] = {
    "athena",
    "bigquery",
    "druid",
    "hive",
    "mariadb",
    "mssql",
    "mysql",
    "oracle",
    "postgres",
    "redshift",
    "snowflake",
    "trino",
}

logger = logging.getLogger(__name__)


class OwnerType(Enum):
    USER = "corpuser"
    GROUP = "corpGroup"


def get_sys_time() -> int:
    # TODO deprecate this
    return int(time.time() * 1000)


def make_data_platform_urn(platform: str) -> str:
    if platform.startswith("urn:li:dataPlatform:"):
        return platform
    return f"urn:li:dataPlatform:{platform}"


def make_dataset_urn(platform: str, name: str, env: str = DEFAULT_ENV) -> str:
    # Use lower-case name for all SQL style datasets
    if platform in SQL_STYLE_PLATFORMS:
        name = name.lower()
    return f"urn:li:dataset:({make_data_platform_urn(platform)},{name},{env})"


def make_dataplatform_instance_urn(platform: str, instance: str) -> str:
    if instance.startswith("urn:li:dataPlatformInstance"):
        return instance
    else:
        return f"urn:li:dataPlatformInstance:({make_data_platform_urn(platform)},{instance})"


def make_dataset_urn_with_platform_instance(
    platform: str, name: str, platform_instance: Optional[str], env: str = DEFAULT_ENV
) -> str:
    if platform_instance:
        # Use lower-case name for all SQL style datasets
        if platform in SQL_STYLE_PLATFORMS:
            name = name.lower()

        return f"urn:li:dataset:({make_data_platform_urn(platform)},{platform_instance}.{name},{env})"
    else:
        return make_dataset_urn(platform=platform, name=name, env=env)
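
# Illustrative example (again assuming DEFAULT_ENV resolves to "PROD"); the platform
# instance is prepended to the dataset name:
#   make_dataset_urn_with_platform_instance("mysql", "db1.users", "cluster_1")
#   -> "urn:li:dataset:(urn:li:dataPlatform:mysql,cluster_1.db1.users,PROD)"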


def make_schema_field_urn(parent_urn: str, field_path: str) -> str:
    assert parent_urn.startswith("urn:li:"), "Schema field's parent must be an urn"
    return f"urn:li:schemaField:({parent_urn},{field_path})"


def dataset_urn_to_key(dataset_urn: str) -> Optional[DatasetKeyClass]:
    pattern = r"urn:li:dataset:\(urn:li:dataPlatform:(.*),(.*),(.*)\)"
    results = re.search(pattern, dataset_urn)
    if results is not None:
        return DatasetKeyClass(
            platform=results.group(1), name=results.group(2), origin=results.group(3)
        )
    return None


def make_container_new_urn(guid: str) -> str:
    return f"urn:dh:container:0:({guid})"


def container_new_urn_to_key(dataset_urn: str) -> Optional[ContainerKeyClass]:
    pattern = r"urn:dh:container:0:\((.*)\)"
    results = re.search(pattern, dataset_urn)
    if results is not None:
        return ContainerKeyClass(
            guid=results.group(1),
        )
    return None


# def make_container_urn(platform: str, name: str, env: str = DEFAULT_ENV) -> str:
#     return f"urn:li:container:({make_data_platform_urn(platform)},{env},{name})"


def make_container_urn(guid: str) -> str:
    return f"urn:li:container:{guid}"


def container_urn_to_key(guid: str) -> Optional[ContainerKeyClass]:
    pattern = r"urn:li:container:(.*)"
    results = re.search(pattern, guid)
    if results is not None:
        return ContainerKeyClass(
            guid=results.group(1),
        )
    return None


def datahub_guid(obj: dict) -> str:
    obj_str = json.dumps(
        pre_json_transform(obj), separators=(",", ":"), sort_keys=True
    ).encode("utf-8")
    datahub_guid = md5(obj_str).hexdigest()
    return datahub_guid
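
# Note: the GUID is the md5 hex digest (32 hex characters) of the canonical JSON
# form of `obj` (sorted keys, compact separators), so the same key dict always
# produces the same GUID. Illustrative call with a hypothetical key dict:
#   datahub_guid({"platform": "urn:li:dataPlatform:mysql", "instance": "cluster_1"})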


def make_assertion_urn(assertion_id: str) -> str:
    return f"urn:li:assertion:{assertion_id}"


def make_user_urn(username: str) -> str:
    return f"urn:li:corpuser:{username}"


def make_group_urn(groupname: str) -> str:
    return f"urn:li:corpGroup:{groupname}"


def make_tag_urn(tag: str) -> str:
    return f"urn:li:tag:{tag}"


def make_owner_urn(owner: str, owner_type: OwnerType) -> str:
    return f"urn:li:{owner_type.value}:{owner}"


def make_term_urn(term: str) -> str:
    return f"urn:li:glossaryTerm:{term}"


def make_data_flow_urn(
    orchestrator: str, flow_id: str, cluster: str = DEFAULT_FLOW_CLUSTER
) -> str:
    return f"urn:li:dataFlow:({orchestrator},{flow_id},{cluster})"


def make_data_job_urn_with_flow(flow_urn: str, job_id: str) -> str:
    return f"urn:li:dataJob:({flow_urn},{job_id})"


def make_data_job_urn(
    orchestrator: str, flow_id: str, job_id: str, cluster: str = DEFAULT_FLOW_CLUSTER
) -> str:
    return make_data_job_urn_with_flow(
        make_data_flow_urn(orchestrator, flow_id, cluster), job_id
    )
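
# Illustrative example (with the default cluster "prod"):
#   make_data_job_urn("airflow", "my_dag", "my_task")
#   -> "urn:li:dataJob:(urn:li:dataFlow:(airflow,my_dag,prod),my_task)"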


def make_dashboard_urn(platform: str, name: str) -> str:
    # FIXME: dashboards don't currently include data platform urn prefixes.
    return f"urn:li:dashboard:({platform},{name})"


def make_chart_urn(platform: str, name: str) -> str:
    # FIXME: charts don't currently include data platform urn prefixes.
    return f"urn:li:chart:({platform},{name})"


def make_domain_urn(domain: str) -> str:
    if domain.startswith("urn:li:domain:"):
        return domain
    return f"urn:li:domain:{domain}"


def make_ml_primary_key_urn(feature_table_name: str, primary_key_name: str) -> str:
    return f"urn:li:mlPrimaryKey:({feature_table_name},{primary_key_name})"


def make_ml_feature_urn(
    feature_table_name: str,
    feature_name: str,
) -> str:
    return f"urn:li:mlFeature:({feature_table_name},{feature_name})"


def make_ml_feature_table_urn(platform: str, feature_table_name: str) -> str:
    return f"urn:li:mlFeatureTable:({make_data_platform_urn(platform)},{feature_table_name})"


def make_ml_model_urn(platform: str, model_name: str, env: str) -> str:
    return f"urn:li:mlModel:({make_data_platform_urn(platform)},{model_name},{env})"


def make_ml_model_deployment_urn(platform: str, deployment_name: str, env: str) -> str:
    return f"urn:li:mlModelDeployment:({make_data_platform_urn(platform)},{deployment_name},{env})"


def make_ml_model_group_urn(platform: str, group_name: str, env: str) -> str:
    return (
        f"urn:li:mlModelGroup:({make_data_platform_urn(platform)},{group_name},{env})"
    )


def is_valid_ownership_type(ownership_type: Optional[str]) -> bool:
    return ownership_type is not None and ownership_type in [
        OwnershipTypeClass.DEVELOPER,
        OwnershipTypeClass.DATAOWNER,
        OwnershipTypeClass.DELEGATE,
        OwnershipTypeClass.PRODUCER,
        OwnershipTypeClass.CONSUMER,
        OwnershipTypeClass.STAKEHOLDER,
    ]


def validate_ownership_type(ownership_type: Optional[str]) -> str:
    if is_valid_ownership_type(ownership_type):
        return cast(str, ownership_type)
    else:
        raise ValueError(f"Unexpected ownership type: {ownership_type}")


def make_lineage_mce(
    upstream_urns: List[str],
    downstream_urn: str,
    lineage_type: str = DatasetLineageTypeClass.TRANSFORMED,
) -> MetadataChangeEventClass:
    mce = MetadataChangeEventClass(
        proposedSnapshot=DatasetSnapshotClass(
            urn=downstream_urn,
            aspects=[
                UpstreamLineageClass(
                    upstreams=[
                        UpstreamClass(
                            dataset=upstream_urn,
                            type=lineage_type,
                        )
                        for upstream_urn in upstream_urns
                    ]
                )
            ],
        )
    )
    return mce
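
# Illustrative usage sketch (platform and table names are hypothetical):
#   lineage_mce = make_lineage_mce(
#       upstream_urns=[make_dataset_urn("hive", "db.upstream_table")],
#       downstream_urn=make_dataset_urn("hive", "db.downstream_table"),
#   )
# The resulting MCE can then be emitted with any of the DataHub emitters.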


# This bound isn't tight, but it's better than nothing.
Aspect = TypeVar("Aspect", bound=DictWrapper)


def can_add_aspect(mce: MetadataChangeEventClass, AspectType: Type[Aspect]) -> bool:
    SnapshotType = type(mce.proposedSnapshot)

    constructor_annotations = get_type_hints(SnapshotType.__init__)
    aspect_list_union = typing_inspect.get_args(constructor_annotations["aspects"])[0]
    if not isinstance(aspect_list_union, tuple):
        supported_aspect_types = typing_inspect.get_args(aspect_list_union)
    else:
        # On Python 3.6, the union type is represented as a tuple, where
        # the first item is typing.Union and the subsequent elements are
        # the types within the union.
        supported_aspect_types = aspect_list_union[1:]

    return issubclass(AspectType, supported_aspect_types)


def get_aspect_if_available(
    mce: MetadataChangeEventClass, AspectType: Type[Aspect]
) -> Optional[Aspect]:
    assert can_add_aspect(mce, AspectType)

    all_aspects = mce.proposedSnapshot.aspects
    aspects: List[Aspect] = [
        aspect for aspect in all_aspects if isinstance(aspect, AspectType)
    ]

    if len(aspects) > 1:
        raise ValueError(
            f"MCE contains multiple aspects of type {AspectType}: {aspects}"
        )
    if aspects:
        return aspects[0]
    return None


def remove_aspect_if_available(
    mce: MetadataChangeEventClass, aspect_type: Type[Aspect]
) -> bool:
    assert can_add_aspect(mce, aspect_type)
    # loose type annotations since we checked before
    aspects: List[Any] = [
        aspect
        for aspect in mce.proposedSnapshot.aspects
        if not isinstance(aspect, aspect_type)
    ]
    removed = len(aspects) != len(mce.proposedSnapshot.aspects)
    mce.proposedSnapshot.aspects = aspects
    return removed


def get_or_add_aspect(mce: MetadataChangeEventClass, default: Aspect) -> Aspect:
    existing = get_aspect_if_available(mce, type(default))
    if existing is not None:
        return existing
    mce.proposedSnapshot.aspects.append(default)  # type: ignore
    return default
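
# Illustrative usage sketch: fetch or create an ownership aspect on an MCE and
# mutate it in place (the owner urn is hypothetical):
#   ownership = get_or_add_aspect(mce, OwnershipClass(owners=[]))
#   ownership.owners.append(
#       OwnerClass(owner=make_user_urn("jdoe"), type=OwnershipTypeClass.DATAOWNER)
#   )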


def make_global_tag_aspect_with_tag_list(tags: List[str]) -> GlobalTagsClass:
    return GlobalTagsClass(
        tags=[TagAssociationClass(f"urn:li:tag:{tag}") for tag in tags]
    )
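
# Illustrative example:
#   make_global_tag_aspect_with_tag_list(["pii", "gold"])
#   -> GlobalTagsClass(tags=[TagAssociationClass("urn:li:tag:pii"),
#                            TagAssociationClass("urn:li:tag:gold")])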


def make_ownership_aspect_from_urn_list(
    owner_urns: List[str], source_type: Optional[Union[str, OwnershipSourceTypeClass]]
) -> OwnershipClass:
    for owner_urn in owner_urns:
        assert owner_urn.startswith("urn:li:corpuser:") or owner_urn.startswith(
            "urn:li:corpGroup:"
        )
    ownership_source_type: Union[None, OwnershipSourceClass] = None
    if source_type:
        ownership_source_type = OwnershipSourceClass(type=source_type)

    owners_list = [
        OwnerClass(
            owner=owner_urn,
            type=OwnershipTypeClass.DATAOWNER,
            source=ownership_source_type,
        )
        for owner_urn in owner_urns
    ]
    return OwnershipClass(
        owners=owners_list,
    )


def make_glossary_terms_aspect_from_urn_list(term_urns: List[str]) -> GlossaryTerms:
    for term_urn in term_urns:
        assert term_urn.startswith("urn:li:glossaryTerm:")
    glossary_terms = GlossaryTerms(
        [GlossaryTermAssociationClass(term_urn) for term_urn in term_urns],
        AuditStampClass(
            time=int(time.time() * 1000),
            actor="urn:li:corpuser:datahub",
        ),
    )
    return glossary_terms


def set_aspect(
    mce: MetadataChangeEventClass, aspect: Optional[Aspect], aspect_type: Type[Aspect]
) -> None:
    """Sets the aspect to the provided value, overwriting any previous value of that aspect type.
    If the passed-in aspect is None, the existing aspect value is removed instead."""
    remove_aspect_if_available(mce, aspect_type)
    if aspect is not None:
        mce.proposedSnapshot.aspects.append(aspect)  # type: ignore