# datahub/metadata-ingestion/examples/library/version_set_link_multiple_versions.py
#
# 132 lines
# 4.1 KiB
# Python

# metadata-ingestion/examples/library/version_set_link_multiple_versions.py
"""
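Link multiple versions of an entity to a version set.

This example creates a complete version history for an ML model: it links
several model versions, labelled with semantic version numbers, to a single
version set and then records the newest one as the set's latest version.

Note: the versions are emitted with the LEXICOGRAPHIC_STRING versioning scheme,
so their sort IDs are presumably compared as plain strings (e.g. "10.0.0" would
order before "2.0.0"). Confirm against the DataHub documentation before reusing
this pattern with multi-digit version components.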
"""
from typing import TypedDict
from datahub.emitter.mce_builder import datahub_guid
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.emitter.rest_emitter import DatahubRestEmitter
from datahub.metadata.schema_classes import (
AuditStampClass,
MetadataAttributionClass,
VersioningSchemeClass,
VersionPropertiesClass,
VersionSetPropertiesClass,
VersionTagClass,
)
# Base URL handed to the emitter as its `gms_server`; this example targets a
# server on localhost, port 8080.
server = "http://localhost:8080"
# REST emitter used by the rest of the script to send every metadata change
# proposal to `server`.
emitter = DatahubRestEmitter(gms_server=server)
class VersionInfo(TypedDict):
    """Describe one entity version that is linked to the version set.

    Each entry supplies the values that the linking loop in this script copies
    into the version-properties aspect emitted for that entity.
    """

    # URN of the entity being versioned; used as the proposal's entity URN.
    urn: str
    # Version label; becomes the version tag of the emitted aspect.
    version: str
    # Value passed through as the aspect's sort ID.
    sortId: str
    # Free-text note stored as the aspect's comment.
    comment: str
    # Time recorded in the attribution and audit stamps
    # (the sample values look like epoch milliseconds -- TODO confirm).
    timestamp: int
    # Extra labels for the version; each one is emitted as an alias tag.
    aliases: list[str]
    # URN of the actor credited in the attribution and source-created stamp.
    actor: str
# Build the version set URN: compute a GUID over the fields that identify the
# model family, then embed it together with the entity type.
guid_dict = {
    "platform": "pytorch",
    "name": "sentiment-analyzer",
}
version_set_id = datahub_guid(guid_dict)
version_set_urn = f"urn:li:versionSet:({version_set_id},mlModel)"
# The model versions to link, listed oldest first. The final entry is the one
# recorded as the latest version when the version set is updated later on.
versions: list[VersionInfo] = [
    VersionInfo(
        urn="urn:li:mlModel:(urn:li:dataPlatform:pytorch,sentiment-analyzer-v1,PROD)",
        version="1.0.0",
        sortId="1.0.0",
        comment="Initial release with basic sentiment analysis",
        timestamp=1672531200000,
        aliases=["v1"],
        actor="urn:li:corpuser:data-scientist",
    ),
    VersionInfo(
        urn="urn:li:mlModel:(urn:li:dataPlatform:pytorch,sentiment-analyzer-v1.1,PROD)",
        version="1.1.0",
        sortId="1.1.0",
        comment="Minor improvements to accuracy",
        timestamp=1675209600000,
        aliases=["v1.1"],
        actor="urn:li:corpuser:data-scientist",
    ),
    VersionInfo(
        urn="urn:li:mlModel:(urn:li:dataPlatform:pytorch,sentiment-analyzer-v2,PROD)",
        version="2.0.0",
        sortId="2.0.0",
        comment="Major update with multi-language support",
        timestamp=1677628800000,
        aliases=["v2", "latest", "production"],
        actor="urn:li:corpuser:ml-engineer",
    ),
]
# Link each version to the version set.
# No per-version "latest" flag is computed here: the latest version is recorded
# on the version set itself once every version has been linked.
for version_info in versions:
    # Build the version properties aspect that ties this model to the set.
    version_properties = VersionPropertiesClass(
        versionSet=version_set_urn,
        # Version label, attributed to the actor and time recorded for it.
        version=VersionTagClass(
            versionTag=version_info["version"],
            metadataAttribution=MetadataAttributionClass(
                time=version_info["timestamp"],
                actor=version_info["actor"],
            ),
        ),
        sortId=version_info["sortId"],
        versioningScheme=VersioningSchemeClass.LEXICOGRAPHIC_STRING,
        comment=version_info["comment"],
        # Every alias becomes its own tag, without attribution.
        aliases=[
            VersionTagClass(versionTag=alias) for alias in version_info["aliases"]
        ],
        # Stamp crediting the version's own actor.
        sourceCreatedTimestamp=AuditStampClass(
            time=version_info["timestamp"],
            actor=version_info["actor"],
        ),
        # Same time value, but credited to the generic "datahub" user.
        metadataCreatedTimestamp=AuditStampClass(
            time=version_info["timestamp"],
            actor="urn:li:corpuser:datahub",
        ),
    )

    # Emit the aspect against the model's own URN.
    model_version_mcp = MetadataChangeProposalWrapper(
        entityUrn=version_info["urn"],
        aspect=version_properties,
    )
    emitter.emit(model_version_mcp)
    print(f"Linked version {version_info['version']}: {version_info['urn']}")
# Record the last entry of `versions` as the version set's latest version and
# attach descriptive custom properties to the set.
version_set_properties = VersionSetPropertiesClass(
    latest=versions[-1]["urn"],
    versioningScheme=VersioningSchemeClass.LEXICOGRAPHIC_STRING,
    customProperties={
        "model_type": "sentiment_analysis",
        "language_support": "multi-language",
        "framework": "pytorch",
    },
)

# Emit the properties against the version set's own URN.
version_set_mcp = MetadataChangeProposalWrapper(
    entityUrn=version_set_urn,
    aspect=version_set_properties,
)
emitter.emit(version_set_mcp)

# Summary of what was written, one item per output line.
print(
    f"\nVersion set created: {version_set_urn}",
    f"Total versions: {len(versions)}",
    f"Latest version: {versions[-1]['version']}",
    sep="\n",
)