# 113 lines
# 3.5 KiB
# Python

# metadata-ingestion/examples/library/notebook_create.py
import logging
import time
from typing import Dict, Optional
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.emitter.rest_emitter import DatahubRestEmitter
from datahub.metadata.schema_classes import (
AuditStampClass,
ChangeAuditStampsClass,
NotebookInfoClass,
)
# Module-level logger named after this module; used by main() to report progress.
log = logging.getLogger(__name__)
# Configure root logging at INFO level. Note that this runs at import time.
logging.basicConfig(level=logging.INFO)
def create_notebook_metadata(
    notebook_urn: str,
    title: str,
    description: str,
    external_url: str,
    custom_properties: Optional[Dict[str, str]] = None,
    actor: str = "urn:li:corpuser:data_scientist",
    timestamp_millis: Optional[int] = None,
) -> MetadataChangeProposalWrapper:
    """
    Create metadata for a notebook entity.

    Builds a NotebookInfoClass aspect whose "created" and "lastModified"
    audit stamps share the same time and actor, and wraps it in a
    MetadataChangeProposalWrapper targeting ``notebook_urn``.

    Args:
        notebook_urn: URN of the notebook
        title: Title of the notebook
        description: Description of the notebook
        external_url: URL to access the notebook
        custom_properties: Optional dictionary of custom properties
            (an empty dict is used when omitted)
        actor: URN of the actor creating the notebook
        timestamp_millis: Optional timestamp in milliseconds. Defaults to
            the current time only when omitted (``None``); an explicit
            value, including ``0``, is used as given.

    Returns:
        MetadataChangeProposalWrapper containing the notebook metadata
    """
    # Test against None rather than truthiness: 0 is a valid epoch timestamp
    # and must not be silently replaced with "now".
    if timestamp_millis is None:
        timestamp_millis = int(time.time() * 1000)

    # The same stamp is reused for creation and last modification.
    audit_stamp = AuditStampClass(time=timestamp_millis, actor=actor)

    notebook_info = NotebookInfoClass(
        title=title,
        description=description,
        externalUrl=external_url,
        customProperties=custom_properties or {},
        changeAuditStamps=ChangeAuditStampsClass(
            created=audit_stamp,
            lastModified=audit_stamp,
        ),
    )

    return MetadataChangeProposalWrapper(
        entityUrn=notebook_urn,
        aspect=notebook_info,
    )
def main(emitter: Optional[DatahubRestEmitter] = None) -> None:
    """
    Main function to create a notebook example.

    Builds the metadata for a sample Querybook notebook and emits it
    through ``emitter``.

    Args:
        emitter: Optional emitter to use (for testing). If not provided, creates a new one.

    Environment Variables:
        DATAHUB_GMS_URL: DataHub GMS server URL (default: http://localhost:8080)
        DATAHUB_GMS_TOKEN: DataHub access token (if authentication is required).
            When unset or empty, the token from the default DataHub graph
            configuration is tried; if that lookup fails, no token is used.
    """
    if emitter is None:
        import os

        gms_server = os.getenv("DATAHUB_GMS_URL", "http://localhost:8080")
        token = os.getenv("DATAHUB_GMS_TOKEN")

        # If no token in env, try to get from datahub config
        if not token:
            try:
                from datahub.ingestion.graph.client import get_default_graph

                graph = get_default_graph()
                token = graph.config.token
            except Exception:
                # Best-effort lookup: fall back to no token, but leave a
                # trace (at DEBUG) instead of discarding the failure silently.
                log.debug(
                    "Could not obtain a token from the DataHub config; "
                    "continuing without one",
                    exc_info=True,
                )

        emitter = DatahubRestEmitter(gms_server=gms_server, token=token)

    notebook_urn = "urn:li:notebook:(querybook,customer_analysis_2024)"

    event = create_notebook_metadata(
        notebook_urn=notebook_urn,
        title="Customer Segmentation Analysis 2024",
        description="Comprehensive analysis of customer segments including RFM analysis, cohort analysis, and predictive scoring for marketing campaigns",
        external_url="https://querybook.company.com/notebook/customer_analysis_2024",
        custom_properties={
            "workspace": "analytics",
            "team": "growth",
            "last_run": "2024-01-15T10:30:00Z",
        },
    )

    emitter.emit(event)
    # Lazy %-style arguments; the emitted message is unchanged.
    log.info("Created notebook %s", notebook_urn)
if __name__ == "__main__":
    # Run the example only when executed as a script, not when imported.
    main()