# datahub/metadata-ingestion/examples/library/container_hierarchy_with_dataset.py
#
# 133 lines
# 4.0 KiB
# Python

from datahub.emitter.mcp_builder import DatabaseKey, SchemaKey
from datahub.metadata.urns import CorpUserUrn, TagUrn
from datahub.sdk import Container, DataHubClient, Dataset
# Client handle used at the end of the script to upsert every entity.
client = DataHubClient.from_env()

# This example assembles a complete container hierarchy for a Snowflake
# database structure:
#
#     Platform -> Database -> Schema -> Dataset
#
# Step 1: the database container. It is the top-level container, so it is
# created with no parent.
database_key = DatabaseKey(
    database="analytics_db",
    instance="production",
    platform="snowflake",
)
database_container = Container(
    database_key,
    subtype="Database",
    display_name="Analytics Database",
    description="Main analytics database for business intelligence",
    parent_container=None,
)
# Step 2: the schema containers. Both schemas sit inside the database from
# step 1, so each one names `database_key` as its parent container.

# Keys identifying the two schemas inside analytics_db.
reporting_schema_key = SchemaKey(
    schema="reporting",
    database="analytics_db",
    instance="production",
    platform="snowflake",
)
metrics_schema_key = SchemaKey(
    schema="metrics",
    database="analytics_db",
    instance="production",
    platform="snowflake",
)

# Container entities built from those keys.
reporting_schema_container = Container(
    reporting_schema_key,
    subtype="Schema",
    display_name="Reporting Schema",
    description="Schema for business reporting tables",
    tags=[TagUrn("reporting")],
    parent_container=database_key,
)
metrics_schema_container = Container(
    metrics_schema_key,
    subtype="Schema",
    display_name="Metrics Schema",
    description="Schema for aggregated metrics and KPIs",
    tags=[TagUrn("metrics"), TagUrn("kpi")],
    parent_container=database_key,
)
# Step 3: the datasets. Each dataset is attached to a schema container by
# passing that schema's key as `parent_container`.
#
# NOTE(review): the container keys earlier in this script use
# instance="production", while these datasets pass no platform instance.
# Confirm that this is intended.

# Column definitions, written as (name, type, description) tuples.
sales_summary_columns = [
    ("date", "DATE", "Transaction date"),
    ("region", "VARCHAR", "Sales region"),
    ("revenue", "DECIMAL", "Total revenue in USD"),
    ("order_count", "INTEGER", "Number of orders"),
]
monthly_kpi_columns = [
    ("month", "DATE", "Month start date"),
    ("metric_name", "VARCHAR", "KPI metric name"),
    ("metric_value", "DECIMAL", "Metric value"),
    ("target_value", "DECIMAL", "Target value for the metric"),
]

# Lives in the "reporting" schema.
sales_dataset = Dataset(
    platform="snowflake",
    env="PROD",
    name="analytics_db.reporting.sales_summary",
    description="Daily sales summary aggregations",
    schema=sales_summary_columns,
    parent_container=reporting_schema_key,
    owners=[(CorpUserUrn("sales-team"), "DATAOWNER")],
    tags=[TagUrn("sales"), TagUrn("daily")],
)

# Lives in the "metrics" schema.
kpi_dataset = Dataset(
    platform="snowflake",
    env="PROD",
    name="analytics_db.metrics.monthly_kpis",
    description="Monthly key performance indicators",
    schema=monthly_kpi_columns,
    parent_container=metrics_schema_key,
    owners=[(CorpUserUrn("analytics-team"), "DATAOWNER")],
    tags=[TagUrn("kpi"), TagUrn("monthly")],
)
# Emit all entities to DataHub in hierarchical order: every entity is upserted
# after the container it is nested under (database -> schemas -> datasets).
print("Creating container hierarchy...")
print()

# One row per entity, in emission order:
#   (kind label, entity to upsert, name to print, parent name to print)
# A parent name of None marks the top-level database, for which no "Parent:"
# line is printed. Datasets are reported by their short table name.
emission_plan = [
    ("Database", database_container, database_container.display_name, None),
    (
        "Schema",
        reporting_schema_container,
        reporting_schema_container.display_name,
        database_container.display_name,
    ),
    (
        "Schema",
        metrics_schema_container,
        metrics_schema_container.display_name,
        database_container.display_name,
    ),
    (
        "Dataset",
        sales_dataset,
        "sales_summary",
        reporting_schema_container.display_name,
    ),
    (
        "Dataset",
        kpi_dataset,
        "monthly_kpis",
        metrics_schema_container.display_name,
    ),
]

for step, (kind, entity, name, parent_name) in enumerate(emission_plan, start=1):
    client.entities.upsert(entity)
    print(f"{step}. {kind}: {name}")
    print(f" URN: {entity.urn}")
    if parent_name is not None:
        print(f" Parent: {parent_name}")
    print()

# Summary tree of what was just emitted. Each dataset is indented beneath its
# own schema so the printed nesting matches the hierarchy built above.
print("Complete hierarchy created:")
print(" analytics_db (Database)")
print(" ├── reporting (Schema)")
print(" │   └── sales_summary (Dataset)")
print(" └── metrics (Schema)")
print("     └── monthly_kpis (Dataset)")