# Mirror of https://github.com/datahub-project/datahub.git
# (synced 2025-11-11 17:03:33 +00:00; 133 lines, 4.0 KiB, Python)
from datahub.emitter.mcp_builder import DatabaseKey, SchemaKey
|
|
from datahub.metadata.urns import CorpUserUrn, TagUrn
|
|
from datahub.sdk import Container, DataHubClient, Dataset
|
|
|
|
# Client obtained via DataHubClient.from_env(); it is used at the bottom of
# this script to upsert every entity defined here.
client = DataHubClient.from_env()

# Create a complete container hierarchy for a Snowflake database structure
# Hierarchy: Platform -> Database -> Schema -> Dataset

# Step 1: Create the database container (top-level)
# The key identifies the database; the schema keys further down repeat the
# same platform / instance / database values.
database_key = DatabaseKey(
    platform="snowflake",
    instance="production",
    database="analytics_db",
)

database_container = Container(
    database_key,
    display_name="Analytics Database",
    description="Main analytics database for business intelligence",
    subtype="Database",
    parent_container=None,  # top of the hierarchy: no enclosing container
)
|
|
|
|
# Step 2: Create schema containers within the database
# "reporting" schema: same platform / instance / database as database_key,
# plus the schema name.
reporting_schema_key = SchemaKey(
    platform="snowflake",
    instance="production",
    database="analytics_db",
    schema="reporting",
)

reporting_schema_container = Container(
    reporting_schema_key,
    display_name="Reporting Schema",
    description="Schema for business reporting tables",
    subtype="Schema",
    parent_container=database_key,  # nest under the database from step 1
    tags=[TagUrn("reporting")],
)
|
|
|
|
# "metrics" schema: a second schema container under the same database.
metrics_schema_key = SchemaKey(
    platform="snowflake",
    instance="production",
    database="analytics_db",
    schema="metrics",
)

metrics_schema_container = Container(
    metrics_schema_key,
    display_name="Metrics Schema",
    description="Schema for aggregated metrics and KPIs",
    subtype="Schema",
    parent_container=database_key,  # nest under the analytics_db database
    tags=[TagUrn("metrics"), TagUrn("kpi")],
)
|
|
|
|
# Step 3: Create datasets within the schema containers
# Daily sales summary table, placed in the "reporting" schema container.
# NOTE(review): the containers are keyed with instance="production" but no
# platform instance is passed to this Dataset -- confirm that is intended.
sales_dataset = Dataset(
    platform="snowflake",
    name="analytics_db.reporting.sales_summary",
    env="PROD",
    description="Daily sales summary aggregations",
    parent_container=reporting_schema_key,
    # Each entry reads as (column name, column type, column description).
    schema=[
        ("date", "DATE", "Transaction date"),
        ("region", "VARCHAR", "Sales region"),
        ("revenue", "DECIMAL", "Total revenue in USD"),
        ("order_count", "INTEGER", "Number of orders"),
    ],
    # Each owner is given as an (owner URN, ownership type) pair.
    owners=[(CorpUserUrn("sales-team"), "DATAOWNER")],
    tags=[TagUrn("sales"), TagUrn("daily")],
)
|
|
|
|
# Monthly KPI table, placed in the "metrics" schema container.
# NOTE(review): the containers are keyed with instance="production" but no
# platform instance is passed to this Dataset -- confirm that is intended.
kpi_dataset = Dataset(
    platform="snowflake",
    name="analytics_db.metrics.monthly_kpis",
    env="PROD",
    description="Monthly key performance indicators",
    parent_container=metrics_schema_key,
    # Each entry reads as (column name, column type, column description).
    schema=[
        ("month", "DATE", "Month start date"),
        ("metric_name", "VARCHAR", "KPI metric name"),
        ("metric_value", "DECIMAL", "Metric value"),
        ("target_value", "DECIMAL", "Target value for the metric"),
    ],
    # Each owner is given as an (owner URN, ownership type) pair.
    owners=[(CorpUserUrn("analytics-team"), "DATAOWNER")],
    tags=[TagUrn("kpi"), TagUrn("monthly")],
)
|
|
|
|
# Emit all entities to DataHub in hierarchical order: the database first,
# then both schemas, then the datasets. After each upsert a short numbered
# report (name, URN and, below the top level, the parent) is printed.
print("Creating container hierarchy...")
print()

# Emit database (top level)
client.entities.upsert(database_container)
print(f"1. Database: {database_container.display_name}")
print(f" URN: {database_container.urn}")
print()

# Emit schemas (second level)
client.entities.upsert(reporting_schema_container)
print(f"2. Schema: {reporting_schema_container.display_name}")
print(f" URN: {reporting_schema_container.urn}")
print(f" Parent: {database_container.display_name}")
print()

client.entities.upsert(metrics_schema_container)
print(f"3. Schema: {metrics_schema_container.display_name}")
print(f" URN: {metrics_schema_container.urn}")
print(f" Parent: {database_container.display_name}")
print()

# Emit datasets (leaf level)
client.entities.upsert(sales_dataset)
print("4. Dataset: sales_summary")
print(f" URN: {sales_dataset.urn}")
print(f" Parent: {reporting_schema_container.display_name}")
print()

client.entities.upsert(kpi_dataset)
print("5. Dataset: monthly_kpis")
print(f" URN: {kpi_dataset.urn}")
print(f" Parent: {metrics_schema_container.display_name}")
print()
|
|
|
|
# Final summary: print the hierarchy that was just emitted as an indented
# tree. Every child is indented one level deeper than its parent so the
# nesting is visible; in particular monthly_kpis must appear *under* the
# metrics schema rather than beside it.
HIERARCHY_TREE = (
    " analytics_db (Database)",
    " ├── reporting (Schema)",
    " │   └── sales_summary (Dataset)",
    " └── metrics (Schema)",
    "     └── monthly_kpis (Dataset)",
)

print("Complete hierarchy created:")
print("\n".join(HIERARCHY_TREE))
|