178 lines
5.4 KiB
Python
Raw Normal View History

import pathlib
from datetime import datetime, timezone
import pytest
from datahub.emitter.mcp_builder import SchemaKey
from datahub.errors import SchemaFieldKeyError
from datahub.ingestion.source.common.subtypes import DatasetSubTypes
from datahub.metadata.urns import (
CorpUserUrn,
DatasetUrn,
DomainUrn,
GlossaryTermUrn,
TagUrn,
)
from datahub.sdk._attribution import KnownAttribution, change_default_attribution
from datahub.sdk.dataset import Dataset
from tests.test_helpers.sdk_v2_helpers import assert_entity_golden
_GOLDEN_DIR = pathlib.Path(__file__).parent / "dataset_golden"
def test_dataset_basic(pytestconfig: pytest.Config) -> None:
d = Dataset(
platform="bigquery",
name="proj.dataset.table",
subtype=DatasetSubTypes.TABLE,
schema=[
("field1", "string", "field1 description"),
("field2", "int64", "field2 description"),
],
)
# Check urn setup.
assert Dataset.get_urn_type() == DatasetUrn
assert isinstance(d.urn, DatasetUrn)
assert (
str(d.urn)
== "urn:li:dataset:(urn:li:dataPlatform:bigquery,proj.dataset.table,PROD)"
)
assert str(d.urn) in repr(d)
# Check most attributes.
assert d.platform_instance is None
assert d.tags is None
assert d.terms is None
assert d.created is None
assert d.last_modified is None
assert d.description is None
assert d.custom_properties == {}
assert d.domain is None
# TODO: The column descriptions should go in the editable fields, since we're not in ingestion mode.
assert len(d.schema) == 2
assert d["field1"].description == "field1 description"
with pytest.raises(SchemaFieldKeyError, match=r"Field .* not found"):
d["should_be_missing"]
with pytest.raises(AttributeError):
assert d.extra_attribute # type: ignore
with pytest.raises(AttributeError):
d.extra_attribute = "slots should reject extra fields" # type: ignore
with pytest.raises(AttributeError):
# This should fail. Eventually we should make it suggest calling set_owners instead.
d.owners = [] # type: ignore
assert_entity_golden(
pytestconfig, d, _GOLDEN_DIR / "test_dataset_basic_golden.json"
)
def _build_complex_dataset() -> Dataset:
schema = SchemaKey(
platform="snowflake",
instance="my_instance",
database="MY_DB",
schema="MY_SCHEMA",
)
created = datetime(2025, 1, 2, 3, 4, 5, tzinfo=timezone.utc)
updated = datetime(2025, 1, 9, 3, 4, 6, tzinfo=timezone.utc)
d = Dataset(
platform="snowflake",
platform_instance="my_instance",
name="my_db.my_schema.my_table",
container=schema,
subtype=DatasetSubTypes.TABLE,
schema=[
("field1", "string"),
("field2", "int64", "field2 description"),
],
display_name="MY_TABLE",
qualified_name="MY_DB.MY_SCHEMA.MY_TABLE",
created=created,
last_modified=updated,
custom_properties={
"key1": "value1",
"key2": "value2",
},
description="test",
external_url="https://example.com",
owners=[
CorpUserUrn("admin@datahubproject.io"),
],
tags=[
TagUrn("tag1"),
TagUrn("tag2"),
],
terms=[
GlossaryTermUrn("AccountBalance"),
],
domain=DomainUrn("Marketing"),
)
assert d.platform_instance is not None
assert (
str(d.platform_instance)
== "urn:li:dataPlatformInstance:(urn:li:dataPlatform:snowflake,my_instance)"
)
assert d.subtype == "Table"
assert d.description == "test"
assert d.display_name == "MY_TABLE"
assert d.qualified_name == "MY_DB.MY_SCHEMA.MY_TABLE"
assert d.external_url == "https://example.com"
assert d.created == created
assert d.last_modified == updated
assert d.custom_properties == {"key1": "value1", "key2": "value2"}
# Check standard aspects.
assert d.domain == DomainUrn("Marketing")
assert d.tags is not None
assert len(d.tags) == 2
assert d.terms is not None
assert len(d.terms) == 1
assert d.owners is not None
assert len(d.owners) == 1
assert len(d.schema) == 2
# Schema field description.
assert d["field1"].description is None
assert d["field2"].description == "field2 description"
d["field1"].set_description("field1 description")
assert d["field1"].description == "field1 description"
# Schema field tags.
assert d["field1"].tags is None
d["field1"].set_tags([TagUrn("field1_tag1"), TagUrn("field1_tag2")])
assert d["field1"].tags is not None
assert len(d["field1"].tags) == 2
# Schema field terms.
assert d["field2"].terms is None
d["field2"].set_terms(
[GlossaryTermUrn("field2_term1"), GlossaryTermUrn("field2_term2")]
)
assert d["field2"].terms is not None
assert len(d["field2"].terms) == 2
return d
def test_dataset_complex(pytestconfig: pytest.Config) -> None:
d = _build_complex_dataset()
assert_entity_golden(
pytestconfig, d, _GOLDEN_DIR / "test_dataset_complex_golden.json"
)
def test_dataset_ingestion(pytestconfig: pytest.Config) -> None:
with change_default_attribution(KnownAttribution.INGESTION):
d = _build_complex_dataset()
assert_entity_golden(
pytestconfig, d, _GOLDEN_DIR / "test_dataset_ingestion_golden.json"
)