datahub/metadata-ingestion/tests/unit/test_dbt_source.py

256 lines
9.3 KiB
Python
Raw Normal View History

from typing import Dict, List, Union
from unittest import mock
from pydantic import ValidationError
from datahub.emitter import mce_builder
from datahub.ingestion.api.common import PipelineContext
from datahub.ingestion.source.dbt.dbt_core import DBTCoreConfig, DBTCoreSource
from datahub.metadata.schema_classes import (
OwnerClass,
OwnershipSourceClass,
OwnershipSourceTypeClass,
OwnershipTypeClass,
)
def create_owners_list_from_urn_list(
owner_urns: List[str], source_type: str
) -> List[OwnerClass]:
ownership_source_type: Union[None, OwnershipSourceClass] = None
if source_type:
ownership_source_type = OwnershipSourceClass(type=source_type)
owners_list = [
OwnerClass(
owner=owner_urn,
type=OwnershipTypeClass.DATAOWNER,
source=ownership_source_type,
)
for owner_urn in owner_urns
]
return owners_list
def create_mocked_dbt_source() -> DBTCoreSource:
ctx = PipelineContext("test-run-id")
graph = mock.MagicMock()
graph.get_ownership.return_value = mce_builder.make_ownership_aspect_from_urn_list(
["urn:li:corpuser:test_user"], "AUDIT"
)
graph.get_glossary_terms.return_value = (
mce_builder.make_glossary_terms_aspect_from_urn_list(
["urn:li:glossaryTerm:old", "urn:li:glossaryTerm:old2"]
)
)
graph.get_tags.return_value = mce_builder.make_global_tag_aspect_with_tag_list(
["non_dbt_existing", "dbt:existing"]
)
ctx.graph = graph
return DBTCoreSource(DBTCoreConfig(**create_base_dbt_config()), ctx, "dbt")
def create_base_dbt_config() -> Dict:
return dict(
{
"manifest_path": "temp/",
"catalog_path": "temp/",
"sources_path": "temp/",
"target_platform": "postgres",
"enable_meta_mapping": False,
},
)
def test_dbt_source_patching_no_new():
source = create_mocked_dbt_source()
# verifying when there are no new owners to be added
assert source.ctx.graph
transformed_owner_list = source.get_transformed_owners_by_source_type(
[], "urn:li:dataset:dummy", "SERVICE"
)
assert len(transformed_owner_list) == 1
def test_dbt_source_patching_no_conflict():
# verifying when new owners to be added do not conflict with existing source types
source = create_mocked_dbt_source()
new_owner_urns = ["urn:li:corpuser:new_test"]
new_owners_list = create_owners_list_from_urn_list(new_owner_urns, "SERVICE")
transformed_owner_list = source.get_transformed_owners_by_source_type(
new_owners_list, "urn:li:dataset:dummy", "DATABASE"
)
assert len(transformed_owner_list) == 2
owner_set = {"urn:li:corpuser:test_user", "urn:li:corpuser:new_test"}
for single_owner in transformed_owner_list:
assert single_owner.owner in owner_set
assert single_owner.source and single_owner.source.type in {
OwnershipSourceTypeClass.AUDIT,
OwnershipSourceTypeClass.SERVICE,
}
def test_dbt_source_patching_with_conflict():
# verifying when new owner overrides existing owner
source = create_mocked_dbt_source()
new_owner_urns = ["urn:li:corpuser:new_test", "urn:li:corpuser:new_test2"]
new_owners_list = create_owners_list_from_urn_list(new_owner_urns, "AUDIT")
transformed_owner_list = source.get_transformed_owners_by_source_type(
new_owners_list, "urn:li:dataset:dummy", "AUDIT"
)
assert len(transformed_owner_list) == 2
expected_owner_set = {"urn:li:corpuser:new_test", "urn:li:corpuser:new_test2"}
for single_owner in transformed_owner_list:
assert single_owner.owner in expected_owner_set
assert (
single_owner.source
and single_owner.source.type == OwnershipSourceTypeClass.AUDIT
)
def test_dbt_source_patching_with_conflict_null_source_type_in_existing_owner():
# verifying when existing owners have null source_type and new owners are present.
# So the existing owners will null type will be removed.
source = create_mocked_dbt_source()
graph = mock.MagicMock()
graph.get_ownership.return_value = mce_builder.make_ownership_aspect_from_urn_list(
["urn:li:corpuser:existing_test_user"], None
)
source.ctx.graph = graph
new_owner_urns = ["urn:li:corpuser:new_test", "urn:li:corpuser:new_test2"]
new_owners_list = create_owners_list_from_urn_list(new_owner_urns, "AUDIT")
transformed_owner_list = source.get_transformed_owners_by_source_type(
new_owners_list, "urn:li:dataset:dummy", "AUDIT"
)
assert len(transformed_owner_list) == 2
expected_owner_set = {"urn:li:corpuser:new_test", "urn:li:corpuser:new_test2"}
for single_owner in transformed_owner_list:
assert single_owner.owner in expected_owner_set
assert (
single_owner.source
and single_owner.source.type == OwnershipSourceTypeClass.AUDIT
)
def test_dbt_source_patching_tags():
# two existing tags out of which one as a prefix that we want to filter on.
# two new tags out of which one has a prefix we are filtering on existing tags, so this tag will
# override the existing one with the same prefix.
source = create_mocked_dbt_source()
new_tag_aspect = mce_builder.make_global_tag_aspect_with_tag_list(
["new_non_dbt", "dbt:new_dbt"]
)
transformed_tags = source.get_transformed_tags_by_prefix(
new_tag_aspect.tags, "urn:li:dataset:dummy", "urn:li:tag:dbt:"
)
expected_tags = {
"urn:li:tag:new_non_dbt",
"urn:li:tag:non_dbt_existing",
"urn:li:tag:dbt:new_dbt",
}
assert len(transformed_tags) == 3
for transformed_tag in transformed_tags:
assert transformed_tag.tag in expected_tags
def test_dbt_source_patching_terms():
# existing terms and new terms have two terms each and one common. After deduping we should only get 3 unique terms
source = create_mocked_dbt_source()
new_terms = mce_builder.make_glossary_terms_aspect_from_urn_list(
["urn:li:glossaryTerm:old", "urn:li:glossaryTerm:new"]
)
transformed_terms = source.get_transformed_terms(
new_terms.terms, "urn:li:dataset:dummy"
)
expected_terms = {
"urn:li:glossaryTerm:old",
"urn:li:glossaryTerm:old2",
"urn:li:glossaryTerm:new",
}
assert len(transformed_terms) == 3
for transformed_term in transformed_terms:
assert transformed_term.urn in expected_terms
def test_dbt_entity_emission_configuration():
config_dict = {
"manifest_path": "dummy_path",
"catalog_path": "dummy_path",
"target_platform": "dummy_platform",
"entities_enabled": {"models": "Only", "seeds": "Only"},
}
try:
DBTCoreConfig.parse_obj(config_dict)
except ValidationError as ve:
assert len(ve.errors()) == 1
assert (
"Cannot have more than 1 type of entity emission set to ONLY"
in ve.errors()[0]["msg"]
)
# valid config
config_dict = {
"manifest_path": "dummy_path",
"catalog_path": "dummy_path",
"target_platform": "dummy_platform",
"entities_enabled": {"models": "Yes", "seeds": "Only"},
}
DBTCoreConfig.parse_obj(config_dict)
def test_dbt_entity_emission_configuration_helpers():
config_dict = {
"manifest_path": "dummy_path",
"catalog_path": "dummy_path",
"target_platform": "dummy_platform",
"entities_enabled": {
"models": "Only",
},
}
config = DBTCoreConfig.parse_obj(config_dict)
assert config.entities_enabled.can_emit_node_type("model")
assert not config.entities_enabled.can_emit_node_type("source")
assert not config.entities_enabled.can_emit_node_type("test")
assert not config.entities_enabled.can_emit_test_results
config_dict = {
"manifest_path": "dummy_path",
"catalog_path": "dummy_path",
"target_platform": "dummy_platform",
}
config = DBTCoreConfig.parse_obj(config_dict)
assert config.entities_enabled.can_emit_node_type("model")
assert config.entities_enabled.can_emit_node_type("source")
assert config.entities_enabled.can_emit_node_type("test")
assert config.entities_enabled.can_emit_test_results
config_dict = {
"manifest_path": "dummy_path",
"catalog_path": "dummy_path",
"target_platform": "dummy_platform",
"entities_enabled": {
"test_results": "Only",
},
}
config = DBTCoreConfig.parse_obj(config_dict)
assert not config.entities_enabled.can_emit_node_type("model")
assert not config.entities_enabled.can_emit_node_type("source")
assert not config.entities_enabled.can_emit_node_type("test")
assert config.entities_enabled.can_emit_test_results
config_dict = {
"manifest_path": "dummy_path",
"catalog_path": "dummy_path",
"target_platform": "dummy_platform",
"entities_enabled": {
"test_results": "Yes",
"test_definitions": "Yes",
"models": "No",
"sources": "No",
},
}
config = DBTCoreConfig.parse_obj(config_dict)
assert not config.entities_enabled.can_emit_node_type("model")
assert not config.entities_enabled.can_emit_node_type("source")
assert config.entities_enabled.can_emit_node_type("test")
assert config.entities_enabled.can_emit_test_results