mirror of
https://github.com/datahub-project/datahub.git
synced 2025-07-28 11:59:54 +00:00
1106 lines
37 KiB
Python
1106 lines
37 KiB
Python
import json
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
import avro.schema
|
|
import pytest
|
|
|
|
# Add scripts directory to path so we can import modeldocgen
|
|
sys.path.append(str(Path(__file__).parent.parent))
|
|
|
|
from modeldocgen import (
|
|
AspectDefinition,
|
|
EntityCategory,
|
|
EntityDefinition,
|
|
LineageData,
|
|
Relationship,
|
|
RelationshipAdjacency,
|
|
RelationshipGraph,
|
|
aspect_registry,
|
|
capitalize_first,
|
|
entity_registry,
|
|
extract_lineage_fields,
|
|
extract_lineage_fields_from_schema,
|
|
generate_lineage_json,
|
|
generate_stitched_record,
|
|
get_sorted_entity_names,
|
|
load_schema_file,
|
|
)
|
|
|
|
|
|
@pytest.fixture
def sample_entity_definition() -> EntityDefinition:
    """Provide a CORE ``dataset`` entity with two aspects, a doc string and priority 1."""
    fields = {
        "name": "dataset",
        "keyAspect": "datasetKey",
        "aspects": ["datasetProperties", "datasetProfile"],
        "doc": "A dataset entity",
        "category": EntityCategory.CORE,
        "priority": 1,
    }
    return EntityDefinition(**fields)
|
|
|
|
|
|
@pytest.fixture
def sample_aspect_definition() -> AspectDefinition:
    """Provide a ``datasetProperties`` aspect of type ``dataset`` with one example URN."""
    example_urn = "urn:li:dataset:(urn:li:dataPlatform:bigquery,example_dataset,PROD)"
    return AspectDefinition(
        name="datasetProperties",
        EntityUrns=[example_urn],
        type="dataset",
    )
|
|
|
|
|
|
@pytest.fixture
def sample_relationship() -> Relationship:
    """Provide a dataset-to-dataset ``DownstreamOf`` relationship."""
    attributes = {
        "name": "DownstreamOf",
        "src": "dataset",
        "dst": "dataset",
        "doc": "Indicates that one dataset is downstream of another",
        "id": "dataset:DownstreamOf:dataset:downstream",
    }
    return Relationship(**attributes)
|
|
|
|
|
|
class TestUtilityFunctions:
    """Checks for the small helper functions imported from modeldocgen."""

    def test_capitalize_first(self):
        """Run capitalize_first over a table of inputs and expected outputs."""
        expectations = {
            "hello": "Hello",
            "world": "World",
            "": "",
            "a": "A",
            "ABC": "ABC",  # input that already starts with a capital
        }
        for raw, wanted in expectations.items():
            assert capitalize_first(raw) == wanted
|
|
|
|
|
|
class TestDataClasses:
    """Construction and default-value checks for the modeldocgen data classes."""

    def test_entity_definition_creation(self, sample_entity_definition):
        """Values given to the fixture's EntityDefinition read back unchanged."""
        wanted = {
            "name": "dataset",
            "keyAspect": "datasetKey",
            "aspects": ["datasetProperties", "datasetProfile"],
            "doc": "A dataset entity",
            "category": EntityCategory.CORE,
            "priority": 1,
            # display_name is the derived value under test, not a constructor argument here.
            "display_name": "Dataset",
        }
        for attribute, value in wanted.items():
            assert getattr(sample_entity_definition, attribute) == value

    def test_entity_definition_defaults(self):
        """An EntityDefinition built from name and keyAspect only exposes the defaults."""
        minimal = EntityDefinition(name="test", keyAspect="testKey")

        assert (minimal.name, minimal.keyAspect) == ("test", "testKey")
        assert minimal.aspects == []
        for optional in (
            "aspect_map",
            "relationship_map",
            "doc",
            "doc_file_contents",
            "priority",
        ):
            assert getattr(minimal, optional) is None
        # CORE is the category expected when none is supplied.
        assert minimal.category == EntityCategory.CORE

    def test_aspect_definition_creation(self, sample_aspect_definition):
        """The fixture's AspectDefinition keeps its name, URNs and type; schema is None."""
        built = sample_aspect_definition
        expected_urns = [
            "urn:li:dataset:(urn:li:dataPlatform:bigquery,example_dataset,PROD)"
        ]

        assert built.name == "datasetProperties"
        assert built.EntityUrns == expected_urns
        assert built.type == "dataset"
        assert built.schema is None

    def test_relationship_creation(self, sample_relationship):
        """Each Relationship field matches what the fixture passed in."""
        wanted = {
            "name": "DownstreamOf",
            "src": "dataset",
            "dst": "dataset",
            "doc": "Indicates that one dataset is downstream of another",
            "id": "dataset:DownstreamOf:dataset:downstream",
        }
        for attribute, value in wanted.items():
            assert getattr(sample_relationship, attribute) == value

    def test_relationship_adjacency_defaults(self):
        """A RelationshipAdjacency built with no arguments has three empty lists."""
        empty = RelationshipAdjacency()

        for bucket in (empty.self_loop, empty.incoming, empty.outgoing):
            assert bucket == []

    def test_relationship_graph_defaults(self):
        """A new RelationshipGraph has an empty map and empty adjacency for unknown nodes."""
        fresh = RelationshipGraph()

        # Inspect the map before any get_adjacency call is made.
        assert fresh.map == {}
        for bucket_name in ("self_loop", "incoming", "outgoing"):
            assert getattr(fresh.get_adjacency("nonexistent"), bucket_name) == []
|
|
|
|
|
|
class TestEntityCategory:
    """Checks on the EntityCategory enum members."""

    def test_entity_category_values(self):
        """CORE and INTERNAL carry their own names as string values."""
        core_value = EntityCategory.CORE.value
        internal_value = EntityCategory.INTERNAL.value

        assert core_value == "CORE"
        assert internal_value == "INTERNAL"

    def test_entity_category_comparison(self):
        """Distinct members compare unequal; a member equals itself."""
        core = EntityCategory.CORE

        assert core != EntityCategory.INTERNAL
        assert core == EntityCategory.CORE
|
|
|
|
|
|
def make_avro_record_schema(name, doc=None, aspect_props=None):
    """Build a field-less Avro record schema for use in tests.

    Args:
        name: Record name placed in the schema definition.
        doc: Optional doc string; only added to the definition when truthy.
        aspect_props: Optional value stored under the ``"Aspect"`` key of the
            parsed schema's ``props`` and ``other_props`` when truthy.

    Returns:
        The schema object produced by ``avro.schema.parse``.
    """
    definition = {"type": "record", "name": name, "fields": []}
    if doc:
        definition["doc"] = doc

    parsed = avro.schema.parse(json.dumps(definition))
    if not aspect_props:
        return parsed

    # Attach the Aspect annotation after parsing, in both property mappings.
    parsed.props["Aspect"] = aspect_props
    parsed.other_props["Aspect"] = aspect_props
    return parsed
|
|
|
|
|
|
def clear_registries():
    """Empty the entity and aspect registries imported from modeldocgen."""
    for registry in (entity_registry, aspect_registry):
        registry.clear()
|
|
|
|
|
|
def test_generate_stitched_record_simple(monkeypatch):
    """Stitch one entity with one registered aspect and expect schema metadata.

    The ``monkeypatch`` fixture is requested but not used in the body.
    """
    clear_registries()

    # One entity whose single aspect is also present in the aspect registry.
    entity_registry["foo"] = EntityDefinition(
        name="foo", keyAspect="fooKey", aspects=["fooAspect"]
    )
    foo_schema = make_avro_record_schema(
        "fooAspect", doc="Aspect doc", aspect_props={"name": "fooAspect"}
    )
    aspect_registry["fooAspect"] = AspectDefinition(
        name="fooAspect", schema=foo_schema
    )

    relationship_graph = RelationshipGraph()
    emitted = list(generate_stitched_record(relationship_graph))

    assert any("SchemaMetadataClass" in str(item) for item in emitted)
    assert relationship_graph.map == {}
|
|
|
|
|
|
def test_generate_stitched_record_relationship(monkeypatch):
    """A field carrying a Relationship to its own entity type yields a self-loop edge."""
    clear_registries()

    # Entity "bar" with a single aspect registered under the same name.
    entity_registry["bar"] = EntityDefinition(
        name="bar", keyAspect="barKey", aspects=["barAspect"]
    )
    bar_schema = make_avro_record_schema(
        "barAspect", doc="Aspect doc", aspect_props={"name": "barAspect"}
    )
    aspect_registry["barAspect"] = AspectDefinition(
        name="barAspect", schema=bar_schema
    )

    class DummyField:
        # Stand-in schema field whose jsonProps declare a Relationship to "bar".
        fieldPath = "relatedUrn"
        jsonProps = json.dumps(
            {"Relationship": {"entityTypes": ["bar"], "name": "relatesTo"}}
        )
        globalTags = None

    def fake_fields(rawSchema):
        return [DummyField()]

    # Replace the schema-to-fields conversion so the stitched record sees DummyField.
    monkeypatch.setattr("modeldocgen.avro_schema_to_mce_fields", fake_fields)

    relationship_graph = RelationshipGraph()
    emitted = list(generate_stitched_record(relationship_graph))

    # MCPs must have been produced.
    assert any("SchemaMetadataClass" in str(item) for item in emitted)

    # The edge may be keyed by either spelling of the entity name.
    key = next(
        (name for name in ("Bar", "bar") if name in relationship_graph.map), None
    )
    if key is None:
        raise AssertionError(
            "No 'Bar' or 'bar' in graph.map after generate_stitched_record"
        )

    assert len(relationship_graph.map[key].self_loop) > 0, (
        f"No self-loop relationships for {key}"
    )
    assert relationship_graph.map[key].self_loop[0].name == "relatesTo"
|
|
|
|
|
|
def test_generate_stitched_record_skips_missing_aspect():
    """An entity whose only aspect is unregistered produces no MCPs and no edges."""
    clear_registries()

    # "missingAspect" is deliberately never added to aspect_registry.
    entity_registry["baz"] = EntityDefinition(
        name="baz", keyAspect="bazKey", aspects=["missingAspect"]
    )

    relationship_graph = RelationshipGraph()
    emitted = list(generate_stitched_record(relationship_graph))

    # Nothing is yielded and the graph stays empty.
    assert emitted == []
    assert relationship_graph.map == {}
|
|
|
|
|
|
def test_generate_stitched_record_key_aspect(monkeypatch):
    """Stitching works when the entity's key aspect is also listed among its aspects."""
    clear_registries()

    # The aspect list contains the key aspect itself.
    entity_registry["test"] = EntityDefinition(
        name="test", keyAspect="testKey", aspects=["testKey"]
    )
    key_schema = make_avro_record_schema(
        "testKey", doc="Key aspect", aspect_props={"name": "testKey"}
    )
    aspect_registry["testKey"] = AspectDefinition(name="testKey", schema=key_schema)

    class DummyField:
        # Stand-in field whose Aspect name matches the entity's keyAspect.
        fieldPath = "urn"
        jsonProps = json.dumps({"Aspect": {"name": "testKey"}})
        globalTags = None
        isPartOfKey = False

    def fake_fields(rawSchema):
        return [DummyField()]

    monkeypatch.setattr("modeldocgen.avro_schema_to_mce_fields", fake_fields)

    relationship_graph = RelationshipGraph()
    emitted = list(generate_stitched_record(relationship_graph))

    # MCPs must have been produced.
    assert any("SchemaMetadataClass" in str(item) for item in emitted)
|
|
|
|
|
|
def test_generate_stitched_record_timeseries(monkeypatch):
    """Stitching works for an aspect whose Aspect props declare type ``timeseries``."""
    clear_registries()

    entity_registry["test"] = EntityDefinition(
        name="test", keyAspect="testKey", aspects=["testTimeseries"]
    )
    timeseries_props = {"name": "testTimeseries", "type": "timeseries"}
    timeseries_schema = make_avro_record_schema(
        "testTimeseries",
        doc="Timeseries aspect",
        aspect_props=timeseries_props,
    )
    aspect_registry["testTimeseries"] = AspectDefinition(
        name="testTimeseries", schema=timeseries_schema
    )

    class DummyField:
        # Stand-in field carrying the same timeseries Aspect info in jsonProps.
        fieldPath = "timestamp"
        jsonProps = json.dumps(
            {"Aspect": {"name": "testTimeseries", "type": "timeseries"}}
        )
        globalTags = None

    def fake_fields(rawSchema):
        return [DummyField()]

    monkeypatch.setattr("modeldocgen.avro_schema_to_mce_fields", fake_fields)

    relationship_graph = RelationshipGraph()
    emitted = list(generate_stitched_record(relationship_graph))

    # MCPs must have been produced.
    assert any("SchemaMetadataClass" in str(item) for item in emitted)
|
|
|
|
|
|
def test_generate_stitched_record_searchable(monkeypatch):
    """Stitching works when a field's jsonProps carry a ``Searchable`` entry."""
    clear_registries()

    entity_registry["test"] = EntityDefinition(
        name="test", keyAspect="testKey", aspects=["testAspect"]
    )
    searchable_schema = make_avro_record_schema(
        "testAspect", doc="Test aspect", aspect_props={"name": "testAspect"}
    )
    aspect_registry["testAspect"] = AspectDefinition(
        name="testAspect", schema=searchable_schema
    )

    class DummyField:
        # Stand-in field flagged as Searchable.
        fieldPath = "name"
        jsonProps = json.dumps({"Searchable": True})
        globalTags = None

    def fake_fields(rawSchema):
        return [DummyField()]

    monkeypatch.setattr("modeldocgen.avro_schema_to_mce_fields", fake_fields)

    relationship_graph = RelationshipGraph()
    emitted = list(generate_stitched_record(relationship_graph))

    # MCPs must have been produced.
    assert any("SchemaMetadataClass" in str(item) for item in emitted)
|
|
|
|
|
|
def test_generate_stitched_record_path_spec(monkeypatch):
    """A Relationship nested under a path-spec key yields an outgoing edge."""
    clear_registries()

    entity_registry["test"] = EntityDefinition(
        name="test", keyAspect="testKey", aspects=["testAspect"]
    )
    owner_schema = make_avro_record_schema(
        "testAspect", doc="Test aspect", aspect_props={"name": "testAspect"}
    )
    aspect_registry["testAspect"] = AspectDefinition(
        name="testAspect", schema=owner_schema
    )

    # Path-spec form: the Relationship dict has a single key ("owner") wrapping
    # the relationship info, with no entityTypes at the top level.
    path_spec_props = {
        "Relationship": {
            "owner": {
                "entityTypes": ["corpuser"],
                "name": "OwnedBy",
            }
        }
    }

    class DummyField:
        fieldPath = "owner"
        jsonProps = json.dumps(path_spec_props)
        globalTags = None

    def fake_fields(rawSchema):
        return [DummyField()]

    monkeypatch.setattr("modeldocgen.avro_schema_to_mce_fields", fake_fields)

    relationship_graph = RelationshipGraph()
    emitted = list(generate_stitched_record(relationship_graph))

    # MCPs must have been produced.
    assert any("SchemaMetadataClass" in str(item) for item in emitted)

    # The edge targets a different entity type, so it is checked under
    # ``outgoing`` rather than ``self_loop``.
    key = "Test"
    if key not in relationship_graph.map:
        raise AssertionError("No 'Test' in graph.map after generate_stitched_record")

    assert len(relationship_graph.map[key].outgoing) > 0, (
        f"No outgoing relationships for {key}"
    )
    assert relationship_graph.map[key].outgoing[0].name == "OwnedBy"
|
|
|
|
|
|
class TestGetSortedEntityNames:
    """Ordering and grouping checks for get_sorted_entity_names."""

    @staticmethod
    def _pair(name, category, priority=None):
        """Return ``(name, EntityDefinition)`` with keyAspect ``"key"``.

        ``priority`` is only forwarded when one is given, so entities without
        a priority are built exactly as if the argument had been omitted.
        """
        kwargs = {"name": name, "keyAspect": "key", "category": category}
        if priority is not None:
            kwargs["priority"] = priority
        return name, EntityDefinition(**kwargs)

    def test_empty_input(self):
        """An empty input still yields both categories, each with an empty list."""
        assert get_sorted_entity_names([]) == [
            (EntityCategory.CORE, []),
            (EntityCategory.INTERNAL, []),
        ]

    def test_core_entities_only(self):
        """Two CORE entities without priority are listed alphabetically under CORE."""
        sorted_groups = get_sorted_entity_names(
            [
                self._pair("dataset", EntityCategory.CORE),
                self._pair("table", EntityCategory.CORE),
            ]
        )

        # Both categories are always present, CORE first.
        assert len(sorted_groups) == 2
        assert sorted_groups[0][0] == EntityCategory.CORE
        assert sorted_groups[1][0] == EntityCategory.INTERNAL

        core_names = sorted_groups[0][1]
        assert len(core_names) == 2
        for expected_name in ("dataset", "table"):
            assert expected_name in core_names
        # No priorities were set, so the order is alphabetical.
        assert core_names == ["dataset", "table"]

    def test_internal_entities_only(self):
        """Two INTERNAL entities without priority are listed alphabetically under INTERNAL."""
        sorted_groups = get_sorted_entity_names(
            [
                self._pair("internal1", EntityCategory.INTERNAL),
                self._pair("internal2", EntityCategory.INTERNAL),
            ]
        )

        assert len(sorted_groups) == 2
        assert sorted_groups[0][0] == EntityCategory.CORE
        assert sorted_groups[1][0] == EntityCategory.INTERNAL

        internal_names = sorted_groups[1][1]
        assert len(internal_names) == 2
        for expected_name in ("internal1", "internal2"):
            assert expected_name in internal_names
        # No priorities were set, so the order is alphabetical.
        assert internal_names == ["internal1", "internal2"]

    def test_mixed_entities(self):
        """Interleaved CORE and INTERNAL entities are split into their own lists."""
        sorted_groups = get_sorted_entity_names(
            [
                self._pair("dataset", EntityCategory.CORE),
                self._pair("internal1", EntityCategory.INTERNAL),
                self._pair("table", EntityCategory.CORE),
                self._pair("internal2", EntityCategory.INTERNAL),
            ]
        )

        assert len(sorted_groups) == 2

        core_names = sorted_groups[0][1]
        assert len(core_names) == 2
        assert core_names == ["dataset", "table"]

        internal_names = sorted_groups[1][1]
        assert len(internal_names) == 2
        assert internal_names == ["internal1", "internal2"]

    def test_priority_sorting_core_entities(self):
        """CORE entities with priorities 1, 2, 3 come first in that order."""
        sorted_groups = get_sorted_entity_names(
            [
                self._pair("low_priority", EntityCategory.CORE, priority=3),
                self._pair("high_priority", EntityCategory.CORE, priority=1),
                self._pair("medium_priority", EntityCategory.CORE, priority=2),
                self._pair("no_priority", EntityCategory.CORE),
            ]
        )

        core_names = sorted_groups[0][1]
        # Prioritised entities lead, ordered by ascending priority value.
        assert core_names[:3] == ["high_priority", "medium_priority", "low_priority"]
        # The entity without a priority follows them.
        assert core_names[3] == "no_priority"

    def test_priority_sorting_internal_entities(self):
        """INTERNAL entities with a priority precede the one without.

        Only membership in the first two slots is asserted for the prioritised
        entities; their relative order is intentionally not pinned.
        """
        sorted_groups = get_sorted_entity_names(
            [
                self._pair("low_priority", EntityCategory.INTERNAL, priority=3),
                self._pair("high_priority", EntityCategory.INTERNAL, priority=1),
                self._pair("no_priority", EntityCategory.INTERNAL),
            ]
        )

        internal_names = sorted_groups[1][1]
        assert len(internal_names) == 3
        leading = internal_names[:2]
        assert "high_priority" in leading
        assert "low_priority" in leading
        assert internal_names[2] == "no_priority"

    def test_mixed_priority_and_non_priority(self):
        """Prioritised and unprioritised entities are ordered within each category."""
        sorted_groups = get_sorted_entity_names(
            [
                # CORE entities
                self._pair("core_priority", EntityCategory.CORE, priority=2),
                self._pair("core_no_priority", EntityCategory.CORE),
                self._pair("core_high_priority", EntityCategory.CORE, priority=1),
                # INTERNAL entities
                self._pair("internal_priority", EntityCategory.INTERNAL, priority=2),
                self._pair("internal_no_priority", EntityCategory.INTERNAL),
                self._pair(
                    "internal_high_priority", EntityCategory.INTERNAL, priority=1
                ),
            ]
        )

        # CORE: ascending priority first, then the unprioritised entity.
        assert sorted_groups[0][1] == [
            "core_high_priority",
            "core_priority",
            "core_no_priority",
        ]

        # INTERNAL: only the grouping is pinned, not the order among
        # the prioritised entities.
        internal_names = sorted_groups[1][1]
        assert len(internal_names) == 3
        leading = internal_names[:2]
        assert "internal_high_priority" in leading
        assert "internal_priority" in leading
        assert internal_names[2] == "internal_no_priority"

    def test_alphabetical_sorting_for_non_priority(self):
        """Unprioritised CORE entities given out of order come back alphabetically."""
        sorted_groups = get_sorted_entity_names(
            [
                self._pair("zebra", EntityCategory.CORE),
                self._pair("alpha", EntityCategory.CORE),
                self._pair("beta", EntityCategory.CORE),
            ]
        )

        assert sorted_groups[0][1] == ["alpha", "beta", "zebra"]

    def test_priority_with_same_values(self):
        """Entities sharing a priority keep input order ahead of a larger priority value."""
        sorted_groups = get_sorted_entity_names(
            [
                self._pair("entity1", EntityCategory.CORE, priority=1),
                self._pair("entity2", EntityCategory.CORE, priority=1),
                self._pair("entity3", EntityCategory.CORE, priority=2),
            ]
        )

        core_names = sorted_groups[0][1]
        # Both priority-1 entities appear in the order they were supplied.
        assert core_names[:2] == ["entity1", "entity2"]
        # The priority-2 entity comes after them.
        assert core_names[2] == "entity3"

    def test_zero_priority(self):
        """Pin the ordering produced when one entity has priority 0.

        The expected list places the priority-0 entity after both the
        priority-1 entity and the entity with no priority at all.
        NOTE(review): presumably 0 is handled like "no priority" by the
        implementation - confirm against modeldocgen.
        """
        sorted_groups = get_sorted_entity_names(
            [
                self._pair("zero_priority", EntityCategory.CORE, priority=0),
                self._pair("no_priority", EntityCategory.CORE),
                self._pair("high_priority", EntityCategory.CORE, priority=1),
            ]
        )

        assert sorted_groups[0][1] == ["high_priority", "no_priority", "zero_priority"]

    def test_negative_priority(self):
        """A priority of -1 sorts ahead of priority 1 and of the unprioritised entity."""
        sorted_groups = get_sorted_entity_names(
            [
                self._pair("negative_priority", EntityCategory.CORE, priority=-1),
                self._pair("no_priority", EntityCategory.CORE),
                self._pair("positive_priority", EntityCategory.CORE, priority=1),
            ]
        )

        assert sorted_groups[0][1] == [
            "negative_priority",
            "positive_priority",
            "no_priority",
        ]

    def test_return_structure(self):
        """The result is a two-item list of ``(EntityCategory, list of str)`` pairs."""
        sorted_groups = get_sorted_entity_names(
            [self._pair("test", EntityCategory.CORE)]
        )

        assert isinstance(sorted_groups, list)
        assert len(sorted_groups) == 2

        for group_category, names in sorted_groups:
            assert isinstance(group_category, EntityCategory)
            assert isinstance(names, list)
            assert all(isinstance(entry, str) for entry in names)

        # Category order is fixed: CORE, then INTERNAL.
        assert sorted_groups[0][0] == EntityCategory.CORE
        assert sorted_groups[1][0] == EntityCategory.INTERNAL
|
|
|
|
|
|
class TestLoadSchemaFile:
    """Checks that load_schema_file populates the registries from .avsc files."""

    def test_load_aspect_schema(self, tmp_path):
        """A record carrying an ``Aspect`` block is registered under its aspect name."""
        # Minimal aspect schema written to a temporary .avsc file.
        avsc_text = """
        {
            "type": "record",
            "name": "TestAspect",
            "fields": [],
            "Aspect": {
                "name": "testAspect",
                "EntityUrns": ["urn:li:dataset:(urn:li:dataPlatform:test,test_dataset,PROD)"]
            }
        }
        """
        avsc_path = tmp_path / "test_aspect.avsc"
        avsc_path.write_text(avsc_text)

        # Start from empty registries so only this file's content is observed.
        clear_registries()
        load_schema_file(str(avsc_path))

        assert "testAspect" in aspect_registry
        registered = aspect_registry["testAspect"]
        assert registered.name == "testAspect"
        assert registered.schema is not None
        assert registered.EntityUrns == [
            "urn:li:dataset:(urn:li:dataPlatform:test,test_dataset,PROD)"
        ]

    def test_load_mce_schema(self, tmp_path):
        """A MetadataChangeEvent file registers the entity declared on its snapshot."""
        # MCE schema: proposedSnapshot is a union containing a snapshot record
        # that carries an "Entity" block and an "aspects" array field.
        avsc_text = """
        {
            "type": "record",
            "name": "MetadataChangeEvent",
            "fields": [
                {"name": "auditHeader", "type": "null"},
                {
                    "name": "proposedSnapshot",
                    "type": [
                        "null",
                        {
                            "type": "record",
                            "name": "DatasetSnapshot",
                            "fields": [
                                {"name": "urn", "type": "string"},
                                {
                                    "name": "aspects",
                                    "type": {
                                        "type": "array",
                                        "items": [
                                            "null",
                                            {
                                                "type": "record",
                                                "name": "DatasetProperties",
                                                "fields": [],
                                                "Aspect": {
                                                    "name": "datasetProperties"
                                                }
                                            }
                                        ]
                                    }
                                }
                            ],
                            "Entity": {
                                "name": "dataset",
                                "keyAspect": "datasetKey",
                                "aspects": ["datasetProperties"]
                            }
                        }
                    ]
                }
            ]
        }
        """
        avsc_path = tmp_path / "MetadataChangeEvent.avsc"
        avsc_path.write_text(avsc_text)

        # Start from empty registries so only this file's content is observed.
        clear_registries()
        load_schema_file(str(avsc_path))

        assert "dataset" in entity_registry
        registered = entity_registry["dataset"]
        assert registered.name == "dataset"
        assert registered.keyAspect == "datasetKey"
        assert "datasetProperties" in registered.aspects
|
|
|
|
|
|
class TestLineageGeneration:
|
|
"""Test lineage generation functionality."""
|
|
|
|
def test_extract_lineage_fields_from_schema_simple(self):
    """A top-level field marked ``isLineage`` is the only field extracted."""
    clear_registries()

    # Two string fields; only "upstreams" carries the isLineage flag.
    record = {
        "type": "record",
        "name": "TestAspect",
        "fields": [
            {"name": "upstreams", "type": "string", "isLineage": True},
            {"name": "description", "type": "string"},
        ],
    }
    parsed = avro.schema.parse(json.dumps(record))

    found = extract_lineage_fields_from_schema(parsed)

    assert len(found) == 1
    only = found[0]
    assert only.name == "upstreams"
    assert only.path == "upstreams"
    assert only.isLineage is True
    # No Relationship block was declared on the field.
    assert only.relationship is None
|
|
|
|
def test_extract_lineage_fields_from_schema_with_relationship(self):
    """A field whose Relationship block sets ``isLineage`` is extracted with that info."""
    clear_registries()

    # The lineage flag lives inside the Relationship block, not on the field itself.
    relationship_block = {
        "entityTypes": ["dataset"],
        "name": "DownstreamOf",
        "isLineage": True,
    }
    record = {
        "type": "record",
        "name": "TestAspect",
        "fields": [
            {
                "name": "relatedDataset",
                "type": "string",
                "Relationship": relationship_block,
            }
        ],
    }
    parsed = avro.schema.parse(json.dumps(record))

    found = extract_lineage_fields_from_schema(parsed)

    assert len(found) == 1
    only = found[0]
    assert only.name == "relatedDataset"
    assert only.path == "relatedDataset"
    assert only.isLineage is True

    # The relationship details are carried through on the extracted field.
    assert only.relationship is not None
    assert only.relationship.name == "DownstreamOf"
    assert only.relationship.entityTypes == ["dataset"]
    assert only.relationship.isLineage is True
|
|
|
|
def test_generate_lineage_json_empty_data(self):
    """Empty LineageData serialises to a JSON object with metadata and no entities."""
    clear_registries()

    rendered = generate_lineage_json(LineageData(entities={}))
    payload = json.loads(rendered)

    # Entities section is present and empty.
    assert "entities" in payload
    assert payload["entities"] == {}

    # Generator metadata is included alongside the entities.
    assert "generated_by" in payload
    assert payload["generated_by"] == "metadata-ingestion/scripts/modeldocgen.py"
    assert "generated_at" in payload
    assert isinstance(payload["generated_at"], str)

    # The raw output is a string delimited as a JSON object.
    assert isinstance(rendered, str)
    trimmed = rendered.strip()
    assert trimmed.startswith("{")
    assert trimmed.endswith("}")
|
|
|
|
def test_extract_lineage_fields_from_schema_nested(self):
    """A lineage field inside an array-of-records is reported with a dotted path."""
    clear_registries()

    # "upstreams" is an array of UpstreamInfo records; only the nested
    # "dataset" field is flagged as lineage.
    upstream_info = {
        "type": "record",
        "name": "UpstreamInfo",
        "fields": [
            {
                "name": "dataset",
                "type": "string",
                "isLineage": True,
            },
            {"name": "metadata", "type": "string"},
        ],
    }
    record = {
        "type": "record",
        "name": "TestAspect",
        "fields": [
            {
                "name": "upstreams",
                "type": {
                    "type": "array",
                    "items": upstream_info,
                },
            }
        ],
    }
    parsed = avro.schema.parse(json.dumps(record))

    found = extract_lineage_fields_from_schema(parsed)

    assert len(found) == 1
    only = found[0]
    assert only.name == "dataset"
    # The path joins the outer field name and the nested field name.
    assert only.path == "upstreams.dataset"
    assert only.isLineage is True
|
|
|
|
def test_extract_lineage_fields_with_registry_data(self):
|
|
"""Test the main extract_lineage_fields function with registry data."""
|
|
clear_registries()
|
|
|
|
# Setup entity and aspect in registries
|
|
entity = EntityDefinition(
|
|
name="dataset", keyAspect="datasetKey", aspects=["upstreamLineage"]
|
|
)
|
|
entity_registry["dataset"] = entity
|
|
|
|
# Create aspect schema with lineage field
|
|
aspect_schema_dict = {
|
|
"type": "record",
|
|
"name": "UpstreamLineage",
|
|
"fields": [{"name": "upstreams", "type": "string", "isLineage": True}],
|
|
}
|
|
aspect_schema = avro.schema.parse(json.dumps(aspect_schema_dict))
|
|
aspect_registry["upstreamLineage"] = AspectDefinition(
|
|
name="upstreamLineage", schema=aspect_schema
|
|
)
|
|
|
|
# Extract lineage fields
|
|
lineage_data = extract_lineage_fields()
|
|
|
|
# Verify structure
|
|
assert "dataset" in lineage_data.entities
|
|
assert "upstreamLineage" in lineage_data.entities["dataset"].aspects
|
|
|
|
aspect_data = lineage_data.entities["dataset"].aspects["upstreamLineage"]
|
|
assert aspect_data.aspect == "upstreamLineage"
|
|
assert len(aspect_data.fields) == 1
|
|
|
|
field = aspect_data.fields[0]
|
|
assert field.name == "upstreams"
|
|
assert field.path == "upstreams"
|
|
assert field.isLineage is True
|