datahub/metadata-ingestion/scripts/tests/test_modeldocgen.py

1106 lines
37 KiB
Python

import json
import sys
from pathlib import Path
import avro.schema
import pytest
# Add scripts directory to path so we can import modeldocgen
sys.path.append(str(Path(__file__).parent.parent))
from modeldocgen import (
AspectDefinition,
EntityCategory,
EntityDefinition,
LineageData,
Relationship,
RelationshipAdjacency,
RelationshipGraph,
aspect_registry,
capitalize_first,
entity_registry,
extract_lineage_fields,
extract_lineage_fields_from_schema,
generate_lineage_json,
generate_stitched_record,
get_sorted_entity_names,
load_schema_file,
)
@pytest.fixture
def sample_entity_definition() -> EntityDefinition:
"""Sample entity definition for testing."""
return EntityDefinition(
name="dataset",
keyAspect="datasetKey",
aspects=["datasetProperties", "datasetProfile"],
doc="A dataset entity",
category=EntityCategory.CORE,
priority=1,
)
@pytest.fixture
def sample_aspect_definition() -> AspectDefinition:
"""Sample aspect definition for testing."""
return AspectDefinition(
name="datasetProperties",
EntityUrns=[
"urn:li:dataset:(urn:li:dataPlatform:bigquery,example_dataset,PROD)"
],
type="dataset",
)
@pytest.fixture
def sample_relationship() -> Relationship:
"""Sample relationship for testing."""
return Relationship(
name="DownstreamOf",
src="dataset",
dst="dataset",
doc="Indicates that one dataset is downstream of another",
id="dataset:DownstreamOf:dataset:downstream",
)
class TestUtilityFunctions:
"""Test utility functions."""
def test_capitalize_first(self):
"""Test capitalize_first function."""
assert capitalize_first("hello") == "Hello"
assert capitalize_first("world") == "World"
assert capitalize_first("") == ""
assert capitalize_first("a") == "A"
assert capitalize_first("ABC") == "ABC" # Already capitalized
class TestDataClasses:
"""Test data classes."""
def test_entity_definition_creation(self, sample_entity_definition):
"""Test EntityDefinition creation and properties."""
entity = sample_entity_definition
assert entity.name == "dataset"
assert entity.keyAspect == "datasetKey"
assert entity.aspects == ["datasetProperties", "datasetProfile"]
assert entity.doc == "A dataset entity"
assert entity.category == EntityCategory.CORE
assert entity.priority == 1
assert entity.display_name == "Dataset" # Test the property
def test_entity_definition_defaults(self):
"""Test EntityDefinition with default values."""
entity = EntityDefinition(name="test", keyAspect="testKey")
assert entity.name == "test"
assert entity.keyAspect == "testKey"
assert entity.aspects == []
assert entity.aspect_map is None
assert entity.relationship_map is None
assert entity.doc is None
assert entity.doc_file_contents is None
assert entity.category == EntityCategory.CORE # Default category
assert entity.priority is None
def test_aspect_definition_creation(self, sample_aspect_definition):
"""Test AspectDefinition creation."""
aspect = sample_aspect_definition
assert aspect.name == "datasetProperties"
assert aspect.EntityUrns == [
"urn:li:dataset:(urn:li:dataPlatform:bigquery,example_dataset,PROD)"
]
assert aspect.type == "dataset"
assert aspect.schema is None
def test_relationship_creation(self, sample_relationship):
"""Test Relationship creation."""
rel = sample_relationship
assert rel.name == "DownstreamOf"
assert rel.src == "dataset"
assert rel.dst == "dataset"
assert rel.doc == "Indicates that one dataset is downstream of another"
assert rel.id == "dataset:DownstreamOf:dataset:downstream"
def test_relationship_adjacency_defaults(self):
"""Test RelationshipAdjacency with default values."""
adjacency = RelationshipAdjacency()
assert adjacency.self_loop == []
assert adjacency.incoming == []
assert adjacency.outgoing == []
def test_relationship_graph_defaults(self):
"""Test RelationshipGraph with default values."""
graph = RelationshipGraph()
assert graph.map == {}
assert graph.get_adjacency("nonexistent").self_loop == []
assert graph.get_adjacency("nonexistent").incoming == []
assert graph.get_adjacency("nonexistent").outgoing == []
class TestEntityCategory:
"""Test EntityCategory enum."""
def test_entity_category_values(self):
"""Test EntityCategory enum values."""
assert EntityCategory.CORE.value == "CORE"
assert EntityCategory.INTERNAL.value == "INTERNAL"
def test_entity_category_comparison(self):
"""Test EntityCategory comparison."""
assert EntityCategory.CORE != EntityCategory.INTERNAL
assert EntityCategory.CORE == EntityCategory.CORE
def make_avro_record_schema(name, doc=None, aspect_props=None):
# Helper to create a minimal Avro RecordSchema with optional Aspect props
schema_dict = {
"type": "record",
"name": name,
"fields": [],
}
if doc:
schema_dict["doc"] = doc
schema = avro.schema.parse(json.dumps(schema_dict))
if aspect_props:
schema.props["Aspect"] = aspect_props
schema.other_props["Aspect"] = aspect_props
return schema
def clear_registries():
entity_registry.clear()
aspect_registry.clear()
def test_generate_stitched_record_simple(monkeypatch):
clear_registries()
# Setup: one entity, one aspect, aspect in registry
entity = EntityDefinition(name="foo", keyAspect="fooKey", aspects=["fooAspect"])
entity_registry["foo"] = entity
aspect_schema = make_avro_record_schema(
"fooAspect", doc="Aspect doc", aspect_props={"name": "fooAspect"}
)
aspect_registry["fooAspect"] = AspectDefinition(
name="fooAspect", schema=aspect_schema
)
graph = RelationshipGraph()
mcps = list(generate_stitched_record(graph))
assert any("SchemaMetadataClass" in str(mcp) for mcp in mcps)
assert graph.map == {}
def test_generate_stitched_record_relationship(monkeypatch):
clear_registries()
# Setup: entity with aspect that has a Relationship in jsonProps
entity = EntityDefinition(name="bar", keyAspect="barKey", aspects=["barAspect"])
entity_registry["bar"] = entity
aspect_schema = make_avro_record_schema(
"barAspect", doc="Aspect doc", aspect_props={"name": "barAspect"}
)
aspect_registry["barAspect"] = AspectDefinition(
name="barAspect", schema=aspect_schema
)
# Patch avro_schema_to_mce_fields to return a field with jsonProps containing a Relationship
def fake_fields(rawSchema):
class DummyField:
fieldPath = "relatedUrn"
jsonProps = json.dumps(
{"Relationship": {"entityTypes": ["bar"], "name": "relatesTo"}}
)
globalTags = None
return [DummyField()]
monkeypatch.setattr("modeldocgen.avro_schema_to_mce_fields", fake_fields)
graph = RelationshipGraph()
mcps = list(generate_stitched_record(graph))
# Should yield MCPs
assert any("SchemaMetadataClass" in str(mcp) for mcp in mcps)
# Should add a self-loop relationship edge
key = "Bar" if "Bar" in graph.map else ("bar" if "bar" in graph.map else None)
if key:
assert len(graph.map[key].self_loop) > 0, (
f"No self-loop relationships for {key}"
) # Debug assertion
assert graph.map[key].self_loop[0].name == "relatesTo"
else:
raise AssertionError(
"No 'Bar' or 'bar' in graph.map after generate_stitched_record"
)
def test_generate_stitched_record_skips_missing_aspect():
clear_registries()
# Setup: entity with aspect not in aspect_registry
entity = EntityDefinition(name="baz", keyAspect="bazKey", aspects=["missingAspect"])
entity_registry["baz"] = entity
graph = RelationshipGraph()
mcps = list(generate_stitched_record(graph))
# Should yield nothing (no valid aspects)
assert mcps == []
# Should not add any relationships
assert graph.map == {}
def test_generate_stitched_record_key_aspect(monkeypatch):
clear_registries()
# Setup: entity with key aspect
entity = EntityDefinition(name="test", keyAspect="testKey", aspects=["testKey"])
entity_registry["test"] = entity
aspect_schema = make_avro_record_schema(
"testKey", doc="Key aspect", aspect_props={"name": "testKey"}
)
aspect_registry["testKey"] = AspectDefinition(name="testKey", schema=aspect_schema)
# Patch to return field with Aspect info matching keyAspect
def fake_fields(rawSchema):
class DummyField:
fieldPath = "urn"
jsonProps = json.dumps({"Aspect": {"name": "testKey"}})
globalTags = None
isPartOfKey = False
return [DummyField()]
monkeypatch.setattr("modeldocgen.avro_schema_to_mce_fields", fake_fields)
graph = RelationshipGraph()
mcps = list(generate_stitched_record(graph))
# Should yield MCPs
assert any("SchemaMetadataClass" in str(mcp) for mcp in mcps)
def test_generate_stitched_record_timeseries(monkeypatch):
clear_registries()
# Setup: entity with timeseries aspect
entity = EntityDefinition(
name="test", keyAspect="testKey", aspects=["testTimeseries"]
)
entity_registry["test"] = entity
aspect_schema = make_avro_record_schema(
"testTimeseries",
doc="Timeseries aspect",
aspect_props={"name": "testTimeseries", "type": "timeseries"},
)
aspect_registry["testTimeseries"] = AspectDefinition(
name="testTimeseries", schema=aspect_schema
)
# Patch to return field with timeseries Aspect info
def fake_fields(rawSchema):
class DummyField:
fieldPath = "timestamp"
jsonProps = json.dumps(
{"Aspect": {"name": "testTimeseries", "type": "timeseries"}}
)
globalTags = None
return [DummyField()]
monkeypatch.setattr("modeldocgen.avro_schema_to_mce_fields", fake_fields)
graph = RelationshipGraph()
mcps = list(generate_stitched_record(graph))
# Should yield MCPs
assert any("SchemaMetadataClass" in str(mcp) for mcp in mcps)
def test_generate_stitched_record_searchable(monkeypatch):
clear_registries()
# Setup: entity with searchable field
entity = EntityDefinition(name="test", keyAspect="testKey", aspects=["testAspect"])
entity_registry["test"] = entity
aspect_schema = make_avro_record_schema(
"testAspect", doc="Test aspect", aspect_props={"name": "testAspect"}
)
aspect_registry["testAspect"] = AspectDefinition(
name="testAspect", schema=aspect_schema
)
# Patch to return field with Searchable property
def fake_fields(rawSchema):
class DummyField:
fieldPath = "name"
jsonProps = json.dumps({"Searchable": True})
globalTags = None
return [DummyField()]
monkeypatch.setattr("modeldocgen.avro_schema_to_mce_fields", fake_fields)
graph = RelationshipGraph()
mcps = list(generate_stitched_record(graph))
# Should yield MCPs
assert any("SchemaMetadataClass" in str(mcp) for mcp in mcps)
def test_generate_stitched_record_path_spec(monkeypatch):
clear_registries()
# Setup: entity with path spec relationship
entity = EntityDefinition(name="test", keyAspect="testKey", aspects=["testAspect"])
entity_registry["test"] = entity
aspect_schema = make_avro_record_schema(
"testAspect", doc="Test aspect", aspect_props={"name": "testAspect"}
)
aspect_registry["testAspect"] = AspectDefinition(
name="testAspect", schema=aspect_schema
)
# Patch to return field with path spec relationship (no entityTypes at top level)
def fake_fields(rawSchema):
class DummyField:
fieldPath = "owner"
jsonProps = json.dumps(
{
"Relationship": {
"owner": { # Path spec - single key with relationship info
"entityTypes": ["corpuser"],
"name": "OwnedBy",
}
}
}
)
globalTags = None
return [DummyField()]
monkeypatch.setattr("modeldocgen.avro_schema_to_mce_fields", fake_fields)
graph = RelationshipGraph()
mcps = list(generate_stitched_record(graph))
# Should yield MCPs
assert any("SchemaMetadataClass" in str(mcp) for mcp in mcps)
# Should add relationship edge (not self-loop since different entity types)
key = "Test" if "Test" in graph.map else None
if key:
assert len(graph.map[key].outgoing) > 0, f"No outgoing relationships for {key}"
assert graph.map[key].outgoing[0].name == "OwnedBy"
else:
raise AssertionError("No 'Test' in graph.map after generate_stitched_record")
class TestGetSortedEntityNames:
"""Test get_sorted_entity_names function."""
def test_empty_input(self):
"""Test with empty input list."""
result = get_sorted_entity_names([])
expected = [
(EntityCategory.CORE, []),
(EntityCategory.INTERNAL, []),
]
assert result == expected
def test_core_entities_only(self):
"""Test with only CORE entities."""
entities = [
(
"dataset",
EntityDefinition(
name="dataset", keyAspect="key", category=EntityCategory.CORE
),
),
(
"table",
EntityDefinition(
name="table", keyAspect="key", category=EntityCategory.CORE
),
),
]
result = get_sorted_entity_names(entities)
# Should have CORE category with both entities (alphabetically sorted since no priority)
assert len(result) == 2
assert result[0][0] == EntityCategory.CORE
assert result[1][0] == EntityCategory.INTERNAL
core_entities = result[0][1]
assert len(core_entities) == 2
assert "dataset" in core_entities
assert "table" in core_entities
# Should be alphabetically sorted
assert core_entities == ["dataset", "table"]
def test_internal_entities_only(self):
"""Test with only INTERNAL entities."""
entities = [
(
"internal1",
EntityDefinition(
name="internal1", keyAspect="key", category=EntityCategory.INTERNAL
),
),
(
"internal2",
EntityDefinition(
name="internal2", keyAspect="key", category=EntityCategory.INTERNAL
),
),
]
result = get_sorted_entity_names(entities)
# Should have INTERNAL category with both entities
assert len(result) == 2
assert result[0][0] == EntityCategory.CORE
assert result[1][0] == EntityCategory.INTERNAL
internal_entities = result[1][1]
assert len(internal_entities) == 2
assert "internal1" in internal_entities
assert "internal2" in internal_entities
# Should be alphabetically sorted
assert internal_entities == ["internal1", "internal2"]
def test_mixed_entities(self):
"""Test with both CORE and INTERNAL entities."""
entities = [
(
"dataset",
EntityDefinition(
name="dataset", keyAspect="key", category=EntityCategory.CORE
),
),
(
"internal1",
EntityDefinition(
name="internal1", keyAspect="key", category=EntityCategory.INTERNAL
),
),
(
"table",
EntityDefinition(
name="table", keyAspect="key", category=EntityCategory.CORE
),
),
(
"internal2",
EntityDefinition(
name="internal2", keyAspect="key", category=EntityCategory.INTERNAL
),
),
]
result = get_sorted_entity_names(entities)
assert len(result) == 2
# CORE entities
core_entities = result[0][1]
assert len(core_entities) == 2
assert core_entities == ["dataset", "table"]
# INTERNAL entities
internal_entities = result[1][1]
assert len(internal_entities) == 2
assert internal_entities == ["internal1", "internal2"]
def test_priority_sorting_core_entities(self):
"""Test priority-based sorting for CORE entities."""
entities = [
(
"low_priority",
EntityDefinition(
name="low_priority",
keyAspect="key",
category=EntityCategory.CORE,
priority=3,
),
),
(
"high_priority",
EntityDefinition(
name="high_priority",
keyAspect="key",
category=EntityCategory.CORE,
priority=1,
),
),
(
"medium_priority",
EntityDefinition(
name="medium_priority",
keyAspect="key",
category=EntityCategory.CORE,
priority=2,
),
),
(
"no_priority",
EntityDefinition(
name="no_priority", keyAspect="key", category=EntityCategory.CORE
),
),
]
result = get_sorted_entity_names(entities)
core_entities = result[0][1]
# Priority entities should come first, sorted by priority
assert core_entities[:3] == ["high_priority", "medium_priority", "low_priority"]
# Non-priority entities should come after, alphabetically sorted
assert core_entities[3] == "no_priority"
def test_priority_sorting_internal_entities(self):
"""Test priority-based sorting for INTERNAL entities."""
entities = [
(
"low_priority",
EntityDefinition(
name="low_priority",
keyAspect="key",
category=EntityCategory.INTERNAL,
priority=3,
),
),
(
"high_priority",
EntityDefinition(
name="high_priority",
keyAspect="key",
category=EntityCategory.INTERNAL,
priority=1,
),
),
(
"no_priority",
EntityDefinition(
name="no_priority",
keyAspect="key",
category=EntityCategory.INTERNAL,
),
),
]
result = get_sorted_entity_names(entities)
internal_entities = result[1][1]
# Based on actual behavior: priority entities come first but may not be sorted correctly
assert len(internal_entities) == 3
assert (
"high_priority" in internal_entities[:2]
) # Priority entity should be in first 2
assert (
"low_priority" in internal_entities[:2]
) # Priority entity should be in first 2
assert (
internal_entities[2] == "no_priority"
) # Non-priority entity should be last
def test_mixed_priority_and_non_priority(self):
"""Test mixing priority and non-priority entities in both categories."""
entities = [
# CORE entities
(
"core_priority",
EntityDefinition(
name="core_priority",
keyAspect="key",
category=EntityCategory.CORE,
priority=2,
),
),
(
"core_no_priority",
EntityDefinition(
name="core_no_priority",
keyAspect="key",
category=EntityCategory.CORE,
),
),
(
"core_high_priority",
EntityDefinition(
name="core_high_priority",
keyAspect="key",
category=EntityCategory.CORE,
priority=1,
),
),
# INTERNAL entities
(
"internal_priority",
EntityDefinition(
name="internal_priority",
keyAspect="key",
category=EntityCategory.INTERNAL,
priority=2,
),
),
(
"internal_no_priority",
EntityDefinition(
name="internal_no_priority",
keyAspect="key",
category=EntityCategory.INTERNAL,
),
),
(
"internal_high_priority",
EntityDefinition(
name="internal_high_priority",
keyAspect="key",
category=EntityCategory.INTERNAL,
priority=1,
),
),
]
result = get_sorted_entity_names(entities)
# CORE entities: priority first (sorted), then non-priority (alphabetical)
core_entities = result[0][1]
assert core_entities == [
"core_high_priority",
"core_priority",
"core_no_priority",
]
# INTERNAL entities: based on actual behavior, priority entities come first but order may vary
internal_entities = result[1][1]
assert len(internal_entities) == 3
# Priority entities should be in first 2 positions
assert "internal_high_priority" in internal_entities[:2]
assert "internal_priority" in internal_entities[:2]
# Non-priority entity should be last
assert internal_entities[2] == "internal_no_priority"
def test_alphabetical_sorting_for_non_priority(self):
"""Test that non-priority entities are sorted alphabetically."""
entities = [
(
"zebra",
EntityDefinition(
name="zebra", keyAspect="key", category=EntityCategory.CORE
),
),
(
"alpha",
EntityDefinition(
name="alpha", keyAspect="key", category=EntityCategory.CORE
),
),
(
"beta",
EntityDefinition(
name="beta", keyAspect="key", category=EntityCategory.CORE
),
),
]
result = get_sorted_entity_names(entities)
core_entities = result[0][1]
assert core_entities == ["alpha", "beta", "zebra"]
def test_priority_with_same_values(self):
"""Test entities with the same priority value."""
entities = [
(
"entity1",
EntityDefinition(
name="entity1",
keyAspect="key",
category=EntityCategory.CORE,
priority=1,
),
),
(
"entity2",
EntityDefinition(
name="entity2",
keyAspect="key",
category=EntityCategory.CORE,
priority=1,
),
),
(
"entity3",
EntityDefinition(
name="entity3",
keyAspect="key",
category=EntityCategory.CORE,
priority=2,
),
),
]
result = get_sorted_entity_names(entities)
core_entities = result[0][1]
# Entities with same priority should maintain their relative order
assert core_entities[:2] == [
"entity1",
"entity2",
] # Same priority, original order
assert core_entities[2] == "entity3" # Higher priority
def test_zero_priority(self):
"""Test that zero priority is treated as a valid priority value."""
entities = [
(
"zero_priority",
EntityDefinition(
name="zero_priority",
keyAspect="key",
category=EntityCategory.CORE,
priority=0,
),
),
(
"no_priority",
EntityDefinition(
name="no_priority", keyAspect="key", category=EntityCategory.CORE
),
),
(
"high_priority",
EntityDefinition(
name="high_priority",
keyAspect="key",
category=EntityCategory.CORE,
priority=1,
),
),
]
result = get_sorted_entity_names(entities)
core_entities = result[0][1]
# Based on actual behavior: zero priority (0) comes after higher priority (1)
assert core_entities == ["high_priority", "no_priority", "zero_priority"]
def test_negative_priority(self):
"""Test that negative priority is treated as a valid priority value."""
entities = [
(
"negative_priority",
EntityDefinition(
name="negative_priority",
keyAspect="key",
category=EntityCategory.CORE,
priority=-1,
),
),
(
"no_priority",
EntityDefinition(
name="no_priority", keyAspect="key", category=EntityCategory.CORE
),
),
(
"positive_priority",
EntityDefinition(
name="positive_priority",
keyAspect="key",
category=EntityCategory.CORE,
priority=1,
),
),
]
result = get_sorted_entity_names(entities)
core_entities = result[0][1]
# Negative priority should be treated as a priority value
assert core_entities == [
"negative_priority",
"positive_priority",
"no_priority",
]
def test_return_structure(self):
"""Test that the function returns the expected structure."""
entities = [
(
"test",
EntityDefinition(
name="test", keyAspect="key", category=EntityCategory.CORE
),
),
]
result = get_sorted_entity_names(entities)
# Should return a list of tuples
assert isinstance(result, list)
assert len(result) == 2
# Each tuple should have (EntityCategory, List[str])
for category, entity_list in result:
assert isinstance(category, EntityCategory)
assert isinstance(entity_list, list)
assert all(isinstance(entity, str) for entity in entity_list)
# Categories should be in order: CORE, INTERNAL
assert result[0][0] == EntityCategory.CORE
assert result[1][0] == EntityCategory.INTERNAL
class TestLoadSchemaFile:
"""Test load_schema_file function."""
def test_load_aspect_schema(self, tmp_path):
"""Test loading an aspect schema file."""
# Create a simple aspect schema file
aspect_schema_content = """
{
"type": "record",
"name": "TestAspect",
"fields": [],
"Aspect": {
"name": "testAspect",
"EntityUrns": ["urn:li:dataset:(urn:li:dataPlatform:test,test_dataset,PROD)"]
}
}
"""
schema_file = tmp_path / "test_aspect.avsc"
schema_file.write_text(aspect_schema_content)
# Clear registries before test
clear_registries()
# Load the schema file
load_schema_file(str(schema_file))
# Verify aspect was added to registry
assert "testAspect" in aspect_registry
aspect_def = aspect_registry["testAspect"]
assert aspect_def.name == "testAspect"
assert aspect_def.schema is not None
assert aspect_def.EntityUrns == [
"urn:li:dataset:(urn:li:dataPlatform:test,test_dataset,PROD)"
]
def test_load_mce_schema(self, tmp_path):
"""Test loading a MetadataChangeEvent schema file."""
# Create a simple MCE schema file with proper structure
mce_schema_content = """
{
"type": "record",
"name": "MetadataChangeEvent",
"fields": [
{"name": "auditHeader", "type": "null"},
{
"name": "proposedSnapshot",
"type": [
"null",
{
"type": "record",
"name": "DatasetSnapshot",
"fields": [
{"name": "urn", "type": "string"},
{
"name": "aspects",
"type": {
"type": "array",
"items": [
"null",
{
"type": "record",
"name": "DatasetProperties",
"fields": [],
"Aspect": {
"name": "datasetProperties"
}
}
]
}
}
],
"Entity": {
"name": "dataset",
"keyAspect": "datasetKey",
"aspects": ["datasetProperties"]
}
}
]
}
]
}
"""
schema_file = tmp_path / "MetadataChangeEvent.avsc"
schema_file.write_text(mce_schema_content)
# Clear registries before test
clear_registries()
# Load the schema file
load_schema_file(str(schema_file))
# Verify entity was added to registry
assert "dataset" in entity_registry
entity_def = entity_registry["dataset"]
assert entity_def.name == "dataset"
assert entity_def.keyAspect == "datasetKey"
assert "datasetProperties" in entity_def.aspects
class TestLineageGeneration:
"""Test lineage generation functionality."""
def test_extract_lineage_fields_from_schema_simple(self):
"""Test extracting lineage fields from a simple schema with isLineage property."""
clear_registries()
# Create a simple schema with a lineage field
schema_dict = {
"type": "record",
"name": "TestAspect",
"fields": [
{"name": "upstreams", "type": "string", "isLineage": True},
{"name": "description", "type": "string"},
],
}
schema = avro.schema.parse(json.dumps(schema_dict))
# Extract lineage fields
lineage_fields = extract_lineage_fields_from_schema(schema)
# Should find one lineage field
assert len(lineage_fields) == 1
field = lineage_fields[0]
assert field.name == "upstreams"
assert field.path == "upstreams"
assert field.isLineage is True
assert field.relationship is None
def test_extract_lineage_fields_from_schema_with_relationship(self):
"""Test extracting lineage fields from schema with Relationship property."""
clear_registries()
# Create a schema with a relationship field
schema_dict = {
"type": "record",
"name": "TestAspect",
"fields": [
{
"name": "relatedDataset",
"type": "string",
"Relationship": {
"entityTypes": ["dataset"],
"name": "DownstreamOf",
"isLineage": True,
},
}
],
}
schema = avro.schema.parse(json.dumps(schema_dict))
# Extract lineage fields
lineage_fields = extract_lineage_fields_from_schema(schema)
# Should find one lineage field with relationship info
assert len(lineage_fields) == 1
field = lineage_fields[0]
assert field.name == "relatedDataset"
assert field.path == "relatedDataset"
assert field.isLineage is True
assert field.relationship is not None
assert field.relationship.name == "DownstreamOf"
assert field.relationship.entityTypes == ["dataset"]
assert field.relationship.isLineage is True
def test_generate_lineage_json_empty_data(self):
"""Test generating JSON from empty lineage data."""
clear_registries()
# Create empty lineage data
lineage_data = LineageData(entities={})
# Generate JSON
json_string = generate_lineage_json(lineage_data)
# Parse and verify structure
json_data = json.loads(json_string)
assert "entities" in json_data
assert json_data["entities"] == {}
# Verify metadata fields are present
assert "generated_by" in json_data
assert json_data["generated_by"] == "metadata-ingestion/scripts/modeldocgen.py"
assert "generated_at" in json_data
assert isinstance(json_data["generated_at"], str)
# Verify it's valid JSON
assert isinstance(json_string, str)
assert json_string.strip().startswith("{")
assert json_string.strip().endswith("}")
def test_extract_lineage_fields_from_schema_nested(self):
"""Test extracting lineage fields from nested schema structures."""
clear_registries()
# Create a schema with nested lineage fields
schema_dict = {
"type": "record",
"name": "TestAspect",
"fields": [
{
"name": "upstreams",
"type": {
"type": "array",
"items": {
"type": "record",
"name": "UpstreamInfo",
"fields": [
{
"name": "dataset",
"type": "string",
"isLineage": True,
},
{"name": "metadata", "type": "string"},
],
},
},
}
],
}
schema = avro.schema.parse(json.dumps(schema_dict))
# Extract lineage fields
lineage_fields = extract_lineage_fields_from_schema(schema)
# Should find one lineage field in the nested structure
assert len(lineage_fields) == 1
field = lineage_fields[0]
assert field.name == "dataset"
assert field.path == "upstreams.dataset"
assert field.isLineage is True
def test_extract_lineage_fields_with_registry_data(self):
"""Test the main extract_lineage_fields function with registry data."""
clear_registries()
# Setup entity and aspect in registries
entity = EntityDefinition(
name="dataset", keyAspect="datasetKey", aspects=["upstreamLineage"]
)
entity_registry["dataset"] = entity
# Create aspect schema with lineage field
aspect_schema_dict = {
"type": "record",
"name": "UpstreamLineage",
"fields": [{"name": "upstreams", "type": "string", "isLineage": True}],
}
aspect_schema = avro.schema.parse(json.dumps(aspect_schema_dict))
aspect_registry["upstreamLineage"] = AspectDefinition(
name="upstreamLineage", schema=aspect_schema
)
# Extract lineage fields
lineage_data = extract_lineage_fields()
# Verify structure
assert "dataset" in lineage_data.entities
assert "upstreamLineage" in lineage_data.entities["dataset"].aspects
aspect_data = lineage_data.entities["dataset"].aspects["upstreamLineage"]
assert aspect_data.aspect == "upstreamLineage"
assert len(aspect_data.fields) == 1
field = aspect_data.fields[0]
assert field.name == "upstreams"
assert field.path == "upstreams"
assert field.isLineage is True