OpenMetadata/ingestion/tests/unit/lineage/test_cross_database_lineage_sql.py
Mohit Tilala 9b2b4d2452
[Lineage] Fix cross services lineage changes of service_names to missed methods (#23240)
* Fix cross db changes of service_names to missed methods

* Handle string value passed to service_names
2025-09-04 20:38:05 +05:30

731 lines
31 KiB
Python

# Copyright 2025 Collate
# Licensed under the Collate Community License, Version 1.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# https://github.com/open-metadata/OpenMetadata/blob/main/ingestion/LICENSE
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Test cross database lineage functionality in SQL lineage module
"""
import uuid
from datetime import datetime
from unittest import TestCase
from unittest.mock import MagicMock, patch
from metadata.generated.schema.entity.data.storedProcedure import (
StoredProcedure,
StoredProcedureCode,
)
from metadata.generated.schema.entity.data.table import Table
from metadata.generated.schema.type.entityReference import EntityReference
from metadata.ingestion.lineage.models import Dialect
from metadata.ingestion.lineage.sql_lineage import (
get_lineage_by_query,
get_lineage_via_table_entity,
get_table_entities_from_query,
search_table_entities,
)
from metadata.ingestion.source.database.lineage_processors import (
_yield_procedure_lineage,
)
from metadata.ingestion.source.database.stored_procedures_mixin import (
QueryByProcedure,
StoredProcedureLineageMixin,
)
from metadata.ingestion.source.models import TableView
from metadata.utils.db_utils import get_view_lineage
class CrossDatabaseLineageSQLTest(TestCase):
    """
    Test cross database lineage functionality in SQL lineage module

    Most entry points are exercised twice: once with ``service_names`` given
    as a single string (backward compatibility) and once as a list of service
    names (cross-database). The metadata client, ``LineageParser`` and the
    table lookup helpers are replaced by mocks, so these tests pin how the
    arguments are handled rather than any real SQL parsing.
    """

    # Dotted paths of the modules whose attributes get patched
    _SQL_LINEAGE = "metadata.ingestion.lineage.sql_lineage"
    _DB_UTILS = "metadata.utils.db_utils"

    # Statements shared by several tests
    _CTAS_QUERY = "CREATE TABLE target AS SELECT * FROM source"
    _VIEW_DEFINITION = "CREATE VIEW test_view AS SELECT * FROM source_table"

    def setUp(self):
        """Set up test fixtures"""
        self.mock_metadata = MagicMock()
        # Two tables with the same name living in different services
        self.mock_table1 = self._make_table("service1", "db1", "schema1")
        self.mock_table2 = self._make_table("service2", "db2", "schema2")

    # ------------------------------------------------------------------
    # Helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _make_table(service, database, schema):
        """Build a ``test_table`` Table entity with ``id`` and ``name`` columns"""
        table_fqn = f"{service}.{database}.{schema}.test_table"
        return Table(
            id=uuid.uuid4(),
            name="test_table",
            fullyQualifiedName=table_fqn,
            columns=[
                {
                    "name": "id",
                    "dataType": "NUMBER",
                    "fullyQualifiedName": f"{table_fqn}.id",
                },
                {
                    "name": "name",
                    "dataType": "VARCHAR",
                    "fullyQualifiedName": f"{table_fqn}.name",
                },
            ],
        )

    @staticmethod
    def _make_parser(masked_query, **attributes):
        """
        Build the object a patched ``LineageParser`` class will return.

        ``column_lineage`` is empty and ``query_parsing_success`` is True.
        Table lists are only set when passed through ``attributes``; anything
        not passed stays an auto-created ``MagicMock`` attribute.
        """
        return MagicMock(
            masked_query=masked_query,
            column_lineage=[],
            query_parsing_success=True,
            **attributes,
        )

    def _patch(self, target, **kwargs):
        """
        Start ``patch(target, **kwargs)`` and return the object put in place.

        The patch is undone via ``addCleanup`` when the test finishes, which
        keeps test bodies flat instead of nesting ``with`` blocks.
        """
        patcher = patch(target, **kwargs)
        replacement = patcher.start()
        self.addCleanup(patcher.stop)
        return replacement

    def _patch_view_parsing(self, view_fqn):
        """Mock what the view lineage tests share: entity lookup, FQN and parser"""
        self.mock_metadata.get_by_name.return_value = self.mock_table1
        self._patch(f"{self._DB_UTILS}.fqn.build", return_value=view_fqn)
        self._patch(
            f"{self._DB_UTILS}.LineageParser",
            return_value=self._make_parser(
                self._VIEW_DEFINITION,
                source_tables=["source_table"],
                target_tables=["test_view"],
            ),
        )

    def _make_view(self):
        """Build a real ``TableView`` for ``db1.schema1.test_view``"""
        return TableView(
            table_name="test_view",
            schema_name="schema1",
            db_name="db1",
            view_definition=self._VIEW_DEFINITION,
        )

    def _check_get_table_entities_from_query(
        self, service_names, database, schema, expected_table
    ):
        """
        Call ``get_table_entities_from_query`` with ``search_table_entities``
        mocked, then verify both the returned tables and the forwarded kwargs.
        """
        mock_search = self._patch(
            f"{self._SQL_LINEAGE}.search_table_entities",
            return_value=[expected_table],
        )

        result = get_table_entities_from_query(
            metadata=self.mock_metadata,
            service_names=service_names,
            database_name=database,
            database_schema=schema,
            table_name="test_table",
        )

        self.assertEqual(result, [expected_table])
        mock_search.assert_called_with(
            metadata=self.mock_metadata,
            service_names=service_names,
            database=database,
            database_schema=schema,
            table="test_table",
        )

    def _lineage_by_query_without_sources(self, service_names):
        """Run ``get_lineage_by_query`` with a parser that reports no tables"""
        self._patch(
            f"{self._SQL_LINEAGE}.LineageParser",
            return_value=self._make_parser(
                "SELECT * FROM test",
                intermediate_tables=[],
                source_tables=[],
                target_tables=[],
            ),
        )
        self._patch(f"{self._SQL_LINEAGE}.get_source_table_names", return_value=[])
        return list(
            get_lineage_by_query(
                metadata=self.mock_metadata,
                service_names=service_names,
                database_name="db1",
                schema_name="schema1",
                query="SELECT * FROM test",
                dialect=Dialect.ANSI,
            )
        )

    # ------------------------------------------------------------------
    # search_table_entities
    # ------------------------------------------------------------------
    def test_search_table_entities_single_service(self):
        """Test search_table_entities with single service (backward compatibility)"""
        self.mock_metadata.es_search_from_fqn.return_value = [self.mock_table1]

        # service_names is deliberately a plain string here
        result = search_table_entities(
            metadata=self.mock_metadata,
            service_names="service1",
            database="db1",
            database_schema="schema1",
            table="test_table",
        )

        self.assertEqual(result, [self.mock_table1])
        self.mock_metadata.es_search_from_fqn.assert_called()

    def test_search_table_entities_multiple_services(self):
        """Test search_table_entities with multiple services (cross-database)"""
        # First ES lookup finds nothing, the second one yields the table
        self.mock_metadata.es_search_from_fqn.side_effect = [None, [self.mock_table2]]
        self.mock_metadata.get_by_name.return_value = self.mock_table2
        # fqn.build gives no FQN on its first call and one FQN on its second
        self._patch(
            f"{self._SQL_LINEAGE}.fqn.build",
            side_effect=[[], ["service2.db2.schema2.test_table"]],
        )

        result = search_table_entities(
            metadata=self.mock_metadata,
            service_names=["service1", "service2"],
            database="db2",
            database_schema="schema2",
            table="test_table",
        )

        # The table of the second service is returned ...
        self.assertEqual(result, [self.mock_table2])
        # ... after one ES lookup per service
        self.assertEqual(self.mock_metadata.es_search_from_fqn.call_count, 2)

    def test_search_table_entities_no_results(self):
        """Test search_table_entities when no tables are found in any service"""
        self.mock_metadata.es_search_from_fqn.return_value = None
        self._patch(f"{self._SQL_LINEAGE}.fqn.build", return_value=[])

        result = search_table_entities(
            metadata=self.mock_metadata,
            service_names=["service1", "service2"],
            database="db1",
            database_schema="schema1",
            table="nonexistent_table",
        )

        self.assertIsNone(result)

    # ------------------------------------------------------------------
    # get_table_entities_from_query
    # ------------------------------------------------------------------
    def test_get_table_entities_from_query_single_service(self):
        """Test get_table_entities_from_query with single service (backward compatibility)"""
        self._check_get_table_entities_from_query(
            service_names="service1",
            database="db1",
            schema="schema1",
            expected_table=self.mock_table1,
        )

    def test_get_table_entities_from_query_multiple_services(self):
        """Test get_table_entities_from_query with multiple services (cross-database)"""
        self._check_get_table_entities_from_query(
            service_names=["service1", "service2"],
            database="db2",
            schema="schema2",
            expected_table=self.mock_table2,
        )

    # ------------------------------------------------------------------
    # get_lineage_by_query
    # ------------------------------------------------------------------
    def test_get_lineage_by_query_single_service(self):
        """Test get_lineage_by_query with single service (backward compatibility)"""
        result = self._lineage_by_query_without_sources("service1")

        # Verify no lineage is generated (empty source tables)
        self.assertEqual(len(result), 0)

    def test_get_lineage_by_query_multiple_services(self):
        """Test get_lineage_by_query with multiple services (cross-database)"""
        result = self._lineage_by_query_without_sources(["service1", "service2"])

        # Verify no lineage is generated (empty source tables)
        self.assertEqual(len(result), 0)

    def test_get_lineage_by_query_with_source_tables(self):
        """Test get_lineage_by_query with actual source tables (query lineage)"""
        self._patch(
            f"{self._SQL_LINEAGE}.LineageParser",
            return_value=self._make_parser(
                self._CTAS_QUERY,
                intermediate_tables=[],
                source_tables=["source"],
                target_tables=["target"],
            ),
        )
        self._patch(
            f"{self._SQL_LINEAGE}.get_source_table_names",
            return_value=[("", "source_table")],
        )
        mock_search = self._patch(
            f"{self._SQL_LINEAGE}.search_table_entities",
            return_value=[self.mock_table1],
        )

        result = list(
            get_lineage_by_query(
                metadata=self.mock_metadata,
                service_names=["service1", "service2"],
                database_name="db1",
                schema_name="schema1",
                query=self._CTAS_QUERY,
                dialect=Dialect.ANSI,
            )
        )

        # NOTE(review): always true for a list(); the real check is the lookup below
        self.assertIsInstance(result, list)
        mock_search.assert_called()

    # ------------------------------------------------------------------
    # get_lineage_via_table_entity
    # ------------------------------------------------------------------
    def test_get_lineage_via_table_entity_single_service(self):
        """Test get_lineage_via_table_entity with single service (backward compatibility)"""
        self._patch(
            f"{self._SQL_LINEAGE}.LineageParser",
            return_value=self._make_parser(
                "SELECT * FROM source", source_tables=["source"]
            ),
        )
        self._patch(f"{self._SQL_LINEAGE}.get_source_table_names", return_value=[])

        result = list(
            get_lineage_via_table_entity(
                metadata=self.mock_metadata,
                table_entity=self.mock_table1,
                service_names="service1",
                database_name="db1",
                schema_name="schema1",
                query="SELECT * FROM source",
                dialect=Dialect.ANSI,
            )
        )

        # Smoke check only: the call completes without raising
        self.assertIsInstance(result, list)

    def test_get_lineage_via_table_entity_multiple_services(self):
        """Test get_lineage_via_table_entity with multiple services (cross-database)"""
        self._patch(
            f"{self._SQL_LINEAGE}.LineageParser",
            return_value=self._make_parser(
                "SELECT * FROM source", source_tables=["source"]
            ),
        )
        self._patch(
            f"{self._SQL_LINEAGE}.get_source_table_names",
            return_value=[("", "source_table")],
        )
        # The lookup resolves to the table of the second service
        mock_search = self._patch(
            f"{self._SQL_LINEAGE}.search_table_entities",
            return_value=[self.mock_table2],
        )

        result = list(
            get_lineage_via_table_entity(
                metadata=self.mock_metadata,
                table_entity=self.mock_table1,
                service_names=["service1", "service2"],
                database_name="db1",
                schema_name="schema1",
                query="SELECT * FROM source",
                dialect=Dialect.ANSI,
            )
        )

        self.assertIsInstance(result, list)
        # Only the fact of the call is checked; the exact parameters may vary
        mock_search.assert_called()

    # ------------------------------------------------------------------
    # get_view_lineage
    # ------------------------------------------------------------------
    def test_get_view_lineage_single_service(self):
        """Test get_view_lineage with single service (backward compatibility)"""
        self._patch_view_parsing("service1.db1.schema1.test_view")
        # Patched in the sql_lineage module, not in db_utils
        self._patch(f"{self._SQL_LINEAGE}.get_source_table_names", return_value=[])

        result = list(
            get_view_lineage(
                view=self._make_view(),
                metadata=self.mock_metadata,
                service_names="service1",
                connection_type="snowflake",
            )
        )

        # Smoke check only: the call completes without raising
        self.assertIsInstance(result, list)

    def test_get_view_lineage_multiple_services(self):
        """Test get_view_lineage with multiple services (cross-database view lineage)"""
        self._patch_view_parsing("service1.db1.schema1.test_view")
        # Patched on db_utils because that is where get_view_lineage looks it up
        mock_get_lineage = self._patch(
            f"{self._DB_UTILS}.get_lineage_by_query", return_value=[]
        )

        result = list(
            get_view_lineage(
                view=self._make_view(),
                metadata=self.mock_metadata,
                service_names=["service1", "service2"],
                connection_type="snowflake",
            )
        )

        self.assertIsInstance(result, list)
        mock_get_lineage.assert_called()
        # service_names must reach get_lineage_by_query unchanged, as a list
        self.assertEqual(
            mock_get_lineage.call_args.kwargs["service_names"],
            ["service1", "service2"],
        )

    def test_get_view_lineage_with_postgres_schema_fallback(self):
        """Test get_view_lineage with Postgres schema fallback"""
        # A mocked view, so that schema_name can be left unset
        mock_view = MagicMock()
        mock_view.table_name = "test_view"
        mock_view.schema_name = None
        mock_view.db_name = "db1"
        mock_view.view_definition = self._VIEW_DEFINITION

        self._patch_view_parsing("service1.db1.public.test_view")
        mock_get_lineage = self._patch(
            f"{self._DB_UTILS}.get_lineage_by_query", return_value=[]
        )

        result = list(
            get_view_lineage(
                view=mock_view,
                metadata=self.mock_metadata,
                service_names=["service1", "service2"],
                connection_type="postgres",
            )
        )

        self.assertIsInstance(result, list)
        mock_get_lineage.assert_called()

    # ------------------------------------------------------------------
    # Stored procedures
    # ------------------------------------------------------------------
    def test_stored_procedure_lineage_cross_database(self):
        """Test stored procedure lineage with cross-database support"""
        mock_procedure = StoredProcedure(
            id=uuid.uuid4(),
            name="test_procedure",
            fullyQualifiedName="service1.db1.schema1.test_procedure",
            storedProcedureCode=StoredProcedureCode(
                code="CREATE PROCEDURE test_procedure() BEGIN SELECT * FROM source; END",
                language="SQL",
            ),
            database=EntityReference(id=uuid.uuid4(), type="database"),
            databaseSchema=EntityReference(id=uuid.uuid4(), type="databaseSchema"),
            service=EntityReference(id=uuid.uuid4(), type="databaseService"),
        )
        mock_query = QueryByProcedure(
            procedure_name="test_procedure",
            query_type="CREATE_TABLE_AS_SELECT",
            query_database_name="db1",
            query_schema_name="schema1",
            procedure_text="CALL test_procedure()",
            procedure_start_time=datetime.now(),
            procedure_end_time=datetime.now(),
            query_text=self._CTAS_QUERY,
        )

        class MockStoredProcedureMixin(StoredProcedureLineageMixin):
            """Minimal concrete mixin that only carries the configuration used below"""

            def __init__(self, mock_metadata):
                self.metadata = mock_metadata
                self.service_name = "service1"
                self.source_config = MagicMock(
                    processCrossDatabaseLineage=True,
                    crossDatabaseServiceNames=["service2"],
                    parsingTimeoutLimit=30,
                    enableTempTableLineage=False,
                )
                self.service_connection = MagicMock()
                self.service_connection.type.value = "mysql"
                self.stored_procedure_query_lineage = False
                self.procedure_graph_map = {}
                self.status = MagicMock()
                self.dialect = MagicMock()

            def get_stored_procedure_sql_statement(self):
                """Mock implementation of abstract method"""
                return ""

        mixin = MockStoredProcedureMixin(self.mock_metadata)

        self._patch(
            f"{self._SQL_LINEAGE}.LineageParser",
            return_value=self._make_parser(
                self._CTAS_QUERY,
                intermediate_tables=[],
                source_tables=["source"],
                target_tables=["target"],
            ),
        )
        self._patch(f"{self._SQL_LINEAGE}.get_source_table_names", return_value=[])

        result = list(
            _yield_procedure_lineage(
                metadata=mixin.metadata,
                service_name=mixin.service_name,
                dialect=mixin.dialect,
                processCrossDatabaseLineage=mixin.source_config.processCrossDatabaseLineage,
                crossDatabaseServiceNames=mixin.source_config.crossDatabaseServiceNames,
                parsingTimeoutLimit=mixin.source_config.parsingTimeoutLimit,
                query_by_procedure=mock_query,
                procedure=mock_procedure,
                procedure_graph_map=mixin.procedure_graph_map,
                enableTempTableLineage=mixin.source_config.enableTempTableLineage,
            )
        )

        # What gets yielded depends on the mocked dependencies, so only
        # verify that the generator can be consumed without errors
        self.assertIsInstance(result, list)

    # ------------------------------------------------------------------
    # get_source_table_names / build_es_fqn_search_string
    # ------------------------------------------------------------------
    def test_build_es_fqn_search_string_kwargs(self):
        """
        Test that build_es_fqn_search_string is called with keyword arguments
        and handles service_names list correctly via get_source_table_names
        """
        from collate_sqllineage.core.models import DataFunction

        from metadata.ingestion.lineage.sql_lineage import get_source_table_names

        mock_metadata = MagicMock()
        mock_metadata.es_search_from_fqn.return_value = None
        # A DataFunction source triggers the UDF processing
        source_table = DataFunction("test_function")

        # (service_names argument, service_name expected in the ES search string)
        scenarios = (
            # A list is the bug scenario: its first service must be used
            (["service1", "service2"], "service1"),
            # A plain string must be passed through as is
            ("single_service", "single_service"),
        )
        for service_names, expected_service_name in scenarios:
            with self.subTest(service_names=service_names), patch(
                f"{self._SQL_LINEAGE}.build_es_fqn_search_string",
                return_value="test.fqn.string",
            ) as mock_build:
                # The generator has to be drained for the call to happen
                list(
                    get_source_table_names(
                        metadata=mock_metadata,
                        dialect=Dialect.ANSI,
                        source_table=source_table,
                        database_name="test_db",
                        schema_name="test_schema",
                        service_names=service_names,
                        timeout_seconds=30,
                        column_lineage={},
                        procedure=None,
                    )
                )

                mock_build.assert_called_with(
                    database_name="test_db",
                    schema_name="test_schema",
                    service_name=expected_service_name,
                    table_name="test_function",
                )