MINOR: fix Kafka connect CDC lineage (#23836)

This commit is contained in:
Ayush Shah 2025-10-11 15:40:03 +05:30 committed by GitHub
parent d4a9381473
commit a90cacc93b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 209 additions and 10 deletions

View File

@@ -39,6 +39,7 @@ from metadata.ingestion.models.custom_pydantic import BaseModel
from metadata.ingestion.ometa.client import REST, APIError from metadata.ingestion.ometa.client import REST, APIError
from metadata.ingestion.ometa.utils import quote from metadata.ingestion.ometa.utils import quote
from metadata.ingestion.source.models import TableView from metadata.ingestion.source.models import TableView
from metadata.utils import fqn
from metadata.utils.elasticsearch import ES_INDEX_MAP, get_entity_from_es_result from metadata.utils.elasticsearch import ES_INDEX_MAP, get_entity_from_es_result
from metadata.utils.execution_time_tracker import calculate_execution_time_generator from metadata.utils.execution_time_tracker import calculate_execution_time_generator
from metadata.utils.logger import ometa_logger from metadata.utils.logger import ometa_logger
@@ -521,10 +522,13 @@ class ESMixin(Generic[T]):
fetch table from es when with/without `db_service_name` fetch table from es when with/without `db_service_name`
""" """
try: try:
prepended_fqn = fqn.prefix_entity_for_wildcard_search(
entity_type=entity_type, fqn=fqn_search_string
)
entity_result = get_entity_from_es_result( entity_result = get_entity_from_es_result(
entity_list=self.es_search_from_fqn( entity_list=self.es_search_from_fqn(
entity_type=entity_type, entity_type=entity_type,
fqn_search_string=fqn_search_string, fqn_search_string=prepended_fqn,
), ),
fetch_multiple_entities=fetch_multiple_entities, fetch_multiple_entities=fetch_multiple_entities,
) )

View File

@@ -110,16 +110,16 @@ class ConnectorConfigKeys:
"db.name", "db.name",
"snowflake.database.name", "snowflake.database.name",
"database.include.list", "database.include.list",
"database.hostname", # "database.hostname",
"connection.url", # "connection.url",
"database.dbname", "database.dbname",
"topic.prefix", "topic.prefix",
"database.server.name", # Debezium V1 # "database.server.name", # This maps the server name, not the actual database
"databases.include", "databases.include",
"database.names", "database.names",
"snowflake.database", "snowflake.database",
"connection.host", # "connection.host",
"database.exclude.list", # "database.exclude.list",
] ]
CONTAINER_KEYS = [ CONTAINER_KEYS = [

View File

@@ -182,7 +182,7 @@ class KafkaconnectSource(PipelineServiceSource):
) )
# Build search string: schema.table format # Build search string: schema.table format
search_string = ( search_string = (
f"{dataset_details.database}.{dataset_details.table}" f"{fqn.quote_name(dataset_details.database)}.{fqn.quote_name(dataset_details.table)}"
if dataset_details.database if dataset_details.database
else dataset_details.table else dataset_details.table
) )

View File

@@ -13,10 +13,12 @@ Handle FQN building and splitting logic.
Filter information has been taken from the Filter information has been taken from the
ES indexes definitions ES indexes definitions
""" """
from __future__ import annotations
import hashlib import hashlib
import re import re
import traceback import traceback
from typing import Dict, List, Optional, Type, TypeVar, Union from typing import TYPE_CHECKING, Dict, List, Optional, Type, TypeVar, Union
from antlr4.CommonTokenStream import CommonTokenStream from antlr4.CommonTokenStream import CommonTokenStream
from antlr4.error.ErrorStrategy import BailErrorStrategy from antlr4.error.ErrorStrategy import BailErrorStrategy
@@ -51,11 +53,13 @@ from metadata.generated.schema.entity.teams.team import Team
from metadata.generated.schema.entity.teams.user import User from metadata.generated.schema.entity.teams.user import User
from metadata.generated.schema.tests.testCase import TestCase from metadata.generated.schema.tests.testCase import TestCase
from metadata.generated.schema.tests.testSuite import TestSuite from metadata.generated.schema.tests.testSuite import TestSuite
from metadata.ingestion.ometa.ometa_api import OpenMetadata
from metadata.utils.dispatch import class_register from metadata.utils.dispatch import class_register
from metadata.utils.elasticsearch import get_entity_from_es_result from metadata.utils.elasticsearch import get_entity_from_es_result
from metadata.utils.logger import utils_logger from metadata.utils.logger import utils_logger
if TYPE_CHECKING:
from metadata.ingestion.ometa.ometa_api import OpenMetadata
logger = utils_logger() logger = utils_logger()
T = TypeVar("T", bound=BaseModel) T = TypeVar("T", bound=BaseModel)
@@ -866,3 +870,56 @@ def get_query_checksum(query: str) -> str:
The checksum is used as the query's name. The checksum is used as the query's name.
""" """
return hashlib.md5(query.encode()).hexdigest() return hashlib.md5(query.encode()).hexdigest()
# Not adding container since children can have recursive slots: service.container1.container2...
# Maps an entity class name to the number of dot-separated parts ("slots") that a
# complete FQN for that entity type contains. Used by
# prefix_entity_for_wildcard_search to know how many wildcard levels to prepend.
FQN_ENTITY_SLOTS = {
    Table.__name__: 4,  # service.database.schema.table
    DatabaseSchema.__name__: 3,  # service.database.schema
    Database.__name__: 2,  # service.database
    Dashboard.__name__: 2,
    APICollection.__name__: 2,
    Chart.__name__: 2,
    MlModel.__name__: 2,
    Topic.__name__: 2,
    SearchIndex.__name__: 2,
    Tag.__name__: 2,  # classification.tag
    DataModel.__name__: 2,
    StoredProcedure.__name__: 4,  # service.database.schema.storedProcedure
    Pipeline.__name__: 2,
}
def prefix_entity_for_wildcard_search(entity_type: Type[T], fqn: str) -> str:
    """
    Given an entity type and an FQN, return the FQN prefixed with wildcards
    to match any parent hierarchy leading to that entity.

    For example, for a Topic with FQN "potato", return "*.potato" to match
    the topic in any service. For a Table with FQN "schema.table", return
    "*.*.schema.table" to match the table in any service and database.

    Args:
        entity_type: The entity type to match.
        fqn: The FQN to prefix.

    Returns:
        The prefixed FQN with wildcards for missing parent levels.

    Raises:
        FQNBuildingException: If the entity type has no slot count defined in
            FQN_ENTITY_SLOTS, or the FQN has more parts than the entity allows.
    """
    slots = FQN_ENTITY_SLOTS.get(entity_type.__name__)
    if not slots:
        raise FQNBuildingException(
            f"Entity type {entity_type.__name__} not supported for wildcard search"
        )

    parts = split(fqn)
    if len(parts) > slots:
        # Trailing space matters: these adjacent literals are concatenated,
        # otherwise the message reads "...parts (5)for entity type...".
        raise FQNBuildingException(
            f"FQN {fqn} has too many parts ({len(parts)}) "
            f"for entity type {entity_type.__name__} (expected {slots} or fewer)"
        )

    # Add wildcards for missing parent levels
    wildcards_needed = slots - len(parts)
    prefixed_parts = ["*"] * wildcards_needed + parts
    return _build(*prefixed_parts, quote=True)

View File

@@ -16,7 +16,18 @@ from unittest.mock import MagicMock
import pytest import pytest
from metadata.generated.schema.entity.data.table import Column, Table from metadata.generated.schema.entity.classification.tag import Tag
from metadata.generated.schema.entity.data.apiCollection import APICollection
from metadata.generated.schema.entity.data.chart import Chart
from metadata.generated.schema.entity.data.dashboard import Dashboard
from metadata.generated.schema.entity.data.database import Database
from metadata.generated.schema.entity.data.databaseSchema import DatabaseSchema
from metadata.generated.schema.entity.data.mlmodel import MlModel
from metadata.generated.schema.entity.data.pipeline import Pipeline
from metadata.generated.schema.entity.data.searchIndex import SearchIndex
from metadata.generated.schema.entity.data.storedProcedure import StoredProcedure
from metadata.generated.schema.entity.data.table import Column, DataModel, Table
from metadata.generated.schema.entity.data.topic import Topic
from metadata.generated.schema.type.basic import FullyQualifiedEntityName from metadata.generated.schema.type.basic import FullyQualifiedEntityName
from metadata.ingestion.models.custom_basemodel_validation import ( from metadata.ingestion.models.custom_basemodel_validation import (
RESERVED_ARROW_KEYWORD, RESERVED_ARROW_KEYWORD,
@@ -288,3 +299,130 @@ class TestFqn(TestCase):
) )
expected3 = f"bigquery.my-project.dataset.events_2024${RESERVED_QUOTE_KEYWORD}daily{RESERVED_QUOTE_KEYWORD}" expected3 = f"bigquery.my-project.dataset.events_2024${RESERVED_QUOTE_KEYWORD}daily{RESERVED_QUOTE_KEYWORD}"
self.assertEqual(result3, expected3) self.assertEqual(result3, expected3)
def test_prefix_entity_for_wildcard_search(self):
"""Test wildcard search prefix generation for all supported entity types"""
# Table (4 slots: service.database.schema.table)
# Full FQN - no wildcards needed
table_fqn = "my_service.my_db.my_schema.my_table"
result = fqn.prefix_entity_for_wildcard_search(Table, table_fqn)
self.assertEqual(result, "my_service.my_db.my_schema.my_table")
# Table with partial FQN - needs wildcards
table_fqn_partial = "my_schema.my_table"
result = fqn.prefix_entity_for_wildcard_search(Table, table_fqn_partial)
self.assertEqual(result, "*.*.my_schema.my_table")
# Table with just table name - needs all wildcards
table_fqn_minimal = "my_table"
result = fqn.prefix_entity_for_wildcard_search(Table, table_fqn_minimal)
self.assertEqual(result, "*.*.*.my_table")
# Table with quoted parts
table_fqn_quoted = '"my.schema".my_table'
result_quoted = fqn.prefix_entity_for_wildcard_search(Table, table_fqn_quoted)
self.assertEqual(result_quoted, '*.*."my.schema".my_table')
# DatabaseSchema (3 slots: service.database.schema)
schema_fqn = "public"
result = fqn.prefix_entity_for_wildcard_search(DatabaseSchema, schema_fqn)
self.assertEqual(result, "*.*.public")
schema_fqn_full = "postgres_service.analytics_db.public"
result = fqn.prefix_entity_for_wildcard_search(DatabaseSchema, schema_fqn_full)
self.assertEqual(result, "postgres_service.analytics_db.public")
# Database (2 slots: service.database)
database_fqn = "production_db"
result = fqn.prefix_entity_for_wildcard_search(Database, database_fqn)
self.assertEqual(result, "*.production_db")
database_fqn_full = "mysql_service.production_db"
result = fqn.prefix_entity_for_wildcard_search(Database, database_fqn_full)
self.assertEqual(result, "mysql_service.production_db")
# Dashboard (2 slots: service.dashboard)
dashboard_fqn = "sales_dashboard"
result = fqn.prefix_entity_for_wildcard_search(Dashboard, dashboard_fqn)
self.assertEqual(result, "*.sales_dashboard")
# APICollection (2 slots: service.collection)
api_collection_fqn = "users_api"
result = fqn.prefix_entity_for_wildcard_search(
APICollection, api_collection_fqn
)
self.assertEqual(result, "*.users_api")
# Chart (2 slots: service.chart)
chart_fqn = "revenue_chart"
result = fqn.prefix_entity_for_wildcard_search(Chart, chart_fqn)
self.assertEqual(result, "*.revenue_chart")
# MlModel (2 slots: service.model)
mlmodel_fqn = "fraud_detection_model"
result = fqn.prefix_entity_for_wildcard_search(MlModel, mlmodel_fqn)
self.assertEqual(result, "*.fraud_detection_model")
# Topic (2 slots: service.topic)
topic_fqn = "potato"
result = fqn.prefix_entity_for_wildcard_search(Topic, topic_fqn)
self.assertEqual(result, "*.potato")
topic_fqn_full = "kafka.user_events"
result = fqn.prefix_entity_for_wildcard_search(Topic, topic_fqn_full)
self.assertEqual(result, "kafka.user_events")
# SearchIndex (2 slots: service.index)
search_index_fqn = "product_index"
result = fqn.prefix_entity_for_wildcard_search(SearchIndex, search_index_fqn)
self.assertEqual(result, "*.product_index")
# Tag (2 slots: classification.tag)
tag_fqn = "Sensitive"
result = fqn.prefix_entity_for_wildcard_search(Tag, tag_fqn)
self.assertEqual(result, "*.Sensitive")
tag_fqn_full = "PII.Sensitive"
result = fqn.prefix_entity_for_wildcard_search(Tag, tag_fqn_full)
self.assertEqual(result, "PII.Sensitive")
# DataModel (2 slots: service.model)
data_model_fqn = "customer_model"
result = fqn.prefix_entity_for_wildcard_search(DataModel, data_model_fqn)
self.assertEqual(result, "*.customer_model")
# StoredProcedure (4 slots: service.database.schema.procedure)
stored_proc_fqn = "calculate_revenue"
result = fqn.prefix_entity_for_wildcard_search(StoredProcedure, stored_proc_fqn)
self.assertEqual(result, "*.*.*.calculate_revenue")
stored_proc_fqn_partial = "public.calculate_revenue"
result = fqn.prefix_entity_for_wildcard_search(
StoredProcedure, stored_proc_fqn_partial
)
self.assertEqual(result, "*.*.public.calculate_revenue")
stored_proc_fqn_full = "oracle.sales_db.public.calculate_revenue"
result = fqn.prefix_entity_for_wildcard_search(
StoredProcedure, stored_proc_fqn_full
)
self.assertEqual(result, "oracle.sales_db.public.calculate_revenue")
# Pipeline (2 slots: service.pipeline)
pipeline_fqn = "daily_ingestion"
result = fqn.prefix_entity_for_wildcard_search(Pipeline, pipeline_fqn)
self.assertEqual(result, "*.daily_ingestion")
# Test error cases
# FQN with too many parts
with pytest.raises(fqn.FQNBuildingException) as exc:
fqn.prefix_entity_for_wildcard_search(
Table, "service.db.schema.table.extra"
)
assert "has too many parts" in str(exc.value)
# Test unsupported entity type (Column doesn't have slots defined)
with pytest.raises(fqn.FQNBuildingException) as exc:
fqn.prefix_entity_for_wildcard_search(Column, "column")
assert "not supported for wildcard search" in str(exc.value)