mirror of
https://github.com/datahub-project/datahub.git
synced 2025-09-02 05:43:17 +00:00
feat(ingest): add support for jdbc connector to kafka-connect source (#3076)
This commit is contained in:
parent
1ad1243371
commit
7abcb45f8c
@ -74,7 +74,7 @@ plugins: Dict[str, Set[str]] = {
|
|||||||
"airflow": {"apache-airflow >= 1.10.2"},
|
"airflow": {"apache-airflow >= 1.10.2"},
|
||||||
# Source plugins
|
# Source plugins
|
||||||
"kafka": kafka_common,
|
"kafka": kafka_common,
|
||||||
"kafka-connect": {"requests"},
|
"kafka-connect": sql_common | {"requests"},
|
||||||
"sqlalchemy": sql_common,
|
"sqlalchemy": sql_common,
|
||||||
"athena": sql_common | {"PyAthena[SQLAlchemy]"},
|
"athena": sql_common | {"PyAthena[SQLAlchemy]"},
|
||||||
"bigquery": sql_common | {"pybigquery >= 0.6.0"},
|
"bigquery": sql_common | {"pybigquery >= 0.6.0"},
|
||||||
|
@ -16,7 +16,7 @@ This plugin extracts the following:
|
|||||||
|
|
||||||
Current limitations:
|
Current limitations:
|
||||||
|
|
||||||
- Currently works only for Debezium source connectors.
|
- Currently works for JDBC and Debezium source connectors only.
|
||||||
|
|
||||||
## Quickstart recipe
|
## Quickstart recipe
|
||||||
|
|
||||||
|
@ -5,6 +5,7 @@ from typing import Dict, Iterable, List, Optional
|
|||||||
|
|
||||||
import requests
|
import requests
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
|
from sqlalchemy.engine.url import make_url
|
||||||
|
|
||||||
import datahub.emitter.mce_builder as builder
|
import datahub.emitter.mce_builder as builder
|
||||||
import datahub.metadata.schema_classes as models
|
import datahub.metadata.schema_classes as models
|
||||||
@ -58,6 +59,18 @@ class ConnectorManifest(BaseModel):
|
|||||||
url: Optional[str]
|
url: Optional[str]
|
||||||
|
|
||||||
|
|
||||||
|
def get_jdbc_source_connector_parser(connector_manifest):
|
||||||
|
connector_class = connector_manifest.config.get("connector.class", "")
|
||||||
|
parser = {
|
||||||
|
# https://docs.confluent.io/kafka-connect-jdbc/current/source-connector/source_config_options.html
|
||||||
|
"io.confluent.connect.jdbc.JdbcSourceConnector": {
|
||||||
|
"db_connection_url": connector_manifest.config.get("connection.url"),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return parser.get(connector_class)
|
||||||
|
|
||||||
|
|
||||||
def get_debezium_source_connector_parser(connector_manifest):
|
def get_debezium_source_connector_parser(connector_manifest):
|
||||||
connector_class = connector_manifest.config.get("connector.class", "")
|
connector_class = connector_manifest.config.get("connector.class", "")
|
||||||
parser = {
|
parser = {
|
||||||
@ -113,6 +126,67 @@ def get_debezium_source_connector_parser(connector_manifest):
|
|||||||
return parser.get(connector_class)
|
return parser.get(connector_class)
|
||||||
|
|
||||||
|
|
||||||
|
def remove_prefix(text: str, prefix: str) -> str:
|
||||||
|
if text.startswith(prefix):
|
||||||
|
index = len(prefix)
|
||||||
|
return text[index:]
|
||||||
|
return text
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class JDBCSourceConnectorLineages:
|
||||||
|
connector_manifest: ConnectorManifest
|
||||||
|
|
||||||
|
def get_lineages(self):
|
||||||
|
lineages: List[KafkaConnectLineage] = list()
|
||||||
|
parser = get_jdbc_source_connector_parser(self.connector_manifest)
|
||||||
|
db_connection_url = parser.get("db_connection_url")
|
||||||
|
url = remove_prefix(str(db_connection_url), "jdbc:")
|
||||||
|
url_instance = make_url(url)
|
||||||
|
source_platform = url_instance.drivername
|
||||||
|
database_name = url_instance.database
|
||||||
|
|
||||||
|
logging.debug(
|
||||||
|
f"Extracting source platform: {source_platform} and database name: {database_name} from connection url "
|
||||||
|
)
|
||||||
|
|
||||||
|
topic_prefix = (
|
||||||
|
str(self.connector_manifest.config.get("topic.prefix"))
|
||||||
|
if self.connector_manifest.config.get("topic.prefix")
|
||||||
|
else None
|
||||||
|
)
|
||||||
|
|
||||||
|
query = (
|
||||||
|
self.connector_manifest.config.get("query")
|
||||||
|
if self.connector_manifest.config.get("query")
|
||||||
|
else None
|
||||||
|
)
|
||||||
|
|
||||||
|
if not self.connector_manifest.topic_names:
|
||||||
|
return lineages
|
||||||
|
|
||||||
|
for topic in self.connector_manifest.topic_names:
|
||||||
|
# if the connector uses a custom query
|
||||||
|
if topic_prefix and not query:
|
||||||
|
source_table = remove_prefix(topic, topic_prefix)
|
||||||
|
else:
|
||||||
|
source_table = topic
|
||||||
|
|
||||||
|
dataset_name = (
|
||||||
|
database_name + "." + source_table if database_name else source_table
|
||||||
|
)
|
||||||
|
|
||||||
|
lineage = KafkaConnectLineage(
|
||||||
|
source_dataset=dataset_name,
|
||||||
|
source_platform=source_platform,
|
||||||
|
target_dataset=topic,
|
||||||
|
target_platform="kafka",
|
||||||
|
)
|
||||||
|
lineages.append(lineage)
|
||||||
|
|
||||||
|
return lineages
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class DebeziumSourceConnectorLineages:
|
class DebeziumSourceConnectorLineages:
|
||||||
connector_manifest: ConnectorManifest
|
connector_manifest: ConnectorManifest
|
||||||
@ -203,14 +277,12 @@ class KafkaConnectSource(Source):
|
|||||||
|
|
||||||
manifest = connector_response.json()
|
manifest = connector_response.json()
|
||||||
connector_manifest = ConnectorManifest(**manifest)
|
connector_manifest = ConnectorManifest(**manifest)
|
||||||
|
|
||||||
# Initialize connector lineages
|
# Initialize connector lineages
|
||||||
connector_manifest.lineages = list()
|
connector_manifest.lineages = list()
|
||||||
connector_manifest.url = connector_url
|
connector_manifest.url = connector_url
|
||||||
|
|
||||||
# Populate Source Connector metadata
|
# Populate Source Connector metadata
|
||||||
if connector_manifest.type == "source":
|
if connector_manifest.type == "source":
|
||||||
# connector_config = manifest.get("config", {})
|
|
||||||
topics_response = self.session.get(
|
topics_response = self.session.get(
|
||||||
f"{self.config.connect_uri}/connectors/{c}/topics",
|
f"{self.config.connect_uri}/connectors/{c}/topics",
|
||||||
)
|
)
|
||||||
@ -218,7 +290,18 @@ class KafkaConnectSource(Source):
|
|||||||
topics = topics_response.json()
|
topics = topics_response.json()
|
||||||
connector_manifest.topic_names = topics[c]["topics"]
|
connector_manifest.topic_names = topics[c]["topics"]
|
||||||
|
|
||||||
# Currently we only support Debezium Source Connector lineages
|
# JDBC source connector lineages
|
||||||
|
if connector_manifest.config.get("connector.class").__eq__(
|
||||||
|
"io.confluent.connect.jdbc.JdbcSourceConnector"
|
||||||
|
):
|
||||||
|
jdbc_source_lineages = JDBCSourceConnectorLineages(
|
||||||
|
connector_manifest=connector_manifest
|
||||||
|
)
|
||||||
|
connector_manifest.lineages.extend(
|
||||||
|
jdbc_source_lineages.get_lineages()
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
# Debezium Source Connector lineages
|
||||||
debezium_source_lineages = DebeziumSourceConnectorLineages(
|
debezium_source_lineages = DebeziumSourceConnectorLineages(
|
||||||
connector_manifest=connector_manifest
|
connector_manifest=connector_manifest
|
||||||
)
|
)
|
||||||
@ -273,6 +356,7 @@ class KafkaConnectSource(Source):
|
|||||||
def construct_job_workunits(
|
def construct_job_workunits(
|
||||||
self, connector: ConnectorManifest
|
self, connector: ConnectorManifest
|
||||||
) -> Iterable[MetadataWorkUnit]:
|
) -> Iterable[MetadataWorkUnit]:
|
||||||
|
|
||||||
connector_name = connector.name
|
connector_name = connector.name
|
||||||
flow_urn = builder.make_data_flow_urn(
|
flow_urn = builder.make_data_flow_urn(
|
||||||
"kafka-connect", connector_name, self.config.env
|
"kafka-connect", connector_name, self.config.env
|
||||||
@ -357,9 +441,7 @@ class KafkaConnectSource(Source):
|
|||||||
yield wu
|
yield wu
|
||||||
|
|
||||||
def get_workunits(self) -> Iterable[MetadataWorkUnit]:
|
def get_workunits(self) -> Iterable[MetadataWorkUnit]:
|
||||||
|
|
||||||
connectors_manifest = self.get_connectors_manifest()
|
connectors_manifest = self.get_connectors_manifest()
|
||||||
|
|
||||||
for connector in connectors_manifest:
|
for connector in connectors_manifest:
|
||||||
name = connector.name
|
name = connector.name
|
||||||
if self.config.connector_patterns.allowed(name):
|
if self.config.connector_patterns.allowed(name):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user