Mirror of https://github.com/datahub-project/datahub.git (synced 2025-07-26 19:10:15 +00:00)

Allow/deny patterns for kafka source

commit d483d23fd7 (parent df3e3da45b)
Ingestion recipe (YAML):

@@ -3,6 +3,11 @@ source:
   type: "kafka"
   kafka:
     connection.bootstrap: "localhost:9092"
+    topic_patterns:
+      allow:
+        - ".*"
+      deny:
+        - "^_.+" # deny all topics that start with an underscore
 
 sink:
   type: "datahub-kafka"
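The same source block can also be handed to the source programmatically as a plain dict, which is how the unit tests at the end of this diff exercise it. A minimal sketch, with field names taken from the diff:

    # Dict form of the recipe's source block, as accepted by KafkaSource.create
    # in the tests below; "localhost:9092" is just the recipe's example value.
    recipe_source_config = {
        "connection": {"bootstrap": "localhost:9092"},
        "topic_patterns": {"allow": [".*"], "deny": ["^_.+"]},
    }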
Kafka source (module gometa.ingestion.source.kafka):

@@ -2,11 +2,10 @@ import logging
 from gometa.configuration import ConfigModel
 from gometa.configuration.kafka import KafkaConsumerConnectionConfig
 from gometa.ingestion.api.source import Source, SourceReport
-from typing import Iterable, List, Dict, Any
+from typing import Iterable, List, Dict
 from dataclasses import dataclass, field
 import confluent_kafka
 from confluent_kafka.schema_registry.schema_registry_client import SchemaRegistryClient
-import re
 from gometa.ingestion.source.metadata_common import MetadataWorkUnit
 
 import time
@@ -20,6 +19,7 @@ from gometa.metadata.com.linkedin.pegasus2avro.schema import (
     KafkaSchema,
     SchemaField,
 )
+from gometa.configuration.common import AllowDenyPattern
 from gometa.metadata.com.linkedin.pegasus2avro.common import AuditStamp, Status
 
 logger = logging.getLogger(__name__)
@@ -27,7 +27,7 @@ logger = logging.getLogger(__name__)
 
 class KafkaSourceConfig(ConfigModel):
     connection: KafkaConsumerConnectionConfig = KafkaConsumerConnectionConfig()
-    topic: str = ".*"  # default is wildcard subscription
+    topic_patterns: AllowDenyPattern = AllowDenyPattern(allow=[".*"], deny=["^_.*"])
 
 
 @dataclass
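AllowDenyPattern itself is not part of this diff; only its constructor (allow/deny lists of regexes) and, further down, its allowed() method are visible. A minimal sketch of what such a class might look like, assuming a pydantic-style model like the rest of gometa.configuration and deny-over-allow precedence:

    import re
    from typing import List

    from pydantic import BaseModel


    class AllowDenyPattern(BaseModel):
        # Sketch only: the real class lives in gometa.configuration.common;
        # the matching semantics below are an assumption, not the diff's code.
        allow: List[str] = [".*"]
        deny: List[str] = []

        def allowed(self, value: str) -> bool:
            # Deny patterns win; otherwise the value must match at least one
            # allow pattern. re.match anchors at the start of the string.
            if any(re.match(p, value) for p in self.deny):
                return False
            return any(re.match(p, value) for p in self.allow)

With the new KafkaSourceConfig default, everything is allowed except topics matching "^_.*", so recipes that never mention topic_patterns skip Kafka-internal topics like "_schemas" out of the box.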
@@ -57,18 +57,16 @@ class KafkaSourceReport(SourceReport):
 @dataclass
 class KafkaSource(Source):
     source_config: KafkaSourceConfig
-    topic_pattern: Any  # actually re.Pattern
     consumer: confluent_kafka.Consumer
     report: KafkaSourceReport
 
     def __init__(self, config: KafkaSourceConfig, ctx: PipelineContext):
         super().__init__(ctx)
         self.source_config = config
-        self.topic_pattern = re.compile(self.source_config.topic)
         self.consumer = confluent_kafka.Consumer(
             {
-                'group.id': 'test',
-                'bootstrap.servers': self.source_config.connection.bootstrap,
+                "group.id": "test",
+                "bootstrap.servers": self.source_config.connection.bootstrap,
                 **self.source_config.connection.consumer_config,
             }
         )
@@ -87,10 +85,9 @@ class KafkaSource(Source):
         for t in topics:
             self.report.report_topic_scanned(t)
 
-            # TODO: topics config should support allow and deny patterns
-            if re.fullmatch(self.topic_pattern, t) and not t.startswith("_"):
+            if self.source_config.topic_patterns.allowed(t):
                 mce = self._extract_record(t)
-                wu = MetadataWorkUnit(id=f'kafka-{t}', mce=mce)
+                wu = MetadataWorkUnit(id=f"kafka-{t}", mce=mce)
                 self.report.report_workunit(wu)
                 yield wu
             else:
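The new condition folds the old regex check and the hard-coded underscore filter into a single allowed() call. Assuming the semantics sketched above, the old and new defaults accept the same topics; a quick sanity check:

    import re

    topic_regex = ".*"  # the old KafkaSourceConfig.topic default


    def old_check(t: str) -> bool:
        # Pre-commit behavior: one regex plus a hard-coded underscore filter.
        return bool(re.fullmatch(topic_regex, t)) and not t.startswith("_")


    # The new default, AllowDenyPattern(allow=[".*"], deny=["^_.*"]), is meant
    # to accept the same set of names: "payments" passes, "_schemas" does not.
    assert old_check("payments")
    assert not old_check("_schemas")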
@@ -123,7 +120,7 @@ class KafkaSource(Source):
 
         # Parse the schema
         fields: List[SchemaField] = []
-        if has_schema and schema.schema_type == 'AVRO':
+        if has_schema and schema.schema_type == "AVRO":
             fields = schema_util.avro_schema_to_mce_fields(schema.schema_str)
         elif has_schema:
             self.report.report_warning(
Kafka source tests:

@@ -10,8 +10,11 @@ from unittest.mock import patch, MagicMock
 class KafkaSourceTest(unittest.TestCase):
     @patch("gometa.ingestion.source.kafka.confluent_kafka.Consumer")
     def test_kafka_source_configuration(self, mock_kafka):
-        ctx = PipelineContext(run_id='test')
-        kafka_source = KafkaSource.create({'connection': {'bootstrap': 'foobar:9092'}}, ctx)
+        ctx = PipelineContext(run_id="test")
+        kafka_source = KafkaSource.create(
+            {"connection": {"bootstrap": "foobar:9092"}}, ctx
+        )
+        kafka_source.close()
         assert mock_kafka.call_count == 1
 
     @patch("gometa.ingestion.source.kafka.confluent_kafka.Consumer")
@@ -21,13 +24,15 @@ class KafkaSourceTest(unittest.TestCase):
         mock_cluster_metadata.topics = ["foobar", "bazbaz"]
         mock_kafka_instance.list_topics.return_value = mock_cluster_metadata
 
-        ctx = PipelineContext(run_id='test')
-        kafka_source = KafkaSource.create({'connection': {'bootstrap': 'localhost:9092'}}, ctx)
+        ctx = PipelineContext(run_id="test")
+        kafka_source = KafkaSource.create(
+            {"connection": {"bootstrap": "localhost:9092"}}, ctx
+        )
         workunits = []
         for w in kafka_source.get_workunits():
             workunits.append(w)
 
-        first_mce = workunits[0].get_metadata()['mce']
+        first_mce = workunits[0].get_metadata()["mce"]
         assert isinstance(first_mce, MetadataChangeEvent)
         mock_kafka.assert_called_once()
         mock_kafka_instance.list_topics.assert_called_once()
@@ -40,9 +45,14 @@ class KafkaSourceTest(unittest.TestCase):
         mock_cluster_metadata.topics = ["test", "foobar", "bazbaz"]
         mock_kafka_instance.list_topics.return_value = mock_cluster_metadata
 
-        ctx = PipelineContext(run_id='test1')
-        kafka_source = KafkaSource.create({'topic': 'test', 'connection': {'bootstrap': 'localhost:9092'}}, ctx)
-        assert kafka_source.source_config.topic == "test"
+        ctx = PipelineContext(run_id="test1")
+        kafka_source = KafkaSource.create(
+            {
+                "topic_patterns": {"allow": ["test"]},
+                "connection": {"bootstrap": "localhost:9092"},
+            },
+            ctx,
+        )
         workunits = [w for w in kafka_source.get_workunits()]
 
         mock_kafka.assert_called_once()
@@ -50,15 +60,23 @@ class KafkaSourceTest(unittest.TestCase):
         assert len(workunits) == 1
 
         mock_cluster_metadata.topics = ["test", "test2", "bazbaz"]
-        ctx = PipelineContext(run_id='test2')
-        kafka_source = KafkaSource.create({'topic': 'test.*', 'connection': {'bootstrap': 'localhost:9092'}}, ctx)
+        ctx = PipelineContext(run_id="test2")
+        kafka_source = KafkaSource.create(
+            {
+                "topic_patterns": {"allow": ["test.*"]},
+                "connection": {"bootstrap": "localhost:9092"},
+            },
+            ctx,
+        )
         workunits = [w for w in kafka_source.get_workunits()]
         assert len(workunits) == 2
 
     @patch("gometa.ingestion.source.kafka.confluent_kafka.Consumer")
     def test_close(self, mock_kafka):
         mock_kafka_instance = mock_kafka.return_value
-        ctx = PipelineContext(run_id='test')
-        kafka_source = KafkaSource.create({'topic': 'test', 'connection': {'bootstrap': 'localhost:9092'}}, ctx)
+        ctx = PipelineContext(run_id="test")
+        kafka_source = KafkaSource.create(
+            {"topic": "test", "connection": {"bootstrap": "localhost:9092"}}, ctx
+        )
         kafka_source.close()
         assert mock_kafka_instance.close.call_count == 1
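None of these tests exercises the default deny pattern directly. A sketch of a companion test one might add inside KafkaSourceTest, reusing the mocking setup above (the test name and topic values are hypothetical):

    @patch("gometa.ingestion.source.kafka.confluent_kafka.Consumer")
    def test_kafka_source_deny_pattern(self, mock_kafka):
        mock_kafka_instance = mock_kafka.return_value
        mock_cluster_metadata = MagicMock()
        mock_cluster_metadata.topics = ["_schemas", "payments"]
        mock_kafka_instance.list_topics.return_value = mock_cluster_metadata

        ctx = PipelineContext(run_id="test3")
        kafka_source = KafkaSource.create(
            {"connection": {"bootstrap": "localhost:9092"}}, ctx
        )
        workunits = [w for w in kafka_source.get_workunits()]
        assert len(workunits) == 1  # only "payments" survives the "^_.*" deny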