Allow/deny patterns for kafka source

Harshal Sheth 2021-02-11 22:48:20 -08:00 committed by Shirshanka Das
parent df3e3da45b
commit d483d23fd7
3 changed files with 43 additions and 23 deletions

View File

@@ -3,6 +3,11 @@ source:
   type: "kafka"
   kafka:
     connection.bootstrap: "localhost:9092"
+    topic_patterns:
+      allow:
+        - ".*"
+      deny:
+        - "^_.+"  # deny all topics that start with an underscore
 sink:
   type: "datahub-kafka"

View File

@@ -2,11 +2,10 @@ import logging
 from gometa.configuration import ConfigModel
 from gometa.configuration.kafka import KafkaConsumerConnectionConfig
 from gometa.ingestion.api.source import Source, SourceReport
-from typing import Iterable, List, Dict, Any
+from typing import Iterable, List, Dict
 from dataclasses import dataclass, field
 import confluent_kafka
 from confluent_kafka.schema_registry.schema_registry_client import SchemaRegistryClient
-import re
 from gometa.ingestion.source.metadata_common import MetadataWorkUnit
 import time
@@ -20,6 +19,7 @@ from gometa.metadata.com.linkedin.pegasus2avro.schema import (
     KafkaSchema,
     SchemaField,
 )
+from gometa.configuration.common import AllowDenyPattern
 from gometa.metadata.com.linkedin.pegasus2avro.common import AuditStamp, Status

 logger = logging.getLogger(__name__)
@@ -27,7 +27,7 @@ logger = logging.getLogger(__name__)
 class KafkaSourceConfig(ConfigModel):
     connection: KafkaConsumerConnectionConfig = KafkaConsumerConnectionConfig()
-    topic: str = ".*"  # default is wildcard subscription
+    topic_patterns: AllowDenyPattern = AllowDenyPattern(allow=[".*"], deny=["^_.*"])

 @dataclass
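The diff does not show `AllowDenyPattern` itself. A minimal sketch of what the class in `gometa.configuration.common` could look like, inferred from its use here (the constructor keywords, the nested-dict form in the tests below, and the `allowed()` call in `get_workunits`); the real implementation may differ:

```python
import re
from typing import List

from pydantic import BaseModel  # assuming ConfigModel is pydantic-based


class AllowDenyPattern(BaseModel):
    """Regex-based filter: a string must match an allow pattern
    and no deny pattern to be accepted."""

    allow: List[str] = [".*"]
    deny: List[str] = []

    def allowed(self, string: str) -> bool:
        # deny patterns take precedence over allow patterns
        if any(re.match(pattern, string) for pattern in self.deny):
            return False
        return any(re.match(pattern, string) for pattern in self.allow)
```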
@@ -57,18 +57,16 @@ class KafkaSourceReport(SourceReport):
 @dataclass
 class KafkaSource(Source):
     source_config: KafkaSourceConfig
-    topic_pattern: Any  # actually re.Pattern
     consumer: confluent_kafka.Consumer
     report: KafkaSourceReport

     def __init__(self, config: KafkaSourceConfig, ctx: PipelineContext):
         super().__init__(ctx)
         self.source_config = config
-        self.topic_pattern = re.compile(self.source_config.topic)
         self.consumer = confluent_kafka.Consumer(
             {
-                'group.id': 'test',
-                'bootstrap.servers': self.source_config.connection.bootstrap,
+                "group.id": "test",
+                "bootstrap.servers": self.source_config.connection.bootstrap,
                 **self.source_config.connection.consumer_config,
             }
         )
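Because `connection.consumer_config` is unpacked last into the `Consumer` settings dict, recipe-supplied keys are passed straight through to librdkafka and can override the defaults. A small sketch of that merge (the `security.protocol` entry is an illustrative extra, not something this diff configures):

```python
# Defaults set in __init__ above, then recipe extras layered on top:
defaults = {"group.id": "test", "bootstrap.servers": "localhost:9092"}
consumer_config = {"security.protocol": "SSL"}  # hypothetical recipe value

merged = {**defaults, **consumer_config}
assert merged["bootstrap.servers"] == "localhost:9092"
assert merged["security.protocol"] == "SSL"
```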
@@ -87,10 +85,9 @@ class KafkaSource(Source):
         for t in topics:
             self.report.report_topic_scanned(t)

-            # TODO: topics config should support allow and deny patterns
-            if re.fullmatch(self.topic_pattern, t) and not t.startswith("_"):
+            if self.source_config.topic_patterns.allowed(t):
                 mce = self._extract_record(t)
-                wu = MetadataWorkUnit(id=f'kafka-{t}', mce=mce)
+                wu = MetadataWorkUnit(id=f"kafka-{t}", mce=mce)
                 self.report.report_workunit(wu)
                 yield wu
             else:
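With the defaults `allow=[".*"], deny=["^_.*"]`, the new `allowed()` check admits the same topics as the old hardcoded filter it replaces. A quick equivalence sketch, assuming `re.match`-based pattern evaluation:

```python
import re

def old_filter(topic: str) -> bool:
    # old behavior: full-match on the configured regex, plus a
    # hardcoded exclusion of underscore-prefixed topics
    return bool(re.fullmatch(".*", topic)) and not topic.startswith("_")

def new_filter(topic: str) -> bool:
    # default AllowDenyPattern: deny "^_.*", allow ".*"
    return not re.match("^_.*", topic) and bool(re.match(".*", topic))

for topic in ["foobar", "bazbaz", "_schemas", "__consumer_offsets"]:
    assert old_filter(topic) == new_filter(topic)
```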
@@ -123,7 +120,7 @@
         # Parse the schema
         fields: List[SchemaField] = []
-        if has_schema and schema.schema_type == 'AVRO':
+        if has_schema and schema.schema_type == "AVRO":
             fields = schema_util.avro_schema_to_mce_fields(schema.schema_str)
         elif has_schema:
             self.report.report_warning(
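For the AVRO branch, `schema_util.avro_schema_to_mce_fields` turns the registry's schema string into `SchemaField`s. A sketch of that call (the module path and the schema are illustrative; the import is not shown in this diff):

```python
from gometa.ingestion.extractor import schema_util  # assumed import path

avro_schema = """
{
  "type": "record",
  "name": "Purchase",
  "fields": [
    {"name": "id", "type": "string"},
    {"name": "amount", "type": "double"}
  ]
}
"""
fields = schema_util.avro_schema_to_mce_fields(avro_schema)  # -> List[SchemaField]
```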

View File

@@ -10,8 +10,11 @@ from unittest.mock import patch, MagicMock
 class KafkaSourceTest(unittest.TestCase):
     @patch("gometa.ingestion.source.kafka.confluent_kafka.Consumer")
     def test_kafka_source_configuration(self, mock_kafka):
-        ctx = PipelineContext(run_id='test')
-        kafka_source = KafkaSource.create({'connection': {'bootstrap': 'foobar:9092'}}, ctx)
+        ctx = PipelineContext(run_id="test")
+        kafka_source = KafkaSource.create(
+            {"connection": {"bootstrap": "foobar:9092"}}, ctx
+        )
+        kafka_source.close()
         assert mock_kafka.call_count == 1

     @patch("gometa.ingestion.source.kafka.confluent_kafka.Consumer")
@@ -21,13 +24,15 @@ class KafkaSourceTest(unittest.TestCase):
         mock_cluster_metadata.topics = ["foobar", "bazbaz"]
         mock_kafka_instance.list_topics.return_value = mock_cluster_metadata

-        ctx = PipelineContext(run_id='test')
-        kafka_source = KafkaSource.create({'connection': {'bootstrap': 'localhost:9092'}}, ctx)
+        ctx = PipelineContext(run_id="test")
+        kafka_source = KafkaSource.create(
+            {"connection": {"bootstrap": "localhost:9092"}}, ctx
+        )
         workunits = []
         for w in kafka_source.get_workunits():
             workunits.append(w)

-        first_mce = workunits[0].get_metadata()['mce']
+        first_mce = workunits[0].get_metadata()["mce"]
         assert isinstance(first_mce, MetadataChangeEvent)
         mock_kafka.assert_called_once()
         mock_kafka_instance.list_topics.assert_called_once()
@@ -40,9 +45,14 @@ class KafkaSourceTest(unittest.TestCase):
         mock_cluster_metadata.topics = ["test", "foobar", "bazbaz"]
         mock_kafka_instance.list_topics.return_value = mock_cluster_metadata

-        ctx = PipelineContext(run_id='test1')
-        kafka_source = KafkaSource.create({'topic': 'test', 'connection': {'bootstrap': 'localhost:9092'}}, ctx)
-        assert kafka_source.source_config.topic == "test"
+        ctx = PipelineContext(run_id="test1")
+        kafka_source = KafkaSource.create(
+            {
+                "topic_patterns": {"allow": ["test"]},
+                "connection": {"bootstrap": "localhost:9092"},
+            },
+            ctx,
+        )
         workunits = [w for w in kafka_source.get_workunits()]
         mock_kafka.assert_called_once()
@@ -50,15 +60,23 @@ class KafkaSourceTest(unittest.TestCase):
         assert len(workunits) == 1

         mock_cluster_metadata.topics = ["test", "test2", "bazbaz"]
-        ctx = PipelineContext(run_id='test2')
-        kafka_source = KafkaSource.create({'topic': 'test.*', 'connection': {'bootstrap': 'localhost:9092'}}, ctx)
+        ctx = PipelineContext(run_id="test2")
+        kafka_source = KafkaSource.create(
+            {
+                "topic_patterns": {"allow": ["test.*"]},
+                "connection": {"bootstrap": "localhost:9092"},
+            },
+            ctx,
+        )
         workunits = [w for w in kafka_source.get_workunits()]
         assert len(workunits) == 2

     @patch("gometa.ingestion.source.kafka.confluent_kafka.Consumer")
     def test_close(self, mock_kafka):
         mock_kafka_instance = mock_kafka.return_value
-        ctx = PipelineContext(run_id='test')
-        kafka_source = KafkaSource.create({'topic': 'test', 'connection': {'bootstrap': 'localhost:9092'}}, ctx)
+        ctx = PipelineContext(run_id="test")
+        kafka_source = KafkaSource.create(
+            {"topic": "test", "connection": {"bootstrap": "localhost:9092"}}, ctx
+        )
         kafka_source.close()
         assert mock_kafka_instance.close.call_count == 1
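All of these tests follow the same pattern: patch `confluent_kafka.Consumer` at the name the kafka source module imports it under, so no broker is needed. A condensed, self-contained version of that setup (the `PipelineContext` import path is assumed from the usage above):

```python
from unittest.mock import MagicMock, patch

from gometa.ingestion.api.common import PipelineContext  # assumed import path
from gometa.ingestion.source.kafka import KafkaSource

with patch("gometa.ingestion.source.kafka.confluent_kafka.Consumer") as mock_kafka:
    mock_cluster_metadata = MagicMock()
    mock_cluster_metadata.topics = ["purchases", "_schemas"]
    mock_kafka.return_value.list_topics.return_value = mock_cluster_metadata

    ctx = PipelineContext(run_id="demo")
    source = KafkaSource.create({"connection": {"bootstrap": "localhost:9092"}}, ctx)
    workunits = list(source.get_workunits())
    # with the default deny pattern "^_.*", only "purchases" survives
    assert len(workunits) == 1
```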