mirror of
				https://github.com/datahub-project/datahub.git
				synced 2025-10-31 02:37:05 +00:00 
			
		
		
		
	Allow/deny patterns for kafka source
This commit is contained in:
		
							parent
							
								
									df3e3da45b
								
							
						
					
					
						commit
						d483d23fd7
					
				| @ -3,6 +3,11 @@ source: | ||||
|   type: "kafka" | ||||
|   kafka: | ||||
|     connection.bootstrap: "localhost:9092" | ||||
|   topic_patterns: | ||||
|     allow: | ||||
|       - ".*" | ||||
|     deny: | ||||
|       - "^_.+" # deny all topics that start with an underscore | ||||
| 
 | ||||
| sink: | ||||
|   type: "datahub-kafka" | ||||
|  | ||||
| @ -2,11 +2,10 @@ import logging | ||||
| from gometa.configuration import ConfigModel | ||||
| from gometa.configuration.kafka import KafkaConsumerConnectionConfig | ||||
| from gometa.ingestion.api.source import Source, SourceReport | ||||
| from typing import Iterable, List, Dict, Any | ||||
| from typing import Iterable, List, Dict | ||||
| from dataclasses import dataclass, field | ||||
| import confluent_kafka | ||||
| from confluent_kafka.schema_registry.schema_registry_client import SchemaRegistryClient | ||||
| import re | ||||
| from gometa.ingestion.source.metadata_common import MetadataWorkUnit | ||||
| 
 | ||||
| import time | ||||
| @ -20,6 +19,7 @@ from gometa.metadata.com.linkedin.pegasus2avro.schema import ( | ||||
|     KafkaSchema, | ||||
|     SchemaField, | ||||
| ) | ||||
| from gometa.configuration.common import AllowDenyPattern | ||||
| from gometa.metadata.com.linkedin.pegasus2avro.common import AuditStamp, Status | ||||
| 
 | ||||
| logger = logging.getLogger(__name__) | ||||
| @ -27,7 +27,7 @@ logger = logging.getLogger(__name__) | ||||
| 
 | ||||
| class KafkaSourceConfig(ConfigModel): | ||||
|     connection: KafkaConsumerConnectionConfig = KafkaConsumerConnectionConfig() | ||||
|     topic: str = ".*"  # default is wildcard subscription | ||||
|     topic_patterns: AllowDenyPattern = AllowDenyPattern(allow=[".*"], deny=["^_.*"]) | ||||
| 
 | ||||
| 
 | ||||
| @dataclass | ||||
| @ -57,18 +57,16 @@ class KafkaSourceReport(SourceReport): | ||||
| @dataclass | ||||
| class KafkaSource(Source): | ||||
|     source_config: KafkaSourceConfig | ||||
|     topic_pattern: Any  # actually re.Pattern | ||||
|     consumer: confluent_kafka.Consumer | ||||
|     report: KafkaSourceReport | ||||
| 
 | ||||
|     def __init__(self, config: KafkaSourceConfig, ctx: PipelineContext): | ||||
|         super().__init__(ctx) | ||||
|         self.source_config = config | ||||
|         self.topic_pattern = re.compile(self.source_config.topic) | ||||
|         self.consumer = confluent_kafka.Consumer( | ||||
|             { | ||||
|                 'group.id': 'test', | ||||
|                 'bootstrap.servers': self.source_config.connection.bootstrap, | ||||
|                 "group.id": "test", | ||||
|                 "bootstrap.servers": self.source_config.connection.bootstrap, | ||||
|                 **self.source_config.connection.consumer_config, | ||||
|             } | ||||
|         ) | ||||
| @ -87,10 +85,9 @@ class KafkaSource(Source): | ||||
|         for t in topics: | ||||
|             self.report.report_topic_scanned(t) | ||||
| 
 | ||||
|             # TODO: topics config should support allow and deny patterns | ||||
|             if re.fullmatch(self.topic_pattern, t) and not t.startswith("_"): | ||||
|             if self.source_config.topic_patterns.allowed(t): | ||||
|                 mce = self._extract_record(t) | ||||
|                 wu = MetadataWorkUnit(id=f'kafka-{t}', mce=mce) | ||||
|                 wu = MetadataWorkUnit(id=f"kafka-{t}", mce=mce) | ||||
|                 self.report.report_workunit(wu) | ||||
|                 yield wu | ||||
|             else: | ||||
| @ -123,7 +120,7 @@ class KafkaSource(Source): | ||||
| 
 | ||||
|         # Parse the schema | ||||
|         fields: List[SchemaField] = [] | ||||
|         if has_schema and schema.schema_type == 'AVRO': | ||||
|         if has_schema and schema.schema_type == "AVRO": | ||||
|             fields = schema_util.avro_schema_to_mce_fields(schema.schema_str) | ||||
|         elif has_schema: | ||||
|             self.report.report_warning( | ||||
|  | ||||
| @ -10,8 +10,11 @@ from unittest.mock import patch, MagicMock | ||||
| class KafkaSourceTest(unittest.TestCase): | ||||
|     @patch("gometa.ingestion.source.kafka.confluent_kafka.Consumer") | ||||
|     def test_kafka_source_configuration(self, mock_kafka): | ||||
|         ctx = PipelineContext(run_id='test') | ||||
|         kafka_source = KafkaSource.create({'connection': {'bootstrap': 'foobar:9092'}}, ctx) | ||||
|         ctx = PipelineContext(run_id="test") | ||||
|         kafka_source = KafkaSource.create( | ||||
|             {"connection": {"bootstrap": "foobar:9092"}}, ctx | ||||
|         ) | ||||
|         kafka_source.close() | ||||
|         assert mock_kafka.call_count == 1 | ||||
| 
 | ||||
|     @patch("gometa.ingestion.source.kafka.confluent_kafka.Consumer") | ||||
| @ -21,13 +24,15 @@ class KafkaSourceTest(unittest.TestCase): | ||||
|         mock_cluster_metadata.topics = ["foobar", "bazbaz"] | ||||
|         mock_kafka_instance.list_topics.return_value = mock_cluster_metadata | ||||
| 
 | ||||
|         ctx = PipelineContext(run_id='test') | ||||
|         kafka_source = KafkaSource.create({'connection': {'bootstrap': 'localhost:9092'}}, ctx) | ||||
|         ctx = PipelineContext(run_id="test") | ||||
|         kafka_source = KafkaSource.create( | ||||
|             {"connection": {"bootstrap": "localhost:9092"}}, ctx | ||||
|         ) | ||||
|         workunits = [] | ||||
|         for w in kafka_source.get_workunits(): | ||||
|             workunits.append(w) | ||||
| 
 | ||||
|         first_mce = workunits[0].get_metadata()['mce'] | ||||
|         first_mce = workunits[0].get_metadata()["mce"] | ||||
|         assert isinstance(first_mce, MetadataChangeEvent) | ||||
|         mock_kafka.assert_called_once() | ||||
|         mock_kafka_instance.list_topics.assert_called_once() | ||||
| @ -40,9 +45,14 @@ class KafkaSourceTest(unittest.TestCase): | ||||
|         mock_cluster_metadata.topics = ["test", "foobar", "bazbaz"] | ||||
|         mock_kafka_instance.list_topics.return_value = mock_cluster_metadata | ||||
| 
 | ||||
|         ctx = PipelineContext(run_id='test1') | ||||
|         kafka_source = KafkaSource.create({'topic': 'test', 'connection': {'bootstrap': 'localhost:9092'}}, ctx) | ||||
|         assert kafka_source.source_config.topic == "test" | ||||
|         ctx = PipelineContext(run_id="test1") | ||||
|         kafka_source = KafkaSource.create( | ||||
|             { | ||||
|                 "topic_patterns": {"allow": ["test"]}, | ||||
|                 "connection": {"bootstrap": "localhost:9092"}, | ||||
|             }, | ||||
|             ctx, | ||||
|         ) | ||||
|         workunits = [w for w in kafka_source.get_workunits()] | ||||
| 
 | ||||
|         mock_kafka.assert_called_once() | ||||
| @ -50,15 +60,23 @@ class KafkaSourceTest(unittest.TestCase): | ||||
|         assert len(workunits) == 1 | ||||
| 
 | ||||
|         mock_cluster_metadata.topics = ["test", "test2", "bazbaz"] | ||||
|         ctx = PipelineContext(run_id='test2') | ||||
|         kafka_source = KafkaSource.create({'topic': 'test.*', 'connection': {'bootstrap': 'localhost:9092'}}, ctx) | ||||
|         ctx = PipelineContext(run_id="test2") | ||||
|         kafka_source = KafkaSource.create( | ||||
|             { | ||||
|                 "topic_patterns": {"allow": ["test.*"]}, | ||||
|                 "connection": {"bootstrap": "localhost:9092"}, | ||||
|             }, | ||||
|             ctx, | ||||
|         ) | ||||
|         workunits = [w for w in kafka_source.get_workunits()] | ||||
|         assert len(workunits) == 2 | ||||
| 
 | ||||
|     @patch("gometa.ingestion.source.kafka.confluent_kafka.Consumer") | ||||
|     def test_close(self, mock_kafka): | ||||
|         mock_kafka_instance = mock_kafka.return_value | ||||
|         ctx = PipelineContext(run_id='test') | ||||
|         kafka_source = KafkaSource.create({'topic': 'test', 'connection': {'bootstrap': 'localhost:9092'}}, ctx) | ||||
|         ctx = PipelineContext(run_id="test") | ||||
|         kafka_source = KafkaSource.create( | ||||
|             {"topic": "test", "connection": {"bootstrap": "localhost:9092"}}, ctx | ||||
|         ) | ||||
|         kafka_source.close() | ||||
|         assert mock_kafka_instance.close.call_count == 1 | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user
	 Harshal Sheth
						Harshal Sheth