Mirror of https://github.com/datahub-project/datahub.git (synced 2025-10-31 10:49:00 +00:00)
	Allow/deny patterns for kafka source
commit d483d23fd7 (parent df3e3da45b)
Author: Harshal Sheth
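This commit replaces the Kafka source's single "topic" regex with an AllowDenyPattern config block, so a recipe can combine allow and deny regexes instead of folding exclusions into one pattern. The hunks below touch, in order: an example ingestion recipe, the Kafka source module, and its unit tests. The implementation of AllowDenyPattern (in gometa.configuration.common) is not part of this diff; the sketch below reconstructs plausible semantics from how it is used here. The allow/deny fields, the defaults, and the allowed() method come from the diff; the full-match behavior, the deny-wins precedence, and the pydantic base class are assumptions, and the topic names are hypothetical.

import re
from typing import List

from pydantic import BaseModel  # assumption: ConfigModel is pydantic-based


class AllowDenyPattern(BaseModel):
    """Minimal sketch of gometa.configuration.common.AllowDenyPattern."""

    allow: List[str] = [".*"]
    deny: List[str] = []

    def allowed(self, string: str) -> bool:
        # Assumed precedence: a deny match rejects the string even if an
        # allow pattern also matches it.
        if any(re.fullmatch(pattern, string) for pattern in self.deny):
            return False
        return any(re.fullmatch(pattern, string) for pattern in self.allow)


# The new KafkaSourceConfig default: every topic except those starting
# with an underscore (the convention for Kafka-internal topics).
default = AllowDenyPattern(allow=[".*"], deny=["^_.*"])
assert default.allowed("pageviews")     # hypothetical topic name
assert not default.allowed("_schemas")  # underscore prefix -> denied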
@@ -3,6 +3,11 @@ source:
   type: "kafka"
   kafka:
     connection.bootstrap: "localhost:9092"
+  topic_patterns:
+    allow:
+      - ".*"
+    deny:
+      - "^_.+" # deny all topics that start with an underscore
 
 sink:
   type: "datahub-kafka"
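The recipe above therefore ingests every topic except underscore-prefixed ones, the convention Kafka uses for internal topics. Checked against the sketch above, with illustrative topic names:

topics = ["pageviews", "orders", "_schemas", "__consumer_offsets"]
pattern = AllowDenyPattern(allow=[".*"], deny=["^_.+"])
print([t for t in topics if pattern.allowed(t)])  # ['pageviews', 'orders']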
@@ -2,11 +2,10 @@ import logging
 from gometa.configuration import ConfigModel
 from gometa.configuration.kafka import KafkaConsumerConnectionConfig
 from gometa.ingestion.api.source import Source, SourceReport
-from typing import Iterable, List, Dict, Any
+from typing import Iterable, List, Dict
 from dataclasses import dataclass, field
 import confluent_kafka
 from confluent_kafka.schema_registry.schema_registry_client import SchemaRegistryClient
-import re
 from gometa.ingestion.source.metadata_common import MetadataWorkUnit
 
 import time
@@ -20,6 +19,7 @@ from gometa.metadata.com.linkedin.pegasus2avro.schema import (
     KafkaSchema,
     SchemaField,
 )
+from gometa.configuration.common import AllowDenyPattern
 from gometa.metadata.com.linkedin.pegasus2avro.common import AuditStamp, Status
 
 logger = logging.getLogger(__name__)
@@ -27,7 +27,7 @@ logger = logging.getLogger(__name__)
 
 class KafkaSourceConfig(ConfigModel):
     connection: KafkaConsumerConnectionConfig = KafkaConsumerConnectionConfig()
-    topic: str = ".*"  # default is wildcard subscription
+    topic_patterns: AllowDenyPattern = AllowDenyPattern(allow=[".*"], deny=["^_.*"])
 
 
 @dataclass
@@ -57,18 +57,16 @@ class KafkaSourceReport(SourceReport):
 @dataclass
 class KafkaSource(Source):
     source_config: KafkaSourceConfig
-    topic_pattern: Any  # actually re.Pattern
     consumer: confluent_kafka.Consumer
     report: KafkaSourceReport
 
     def __init__(self, config: KafkaSourceConfig, ctx: PipelineContext):
         super().__init__(ctx)
         self.source_config = config
-        self.topic_pattern = re.compile(self.source_config.topic)
         self.consumer = confluent_kafka.Consumer(
             {
-                'group.id': 'test',
-                'bootstrap.servers': self.source_config.connection.bootstrap,
+                "group.id": "test",
+                "bootstrap.servers": self.source_config.connection.bootstrap,
                 **self.source_config.connection.consumer_config,
             }
         )
@@ -87,10 +85,9 @@ class KafkaSource(Source):
         for t in topics:
             self.report.report_topic_scanned(t)
 
-            # TODO: topics config should support allow and deny patterns
-            if re.fullmatch(self.topic_pattern, t) and not t.startswith("_"):
+            if self.source_config.topic_patterns.allowed(t):
                 mce = self._extract_record(t)
-                wu = MetadataWorkUnit(id=f'kafka-{t}', mce=mce)
+                wu = MetadataWorkUnit(id=f"kafka-{t}", mce=mce)
                 self.report.report_workunit(wu)
                 yield wu
             else:
@@ -123,7 +120,7 @@ class KafkaSource(Source):
 
         # Parse the schema
         fields: List[SchemaField] = []
-        if has_schema and schema.schema_type == 'AVRO':
+        if has_schema and schema.schema_type == "AVRO":
             fields = schema_util.avro_schema_to_mce_fields(schema.schema_str)
         elif has_schema:
             self.report.report_warning(
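In the source module, the old TODO and the hand-rolled re.fullmatch(...) and not t.startswith("_") check collapse into a single topic_patterns.allowed(t) call, so narrowing ingestion becomes a pure config change. A usage sketch, assuming ConfigModel is pydantic-based (the dict-driven KafkaSource.create calls in the tests below suggest as much) and using a hypothetical events namespace:

config = KafkaSourceConfig.parse_obj(
    {
        "connection": {"bootstrap": "localhost:9092"},
        "topic_patterns": {"allow": ["events\\..*"], "deny": ["^_.+"]},
    }
)
assert config.topic_patterns.allowed("events.clicks")
assert not config.topic_patterns.allowed("_schemas")
assert not config.topic_patterns.allowed("pageviews")  # not in the allow list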
@@ -10,8 +10,11 @@ from unittest.mock import patch, MagicMock
 class KafkaSourceTest(unittest.TestCase):
     @patch("gometa.ingestion.source.kafka.confluent_kafka.Consumer")
     def test_kafka_source_configuration(self, mock_kafka):
-        ctx = PipelineContext(run_id='test')
-        kafka_source = KafkaSource.create({'connection': {'bootstrap': 'foobar:9092'}}, ctx)
+        ctx = PipelineContext(run_id="test")
+        kafka_source = KafkaSource.create(
+            {"connection": {"bootstrap": "foobar:9092"}}, ctx
+        )
+        kafka_source.close()
         assert mock_kafka.call_count == 1
 
     @patch("gometa.ingestion.source.kafka.confluent_kafka.Consumer")
@@ -21,13 +24,15 @@ class KafkaSourceTest(unittest.TestCase):
         mock_cluster_metadata.topics = ["foobar", "bazbaz"]
         mock_kafka_instance.list_topics.return_value = mock_cluster_metadata
 
-        ctx = PipelineContext(run_id='test')
-        kafka_source = KafkaSource.create({'connection': {'bootstrap': 'localhost:9092'}}, ctx)
+        ctx = PipelineContext(run_id="test")
+        kafka_source = KafkaSource.create(
+            {"connection": {"bootstrap": "localhost:9092"}}, ctx
+        )
         workunits = []
         for w in kafka_source.get_workunits():
             workunits.append(w)
 
-        first_mce = workunits[0].get_metadata()['mce']
+        first_mce = workunits[0].get_metadata()["mce"]
         assert isinstance(first_mce, MetadataChangeEvent)
         mock_kafka.assert_called_once()
         mock_kafka_instance.list_topics.assert_called_once()
@@ -40,9 +45,14 @@ class KafkaSourceTest(unittest.TestCase):
         mock_cluster_metadata.topics = ["test", "foobar", "bazbaz"]
         mock_kafka_instance.list_topics.return_value = mock_cluster_metadata
 
-        ctx = PipelineContext(run_id='test1')
-        kafka_source = KafkaSource.create({'topic': 'test', 'connection': {'bootstrap': 'localhost:9092'}}, ctx)
-        assert kafka_source.source_config.topic == "test"
+        ctx = PipelineContext(run_id="test1")
+        kafka_source = KafkaSource.create(
+            {
+                "topic_patterns": {"allow": ["test"]},
+                "connection": {"bootstrap": "localhost:9092"},
+            },
+            ctx,
+        )
         workunits = [w for w in kafka_source.get_workunits()]
 
         mock_kafka.assert_called_once()
@@ -50,15 +60,23 @@ class KafkaSourceTest(unittest.TestCase):
         assert len(workunits) == 1
 
         mock_cluster_metadata.topics = ["test", "test2", "bazbaz"]
-        ctx = PipelineContext(run_id='test2')
-        kafka_source = KafkaSource.create({'topic': 'test.*', 'connection': {'bootstrap': 'localhost:9092'}}, ctx)
+        ctx = PipelineContext(run_id="test2")
+        kafka_source = KafkaSource.create(
+            {
+                "topic_patterns": {"allow": ["test.*"]},
+                "connection": {"bootstrap": "localhost:9092"},
+            },
+            ctx,
+        )
         workunits = [w for w in kafka_source.get_workunits()]
         assert len(workunits) == 2
 
     @patch("gometa.ingestion.source.kafka.confluent_kafka.Consumer")
     def test_close(self, mock_kafka):
         mock_kafka_instance = mock_kafka.return_value
-        ctx = PipelineContext(run_id='test')
-        kafka_source = KafkaSource.create({'topic': 'test', 'connection': {'bootstrap': 'localhost:9092'}}, ctx)
+        ctx = PipelineContext(run_id="test")
+        kafka_source = KafkaSource.create(
+            {"topic_patterns": {"allow": ["test"]}, "connection": {"bootstrap": "localhost:9092"}}, ctx
+        )
         kafka_source.close()
         assert mock_kafka_instance.close.call_count == 1