58 lines
1.9 KiB
Python
Raw Normal View History

2021-01-31 22:40:30 -08:00
from gometa.configuration import ConfigModel, KafkaConnectionConfig
2021-02-09 15:25:04 -08:00
from gometa.ingestion.api.source import Source, Extractor, SourceReport
2021-01-31 22:40:30 -08:00
from gometa.ingestion.api.source import WorkUnit
from typing import Optional, Iterable
2021-01-31 22:40:30 -08:00
from dataclasses import dataclass
import confluent_kafka
import re
from gometa.ingestion.api.closeable import Closeable
2021-02-01 11:24:52 -08:00
2021-01-31 22:40:30 -08:00
class KafkaSourceConfig(ConfigModel):
    """Configuration for the Kafka source: a broker connection and a topic filter."""

    # Broker connection settings; defaults to a default-constructed
    # KafkaConnectionConfig.
    connection: KafkaConnectionConfig = KafkaConnectionConfig()

    # Topic selector. The default ".*" is a wildcard subscription.
    topic: str = ".*"
2021-02-01 11:24:52 -08:00
2021-01-31 22:40:30 -08:00
@dataclass
class KafkaWorkUnit(WorkUnit):
    """Work unit that carries the Kafka source configuration it was built with."""

    # Configuration attached to this work unit.
    config: KafkaSourceConfig

    def get_metadata(self):
        """Return the attached configuration as given by ``config.dict()``."""
        metadata = self.config.dict()
        return metadata
@dataclass
class KafkaSource(Source):
    """Source that yields one ``KafkaWorkUnit`` per matching Kafka topic.

    The topic names reported by the consumer are filtered with the regular
    expression from ``source_config.topic`` (full match), and names that
    start with an underscore are skipped.
    """

    source_config: KafkaSourceConfig
    topic_pattern: re.Pattern
    consumer: confluent_kafka.Consumer
    # Deliberately no class-level default: a default instance here would be
    # shared by every KafkaSource (the hand-written __init__ below is the one
    # in effect), so reports would accumulate across instances. A fresh
    # report is created per instance in __init__ instead.
    report: SourceReport

    def __init__(self, config, ctx):
        """Set up the topic pattern, the report and the Kafka consumer.

        Args:
            config: A ``KafkaSourceConfig`` describing connection and topic filter.
            ctx: Context object passed through to the ``Source`` base class.
        """
        super().__init__(ctx)
        self.source_config = config
        self.report = SourceReport()
        self.topic_pattern = re.compile(self.source_config.topic)
        # NOTE(review): the consumer group id is hard-coded to 'test'.
        self.consumer = confluent_kafka.Consumer(
            {
                'group.id': 'test',
                'bootstrap.servers': self.source_config.connection.bootstrap,
            }
        )

    @classmethod
    def create(cls, config_dict, ctx):
        """Build a source from a raw config mapping.

        Args:
            config_dict: Object parsed into a ``KafkaSourceConfig`` via ``parse_obj``.
            ctx: Context object forwarded to the constructor.

        Returns:
            A new instance of ``cls``.
        """
        config = KafkaSourceConfig.parse_obj(config_dict)
        return cls(config, ctx)

    def get_workunits(self) -> Iterable[KafkaWorkUnit]:
        """Yield a work unit for every topic that passes the filters.

        Each yielded work unit is also recorded on ``self.report`` before it
        is handed to the caller.
        """
        topics = self.consumer.list_topics().topics
        for topic_name in topics:
            # The whole topic name must match the configured pattern.
            if not self.topic_pattern.fullmatch(topic_name):
                continue
            # TODO: topics config should support allow and deny patterns
            # Underscore-prefixed topics are skipped (presumably internal
            # topics -- confirm).
            if topic_name.startswith("_"):
                continue
            wu = KafkaWorkUnit(
                id=f'kafka-{topic_name}',
                config=KafkaSourceConfig(
                    connection=self.source_config.connection,
                    topic=topic_name,
                ),
            )
            self.report.report_workunit(wu)
            yield wu

    def get_report(self):
        """Return the report object that tracks the emitted work units."""
        return self.report

    def close(self):
        """Close the Kafka consumer if one is set."""
        if self.consumer:
            self.consumer.close()