Start adding reporting

This commit is contained in:
Harshal Sheth 2021-02-09 15:25:04 -08:00 committed by Shirshanka Das
parent 0929c7cb77
commit fd9bc09e67
6 changed files with 79 additions and 11 deletions

View File

@ -0,0 +1,16 @@
from abc import abstractmethod, ABCMeta
from dataclasses import dataclass
import json
import pprint
@dataclass
class Report:
    """Base class for run reports.

    State lives in ordinary instance attributes; the helpers below expose
    that state as a dict, a pretty-printed string, or a JSON document.
    """

    def as_obj(self) -> dict:
        """Return the instance attribute dictionary.

        This is ``self.__dict__`` itself, not a copy, so changes made to the
        returned dict are changes to the report.
        """
        return self.__dict__

    def as_string(self) -> str:
        """Return a pretty-printed rendering of :meth:`as_obj`."""
        return pprint.pformat(self.as_obj())

    def as_json(self) -> str:
        """Return the report serialized as a JSON string.

        Values that JSON cannot encode natively (such as exception objects
        stored in a report) are rendered with ``repr`` rather than raising
        ``TypeError``; a report is diagnostic output and should always be
        printable.
        """
        # default=repr is a deliberate fallback for non-JSON values only;
        # natively serializable values are emitted unchanged.
        return json.dumps(self.as_obj(), default=repr)

View File

@ -1,11 +1,25 @@
from abc import abstractmethod, ABCMeta from abc import abstractmethod, ABCMeta
from dataclasses import dataclass from dataclasses import dataclass, field
from typing import List, Any
from gometa.ingestion.api.closeable import Closeable from gometa.ingestion.api.closeable import Closeable
from gometa.ingestion.api.common import RecordEnvelope, WorkUnit, PipelineContext from gometa.ingestion.api.common import RecordEnvelope, WorkUnit, PipelineContext
from gometa.ingestion.api.report import Report
@dataclass
class SinkReport(Report):
    """Running tally of what a sink has written and what it failed to write."""

    # TODO: workunits_processed counter (previously stubbed out as a comment).

    # Details of each reported failure, in the order they were reported.
    failures: List[Any] = field(default_factory=list)
    # Annotated so that it is a real dataclass field. Without the annotation
    # it is only a class attribute, and it stays out of the instance
    # __dict__ (and therefore out of as_obj()) until the first increment.
    # Declared after `failures` so the positional constructor order and the
    # attribute order in the rendered report stay as they were.
    records_written: int = 0

    def report_record_written(self, record: RecordEnvelope) -> None:
        """Count one successfully written record.

        Only the count is kept; ``record`` itself is not stored.
        """
        self.records_written += 1

    def report_failure(self, info: Any) -> None:
        """Append ``info`` describing one failed write to ``failures``."""
        self.failures.append(info)
class WriteCallback: class WriteCallback(metaclass=ABCMeta):
@abstractmethod @abstractmethod
def on_success(self, record_envelope: RecordEnvelope, success_metadata: dict): def on_success(self, record_envelope: RecordEnvelope, success_metadata: dict):
@ -50,5 +64,9 @@ class Sink(Closeable, metaclass = ABCMeta):
pass pass
@abstractmethod @abstractmethod
def close(self): def get_report(self) -> SinkReport:
pass
@abstractmethod
def close(self) -> None:
pass pass

View File

@ -1,8 +1,19 @@
from typing import Iterable from typing import Iterable, List
from dataclasses import dataclass from dataclasses import dataclass, field
from abc import abstractmethod, ABCMeta from abc import abstractmethod, ABCMeta
from .closeable import Closeable from .closeable import Closeable
from .common import WorkUnit, PipelineContext, RecordEnvelope from .common import WorkUnit, PipelineContext, RecordEnvelope
from .report import Report
@dataclass
class SourceReport(Report):
    """Running tally of the work units a source has produced."""

    # Ids of the reported work units, in the order they were reported.
    workunit_ids: List[str] = field(default_factory=list)
    # Annotated so that it is a real dataclass field. Without the annotation
    # it is only a class attribute, and it stays out of the instance
    # __dict__ (and therefore out of as_obj()) until the first increment.
    # Declared after `workunit_ids` so the positional constructor order and
    # the attribute order in the rendered report stay as they were.
    workunits_produced: int = 0

    def report_workunit(self, wu: WorkUnit) -> None:
        """Count one produced work unit and remember its ``id``."""
        self.workunits_produced += 1
        self.workunit_ids.append(wu.id)
class Extractor(Closeable, metaclass=ABCMeta): class Extractor(Closeable, metaclass=ABCMeta):
@ -27,3 +38,7 @@ class Source(Closeable, metaclass = ABCMeta):
@abstractmethod @abstractmethod
def get_workunits(self) -> Iterable[WorkUnit]: def get_workunits(self) -> Iterable[WorkUnit]:
pass pass
@abstractmethod
def get_report(self) -> SourceReport:
pass

View File

@ -1,6 +1,7 @@
from typing import Dict from typing import Dict
from pydantic import BaseModel from pydantic import BaseModel
from dataclasses import dataclass, field from dataclasses import dataclass, field
import pprint
from gometa.configuration.common import DynamicTypedConfig, DynamicFactory from gometa.configuration.common import DynamicTypedConfig, DynamicFactory
from gometa.ingestion.api.source import Source, Extractor from gometa.ingestion.api.source import Source, Extractor
from gometa.ingestion.source import source_class_mapping from gometa.ingestion.source import source_class_mapping
@ -72,7 +73,7 @@ class Pipeline:
callback = LoggingCallback() callback = LoggingCallback()
extractor = self.extractor_class() extractor = self.extractor_class()
SinkClass: Type[Sink] = self.sink_class SinkClass: Type[Sink] = self.sink_class
sink = SinkClass.create(self.sink_config, self.ctx) sink: Sink = SinkClass.create(self.sink_config, self.ctx)
logger.info(f"Sink type:{self.config.sink.type},{self.sink_class} configured") logger.info(f"Sink type:{self.config.sink.type},{self.sink_class} configured")
for wu in self.source.get_workunits(): for wu in self.source.get_workunits():
# TODO: change extractor interface # TODO: change extractor interface
@ -84,3 +85,9 @@ class Pipeline:
extractor.close() extractor.close()
sink.handle_work_unit_end(wu) sink.handle_work_unit_end(wu)
sink.close() sink.close()
result = {
'source': self.source.get_report().as_obj(),
'sink': sink.get_report().as_obj(),
}
pprint.pprint(result, sort_dicts=False)

View File

@ -1,12 +1,12 @@
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from dataclasses import dataclass from dataclasses import dataclass, field
from typing import Optional, TypeVar, Type, Dict from typing import Optional, TypeVar, Type, Dict
from pydantic import BaseModel, Field, ValidationError, validator from pydantic import BaseModel, Field, ValidationError, validator
from enum import Enum from enum import Enum
from pathlib import Path from pathlib import Path
import requests import requests
from requests.exceptions import HTTPError from requests.exceptions import HTTPError
from gometa.ingestion.api.sink import Sink, WriteCallback from gometa.ingestion.api.sink import Sink, WriteCallback, SinkReport
from gometa.ingestion.api.common import RecordEnvelope, WorkUnit from gometa.ingestion.api.common import RecordEnvelope, WorkUnit
import json import json
from gometa.metadata import json_converter from gometa.metadata import json_converter
@ -65,6 +65,7 @@ class DatahubRestSinkConfig(BaseModel):
@dataclass @dataclass
class DatahubRestSink(Sink): class DatahubRestSink(Sink):
config: DatahubRestSinkConfig config: DatahubRestSinkConfig
report: SinkReport = field(default_factory=SinkReport)
@classmethod @classmethod
def create(cls, config_dict, ctx): def create(cls, config_dict, ctx):
@ -103,13 +104,19 @@ class DatahubRestSink(Sink):
# with open('data.json', 'w') as outfile: # with open('data.json', 'w') as outfile:
# json.dump(serialized_snapshot, outfile) # json.dump(serialized_snapshot, outfile)
response.raise_for_status() response.raise_for_status()
self.report.report_record_written(record_envelope)
write_callback.on_success(record_envelope, {}) write_callback.on_success(record_envelope, {})
except HTTPError as e: except HTTPError as e:
info = response.json() info = response.json()
breakpoint() breakpoint()
self.report.report_failure({'e': e, 'info': info})
write_callback.on_failure(record_envelope, e, info) write_callback.on_failure(record_envelope, e, info)
except Exception as e: except Exception as e:
self.report.report_failure({'e': e})
write_callback.on_failure(record_envelope, e, {}) write_callback.on_failure(record_envelope, e, {})
def get_report(self) -> SinkReport:
return self.report
def close(self): def close(self):
pass pass

View File

@ -1,5 +1,5 @@
from gometa.configuration import ConfigModel, KafkaConnectionConfig from gometa.configuration import ConfigModel, KafkaConnectionConfig
from gometa.ingestion.api.source import Source, Extractor from gometa.ingestion.api.source import Source, Extractor, SourceReport
from gometa.ingestion.api.source import WorkUnit from gometa.ingestion.api.source import WorkUnit
from typing import Optional, Iterable from typing import Optional, Iterable
from dataclasses import dataclass from dataclasses import dataclass
@ -25,6 +25,7 @@ class KafkaSource(Source):
source_config: KafkaSourceConfig source_config: KafkaSourceConfig
topic_pattern: re.Pattern topic_pattern: re.Pattern
consumer: confluent_kafka.Consumer consumer: confluent_kafka.Consumer
report: SourceReport = SourceReport()
def __init__(self, config, ctx): def __init__(self, config, ctx):
super().__init__(ctx) super().__init__(ctx)
@ -43,8 +44,12 @@ class KafkaSource(Source):
if re.fullmatch(self.topic_pattern, t): if re.fullmatch(self.topic_pattern, t):
# TODO: topics config should support allow and deny patterns # TODO: topics config should support allow and deny patterns
if not t.startswith("_"): if not t.startswith("_"):
yield KafkaWorkUnit(id=f'kafka-{t}', config=KafkaSourceConfig(connection=self.source_config.connection, topic=t)) wu = KafkaWorkUnit(id=f'kafka-{t}', config=KafkaSourceConfig(connection=self.source_config.connection, topic=t))
self.report.report_workunit(wu)
yield wu
def get_report(self):
return self.report
def close(self): def close(self):
if self.consumer: if self.consumer: