mirror of
https://github.com/datahub-project/datahub.git
synced 2025-07-23 09:32:04 +00:00
Start adding reporting
This commit is contained in:
parent
0929c7cb77
commit
fd9bc09e67
16
metadata-ingestion/src/gometa/ingestion/api/report.py
Normal file
16
metadata-ingestion/src/gometa/ingestion/api/report.py
Normal file
@ -0,0 +1,16 @@
|
||||
from abc import abstractmethod, ABCMeta
|
||||
from dataclasses import dataclass
|
||||
import json
|
||||
import pprint
|
||||
|
||||
@dataclass
class Report:
    """Base class for ingestion reports.

    Subclasses add counters/collections as instance attributes; the helpers
    below expose those attributes as a dict, a pretty-printed string, or JSON.
    """

    def as_obj(self) -> dict:
        """Return the report's instance attributes as a new dict.

        A shallow copy is returned (rather than ``self.__dict__`` itself) so
        that adding or removing keys on the result cannot alter the report.
        Nested containers are still shared with the report.
        """
        return dict(vars(self))

    def as_string(self) -> str:
        """Return the report pretty-printed with ``pprint.pformat``."""
        return pprint.pformat(self.as_obj())

    def as_json(self) -> str:
        """Return the report serialized as a JSON string.

        Values that ``json`` cannot encode natively (for example exception
        objects recorded as failures) are emitted as their ``repr()`` instead
        of raising ``TypeError``.
        """
        return json.dumps(self.as_obj(), default=repr)
|
@ -1,11 +1,25 @@
|
||||
from abc import abstractmethod, ABCMeta
|
||||
from dataclasses import dataclass
|
||||
from dataclasses import dataclass, field
|
||||
from typing import List, Any
|
||||
|
||||
from gometa.ingestion.api.closeable import Closeable
|
||||
from gometa.ingestion.api.common import RecordEnvelope, WorkUnit, PipelineContext
|
||||
from gometa.ingestion.api.report import Report
|
||||
|
||||
@dataclass
class SinkReport(Report):
    """Report collected by a sink: count of written records plus failures."""

    # TODO: workunits_processed is not tracked yet.
    # workunits_processed = 0

    # Declared as a real dataclass field so every instance carries the counter
    # from construction on (and it therefore shows up in as_obj() even when it
    # is still 0). init=False keeps the constructor signature unchanged.
    records_written: int = field(default=0, init=False)
    # Arbitrary failure descriptions, in the order they were reported.
    failures: List[Any] = field(default_factory=list)

    def report_record_written(self, record: RecordEnvelope) -> None:
        """Count one successfully written record.

        ``record`` is accepted for interface purposes; only the counter is
        updated.
        """
        self.records_written += 1

    def report_failure(self, info: Any) -> None:
        """Append ``info`` to the list of recorded failures."""
        self.failures.append(info)
|
||||
|
||||
|
||||
class WriteCallback:
|
||||
class WriteCallback(metaclass=ABCMeta):
|
||||
|
||||
@abstractmethod
|
||||
def on_success(self, record_envelope: RecordEnvelope, success_metadata: dict):
|
||||
@ -50,5 +64,9 @@ class Sink(Closeable, metaclass = ABCMeta):
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def close(self):
|
||||
def get_report(self) -> SinkReport:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def close(self) -> None:
|
||||
pass
|
||||
|
@ -1,8 +1,19 @@
|
||||
from typing import Iterable
|
||||
from dataclasses import dataclass
|
||||
from typing import Iterable, List
|
||||
from dataclasses import dataclass, field
|
||||
from abc import abstractmethod, ABCMeta
|
||||
from .closeable import Closeable
|
||||
from .common import WorkUnit, PipelineContext, RecordEnvelope
|
||||
from .report import Report
|
||||
|
||||
|
||||
@dataclass
class SourceReport(Report):
    """Report collected by a source: produced work units and their ids."""

    # Declared as a real dataclass field so every instance carries the counter
    # from construction on (and it therefore shows up in as_obj() even when it
    # is still 0). init=False keeps the constructor signature unchanged.
    workunits_produced: int = field(default=0, init=False)
    # Ids of the reported work units, in the order they were reported.
    workunit_ids: List[str] = field(default_factory=list)

    def report_workunit(self, wu: WorkUnit) -> None:
        """Count one produced work unit and remember its ``id``."""
        self.workunits_produced += 1
        self.workunit_ids.append(wu.id)
|
||||
|
||||
|
||||
class Extractor(Closeable, metaclass=ABCMeta):
|
||||
@ -27,3 +38,7 @@ class Source(Closeable, metaclass = ABCMeta):
|
||||
@abstractmethod
|
||||
def get_workunits(self) -> Iterable[WorkUnit]:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def get_report(self) -> SourceReport:
|
||||
pass
|
||||
|
@ -1,6 +1,7 @@
|
||||
from typing import Dict
|
||||
from pydantic import BaseModel
|
||||
from dataclasses import dataclass, field
|
||||
import pprint
|
||||
from gometa.configuration.common import DynamicTypedConfig, DynamicFactory
|
||||
from gometa.ingestion.api.source import Source, Extractor
|
||||
from gometa.ingestion.source import source_class_mapping
|
||||
@ -72,7 +73,7 @@ class Pipeline:
|
||||
callback = LoggingCallback()
|
||||
extractor = self.extractor_class()
|
||||
SinkClass: Type[Sink] = self.sink_class
|
||||
sink = SinkClass.create(self.sink_config, self.ctx)
|
||||
sink: Sink = SinkClass.create(self.sink_config, self.ctx)
|
||||
logger.info(f"Sink type:{self.config.sink.type},{self.sink_class} configured")
|
||||
for wu in self.source.get_workunits():
|
||||
# TODO: change extractor interface
|
||||
@ -84,3 +85,9 @@ class Pipeline:
|
||||
extractor.close()
|
||||
sink.handle_work_unit_end(wu)
|
||||
sink.close()
|
||||
|
||||
result = {
|
||||
'source': self.source.get_report().as_obj(),
|
||||
'sink': sink.get_report().as_obj(),
|
||||
}
|
||||
pprint.pprint(result, sort_dicts=False)
|
||||
|
@ -1,12 +1,12 @@
|
||||
from abc import ABC, abstractmethod
|
||||
from dataclasses import dataclass
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Optional, TypeVar, Type, Dict
|
||||
from pydantic import BaseModel, Field, ValidationError, validator
|
||||
from enum import Enum
|
||||
from pathlib import Path
|
||||
import requests
|
||||
from requests.exceptions import HTTPError
|
||||
from gometa.ingestion.api.sink import Sink, WriteCallback
|
||||
from gometa.ingestion.api.sink import Sink, WriteCallback, SinkReport
|
||||
from gometa.ingestion.api.common import RecordEnvelope, WorkUnit
|
||||
import json
|
||||
from gometa.metadata import json_converter
|
||||
@ -65,6 +65,7 @@ class DatahubRestSinkConfig(BaseModel):
|
||||
@dataclass
|
||||
class DatahubRestSink(Sink):
|
||||
config: DatahubRestSinkConfig
|
||||
report: SinkReport = field(default_factory=SinkReport)
|
||||
|
||||
@classmethod
|
||||
def create(cls, config_dict, ctx):
|
||||
@ -103,13 +104,19 @@ class DatahubRestSink(Sink):
|
||||
# with open('data.json', 'w') as outfile:
|
||||
# json.dump(serialized_snapshot, outfile)
|
||||
response.raise_for_status()
|
||||
self.report.report_record_written(record_envelope)
|
||||
write_callback.on_success(record_envelope, {})
|
||||
except HTTPError as e:
|
||||
info = response.json()
|
||||
breakpoint()
|
||||
self.report.report_failure({'e': e, 'info': info})
|
||||
write_callback.on_failure(record_envelope, e, info)
|
||||
except Exception as e:
|
||||
self.report.report_failure({'e': e})
|
||||
write_callback.on_failure(record_envelope, e, {})
|
||||
|
||||
def get_report(self) -> SinkReport:
|
||||
return self.report
|
||||
|
||||
def close(self):
|
||||
pass
|
||||
|
@ -1,5 +1,5 @@
|
||||
from gometa.configuration import ConfigModel, KafkaConnectionConfig
|
||||
from gometa.ingestion.api.source import Source, Extractor
|
||||
from gometa.ingestion.api.source import Source, Extractor, SourceReport
|
||||
from gometa.ingestion.api.source import WorkUnit
|
||||
from typing import Optional, Iterable
|
||||
from dataclasses import dataclass
|
||||
@ -25,6 +25,7 @@ class KafkaSource(Source):
|
||||
source_config: KafkaSourceConfig
|
||||
topic_pattern: re.Pattern
|
||||
consumer: confluent_kafka.Consumer
|
||||
report: SourceReport = SourceReport()
|
||||
|
||||
def __init__(self, config, ctx):
|
||||
super().__init__(ctx)
|
||||
@ -43,8 +44,12 @@ class KafkaSource(Source):
|
||||
if re.fullmatch(self.topic_pattern, t):
|
||||
# TODO: topics config should support allow and deny patterns
|
||||
if not t.startswith("_"):
|
||||
yield KafkaWorkUnit(id=f'kafka-{t}', config=KafkaSourceConfig(connection=self.source_config.connection, topic=t))
|
||||
|
||||
wu = KafkaWorkUnit(id=f'kafka-{t}', config=KafkaSourceConfig(connection=self.source_config.connection, topic=t))
|
||||
self.report.report_workunit(wu)
|
||||
yield wu
|
||||
|
||||
def get_report(self):
|
||||
return self.report
|
||||
|
||||
def close(self):
|
||||
if self.consumer:
|
||||
|
Loading…
x
Reference in New Issue
Block a user