mirror of
https://github.com/datahub-project/datahub.git
synced 2025-07-23 17:39:59 +00:00
Start adding reporting
This commit is contained in:
parent
0929c7cb77
commit
fd9bc09e67
16
metadata-ingestion/src/gometa/ingestion/api/report.py
Normal file
16
metadata-ingestion/src/gometa/ingestion/api/report.py
Normal file
@ -0,0 +1,16 @@
|
|||||||
|
from abc import abstractmethod, ABCMeta
|
||||||
|
from dataclasses import dataclass
|
||||||
|
import json
|
||||||
|
import pprint
|
||||||
|
|
||||||
|
@dataclass
class Report:
    """Base class for ingestion reports.

    Provides three renderings of whatever attributes are stored on the
    instance: a plain dict, a pretty-printed string, and a JSON string.
    """

    def as_obj(self) -> dict:
        """Return the instance's attribute dict.

        This is the live ``__dict__`` (not a copy), so mutating the result
        mutates the report itself.
        """
        return self.__dict__

    def as_string(self) -> str:
        """Return :meth:`as_obj` formatted with ``pprint.pformat``."""
        return pprint.pformat(self.as_obj())

    def as_json(self) -> str:
        """Return :meth:`as_obj` serialized as a JSON string.

        Values the ``json`` module cannot encode natively (for example
        exception objects recorded as failure details) are rendered with
        ``str()`` instead of making the whole report fail with ``TypeError``.
        """
        # default=str is deliberate: a report is diagnostic output, and a
        # stringified value is more useful than no report at all.
        return json.dumps(self.as_obj(), default=str)
|
@ -1,11 +1,25 @@
|
|||||||
from abc import abstractmethod, ABCMeta
|
from abc import abstractmethod, ABCMeta
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass, field
|
||||||
|
from typing import List, Any
|
||||||
|
|
||||||
from gometa.ingestion.api.closeable import Closeable
|
from gometa.ingestion.api.closeable import Closeable
|
||||||
from gometa.ingestion.api.common import RecordEnvelope, WorkUnit, PipelineContext
|
from gometa.ingestion.api.common import RecordEnvelope, WorkUnit, PipelineContext
|
||||||
|
from gometa.ingestion.api.report import Report
|
||||||
|
|
||||||
|
@dataclass
class SinkReport(Report):
    """Counters and failure details collected by a sink."""

    # Declared first so the generated __init__ keeps `failures` as its first
    # parameter, as it was when this was the only dataclass field.
    failures: List[Any] = field(default_factory=list)
    # Annotated so @dataclass treats it as a field: it is then set on every
    # instance and shows up in the report even when nothing was written.
    # Without the annotation it is a plain class attribute and stays out of
    # the instance __dict__ until the first increment.
    records_written: int = 0
    # workunits_processed = 0

    def report_record_written(self, record: RecordEnvelope) -> None:
        """Count one successfully written record.

        ``record`` is accepted for the caller's convenience; only the counter
        is updated.
        """
        self.records_written += 1

    def report_failure(self, info: Any) -> None:
        """Append ``info`` (any failure description) to ``failures``."""
        self.failures.append(info)
|
||||||
|
|
||||||
|
|
||||||
class WriteCallback:
|
class WriteCallback(metaclass=ABCMeta):
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def on_success(self, record_envelope: RecordEnvelope, success_metadata: dict):
|
def on_success(self, record_envelope: RecordEnvelope, success_metadata: dict):
|
||||||
@ -50,5 +64,9 @@ class Sink(Closeable, metaclass = ABCMeta):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def close(self):
|
def get_report(self) -> SinkReport:
|
||||||
|
pass
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def close(self) -> None:
|
||||||
pass
|
pass
|
||||||
|
@ -1,8 +1,19 @@
|
|||||||
from typing import Iterable
|
from typing import Iterable, List
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass, field
|
||||||
from abc import abstractmethod, ABCMeta
|
from abc import abstractmethod, ABCMeta
|
||||||
from .closeable import Closeable
|
from .closeable import Closeable
|
||||||
from .common import WorkUnit, PipelineContext, RecordEnvelope
|
from .common import WorkUnit, PipelineContext, RecordEnvelope
|
||||||
|
from .report import Report
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class SourceReport(Report):
    """Counters collected by a source as it produces work units."""

    # Declared first so the generated __init__ keeps `workunit_ids` as its
    # first parameter, as it was when this was the only dataclass field.
    workunit_ids: List[str] = field(default_factory=list)
    # Annotated so @dataclass treats it as a field: it is then set on every
    # instance and shows up in the report even when no work unit was produced.
    # Without the annotation it is a plain class attribute and stays out of
    # the instance __dict__ until the first increment.
    workunits_produced: int = 0

    def report_workunit(self, wu: WorkUnit) -> None:
        """Record one produced work unit: bump the counter and keep its id."""
        self.workunits_produced += 1
        self.workunit_ids.append(wu.id)
|
||||||
|
|
||||||
|
|
||||||
class Extractor(Closeable, metaclass=ABCMeta):
|
class Extractor(Closeable, metaclass=ABCMeta):
|
||||||
@ -27,3 +38,7 @@ class Source(Closeable, metaclass = ABCMeta):
|
|||||||
@abstractmethod
|
@abstractmethod
|
||||||
def get_workunits(self) -> Iterable[WorkUnit]:
|
def get_workunits(self) -> Iterable[WorkUnit]:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def get_report(self) -> SourceReport:
|
||||||
|
pass
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
from typing import Dict
|
from typing import Dict
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
|
import pprint
|
||||||
from gometa.configuration.common import DynamicTypedConfig, DynamicFactory
|
from gometa.configuration.common import DynamicTypedConfig, DynamicFactory
|
||||||
from gometa.ingestion.api.source import Source, Extractor
|
from gometa.ingestion.api.source import Source, Extractor
|
||||||
from gometa.ingestion.source import source_class_mapping
|
from gometa.ingestion.source import source_class_mapping
|
||||||
@ -72,7 +73,7 @@ class Pipeline:
|
|||||||
callback = LoggingCallback()
|
callback = LoggingCallback()
|
||||||
extractor = self.extractor_class()
|
extractor = self.extractor_class()
|
||||||
SinkClass: Type[Sink] = self.sink_class
|
SinkClass: Type[Sink] = self.sink_class
|
||||||
sink = SinkClass.create(self.sink_config, self.ctx)
|
sink: Sink = SinkClass.create(self.sink_config, self.ctx)
|
||||||
logger.info(f"Sink type:{self.config.sink.type},{self.sink_class} configured")
|
logger.info(f"Sink type:{self.config.sink.type},{self.sink_class} configured")
|
||||||
for wu in self.source.get_workunits():
|
for wu in self.source.get_workunits():
|
||||||
# TODO: change extractor interface
|
# TODO: change extractor interface
|
||||||
@ -84,3 +85,9 @@ class Pipeline:
|
|||||||
extractor.close()
|
extractor.close()
|
||||||
sink.handle_work_unit_end(wu)
|
sink.handle_work_unit_end(wu)
|
||||||
sink.close()
|
sink.close()
|
||||||
|
|
||||||
|
result = {
|
||||||
|
'source': self.source.get_report().as_obj(),
|
||||||
|
'sink': sink.get_report().as_obj(),
|
||||||
|
}
|
||||||
|
pprint.pprint(result, sort_dicts=False)
|
||||||
|
@ -1,12 +1,12 @@
|
|||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass, field
|
||||||
from typing import Optional, TypeVar, Type, Dict
|
from typing import Optional, TypeVar, Type, Dict
|
||||||
from pydantic import BaseModel, Field, ValidationError, validator
|
from pydantic import BaseModel, Field, ValidationError, validator
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import requests
|
import requests
|
||||||
from requests.exceptions import HTTPError
|
from requests.exceptions import HTTPError
|
||||||
from gometa.ingestion.api.sink import Sink, WriteCallback
|
from gometa.ingestion.api.sink import Sink, WriteCallback, SinkReport
|
||||||
from gometa.ingestion.api.common import RecordEnvelope, WorkUnit
|
from gometa.ingestion.api.common import RecordEnvelope, WorkUnit
|
||||||
import json
|
import json
|
||||||
from gometa.metadata import json_converter
|
from gometa.metadata import json_converter
|
||||||
@ -65,6 +65,7 @@ class DatahubRestSinkConfig(BaseModel):
|
|||||||
@dataclass
|
@dataclass
|
||||||
class DatahubRestSink(Sink):
|
class DatahubRestSink(Sink):
|
||||||
config: DatahubRestSinkConfig
|
config: DatahubRestSinkConfig
|
||||||
|
report: SinkReport = field(default_factory=SinkReport)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def create(cls, config_dict, ctx):
|
def create(cls, config_dict, ctx):
|
||||||
@ -103,13 +104,19 @@ class DatahubRestSink(Sink):
|
|||||||
# with open('data.json', 'w') as outfile:
|
# with open('data.json', 'w') as outfile:
|
||||||
# json.dump(serialized_snapshot, outfile)
|
# json.dump(serialized_snapshot, outfile)
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
|
self.report.report_record_written(record_envelope)
|
||||||
write_callback.on_success(record_envelope, {})
|
write_callback.on_success(record_envelope, {})
|
||||||
except HTTPError as e:
|
except HTTPError as e:
|
||||||
info = response.json()
|
info = response.json()
|
||||||
breakpoint()
|
breakpoint()
|
||||||
|
self.report.report_failure({'e': e, 'info': info})
|
||||||
write_callback.on_failure(record_envelope, e, info)
|
write_callback.on_failure(record_envelope, e, info)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
self.report.report_failure({'e': e})
|
||||||
write_callback.on_failure(record_envelope, e, {})
|
write_callback.on_failure(record_envelope, e, {})
|
||||||
|
|
||||||
|
def get_report(self) -> SinkReport:
|
||||||
|
return self.report
|
||||||
|
|
||||||
def close(self):
|
def close(self):
|
||||||
pass
|
pass
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
from gometa.configuration import ConfigModel, KafkaConnectionConfig
|
from gometa.configuration import ConfigModel, KafkaConnectionConfig
|
||||||
from gometa.ingestion.api.source import Source, Extractor
|
from gometa.ingestion.api.source import Source, Extractor, SourceReport
|
||||||
from gometa.ingestion.api.source import WorkUnit
|
from gometa.ingestion.api.source import WorkUnit
|
||||||
from typing import Optional, Iterable
|
from typing import Optional, Iterable
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
@ -25,6 +25,7 @@ class KafkaSource(Source):
|
|||||||
source_config: KafkaSourceConfig
|
source_config: KafkaSourceConfig
|
||||||
topic_pattern: re.Pattern
|
topic_pattern: re.Pattern
|
||||||
consumer: confluent_kafka.Consumer
|
consumer: confluent_kafka.Consumer
|
||||||
|
report: SourceReport = SourceReport()
|
||||||
|
|
||||||
def __init__(self, config, ctx):
|
def __init__(self, config, ctx):
|
||||||
super().__init__(ctx)
|
super().__init__(ctx)
|
||||||
@ -43,8 +44,12 @@ class KafkaSource(Source):
|
|||||||
if re.fullmatch(self.topic_pattern, t):
|
if re.fullmatch(self.topic_pattern, t):
|
||||||
# TODO: topics config should support allow and deny patterns
|
# TODO: topics config should support allow and deny patterns
|
||||||
if not t.startswith("_"):
|
if not t.startswith("_"):
|
||||||
yield KafkaWorkUnit(id=f'kafka-{t}', config=KafkaSourceConfig(connection=self.source_config.connection, topic=t))
|
wu = KafkaWorkUnit(id=f'kafka-{t}', config=KafkaSourceConfig(connection=self.source_config.connection, topic=t))
|
||||||
|
self.report.report_workunit(wu)
|
||||||
|
yield wu
|
||||||
|
|
||||||
|
def get_report(self):
|
||||||
|
return self.report
|
||||||
|
|
||||||
def close(self):
|
def close(self):
|
||||||
if self.consumer:
|
if self.consumer:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user