Streamline serde

Harshal Sheth 2021-02-11 17:28:54 -08:00 committed by Shirshanka Das
parent 761b27893b
commit ae88fbf727
5 changed files with 554 additions and 19 deletions


@@ -10,7 +10,6 @@ from confluent_kafka import SerializingProducer
from confluent_kafka.serialization import StringSerializer
from confluent_kafka.schema_registry import SchemaRegistryClient
from confluent_kafka.schema_registry.avro import AvroSerializer
from gometa.metadata import json_converter
from gometa.metadata.schema_classes import SCHEMA_JSON_STR
from gometa.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
@@ -44,32 +43,25 @@ class DatahubKafkaSink(Sink):
self.config = config
self.report = SinkReport()
mce_schema = MetadataChangeEvent.RECORD_SCHEMA
producer_config = {
"bootstrap.servers": self.config.connection.bootstrap,
"schema.registry.url": self.config.connection.schema_registry_url,
**self.config.connection.producer_config,
}
schema_registry_conf = {
'url': self.config.connection.schema_registry_url,
**self.config.connection.schema_registry_config,
}
schema_registry_client = SchemaRegistryClient(schema_registry_conf)
def convert_mce_to_dict(mce, ctx):
tuple_encoding = json_converter.with_tuple_union().to_json_object(mce)
def convert_mce_to_dict(mce: MetadataChangeEvent, ctx):
tuple_encoding = mce.to_obj(tuples=True)
return tuple_encoding
avro_serializer = AvroSerializer(SCHEMA_JSON_STR, schema_registry_client, to_dict=convert_mce_to_dict)
producer_conf = {
producer_config = {
"bootstrap.servers": self.config.connection.bootstrap,
'key.serializer': StringSerializer('utf_8'),
'value.serializer': avro_serializer,
**self.config.connection.producer_config,
}
self.producer = SerializingProducer(producer_conf)
self.producer = SerializingProducer(producer_config)
@classmethod
def create(cls, config_dict, ctx: PipelineContext):
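For illustration only (not part of this commit): a minimal sketch of how the new to_obj(tuples=True) path plugs into confluent-kafka's SerializingProducer. The bootstrap address, schema registry URL, and topic name below are placeholders.

from confluent_kafka import SerializingProducer
from confluent_kafka.serialization import StringSerializer
from confluent_kafka.schema_registry import SchemaRegistryClient
from confluent_kafka.schema_registry.avro import AvroSerializer
from gometa.metadata.schema_classes import SCHEMA_JSON_STR
from gometa.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent

# The schema registry client is now configured separately from the producer.
schema_registry_client = SchemaRegistryClient({"url": "http://localhost:8081"})  # placeholder URL

def convert_mce_to_dict(mce: MetadataChangeEvent, ctx):
    # tuples=True emits the tuple-style union encoding the Avro serializer
    # expects, replacing json_converter.with_tuple_union().to_json_object().
    return mce.to_obj(tuples=True)

avro_serializer = AvroSerializer(SCHEMA_JSON_STR, schema_registry_client, to_dict=convert_mce_to_dict)

producer = SerializingProducer({
    "bootstrap.servers": "localhost:9092",  # placeholder address
    "key.serializer": StringSerializer("utf_8"),
    "value.serializer": avro_serializer,
})

# The MCE instance itself is passed as the value; the serializer encodes it.
# producer.produce(topic="MetadataChangeEvent_v4", key="some-urn", value=mce)  # placeholder topic/key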


@@ -9,7 +9,6 @@ from requests.exceptions import HTTPError
from gometa.ingestion.api.sink import Sink, WriteCallback, SinkReport
from gometa.ingestion.api.common import RecordEnvelope, WorkUnit
import json
from gometa.metadata import json_converter
from gometa.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
from gometa.metadata import (
ChartSnapshotClass,
@@ -96,7 +95,7 @@ class DatahubRestSink(Sink):
mce = record_envelope.record
url = self.get_ingest_endpoint(mce)
raw_mce_obj = json_converter.to_json_object(mce.proposedSnapshot)
raw_mce_obj = mce.proposedSnapshot.to_obj()
mce_obj = _rest_li_ify(raw_mce_obj)
snapshot = {'snapshot': mce_obj}
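For illustration only (not part of this commit): a rough sketch of the REST write path after this change. The endpoint URL is a placeholder and rest_li_ify stands in for the sink's existing _rest_li_ify helper; the actual sink's request code may differ.

import requests
from gometa.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent

def post_snapshot(mce: MetadataChangeEvent, ingest_url: str, rest_li_ify) -> None:
    # New path: serialize the snapshot straight from the generated class
    # instead of going through json_converter.to_json_object().
    raw_mce_obj = mce.proposedSnapshot.to_obj()
    # Reshape the dict into rest.li union form before posting.
    mce_obj = rest_li_ify(raw_mce_obj)
    snapshot = {"snapshot": mce_obj}
    response = requests.post(ingest_url, json=snapshot)
    response.raise_for_status()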


@@ -5,7 +5,6 @@ import os
import pathlib
import logging
import json
from gometa.metadata import json_converter
from gometa.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
logger = logging.getLogger(__name__)
@@ -44,7 +43,7 @@ class FileSink(Sink):
def write_record_async(self, record_envelope: RecordEnvelope[MetadataChangeEvent], write_callback: WriteCallback):
mce = record_envelope.record
obj = json_converter.to_json_object(mce, MetadataChangeEvent.RECORD_SCHEMA)
obj = mce.to_obj()
if self.wrote_something:
self.file.write(',\n')
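For illustration only (not part of this commit): a standalone sketch of the file sink's write pattern with the new to_obj() call, assuming the caller supplies the MCEs and an output path.

import json
from typing import Iterable
from gometa.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent

def write_mces(mces: Iterable[MetadataChangeEvent], filename: str) -> None:
    # The sink streams a JSON array, writing ',\n' before every record after
    # the first; to_obj() gives the plain (non-tuple) JSON encoding.
    with open(filename, "w") as f:
        f.write("[\n")
        for i, mce in enumerate(mces):
            if i:
                f.write(",\n")
            json.dump(mce.to_obj(), f)
        f.write("\n]")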


@@ -4,7 +4,6 @@ from pydantic import BaseModel
from typing import Optional, Iterable
from gometa.ingestion.api.source import Source, SourceReport
from gometa.ingestion.source.metadata_common import MetadataWorkUnit
from gometa.metadata import json_converter
from gometa.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
class MetadataFileSourceConfig(BaseModel):
@@ -27,7 +26,7 @@ class MetadataFileSource(Source):
mce_obj_list = [mce_obj_list]
for i, obj in enumerate(mce_obj_list):
mce = json_converter.from_json_object(obj, MetadataChangeEvent.RECORD_SCHEMA)
mce: MetadataChangeEvent = MetadataChangeEvent.from_obj(obj)
wu = MetadataWorkUnit(f"file://{self.config.filename}:{i}", mce)
self.report.report_workunit(wu)
yield wu
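For illustration only (not part of this commit): the matching read side, sketching how MetadataChangeEvent.from_obj round-trips a file written in the format above. The filename is a placeholder supplied by the caller.

import json
from typing import Iterator
from gometa.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent

def read_mces(filename: str) -> Iterator[MetadataChangeEvent]:
    with open(filename) as f:
        mce_obj_list = json.load(f)
    # The source accepts either a single object or a list of objects.
    if not isinstance(mce_obj_list, list):
        mce_obj_list = [mce_obj_list]
    for obj in mce_obj_list:
        # from_obj is the inverse of to_obj on the generated classes.
        yield MetadataChangeEvent.from_obj(obj)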

File diff suppressed because it is too large