#! /usr/bin/env nix-shell
#! nix-shell datahub-producer.hs.nix -i runghc
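
-- Runs inside a nix-shell: datahub-producer.hs.nix (not shown here) is
-- expected to pin GHC and the Haskell dependencies, and runghc interprets
-- this file directly.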
{-# LANGUAGE OverloadedStrings, FlexibleInstances, FlexibleContexts, ScopedTypeVariables #-}
{-# LANGUAGE TemplateHaskell #-}
import System.Environment (getArgs)
import System.Directory (canonicalizePath)
import Data.Typeable (Typeable)
import Data.Functor ((<&>))
import Control.Arrow (left, right)
import Control.Monad ((>=>), when)
import Control.Monad.IO.Class (MonadIO(..))
import Control.Monad.Catch (Exception, MonadThrow(..))
import qualified Data.ByteString.Lazy as B
import qualified Data.ByteString.Lazy.Char8 as BC
import qualified Data.Text as T
import qualified Data.Aeson as J
import Data.String.Conversions (cs)
import qualified Data.Binary as BIN
import Data.HashMap.Strict ((!))
import qualified Data.HashMap.Strict as MS
import qualified Data.Vector as V
import Control.Lens ((^?), (^..), folded, _Just)
import Data.Aeson.Lens (key, _Array, _String)
import qualified Data.Avro.Types as A (Value(..))
import qualified Data.Avro as A (Schema, Result(..))
import qualified Data.Avro.Schema as AS (
    Schema(..), resultToEither, buildTypeEnvironment
  , renderFullname, parseFullname, typeName, parseAvroJSON
  )
-- import Data.Avro.JSON (decodeAvroJSON)
import Data.Avro.Encode (encodeAvro)
import Data.Avro.Decode (decodeAvro)
-- import Data.Avro.Deriving (makeSchema)
import Kafka.Avro (
    SchemaRegistry(..), Subject(..), SchemaId(..)
  , schemaRegistry, sendSchema
  , extractSchemaId, loadSchema
  )
import Data.Conduit (ConduitT, ZipSink(..), getZipSink, runConduitRes, runConduit, bracketP, (.|), yield)
import qualified Data.Conduit.Combinators as C
import Kafka.Conduit.Sink (ProducerRecord(..), TopicName(..), ProducePartition(..), BrokerAddress(..), kafkaSink, brokersList)
import Network.URI (parseURI)
import Network.URI.Lens (uriAuthorityLens, uriRegNameLens, uriPortLens)
import System.Process (readProcess)
data StringException = StringException String deriving (Typeable, Show)
instance Exception StringException
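
-- A local copy of 'decodeAvroJSON' from Data.Avro.JSON (see the commented
-- import above), inlined here presumably to keep its union handling under
-- local control. It interprets an Aeson value against an Avro schema; the
-- only subtle case is unions: JSON null selects the "null" branch, and any
-- other branch must be a single-field object keyed by the branch's type
-- name, e.g. {"string": "hello"}.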
decodeAvroJSON :: A.Schema -> J.Value -> A.Result (A.Value A.Schema)
decodeAvroJSON schema json =
  AS.parseAvroJSON union env schema json
  where
    env =
      AS.buildTypeEnvironment missing schema
    missing name =
      fail ("Type " <> show name <> " not in schema")

    union (AS.Union schemas) J.Null
      | AS.Null `elem` schemas =
          pure $ A.Union schemas AS.Null A.Null
      | otherwise =
          fail "Null not in union."
    union (AS.Union schemas) (J.Object obj)
      | null obj =
          fail "Invalid encoding of union: empty object ({})."
      | length obj > 1 =
          fail ("Invalid encoding of union: object with too many fields: " ++ show obj)
      | otherwise =
          let
            canonicalize name
              | isBuiltIn name = name
              | otherwise = AS.renderFullname $ AS.parseFullname name
            branch =
              head $ MS.keys obj
            names =
              MS.fromList [(AS.typeName t, t) | t <- V.toList schemas]
          in case MS.lookup (canonicalize branch) names of
            Just t -> do
              nested <- AS.parseAvroJSON union env t (obj ! branch)
              return (A.Union schemas t nested)
            Nothing -> fail ("Type '" <> T.unpack branch <> "' not in union: " <> show schemas)
    union AS.Union{} _ =
      A.Error "Invalid JSON representation for union: has to be a JSON object with exactly one field."
    union _ _ =
      error "Impossible: function given non-union schema."

    isBuiltIn name = name `elem` [ "null", "boolean", "int", "long", "float"
                                 , "double", "bytes", "string", "array", "map" ]
fromRight :: (MonadThrow m, Show a) => String -> Either a b -> m b
fromRight label = either (throwM . StringException . (label ++) . show) return

fromJust :: (MonadThrow m, Show a) => String -> Maybe a -> m a
fromJust label = maybe (throwM . StringException $ label ++ " value is missing") return
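
-- Encode one JSON record in the Confluent wire format: a zero "magic" byte,
-- the 4-byte big-endian schema id assigned by the registry, then the Avro
-- binary payload. 'sendSchema' registers the schema under the given subject
-- (the "<topic>-value" topic-name strategy is used at the call site in
-- 'main') and returns its id.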
encodeJsonWithSchema :: (MonadIO m, MonadThrow m)
                     => SchemaRegistry
                     -> Subject
                     -> A.Schema
                     -> J.Value
                     -> m B.ByteString
encodeJsonWithSchema sr subj schema json = do
  v <- fromRight "[decodeAvroJSON]" $ AS.resultToEither $ decodeAvroJSON schema json
  sid <- fromRight "[SchemaRegistry.sendSchema]" =<< sendSchema sr subj schema
  return $ appendSchemaId v sid
  where
    appendSchemaId v (SchemaId sid) =
      B.cons (toEnum 0) (BIN.encode sid) <> encodeAvro v
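
-- The inverse of 'encodeJsonWithSchema': split the schema id prefix off the
-- payload, fetch the writer schema from the registry, decode the Avro body
-- and render it back to JSON. Only exercised by the commented-out debugging
-- stage in 'main' below.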
decodeJsonWithSchema :: (MonadIO m, MonadThrow m)
                     => SchemaRegistry
                     -> B.ByteString
                     -> m J.Value
decodeJsonWithSchema sr bs = do
  (sid, payload) <- maybe (throwM . StringException $ "BadPayloadNoSchemaId") return $ extractSchemaId bs
  schema <- fromRight "[SchemaRegistry.loadSchema]" =<< loadSchema sr sid
  J.toJSON <$> (fromRight "[Avro.decodeAvro]" $ decodeAvro schema payload)
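
-- Evaluate a nix expression file to JSON. 'nix-instantiate --eval' prints
-- the result of 'builtins.toJSON' as a quoted nix string literal, so 'read'
-- is used to strip that outer layer of quoting before the text is handed to
-- Aeson.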
parseNixJson :: FilePath -> IO J.Value
parseNixJson f = do
  stdout :: String <- read <$> readProcess "nix-instantiate" ["--eval", "--expr", "builtins.toJSON (import " ++ f ++ ")"] ""
  fromRight "[Aeson.eitherDecode] parse nix json" (J.eitherDecode (cs stdout))
main :: IO ()
main = do
  args <- getArgs
  when (length args /= 1) $
    error "usage: datahub-producer.hs <config-dir>"
  confDir <- canonicalizePath (head args)
  putStrLn ("confDir: " <> confDir)

  confJson <- parseNixJson (confDir <> "/" <> "datahub-config.nix")
  -- putStrLn ("confJson: " ++ show confJson)

  schema <- fromRight "[Aeson.eitherDecode] parse avsc file:" =<<
              J.eitherDecode <$> B.readFile (confDir <> "/" <> "MetadataChangeEvent.avsc")
  -- putStrLn ("schema: " ++ show schema)

  let
    topic = "MetadataChangeEvent"
    -- schema = $(makeSchema "../MetadataChangeEvent.avsc")
    sandboxL = key "services" . key "linkedin-datahub-pipeline" . key "sandbox"
    urisL = key "uris" . _Array . folded . _String
    brokers = confJson ^.. sandboxL . key "kafka" . urisL
    srs = confJson ^.. sandboxL . key "schema-registry" . urisL
    -- Strip each broker URI down to "host:port" (uriPortLens yields the
    -- ":port" part, colon included) for kafka-client's BrokerAddress.
    brokers' = map (\uriText -> BrokerAddress . cs . concat $
                      parseURI (cs uriText)
                        ^.. _Just . uriAuthorityLens . _Just . (uriRegNameLens <> uriPortLens))
                   brokers
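  -- The lenses above expect datahub-config.nix to evaluate to roughly this
  -- shape (a minimal sketch; the URIs are illustrative only):
  --
  --   {
  --     services.linkedin-datahub-pipeline.sandbox = {
  --       kafka.uris           = [ "http://localhost:9092" ];
  --       schema-registry.uris = [ "http://localhost:8081" ];
  --     };
  --   }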
  contents <- B.getContents <&> BC.lines
  sr <- schemaRegistry (cs (head srs))
  putStrLn " ==> beginning to send data..."

  runConduitRes $ C.yieldMany contents
    .| C.mapM (fromRight "[JSON.eitherDecode] read json record:" . J.eitherDecode)
    -- .| C.iterM (liftIO . putStrLn . cs . J.encode)
    .| C.mapM (encodeJsonWithSchema sr (Subject (topic <> "-value")) schema)
    -- .| C.iterM (decodeJsonWithSchema sr >=> liftIO . print . J.encode)
    .| C.map (mkRecord (TopicName topic))
    -- Fan out to two sinks: one produces the records to Kafka, the other
    -- counts them and reports the total once the stream is exhausted.
    .| getZipSink (ZipSink (kafkaSink (brokersList brokers')) *>
                   ZipSink ((C.length >>= yield)
                              .| C.iterM (\n -> liftIO $ putStrLn ("total table num: " <> show n))
                              .| C.sinkNull))
  return ()
  where
    mkRecord :: TopicName -> B.ByteString -> ProducerRecord
    mkRecord topic bs = ProducerRecord topic UnassignedPartition Nothing (Just (cs bs))
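
-- Example invocation (the file name is illustrative; one MetadataChangeEvent
-- JSON document per input line):
--
--   cat metadata-change-events.json | ./datahub-producer.hs ./conf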