#! /usr/bin/env nix-shell
#! nix-shell datahub-producer.hs.nix -i runghc

{-# LANGUAGE OverloadedStrings, FlexibleInstances, FlexibleContexts, ScopedTypeVariables #-}
{-# LANGUAGE TemplateHaskell #-}
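-- Produce DataHub MetadataChangeEvent records to Kafka: read one JSON
-- record per line from stdin, Avro-encode each against the
-- MetadataChangeEvent schema (registering it with the schema registry),
-- and send the result to the MetadataChangeEvent topic.
--
-- Usage (<config-dir> must contain datahub-config.nix and
-- MetadataChangeEvent.avsc; the input file name is illustrative):
--
--   ./datahub-producer.hs <config-dir> < mce-records.json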
import System.Environment (getArgs)
import System.Directory (canonicalizePath)
import Data.Typeable (Typeable)
import Data.Functor ((<&>))
import Control.Arrow (left, right)
import Control.Monad ((>=>), when)
import Control.Monad.IO.Class (MonadIO(..))

import Control.Monad.Catch (Exception, MonadThrow(..))

import qualified Data.ByteString.Lazy as B
import qualified Data.ByteString.Lazy.Char8 as BC
import qualified Data.Text as T
import qualified Data.Aeson as J
import Data.String.Conversions (cs)
import qualified Data.Binary as BIN

import Data.HashMap.Strict ((!))
import qualified Data.HashMap.Strict as MS
import qualified Data.Vector as V

import Control.Lens ((^?), (^..), folded, _Just)
import Data.Aeson.Lens (key, _Array, _String)

import qualified Data.Avro.Types as A (Value(..))
import qualified Data.Avro as A (Schema, Result(..))
import qualified Data.Avro.Schema as AS (
    Schema(..), resultToEither, buildTypeEnvironment
  , renderFullname, parseFullname, typeName, parseAvroJSON
  )
-- import Data.Avro.JSON (decodeAvroJSON)
import Data.Avro.Encode (encodeAvro)
import Data.Avro.Decode (decodeAvro)
-- import Data.Avro.Deriving (makeSchema)

import Kafka.Avro (
    SchemaRegistry(..), Subject(..), SchemaId(..)
  , schemaRegistry, sendSchema
  , extractSchemaId, loadSchema
  )

import Data.Conduit (ConduitT, ZipSink(..), getZipSink, runConduitRes, runConduit, bracketP, (.|), yield)
import qualified Data.Conduit.Combinators as C
import Kafka.Conduit.Sink (ProducerRecord(..), TopicName(..), ProducePartition(..), BrokerAddress(..), kafkaSink, brokersList)

import Network.URI (parseURI)
import Network.URI.Lens (uriAuthorityLens, uriRegNameLens, uriPortLens)

import System.Process (readProcess)

data StringException = StringException String deriving (Typeable, Show)
instance Exception StringException

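-- Decode an Aeson value into a typed Avro value against the given schema.
-- Likely inlined from Data.Avro.JSON (note the commented import above) so
-- that union branches can be matched by canonicalized full name.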
decodeAvroJSON :: A.Schema -> J.Value -> A.Result (A.Value A.Schema)
decodeAvroJSON schema json =
  AS.parseAvroJSON union env schema json
  where
    env =
      AS.buildTypeEnvironment missing schema
    missing name =
      fail ("Type " <> show name <> " not in schema")

    union (AS.Union schemas) J.Null
      | AS.Null `elem` schemas =
          pure $ A.Union schemas AS.Null A.Null
      | otherwise =
          fail "Null not in union."
    union (AS.Union schemas) (J.Object obj)
      | null obj =
          fail "Invalid encoding of union: empty object ({})."
      | length obj > 1 =
          fail ("Invalid encoding of union: object with too many fields:" ++ show obj)
      | otherwise =
          let
            canonicalize name
              | isBuiltIn name = name
              | otherwise = AS.renderFullname $ AS.parseFullname name
            branch =
              head $ MS.keys obj
            names =
              MS.fromList [(AS.typeName t, t) | t <- V.toList schemas]
          in case MS.lookup (canonicalize branch) names of
            Just t -> do
              nested <- AS.parseAvroJSON union env t (obj ! branch)
              return (A.Union schemas t nested)
            Nothing -> fail ("Type '" <> T.unpack branch <> "' not in union: " <> show schemas)
    union AS.Union{} _ =
      A.Error "Invalid JSON representation for union: has to be a JSON object with exactly one field."
    union _ _ =
      error "Impossible: function given non-union schema."

    isBuiltIn name = name `elem` [ "null", "boolean", "int", "long", "float"
                                 , "double", "bytes", "string", "array", "map" ]

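-- Helpers to lift Either/Maybe failures into MonadThrow as labeled
-- StringExceptions, so each call site can tag which step failed.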
fromRight :: (MonadThrow m, Show a) => String -> Either a b -> m b
fromRight label = either (throwM . StringException . (label ++) . show) return

fromJust :: (MonadThrow m, Show a) => String -> Maybe a -> m a
fromJust label = maybe (throwM . StringException $ (label ++ " value is missing")) return

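-- Avro-encode a JSON record for Kafka in the Confluent wire format:
-- the schema is registered under the subject, and the payload is framed
-- as magic byte 0x00 + 4-byte schema id + Avro binary.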
encodeJsonWithSchema :: (MonadIO m, MonadThrow m)
                     => SchemaRegistry
                     -> Subject
                     -> A.Schema
                     -> J.Value
                     -> m B.ByteString
encodeJsonWithSchema sr subj schema json = do
  v   <- fromRight "[decodeAvroJSON]" $ AS.resultToEither $ decodeAvroJSON schema json
  sid <- fromRight "[SchemaRegistry.sendSchema]" =<< sendSchema sr subj schema
  return $ appendSchemaId v sid
  where
    appendSchemaId v (SchemaId sid) = B.cons (toEnum 0) (BIN.encode sid) <> encodeAvro v

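-- Inverse of encodeJsonWithSchema: strip the schema-id framing, load the
-- writer schema from the registry, and decode back to JSON. Used only by
-- the commented-out verification stage in main.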
decodeJsonWithSchema :: (MonadIO m, MonadThrow m)
                     => SchemaRegistry
                     -> B.ByteString
                     -> m J.Value
decodeJsonWithSchema sr bs = do
  (sid, payload) <- maybe (throwM . StringException $ "BadPayloadNoSchemaId") return $ extractSchemaId bs
  schema <- fromRight "[SchemaRegistry.loadSchema]" =<< loadSchema sr sid
  J.toJSON <$> fromRight "[Avro.decodeAvro]" (decodeAvro schema payload)

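-- Evaluate a Nix expression file to JSON by shelling out to
-- nix-instantiate, so the pipeline configuration can stay in Nix.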
parseNixJson :: FilePath -> IO J.Value
parseNixJson f = do
  -- nix-instantiate prints a quoted string literal; 'read' strips the quoting.
  stdout :: String <- read <$> readProcess "nix-instantiate" ["--eval", "--expr", "builtins.toJSON (import " ++ f ++ ")"] ""
  fromRight "[Aeson.eitherDecode] parse nix json" (J.eitherDecode (cs stdout))

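-- Entry point: parse the Nix config and Avro schema from the config
-- directory, then stream JSON records from stdin into Kafka.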
main :: IO ()
main = do
  args <- getArgs
  when (length args /= 1) $
    error "usage: datahub-producer.hs [config-dir]"

  confDir <- canonicalizePath (head args)
  putStrLn ("confDir: " <> confDir)
  confJson <- parseNixJson (confDir <> "/" <> "datahub-config.nix")
  -- putStrLn ("confJson: " ++ show confJson)
  schema <- fromRight "[Aeson.eitherDecode] parse avsc file:" =<<
    J.eitherDecode <$> B.readFile (confDir <> "/" <> "MetadataChangeEvent.avsc")
  -- putStrLn ("schema: " ++ show schema)

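  -- Lens paths below pull the Kafka broker and schema-registry URIs out
  -- of the sandbox pipeline definition in datahub-config.nix.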
  let
    topic = "MetadataChangeEvent"
    -- schema = $(makeSchema "../MetadataChangeEvent.avsc")
    sandboxL = key "services" . key "linkedin-datahub-pipeline" . key "sandbox"
    urisL = key "uris" . _Array . folded . _String
    brokers = confJson ^.. sandboxL . key "kafka" . urisL
    srs = confJson ^.. sandboxL . key "schema-registry" . urisL
    -- Reduce each broker URI to its "host:port" authority part.
    brokers' = map (\uriText -> BrokerAddress . cs . concat $
                      parseURI (cs uriText) ^.. _Just . uriAuthorityLens . _Just . (uriRegNameLens <> uriPortLens))
                   brokers

  contents <- B.getContents <&> BC.lines
  sr <- schemaRegistry (cs (head srs))

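  -- ZipSink fans each record into two sinks run in parallel: one
  -- produces to Kafka, the other counts records and prints the total.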
  putStrLn " ==> beginning to send data..."
  runConduitRes $ C.yieldMany contents
    .| C.mapM (fromRight "[JSON.eitherDecode] read json record:" . J.eitherDecode)
    -- .| C.iterM (liftIO . putStrLn . cs . J.encode)
    .| C.mapM (encodeJsonWithSchema sr (Subject (topic <> "-value")) schema)
    -- .| C.iterM (decodeJsonWithSchema sr >=> liftIO . print . J.encode)
    .| C.map (mkRecord (TopicName topic))
    .| getZipSink (ZipSink (kafkaSink (brokersList brokers')) *>
                   ZipSink ((C.length >>= yield) .| C.iterM (\n -> liftIO $ putStrLn ("total record num: " <> show n)) .| C.sinkNull))
  return ()
  where
    mkRecord :: TopicName -> B.ByteString -> ProducerRecord
    mkRecord topic bs = ProducerRecord topic UnassignedPartition Nothing (Just (cs bs))