datahub/contrib/metadata-ingestion/haskell/bin/lineage_hive_generator.hs
John Plaisted 6ece2d6469
Start adding java ETL examples, starting with kafka etl. (#1805)
Start adding java ETL examples, starting with kafka etl.

We've had a few requests to start providing Java examples rather than Python due to type safety.

I've also started to add these to metadata-ingestion-examples to make it clearer these are *examples*. They can be used directly or as a basis for other things.

As we port to Java we'll move examples to contrib.
2020-09-11 13:04:21 -07:00

114 lines
3.4 KiB
Haskell
Executable File

#! /usr/bin/env nix-shell
#! nix-shell ./lineage_hive_generator.hs.nix -i runghc
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE QuasiQuotes #-}
import Data.Functor ((<&>))
import Control.Monad (when)
import Control.Arrow ((>>>))
import Data.Proxy (Proxy(..))
import Data.Either (isLeft, fromLeft, fromRight)
import Text.Printf (formatString)
import System.IO (hPrint, stderr)
import Data.String.Conversions (cs)
import qualified Data.Text.Lazy as T
import qualified Data.Text.Lazy.IO as T
import qualified Data.Map as M
import qualified Data.Set as S
import qualified Data.HashMap.Strict as HM
import qualified Data.Aeson as J
import Data.Conduit (ConduitT, runConduitRes, runConduit, bracketP, (.|))
import qualified Data.Conduit.Combinators as C
import qualified Database.Sql.Hive.Parser as HIVE
import qualified Database.Sql.Hive.Type as HIVE
import Database.Sql.Type (
Catalog(..), DatabaseName(..), FullyQualifiedTableName(..), FQTN(..)
, makeDefaultingCatalog, mkNormalSchema
)
import Database.Sql.Util.Scope (runResolverWarn)
import Database.Sql.Util.Lineage.Table (getTableLineage)
import Data.Aeson.QQ (aesonQQ)
import Data.Time.Clock.POSIX (getPOSIXTime)
-- NOTE(review): empty instance bodies rely on the classes' default method
-- implementations — presumably aeson's Generic-based defaults; confirm that
-- 'FullyQualifiedTableName' derives Generic in queryparser.  These are
-- orphan instances, which is acceptable in a standalone script like this.
instance J.ToJSON FullyQualifiedTableName
instance J.ToJSONKey FullyQualifiedTableName
-- | Current wall-clock time as milliseconds since the Unix epoch,
-- truncated with 'floor'.  Used as the audit-stamp time on emitted events.
nowts :: IO Int
nowts = do
  seconds <- getPOSIXTime
  return (floor (seconds * 1000))
-- | Resolution catalog used by the resolver: no pre-declared tables, a
-- single "public" schema, and "defaultDatabase" as the default database.
-- A defaulting catalog presumably conjures unknown tables into existence
-- rather than rejecting them, so unresolved names still yield lineage.
catalog :: Catalog
catalog =
  makeDefaultingCatalog
    HM.empty
    [mkNormalSchema "public" ()]
    (DatabaseName () "defaultDatabase")
-- | Render a fully-qualified table name as @database.schema.table@.
tableName :: FullyQualifiedTableName -> T.Text
tableName (FullyQualifiedTableName db schema tbl) =
  db <> "." <> schema <> "." <> tbl
-- | Build one DataHub MetadataChangeEvent (as a JSON 'J.Value') for a single
-- output table: a DatasetSnapshot carrying an UpstreamLineage aspect that
-- lists every input table feeding the output.  @ts@ is the audit-stamp time
-- in epoch milliseconds (see 'nowts').
mkMCE :: Int -> (FQTN, S.Set FQTN) -> J.Value
mkMCE ts (output, inputs) = [aesonQQ|
{ "proposedSnapshot": {
"com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": {
"urn": #{uriName output}
, "aspects": [
{ "com.linkedin.pegasus2avro.dataset.UpstreamLineage": {
"upstreams": #{upstreams ts inputs}
}
}
]
}
}
}
|]
where
-- One upstream entry; the audit actor is a hard-coded placeholder user.
upstream :: Int -> T.Text -> J.Value
upstream ts dataset = [aesonQQ|
{ "auditStamp": {"time":#{ts}, "actor":"urn:li:corpuser:jdoe"}
, "dataset": #{dataset}
, "type":"TRANSFORMED"
}
|]
-- All upstream entries, one per input-table URN, in set order.
upstreams ts = map (upstream ts . uriName) . S.toList
-- DataHub dataset URN for a Hive table, hard-coded to the PROD fabric.
uriName :: FQTN -> T.Text
uriName fqtn = "urn:li:dataset:(urn:li:dataPlatform:hive,"
<> tableName fqtn
<> ",PROD)"
-- | Read HiveQL file paths from stdin (one per line), parse each file,
-- resolve table references against 'catalog', extract table-level lineage,
-- and print one MetadataChangeEvent JSON object per output table to stdout.
-- Each input path is echoed to stderr as progress tracing.
main :: IO ()
main = do
  contents <- T.getContents <&> T.lines
  ts <- nowts
  runConduit $ C.yieldMany contents
    .| C.iterM (hPrint stderr)        -- trace each path to stderr
    .| C.mapM (cs >>> T.readFile)     -- NOTE(review): lazy I/O; fine for a one-shot script
    .| C.concatMap parseSQL
    .| C.mapM resolveStatement
    .| C.concatMap (getTableLineage >>> M.toList)
    .| C.map (mkMCE ts)
    .| C.mapM_ (J.encode >>> cs >>> putStrLn)
  where
    -- Parse a file's worth of HiveQL statements; abort the whole run with
    -- the parser's error on failure.  ('either' replaces the previous
    -- partial 'when isLeft / fromRight undefined' combination.)
    parseSQL sql =
      either (error . show) id (HIVE.parseManyAll sql)
    -- Resolve a parsed statement against 'catalog'; abort on a resolution
    -- error, discarding any non-fatal resolution warnings.  The explicit
    -- case replaces a non-exhaustive lazy pattern that would have crashed
    -- with an opaque irrefutable-pattern error on 'Left'.
    resolveStatement st =
      case runResolverWarn (HIVE.resolveHiveStatement st) HIVE.dialectProxy catalog of
        (Left err, _warnings)       -> error (show err)
        (Right resolved, _warnings) -> return resolved