datahub/contrib/metadata-ingestion/haskell/bin/dataset-jdbc-generator.hs
John Plaisted 6ece2d6469
Start adding java ETL examples, starting with kafka etl. (#1805)
Start adding java ETL examples, starting with kafka etl.

We've had a few requests to start providing Java examples rather than Python due to type safety.

I've also started to add these to metadata-ingestion-examples to make it clearer these are *examples*. They can be used directly or as a basis for other things.

As we port to Java we'll move examples to contrib.
2020-09-11 13:04:21 -07:00

214 lines
7.4 KiB
Haskell
Executable File

#! /usr/bin/env nix-shell
#! nix-shell dataset-jdbc-generator.hs.nix -i "runghc --ghc-arg=-fobject-code"
{-# LANGUAGE OverloadedStrings, FlexibleInstances, FlexibleContexts, ScopedTypeVariables #-}
{-# LANGUAGE DataKinds #-}
{-# LANGUAGE TemplateHaskell, QuasiQuotes #-}
{-# LANGUAGE TypeOperators #-}
{-# OPTIONS_GHC -fplugin=Language.Java.Inline.Plugin #-}
import System.Environment (lookupEnv)
import System.IO (hPrint, stderr, hSetEncoding, stdout, utf8)
import qualified Language.Haskell.TH.Syntax as TH
import Control.Concurrent (runInBoundThread)
import Language.Java (J, withJVM, reify, reflect, JType(..))
import Language.Java.Inline (imports, java)
import qualified Data.Text as T
import qualified Data.Text.IO as T
import Data.String.Conversions (cs)
import Text.InterpolatedString.Perl6 (q)
import Prelude hiding ((>>=), (>>))
import Data.Conduit (ConduitT, ZipSink(..), getZipSink, runConduitRes, runConduit, bracketP, (.|), yield)
import qualified Data.Conduit.Combinators as C
import qualified Data.Conduit.List as C (groupBy)
import qualified Data.Aeson as J
import Control.Arrow ((>>>))
import Data.Aeson.QQ (aesonQQ)
import Text.Printf (printf)
imports "java.util.*"
imports "java.sql.*"
datasetOracleSql :: T.Text
datasetOracleSql = [q|
select
c.OWNER || '.' || c.TABLE_NAME as schema_name
, t.COMMENTS as schema_description
, c.COLUMN_NAME as field_path
, c.DATA_TYPE as native_data_type
, m.COMMENTS as description
from ALL_TAB_COLUMNS c
left join ALL_TAB_COMMENTS t
on c.OWNER = t.OWNER
and c.TABLE_NAME = t.TABLE_NAME
left join ALL_COL_COMMENTS m
on c.OWNER = m.OWNER
and c.TABLE_NAME = m.TABLE_NAME
and c.COLUMN_NAME = m.COLUMN_NAME
where NOT REGEXP_LIKE(c.OWNER, 'ANONYMOUS|PUBLIC|SYS|SYSTEM|DBSNMP|MDSYS|CTXSYS|XDB|TSMSYS|ORACLE.*|APEX.*|TEST?*|GG_.*|\\$')
order by schema_name, c.COLUMN_ID
|]
datasetMysqlSql :: T.Text
datasetMysqlSql = [q|
select
concat(c.TABLE_SCHEMA, '.', c.TABLE_NAME) as schema_name
, NULL as schema_description
, c.COLUMN_NAME as field_path
, c.DATA_TYPE as native_data_type
, c.COLUMN_COMMENT as description
from information_schema.columns c
where table_schema not in ('information_schema')
order by schema_name, c.ORDINAL_POSITION
|]
datasetPostgresqlSql :: T.Text
datasetPostgresqlSql = [q|
SELECT
c.table_schema || '.' || c.table_name as schema_name
, pgtd.description as schema_description
, c.column_name as field_path
, c.data_type as native_data_type
, pgcd.description as description
FROM INFORMATION_SCHEMA.COLUMNS c
INNER JOIN
pg_catalog.pg_statio_all_tables as st on c.table_schema=st.schemaname and c.table_name=st.relname
LEFT JOIN
pg_catalog.pg_description pgcd on pgcd.objoid=st.relid and pgcd.objsubid=c.ordinal_position
LEFT JOIN
pg_catalog.pg_description pgtd on pgtd.objoid=st.relid and pgtd.objsubid=0
WHERE c.table_schema NOT IN ('information_schema', 'pg_catalog')
ORDER by schema_name, ordinal_position ;
|]
mkMCE :: Int -> T.Text -> [[T.Text]] -> J.Value
mkMCE ts platform fields@((schemaName:schemaDescription:_):_) = [aesonQQ|
{ "proposedSnapshot": {
"com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": {
"urn": #{urn}
, "aspects": [
{ "com.linkedin.pegasus2avro.common.Ownership": {
"owners": [{"owner": "urn:li:corpuser:datahub", "type":"DATAOWNER"}]
, "lastModified": {"time": #{ts}, "actor": "urn:li:corpuser:datahub"}
}
}
, { "com.linkedin.pegasus2avro.dataset.DatasetProperties": {
"description": {"string": #{schemaDescription}}
}
}
, { "com.linkedin.pegasus2avro.schema.SchemaMetadata": {
"schemaName": #{schemaName}
, "platform": "urn:li:dataPlatform:"
, "version": 0
, "created": {"time": #{ts}, "actor": "urn:li:corpuser:datahub"}
, "lastModified": {"time": #{ts}, "actor": "urn:li:corpuser:datahub"}
, "hash": ""
, "platformSchema": {
"com.linkedin.pegasus2avro.schema.MySqlDDL": {
"documentSchema": "{}"
, "tableSchema": "{}"
}
}
, "fields": #{mkFields fields}
}
}
]
}
}
}
|]
where
urn :: String = printf "urn:li:dataset:(urn:li:dataPlatform:%s,%s,%s)"
platform schemaName ("PROD"::String)
mkField (_:_:fieldPath:nativeDataType:description:[]) = [aesonQQ|
{
"fieldPath": #{fieldPath}
, "description": {"string": #{description}}
, "type": {"type": {"com.linkedin.pegasus2avro.schema.StringType": {}}}
, "nativeDataType": #{nativeDataType}
}
|]
mkFields = map mkField
main :: IO ()
main = do
hSetEncoding stdout utf8
let
jvmArgs = case $(TH.lift =<< TH.runIO (lookupEnv "CLASSPATH")) of
Nothing -> []
Just cp -> [ cs ("-Djava.class.path=" ++ cp) ]
platform :: T.Text = "localhost_postgresql"
-- dbUrl :: T.Text = "jdbc:mysql://localhost:3306/datahub?useSSL=false"
-- dbUrl :: T.Text = "jdbc:oracle:thin@localhost:1521:EDWDB"
dbUrl :: T.Text = "jdbc:postgresql://localhost:5432/datahub"
dbUser :: T.Text = "datahub"
dbPassword :: T.Text = "datahub"
-- dbDriver:: T.Text = "oracle.jdbc.OracleDriver" ;
-- dbDriver:: T.Text = "com.mysql.jdbc.Driver" ;
dbDriver:: T.Text = "org.postgresql.Driver" ;
-- dbDriver:: T.Text = "com.microsoft.sqlserver.jdbc.SQLServerDriver" ;
-- dbSQL :: T.Text = datasetMysqlSql
-- dbSQL :: T.Text = datasetOracleSql
dbSQL :: T.Text = datasetPostgresqlSql
runInBoundThread $ withJVM jvmArgs $ do
[jDbUrl, jDbUser, jDbPassword, jDbDriver, jDbSQL ] <-
mapM reflect [dbUrl, dbUser, dbPassword, dbDriver, dbSQL]
result <- [java| {
try {
Class.forName($jDbDriver) ;
} catch (ClassNotFoundException e) {
e.printStackTrace() ;
System.exit(1) ;
}
List<String[]> result = new ArrayList<String[]>() ;
try (Connection con = DriverManager.getConnection($jDbUrl, $jDbUser, $jDbPassword) ;
Statement st = con.createStatement(); ) {
try (ResultSet rs = st.executeQuery($jDbSQL)) {
while(rs.next()) {
String[] row = new String[] {
Optional.ofNullable(rs.getString("schema_name")).orElse("")
, Optional.ofNullable(rs.getString("schema_description")).orElse("")
, Optional.ofNullable(rs.getString("field_path")).orElse("")
, Optional.ofNullable(rs.getString("native_data_type")).orElse("")
, Optional.ofNullable(rs.getString("description")).orElse("")
} ;
result.add(row) ;
}
}
return result.toArray(new String[0][0]) ;
} catch (SQLException e) {
e.printStackTrace() ;
return null ;
}
} |]
rows :: [[T.Text]] <- reify result
runConduit $ C.yieldMany rows
-- .| C.iterM (hPrint stderr)
.| C.groupBy sameSchemaName
-- .| C.iterM (hPrint stderr)
.| C.map (mkMCE 0 platform)
.| C.mapM_ (J.encode >>> cs >>> putStrLn)
.| C.sinkNull
return ()
where
sameSchemaName (schemaNameL:_) (schemaNameR:_) = schemaNameL == schemaNameR