
Start adding Java ETL examples, starting with Kafka ETL. We've had a few requests to provide Java examples rather than Python ones, due to type safety. I've also started adding these to metadata-ingestion-examples to make it clearer that these are *examples*: they can be used directly or as a basis for other things. As we port examples to Java, we'll move them to contrib.
214 lines · 7.4 KiB · Haskell · Executable File
#! /usr/bin/env nix-shell
#! nix-shell dataset-jdbc-generator.hs.nix -i "runghc --ghc-arg=-fobject-code"

{-# LANGUAGE OverloadedStrings, FlexibleInstances, FlexibleContexts, ScopedTypeVariables #-}
{-# LANGUAGE DataKinds #-}
{-# LANGUAGE TemplateHaskell, QuasiQuotes #-}
{-# LANGUAGE TypeOperators #-}

{-# OPTIONS_GHC -fplugin=Language.Java.Inline.Plugin #-}
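-- A JDBC dataset ETL example: start an in-process JVM with inline-java, read
-- column metadata over JDBC, and emit one DataHub MetadataChangeEvent (MCE)
-- as a line of JSON per table on stdout.
--
-- A sketch of an invocation (the driver jar name is an assumption; CLASSPATH
-- is captured when the Template Haskell splice in `main` is compiled, which
-- under runghc happens at startup):
--
--   CLASSPATH=postgresql-42.2.5.jar ./dataset-jdbc-generator.hs > mce.json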
import System.Environment (lookupEnv)
import System.IO (hPrint, stderr, hSetEncoding, stdout, utf8)

import qualified Language.Haskell.TH.Syntax as TH

import Control.Concurrent (runInBoundThread)
import Language.Java (J, withJVM, reify, reflect, JType(..))
import Language.Java.Inline (imports, java)

import qualified Data.Text as T
import qualified Data.Text.IO as T
import Data.String.Conversions (cs)
import Text.InterpolatedString.Perl6 (q)

import Prelude hiding ((>>=), (>>))

import Data.Conduit (ConduitT, ZipSink(..), getZipSink, runConduitRes, runConduit, bracketP, (.|), yield)
import qualified Data.Conduit.Combinators as C
import qualified Data.Conduit.List as C (groupBy)

import qualified Data.Aeson as J
import Control.Arrow ((>>>))
import Data.Aeson.QQ (aesonQQ)

import Text.Printf (printf)
imports "java.util.*"
imports "java.sql.*"
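-- Each dialect-specific query below returns the same five columns, in this
-- order: schema_name, schema_description, field_path, native_data_type,
-- description. Rows are sorted by schema_name so consecutive rows belonging
-- to one table can be grouped into a single MCE further down the pipeline.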
datasetOracleSql :: T.Text
datasetOracleSql = [q|
  select
      c.OWNER || '.' || c.TABLE_NAME as schema_name
    , t.COMMENTS as schema_description
    , c.COLUMN_NAME as field_path
    , c.DATA_TYPE as native_data_type
    , m.COMMENTS as description
  from ALL_TAB_COLUMNS c
  left join ALL_TAB_COMMENTS t
    on c.OWNER = t.OWNER
   and c.TABLE_NAME = t.TABLE_NAME
  left join ALL_COL_COMMENTS m
    on c.OWNER = m.OWNER
   and c.TABLE_NAME = m.TABLE_NAME
   and c.COLUMN_NAME = m.COLUMN_NAME
  where NOT REGEXP_LIKE(c.OWNER, 'ANONYMOUS|PUBLIC|SYS|SYSTEM|DBSNMP|MDSYS|CTXSYS|XDB|TSMSYS|ORACLE.*|APEX.*|TEST?*|GG_.*|\\$')
  order by schema_name, c.COLUMN_ID
|]
datasetMysqlSql :: T.Text
datasetMysqlSql = [q|
  select
      concat(c.TABLE_SCHEMA, '.', c.TABLE_NAME) as schema_name
    , NULL as schema_description
    , c.COLUMN_NAME as field_path
    , c.DATA_TYPE as native_data_type
    , c.COLUMN_COMMENT as description
  from information_schema.columns c
  where c.TABLE_SCHEMA not in ('information_schema')
  order by schema_name, c.ORDINAL_POSITION
|]
datasetPostgresqlSql :: T.Text
datasetPostgresqlSql = [q|
  SELECT
      c.table_schema || '.' || c.table_name as schema_name
    , pgtd.description as schema_description
    , c.column_name as field_path
    , c.data_type as native_data_type
    , pgcd.description as description
  FROM INFORMATION_SCHEMA.COLUMNS c
  INNER JOIN pg_catalog.pg_statio_all_tables as st
    ON c.table_schema = st.schemaname AND c.table_name = st.relname
  LEFT JOIN pg_catalog.pg_description pgcd
    ON pgcd.objoid = st.relid AND pgcd.objsubid = c.ordinal_position
  LEFT JOIN pg_catalog.pg_description pgtd
    ON pgtd.objoid = st.relid AND pgtd.objsubid = 0
  WHERE c.table_schema NOT IN ('information_schema', 'pg_catalog')
  ORDER BY schema_name, c.ordinal_position
|]
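-- Build one MetadataChangeEvent for a table from its grouped rows. Every row
-- in `fields` shares the same first two columns (schema_name and
-- schema_description); the remaining three columns describe one field each.
-- The snapshot carries three aspects: Ownership, DatasetProperties, and
-- SchemaMetadata.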
mkMCE :: Int -> T.Text -> [[T.Text]] -> J.Value
mkMCE ts platform fields@((schemaName:schemaDescription:_):_) = [aesonQQ|
  { "proposedSnapshot": {
      "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": {
        "urn": #{urn}
      , "aspects": [
          { "com.linkedin.pegasus2avro.common.Ownership": {
              "owners": [{"owner": "urn:li:corpuser:datahub", "type": "DATAOWNER"}]
            , "lastModified": {"time": #{ts}, "actor": "urn:li:corpuser:datahub"}
            }
          }
        , { "com.linkedin.pegasus2avro.dataset.DatasetProperties": {
              "description": {"string": #{schemaDescription}}
            }
          }
        , { "com.linkedin.pegasus2avro.schema.SchemaMetadata": {
              "schemaName": #{schemaName}
            , "platform": #{platformUrn}
            , "version": 0
            , "created": {"time": #{ts}, "actor": "urn:li:corpuser:datahub"}
            , "lastModified": {"time": #{ts}, "actor": "urn:li:corpuser:datahub"}
            , "hash": ""
            , "platformSchema": {
                "com.linkedin.pegasus2avro.schema.MySqlDDL": {
                  "documentSchema": "{}"
                , "tableSchema": "{}"
                }
              }
            , "fields": #{mkFields fields}
            }
          }
        ]
      }
    }
  }
|]
  where
    -- Full platform urn, e.g. "urn:li:dataPlatform:localhost_postgresql".
    platformUrn :: T.Text = "urn:li:dataPlatform:" <> platform
    urn :: String = printf "urn:li:dataset:(urn:li:dataPlatform:%s,%s,%s)"
                           platform schemaName ("PROD" :: String)
    mkField (_:_:fieldPath:nativeDataType:description:[]) = [aesonQQ|
      { "fieldPath": #{fieldPath}
      , "description": {"string": #{description}}
      , "type": {"type": {"com.linkedin.pegasus2avro.schema.StringType": {}}}
      , "nativeDataType": #{nativeDataType}
      }
    |]
    mkFields = map mkField
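-- Wire everything together: start the JVM (with any CLASSPATH captured at
-- compile time), run the chosen query over JDBC inside inline-java, marshal
-- the rows back to Haskell, group them by schema_name, and print one MCE
-- JSON document per table.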
main :: IO ()
main = do
  hSetEncoding stdout utf8
  let
    jvmArgs = case $(TH.lift =<< TH.runIO (lookupEnv "CLASSPATH")) of
      Nothing -> []
      Just cp -> [ cs ("-Djava.class.path=" ++ cp) ]

    platform :: T.Text = "localhost_postgresql"

    -- dbUrl :: T.Text = "jdbc:mysql://localhost:3306/datahub?useSSL=false"
    -- dbUrl :: T.Text = "jdbc:oracle:thin:@localhost:1521:EDWDB"
    dbUrl :: T.Text = "jdbc:postgresql://localhost:5432/datahub"

    dbUser :: T.Text = "datahub"
    dbPassword :: T.Text = "datahub"

    -- dbDriver :: T.Text = "oracle.jdbc.OracleDriver"
    -- dbDriver :: T.Text = "com.mysql.jdbc.Driver"
    dbDriver :: T.Text = "org.postgresql.Driver"
    -- dbDriver :: T.Text = "com.microsoft.sqlserver.jdbc.SQLServerDriver"

    -- dbSQL :: T.Text = datasetMysqlSql
    -- dbSQL :: T.Text = datasetOracleSql
    dbSQL :: T.Text = datasetPostgresqlSql

  runInBoundThread $ withJVM jvmArgs $ do
    [jDbUrl, jDbUser, jDbPassword, jDbDriver, jDbSQL] <-
      mapM reflect [dbUrl, dbUser, dbPassword, dbDriver, dbSQL]

    -- Run the query on the JVM side and collect each row as a String[5];
    -- NULL columns are normalized to "".
    result <- [java| {
      try {
        Class.forName($jDbDriver) ;
      } catch (ClassNotFoundException e) {
        e.printStackTrace() ;
        System.exit(1) ;
      }

      List<String[]> result = new ArrayList<String[]>() ;
      try (Connection con = DriverManager.getConnection($jDbUrl, $jDbUser, $jDbPassword) ;
           Statement st = con.createStatement()) {
        try (ResultSet rs = st.executeQuery($jDbSQL)) {
          while (rs.next()) {
            String[] row = new String[] {
                Optional.ofNullable(rs.getString("schema_name")).orElse("")
              , Optional.ofNullable(rs.getString("schema_description")).orElse("")
              , Optional.ofNullable(rs.getString("field_path")).orElse("")
              , Optional.ofNullable(rs.getString("native_data_type")).orElse("")
              , Optional.ofNullable(rs.getString("description")).orElse("")
            } ;
            result.add(row) ;
          }
        }
        return result.toArray(new String[0][0]) ;
      } catch (SQLException e) {
        e.printStackTrace() ;
        return null ;
      }
    } |]

    rows :: [[T.Text]] <- reify result
    runConduit $ C.yieldMany rows
      -- .| C.iterM (hPrint stderr)
      .| C.groupBy sameSchemaName
      -- .| C.iterM (hPrint stderr)
      .| C.map (mkMCE 0 platform)
      .| C.mapM_ (J.encode >>> cs >>> putStrLn)
      .| C.sinkNull
    return ()
  where
    sameSchemaName (schemaNameL:_) (schemaNameR:_) = schemaNameL == schemaNameR
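-- A sketch of one output line (abbreviated; "public.pets" is a hypothetical
-- table name, and aeson may order keys differently):
--
--   {"proposedSnapshot":{"com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot":
--     {"urn":"urn:li:dataset:(urn:li:dataPlatform:localhost_postgresql,public.pets,PROD)",
--      "aspects":[ ... ]}}}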