mirror of https://github.com/datahub-project/datahub.git (synced 2025-09-11 02:01:43 +00:00)

Use ProcessBuilder and redirected log file for HDFS Extract (#198)

* Use ProcessBuilder and redirected log file for HDFS Extract
* relax urn validation rule

This commit is contained in:
parent 39cec22e25
commit cd4853d0a5
@@ -24,25 +24,41 @@ import play.Logger;
  */
 public class Urn {
   public String urnString;
-  public String storageType;
+  public String datasetType;
   public String schemaName;
   public String abstractObjectName;
 
-  static final String[] stoList = new String[] {"teradata", "hdfs"};
+  static final String[] stoList = new String[] {"teradata", "hdfs", "hive", "dalids", "oracle", "mysql", "pinot"};
-  static final Set<String> storageTypes = new HashSet<String>(Arrays.asList(stoList));
+  static final Set<String> datasetTypes = new HashSet<String>(Arrays.asList(stoList));
 
+  /**
+   * Urn can contain 3 parts
+   *      (1)           (2)         (3)
+   * dataset_type://cluster:port/parent/name
+   * the 2nd part is only used to identify deployed dataset instance
+   * for dataset definition, we only use part (1) + (3)
+   */
   public Urn(String urnString) {
     this.urnString = urnString;
     String[] splitResult = urnString.split(":///");
-    storageType = splitResult[0].toLowerCase();
+    datasetType = splitResult[0].toLowerCase();
     Logger.debug(urnString);
-    switch (storageType) {
+    switch (datasetType) {
-      /* example: hdfs://data/tracking/PageViewEvent -> 'hdfs', '', 'data/tracking/PageViewEvent' */
+      /* example: hdfs:///data/tracking/PageViewEvent -> 'hdfs', '', 'data/tracking/PageViewEvent' */
       case "hdfs": abstractObjectName = "/" + splitResult[1];
         schemaName = "";
         break;
-      /* example: teradata://dwh/dwh_dim/domain_name -> 'teradata', 'dwh/dwh_dim', 'domain_name' */
-      case "teradata": String[] split2 = splitResult[1].split("/");
+      /* example: teradata:///dwh_dim/dim_table_name -> 'teradata', 'dwh_dim', 'dim_table_name'
+       *          hive:///db_name/table_name -> 'hive', 'db_name', 'table_name'
+       * */
+      case "teradata":
+      case "oracle":
+      case "mysql":
+      case "espresso":
+      case "pinot":
+      case "hive":
+      case "dalids":
+        String[] split2 = splitResult[1].split("/");
         abstractObjectName = split2[split2.length-1];
         StringBuffer sb = new StringBuffer();
         if (split2.length > 1) {
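
The reworked constructor now treats every non-hdfs type the same way: the last path segment becomes the object name and the remaining segments become the schema. The following sketch is not part of the commit; the class name, the sample urns, and the String.join-based schema assembly are illustrative simplifications of the StringBuffer loop that continues past the end of this hunk.

import java.util.Arrays;

// Minimal sketch of how the new parsing behaves, following the examples in the comments above.
public class UrnParseSketch {
  public static void main(String[] args) {
    for (String urn : new String[] {"hdfs:///data/tracking/PageViewEvent", "hive:///db_name/table_name"}) {
      String[] parts = urn.split(":///");
      String datasetType = parts[0].toLowerCase();
      String schemaName;
      String abstractObjectName;
      if ("hdfs".equals(datasetType)) {
        // hdfs urns keep the whole path as the object name and have no schema
        schemaName = "";
        abstractObjectName = "/" + parts[1];
      } else {
        // teradata/hive/dalids/oracle/mysql/pinot: last segment is the object, the rest is the schema
        String[] segments = parts[1].split("/");
        abstractObjectName = segments[segments.length - 1];
        schemaName = String.join("/", Arrays.copyOf(segments, segments.length - 1));
      }
      System.out.println(datasetType + " | " + schemaName + " | " + abstractObjectName);
      // prints: hdfs |         | /data/tracking/PageViewEvent
      //         hive | db_name | table_name
    }
  }
}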
@@ -58,23 +74,31 @@ public class Urn {
       }
     }
 
-  public Urn(String storageType, String schemaName, String abstractObjectName) {
+  public Urn(String datasetType, String schemaName, String abstractObjectName) {
-    this.storageType = storageType.toLowerCase();
+    this.datasetType = datasetType.toLowerCase();
     if (schemaName != null)
       this.schemaName = schemaName.toLowerCase();
     this.abstractObjectName = abstractObjectName;
-    switch (this.storageType) {
+    switch (this.datasetType) {
-      case "teradata" : urnString = "teradata:///" + schemaName + "/" + abstractObjectName;
+      case "teradata":
+      case "oracle":
+      case "mysql":
+      case "espresso":
+      case "pinot":
+      case "hive":
+      case "dalids":
+        urnString = this.datasetType + ":///" + schemaName + "/" + abstractObjectName;
         break;
       default: String trimName = abstractObjectName.startsWith("/") ? abstractObjectName.substring(1) : abstractObjectName;
-        urnString = this.storageType + ":///" + trimName;
+        urnString = this.datasetType + ":///" + trimName;
     }
   }
 
   public static boolean validateUrn(String urnString) {
 
     String[] splitResult = urnString.split(":///");
-    if (storageTypes.contains(splitResult[0]) && splitResult.length > 1)
+    if ((datasetTypes.contains(splitResult[0]) || splitResult[0].matches("\\w+")) &&
+        splitResult.length > 1)
       return true;
     return false;
   }
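
The relaxed validateUrn no longer insists on a whitelisted prefix: any \w+ token followed by ":///" now passes. A standalone sketch of the rule follows; the class name and the sample urns are illustrative, not from the commit.

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

public class UrnValidationSketch {

  static final Set<String> datasetTypes = new HashSet<>(
      Arrays.asList("teradata", "hdfs", "hive", "dalids", "oracle", "mysql", "pinot"));

  // Mirrors the new check: known dataset type OR any \w+ prefix, plus a ":///" separator.
  static boolean validateUrn(String urnString) {
    String[] splitResult = urnString.split(":///");
    return (datasetTypes.contains(splitResult[0]) || splitResult[0].matches("\\w+"))
        && splitResult.length > 1;
  }

  public static void main(String[] args) {
    System.out.println(validateUrn("hdfs:///data/tracking/PageViewEvent")); // true  (whitelisted type)
    System.out.println(validateUrn("espresso:///db/table"));                // true  (not whitelisted, but \w+)
    System.out.println(validateUrn("hdfs://only-two-slashes/path"));        // false (no ":///" separator)
  }
}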
@@ -17,10 +17,11 @@ libraryDependencies ++= Seq(
   "org.mockito" % "mockito-core" % "1.9.5",
   "org.quartz-scheduler" % "quartz" % "2.2.1",
   "org.quartz-scheduler" % "quartz-jobs" % "2.2.1",
-  "org.slf4j" % "slf4j-api" % "1.6.6",
+  "org.slf4j" % "slf4j-api" % "1.7.21",
   "org.jasypt" % "jasypt" % "1.9.2",
   "org.apache.kafka" % "kafka_2.10" % "0.10.0.0",
   "org.apache.kafka" % "kafka-clients" % "0.10.0.0"
-)
+).map(_.exclude("log4j", "log4j"))
+  .map(_.exclude("org.slf4j", "slf4j-log4j12"))
 
 play.Project.playJavaSettings
@@ -15,6 +15,7 @@ configurations {
     dependencySubstitution {
       substitute module('org.slf4j:slf4j-log4j12') with module('ch.qos.logback:logback-classic:1.1.7')
       //prefer 'log4j-over-slf4j' over 'log4j'
+      substitute module('log4j:log4j') with module('org.slf4j:log4j-over-slf4j:1.7.21')
     }
   }
 }
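
Both build changes route the legacy log4j 1.x API through SLF4J: the sbt dependencies exclude log4j and slf4j-log4j12, and the Gradle substitution swaps log4j for log4j-over-slf4j. The effect, sketched below with an illustrative class that is not part of the commit, is that existing org.apache.log4j calls end up in the same SLF4J/logback backend as everything else.

// With log4j-over-slf4j on the classpath instead of log4j, this legacy Log4j 1.x call
// is re-routed through SLF4J to the bound backend (logback-classic in the substitution above),
// so no separate log4j configuration is needed.
import org.apache.log4j.Logger;

public class LegacyLoggingSketch {
  private static final Logger LOG = Logger.getLogger(LegacyLoggingSketch.class);

  public static void main(String[] args) {
    LOG.info("handled by SLF4J/logback via log4j-over-slf4j");
  }
}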
@@ -19,8 +19,8 @@ import com.jcraft.jsch.ChannelSftp;
 import com.jcraft.jsch.JSch;
 import com.jcraft.jsch.JSchException;
 import com.jcraft.jsch.Session;
-
 import com.jcraft.jsch.SftpException;
+
 import java.io.BufferedReader;
 import java.io.File;
 import java.io.IOException;
@@ -32,8 +32,11 @@ import java.io.StringWriter;
 import java.net.URL;
 import java.util.Properties;
 import java.lang.reflect.*;
-import metadata.etl.EtlJob;
+import java.lang.ProcessBuilder;
 
 import org.apache.commons.io.FileUtils;
+
+import metadata.etl.EtlJob;
 import wherehows.common.Constant;
 
+
@@ -95,20 +98,23 @@ public class HdfsMetadataEtl extends EtlJob {
     String numOfThread = prop.getProperty(Constant.HDFS_NUM_OF_THREAD_KEY, String.valueOf(1));
     String hdfsUser = prop.getProperty(Constant.HDFS_REMOTE_USER_KEY);
     String hdfsKeyTab = prop.getProperty(Constant.HDFS_REMOTE_KEYTAB_LOCATION_KEY);
+    String hdfsExtractLogFile = outputSchemaFile + ".log";
 
-    String execCmd =
-        "hadoop jar " + remoteJarFile
-        + " -D " + Constant.HDFS_SCHEMA_REMOTE_PATH_KEY + "=" + outputSchemaFile
-        + " -D " + Constant.HDFS_SAMPLE_REMOTE_PATH_KEY + "=" + outputSampleDataFile
-        + " -D " + Constant.HDFS_CLUSTER_KEY + "=" + cluster
-        + " -D " + Constant.HDFS_WHITE_LIST_KEY + "=" + whiteList
-        + " -D " + Constant.HDFS_NUM_OF_THREAD_KEY + "=" + numOfThread
-        + " -D " + Constant.HDFS_REMOTE_USER_KEY + "=" + hdfsUser
-        + " -D log_file_name=hdfs_schema_fetch"
-        + " -D " + Constant.HDFS_REMOTE_KEYTAB_LOCATION_KEY + "=" + hdfsKeyTab;
-    //logger.info("executue remote command : " + execCmd);
+    String[] hadoopCmd = {"hadoop", "jar", remoteJarFile,
+        "-D" + Constant.HDFS_SCHEMA_REMOTE_PATH_KEY + "=" + outputSchemaFile,
+        "-D" + Constant.HDFS_SAMPLE_REMOTE_PATH_KEY + "=" + outputSampleDataFile,
+        "-D" + Constant.HDFS_CLUSTER_KEY + "=" + cluster,
+        "-D" + Constant.HDFS_WHITE_LIST_KEY + "=" + whiteList,
+        "-D" + Constant.HDFS_NUM_OF_THREAD_KEY + "=" + numOfThread,
+        "-D" + Constant.HDFS_REMOTE_USER_KEY + "=" + hdfsUser,
+        "-D" + Constant.HDFS_REMOTE_KEYTAB_LOCATION_KEY + "=" + hdfsKeyTab,
+        "-Dlog.file.name=hdfs_schema_fetch" };
 
-    Process process = Runtime.getRuntime().exec(execCmd);
+    ProcessBuilder pb = new ProcessBuilder(hadoopCmd);
+    File logFile = new File(hdfsExtractLogFile);
+    pb.redirectErrorStream(true);
+    pb.redirectOutput(ProcessBuilder.Redirect.appendTo(logFile));
+    Process process = pb.start();
     int pid = -1;
     if(process.getClass().getName().equals("java.lang.UNIXProcess")) {
       /* get the PID on unix/linux systems */
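
This is the heart of the change: Runtime.exec with a single command string is replaced by a ProcessBuilder whose merged stdout/stderr is appended to outputSchemaFile + ".log". Below is a self-contained sketch of the same pattern; the echo command and the /tmp log path are placeholders, not the real hadoop invocation or WhereHows configuration.

import java.io.File;
import java.io.IOException;

public class RedirectSketch {
  public static void main(String[] args) throws IOException, InterruptedException {
    String[] cmd = {"echo", "hello from the child process"};   // placeholder command
    File logFile = new File("/tmp/extract-sketch.log");        // placeholder log path

    ProcessBuilder pb = new ProcessBuilder(cmd);
    pb.redirectErrorStream(true);                                   // merge stderr into stdout
    pb.redirectOutput(ProcessBuilder.Redirect.appendTo(logFile));   // append everything to the log file
    Process process = pb.start();

    int exitCode = process.waitFor();  // no need to drain streams; the OS writes them to the file
    System.out.println("child exited with " + exitCode + ", output appended to " + logFile);
  }
}

Because the OS writes the child's output straight to the file, the parent JVM no longer needs the byte-draining loop removed in the next hunk, and the child cannot stall on a full pipe buffer.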
@@ -119,22 +125,7 @@ public class HdfsMetadataEtl extends EtlJob {
       } catch (Throwable e) {
       }
     }
-    logger.info("executue remote command [PID=" + pid + "]: " + execCmd);
+    logger.info("executue command [PID=" + pid + "]: " + hadoopCmd);
 
-    BufferedInputStream stdout = new BufferedInputStream(process.getInputStream());
-    byte[] bytes = new byte[4096];
-    while (stdout.read(bytes) != -1) {}
-
-    String line = null;
-    /* @need to redo this part using ProcessBuilder + redirection
-    InputStream stdout = process.getInputStream();
-    InputStreamReader isr = new InputStreamReader(stdout);
-    BufferedReader br = new BufferedReader(isr);
-
-
-    while ( (line = br.readLine()) != null) {
-      logger.info(line);
-    }*/
-
     // wait until this process finished.
     int execResult = process.waitFor();
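
The logging line above reports a PID that the surrounding code (per the "get the PID on unix/linux systems" comment) pulls out of java.lang.UNIXProcess. The exact field access lies outside this hunk, so the sketch below is an assumption: the common pre-Java 9 reflection trick against the private "pid" field, wrapped in the same swallow-Throwable fallback the diff keeps.

import java.lang.reflect.Field;

public class UnixPidSketch {
  static int getUnixPid(Process process) {
    int pid = -1;
    if (process.getClass().getName().equals("java.lang.UNIXProcess")) {
      try {
        // "pid" is a private implementation detail of java.lang.UNIXProcess, not public API
        Field f = process.getClass().getDeclaredField("pid");
        f.setAccessible(true);
        pid = f.getInt(process);
      } catch (Throwable e) {
        // keep pid = -1, mirroring the empty catch block kept by the diff
      }
    }
    return pid;
  }

  public static void main(String[] args) throws Exception {
    Process p = new ProcessBuilder("sleep", "1").start(); // "sleep" is just a placeholder command
    System.out.println("child pid, or -1 where java.lang.UNIXProcess is not the implementation: " + getUnixPid(p));
    p.waitFor();
  }
}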
@@ -142,7 +133,8 @@ public class HdfsMetadataEtl extends EtlJob {
     // if the process failed, log the error and throw exception
     if (execResult > 0) {
       BufferedReader br = new BufferedReader(new InputStreamReader(process.getErrorStream()));
-      String errString = "Error Details:\n";
+      String errString = "HDFS Metadata Extract Error:\n";
+      String line = "";
       while((line = br.readLine()) != null)
         errString = errString.concat(line).concat("\n");
       logger.error("*** Process failed, status: " + execResult);
@@ -61,7 +61,7 @@ class HdfsLoad:
         update stg_dict_dataset
         set name = substring_index(urn, '/', -2)
         where db_id = {db_id}
-          and name in ('1.0', '2.0', '3.0', '4.0', '0.1', '0.2', '0.3', '0.4', 'dedup', '1-day', '7-day');
+          and name regexp '[0-9]+\\.[0-9]+|dedup|dedupe|[0-9]+-day';
 
         -- update parent name, this depends on the data from source system
         update stg_dict_dataset
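
The staging cleanup now matches version-style and dedup partition names with a regular expression instead of a hard-coded list. MySQL's REGEXP is an unanchored substring match, so the Java sketch below (illustrative only, not part of the commit) uses Matcher.find() to show which names are caught.

import java.util.regex.Pattern;

public class NameFilterSketch {
  private static final Pattern VERSION_OR_DEDUP =
      Pattern.compile("[0-9]+\\.[0-9]+|dedup|dedupe|[0-9]+-day");

  public static void main(String[] args) {
    String[] names = {"1.0", "0.4", "12.34", "dedup", "dedupe", "7-day", "30-day", "PageViewEvent"};
    for (String name : names) {
      System.out.println(name + " -> " + VERSION_OR_DEDUP.matcher(name).find());
    }
    // Every value except "PageViewEvent" matches, so the old hard-coded list
    // ('1.0', '2.0', ..., 'dedup', '1-day', '7-day') is covered and extended.
  }
}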
@@ -109,7 +109,7 @@ class HdfsLoad:
           wh_etl_exec_id={wh_etl_exec_id}, abstract_dataset_urn=s.urn, schema_text=s.schema;
 
         -- insert into final table
-        INSERT INTO dict_dataset
+        INSERT IGNORE INTO dict_dataset
         ( `name`,
           `schema`,
           schema_type,
@@ -159,7 +159,7 @@ class HdfsLoad:
           and sdi.db_id = {db_id};
 
         -- insert into final instance table
-        INSERT INTO dict_dataset_instance
+        INSERT IGNORE INTO dict_dataset_instance
         ( dataset_id,
           db_id,
           deployment_tier,
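
Switching to INSERT IGNORE makes these loads re-runnable: rows that collide with an existing primary or unique key are skipped rather than aborting the whole statement. A hypothetical JDBC sketch of that behavior follows; the table, columns, and connection URL are made up and are not the WhereHows schema or configuration.

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.Statement;

public class InsertIgnoreSketch {
  public static void main(String[] args) throws Exception {
    try (Connection conn = DriverManager.getConnection("jdbc:mysql://localhost/demo", "user", "pass");
         Statement stmt = conn.createStatement()) {
      stmt.execute("CREATE TABLE IF NOT EXISTS demo_dataset (urn VARCHAR(200) PRIMARY KEY, name VARCHAR(100))");
      stmt.executeUpdate("INSERT IGNORE INTO demo_dataset VALUES ('hdfs:///data/tracking/PageViewEvent', 'PageViewEvent')");
      // Re-running the same load: the duplicate key is silently skipped instead of failing the statement,
      // which is why the HDFS load above can be repeated without aborting mid-way.
      int affected = stmt.executeUpdate("INSERT IGNORE INTO demo_dataset VALUES ('hdfs:///data/tracking/PageViewEvent', 'PageViewEvent')");
      System.out.println("rows inserted on second run: " + affected);  // 0
    }
  }
}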
@@ -195,11 +195,13 @@ class HdfsLoad:
           instance_created_time=s.instance_created_time, created_time=s.created_time, wh_etl_exec_id=s.wh_etl_exec_id
         ;
         '''.format(source_file=self.input_file, db_id=self.db_id, wh_etl_exec_id=self.wh_etl_exec_id)
+
     for state in load_cmd.split(";"):
       self.logger.debug(state)
       cursor.execute(state)
       self.conn_mysql.commit()
     cursor.close()
+    self.logger.info("finish loading hdfs metadata db_id={db_id} to dict_dataset".format(db_id=self.db_id))
 
   def load_field(self):
     cursor = self.conn_mysql.cursor()
@@ -359,6 +361,7 @@ class HdfsLoad:
       cursor.execute(state)
       self.conn_mysql.commit()
     cursor.close()
+    self.logger.info("finish loading hdfs metadata db_id={db_id} to dict_field_detail".format(db_id=self.db_id))
 
   def load_sample(self):
     cursor = self.conn_mysql.cursor()
@@ -402,6 +405,7 @@ class HdfsLoad:
       cursor.execute(state)
       self.conn_mysql.commit()
     cursor.close()
+    self.logger.info("finish loading hdfs sample data db_id={db_id} to dict_dataset_sample".format(db_id=self.db_id))
 
 
 if __name__ == "__main__":
@@ -204,7 +204,7 @@ class HiveLoad:
       );
 
 
-      insert into dict_field_detail (
+      insert ignore into dict_field_detail (
        dataset_id, fields_layout_id, sort_id, parent_sort_id, parent_path,
        field_name, namespace, data_type, data_size, is_nullable, default_value,
        modified