Fix the duplicate field loading bug. Fix the subflow handling bug in the Azkaban lineage ETL.

SunZhaonan 2015-12-11 19:46:35 -08:00
parent 5946f355ff
commit 07c46304b5
15 changed files with 292 additions and 92 deletions
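The subflow fix changes how the Azkaban flow_data blob is parsed: a node that itself contains a "nodes" array is treated as a subflow, and its jobs are emitted with a "subflowName:" prefix instead of being read as a single job. The sketch below is not part of the commit; it only mirrors the new parseJsonHelper recursion using the Jackson 2 ObjectMapper (the ETL code uses the same JsonNode/ObjectMapper API, though its Jackson version may differ) to show the nested shape and the resulting job names.

// Illustrative sketch only: walks a nested Azkaban flow_data document the way
// the new parseJsonHelper does. Requires jackson-databind on the classpath.
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;

public class SubflowWalkSketch {
  public static void main(String[] args) throws Exception {
    String flowData = "{\"flowId\":\"top_flow\",\"projectName\":\"demo\",\"nodes\":["
        + "{\"id\":\"jobA\",\"startTime\":1,\"endTime\":2,\"status\":\"SUCCEEDED\"},"
        + "{\"id\":\"sub_flow\",\"nodes\":["
        + "{\"id\":\"jobB\",\"startTime\":3,\"endTime\":4,\"status\":\"SUCCEEDED\"}]}]}";
    JsonNode root = new ObjectMapper().readTree(flowData);
    walk(root.get("nodes"), "");
  }

  static void walk(JsonNode nodes, String prefix) {
    for (JsonNode node : nodes) {
      if (node.has("nodes")) {               // subflow: recurse, prefixing child job names
        walk(node.get("nodes"), prefix + node.get("id").asText() + ":");
      } else {                               // plain job node
        System.out.println(prefix + node.get("id").asText());
      }
    }
  }
}

For this flow the sketch prints jobA and sub_flow:jobB; the previous single-level loop would have emitted sub_flow itself as a job and never seen jobB.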

View File

@@ -34,10 +34,8 @@ import wherehows.common.writers.DatabaseWriter;
  */
 public class LineageDao {
   public static final String FIND_JOBS_BY_DATASET =
-      " select distinct ca.short_connection_string, f.flow_group, f.flow_name, jedl.job_name "
+      " select distinct ca.short_connection_string, jedl.job_name, jedl.flow_path "
       + " from job_execution_data_lineage jedl "
-      + " join flow_execution fe on jedl.app_id = fe.app_id and jedl.flow_exec_id = fe.flow_exec_id "
-      + " join flow f on fe.app_id = f.app_id and fe.flow_id = f.flow_id "
       + " join cfg_application ca on ca.app_id = jedl.app_id "
       + " join cfg_database cd on cd.db_id = jedl.db_id "
       + " where source_target_type = :source_target_type "

View File

@@ -4,16 +4,14 @@ gradle dist;
 rm -r target/universal/backend-service-1.0-SNAPSHOT;
 unzip target/universal/backend-service-1.0-SNAPSHOT.zip -d target/universal/;
-scp target/universal/backend-service-1.0-SNAPSHOT/lib/metadata-etl-1.0.jar dev_svc@lva1-rpt14:~/backendServer/lib/;
-scp target/universal/backend-service-1.0-SNAPSHOT/lib/wherehows-common-1.0.jar dev_svc@lva1-rpt14:~/backendServer/lib/;
-scp target/universal/backend-service-1.0-SNAPSHOT/lib/default.backend-service-1.0-SNAPSHOT.jar dev_svc@lva1-rpt14:~/backendServer/lib/;
-scp target/universal/backend-service-1.0-SNAPSHOT/lib/schemaFetch.jar dev_svc@lva1-rpt14:~/backendServer/lib/;
-scp target/universal/backend-service-1.0-SNAPSHOT/bin/* dev_svc@lva1-rpt14:~/backendServer/bin/;
+scp target/universal/backend-service-1.0-SNAPSHOT/lib/metadata-etl-1.0.jar ${TARGET_SERVER}:~/backendServer/lib/;
+scp target/universal/backend-service-1.0-SNAPSHOT/lib/default.backend-service-1.0-SNAPSHOT.jar ${TARGET_SERVER}:~/backendServer/lib/;
+scp target/universal/backend-service-1.0-SNAPSHOT/lib/schemaFetch.jar ${TARGET_SERVER}:~/backendServer/lib/;
+scp target/universal/backend-service-1.0-SNAPSHOT/lib/wherehows-common-1.0.jar ${TARGET_SERVER}:~/backendServer/lib/;
+scp target/universal/backend-service-1.0-SNAPSHOT/bin/* ${TARGET_SERVER}:~/backendServer/bin/;
+scp target/universal/backend-service-1.0-SNAPSHOT/lib/metadata-etl-1.0.jar cloudera@172.21.98.211:~/wherehows/backend-service-1.0-SNAPSHOT/lib/

View File

@@ -165,21 +165,30 @@ CREATE TABLE `dict_field_detail` (
   `field_label` VARCHAR(100) DEFAULT NULL,
   `data_type` VARCHAR(50) NOT NULL,
   `data_size` INT(10) UNSIGNED DEFAULT NULL,
-  `data_precision` TINYINT(4) DEFAULT NULL,
-  `data_fraction` TINYINT(4) DEFAULT NULL,
+  `data_precision` TINYINT(4) DEFAULT NULL
+  COMMENT 'only in decimal type',
+  `data_fraction` TINYINT(4) DEFAULT NULL
+  COMMENT 'only in decimal type',
   `default_comment_id` INT(11) UNSIGNED DEFAULT NULL
   COMMENT 'a list of comment_id',
   `comment_ids` VARCHAR(500) DEFAULT NULL,
   `is_nullable` CHAR(1) DEFAULT NULL,
-  `is_indexed` CHAR(1) DEFAULT NULL,
-  `is_partitioned` CHAR(1) DEFAULT NULL,
-  `is_distributed` TINYINT(4) DEFAULT NULL,
+  `is_indexed` CHAR(1) DEFAULT NULL
+  COMMENT 'only in RDBMS',
+  `is_partitioned` CHAR(1) DEFAULT NULL
+  COMMENT 'only in RDBMS',
+  `is_distributed` TINYINT(4) DEFAULT NULL
+  COMMENT 'only in RDBMS',
   `default_value` VARCHAR(200) DEFAULT NULL,
   `namespace` VARCHAR(200) DEFAULT NULL,
-  `java_data_type` VARCHAR(50) DEFAULT NULL,
-  `jdbc_data_type` VARCHAR(50) DEFAULT NULL,
-  `pig_data_type` VARCHAR(50) DEFAULT NULL,
-  `hcatalog_data_type` VARCHAR(50) DEFAULT NULL,
+  `java_data_type` VARCHAR(50) DEFAULT NULL
+  COMMENT 'correspond type in java',
+  `jdbc_data_type` VARCHAR(50) DEFAULT NULL
+  COMMENT 'correspond type in jdbc',
+  `pig_data_type` VARCHAR(50) DEFAULT NULL
+  COMMENT 'correspond type in pig',
+  `hcatalog_data_type` VARCHAR(50) DEFAULT NULL
+  COMMENT 'correspond type in hcatalog',
   `modified` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
   PRIMARY KEY (`field_id`),
   KEY `idx_dict_field__datasetid_fieldname` (`dataset_id`, `field_name`) USING BTREE,

View File

@@ -35,7 +35,7 @@ import java.util.Properties;
  * Created by zsun on 7/29/15.
  */
 public abstract class EtlJob {
-  public final static String CONFIG_FILE = "application.properties";
+  private final static String CONFIG_FILE = "application.properties";
   public PythonInterpreter interpreter;
   public PySystemState sys;
   public Properties prop;
@@ -61,8 +61,8 @@ public abstract class EtlJob {
    */
   @Deprecated
   public EtlJob(Integer appId, Integer dbId, long whExecId, String configFile) {
-    configFromFile(appId, dbId, whExecId, configFile);
-    addJythonToPath();
+    PySystemState sys = configFromFile(appId, dbId, whExecId, configFile);
+    addJythonToPath(sys);
     interpreter = new PythonInterpreter(null, sys);
   }
@@ -75,11 +75,11 @@ public abstract class EtlJob {
    */
   public EtlJob(Integer appId, Integer dbId, Long whExecId, Properties properties) {
     configFromProperties(appId, dbId, whExecId, properties);
-    addJythonToPath();
+    addJythonToPath(sys);
     interpreter = new PythonInterpreter(null, sys);
   }

-  private void addJythonToPath() {
+  private void addJythonToPath(PySystemState pySystemState) {
     URL url = classLoader.getResource("jython");
     if (url != null) {
       File file = new File(url.getFile());
@@ -87,12 +87,12 @@ public abstract class EtlJob {
       if (path.startsWith("file:")) {
         path = path.substring(5);
       }
-      sys.path.append(new PyString(path.replace("!", "")));
+      pySystemState.path.append(new PyString(path.replace("!", "")));
     }
   }

   @Deprecated
-  private void configFromFile(Integer appId, Integer dbId, long whExecId, String configFile) {
+  private PySystemState configFromFile(Integer appId, Integer dbId, long whExecId, String configFile) {
     prop = new Properties();

     if (appId != null) {
@@ -117,8 +117,9 @@ public abstract class EtlJob {
       config.put(new PyString(key), new PyString(value));
     }

-    sys = new PySystemState();
+    PySystemState sys = new PySystemState();
     sys.argv.append(config);
+    return sys;
   }

   /**

View File

@@ -52,7 +52,7 @@ public class AzJobChecker {
   /**
    * Default 10 minutes
-   * @return
+   * @return A list of recent finished AzkabanJobExecRecord
    * @throws IOException
    * @throws SQLException
    */
@@ -72,23 +72,32 @@ public class AzJobChecker {
   public List<AzkabanJobExecRecord> getRecentFinishedJobFromFlow(int timeFrameMinutes)
     throws IOException, SQLException {
     long currentTimeStamp = System.currentTimeMillis();
-    long oneHourAgo = currentTimeStamp - 1000 * 60 * timeFrameMinutes;
-    return getRecentFinishedJobFromFlow(oneHourAgo);
+    long beginTimeStamp = currentTimeStamp - 1000 * 60 * timeFrameMinutes;
+    return getRecentFinishedJobFromFlow(beginTimeStamp, currentTimeStamp);
+  }
+
+  public List<AzkabanJobExecRecord> getRecentFinishedJobFromFlow(int timeFrameMinutes, long endTimeStamp)
+    throws IOException, SQLException {
+    long beginTimeStamp = endTimeStamp - 60 * timeFrameMinutes;
+    return getRecentFinishedJobFromFlow(beginTimeStamp * 1000, endTimeStamp * 1000); // convert to milli seconds
   }

   /**
    * Read the blob from "flow_data", do a topological sort on the nodes. Give them the sort id.
-   * @param timestamp the beginning timestamp
+   * @param startTimeStamp the begin timestamp
+   * @param endTimeStamp the end timestamp
    * @return
    */
-  public List<AzkabanJobExecRecord> getRecentFinishedJobFromFlow(long timestamp)
+  public List<AzkabanJobExecRecord> getRecentFinishedJobFromFlow(long startTimeStamp, long endTimeStamp)
     throws SQLException, IOException {
-    logger.info("Get the jobs from time : {}", timestamp);
+    logger.info("Get the jobs from time : {} to time : {}", startTimeStamp, endTimeStamp);
     List<AzkabanJobExecRecord> results = new ArrayList<>();
     Statement stmt = conn.createStatement();
     final String cmd =
-      "select exec_id, flow_id, status, submit_user, flow_data from execution_flows where end_time > " + timestamp;
+      "select exec_id, flow_id, status, submit_user, flow_data from execution_flows where end_time > " + startTimeStamp
+        + " and end_time < " + endTimeStamp ;
+    logger.info("Get recent flow sql : " + cmd);
     final ResultSet rs = stmt.executeQuery(cmd); // this sql take 3 second to execute
     while (rs.next()) {
@@ -101,27 +110,47 @@ public class AzJobChecker {
     return results;
   }

+  /**
+   * Parse the json of flow_data field from execution_flows. Use recursion to handle the nested case.
+   * @param flowJson
+   * @param flowExecId
+   * @return
+   * @throws IOException
+   */
   public List<AzkabanJobExecRecord> parseJson(String flowJson, long flowExecId)
     throws IOException {
-    List<AzkabanJobExecRecord> results = new ArrayList<>();

     ObjectMapper mapper = new ObjectMapper();
     JsonNode wholeFlow = mapper.readTree(flowJson);
     JsonNode allJobs = wholeFlow.get("nodes");
     String flowPath = wholeFlow.get("projectName").asText() + ":" + wholeFlow.get("flowId").asText();
-    for (JsonNode oneJob : allJobs) {
-      String jobName = oneJob.get("id").asText();
-      long startTime = oneJob.get("startTime").asLong();
-      long endTime = oneJob.get("endTime").asLong();
-      String status = oneJob.get("status").asText();
-      AzkabanJobExecRecord azkabanJobExecRecord =
-        new AzkabanJobExecRecord(appId, jobName, flowExecId, (int) (startTime / 1000), (int) (endTime / 1000), status,
-          flowPath);
-      results.add(azkabanJobExecRecord);
-    }
+    List<AzkabanJobExecRecord> results = parseJsonHelper(allJobs, flowExecId, "", flowPath);
     AzkabanJobExecUtil.sortAndSet(results);
     return results;
   }

+  private List<AzkabanJobExecRecord> parseJsonHelper(JsonNode allJobs, long flowExecId, String jobPrefix, String flowPath) {
+    List<AzkabanJobExecRecord> results = new ArrayList<>();
+    for (JsonNode oneJob : allJobs) {
+      if (oneJob.has("nodes")) { // is a subflow
+        String subFlowName = oneJob.get("id").asText();
+        String newJobPrefix = jobPrefix.length() > 0 ? jobPrefix + subFlowName + ":" : subFlowName + ":";
+        results.addAll(parseJsonHelper(oneJob.get("nodes"), flowExecId, newJobPrefix, flowPath));
+      } else {
+        String jobName = oneJob.get("id").asText();
+        long startTime = oneJob.get("startTime").asLong();
+        long endTime = oneJob.get("endTime").asLong();
+        String status = oneJob.get("status").asText();
+        jobName = jobPrefix.length() > 0 ? jobPrefix + jobName : jobName;
+        AzkabanJobExecRecord azkabanJobExecRecord =
+          new AzkabanJobExecRecord(appId, jobName, flowExecId, (int) (startTime / 1000), (int) (endTime / 1000),
+            status, flowPath);
+        results.add(azkabanJobExecRecord);
+      }
+    }
+    return results;
+  }
+
   public void close()
     throws SQLException {
     conn.close();
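One unit detail in the new two-argument overload above: timeFrameMinutes is a window in minutes, endTimeStamp is treated as epoch seconds, and both bounds are multiplied by 1000 before being compared against execution_flows.end_time, which the rest of the class handles in milliseconds. The following standalone sketch (not project code) just reproduces that arithmetic with the same value the new test uses, to show the window the query ends up with.

// Sketch of the lookback window computed by getRecentFinishedJobFromFlow(int, long).
public class LookbackWindowSketch {
  public static void main(String[] args) {
    int timeFrameMinutes = 2;
    long endTimeStampSeconds = 1448916456L;           // epoch seconds, as in the new test
    long beginTimeStampSeconds = endTimeStampSeconds - 60L * timeFrameMinutes;
    long beginMillis = beginTimeStampSeconds * 1000;  // lower bound on end_time
    long endMillis = endTimeStampSeconds * 1000;      // upper bound on end_time
    System.out.println("end_time > " + beginMillis + " and end_time < " + endMillis);
  }
}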

View File

@@ -60,17 +60,22 @@ public class AzLineageExtractorMaster {
     run(10);
   }

+  public void run(int timeFrame)
+    throws Exception {
+    run(timeFrame, System.currentTimeMillis());
+  }
+
   /**
    * Entry point.
    * All recent finished azkaban jobs' lineage. Will write to database stagging table
    * @param timeFrame in minutes
    * @throws Exception
    */
-  public void run(int timeFrame)
+  public void run(int timeFrame, long endTimeStamp)
     throws Exception {
     // get recent finished job
     AzJobChecker azJobChecker = new AzJobChecker(prop);
-    List<AzkabanJobExecRecord> jobExecList = azJobChecker.getRecentFinishedJobFromFlow(timeFrame);
+    List<AzkabanJobExecRecord> jobExecList = azJobChecker.getRecentFinishedJobFromFlow(timeFrame, endTimeStamp);
     azJobChecker.close();
     logger.info("Total number of azkaban jobs : {}", jobExecList.size());

View File

@@ -27,7 +27,8 @@ import java.util.Properties;
  */
 public class AzLineageMetadataEtl extends EtlJob {

-  public int timeFrame = -1;
+  public Integer timeFrame = null;
+  public Long endTimeStamp = null;
   Connection conn;

   /**
@@ -53,6 +54,8 @@ public class AzLineageMetadataEtl extends EtlJob {
   public AzLineageMetadataEtl(int appId, long whExecId, Properties properties) {
     super(appId, null, whExecId, properties);
     this.timeFrame = Integer.valueOf(this.prop.getProperty(Constant.AZ_LINEAGE_ETL_LOOKBACK_MINS_KEY));
+    if (this.prop.contains(Constant.AZ_LINEAGE_ETL_END_TIMESTAMP_KEY))
+      this.endTimeStamp = Long.valueOf(this.prop.getProperty(Constant.AZ_LINEAGE_ETL_END_TIMESTAMP_KEY));
     try {
       setUp();
     } catch (SQLException e) {
@@ -78,7 +81,11 @@ public class AzLineageMetadataEtl extends EtlJob {
     conn.createStatement().execute(emptyStaggingTable);
     AzLineageExtractorMaster azLineageExtractorMaster = new AzLineageExtractorMaster(prop);
     // get lineage
-    if (timeFrame > 0) {
+    if (timeFrame != null && endTimeStamp != null && endTimeStamp != 0) {
+      azLineageExtractorMaster.run(timeFrame, endTimeStamp);
+    }
+    else if (timeFrame != null) {
       azLineageExtractorMaster.run(timeFrame);
     } else {
       azLineageExtractorMaster.run(10);
@@ -110,8 +117,8 @@ public class AzLineageMetadataEtl extends EtlJob {
     conn.createStatement().execute(insertIntoFinalTable);
     logger.info("Azkaban lineage metadata ETL completed");
-    if (prop.getProperty(Constant.APP_ID_KEY).equals("32")) {
-      logger.info("TEMPORARY load war's data into cmdb database");
+    if (prop.getProperty(Constant.APP_ID_KEY).equals("32") || prop.getProperty(Constant.APP_ID_KEY).equals("31")) {
+      logger.info("TEMPORARY load war & nertz's data into cmdb database");
       loadIntoOthers();
     }
   }

View File

@@ -61,15 +61,15 @@ public class HadoopNameNodeExtractor {
     String WH_HOME = System.getenv("WH_HOME");
     String USER_HOME = System.getenv("HOME") + "/.kerberos";
     String ETC = "/etc";
-    String TMP = "/tmp" + "/.kerberos";
-    String[] allPositions = new String[]{CURRENT_DIR, WH_HOME, USER_HOME, TMP, ETC};
+    String TMP = "/var/tmp" + "/.kerberos";
+    String[] allPositions = new String[]{CURRENT_DIR, WH_HOME, USER_HOME, TMP};

     for (String possition : allPositions) {
       String gssFileName = possition + "/gss-jaas.conf";
       File gssFile = new File(gssFileName);
       if (gssFile.exists()) {
-        logger.debug("find gss-jaas.conf file in : {}", gssFile.getAbsolutePath());
+        logger.info("find gss-jaas.conf file in : {}", gssFile.getAbsolutePath());
         System.setProperty("java.security.auth.login.config", gssFile.getAbsolutePath());
         break;
       } else {
@@ -80,7 +80,7 @@ public class HadoopNameNodeExtractor {
       String krb5FileName = possition + "/krb5.conf";
       File krb5File = new File(krb5FileName);
       if (krb5File.exists()) {
-        logger.debug("find krb5.conf file in : {}", krb5File.getAbsolutePath());
+        logger.info("find krb5.conf file in : {}", krb5File.getAbsolutePath());
         System.setProperty("java.security.krb5.conf", krb5File.getAbsolutePath());
         break;
       } else {

View File

@@ -168,15 +168,73 @@ class HdfsLoad:
         analyze table field_comments;

+        -- delete old record if it does not exist in this load batch anymore (but have the dataset id)
+        create temporary table if not exists t_deleted_fields (primary key (field_id))
+        select x.field_id
+        from stg_dict_field_detail s
+        join dict_dataset i
+          on s.urn = i.urn
+          and s.db_id = {db_id}
+        right join dict_field_detail x
+          on i.id = x.dataset_id
+          and s.field_name = x.field_name
+          and s.parent_path = x.parent_path
+        where s.field_name is null
+          and x.dataset_id in (
+            select d.id dataset_id
+            from stg_dict_field_detail k join dict_dataset d
+              on k.urn = d.urn
+              and k.db_id = {db_id}
+          )
+        ; -- run time : ~2min
+
+        delete from dict_field_detail where field_id in (select field_id from t_deleted_fields);
+
+        -- update the old record if some thing changed
+        update dict_field_detail t join
+        (
+          select x.field_id, s.*
+          from stg_dict_field_detail s join dict_dataset d
+            on s.urn = d.urn
+          join dict_field_detail x
+            on s.field_name = x.field_name
+            and coalesce(s.parent_path, '*') = coalesce(x.parent_path, '*')
+            and d.id = x.dataset_id
+          where s.db_id = {db_id}
+            and (x.sort_id <> s.sort_id
+              or x.parent_sort_id <> s.parent_sort_id
+              or x.data_type <> s.data_type
+              or x.data_size <> s.data_size
+              or x.data_precision <> s.data_precision
+              or x.is_nullable <> s.is_nullable
+              or x.is_partitioned <> s.is_partitioned
+              or x.is_distributed <> s.is_distributed
+              or x.default_value <> s.default_value
+              or x.namespace <> s.namespace
+            )
+        ) p
+        on t.field_id = p.field_id
+        set t.sort_id = p.sort_id,
+            t.data_type = p.data_type,
+            t.data_size = p.data_size,
+            t.data_precision = p.data_precision,
+            t.is_nullable = p.is_nullable,
+            t.is_partitioned = p.is_partitioned,
+            t.is_distributed = p.is_distributed,
+            t.default_value = p.default_value,
+            t.namespace = p.namespace,
+            t.last_modified = now()
+        ;
+
         insert into dict_field_detail (
           dataset_id, fields_layout_id, sort_id, parent_sort_id, parent_path,
           field_name, namespace, data_type, data_size, is_nullable, default_value,
-          default_comment_id
+          default_comment_id, modified
         )
         select
           d.id, 0, sf.sort_id, sf.parent_sort_id, sf.parent_path,
           sf.field_name, sf.namespace, sf.data_type, sf.data_size, sf.is_nullable, sf.default_value,
-          coalesce(fc.id, t.default_comment_id) fc_id
+          coalesce(fc.id, t.default_comment_id) fc_id, now()
         from stg_dict_field_detail sf join dict_dataset d
           on sf.urn = d.urn
         left join field_comments fc

View File

@@ -129,29 +129,62 @@ class TeradataLoad:
           and (char_length(trim(description)) = 0
            or description in ('null', 'N/A', 'nothing', 'empty', 'none'));

-        UPDATE dict_field_detail t, (
-          select
-            f.field_id,
-            s.sort_id,
-            s.data_type,
-            s.data_size,
-            s.data_precision,
-            s.data_scale,
-            s.is_nullable
-          from stg_dict_field_detail s join dict_dataset d
-            on s.urn = d.urn
-          join dict_field_detail f
-            on d.id = f.dataset_id
-            and s.field_name = f.field_name
-          where s.db_id = {db_id}
-        ) x
-        set t.sort_id = x.sort_id
-          , t.data_type = x.data_type
-          , t.data_size = x.data_size
-          , t.data_precision = x.data_precision
-          , t.data_fraction = x.data_scale
-          , t.is_nullable = x.is_nullable
-        where t.field_id = x.field_id
+        -- delete old record if it does not exist in this load batch anymore (but have the dataset id)
+        create temporary table if not exists t_deleted_fields (primary key (field_id))
+        select x.field_id
+        from stg_dict_field_detail s
+        join dict_dataset i
+          on s.urn = i.urn
+          and s.db_id = {db_id}
+        right join dict_field_detail x
+          on i.id = x.dataset_id
+          and s.field_name = x.field_name
+          and s.parent_path = x.parent_path
+        where s.field_name is null
+          and x.dataset_id in (
+            select d.id dataset_id
+            from stg_dict_field_detail k join dict_dataset d
+              on k.urn = d.urn
+              and k.db_id = {db_id}
+          )
+        ; -- run time : ~2min
+
+        delete from dict_field_detail where field_id in (select field_id from t_deleted_fields);
+
+        -- update the old record if some thing changed
+        update dict_field_detail t join
+        (
+          select x.field_id, s.*
+          from stg_dict_field_detail s join dict_dataset d
+            on s.urn = d.urn
+          join dict_field_detail x
+            on s.field_name = x.field_name
+            and coalesce(s.parent_path, '*') = coalesce(x.parent_path, '*')
+            and d.id = x.dataset_id
+          where s.db_id = {db_id}
+            and (x.sort_id <> s.sort_id
+              or x.parent_sort_id <> s.parent_sort_id
+              or x.data_type <> s.data_type
+              or x.data_size <> s.data_size
+              or x.data_precision <> s.data_precision
+              or x.is_nullable <> s.is_nullable
+              or x.is_partitioned <> s.is_partitioned
+              or x.is_distributed <> s.is_distributed
+              or x.default_value <> s.default_value
+              or x.namespace <> s.namespace
+            )
+        ) p
+        on t.field_id = p.field_id
+        set t.sort_id = p.sort_id,
+            t.data_type = p.data_type,
+            t.data_size = p.data_size,
+            t.data_precision = p.data_precision,
+            t.is_nullable = p.is_nullable,
+            t.is_partitioned = p.is_partitioned,
+            t.is_distributed = p.is_distributed,
+            t.default_value = p.default_value,
+            t.namespace = p.namespace,
+            t.last_modified = now()
         ;

         show warnings limit 10;

View File

@@ -13,7 +13,12 @@
  */
 package metadata.etl.lineage;

+import java.io.File;
 import java.io.IOException;
+import java.net.URISyntaxException;
+import java.net.URL;
+import java.nio.file.Files;
+import java.nio.file.Paths;
 import java.util.Properties;
 import org.testng.Assert;
 import org.testng.annotations.BeforeTest;
@@ -108,6 +113,8 @@ public class AzJobCheckerTest {
       + " \"zsun\",\n" + " \"data_svc\"],\n" + " \"startTime\": 1442224810815,\n"
       + " \"status\": \"SUCCEEDED\",\n" + " \"submitTime\": 1442224810778,\n" + " \"submitUser\": \"zsun\",\n"
       + " \"type\": null,\n" + " \"updateTime\": 1442233069065,\n" + " \"version\": 301}";

   AzJobChecker ajc;
   Properties prop;
@@ -130,6 +137,18 @@ public class AzJobCheckerTest {
     Assert.assertNotNull(results);
   }

+  @Test(groups = {"needConfig"})
+  public void getRecentFinishedJobFromFlowTest2()
+    throws SQLException, IOException {
+    List<AzkabanJobExecRecord> results = ajc.getRecentFinishedJobFromFlow(2, 1448916456L);
+    for (AzkabanJobExecRecord a : results) {
+      System.out.print(a.getFlowExecId() + "\t");
+      System.out.print(a.getJobName() + "\t");
+      System.out.println(a.getJobExecId());
+    }
+    Assert.assertNotNull(results);
+  }
+
   @Test(groups = {"needConfig"})
   public void parseJsonTest()
     throws IOException {
@@ -145,4 +164,27 @@ public class AzJobCheckerTest {
       Assert.assertEquals((long) aje.getJobExecId(), 11111 * 1000 + i);
     }
   }
+
+  @Test(groups = {"needConfig"})
+  public void parseNestedJsonTest()
+    throws IOException, URISyntaxException {
+    URL url = Thread.currentThread().getContextClassLoader().getResource("nestedJson");
+    byte[] encoded = Files.readAllBytes(Paths.get(url.getPath()));
+    String nestedJson = new String(encoded, "UTF-8");
+    List<AzkabanJobExecRecord> result = ajc.parseJson(nestedJson, 11111);
+    for (int i = 0; i < result.size(); i++) {
+      AzkabanJobExecRecord aje = result.get(i);
+      System.out.println(aje.getJobExecId());
+      System.out.println(aje.getJobName());
+      System.out.println(aje.getStartTime());
+      System.out.println(aje.getEndTime());
+      System.out.println(aje.getFlowPath());
+      System.out.println();
+      Assert.assertEquals((long) aje.getJobExecId(), 11111 * 1000 + i);
+    }
+  }
 }

View File

@@ -44,7 +44,7 @@ public class AzLineageExtractorTest {
     String wherehowsPassWord = prop.getProperty(Constant.WH_DB_PASSWORD_KEY);
     connUrl = wherehowsUrl + "?" + "user=" + wherehowsUserName + "&password=" + wherehowsPassWord;
     this.conn = DriverManager.getConnection(connUrl);
-    AzLogParser.initialize(conn, Integer.valueOf(prop.getProperty(Constant.AZ_DEFAULT_HADOOP_DATABASE_ID_KEY)));
+    AzLogParser.initialize(conn);
     PathAnalyzer.initialize(conn);
   }

View File

@@ -45,7 +45,7 @@ public class AzLogParserTest {
     String wherehowsPassWord = prop.getProperty(Constant.WH_DB_PASSWORD_KEY);
     Connection conn =
       DriverManager.getConnection(wherehowsHost + "?" + "user=" + wherehowsUserName + "&password=" + wherehowsPassWord);
-    AzLogParser.initialize(conn, -1);
+    AzLogParser.initialize(conn);
   }

   @Test(groups = {"needConfig"})
@@ -81,7 +81,7 @@ public class AzLogParserTest {
     String logSample = "asfdasdfsadf Moving from staged path[asdf] to final resting place[/tm/b/c] sdaf dsfasdfasdf";
     AzkabanJobExecRecord sampleExecution = new AzkabanJobExecRecord(-1, "someJobName", (long) 0, 0, 0, "S", "path");
     sampleExecution.setJobExecId((long) 11111);
-    List<LineageRecord> result = AzLogParser.getLineageFromLog(logSample, sampleExecution);
+    List<LineageRecord> result = AzLogParser.getLineageFromLog(logSample, sampleExecution, -1);
     System.out.println(result.get(0).toDatabaseValue());

     Assert.assertEquals(result.get(0).toDatabaseValue(),
@@ -105,7 +105,7 @@ public class AzLogParserTest {
       + "17-11-2015 01:32:27 PST endorsements_push-lva-endorsements-member-restrictions INFO - INFO tcp://lva1-voldemort-read-only-2-vip.prod.linkedin.com:10103 : Invoking fetch for Node lva1-app0610.prod.linkedin.com [id 0] for webhdfs://lva1-warnn01.grid.linkedin.com:50070/jobs/endorse/endorsements/master/tmp/endorsements-member-restrictions.store/lva1-voldemort-read-only-2-vip.prod.linkedin.com/node-0\n"
       + "17-11-2015 01:32:27 PST endorsements_push-lva-endorsements-member-restrictions INFO - INFO tcp://lva1-voldemort-rea";
     AzkabanJobExecRecord sampleExecution = new AzkabanJobExecRecord(-1, "someJobName", (long) 0, 0, 0, "S", "path");
-    List<LineageRecord> result = AzLogParser.getLineageFromLog(logSample, sampleExecution);
+    List<LineageRecord> result = AzLogParser.getLineageFromLog(logSample, sampleExecution, -1);
     System.out.println(result.get(0).toDatabaseValue());
     Assert.assertEquals(result.get(0).getFullObjectName(),
       "tcp://lva1-voldemort-read-only-2-vip.prod.linkedin.com:10103/endorsements-member-restrictions");

View File

@@ -38,6 +38,7 @@ public class Constant {
   public static final String AZ_DEFAULT_HADOOP_DATABASE_ID_KEY = "az.default.hadoop.database.id";
   public static final String AZ_LINEAGE_ETL_LOOKBACK_MINS_KEY = "az.lineage_etl.lookback_period.in.minutes";
   public static final String LINEAGE_ACTOR_TIMEOUT_KEY = "az.lineage.actor.timeout";
+  public static final String AZ_LINEAGE_ETL_END_TIMESTAMP_KEY = "az.lineage_etl.end_timestamp";
   public static final String AZ_SERVICE_URL_KEY = "az.server.url";
   public static final String AZ_SERVICE_USERNAME_KEY = "az.server.username";
@@ -80,4 +81,15 @@ public class Constant {
   // ui
   public static final String DATASET_TREE_FILE_NAME_KEY = "wherehows.ui.tree.dataset.file";
   public static final String FLOW_TREE_FILE_NAME_KEY = "wherehows.ui.tree.flow.file";
+
+  // hive
+  public static final String HIVE_METASTORE_JDBC_DRIVER = "hive.metastore.jdbc.driver";
+  public static final String HIVE_METASTORE_JDBC_URL = "hive.metastore.jdbc.url";
+  public static final String HIVE_METASTORE_USERNAME = "hive.metstore.username";
+  public static final String HIVE_METASTORE_PASSWORD = "hive.metastore.password";
+
+  public static final String HIVE_SCHEMA_JSON_FILE_KEY = "hive.schema_json_file";
+  // public static final String HIVE_SAMPLE_CSV_FILE_KEY = "hive.sample_csv";
+  public static final String HIVE_SCHEMA_CSV_FILE_KEY = "hive.schema_csv_file";
+  public static final String HIVE_FIELD_METADATA_KEY = "hive.field_metadata";
 }
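The new AZ_LINEAGE_ETL_END_TIMESTAMP_KEY pairs with the existing lookback key: when the AzLineageMetadataEtl constructor picks it up (non-null and non-zero), the ETL calls run(timeFrame, endTimeStamp) instead of run(timeFrame). A hypothetical sketch of such a job configuration, built as a Java Properties object; the key strings come from Constant.java above, while the values are illustrative only.

// Hypothetical Azkaban lineage ETL configuration (values are made up).
import java.util.Properties;

public class AzLineageEtlConfigSketch {
  public static void main(String[] args) {
    Properties prop = new Properties();
    // look back 90 minutes of finished Azkaban flows (existing key)
    prop.setProperty("az.lineage_etl.lookback_period.in.minutes", "90");
    // optional upper bound in epoch seconds; enables the run(timeFrame, endTimeStamp) path
    prop.setProperty("az.lineage_etl.end_timestamp", "1448916456");
    prop.list(System.out);
  }
}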

View File

@@ -29,8 +29,11 @@ public class DatasetFieldRecord implements Record {
   Integer parentSortId;
   String parentPath;
   String fieldName;
+  String fieldLabel;
   String dataType;
   String isNullable;
+  String isIndexed;
+  String isPartitioned;
   String defaultValue;
   Integer dataSize;
   String namespace;
@@ -39,18 +42,21 @@ public class DatasetFieldRecord implements Record {
   List<Object> allFields;
   char SEPR = 0x001A;

-  public DatasetFieldRecord(String urn, Integer sortId, Integer parentSortId, String parentPath, String fieldName,
-    String dataType, String isNullable, String defaultValue, Integer dataSize, String namespace, String description) {
+  public DatasetFieldRecord(String urn, Integer sortId, Integer parentSortId, String parentPath, String fieldName, String fieldLabel,
+    String dataType, String isNullable, String isIndexed, String isPartitioned, String defaultValue, Integer dataSize, String namespace, String description) {
     this.urn = urn;
     this.sortId = sortId;
     this.parentSortId = parentSortId;
     this.parentPath = parentPath;
     this.fieldName = fieldName;
+    this.fieldLabel = fieldLabel;
     this.dataType = dataType;
-    this.isNullable = isNullable;
-    this.defaultValue = defaultValue;
     this.dataSize = dataSize;
+    this.isNullable = isNullable;
+    this.isIndexed = isIndexed;
+    this.isPartitioned = isPartitioned;
+    this.defaultValue = defaultValue;
     this.namespace = namespace;
     this.description = description;
@@ -61,9 +67,11 @@ public class DatasetFieldRecord implements Record {
     this.allFields.add(parentPath);
     this.allFields.add(fieldName);
     this.allFields.add(dataType);
-    this.allFields.add(isNullable);
-    this.allFields.add(defaultValue);
     this.allFields.add(dataSize);
+    this.allFields.add(isNullable);
+    this.allFields.add(isIndexed);
+    this.allFields.add(isPartitioned);
+    this.allFields.add(defaultValue);
     this.allFields.add(namespace);
     this.allFields.add(description);
   }