Merge pull request #10 from czbernard/master

fix ldap org hierarchy bug and add sub flow reference id
This commit is contained in:
SunZhaonan 2015-12-18 12:00:16 -08:00
commit 059dfb1e1a
13 changed files with 150 additions and 6 deletions

View File

@ -52,6 +52,8 @@ subprojects {
"akka" : "com.typesafe.akka:akka-actor_2.10:2.2.0", "akka" : "com.typesafe.akka:akka-actor_2.10:2.2.0",
"jgit" : "org.eclipse.jgit:org.eclipse.jgit:4.1.1.201511131810-r", "jgit" : "org.eclipse.jgit:org.eclipse.jgit:4.1.1.201511131810-r",
"jsoup" : "org.jsoup:jsoup:1.8.3", "jsoup" : "org.jsoup:jsoup:1.8.3",
"commons_io" : "commons-io:commons-io:2.4",
"jackson_databind" : "com.fasterxml.jackson.core:jackson-databind:2.6.1", "jackson_databind" : "com.fasterxml.jackson.core:jackson-databind:2.6.1",
"jackson_core" : "com.fasterxml.jackson.core:jackson-core:2.6.1", "jackson_core" : "com.fasterxml.jackson.core:jackson-core:2.6.1",

View File

@ -94,6 +94,7 @@ CREATE TABLE flow_job (
job_path VARCHAR(1024) COMMENT 'job path from top level', job_path VARCHAR(1024) COMMENT 'job path from top level',
job_type_id SMALLINT COMMENT 'type id of the job', job_type_id SMALLINT COMMENT 'type id of the job',
job_type VARCHAR(63) COMMENT 'type of the job', job_type VARCHAR(63) COMMENT 'type of the job',
ref_flow_id INT UNSIGNED DEFAULT NULL COMMENT 'the reference flow id of the job if the job is a subflow',
pre_jobs VARCHAR(4096) COMMENT 'comma separated job ids that run before this job', pre_jobs VARCHAR(4096) COMMENT 'comma separated job ids that run before this job',
post_jobs VARCHAR(4096) COMMENT 'comma separated job ids that run after this job', post_jobs VARCHAR(4096) COMMENT 'comma separated job ids that run after this job',
is_current CHAR(1) COMMENT 'determine if it is a current job', is_current CHAR(1) COMMENT 'determine if it is a current job',
@ -104,6 +105,7 @@ CREATE TABLE flow_job (
wh_etl_exec_id BIGINT COMMENT 'wherehows etl execution id that create this record', wh_etl_exec_id BIGINT COMMENT 'wherehows etl execution id that create this record',
PRIMARY KEY (app_id, job_id, dag_version), PRIMARY KEY (app_id, job_id, dag_version),
INDEX flow_id_idx (app_id, flow_id), INDEX flow_id_idx (app_id, flow_id),
INDEX ref_flow_id_idx (app_id, ref_flow_id),
INDEX job_path_idx (app_id, job_path(255)) INDEX job_path_idx (app_id, job_path(255))
) )
ENGINE = InnoDB ENGINE = InnoDB
@ -122,6 +124,8 @@ CREATE TABLE stg_flow_job (
job_path VARCHAR(1024) COMMENT 'job path from top level', job_path VARCHAR(1024) COMMENT 'job path from top level',
job_type_id SMALLINT COMMENT 'type id of the job', job_type_id SMALLINT COMMENT 'type id of the job',
job_type VARCHAR(63) COMMENT 'type of the job', job_type VARCHAR(63) COMMENT 'type of the job',
ref_flow_id INT UNSIGNED DEFAULT NULL COMMENT 'the reference flow id of the job if the job is a subflow',
ref_flow_path VARCHAR(1024) COMMENT 'the reference flow path of the job if the job is a subflow',
pre_jobs VARCHAR(4096) COMMENT 'comma separated job ids that run before this job', pre_jobs VARCHAR(4096) COMMENT 'comma separated job ids that run before this job',
post_jobs VARCHAR(4096) COMMENT 'comma separated job ids that run after this job', post_jobs VARCHAR(4096) COMMENT 'comma separated job ids that run after this job',
is_current CHAR(1) COMMENT 'determine if it is a current job', is_current CHAR(1) COMMENT 'determine if it is a current job',
@ -131,6 +135,7 @@ CREATE TABLE stg_flow_job (
INDEX (app_id, job_id, dag_version), INDEX (app_id, job_id, dag_version),
INDEX flow_id_idx (app_id, flow_id), INDEX flow_id_idx (app_id, flow_id),
INDEX flow_path_idx (app_id, flow_path(255)), INDEX flow_path_idx (app_id, flow_path(255)),
INDEX ref_flow_path_idx (app_id, ref_flow_path(255)),
INDEX job_path_idx (app_id, job_path(255)), INDEX job_path_idx (app_id, job_path(255)),
INDEX job_type_idx (job_type) INDEX job_type_idx (job_type)
) )

View File

@ -1,3 +1,17 @@
--
-- Copyright 2015 LinkedIn Corp. All rights reserved.
--
-- Licensed under the Apache License, Version 2.0 (the "License");
-- you may not use this file except in compliance with the License.
-- You may obtain a copy of the License at
--
-- http://www.apache.org/licenses/LICENSE-2.0
--
-- Unless required by applicable law or agreed to in writing, software
-- distributed under the License is distributed on an "AS IS" BASIS,
-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
--
CREATE TABLE `source_code_commit_info` ( CREATE TABLE `source_code_commit_info` (
`app_id` SMALLINT(5) UNSIGNED DEFAULT NULL, `app_id` SMALLINT(5) UNSIGNED DEFAULT NULL,
`repository_urn` VARCHAR(300) CHAR SET latin1 NOT NULL COMMENT 'the git repo urn', `repository_urn` VARCHAR(300) CHAR SET latin1 NOT NULL COMMENT 'the git repo urn',

View File

@ -118,6 +118,8 @@ class AzkabanExtract:
node['jobType'], node['jobType'],
'Y', 'Y',
self.wh_exec_id) self.wh_exec_id)
if node['jobType'] == 'flow':
job_record.setRefFlowPath(row['project_name'] + ":" + node['embeddedFlowId'])
job_writer.append(job_record) job_writer.append(job_record)
# job dag # job dag

View File

@ -22,7 +22,7 @@ import sys
class AzkabanTransform(SchedulerTransform): class AzkabanTransform(SchedulerTransform):
SchedulerTransform._tables["flows"]["columns"] = "app_id, flow_name, flow_group, flow_path, flow_level, source_modified_time, source_version, is_active, wh_etl_exec_id" SchedulerTransform._tables["flows"]["columns"] = "app_id, flow_name, flow_group, flow_path, flow_level, source_modified_time, source_version, is_active, wh_etl_exec_id"
SchedulerTransform._tables["jobs"]["columns"] = "app_id, flow_path, source_version, job_name, job_path, job_type, is_current, wh_etl_exec_id" SchedulerTransform._tables["jobs"]["columns"] = "app_id, flow_path, source_version, job_name, job_path, job_type, ref_flow_path, is_current, wh_etl_exec_id"
SchedulerTransform._tables["owners"]["columns"] = "app_id, flow_path, owner_id, permissions, owner_type, wh_etl_exec_id" SchedulerTransform._tables["owners"]["columns"] = "app_id, flow_path, owner_id, permissions, owner_type, wh_etl_exec_id"
SchedulerTransform._tables["flow_execs"]["columns"] = "app_id, flow_name, flow_path, source_version, flow_exec_id, flow_exec_status, attempt_id, executed_by, start_time, end_time, wh_etl_exec_id" SchedulerTransform._tables["flow_execs"]["columns"] = "app_id, flow_name, flow_path, source_version, flow_exec_id, flow_exec_status, attempt_id, executed_by, start_time, end_time, wh_etl_exec_id"
SchedulerTransform._tables["job_execs"]["columns"] = "app_id, flow_path, source_version, flow_exec_id, job_name, job_path, job_exec_id, job_exec_status, attempt_id, start_time, end_time, wh_etl_exec_id" SchedulerTransform._tables["job_execs"]["columns"] = "app_id, flow_path, source_version, flow_exec_id, job_name, job_path, job_exec_id, job_exec_status, attempt_id, start_time, end_time, wh_etl_exec_id"

View File

@ -182,6 +182,11 @@ class LdapTransform:
user_ids = [] user_ids = []
org_hierarchy_long_string = "" org_hierarchy_long_string = ""
org_hierarchy_depth_long_string = "" org_hierarchy_depth_long_string = ""
query = self._update_hierarchy_info.format(table=t.get("table"), app_id=self.app_id, user_ids=",".join(user_ids), org_hierarchy_long_string=org_hierarchy_long_string,
org_hierarchy_depth_long_string=org_hierarchy_depth_long_string)
# print query
self.wh_cursor.executemany(query)
self.wh_con.commit() self.wh_con.commit()
def find_path_for_user(self, start, pair, hierarchy): def find_path_for_user(self, start, pair, hierarchy):

View File

@ -81,9 +81,9 @@ class SchedulerLoad:
self.wh_con.commit() self.wh_con.commit()
cmd = """ cmd = """
INSERT INTO flow_job (app_id, flow_id, first_source_version, dag_version, job_id, job_name, job_path, job_type_id, job_type, pre_jobs, post_jobs, INSERT INTO flow_job (app_id, flow_id, first_source_version, dag_version, job_id, job_name, job_path, job_type_id, job_type, ref_flow_id, pre_jobs, post_jobs,
is_current, is_first, is_last, created_time, modified_time, wh_etl_exec_id) is_current, is_first, is_last, created_time, modified_time, wh_etl_exec_id)
SELECT app_id, flow_id, source_version first_source_version, dag_version, job_id, job_name, job_path, job_type_id, job_type, pre_jobs, post_jobs, SELECT app_id, flow_id, source_version first_source_version, dag_version, job_id, job_name, job_path, job_type_id, job_type, ref_flow_id, pre_jobs, post_jobs,
'Y', is_first, is_last, unix_timestamp(NOW()) created_time, NULL, wh_etl_exec_id 'Y', is_first, is_last, unix_timestamp(NOW()) created_time, NULL, wh_etl_exec_id
FROM stg_flow_job s FROM stg_flow_job s
WHERE s.app_id = {app_id} WHERE s.app_id = {app_id}
@ -94,6 +94,7 @@ class SchedulerLoad:
job_path = s.job_path, job_path = s.job_path,
job_type_id = s.job_type_id, job_type_id = s.job_type_id,
job_type = s.job_type, job_type = s.job_type,
ref_flow_id = s.ref_flow_id,
pre_jobs = s.pre_jobs, pre_jobs = s.pre_jobs,
post_jobs = s.post_jobs, post_jobs = s.post_jobs,
is_current = 'Y', is_current = 'Y',

View File

@ -149,6 +149,27 @@ class SchedulerTransform:
self.wh_cursor.execute(query) self.wh_cursor.execute(query)
self.wh_con.commit() self.wh_con.commit()
# ad hoc fix for null values, need better solution by changing the load script
query = """
UPDATE {table} stg
SET stg.ref_flow_path = null
WHERE stg.ref_flow_path = 'null' and stg.app_id = {app_id}
""".format(table=t.get("table"), app_id=self.app_id)
print query
self.wh_cursor.execute(query)
self.wh_con.commit()
# Update sub flow id from mapping table
query = """
UPDATE {table} stg
JOIN flow_source_id_map fm
ON stg.app_id = fm.app_id AND stg.ref_flow_path = fm.source_id_string
SET stg.ref_flow_id = fm.flow_id WHERE stg.app_id = {app_id}
""".format(table=t.get("table"), app_id=self.app_id)
print query
self.wh_cursor.execute(query)
self.wh_con.commit()
# Insert new job into job map to generate job id # Insert new job into job map to generate job id
query = """ query = """
INSERT INTO job_source_id_map (app_id, source_id_string) INSERT INTO job_source_id_map (app_id, source_id_string)

View File

@ -7,6 +7,7 @@ dependencies {
compile externalDependency.spring_jdbc compile externalDependency.spring_jdbc
compile externalDependency.jgit compile externalDependency.jgit
compile externalDependency.jsoup compile externalDependency.jsoup
compile externalDependency.commons_io
testCompile externalDependency.testng testCompile externalDependency.testng
testCompile project(":metadata-etl") testCompile project(":metadata-etl")
} }

View File

@ -27,6 +27,7 @@ public class AzkabanJobRecord extends AbstractRecord {
String jobName; String jobName;
String jobPath; String jobPath;
String jobType; String jobType;
String refFlowPath;
Character isCurrent; Character isCurrent;
Long whExecId; Long whExecId;
@ -51,9 +52,81 @@ public class AzkabanJobRecord extends AbstractRecord {
allFields.add(jobName); allFields.add(jobName);
allFields.add(jobPath); allFields.add(jobPath);
allFields.add(jobType); allFields.add(jobType);
allFields.add(refFlowPath);
allFields.add(isCurrent); allFields.add(isCurrent);
allFields.add(whExecId); allFields.add(whExecId);
return allFields; return allFields;
} }
/** Returns the {@code appId} field of this job record. */
public Integer getAppId() {
return appId;
}
/** Sets the {@code appId} field of this job record. */
public void setAppId(Integer appId) {
this.appId = appId;
}
/** Returns the {@code flowPath} field of this job record. */
public String getFlowPath() {
return flowPath;
}
/** Sets the {@code flowPath} field of this job record. */
public void setFlowPath(String flowPath) {
this.flowPath = flowPath;
}
/** Returns the {@code sourceVersion} field of this job record. */
public Integer getSourceVersion() {
return sourceVersion;
}
/** Sets the {@code sourceVersion} field of this job record. */
public void setSourceVersion(Integer sourceVersion) {
this.sourceVersion = sourceVersion;
}
/** Returns the {@code jobName} field of this job record. */
public String getJobName() {
return jobName;
}
/** Sets the {@code jobName} field of this job record. */
public void setJobName(String jobName) {
this.jobName = jobName;
}
/** Returns the {@code jobPath} field of this job record. */
public String getJobPath() {
return jobPath;
}
/** Sets the {@code jobPath} field of this job record. */
public void setJobPath(String jobPath) {
this.jobPath = jobPath;
}
/** Returns the {@code jobType} field of this job record. */
public String getJobType() {
return jobType;
}
/** Sets the {@code jobType} field of this job record. */
public void setJobType(String jobType) {
this.jobType = jobType;
}
/**
 * Returns the {@code refFlowPath} field of this job record.
 * May be {@code null}: nothing in this class assigns it other than
 * {@link #setRefFlowPath(String)}.
 */
public String getRefFlowPath() {
return refFlowPath;
}
/**
 * Sets the {@code refFlowPath} field of this job record.
 * NOTE(review): the extract script appears to call this only for jobs of
 * type "flow", passing "project_name:embeddedFlowId" - confirm against the
 * caller before relying on that format.
 */
public void setRefFlowPath(String refFlowPath) {
this.refFlowPath = refFlowPath;
}
/** Returns the {@code isCurrent} field of this job record. */
public Character getIsCurrent() {
return isCurrent;
}
/** Sets the {@code isCurrent} field of this job record. */
public void setIsCurrent(Character isCurrent) {
this.isCurrent = isCurrent;
}
/** Returns the {@code whExecId} field of this job record. */
public Long getWhExecId() {
return whExecId;
}
/** Sets the {@code whExecId} field of this job record. */
public void setWhExecId(Long whExecId) {
this.whExecId = whExecId;
}
} }

View File

@ -41,7 +41,7 @@ public class GitCommitRecord extends AbstractRecord {
this.gitRepoUrn = gitRepoUrn; this.gitRepoUrn = gitRepoUrn;
this.commitId = commitMetadata.getCommitId(); this.commitId = commitMetadata.getCommitId();
this.filePath = commitMetadata.getFilePath(); this.filePath = commitMetadata.getFilePath();
this.fileName = FilenameUtils.getName(this.filePath); this.fileName = commitMetadata.getFileName();
this.commitTime = commitMetadata.getCommitTime().getTime() / 1000; this.commitTime = commitMetadata.getCommitTime().getTime() / 1000;
this.committerName = commitMetadata.getCommitter(); this.committerName = commitMetadata.getCommitter();
this.committerEmail = commitMetadata.getCommitterEmail(); this.committerEmail = commitMetadata.getCommitterEmail();

View File

@ -22,6 +22,7 @@ import java.util.LinkedList;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import org.apache.commons.io.FileUtils; import org.apache.commons.io.FileUtils;
import org.apache.commons.io.FilenameUtils;
import org.eclipse.jgit.api.Git; import org.eclipse.jgit.api.Git;
import org.eclipse.jgit.api.errors.GitAPIException; import org.eclipse.jgit.api.errors.GitAPIException;
import org.eclipse.jgit.lib.Constants; import org.eclipse.jgit.lib.Constants;
@ -134,6 +135,7 @@ public class GitUtil {
for (RevCommit r : commitLog) { for (RevCommit r : commitLog) {
CommitMetadata metadata = new CommitMetadata(r.getName()); CommitMetadata metadata = new CommitMetadata(r.getName());
metadata.setFilePath(filePath); metadata.setFilePath(filePath);
metadata.setFileName(FilenameUtils.getName(filePath));
metadata.setMessage(r.getShortMessage().trim()); metadata.setMessage(r.getShortMessage().trim());
// Difference between committer and author // Difference between committer and author
// refer to: http://git-scm.com/book/ch2-3.html // refer to: http://git-scm.com/book/ch2-3.html
@ -172,6 +174,7 @@ public class GitUtil {
String committerEmail; String committerEmail;
String authorEmail; String authorEmail;
String filePath; String filePath;
String fileName;
public CommitMetadata() { public CommitMetadata() {
} }
@ -181,7 +184,7 @@ public class GitUtil {
} }
public CommitMetadata(String commitId, String author, String committer, Date commitTime, String message, public CommitMetadata(String commitId, String author, String committer, Date commitTime, String message,
String committerEmail, String authorEmail, String filePath) { String committerEmail, String authorEmail, String filePath, String fileName) {
this.commitId = commitId; this.commitId = commitId;
this.author = author; this.author = author;
this.committer = committer; this.committer = committer;
@ -190,6 +193,7 @@ public class GitUtil {
this.committerEmail = committerEmail; this.committerEmail = committerEmail;
this.authorEmail = authorEmail; this.authorEmail = authorEmail;
this.filePath = filePath; this.filePath = filePath;
this.fileName = fileName;
} }
public String getCommitId() { public String getCommitId() {
@ -255,6 +259,14 @@ public class GitUtil {
public void setFilePath(String filePath) { public void setFilePath(String filePath) {
this.filePath = filePath; this.filePath = filePath;
} }
/** Returns the {@code fileName} stored on this commit metadata; may be {@code null} if never set. */
public String getFileName() {
return fileName;
}
/**
 * Sets the {@code fileName} stored on this commit metadata.
 * NOTE(review): the visible caller passes {@code FilenameUtils.getName(filePath)},
 * i.e. the name portion of the file path - confirm other callers do the same.
 */
public void setFileName(String fileName) {
this.fileName = fileName;
}
} }
} }

View File

@ -20,12 +20,20 @@ public class StringUtil {
public static String toDbString(Object object) { public static String toDbString(Object object) {
if (object != null) { if (object != null) {
return "'" + object.toString().replace("\'", "\\\'").replace("\"", "\\\"") + "'"; return "'" + object.toString().replace("\\", "\\\\").replace("\'", "\\\'").replace("\"", "\\\"") + "'";
} else { } else {
return "null"; return "null";
} }
} }
/**
 * Renders a value as a double-quoted CSV field.
 *
 * <p>The value's {@code toString()} form is escaped by prefixing every
 * backslash, single quote and double quote with a backslash, then wrapped in
 * double quotes. Backslashes are escaped first so that the backslashes added
 * for the quote characters are not doubled again.
 *
 * @param object the value to render; may be {@code null}
 * @return the quoted, escaped text, or the two-character marker {@code \N}
 *         when {@code object} is {@code null} (presumably the loader's NULL
 *         marker - confirm against the consumer of the CSV)
 */
public static String toCsvString(Object object) {
  if (object == null) {
    return "\\N";
  }

  String escaped = object.toString();
  escaped = escaped.replace("\\", "\\\\");
  escaped = escaped.replace("\'", "\\\'");
  escaped = escaped.replace("\"", "\\\"");

  StringBuilder field = new StringBuilder(escaped.length() + 2);
  field.append("\"").append(escaped).append("\"");
  return field.toString();
}
public static String replace(String s, String target, Object replacement) { public static String replace(String s, String target, Object replacement) {
if (replacement != null) { if (replacement != null) {
return s.replace(target, "'" + replacement.toString().replace("\'", "\\\'").replace("\"", "\\\"") + "'"); return s.replace(target, "'" + replacement.toString().replace("\'", "\\\'").replace("\"", "\\\"") + "'");