Mirror of https://github.com/datahub-project/datahub.git, synced 2025-10-05 14:06:40 +00:00

Merge pull request #7 from czbernard/linkedin

Changes to the Gitorious crawler; move connection close calls into finally blocks in the Jython scripts.

Commit 9eb7f14aa6
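The change repeated across all of the Jython scripts below is one pattern: do the work inside try: and close the cursor and connection in finally:, so that a failure partway through a run no longer leaks the database connection. A minimal runnable sketch of the pattern, using sqlite3 as a stand-in for the zxJDBC connections the real scripts hold:

import sqlite3

class ExampleLoad(object):
  """Minimal sketch of the try/finally cleanup pattern applied in this PR."""

  def __init__(self, db_path):
    # Stand-in for zxJDBC.connect(...); any DB-API connection behaves the same.
    self.wh_con = sqlite3.connect(db_path)
    self.wh_cursor = self.wh_con.cursor()

  def load_from_stg(self):
    self.wh_cursor.execute("SELECT 1")
    self.wh_con.commit()

  def run(self):
    try:
      self.load_from_stg()
    finally:
      # Runs even if load_from_stg() raises, so the cursor and
      # connection are always released.
      self.wh_cursor.close()
      self.wh_con.close()

if __name__ == "__main__":
  ExampleLoad(":memory:").run()

On CPython the same guarantee could be had with contextlib.closing, but explicit try/finally matches the Jython 2.x style used throughout these scripts.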
@@ -16,6 +16,7 @@ package metadata.etl.git;
 import java.io.File;
 import java.io.InputStream;
 import java.util.List;
+import java.util.Map;
 import java.util.Properties;
 import metadata.etl.EtlJob;
 import org.slf4j.Logger;

@@ -56,13 +57,13 @@ public class GitMetadataEtl extends EtlJob {
     }
     FileWriter fw = new FileWriter(localDir + "/" + COMMIT_OUTPUT_FILE);
     for (String project : projects) {
-      List<String> repos = GitUtil.getRepoListFromProject(GitUtil.getHttpsUrl(gitHost, project));
-      for (String repo : repos) {
-        String repoUri = GitUtil.getGitUrl(gitHost, repo);
+      Map<String, String> repos = GitUtil.getRepoListFromProject(GitUtil.getHttpsUrl(gitHost, project));
+      for (String repo : repos.keySet()) {
+        String repoUri = repos.get(repo);
         String repoDir = localDir + "/" + repo;
         GitUtil.clone(repoUri, repoDir);
-        List<GitUtil.CommitMetadata> commitMetadatas = GitUtil.getRepoMetadata(repoDir);
-        for (GitUtil.CommitMetadata m : commitMetadatas) {
+        List<GitUtil.CommitMetadata> commitMetadataList = GitUtil.getRepoMetadata(repoDir);
+        for (GitUtil.CommitMetadata m : commitMetadataList) {
           fw.append(new GitCommitRecord(m, repoUri));
         }
       }
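GitUtil.getRepoListFromProject now returns a map from repository name to clone URL (see the GitUtil diff further down), so the loop above reads each repo's URI out of the map instead of reconstructing it with GitUtil.getGitUrl. A hypothetical Python sketch of the new loop shape, with made-up repo data:

# Hypothetical sketch of the new crawl loop; repo names and URLs are made up.
repos = {
    "project/repo-a": "https://git.example.com/project/repo-a.git",
    "project/repo-b": "https://git.example.com/project/repo-b.git",
}

for repo in repos:          # iterate names, like the Java keySet() loop
    repo_uri = repos[repo]  # the clone URL now comes from the map itself
    print repo, "->", repo_uri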
@@ -65,16 +65,18 @@ class AzkabanExtract:
       print e

   def run(self):
+    try:
       self.collect_flow_jobs(self.metadata_folder + "/flow.csv", self.metadata_folder + "/job.csv", self.metadata_folder + "/dag.csv")
       self.collect_flow_owners(self.metadata_folder + "/owner.csv")
       self.collect_flow_schedules(self.metadata_folder + "/schedule.csv")
       self.collect_flow_execs(self.metadata_folder + "/flow_exec.csv", self.metadata_folder + "/job_exec.csv", self.lookback_period)
+    finally:
       self.az_cursor.close()
       self.az_con.close()

   def collect_flow_jobs(self, flow_file, job_file, dag_file):
     print "collect flow&jobs"
-    query = "SELECT f.*, p.name as project_name FROM project_flows f inner join projects p on f.project_id = p.id and f.version = p.version where p.active = 1"
+    query = "SELECT distinct f.*, p.name as project_name FROM project_flows f inner join projects p on f.project_id = p.id and f.version = p.version where p.active = 1"
     self.az_cursor.execute(query)
     rows = DbUtil.dict_cursor(self.az_cursor)
     flow_writer = FileWriter(flow_file)

@@ -89,7 +91,6 @@ class AzkabanExtract:
         unzipped_content = gzip.GzipFile(mode='r', fileobj=StringIO.StringIO(row[json_column].tostring())).read()
         try:
           row[json_column] = json.loads(unzipped_content)
-          #print json.dumps(row[json_column], indent=4)
         except:
           pass

@@ -26,8 +26,9 @@ class DatasetTreeBuilder:
     jdbc_driver = args[Constant.WH_DB_DRIVER_KEY]
     jdbc_url = args[Constant.WH_DB_URL_KEY]
     conn_mysql = zxJDBC.connect(jdbc_url, username, password, jdbc_driver)
-    query = "select distinct id, concat(SUBSTRING_INDEX(urn, ':///', 1), '/', SUBSTRING_INDEX(urn, ':///', -1)) p from dict_dataset order by urn"
     cur = conn_mysql.cursor()
+    try:
+      query = "select distinct id, concat(SUBSTRING_INDEX(urn, ':///', 1), '/', SUBSTRING_INDEX(urn, ':///', -1)) p from dict_dataset order by urn"
       cur.execute(query)
       datasets = cur.fetchall()
       self.dataset_dict = dict()

@@ -39,6 +40,7 @@ class DatasetTreeBuilder:
       current["__ID_OF_DATASET__"] = dataset[0]
       self.file_name = args[Constant.DATASET_TREE_FILE_NAME_KEY]
       self.value = []
+    finally:
       cur.close()
       conn_mysql.close()

@@ -26,8 +26,9 @@ class FlowTreeBuilder:
     jdbc_driver = args[Constant.WH_DB_DRIVER_KEY]
     jdbc_url = args[Constant.WH_DB_URL_KEY]
     conn_mysql = zxJDBC.connect(jdbc_url, username, password, jdbc_driver)
-    query = "select distinct f.flow_id, f.flow_name, f.flow_group, ca.app_code from flow f join cfg_application ca on f.app_id = ca.app_id order by app_code, flow_name"
     cur = conn_mysql.cursor()
+    try:
+      query = "select distinct f.flow_id, f.flow_name, f.flow_group, ca.app_code from flow f join cfg_application ca on f.app_id = ca.app_id order by app_code, flow_name"
       cur.execute(query)
       flows = cur.fetchall()
       self.flow_dict = dict()

@@ -45,6 +46,7 @@ class FlowTreeBuilder:
       current["__ID_OF_FLOW__"] = flow[0]
       self.file_name = args[Constant.FLOW_TREE_FILE_NAME_KEY]
       self.value = []
+    finally:
       cur.close()
       conn_mysql.close()

@@ -20,7 +20,6 @@ import sys

-
 class GitLoad:

   def __init__(self, args):
     self.wh_con = zxJDBC.connect(args[Constant.WH_DB_URL_KEY],
                                  args[Constant.WH_DB_USERNAME_KEY],

@@ -30,7 +29,9 @@ class GitLoad:
     self.app_id = int(args[Constant.APP_ID_KEY])

   def run(self):
+    try:
       self.load_from_stg()
+    finally:
       self.wh_cursor.close()
       self.wh_con.close()

@@ -50,6 +51,7 @@ class GitLoad:
     self.wh_cursor.execute(query)
     self.wh_con.commit()
+

 if __name__ == "__main__":
   props = sys.argv[1]
   git = GitLoad(props)
@@ -51,7 +51,9 @@ class OwnerTransform:
     self.metadata_folder = self.app_folder + "/" + str(self.app_id)

   def run(self):
+    try:
       self.read_file_to_stg()
+    finally:
       self.wh_cursor.close()
       self.wh_con.close()

@@ -75,6 +77,7 @@ class OwnerTransform:
     self.wh_cursor.execute(query)
     self.wh_con.commit()
+

 if __name__ == "__main__":
   props = sys.argv[1]
   ot = OwnerTransform(props)
@@ -263,7 +263,9 @@ if __name__ == "__main__":
   l.db_id = args[Constant.DB_ID_KEY]
   l.wh_etl_exec_id = args[Constant.WH_EXEC_ID_KEY]
   l.conn_mysql = zxJDBC.connect(JDBC_URL, username, password, JDBC_DRIVER)
+  try:
     l.load_metadata()
     l.load_field()
     l.load_sample()
+  finally:
     l.conn_mysql.close()
@@ -24,7 +24,6 @@ from java.io import FileWriter

-
 class LdapExtract:

   def __init__(self, args):
     self.args = args
     self.app_id = int(args[Constant.APP_ID_KEY])

@@ -228,11 +227,8 @@ class LdapExtract:
     self.fetch_ldap_group(self.metadata_folder + "/ldap_group_record.csv")
     self.fetch_ldap_group_flatten(self.metadata_folder + "/ldap_group_flatten_record.csv")


 if __name__ == "__main__":
   props = sys.argv[1]
   ldap = LdapExtract(props)
   ldap.run()
-
-
-
@@ -20,7 +20,6 @@ import sys

-
 class LdapLoad:

   def __init__(self, args):
     self.wh_con = zxJDBC.connect(args[Constant.WH_DB_URL_KEY],
                                  args[Constant.WH_DB_USERNAME_KEY],

@@ -32,7 +31,9 @@ class LdapLoad:
     self.metadata_folder = self.app_folder + "/" + str(self.app_id)

   def run(self):
+    try:
       self.load_from_stg()
+    finally:
       self.wh_cursor.close()
       self.wh_con.close()

@@ -99,6 +100,7 @@ class LdapLoad:
     self.wh_cursor.execute(query)
     self.wh_con.commit()
+

 if __name__ == "__main__":
   props = sys.argv[1]
   lt = LdapLoad(props)
@@ -20,7 +20,8 @@ import sys


 class LdapTransform:
-  _tables = {"ldap_user": {"columns": "app_id, is_active, user_id, urn, full_name, display_name, title, employee_number, manager_urn, email, department_id, department_name, start_date, mobile_phone, wh_etl_exec_id",
+  _tables = {"ldap_user": {
+    "columns": "app_id, is_active, user_id, urn, full_name, display_name, title, employee_number, manager_urn, email, department_id, department_name, start_date, mobile_phone, wh_etl_exec_id",
     "file": "ldap_user_record.csv",
     "table": "stg_dir_external_user_info",
     "nullif_columns":

@@ -105,10 +106,12 @@ class LdapTransform:
     self.ceo_user_id = args[Constant.LDAP_CEO_USER_ID_KEY]

   def run(self):
+    try:
       self.read_file_to_stg()
       self.update_null_value()
       self.update_manager_info()
       self.update_hierarchy_info()
+    finally:
       self.wh_cursor.close()
       self.wh_con.close()

@@ -172,7 +175,8 @@ class LdapTransform:
         org_hierarchy_depth_long_string += case_org_hierarchy_depth_template.format(user_id=user, org_hierarchy_depth=hierarchy[user][1])
         count += 1
         if count % 1000 == 0:
-          query = self._update_hierarchy_info.format(table=t.get("table"), app_id=self.app_id, user_ids=",".join(user_ids), org_hierarchy_long_string=org_hierarchy_long_string, org_hierarchy_depth_long_string=org_hierarchy_depth_long_string)
+          query = self._update_hierarchy_info.format(table=t.get("table"), app_id=self.app_id, user_ids=",".join(user_ids), org_hierarchy_long_string=org_hierarchy_long_string,
+                                                     org_hierarchy_depth_long_string=org_hierarchy_depth_long_string)
           # print query
           self.wh_cursor.executemany(query)
           user_ids = []

@@ -200,6 +204,7 @@ class LdapTransform:
     hierarchy[start] = current
     return current
+

 if __name__ == "__main__":
   props = sys.argv[1]
   lt = LdapTransform(props)
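update_hierarchy_info above accumulates per-user CASE fragments and flushes them every 1000 users before resetting its accumulators. A stripped-down sketch of that batching shape (flush_batch and the user list are illustrative stand-ins, not the actual LdapTransform internals):

BATCH_SIZE = 1000  # matches the `count % 1000 == 0` flush in the diff

def flush_batch(ids):
  # Stand-in for formatting _update_hierarchy_info and running it on the cursor.
  print "flushing %d user ids" % len(ids)

all_users = ["user%d" % i for i in range(2500)]  # made-up input

user_ids = []
count = 0
for user in all_users:
  user_ids.append(user)
  count += 1
  if count % BATCH_SIZE == 0:
    flush_batch(user_ids)
    user_ids = []  # reset the accumulator for the next batch
if user_ids:       # flush the final partial batch
  flush_batch(user_ids)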
@@ -63,6 +63,7 @@ class OozieExtract:
     print "Oozie version: ", self.oz_version[0]

   def run(self):
+    try:
       self.collect_flow_jobs(self.metadata_folder + "/flow.csv",
                              self.metadata_folder + "/job.csv",
                              self.metadata_folder + "/dag.csv")

@@ -70,6 +71,7 @@ class OozieExtract:
       self.collect_flow_schedules(self.metadata_folder + "/schedule.csv")
       self.collect_flow_execs(self.metadata_folder + "/flow_exec.csv", self.lookback_period)
       self.collect_job_execs(self.metadata_folder + "/job_exec.csv", self.lookback_period)
+    finally:
       self.oz_cursor.close()
       self.oz_con.close()

@@ -22,7 +22,6 @@ import sys

-
 class OwnerLoad:

   def __init__(self, args):
     self.wh_con = zxJDBC.connect(args[Constant.WH_DB_URL_KEY],
                                  args[Constant.WH_DB_USERNAME_KEY],

@@ -33,6 +32,7 @@ class OwnerLoad:
     self.app_folder = args[Constant.WH_APP_FOLDER_KEY]

   def run(self):
+    try:
       cmd = """
       INSERT INTO dataset_owner (dataset_id, dataset_urn, owner_id, sort_id, namespace, app_id, owner_type, owner_sub_type, db_ids, is_group, is_active, source_time, created_time, wh_etl_exec_id)
       SELECT * FROM (SELECT dataset_id, dataset_urn, owner_id, sort_id, namespace, app_id, owner_type, owner_sub_type, group_concat(db_id ORDER BY db_id SEPARATOR ",") db_ids, is_group, is_active, source_time, unix_timestamp(NOW()) time_created, {wh_etl_exec_id}

@@ -107,10 +107,11 @@ class OwnerLoad:
       """
       self.wh_cursor.execute(cmd)
       self.wh_con.commit()
+    finally:
       self.wh_cursor.close()
       self.wh_con.close()


 if __name__ == "__main__":
   props = sys.argv[1]
   ot = OwnerLoad(props)
@@ -94,11 +94,13 @@ class OwnerTransform:
     self.metadata_folder = self.app_folder + "/" + str(self.db_id)

   def run(self):
+    try:
       self.read_file_to_stg()
       self.update_dataset_id()
       self.update_database_id()
       self.update_app_id()
       self.update_owner_type()
+    finally:
       self.wh_cursor.close()
       self.wh_con.close()

@@ -158,6 +160,7 @@ class OwnerTransform:
     self.wh_cursor.execute(query)
     self.wh_con.commit()
+

 if __name__ == "__main__":
   props = sys.argv[1]
   ot = OwnerTransform(props)
@@ -32,6 +32,7 @@ class SchedulerLoad:
     self.wh_cursor = self.wh_con.cursor()

   def run(self):
+    try:
       self.load_flows()
       self.load_jobs()
       self.load_flow_dags()

@@ -39,6 +40,7 @@ class SchedulerLoad:
       self.load_flow_owner_permissions()
       self.load_flow_executions()
       self.load_job_executions()
+    finally:
       self.wh_cursor.close()
       self.wh_con.close()

@@ -84,6 +84,7 @@ class SchedulerTransform:
     self.metadata_folder = self.app_folder + "/" + str(scheduler_type) + "/" + str(self.app_id)

   def run(self):
+    try:
       self.read_flow_file_to_stg()
       self.read_job_file_to_stg()
       self.read_dag_file_to_stg()

@@ -91,6 +92,7 @@ class SchedulerTransform:
       self.read_flow_schedule_file_to_stg()
       self.read_flow_exec_file_to_stg()
       self.read_job_exec_file_to_stg()
+    finally:
       self.wh_cursor.close()
       self.wh_con.close()

@@ -538,6 +538,7 @@ if __name__ == "__main__":

   e = TeradataExtract()
   e.conn_td = zxJDBC.connect(JDBC_URL, username, password, JDBC_DRIVER)
+  try:
     e.conn_td.cursor().execute("SET QUERY_BAND = 'script=%s; pid=%d; ' FOR SESSION;" % ('TeradataExtract.py', os.getpid()))
     e.conn_td.commit()
     e.log_file = args[Constant.TD_LOG_KEY]

@@ -547,5 +548,6 @@ if __name__ == "__main__":
                  'J': 'Join Index', 'U': 'Unique Index'}

     e.run(None, None, args[Constant.TD_SCHEMA_OUTPUT_KEY], args[Constant.TD_SAMPLE_OUTPUT_KEY])
+  finally:
     e.conn_td.close()

@@ -253,7 +253,9 @@ if __name__ == "__main__":
   l.db_id = args[Constant.DB_ID_KEY]
   l.wh_etl_exec_id = args[Constant.WH_EXEC_ID_KEY]
   l.conn_mysql = zxJDBC.connect(JDBC_URL, username, password, JDBC_DRIVER)
+  try:
     l.load_metadata()
     l.load_field()
     l.load_sample()
+  finally:
     l.conn_mysql.close()
@@ -17,8 +17,10 @@ import java.io.File;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Date;
+import java.util.HashMap;
 import java.util.LinkedList;
 import java.util.List;
+import java.util.Map;
 import org.apache.commons.io.FileUtils;
 import org.eclipse.jgit.api.Git;
 import org.eclipse.jgit.api.errors.GitAPIException;

@@ -80,15 +82,18 @@ public class GitUtil {
   * @return List of path of repositories e.g. project/repo
   * @throws IOException
   */
-  public static List<String> getRepoListFromProject(String projectUrl) throws IOException {
-    List<String> repoList = new LinkedList<>();
-    Document doc = Jsoup.connect(projectUrl).get();
-    Elements repos = doc.getElementsByClass("repository");
-    for (Element e : repos) {
-      String repo = e.children().first().text();
-      repoList.add(repo.trim());
+  public static Map<String, String> getRepoListFromProject(String projectUrl) throws IOException {
+    Map<String, String> repoList = new HashMap<>();
+    Document doc = Jsoup.connect(projectUrl).data("format", "xml").get();
+    Elements repos = doc.getElementsByTag("repositories");
+    Elements mainlines = repos.first().getElementsByTag("mainlines");
+    Elements repo = mainlines.first().getElementsByTag("repository");
+
+    for (Element e : repo) {
+      String repoName = e.getElementsByTag("name").first().text();
+      String repoUrl = e.getElementsByTag("clone_url").first().text();
+      repoList.put(repoName.trim(), repoUrl.trim());
     }

     return repoList;
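The rewritten getRepoListFromProject fetches the Gitorious project page with format=xml and walks repositories -> mainlines -> repository, reading each repository's name and clone_url. A hypothetical Python sketch of the same traversal, assuming that response shape (the sample XML below is constructed from the tag names visible in the Java code, not captured from a real Gitorious server):

import xml.etree.ElementTree as ET

# Assumed shape of a Gitorious project page fetched with ?format=xml.
SAMPLE = """
<project>
  <repositories>
    <mainlines>
      <repository>
        <name>repo-a</name>
        <clone_url>https://git.example.com/project/repo-a.git</clone_url>
      </repository>
      <repository>
        <name>repo-b</name>
        <clone_url>https://git.example.com/project/repo-b.git</clone_url>
      </repository>
    </mainlines>
  </repositories>
</project>
"""

root = ET.fromstring(SAMPLE)
repo_list = {}
# Same traversal as the Java code: repositories -> mainlines -> repository.
for repo in root.find("repositories").find("mainlines").findall("repository"):
  name = repo.find("name").text.strip()
  clone_url = repo.find("clone_url").text.strip()
  repo_list[name] = clone_url

print repo_list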
@@ -29,7 +29,7 @@ public class GitUtilTest {
   @Test
   public void testGetRepoListFromProject()
       throws Exception {
-    //List<String> repos = GitUtil.getRepoListFromProject("git://git.example.com/project");
+    //Map<String, String> repos = GitUtil.getRepoListFromProject("git://git.example.com/project");
     //Assert.assertTrue(repos.size() > 0);
   }

|
Loading…
x
Reference in New Issue
Block a user