Mirror of https://github.com/datahub-project/datahub.git
Synced 2025-10-10 08:26:39 +00:00
Optimize dataset load scripts, improve speed (#350)
- When loading dataset fields into the staging table, populate the dataset_id field first, then use it in later JOINs (see the SQL sketches below).
- When joining two big tables such as dict_field_detail, use a pre-select to reduce the JOIN size and the DB resources used.
- Refactor some SQL code.
- Modify the logback settings to better capture log timestamps.
- Remove unnecessary config from the backend application.conf.
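As a rough illustration of the first point above (not the verbatim statements from the diff), the following SQL sketch resolves the staging table's dataset_id once against dict_dataset and drops unmatched rows, so later statements can join on dataset_id instead of repeating the urn lookup; db_id = 0 stands in for the scripts' {db_id} template parameter.

-- Populate stg_dict_field_detail.dataset_id up front via the urn lookup.
update stg_dict_field_detail sf, dict_dataset d
set sf.dataset_id = d.id
where sf.urn = d.urn and sf.db_id = 0;

-- Remove staging rows that matched no dataset, so every later join can rely on dataset_id.
delete from stg_dict_field_detail
where db_id = 0 and dataset_id is null;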
This commit is contained in:
parent d9438dc5bc
commit b6e644fbb1
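The second point, applied in the load_field methods below, shrinks the JOIN against the large dict_field_detail table by pre-selecting only the rows and columns each side needs before the RIGHT JOIN that finds fields to delete. A schematic version of that statement, again with a placeholder db_id of 0 rather than the scripts' template parameter:

-- Pre-select both sides so the RIGHT JOIN works on small derived tables.
create temporary table if not exists t_deleted_fields (primary key (field_id)) ENGINE=MyISAM
SELECT x.field_id
FROM (select dataset_id, field_name, parent_path
      from stg_dict_field_detail where db_id = 0) s
RIGHT JOIN (select dataset_id, field_id, field_name, parent_path
            from dict_field_detail
            where dataset_id in (select dataset_id from stg_dict_field_detail where db_id = 0)) x
ON s.dataset_id = x.dataset_id
AND s.field_name = x.field_name
AND s.parent_path = x.parent_path
WHERE s.field_name is null;  -- fields that no longer appear in this load batch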
@@ -41,7 +41,6 @@ db.wherehows.driver = com.mysql.jdbc.Driver
db.wherehows.url = ${WHZ_DB_URL}
db.wherehows.username = ${WHZ_DB_USERNAME}
db.wherehows.password = ${WHZ_DB_PASSWORD}
db.wherehows.host = ${WHZ_DB_HOST}

# You can expose this datasource via JNDI if needed (Useful for JPA)
# db.default.jndiName=DefaultDS
@@ -19,7 +19,7 @@

<appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
<encoder>
<pattern>%coloredLevel - %logger - %message%n%xException</pattern>
<pattern>%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n</pattern>
</encoder>
</appender>
@@ -27,7 +27,6 @@ class HdfsLoad:
Load dataset metadata into final table
:return: nothing
"""
cursor = self.conn_mysql.cursor()
load_cmd = '''
DELETE FROM stg_dict_dataset WHERE db_id = {db_id};

@@ -197,15 +196,11 @@ class HdfsLoad:
;
'''.format(source_file=self.input_file, db_id=self.db_id, wh_etl_exec_id=self.wh_etl_exec_id)

for state in load_cmd.split(";"):
self.logger.debug(state)
cursor.execute(state)
self.conn_mysql.commit()
cursor.close()
self.executeCommands(load_cmd)
self.logger.info("finish loading hdfs metadata db_id={db_id} to dict_dataset".format(db_id=self.db_id))

def load_field(self):
cursor = self.conn_mysql.cursor()
load_field_cmd = '''
DELETE FROM stg_dict_field_detail where db_id = {db_id};

@@ -242,24 +237,25 @@ class HdfsLoad:

analyze table field_comments;

-- update stg_dict_field_detail dataset_id
update stg_dict_field_detail sf, dict_dataset d
set sf.dataset_id = d.id where sf.urn = d.urn
and sf.db_id = {db_id};
delete from stg_dict_field_detail
where db_id = {db_id} and dataset_id is null; -- remove if not match to dataset

-- delete old record if it does not exist in this load batch anymore (but have the dataset id)
create temporary table if not exists t_deleted_fields (primary key (field_id))
select x.field_id
from stg_dict_field_detail s
join dict_dataset i
on s.urn = i.urn
and s.db_id = {db_id}
right join dict_field_detail x
on i.id = x.dataset_id
and s.field_name = x.field_name
and s.parent_path = x.parent_path
where s.field_name is null
and x.dataset_id in (
select d.id dataset_id
from stg_dict_field_detail k join dict_dataset d
on k.urn = d.urn
and k.db_id = {db_id}
)
create temporary table if not exists t_deleted_fields (primary key (field_id)) ENGINE=MyISAM
SELECT x.field_id
FROM (select dataset_id, field_name, parent_path from stg_dict_field_detail where db_id = {db_id}) s
RIGHT JOIN
( select dataset_id, field_id, field_name, parent_path from dict_field_detail
where dataset_id in (select dataset_id from stg_dict_field_detail where db_id = {db_id})
) x
ON s.dataset_id = x.dataset_id
AND s.field_name = x.field_name
AND s.parent_path = x.parent_path
WHERE s.field_name is null
; -- run time : ~2min

delete from dict_field_detail where field_id in (select field_id from t_deleted_fields);
@@ -269,12 +265,10 @@ class HdfsLoad:
(
select x.field_id, s.*
from (select * from stg_dict_field_detail where db_id = {db_id}) s
join dict_dataset d
on s.urn = d.urn
join dict_field_detail x
on s.field_name = x.field_name
and coalesce(s.parent_path, '*') = coalesce(x.parent_path, '*')
and d.id = x.dataset_id
and s.dataset_id = x.dataset_id
where (x.sort_id <> s.sort_id
or x.parent_sort_id <> s.parent_sort_id
or x.data_type <> s.data_type
@@ -307,12 +301,11 @@ class HdfsLoad:
modified
)
select
d.id, 0, sf.sort_id, sf.parent_sort_id, sf.parent_path,
sf.dataset_id, 0, sf.sort_id, sf.parent_sort_id, sf.parent_path,
sf.field_name, sf.namespace, sf.data_type, sf.data_size, sf.is_nullable, sf.default_value, now()
from stg_dict_field_detail sf join dict_dataset d
on sf.urn = d.urn
from stg_dict_field_detail sf
left join dict_field_detail t
on d.id = t.dataset_id
on sf.dataset_id = t.dataset_id
and sf.field_name = t.field_name
and sf.parent_path = t.parent_path
where db_id = {db_id} and t.field_id is null
@@ -325,13 +318,12 @@ class HdfsLoad:

-- insert
insert into stg_dict_dataset_field_comment
select t.field_id field_id, fc.id comment_id, d.id dataset_id, {db_id}
from stg_dict_field_detail sf join dict_dataset d
on sf.urn = d.urn
select t.field_id field_id, fc.id comment_id, sf.dataset_id dataset_id, {db_id}
from stg_dict_field_detail sf
join field_comments fc
on sf.description = fc.comment
join dict_field_detail t
on d.id = t.dataset_id
on sf.dataset_id = t.dataset_id
and sf.field_name = t.field_name
and sf.parent_path = t.parent_path
where sf.db_id = {db_id};
@@ -357,15 +349,12 @@ class HdfsLoad:
DELETE FROM stg_dict_field_detail where db_id = {db_id};

'''.format(source_file=self.input_field_file, db_id=self.db_id)
for state in load_field_cmd.split(";"):
self.logger.debug(state)
cursor.execute(state)
self.conn_mysql.commit()
cursor.close()

self.executeCommands(load_field_cmd)
self.logger.info("finish loading hdfs metadata db_id={db_id} to dict_field_detail".format(db_id=self.db_id))

def load_sample(self):
cursor = self.conn_mysql.cursor()
load_sample_cmd = '''
DELETE FROM stg_dict_dataset_sample where db_id = {db_id};

@@ -375,11 +364,11 @@ class HdfsLoad:
(urn,ref_urn,data)
SET db_id = {db_id};

-- update reference id in stagging table
UPDATE stg_dict_dataset_sample s
-- update reference id in staging table
UPDATE stg_dict_dataset_sample s
JOIN dict_dataset d ON s.ref_urn = d.urn
SET s.ref_id = d.id
WHERE s.db_id = {db_id} AND s.ref_urn > '' AND s.ref_urn <> 'null';
WHERE s.db_id = {db_id} AND s.ref_urn > '';

-- first insert ref_id as 0
INSERT INTO dict_dataset_sample
@@ -401,14 +390,18 @@ class HdfsLoad:
SET d.ref_id = s.ref_id
WHERE s.db_id = {db_id} AND d.ref_id = 0;
'''.format(source_file=self.input_sample_file, db_id=self.db_id)
for state in load_sample_cmd.split(";"):
self.logger.debug(state)
cursor.execute(state)
self.conn_mysql.commit()
cursor.close()

self.executeCommands(load_sample_cmd)
self.logger.info("finish loading hdfs sample data db_id={db_id} to dict_dataset_sample".format(db_id=self.db_id))

def executeCommands(self, commands):
for cmd in commands.split(";"):
self.logger.debug(cmd)
self.conn_cursor.execute(cmd)
self.conn_mysql.commit()

if __name__ == "__main__":
args = sys.argv[1]

@@ -426,10 +419,11 @@ if __name__ == "__main__":
l.db_id = args[Constant.DB_ID_KEY]
l.wh_etl_exec_id = args[Constant.WH_EXEC_ID_KEY]
l.conn_mysql = zxJDBC.connect(JDBC_URL, username, password, JDBC_DRIVER)
l.conn_cursor = l.conn_mysql.cursor()

if Constant.INNODB_LOCK_WAIT_TIMEOUT in args:
lock_wait_time = args[Constant.INNODB_LOCK_WAIT_TIMEOUT]
l.conn_mysql.cursor().execute("SET innodb_lock_wait_timeout = %s;" % lock_wait_time)
l.conn_cursor.execute("SET innodb_lock_wait_timeout = %s;" % lock_wait_time)

try:
l.load_metadata()
@@ -23,7 +23,6 @@ class HiveLoad:
self.logger = LoggerFactory.getLogger("%s[%s]" % (self.__class__.__name__, wh_etl_exec_id))

def load_metadata(self):
cursor = self.conn_mysql.cursor()
load_cmd = """
DELETE FROM stg_dict_dataset WHERE db_id = {db_id};

@@ -97,18 +96,15 @@ class HiveLoad:
;
""".format(source_file=self.input_schema_file, db_id=self.db_id, wh_etl_exec_id=self.wh_etl_exec_id)

for state in load_cmd.split(";"):
self.logger.info(state)
cursor.execute(state)
self.conn_mysql.commit()
cursor.close()
self.executeCommands(load_cmd)
self.logger.info("Load dataset metadata.")

def load_field(self):
"""
Load fields
:return:
"""
cursor = self.conn_mysql.cursor()
load_cmd = """
DELETE FROM stg_dict_field_detail WHERE db_id = {db_id};

@@ -125,30 +121,26 @@ class HiveLoad:
, description=nullif(@description,'null')
, last_modified=now();

-- update dataset_id
-- update stg_dict_field_detail dataset_id
update stg_dict_field_detail sf, dict_dataset d
set sf.dataset_id = d.id where sf.urn = d.urn
and sf.db_id = {db_id};
delete from stg_dict_field_detail
where db_id = {db_id} and dataset_id is null; -- remove if not match to dataset

-- delete old record if it does not exist in this load batch anymore (but have the dataset id)
-- join with dict_dataset to avoid right join using index. (using index will slow down the query)
create temporary table if not exists t_deleted_fields (primary key (field_id))
select x.field_id
from stg_dict_field_detail s
join dict_dataset i
on s.urn = i.urn
and s.db_id = {db_id}
right join dict_field_detail x
on i.id = x.dataset_id
and s.field_name = x.field_name
and s.parent_path = x.parent_path
where s.field_name is null
and x.dataset_id in (
select d.id dataset_id
from stg_dict_field_detail k join dict_dataset d
on k.urn = d.urn
and k.db_id = {db_id}
)
create temporary table if not exists t_deleted_fields (primary key (field_id)) ENGINE=MyISAM
SELECT x.field_id
FROM (select dataset_id, field_name, parent_path from stg_dict_field_detail where db_id = {db_id}) s
RIGHT JOIN
( select dataset_id, field_id, field_name, parent_path from dict_field_detail
where dataset_id in (select dataset_id from stg_dict_field_detail where db_id = {db_id})
) x
ON s.dataset_id = x.dataset_id
AND s.field_name = x.field_name
AND s.parent_path = x.parent_path
WHERE s.field_name is null
; -- run time : ~2min

delete from dict_field_detail where field_id in (select field_id from t_deleted_fields);
@@ -190,7 +182,7 @@ class HiveLoad:

-- insert new ones
CREATE TEMPORARY TABLE IF NOT EXISTS t_existed_field
( primary key (urn, sort_id, db_id) )
( primary key (urn, sort_id, db_id) ) ENGINE=MyISAM
AS (
SELECT sf.urn, sf.sort_id, sf.db_id, count(*) field_count
FROM stg_dict_field_detail sf
@@ -203,8 +195,7 @@ class HiveLoad:
group by 1,2,3
);

insert ignore into dict_field_detail (
insert ignore into dict_field_detail (
dataset_id, fields_layout_id, sort_id, parent_sort_id, parent_path,
field_name, namespace, data_type, data_size, is_nullable, default_value,
modified
@@ -239,7 +230,6 @@ class HiveLoad:
where field_id in (select field_id from stg_dict_dataset_field_comment)
and is_default = 1 ) and db_id = {db_id};

-- doesn't have this comment before, insert into it and set as default
insert ignore into dict_dataset_field_comment
select sd.field_id, sd.comment_id, sd.dataset_id, 1 from stg_dict_dataset_field_comment sd
@@ -249,7 +239,6 @@ class HiveLoad:
where d.comment_id is null
and sd.db_id = {db_id};

insert into field_comments (
user_id, comment, created, modified, comment_crc32_checksum
)
@@ -266,20 +255,15 @@ class HiveLoad:

""".format(source_file=self.input_field_file, db_id=self.db_id)

# didn't load into final table for now
self.executeCommands(load_cmd)
self.logger.info("Load dataset fields.")

for state in load_cmd.split(";"):
self.logger.info(state)
cursor.execute(state)
self.conn_mysql.commit()
cursor.close()

def load_dataset_instance(self):
"""
Load dataset instance
:return:
"""
cursor = self.conn_mysql.cursor()
load_cmd = """
DELETE FROM stg_dict_dataset_instance WHERE db_id = {db_id};

@@ -337,21 +321,15 @@ class HiveLoad:
;
""".format(source_file=self.input_instance_file, db_id=self.db_id, wh_etl_exec_id=self.wh_etl_exec_id)

self.executeCommands(load_cmd)
self.logger.info("Load dataset instance.")

# didn't load into final table for now

for state in load_cmd.split(";"):
self.logger.info(state)
cursor.execute(state)
self.conn_mysql.commit()
cursor.close()

def load_dataset_dependencies(self):
"""
Load dataset instance
Load dataset dependencies
:return:
"""
cursor = self.conn_mysql.cursor()
load_cmd = """
DELETE FROM stg_cfg_object_name_map;
LOAD DATA LOCAL INFILE '{source_file}'
@@ -372,7 +350,7 @@ class HiveLoad:
-- create to be deleted table
DROP TEMPORARY table IF EXISTS t_deleted_depend;

CREATE TEMPORARY TABLE t_deleted_depend
CREATE TEMPORARY TABLE t_deleted_depend ENGINE=MyISAM
AS (
SELECT DISTINCT c.obj_name_map_id
FROM cfg_object_name_map c LEFT JOIN stg_cfg_object_name_map s
@@ -420,12 +398,15 @@ class HiveLoad:
""".format(source_file=self.input_dependency_file)

# didn't load into final table for now
self.executeCommands(load_cmd)
self.logger.info("Load dataset dependencies.")

for state in load_cmd.split(";"):
self.logger.info(state)
cursor.execute(state)

def executeCommands(self, commands):
for cmd in commands.split(";"):
self.logger.debug(cmd)
self.conn_cursor.execute(cmd)
self.conn_mysql.commit()
cursor.close()

if __name__ == "__main__":
@@ -446,10 +427,11 @@ if __name__ == "__main__":
l.db_id = args[Constant.DB_ID_KEY]
l.wh_etl_exec_id = args[Constant.WH_EXEC_ID_KEY]
l.conn_mysql = zxJDBC.connect(JDBC_URL, username, password, JDBC_DRIVER)
l.conn_cursor = l.conn_mysql.cursor()

if Constant.INNODB_LOCK_WAIT_TIMEOUT in args:
lock_wait_time = args[Constant.INNODB_LOCK_WAIT_TIMEOUT]
l.conn_mysql.cursor().execute("SET innodb_lock_wait_timeout = %s;" % lock_wait_time)
l.conn_cursor.execute("SET innodb_lock_wait_timeout = %s;" % lock_wait_time)

try:
l.load_metadata()
@@ -15,7 +15,7 @@
from com.ziclix.python.sql import zxJDBC
from wherehows.common import Constant
from org.slf4j import LoggerFactory
import sys, os, datetime
import sys, datetime

class OracleLoad:
@@ -159,24 +159,25 @@ class OracleLoad:

analyze table {field_comments};

-- update stg_dict_field_detail dataset_id
update stg_dict_field_detail sf, dict_dataset d
set sf.dataset_id = d.id where sf.urn = d.urn
and sf.db_id = {db_id};
delete from stg_dict_field_detail
where db_id = {db_id} and dataset_id is null; -- remove if not match to dataset

-- delete old record if it does not exist in this load batch anymore (but have the dataset id)
create temporary table if not exists t_deleted_fields (primary key (field_id))
select x.field_id
from stg_dict_field_detail s
join {dict_dataset} i
on s.urn = i.urn
and s.db_id = {db_id}
right join {dict_field_detail} x
on i.id = x.dataset_id
and s.field_name = x.field_name
and s.parent_path = x.parent_path
where s.field_name is null
and x.dataset_id in (
select d.id dataset_id
from stg_dict_field_detail k join {dict_dataset} d
on k.urn = d.urn
and k.db_id = {db_id}
)
create temporary table if not exists t_deleted_fields (primary key (field_id)) ENGINE=MyISAM
SELECT x.field_id
FROM (select dataset_id, field_name, parent_path from stg_dict_field_detail where db_id = {db_id}) s
RIGHT JOIN
( select dataset_id, field_id, field_name, parent_path from dict_field_detail
where dataset_id in (select dataset_id from stg_dict_field_detail where db_id = {db_id})
) x
ON s.dataset_id = x.dataset_id
AND s.field_name = x.field_name
AND s.parent_path = x.parent_path
WHERE s.field_name is null
; -- run time : ~2min

delete from {dict_field_detail} where field_id in (select field_id from t_deleted_fields);
@@ -186,12 +187,10 @@ class OracleLoad:
(
select x.field_id, s.*
from stg_dict_field_detail s
join {dict_dataset} d
on s.urn = d.urn
join {dict_field_detail} x
on s.field_name = x.field_name
and coalesce(s.parent_path, '*') = coalesce(x.parent_path, '*')
and d.id = x.dataset_id
and s.dataset_id = x.dataset_id
where s.db_id = {db_id}
and (x.sort_id <> s.sort_id
or x.parent_sort_id <> s.parent_sort_id
@@ -224,12 +223,11 @@ class OracleLoad:
field_name, namespace, data_type, data_size, is_nullable, default_value, modified
)
select
d.id, 0, sf.sort_id, sf.parent_sort_id, sf.parent_path,
sf.dataset_id, 0, sf.sort_id, sf.parent_sort_id, sf.parent_path,
sf.field_name, sf.namespace, sf.data_type, sf.data_size, sf.is_nullable, sf.default_value, now()
from stg_dict_field_detail sf join {dict_dataset} d
on sf.urn = d.urn
from stg_dict_field_detail sf
left join {dict_field_detail} t
on d.id = t.dataset_id
on sf.dataset_id = t.dataset_id
and sf.field_name = t.field_name
and sf.parent_path = t.parent_path
where db_id = {db_id} and t.field_id is null
@@ -242,13 +240,12 @@ class OracleLoad:

-- insert
insert into stg_dict_dataset_field_comment
select t.field_id field_id, fc.id comment_id, d.id dataset_id, {db_id}
from stg_dict_field_detail sf join {dict_dataset} d
on sf.urn = d.urn
select t.field_id field_id, fc.id comment_id, sf.dataset_id dataset_id, {db_id}
from stg_dict_field_detail sf
join {field_comments} fc
on sf.description = fc.comment
join {dict_field_detail} t
on d.id = t.dataset_id
on sf.dataset_id = t.dataset_id
and sf.field_name = t.field_name
and sf.parent_path = t.parent_path
where sf.db_id = {db_id};
@@ -290,7 +287,7 @@ class OracleLoad:
(urn,ref_urn,data)
SET db_id = {db_id};

-- update reference id in stagging table
-- update reference id in staging table
UPDATE stg_dict_dataset_sample s
LEFT JOIN {dict_dataset} d ON s.ref_urn = d.urn
SET s.ref_id = d.id
@@ -12,7 +12,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#

import sys
import sys, datetime
from com.ziclix.python.sql import zxJDBC
from distutils.util import strtobool
from wherehows.common import Constant
@@ -24,7 +24,6 @@ class TeradataLoad:
self.logger = LoggerFactory.getLogger('jython script : ' + self.__class__.__name__)

def load_metadata(self):
cursor = self.conn_mysql.cursor()
load_cmd = '''
DELETE FROM stg_dict_dataset WHERE db_id = {db_id};

@@ -173,14 +172,11 @@ class TeradataLoad:
;
'''.format(source_file=self.input_file, db_id=self.db_id, wh_etl_exec_id=self.wh_etl_exec_id)

for state in load_cmd.split(";"):
self.logger.debug(state)
cursor.execute(state)
self.conn_mysql.commit()
cursor.close()
self.executeCommands(load_cmd)
self.logger.info("Finish loading metadata")

def load_field(self):
cursor = self.conn_mysql.cursor()
load_cmd = '''
DELETE FROM stg_dict_field_detail where db_id = {db_id};

@@ -206,24 +202,25 @@ class TeradataLoad:
and (char_length(trim(description)) = 0
or description in ('null', 'N/A', 'nothing', 'empty', 'none'));

-- update stg_dict_field_detail dataset_id
update stg_dict_field_detail sf, dict_dataset d
set sf.dataset_id = d.id where sf.urn = d.urn
and sf.db_id = {db_id};
delete from stg_dict_field_detail
where db_id = {db_id} and dataset_id is null; -- remove if not match to dataset

-- delete old record if it does not exist in this load batch anymore (but have the dataset id)
create temporary table if not exists t_deleted_fields (primary key (field_id))
select x.field_id
from stg_dict_field_detail s
join dict_dataset i
on s.urn = i.urn
and s.db_id = {db_id}
right join dict_field_detail x
on i.id = x.dataset_id
and s.field_name = x.field_name
and s.parent_path = x.parent_path
where s.field_name is null
and x.dataset_id in (
select d.id dataset_id
from stg_dict_field_detail k join dict_dataset d
on k.urn = d.urn
and k.db_id = {db_id}
)
create temporary table if not exists t_deleted_fields (primary key (field_id)) ENGINE=MyISAM
SELECT x.field_id
FROM (select dataset_id, field_name, parent_path from stg_dict_field_detail where db_id = {db_id}) s
RIGHT JOIN
( select dataset_id, field_id, field_name, parent_path from dict_field_detail
where dataset_id in (select dataset_id from stg_dict_field_detail where db_id = {db_id})
) x
ON s.dataset_id = x.dataset_id
AND s.field_name = x.field_name
AND s.parent_path = x.parent_path
WHERE s.field_name is null
; -- run time : ~2min

delete from dict_field_detail where field_id in (select field_id from t_deleted_fields);
@@ -232,12 +229,11 @@ class TeradataLoad:
update dict_field_detail t join
(
select x.field_id, s.*
from stg_dict_field_detail s join dict_dataset d
on s.urn = d.urn
from stg_dict_field_detail s
join dict_field_detail x
on s.field_name = x.field_name
and coalesce(s.parent_path, '*') = coalesce(x.parent_path, '*')
and d.id = x.dataset_id
and s.dataset_id = x.dataset_id
where s.db_id = {db_id}
and (x.sort_id <> s.sort_id
or x.parent_sort_id <> s.parent_sort_id
@@ -274,7 +270,7 @@ class TeradataLoad:
modified
)
select
d.id,
s.dataset_id,
0 as fields_layout_id,
s.sort_id,
0 parent_sort_id,
@@ -285,10 +281,9 @@ class TeradataLoad:
s.data_scale,
s.is_nullable,
now()
from stg_dict_field_detail s join dict_dataset d
on s.urn = d.urn
from stg_dict_field_detail s
left join dict_field_detail f
on d.id = f.dataset_id
on s.dataset_id = f.dataset_id
and s.field_name = f.field_name
where db_id = {db_id} and f.field_id is null;

@@ -299,13 +294,12 @@ class TeradataLoad:

-- insert
insert into stg_dict_dataset_field_comment
select t.field_id field_id, fc.id comment_id, d.id dataset_id, {db_id}
from stg_dict_field_detail sf join dict_dataset d
on sf.urn = d.urn
select t.field_id field_id, fc.id comment_id, sf.dataset_id dataset_id, {db_id}
from stg_dict_field_detail sf
join field_comments fc
on sf.description = fc.comment
join dict_field_detail t
on d.id = t.dataset_id
on sf.dataset_id = t.dataset_id
and sf.field_name = t.field_name
and sf.parent_path = t.parent_path
where sf.db_id = {db_id};
@@ -328,11 +322,9 @@ class TeradataLoad:
and sd.db_id = {db_id};
'''.format(source_file=self.input_field_file, db_id=self.db_id)

for state in load_cmd.split(";"):
self.logger.debug(state)
cursor.execute(state)
self.conn_mysql.commit()
cursor.close()
self.executeCommands(load_cmd)
self.logger.info("Finish loading fields ")

def load_sample(self):
load_cmd = '''
@@ -343,11 +335,20 @@ class TeradataLoad:
(urn,ref_urn,data)
SET db_id = {db_id};

-- update reference id in stagging table
UPDATE stg_dict_dataset_sample s
LEFT JOIN dict_dataset d ON s.ref_urn = d.urn
SET s.ref_id = d.id
WHERE s.db_id = {db_id};
-- update dataset id in staging table
UPDATE stg_dict_dataset_sample s
JOIN dict_dataset d ON s.db_id = {db_id} and s.urn = d.urn
SET s.dataset_id = d.id;

-- update reference id in staging table
UPDATE stg_dict_dataset_sample s
JOIN (
select dataset_id, id from
(select dataset_id, ref_urn from stg_dict_dataset_sample
where db_id = 3 and ref_urn > '') st
join dict_dataset on urn = ref_urn) d
ON s.dataset_id = d.dataset_id
SET s.ref_id = d.id;

-- first insert ref_id as 0
INSERT INTO dict_dataset_sample
@@ -357,27 +358,24 @@ class TeradataLoad:
`data`,
created
)
select d.id as dataset_id, s.urn, s.ref_id, s.data, now()
from stg_dict_dataset_sample s left join dict_dataset d on d.urn = s.urn
where s.db_id = {db_id}
on duplicate key update
`data`=s.data, modified=now();

-- update reference id in final table
UPDATE dict_dataset_sample d
RIGHT JOIN stg_dict_dataset_sample s ON d.urn = s.urn
SET d.ref_id = s.ref_id
WHERE s.db_id = {db_id} AND d.ref_id = 0;

SELECT * from
(select dataset_id, urn, ref_id s_ref_id, `data` s_data, now()
from stg_dict_dataset_sample WHERE db_id = {db_id}) s
ON DUPLICATE KEY UPDATE
ref_id = COALESCE(s_ref_id, ref_id),
`data`=s_data,
modified=now();
'''.format(source_file=self.input_sampledata_file, db_id=self.db_id)

cursor = self.conn_mysql.cursor()
for state in load_cmd.split(";"):
self.logger.debug(state)
cursor.execute(state)
self.executeCommands(load_cmd)
self.logger.info("Finish loading samples ")

def executeCommands(self, commands):
for cmd in commands.split(";"):
self.logger.debug(cmd)
self.conn_cursor.execute(cmd)
self.conn_mysql.commit()
cursor.close()

if __name__ == "__main__":
@@ -405,10 +403,11 @@ if __name__ == "__main__":
l.db_id = args[Constant.DB_ID_KEY]
l.wh_etl_exec_id = args[Constant.WH_EXEC_ID_KEY]
l.conn_mysql = zxJDBC.connect(JDBC_URL, username, password, JDBC_DRIVER)
l.conn_cursor = l.conn_mysql.cursor()

if Constant.INNODB_LOCK_WAIT_TIMEOUT in args:
lock_wait_time = args[Constant.INNODB_LOCK_WAIT_TIMEOUT]
l.conn_mysql.cursor().execute("SET innodb_lock_wait_timeout = %s;" % lock_wait_time)
l.conn_cursor.execute("SET innodb_lock_wait_timeout = %s;" % lock_wait_time)

try:
l.load_metadata()
@@ -34,4 +34,18 @@ public class TeradataMetadataEtlTest {
TeradataMetadataEtl t = new TeradataMetadataEtl(3, 0L);
t.extract();
}

@Test(groups = {"needConfig"})
public void testTransform()
throws Exception {
TeradataMetadataEtl t = new TeradataMetadataEtl(3, 0L);
t.transform();
}

@Test(groups = {"needConfig"})
public void testLoad()
throws Exception {
TeradataMetadataEtl t = new TeradataMetadataEtl(3, 0L);
t.load();
}
}
@@ -19,7 +19,7 @@

<appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
<encoder>
<pattern>%coloredLevel - %logger - %message%n%xException</pattern>
<pattern>%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n</pattern>
</encoder>
</appender>