#
# Copyright 2015 LinkedIn Corp. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#

from com.ziclix.python.sql import zxJDBC
from wherehows.common import Constant
from org.slf4j import LoggerFactory
import sys, os, datetime


class OracleLoad:

  def __init__(self, args):
    self.logger = LoggerFactory.getLogger('jython script : ' + self.__class__.__name__)

    username = args[Constant.WH_DB_USERNAME_KEY]
    password = args[Constant.WH_DB_PASSWORD_KEY]
    JDBC_DRIVER = args[Constant.WH_DB_DRIVER_KEY]
    JDBC_URL = args[Constant.WH_DB_URL_KEY]

    # input files produced by the Oracle extract step
    self.input_table_file = args[Constant.ORA_SCHEMA_OUTPUT_KEY]
    self.input_field_file = args[Constant.ORA_FIELD_OUTPUT_KEY]
    self.input_sample_file = args[Constant.ORA_SAMPLE_OUTPUT_KEY]
    self.db_id = args[Constant.DB_ID_KEY]
    self.wh_etl_exec_id = args[Constant.WH_EXEC_ID_KEY]

    # connection to the WhereHows MySQL database
    self.conn_mysql = zxJDBC.connect(JDBC_URL, username, password, JDBC_DRIVER)
    self.conn_cursor = self.conn_mysql.cursor()

    # optionally override the InnoDB lock wait timeout
    if Constant.INNODB_LOCK_WAIT_TIMEOUT in args:
      lock_wait_time = args[Constant.INNODB_LOCK_WAIT_TIMEOUT]
      self.conn_cursor.execute("SET innodb_lock_wait_timeout = %s;" % lock_wait_time)

    self.logger.info("Load Oracle Metadata into {}, db_id {}, wh_exec_id {}"
                     .format(JDBC_URL, self.db_id, self.wh_etl_exec_id))

    self.dict_dataset_table = 'dict_dataset'
    self.field_comments_table = 'field_comments'
    self.dict_field_table = 'dict_field_detail'
    self.dict_field_comment_table = 'dict_dataset_field_comment'
    self.dict_dataset_sample_table = 'dict_dataset_sample'

  def load_tables(self):
    load_tables_cmd = '''
    DELETE FROM stg_dict_dataset WHERE db_id = {db_id};

    -- load into stg table
    LOAD DATA LOCAL INFILE '{source_file}'
    INTO TABLE stg_dict_dataset
    FIELDS TERMINATED BY '\Z' ESCAPED BY '\0'
    IGNORE 1 LINES
    (`name`, `schema`, `schema_type`, `properties`, `urn`, `source`, `location_prefix`, `parent_name`,
     `storage_type`, `dataset_type`, `is_partitioned`)
    SET db_id = {db_id},
        wh_etl_exec_id = {wh_etl_exec_id};

    -- insert into final table
    INSERT INTO {dict_dataset}
    ( `name`,
      `schema`,
      schema_type,
      `fields`,
      properties,
      urn,
      `source`,
      location_prefix,
      parent_name,
      storage_type,
      ref_dataset_id,
      status_id,
      dataset_type,
      hive_serdes_class,
      is_partitioned,
      partition_layout_pattern_id,
      sample_partition_full_path,
      source_created_time,
      source_modified_time,
      created_time,
      wh_etl_exec_id
    )
    select s.name, s.schema, s.schema_type, s.fields, s.properties, s.urn, s.source, s.location_prefix,
        s.parent_name, s.storage_type, s.ref_dataset_id, s.status_id, s.dataset_type, s.hive_serdes_class,
        s.is_partitioned, s.partition_layout_pattern_id, s.sample_partition_full_path,
        s.source_created_time, s.source_modified_time, UNIX_TIMESTAMP(now()), s.wh_etl_exec_id
    from stg_dict_dataset s
    where s.db_id = {db_id}
    on duplicate key update
        `name`=s.name, `schema`=s.schema, schema_type=s.schema_type, `fields`=s.fields,
        properties=s.properties, `source`=s.source, location_prefix=s.location_prefix,
        parent_name=s.parent_name, storage_type=s.storage_type, ref_dataset_id=s.ref_dataset_id,
        status_id=s.status_id, dataset_type=s.dataset_type, hive_serdes_class=s.hive_serdes_class,
        is_partitioned=s.is_partitioned,
        partition_layout_pattern_id=s.partition_layout_pattern_id,
        sample_partition_full_path=s.sample_partition_full_path,
        source_created_time=s.source_created_time, source_modified_time=s.source_modified_time,
        modified_time=UNIX_TIMESTAMP(now()), wh_etl_exec_id=s.wh_etl_exec_id
    ;

    analyze table {dict_dataset};
    '''.format(source_file=self.input_table_file, db_id=self.db_id,
               wh_etl_exec_id=self.wh_etl_exec_id, dict_dataset=self.dict_dataset_table)

    self.executeCommands(load_tables_cmd)
    self.logger.info("finish loading oracle table metadata from {} to {}"
                     .format(self.input_table_file, self.dict_dataset_table))

  def load_fields(self):
    load_fields_cmd = '''
    DELETE FROM stg_dict_field_detail where db_id = {db_id};

    LOAD DATA LOCAL INFILE '{source_file}'
    INTO TABLE stg_dict_field_detail
    FIELDS TERMINATED BY '\Z' ESCAPED BY '\0'
    IGNORE 1 LINES
    (urn, sort_id, field_name, data_type, is_nullable, @vdata_size, @vdata_precision, @vdata_scale,
     @vdefault_value, description)
    SET db_id = {db_id},
        data_size = nullif(@vdata_size,''),
        data_precision = nullif(@vdata_precision,''),
        data_scale = nullif(@vdata_scale,''),
        default_value = nullif(@vdefault_value,'')
    ;
    -- show warnings limit 20;
    analyze table stg_dict_field_detail;

    update stg_dict_field_detail
    set default_value = trim(default_value)
    where db_id = {db_id};

    update stg_dict_field_detail
    set description = null
    where db_id = {db_id}
      and (char_length(trim(description)) = 0
        or description in ('null', 'N/A', 'nothing', 'empty', 'none'));

    insert into {field_comments} (
      user_id, comment, created, modified, comment_crc32_checksum
    )
    select 0 user_id, description, now() created, now() modified, crc32(description)
    from (
      select sf.description
      from stg_dict_field_detail sf
        left join {field_comments} fc
          on sf.description = fc.comment
      where sf.description is not null
        and fc.id is null
        and sf.db_id = {db_id}
      group by 1 order by 1
    ) d;
    analyze table {field_comments};

    -- delete old records that no longer exist in this load batch (but still have the dataset id)
    create temporary table if not exists t_deleted_fields (primary key (field_id))
    select x.field_id
    from stg_dict_field_detail s
      join {dict_dataset} i
        on s.urn = i.urn
        and s.db_id = {db_id}
      right join {dict_field_detail} x
        on i.id = x.dataset_id
        and s.field_name = x.field_name
        and s.parent_path = x.parent_path
    where s.field_name is null
      and x.dataset_id in (
        select d.id dataset_id
        from stg_dict_field_detail k
          join {dict_dataset} d
            on k.urn = d.urn
            and k.db_id = {db_id}
      )
    ; -- run time : ~2min

    delete from {dict_field_detail} where field_id in (select field_id from t_deleted_fields);

    -- update the old record if something changed
    update {dict_field_detail} t join
    (
      select x.field_id, s.*
      from stg_dict_field_detail s
        join {dict_dataset} d
          on s.urn = d.urn
        join {dict_field_detail} x
          on s.field_name = x.field_name
          and coalesce(s.parent_path, '*') = coalesce(x.parent_path, '*')
          and d.id = x.dataset_id
      where s.db_id = {db_id}
        and (x.sort_id <> s.sort_id
          or x.parent_sort_id <> s.parent_sort_id
          or x.data_type <> s.data_type
          or x.data_size <> s.data_size or (x.data_size is null XOR s.data_size is null)
          or x.data_precision <> s.data_precision or (x.data_precision is null XOR s.data_precision is null)
          or x.is_nullable <> s.is_nullable or (x.is_nullable is null XOR s.is_nullable is null)
          or x.is_partitioned <> s.is_partitioned or (x.is_partitioned is null XOR s.is_partitioned is null)
          or x.is_distributed <> s.is_distributed or (x.is_distributed is null XOR s.is_distributed is null)
          or x.default_value <> s.default_value or (x.default_value is null XOR s.default_value is null)
          or x.namespace <> s.namespace or (x.namespace is null XOR s.namespace is null)
        )
    ) p
      on t.field_id = p.field_id
    set t.sort_id = p.sort_id,
        t.parent_sort_id = p.parent_sort_id,
        t.data_type = p.data_type,
        t.data_size = p.data_size,
        t.data_precision = p.data_precision,
        t.is_nullable = p.is_nullable,
        t.is_partitioned = p.is_partitioned,
        t.is_distributed = p.is_distributed,
        t.default_value = p.default_value,
        t.namespace = p.namespace,
        t.modified = now()
    ;

    insert into {dict_field_detail} (
      dataset_id, fields_layout_id, sort_id, parent_sort_id, parent_path,
      field_name, namespace, data_type, data_size, is_nullable, default_value, modified
    )
    select
      d.id,
      0,
      sf.sort_id,
      sf.parent_sort_id,
      sf.parent_path,
      sf.field_name,
      sf.namespace,
      sf.data_type,
      sf.data_size,
      sf.is_nullable,
      sf.default_value,
      now()
    from stg_dict_field_detail sf
      join {dict_dataset} d
        on sf.urn = d.urn
      left join {dict_field_detail} t
        on d.id = t.dataset_id
        and sf.field_name = t.field_name
        and sf.parent_path = t.parent_path
    where db_id = {db_id}
      and t.field_id is null
    ;
    analyze table {dict_field_detail};

    -- delete old records in the staging table
    delete from stg_dict_dataset_field_comment where db_id = {db_id};

    -- insert field-comment mappings into the staging table
    insert into stg_dict_dataset_field_comment
    select t.field_id field_id, fc.id comment_id, d.id dataset_id, {db_id}
    from stg_dict_field_detail sf
      join {dict_dataset} d
        on sf.urn = d.urn
      join {field_comments} fc
        on sf.description = fc.comment
      join {dict_field_detail} t
        on d.id = t.dataset_id
        and sf.field_name = t.field_name
        and sf.parent_path = t.parent_path
    where sf.db_id = {db_id};

    -- field already has a default comment: insert this one with is_default = 0
    insert ignore into {dict_dataset_field_comment}
    select field_id, comment_id, dataset_id, 0 is_default
    from stg_dict_dataset_field_comment
    where field_id in (
      select field_id from {dict_dataset_field_comment}
      where field_id in (select field_id from stg_dict_dataset_field_comment)
        and is_default = 1
    ) and db_id = {db_id};

    -- field does not have this comment yet: insert it and set it as the default
    insert ignore into {dict_dataset_field_comment}
    select sd.field_id, sd.comment_id, sd.dataset_id, 1
    from stg_dict_dataset_field_comment sd
      left join {dict_dataset_field_comment} d
        on d.field_id = sd.field_id
        and d.comment_id = sd.comment_id
    where d.comment_id is null
      and sd.db_id = {db_id};
    '''.format(source_file=self.input_field_file, db_id=self.db_id, wh_etl_exec_id=self.wh_etl_exec_id,
               dict_dataset=self.dict_dataset_table, dict_field_detail=self.dict_field_table,
               field_comments=self.field_comments_table, dict_dataset_field_comment=self.dict_field_comment_table)

    self.executeCommands(load_fields_cmd)
    self.logger.info("finish loading oracle table fields from {} to {}"
                     .format(self.input_field_file, self.dict_field_table))

  def load_sample(self):
    load_sample_cmd = '''
    DELETE FROM stg_dict_dataset_sample where db_id = {db_id};

    LOAD DATA LOCAL INFILE '{source_file}'
    INTO TABLE stg_dict_dataset_sample
    FIELDS TERMINATED BY '\Z' ESCAPED BY '\0'
    IGNORE 1 LINES
    (urn,ref_urn,data)
    SET db_id = {db_id};

    -- update reference id in staging table
    UPDATE stg_dict_dataset_sample s
    LEFT JOIN {dict_dataset} d ON s.ref_urn = d.urn
    SET s.ref_id = d.id
    WHERE s.db_id = {db_id};

    -- first insert ref_id as 0
    INSERT INTO {dict_dataset_sample}
    ( `dataset_id`,
      `urn`,
      `ref_id`,
      `data`,
      created
    )
    select d.id as dataset_id, s.urn, s.ref_id, s.data, now()
    from stg_dict_dataset_sample s
      left join {dict_dataset} d on d.urn = s.urn
    where s.db_id = {db_id}
    on duplicate key update
      `data`=s.data,
      modified=now();

    -- update reference id in final table
    UPDATE {dict_dataset_sample} d
    RIGHT JOIN stg_dict_dataset_sample s ON d.urn = s.urn
    SET d.ref_id = s.ref_id
    WHERE s.db_id = {db_id} AND d.ref_id = 0;
    '''.format(source_file=self.input_sample_file, db_id=self.db_id,
               dict_dataset=self.dict_dataset_table, dict_dataset_sample=self.dict_dataset_sample_table)

    self.executeCommands(load_sample_cmd)
    self.logger.info("finish loading oracle sample data from {} to {}"
                     .format(self.input_sample_file, self.dict_dataset_sample_table))

  def executeCommands(self, commands):
    for cmd in commands.split(";"):
      self.logger.debug(cmd)
      self.conn_cursor.execute(cmd)
      self.conn_mysql.commit()

  def run(self):
    try:
      begin = datetime.datetime.now().strftime("%H:%M:%S")
      self.load_tables()
      self.load_fields()
      # self.load_sample()
      end = datetime.datetime.now().strftime("%H:%M:%S")
      self.logger.info("Load Oracle metadata [%s -> %s]" % (str(begin), str(end)))
    finally:
      self.conn_cursor.close()
      self.conn_mysql.close()


if __name__ == "__main__":
  args = sys.argv[1]

  l = OracleLoad(args)
  l.run()
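
# Note: this script is expected to be launched by the WhereHows Jython ETL runner, which
# (judging from how `args` is indexed above with Constant.* keys) passes the job
# configuration as a dict-like object in sys.argv[1]; invoking it directly with a plain
# string argument would fail at the key lookups in __init__.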