Modify Oracle metadata ETL job: use json.dumps and remove unnecessary quotes

Yi Wang 2016-08-03 18:49:00 -07:00
parent b4a718efd0
commit dbbdb6e2fb
2 changed files with 89 additions and 68 deletions
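
The gist of the commit, as a minimal sketch (illustrative only, not the actual WhereHows code; the table name, values and output path below are made up): nested schema and property dicts are serialized with json.dumps before they are written out, and csv.DictWriter runs with QUOTE_NONE plus dummy quote/escape characters so it adds no quote characters of its own. Python 2 style, matching the Jython ETL scripts.

import csv
import json

# Illustrative sketch only: serialize nested metadata to JSON strings and
# write a CSV without any added quote characters.
schema_dict = {"fields": [{"name": "ID", "data_type": "NUMBER"}], "num_fields": 1}
table_record = {
    "name": "EMPLOYEES",                         # made-up table name
    "columns": json.dumps(schema_dict),          # JSON string, not a dict repr
    "properties": json.dumps({"indexes": None, "partition_column": None}),
    "schema_type": "JSON",
}

csv_columns = ["name", "columns", "properties", "schema_type"]
csvfile = open("/tmp/table_metadata.csv", "wb")  # made-up output path
writer = csv.DictWriter(csvfile, fieldnames=csv_columns, delimiter="\x1A", lineterminator="\n",
                        quoting=csv.QUOTE_NONE, quotechar="\1", escapechar="\0")
writer.writeheader()
writer.writerow(table_record)
csvfile.close()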

Changed file 1 of 2

@@ -199,7 +199,6 @@ class OracleExtract:
     '''
     schema_dict = {"fields": []}
     table_record = {}
-    field_record = {}
     table_idx = 0
     field_idx = 0
@@ -211,14 +210,18 @@ class OracleExtract:
       # This is a new table. Let's push the previous table record into output_list
       if 'urn' in table_record:
         schema_dict["num_fields"] = field_idx
-        table_record['columns'] = schema_dict
+        table_record["columns"] = json.dumps(schema_dict)
         self.table_output_list.append(table_record)
+      properties = {
+        "indexes": self.table_dict[table_name_key].get("indexes"),
+        "partition_column": self.table_dict[table_name_key].get("partition_column")
+      }
       table_record = {
         "name": row[1],
-        "columns": {},
+        "columns": None,
         "schema_type": "JSON",
-        "properties": self.table_dict[table_name_key],
+        "properties": json.dumps(properties),
         "urn": table_urn,
         "source": "Oracle",
         "location_prefix": row[0],
@@ -249,7 +252,7 @@ class OracleExtract:
     # finish all remaining rows
     schema_dict["num_fields"] = field_idx
-    table_record['columns'] = schema_dict
+    table_record["columns"] = json.dumps(schema_dict)
     self.table_output_list.append(table_record)
     self.logger.info("%d Table records generated" % table_idx)
@@ -304,7 +307,17 @@ class OracleExtract:
     return None

   def trim_newline(self, line):
-    return None if line is None else line.replace('\n', ' ').replace('\r', ' ')
+    return None if line is None else line.replace('\n', ' ').replace('\r', ' ').encode('ascii', 'ignore')
+
+  def write_csv(self, csv_filename, csv_columns, data_list):
+    csvfile = open(csv_filename, 'wb')
+    os.chmod(csv_filename, 0644)
+    writer = csv.DictWriter(csvfile, fieldnames=csv_columns, delimiter='\x1A', lineterminator='\n',
+                            quoting=csv.QUOTE_NONE, quotechar='\1', escapechar='\0')
+    writer.writeheader()
+    for data in data_list:
+      writer.writerow(data)
+    csvfile.close()

   def run(self, database_name, table_name, table_output_file, field_output_file, sample_output_file, sample=False):
@@ -323,34 +336,25 @@ class OracleExtract:
     begin = datetime.datetime.now().strftime("%H:%M:%S")
     # table info
     rows = self.get_table_info(None, None)
+    self.get_extra_table_info()
     self.format_table_metadata(rows)
     end = datetime.datetime.now().strftime("%H:%M:%S")
     self.logger.info("Collecting table info [%s -> %s]" % (str(begin), str(end)))

     csv_columns = ['name', 'columns', 'schema_type', 'properties', 'urn', 'source', 'location_prefix',
                    'parent_name', 'storage_type', 'dataset_type', 'is_partitioned']
-    csvfile = open(table_output_file, 'wb')
-    os.chmod(table_output_file, 0666)
-    writer = csv.DictWriter(csvfile, fieldnames=csv_columns, delimiter='\x1A', lineterminator='\n')
-    writer.writeheader()
-    for data in self.table_output_list:
-      writer.writerow(data)
-    csvfile.close
+    self.write_csv(table_output_file, csv_columns, self.table_output_list)

     csv_columns = ['dataset_urn', 'sort_id', 'name', 'data_type', 'nullable',
                    'size', 'precision', 'scale', 'default_value', 'doc']
-    csvfile = open(field_output_file, 'wb')
-    os.chmod(field_output_file, 0666)
-    writer = csv.DictWriter(csvfile, fieldnames=csv_columns, delimiter='\x1A', lineterminator='\n')
-    writer.writeheader()
-    for data in self.field_output_list:
-      writer.writerow(data)
-    csvfile.close
+    self.write_csv(field_output_file, csv_columns, self.field_output_list)

     if sample:
       csvfile = open(sample_output_file, 'wb')
       os.chmod(sample_output_file, 0666)
-      writer = csv.DictWriter(csvfile, fieldnames=csv_columns, delimiter='\x1A', lineterminator='\n')
+      writer = csv.DictWriter(csvfile, fieldnames=csv_columns, delimiter='\x1A', lineterminator='\n',
+                              quoting=csv.QUOTE_NONE, quotechar='\1', escapechar='\0')
+      self.logger.info("Writing to CSV file {}".format(sample_output_file))
       # collect sample data
       for onedatabase in schema:
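
One subtle piece of the extract-side change above is the new encode('ascii', 'ignore') in trim_newline: besides flattening newlines, it now silently drops non-ASCII characters from whatever text it is applied to. A quick illustration (sketch only; the sample string is invented, Python 2 semantics):

# -*- coding: utf-8 -*-

def trim_newline(line):
    # Same logic as the updated method in the diff above: newlines and carriage
    # returns become spaces, then non-ASCII characters are dropped by encode().
    return None if line is None else line.replace('\n', ' ').replace('\r', ' ').encode('ascii', 'ignore')

print(trim_newline(u'Employee name\nas entered \u2013 legal name'))
# prints: Employee name as entered  legal name   (the en dash is gone)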

Changed file 2 of 2

@@ -15,7 +15,7 @@
 from com.ziclix.python.sql import zxJDBC
 from wherehows.common import Constant
 from org.slf4j import LoggerFactory
-import datetime
+import sys, os, datetime

 class OracleLoad:
@@ -39,10 +39,19 @@ class OracleLoad:
     lock_wait_time = args[Constant.INNODB_LOCK_WAIT_TIMEOUT]
     self.conn_cursor.execute("SET innodb_lock_wait_timeout = %s;" % lock_wait_time)

+    self.logger.info("Load Oracle Metadata into {}, db_id {}, wh_exec_id {}"
+                     .format(JDBC_URL, self.db_id, self.wh_etl_exec_id))
+
+    self.dict_dataset_table = 'dict_dataset'
+    self.field_comments_table = 'field_comments'
+    self.dict_field_table = 'dict_field_detail'
+    self.dict_field_comment_table = 'dict_dataset_field_comment'
+    self.dict_dataset_sample_table = 'dict_dataset_sample'
+
   def load_tables(self):
     load_tables_cmd = '''
-    DELETE FROM stg_dict_dataset WHERE db_id = '{db_id}';
+    DELETE FROM stg_dict_dataset WHERE db_id = {db_id};

     -- load into stg table
     LOAD DATA LOCAL INFILE '{source_file}'
@@ -55,7 +64,7 @@ class OracleLoad:
     wh_etl_exec_id = {wh_etl_exec_id};

     -- insert into final table
-    INSERT INTO dict_dataset
+    INSERT INTO {dict_dataset}
     ( `name`,
       `schema`,
       schema_type,
@@ -97,16 +106,18 @@ class OracleLoad:
     modified_time=UNIX_TIMESTAMP(now()), wh_etl_exec_id=s.wh_etl_exec_id
     ;

-    analyze table dict_dataset;
+    analyze table {dict_dataset};

-    '''.format(source_file=self.input_table_file, db_id=self.db_id, wh_etl_exec_id=self.wh_etl_exec_id)
+    '''.format(source_file=self.input_table_file, db_id=self.db_id, wh_etl_exec_id=self.wh_etl_exec_id,
+               dict_dataset=self.dict_dataset_table)

     self.executeCommands(load_tables_cmd)
-    self.logger.info("finish loading oracle table metadata")
+    self.logger.info("finish loading oracle table metadata from {} to {}"
+                     .format(self.input_table_file, self.dict_dataset_table))

   def load_fields(self):
     load_fields_cmd = '''
-    DELETE FROM stg_dict_field_detail where db_id = '{db_id}';
+    DELETE FROM stg_dict_field_detail where db_id = {db_id};

     LOAD DATA LOCAL INFILE '{source_file}'
     INTO TABLE stg_dict_field_detail
@@ -132,13 +143,13 @@ class OracleLoad:
     and (char_length(trim(description)) = 0
       or description in ('null', 'N/A', 'nothing', 'empty', 'none'));

-    insert into field_comments (
+    insert into {field_comments} (
       user_id, comment, created, modified, comment_crc32_checksum
     )
     select 0 user_id, description, now() created, now() modified, crc32(description) from
     (
       select sf.description
-      from stg_dict_field_detail sf left join field_comments fc
+      from stg_dict_field_detail sf left join {field_comments} fc
       on sf.description = fc.comment
       where sf.description is not null
       and fc.id is null
@@ -146,40 +157,41 @@ class OracleLoad:
       group by 1 order by 1
     ) d;

-    analyze table field_comments;
+    analyze table {field_comments};

     -- delete old record if it does not exist in this load batch anymore (but have the dataset id)
     create temporary table if not exists t_deleted_fields (primary key (field_id))
     select x.field_id
     from stg_dict_field_detail s
-    join dict_dataset i
+    join {dict_dataset} i
     on s.urn = i.urn
     and s.db_id = {db_id}
-    right join dict_field_detail x
+    right join {dict_field_detail} x
     on i.id = x.dataset_id
     and s.field_name = x.field_name
     and s.parent_path = x.parent_path
     where s.field_name is null
     and x.dataset_id in (
       select d.id dataset_id
-      from stg_dict_field_detail k join dict_dataset d
+      from stg_dict_field_detail k join {dict_dataset} d
       on k.urn = d.urn
       and k.db_id = {db_id}
     )
     ; -- run time : ~2min

-    delete from dict_field_detail where field_id in (select field_id from t_deleted_fields);
+    delete from {dict_field_detail} where field_id in (select field_id from t_deleted_fields);

     -- update the old record if some thing changed
-    update dict_field_detail t join
+    update {dict_field_detail} t join
     (
       select x.field_id, s.*
-      from stg_dict_field_detail s join dict_dataset d
+      from stg_dict_field_detail s
+      join {dict_dataset} d
       on s.urn = d.urn
-      join dict_field_detail x
+      join {dict_field_detail} x
       on s.field_name = x.field_name
       and coalesce(s.parent_path, '*') = coalesce(x.parent_path, '*')
       and d.id = x.dataset_id
       where s.db_id = {db_id}
       and (x.sort_id <> s.sort_id
         or x.parent_sort_id <> s.parent_sort_id
@@ -207,24 +219,23 @@ class OracleLoad:
     t.modified = now()
     ;

-    insert into dict_field_detail (
+    insert into {dict_field_detail} (
       dataset_id, fields_layout_id, sort_id, parent_sort_id, parent_path,
-      field_name, namespace, data_type, data_size, is_nullable, default_value,
-      modified
+      field_name, namespace, data_type, data_size, is_nullable, default_value, modified
     )
     select
       d.id, 0, sf.sort_id, sf.parent_sort_id, sf.parent_path,
       sf.field_name, sf.namespace, sf.data_type, sf.data_size, sf.is_nullable, sf.default_value, now()
-    from stg_dict_field_detail sf join dict_dataset d
+    from stg_dict_field_detail sf join {dict_dataset} d
     on sf.urn = d.urn
-    left join dict_field_detail t
+    left join {dict_field_detail} t
     on d.id = t.dataset_id
     and sf.field_name = t.field_name
     and sf.parent_path = t.parent_path
     where db_id = {db_id} and t.field_id is null
     ;

-    analyze table dict_field_detail;
+    analyze table {dict_field_detail};

     -- delete old record in stagging
     delete from stg_dict_dataset_field_comment where db_id = {db_id};
@@ -232,36 +243,40 @@ class OracleLoad:
     -- insert
     insert into stg_dict_dataset_field_comment
     select t.field_id field_id, fc.id comment_id, d.id dataset_id, {db_id}
-    from stg_dict_field_detail sf join dict_dataset d
+    from stg_dict_field_detail sf join {dict_dataset} d
     on sf.urn = d.urn
-    join field_comments fc
+    join {field_comments} fc
    on sf.description = fc.comment
-    join dict_field_detail t
+    join {dict_field_detail} t
     on d.id = t.dataset_id
     and sf.field_name = t.field_name
     and sf.parent_path = t.parent_path
     where sf.db_id = {db_id};

     -- have default comment, insert it set default to 0
-    insert ignore into dict_dataset_field_comment
+    insert ignore into {dict_dataset_field_comment}
     select field_id, comment_id, dataset_id, 0 is_default from stg_dict_dataset_field_comment where field_id in (
-      select field_id from dict_dataset_field_comment
+      select field_id from {dict_dataset_field_comment}
       where field_id in (select field_id from stg_dict_dataset_field_comment)
       and is_default = 1 ) and db_id = {db_id};

     -- doesn't have this comment before, insert into it and set as default
-    insert ignore into dict_dataset_field_comment
-    select sd.field_id, sd.comment_id, sd.dataset_id, 1 from stg_dict_dataset_field_comment sd
-    left join dict_dataset_field_comment d
-    on d.field_id = sd.field_id
-    and d.comment_id = sd.comment_id
+    insert ignore into {dict_dataset_field_comment}
+    select sd.field_id, sd.comment_id, sd.dataset_id, 1
+    from stg_dict_dataset_field_comment sd
+    left join {dict_dataset_field_comment} d
+    on d.field_id = sd.field_id
+    and d.comment_id = sd.comment_id
     where d.comment_id is null
     and sd.db_id = {db_id};
-    '''.format(source_file=self.input_field_file, db_id=self.db_id, wh_etl_exec_id=self.wh_etl_exec_id)
+    '''.format(source_file=self.input_field_file, db_id=self.db_id, wh_etl_exec_id=self.wh_etl_exec_id,
+               dict_dataset=self.dict_dataset_table, dict_field_detail=self.dict_field_table,
+               field_comments=self.field_comments_table, dict_dataset_field_comment=self.dict_field_comment_table)

     self.executeCommands(load_fields_cmd)
-    self.logger.info("finish loading oracle table fields")
+    self.logger.info("finish loading oracle table fields from {} to {}"
+                     .format(self.input_field_file, self.dict_field_table))

   def load_sample(self):
@@ -277,12 +292,12 @@ class OracleLoad:
     -- update reference id in stagging table
     UPDATE stg_dict_dataset_sample s
-    LEFT JOIN dict_dataset d ON s.ref_urn = d.urn
+    LEFT JOIN {dict_dataset} d ON s.ref_urn = d.urn
     SET s.ref_id = d.id
     WHERE s.db_id = {db_id};

     -- first insert ref_id as 0
-    INSERT INTO dict_dataset_sample
+    INSERT INTO {dict_dataset_sample}
     ( `dataset_id`,
       `urn`,
       `ref_id`,
@@ -290,20 +305,22 @@ class OracleLoad:
       created
     )
     select d.id as dataset_id, s.urn, s.ref_id, s.data, now()
-    from stg_dict_dataset_sample s left join dict_dataset d on d.urn = s.urn
+    from stg_dict_dataset_sample s left join {dict_dataset} d on d.urn = s.urn
     where s.db_id = {db_id}
     on duplicate key update
     `data`=s.data, modified=now();

     -- update reference id in final table
-    UPDATE dict_dataset_sample d
+    UPDATE {dict_dataset_sample} d
     RIGHT JOIN stg_dict_dataset_sample s ON d.urn = s.urn
     SET d.ref_id = s.ref_id
     WHERE s.db_id = {db_id} AND d.ref_id = 0;
-    '''.format(source_file=self.input_sample_file, db_id=self.db_id)
+    '''.format(source_file=self.input_sample_file, db_id=self.db_id,
+               dict_dataset=self.dict_dataset_table, dict_dataset_sample=self.dict_dataset_sample_table)

     self.executeCommands(load_sample_cmd)
-    self.logger.info("finish loading oracle sample data")
+    self.logger.info("finish loading oracle sample data from {} to {}"
+                     .format(self.input_sample_file, self.dict_dataset_sample_table))

   def executeCommands(self, commands):
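
The load-side pattern above is plain string templating: target table names are kept in instance attributes set in __init__ and substituted into the SQL with str.format, alongside the now-unquoted numeric db_id. A condensed sketch of that pattern (simplified; the statement below is made up, the real templates are in the diff above):

class LoaderSketch(object):
    # Condensed illustration of the table-name templating used in OracleLoad;
    # the SQL here is simplified and is not the real load statement.
    def __init__(self, db_id):
        self.db_id = db_id
        self.dict_dataset_table = 'dict_dataset'

    def build_load_cmd(self, source_file):
        return '''
        DELETE FROM stg_dict_dataset WHERE db_id = {db_id};
        LOAD DATA LOCAL INFILE '{source_file}' INTO TABLE stg_dict_dataset;
        INSERT INTO {dict_dataset} SELECT * FROM stg_dict_dataset WHERE db_id = {db_id};
        '''.format(source_file=source_file, db_id=self.db_id,
                   dict_dataset=self.dict_dataset_table)

print(LoaderSketch(3).build_load_cmd('/tmp/table_metadata.csv'))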