Fix the issue where the dataset field has duplicated records

This commit is contained in:
jbai 2016-06-16 16:33:37 -07:00
parent 38fdf1c132
commit 0c68d9c4fb

View File

@ -196,6 +196,12 @@ class HiveExtract:
field_list = [] field_list = []
for row_index, row_value in enumerate(rows): for row_index, row_value in enumerate(rows):
if row_value[20].lower() == 'dalids':
urn = 'dalids:///' + row_value[0] + '/' + row_value[18]
else:
urn = 'hive:///' + row_value[0] + '/' + row_value[18]
if urn not in self.dataset_dict:
field_list.append({'IntegerIndex': row_value[14], 'ColumnName': row_value[15], 'TypeName': row_value[16], field_list.append({'IntegerIndex': row_value[14], 'ColumnName': row_value[15], 'TypeName': row_value[16],
'Comment': row_value[17]}) 'Comment': row_value[17]})
if row_index == len(rows) - 1 or (row_value[0] != rows[row_index+1][0] or row_value[1] != rows[row_index+1][1]): # if this is last record of current table if row_index == len(rows) - 1 or (row_value[0] != rows[row_index+1][0] or row_value[1] != rows[row_index+1][1]): # if this is last record of current table
@ -204,7 +210,6 @@ class HiveExtract:
# process the record of table # process the record of table
if row_value[20].lower() == 'dalids': if row_value[20].lower() == 'dalids':
urn = 'dalids:///' + row_value[0] + '/' + row_value[18]
instance_record = DatasetInstanceRecord(row_value[25], instance_record = DatasetInstanceRecord(row_value[25],
long(self.db_id), long(self.db_id),
'grid', 'grid',
@ -222,8 +227,6 @@ class HiveExtract:
self.instance_writer.append(instance_record) self.instance_writer.append(instance_record)
dataset_urn_idx += 1 dataset_urn_idx += 1
self.instance_dict[row_value[25]] = dataset_urn_idx self.instance_dict[row_value[25]] = dataset_urn_idx
else:
urn = 'hive:///' + row_value[0] + '/' + row_value[18]
if urn in self.dataset_dict: if urn in self.dataset_dict:
continue continue