mirror of
https://github.com/datahub-project/datahub.git
synced 2025-08-23 16:48:12 +00:00
Fix the issue where the dataset field has duplicated records
This commit is contained in:
parent
38fdf1c132
commit
0c68d9c4fb
@ -196,7 +196,13 @@ class HiveExtract:
|
|||||||
|
|
||||||
field_list = []
|
field_list = []
|
||||||
for row_index, row_value in enumerate(rows):
|
for row_index, row_value in enumerate(rows):
|
||||||
field_list.append({'IntegerIndex': row_value[14], 'ColumnName': row_value[15], 'TypeName': row_value[16],
|
if row_value[20].lower() == 'dalids':
|
||||||
|
urn = 'dalids:///' + row_value[0] + '/' + row_value[18]
|
||||||
|
else:
|
||||||
|
urn = 'hive:///' + row_value[0] + '/' + row_value[18]
|
||||||
|
|
||||||
|
if urn not in self.dataset_dict:
|
||||||
|
field_list.append({'IntegerIndex': row_value[14], 'ColumnName': row_value[15], 'TypeName': row_value[16],
|
||||||
'Comment': row_value[17]})
|
'Comment': row_value[17]})
|
||||||
if row_index == len(rows) - 1 or (row_value[0] != rows[row_index+1][0] or row_value[1] != rows[row_index+1][1]): # if this is last record of current table
|
if row_index == len(rows) - 1 or (row_value[0] != rows[row_index+1][0] or row_value[1] != rows[row_index+1][1]): # if this is last record of current table
|
||||||
# sort the field_list by IntegerIndex
|
# sort the field_list by IntegerIndex
|
||||||
@ -204,7 +210,6 @@ class HiveExtract:
|
|||||||
# process the record of table
|
# process the record of table
|
||||||
|
|
||||||
if row_value[20].lower() == 'dalids':
|
if row_value[20].lower() == 'dalids':
|
||||||
urn = 'dalids:///' + row_value[0] + '/' + row_value[18]
|
|
||||||
instance_record = DatasetInstanceRecord(row_value[25],
|
instance_record = DatasetInstanceRecord(row_value[25],
|
||||||
long(self.db_id),
|
long(self.db_id),
|
||||||
'grid',
|
'grid',
|
||||||
@ -222,8 +227,6 @@ class HiveExtract:
|
|||||||
self.instance_writer.append(instance_record)
|
self.instance_writer.append(instance_record)
|
||||||
dataset_urn_idx += 1
|
dataset_urn_idx += 1
|
||||||
self.instance_dict[row_value[25]] = dataset_urn_idx
|
self.instance_dict[row_value[25]] = dataset_urn_idx
|
||||||
else:
|
|
||||||
urn = 'hive:///' + row_value[0] + '/' + row_value[18]
|
|
||||||
|
|
||||||
if urn in self.dataset_dict:
|
if urn in self.dataset_dict:
|
||||||
continue
|
continue
|
||||||
|
Loading…
x
Reference in New Issue
Block a user