From 0c68d9c4fb58324450150566b70d71fb19c8554d Mon Sep 17 00:00:00 2001 From: jbai Date: Thu, 16 Jun 2016 16:33:37 -0700 Subject: [PATCH] fix the dataset field has duplicated records issue --- metadata-etl/src/main/resources/jython/HiveExtract.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/metadata-etl/src/main/resources/jython/HiveExtract.py b/metadata-etl/src/main/resources/jython/HiveExtract.py index 21e29265ab..3d06296c9a 100644 --- a/metadata-etl/src/main/resources/jython/HiveExtract.py +++ b/metadata-etl/src/main/resources/jython/HiveExtract.py @@ -196,7 +196,13 @@ class HiveExtract: field_list = [] for row_index, row_value in enumerate(rows): - field_list.append({'IntegerIndex': row_value[14], 'ColumnName': row_value[15], 'TypeName': row_value[16], + if row_value[20].lower() == 'dalids': + urn = 'dalids:///' + row_value[0] + '/' + row_value[18] + else: + urn = 'hive:///' + row_value[0] + '/' + row_value[18] + + if urn not in self.dataset_dict: + field_list.append({'IntegerIndex': row_value[14], 'ColumnName': row_value[15], 'TypeName': row_value[16], 'Comment': row_value[17]}) if row_index == len(rows) - 1 or (row_value[0] != rows[row_index+1][0] or row_value[1] != rows[row_index+1][1]): # if this is last record of current table # sort the field_list by IntegerIndex @@ -204,7 +210,6 @@ class HiveExtract: # process the record of table if row_value[20].lower() == 'dalids': - urn = 'dalids:///' + row_value[0] + '/' + row_value[18] instance_record = DatasetInstanceRecord(row_value[25], long(self.db_id), 'grid', @@ -222,8 +227,6 @@ class HiveExtract: self.instance_writer.append(instance_record) dataset_urn_idx += 1 self.instance_dict[row_value[25]] = dataset_urn_idx - else: - urn = 'hive:///' + row_value[0] + '/' + row_value[18] if urn in self.dataset_dict: continue