From 0c68d9c4fb58324450150566b70d71fb19c8554d Mon Sep 17 00:00:00 2001
From: jbai <jbai@linkedin.com>
Date: Thu, 16 Jun 2016 16:33:37 -0700
Subject: [PATCH] fix duplicated records in the dataset field

---
 metadata-etl/src/main/resources/jython/HiveExtract.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/metadata-etl/src/main/resources/jython/HiveExtract.py b/metadata-etl/src/main/resources/jython/HiveExtract.py
index 21e29265ab..3d06296c9a 100644
--- a/metadata-etl/src/main/resources/jython/HiveExtract.py
+++ b/metadata-etl/src/main/resources/jython/HiveExtract.py
@@ -196,7 +196,13 @@ class HiveExtract:
 
     field_list = []
     for row_index, row_value in enumerate(rows):
-      field_list.append({'IntegerIndex': row_value[14], 'ColumnName': row_value[15], 'TypeName': row_value[16],
+      if row_value[20].lower() == 'dalids':
+        urn = 'dalids:///' + row_value[0] + '/' + row_value[18]
+      else:
+        urn = 'hive:///' + row_value[0] + '/' + row_value[18]
+
+      if urn not in self.dataset_dict:
+        field_list.append({'IntegerIndex': row_value[14], 'ColumnName': row_value[15], 'TypeName': row_value[16],
                          'Comment': row_value[17]})
       if row_index == len(rows) - 1 or (row_value[0] != rows[row_index+1][0] or row_value[1] != rows[row_index+1][1]): # if this is last record of current table
         # sort the field_list by IntegerIndex
@@ -204,7 +210,6 @@ class HiveExtract:
         # process the record of table
 
         if row_value[20].lower() == 'dalids':
-          urn = 'dalids:///' + row_value[0] + '/' + row_value[18]
           instance_record = DatasetInstanceRecord(row_value[25],
                                       long(self.db_id),
                                       'grid',
@@ -222,8 +227,6 @@ class HiveExtract:
           self.instance_writer.append(instance_record)
           dataset_urn_idx += 1
           self.instance_dict[row_value[25]] = dataset_urn_idx
-        else:
-          urn = 'hive:///' + row_value[0] + '/' + row_value[18]
 
         if urn in self.dataset_dict:
           continue