Mirror of https://github.com/datahub-project/datahub.git
Synced 2025-10-10 08:26:39 +00:00
Optimize dataset load scripts, improve speed (#350)
- When loading dataset fields into the staging table, populate the dataset_id field first, then use it in later JOINs (see the SQL sketches below).
- When joining two big tables such as dict_field_detail, use a pre-select to reduce the JOIN size and the DB resources used.
- Refactor some SQL code.
- Modify the logback settings to better capture log timestamps.
- Remove unnecessary config from the backend application.conf.
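As a rough illustration of the first point above (not the verbatim statements from the diff), the following SQL sketch resolves the staging table's dataset_id once against dict_dataset and drops unmatched rows, so later statements can join on dataset_id instead of repeating the urn lookup; db_id = 0 stands in for the scripts' {db_id} template parameter.

-- Populate stg_dict_field_detail.dataset_id up front via the urn lookup.
update stg_dict_field_detail sf, dict_dataset d
set sf.dataset_id = d.id
where sf.urn = d.urn and sf.db_id = 0;

-- Remove staging rows that matched no dataset, so every later join can rely on dataset_id.
delete from stg_dict_field_detail
where db_id = 0 and dataset_id is null;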
This commit is contained in:
parent d9438dc5bc
commit b6e644fbb1
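The second point, applied in the load_field methods below, shrinks the JOIN against the large dict_field_detail table by pre-selecting only the rows and columns each side needs before the RIGHT JOIN that finds fields to delete. A schematic version of that statement, again with a placeholder db_id of 0 rather than the scripts' template parameter:

-- Pre-select both sides so the RIGHT JOIN works on small derived tables.
create temporary table if not exists t_deleted_fields (primary key (field_id)) ENGINE=MyISAM
SELECT x.field_id
FROM (select dataset_id, field_name, parent_path
      from stg_dict_field_detail where db_id = 0) s
RIGHT JOIN (select dataset_id, field_id, field_name, parent_path
            from dict_field_detail
            where dataset_id in (select dataset_id from stg_dict_field_detail where db_id = 0)) x
ON s.dataset_id = x.dataset_id
AND s.field_name = x.field_name
AND s.parent_path = x.parent_path
WHERE s.field_name is null;  -- fields that no longer appear in this load batch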
@@ -41,7 +41,6 @@ db.wherehows.driver = com.mysql.jdbc.Driver
db.wherehows.url = ${WHZ_DB_URL}
db.wherehows.username = ${WHZ_DB_USERNAME}
db.wherehows.password = ${WHZ_DB_PASSWORD}
db.wherehows.host = ${WHZ_DB_HOST}

# You can expose this datasource via JNDI if needed (Useful for JPA)
# db.default.jndiName=DefaultDS
@@ -19,7 +19,7 @@

<appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
<encoder>
<pattern>%coloredLevel - %logger - %message%n%xException</pattern>
<pattern>%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n</pattern>
</encoder>
</appender>
@@ -27,7 +27,6 @@ class HdfsLoad:
Load dataset metadata into final table
:return: nothing
"""
cursor = self.conn_mysql.cursor()
load_cmd = '''
DELETE FROM stg_dict_dataset WHERE db_id = {db_id};

@@ -197,15 +196,11 @@ class HdfsLoad:
;
'''.format(source_file=self.input_file, db_id=self.db_id, wh_etl_exec_id=self.wh_etl_exec_id)

for state in load_cmd.split(";"):
self.logger.debug(state)
cursor.execute(state)
self.conn_mysql.commit()
cursor.close()
self.executeCommands(load_cmd)
self.logger.info("finish loading hdfs metadata db_id={db_id} to dict_dataset".format(db_id=self.db_id))

def load_field(self):
cursor = self.conn_mysql.cursor()
load_field_cmd = '''
DELETE FROM stg_dict_field_detail where db_id = {db_id};

@@ -242,24 +237,25 @@ class HdfsLoad:

analyze table field_comments;

-- update stg_dict_field_detail dataset_id
update stg_dict_field_detail sf, dict_dataset d
set sf.dataset_id = d.id where sf.urn = d.urn
and sf.db_id = {db_id};
delete from stg_dict_field_detail
where db_id = {db_id} and dataset_id is null; -- remove if not match to dataset

-- delete old record if it does not exist in this load batch anymore (but have the dataset id)
create temporary table if not exists t_deleted_fields (primary key (field_id))
select x.field_id
from stg_dict_field_detail s
join dict_dataset i
on s.urn = i.urn
and s.db_id = {db_id}
right join dict_field_detail x
on i.id = x.dataset_id
and s.field_name = x.field_name
and s.parent_path = x.parent_path
where s.field_name is null
and x.dataset_id in (
select d.id dataset_id
from stg_dict_field_detail k join dict_dataset d
on k.urn = d.urn
and k.db_id = {db_id}
)
create temporary table if not exists t_deleted_fields (primary key (field_id)) ENGINE=MyISAM
SELECT x.field_id
FROM (select dataset_id, field_name, parent_path from stg_dict_field_detail where db_id = {db_id}) s
RIGHT JOIN
( select dataset_id, field_id, field_name, parent_path from dict_field_detail
where dataset_id in (select dataset_id from stg_dict_field_detail where db_id = {db_id})
) x
ON s.dataset_id = x.dataset_id
AND s.field_name = x.field_name
AND s.parent_path = x.parent_path
WHERE s.field_name is null
; -- run time : ~2min

delete from dict_field_detail where field_id in (select field_id from t_deleted_fields);
@@ -269,12 +265,10 @@ class HdfsLoad:
(
select x.field_id, s.*
from (select * from stg_dict_field_detail where db_id = {db_id}) s
join dict_dataset d
on s.urn = d.urn
join dict_field_detail x
on s.field_name = x.field_name
and coalesce(s.parent_path, '*') = coalesce(x.parent_path, '*')
and d.id = x.dataset_id
and s.dataset_id = x.dataset_id
where (x.sort_id <> s.sort_id
or x.parent_sort_id <> s.parent_sort_id
or x.data_type <> s.data_type
@@ -307,12 +301,11 @@ class HdfsLoad:
modified
)
select
d.id, 0, sf.sort_id, sf.parent_sort_id, sf.parent_path,
sf.dataset_id, 0, sf.sort_id, sf.parent_sort_id, sf.parent_path,
sf.field_name, sf.namespace, sf.data_type, sf.data_size, sf.is_nullable, sf.default_value, now()
from stg_dict_field_detail sf join dict_dataset d
on sf.urn = d.urn
from stg_dict_field_detail sf
left join dict_field_detail t
on d.id = t.dataset_id
on sf.dataset_id = t.dataset_id
and sf.field_name = t.field_name
and sf.parent_path = t.parent_path
where db_id = {db_id} and t.field_id is null
@@ -325,13 +318,12 @@ class HdfsLoad:

-- insert
insert into stg_dict_dataset_field_comment
select t.field_id field_id, fc.id comment_id, d.id dataset_id, {db_id}
from stg_dict_field_detail sf join dict_dataset d
on sf.urn = d.urn
select t.field_id field_id, fc.id comment_id, sf.dataset_id dataset_id, {db_id}
from stg_dict_field_detail sf
join field_comments fc
on sf.description = fc.comment
join dict_field_detail t
on d.id = t.dataset_id
on sf.dataset_id = t.dataset_id
and sf.field_name = t.field_name
and sf.parent_path = t.parent_path
where sf.db_id = {db_id};
@@ -357,15 +349,12 @@ class HdfsLoad:
DELETE FROM stg_dict_field_detail where db_id = {db_id};

'''.format(source_file=self.input_field_file, db_id=self.db_id)
for state in load_field_cmd.split(";"):
self.logger.debug(state)
cursor.execute(state)
self.conn_mysql.commit()
cursor.close()

self.executeCommands(load_field_cmd)
self.logger.info("finish loading hdfs metadata db_id={db_id} to dict_field_detail".format(db_id=self.db_id))

def load_sample(self):
cursor = self.conn_mysql.cursor()
load_sample_cmd = '''
DELETE FROM stg_dict_dataset_sample where db_id = {db_id};

@@ -375,11 +364,11 @@ class HdfsLoad:
(urn,ref_urn,data)
SET db_id = {db_id};

-- update reference id in stagging table
UPDATE stg_dict_dataset_sample s
-- update reference id in staging table
UPDATE stg_dict_dataset_sample s
JOIN dict_dataset d ON s.ref_urn = d.urn
SET s.ref_id = d.id
WHERE s.db_id = {db_id} AND s.ref_urn > '' AND s.ref_urn <> 'null';
WHERE s.db_id = {db_id} AND s.ref_urn > '';

-- first insert ref_id as 0
INSERT INTO dict_dataset_sample
@@ -401,14 +390,18 @@ class HdfsLoad:
SET d.ref_id = s.ref_id
WHERE s.db_id = {db_id} AND d.ref_id = 0;
'''.format(source_file=self.input_sample_file, db_id=self.db_id)
for state in load_sample_cmd.split(";"):
self.logger.debug(state)
cursor.execute(state)
self.conn_mysql.commit()
cursor.close()

self.executeCommands(load_sample_cmd)
self.logger.info("finish loading hdfs sample data db_id={db_id} to dict_dataset_sample".format(db_id=self.db_id))

def executeCommands(self, commands):
for cmd in commands.split(";"):
self.logger.debug(cmd)
self.conn_cursor.execute(cmd)
self.conn_mysql.commit()

if __name__ == "__main__":
args = sys.argv[1]

@@ -426,10 +419,11 @@ if __name__ == "__main__":
l.db_id = args[Constant.DB_ID_KEY]
l.wh_etl_exec_id = args[Constant.WH_EXEC_ID_KEY]
l.conn_mysql = zxJDBC.connect(JDBC_URL, username, password, JDBC_DRIVER)
l.conn_cursor = l.conn_mysql.cursor()

if Constant.INNODB_LOCK_WAIT_TIMEOUT in args:
lock_wait_time = args[Constant.INNODB_LOCK_WAIT_TIMEOUT]
l.conn_mysql.cursor().execute("SET innodb_lock_wait_timeout = %s;" % lock_wait_time)
l.conn_cursor.execute("SET innodb_lock_wait_timeout = %s;" % lock_wait_time)

try:
l.load_metadata()
@@ -23,7 +23,6 @@ class HiveLoad:
self.logger = LoggerFactory.getLogger("%s[%s]" % (self.__class__.__name__, wh_etl_exec_id))

def load_metadata(self):
cursor = self.conn_mysql.cursor()
load_cmd = """
DELETE FROM stg_dict_dataset WHERE db_id = {db_id};

@@ -97,18 +96,15 @@ class HiveLoad:
;
""".format(source_file=self.input_schema_file, db_id=self.db_id, wh_etl_exec_id=self.wh_etl_exec_id)

for state in load_cmd.split(";"):
self.logger.info(state)
cursor.execute(state)
self.conn_mysql.commit()
cursor.close()
self.executeCommands(load_cmd)
self.logger.info("Load dataset metadata.")

def load_field(self):
"""
Load fields
:return:
"""
cursor = self.conn_mysql.cursor()
load_cmd = """
DELETE FROM stg_dict_field_detail WHERE db_id = {db_id};

@@ -125,30 +121,26 @@ class HiveLoad:
, description=nullif(@description,'null')
, last_modified=now();

-- update dataset_id
-- update stg_dict_field_detail dataset_id
update stg_dict_field_detail sf, dict_dataset d
set sf.dataset_id = d.id where sf.urn = d.urn
and sf.db_id = {db_id};
delete from stg_dict_field_detail
where db_id = {db_id} and dataset_id is null; -- remove if not match to dataset

-- delete old record if it does not exist in this load batch anymore (but have the dataset id)
-- join with dict_dataset to avoid right join using index. (using index will slow down the query)
create temporary table if not exists t_deleted_fields (primary key (field_id))
select x.field_id
from stg_dict_field_detail s
join dict_dataset i
on s.urn = i.urn
and s.db_id = {db_id}
right join dict_field_detail x
on i.id = x.dataset_id
and s.field_name = x.field_name
and s.parent_path = x.parent_path
where s.field_name is null
and x.dataset_id in (
select d.id dataset_id
from stg_dict_field_detail k join dict_dataset d
on k.urn = d.urn
and k.db_id = {db_id}
)
create temporary table if not exists t_deleted_fields (primary key (field_id)) ENGINE=MyISAM
SELECT x.field_id
FROM (select dataset_id, field_name, parent_path from stg_dict_field_detail where db_id = {db_id}) s
RIGHT JOIN
( select dataset_id, field_id, field_name, parent_path from dict_field_detail
where dataset_id in (select dataset_id from stg_dict_field_detail where db_id = {db_id})
) x
ON s.dataset_id = x.dataset_id
AND s.field_name = x.field_name
AND s.parent_path = x.parent_path
WHERE s.field_name is null
; -- run time : ~2min

delete from dict_field_detail where field_id in (select field_id from t_deleted_fields);
@@ -190,7 +182,7 @@ class HiveLoad:

-- insert new ones
CREATE TEMPORARY TABLE IF NOT EXISTS t_existed_field
( primary key (urn, sort_id, db_id) )
( primary key (urn, sort_id, db_id) ) ENGINE=MyISAM
AS (
SELECT sf.urn, sf.sort_id, sf.db_id, count(*) field_count
FROM stg_dict_field_detail sf
@@ -203,8 +195,7 @@ class HiveLoad:
group by 1,2,3
);

insert ignore into dict_field_detail (
insert ignore into dict_field_detail (
dataset_id, fields_layout_id, sort_id, parent_sort_id, parent_path,
field_name, namespace, data_type, data_size, is_nullable, default_value,
modified
@@ -239,7 +230,6 @@ class HiveLoad:
where field_id in (select field_id from stg_dict_dataset_field_comment)
and is_default = 1 ) and db_id = {db_id};

-- doesn't have this comment before, insert into it and set as default
insert ignore into dict_dataset_field_comment
select sd.field_id, sd.comment_id, sd.dataset_id, 1 from stg_dict_dataset_field_comment sd
@@ -249,7 +239,6 @@ class HiveLoad:
where d.comment_id is null
and sd.db_id = {db_id};

insert into field_comments (
user_id, comment, created, modified, comment_crc32_checksum
)
@@ -266,20 +255,15 @@ class HiveLoad:

""".format(source_file=self.input_field_file, db_id=self.db_id)

# didn't load into final table for now
self.executeCommands(load_cmd)
self.logger.info("Load dataset fields.")

for state in load_cmd.split(";"):
self.logger.info(state)
cursor.execute(state)
self.conn_mysql.commit()
cursor.close()

def load_dataset_instance(self):
"""
Load dataset instance
:return:
"""
cursor = self.conn_mysql.cursor()
load_cmd = """
DELETE FROM stg_dict_dataset_instance WHERE db_id = {db_id};

@@ -337,21 +321,15 @@ class HiveLoad:
;
""".format(source_file=self.input_instance_file, db_id=self.db_id, wh_etl_exec_id=self.wh_etl_exec_id)

self.executeCommands(load_cmd)
self.logger.info("Load dataset instance.")

# didn't load into final table for now

for state in load_cmd.split(";"):
self.logger.info(state)
cursor.execute(state)
self.conn_mysql.commit()
cursor.close()

def load_dataset_dependencies(self):
"""
Load dataset instance
Load dataset dependencies
:return:
"""
cursor = self.conn_mysql.cursor()
load_cmd = """
DELETE FROM stg_cfg_object_name_map;
LOAD DATA LOCAL INFILE '{source_file}'
@@ -372,7 +350,7 @@ class HiveLoad:
-- create to be deleted table
DROP TEMPORARY table IF EXISTS t_deleted_depend;

CREATE TEMPORARY TABLE t_deleted_depend
CREATE TEMPORARY TABLE t_deleted_depend ENGINE=MyISAM
AS (
SELECT DISTINCT c.obj_name_map_id
FROM cfg_object_name_map c LEFT JOIN stg_cfg_object_name_map s
@@ -420,12 +398,15 @@ class HiveLoad:
""".format(source_file=self.input_dependency_file)

# didn't load into final table for now
self.executeCommands(load_cmd)
self.logger.info("Load dataset dependencies.")

for state in load_cmd.split(";"):
self.logger.info(state)
cursor.execute(state)

def executeCommands(self, commands):
for cmd in commands.split(";"):
self.logger.debug(cmd)
self.conn_cursor.execute(cmd)
self.conn_mysql.commit()
cursor.close()

if __name__ == "__main__":
@@ -446,10 +427,11 @@ if __name__ == "__main__":
l.db_id = args[Constant.DB_ID_KEY]
l.wh_etl_exec_id = args[Constant.WH_EXEC_ID_KEY]
l.conn_mysql = zxJDBC.connect(JDBC_URL, username, password, JDBC_DRIVER)
l.conn_cursor = l.conn_mysql.cursor()

if Constant.INNODB_LOCK_WAIT_TIMEOUT in args:
lock_wait_time = args[Constant.INNODB_LOCK_WAIT_TIMEOUT]
l.conn_mysql.cursor().execute("SET innodb_lock_wait_timeout = %s;" % lock_wait_time)
l.conn_cursor.execute("SET innodb_lock_wait_timeout = %s;" % lock_wait_time)

try:
l.load_metadata()
@@ -15,7 +15,7 @@
from com.ziclix.python.sql import zxJDBC
from wherehows.common import Constant
from org.slf4j import LoggerFactory
import sys, os, datetime
import sys, datetime

class OracleLoad:
@@ -159,24 +159,25 @@ class OracleLoad:

analyze table {field_comments};

-- update stg_dict_field_detail dataset_id
update stg_dict_field_detail sf, dict_dataset d
set sf.dataset_id = d.id where sf.urn = d.urn
and sf.db_id = {db_id};
delete from stg_dict_field_detail
where db_id = {db_id} and dataset_id is null; -- remove if not match to dataset

-- delete old record if it does not exist in this load batch anymore (but have the dataset id)
create temporary table if not exists t_deleted_fields (primary key (field_id))
select x.field_id
from stg_dict_field_detail s
join {dict_dataset} i
on s.urn = i.urn
and s.db_id = {db_id}
right join {dict_field_detail} x
on i.id = x.dataset_id
and s.field_name = x.field_name
and s.parent_path = x.parent_path
where s.field_name is null
and x.dataset_id in (
select d.id dataset_id
from stg_dict_field_detail k join {dict_dataset} d
on k.urn = d.urn
and k.db_id = {db_id}
)
create temporary table if not exists t_deleted_fields (primary key (field_id)) ENGINE=MyISAM
SELECT x.field_id
FROM (select dataset_id, field_name, parent_path from stg_dict_field_detail where db_id = {db_id}) s
RIGHT JOIN
( select dataset_id, field_id, field_name, parent_path from dict_field_detail
where dataset_id in (select dataset_id from stg_dict_field_detail where db_id = {db_id})
) x
ON s.dataset_id = x.dataset_id
AND s.field_name = x.field_name
AND s.parent_path = x.parent_path
WHERE s.field_name is null
; -- run time : ~2min

delete from {dict_field_detail} where field_id in (select field_id from t_deleted_fields);
@@ -186,12 +187,10 @@ class OracleLoad:
(
select x.field_id, s.*
from stg_dict_field_detail s
join {dict_dataset} d
on s.urn = d.urn
join {dict_field_detail} x
on s.field_name = x.field_name
and coalesce(s.parent_path, '*') = coalesce(x.parent_path, '*')
and d.id = x.dataset_id
and s.dataset_id = x.dataset_id
where s.db_id = {db_id}
and (x.sort_id <> s.sort_id
or x.parent_sort_id <> s.parent_sort_id
@@ -224,12 +223,11 @@ class OracleLoad:
field_name, namespace, data_type, data_size, is_nullable, default_value, modified
)
select
d.id, 0, sf.sort_id, sf.parent_sort_id, sf.parent_path,
sf.dataset_id, 0, sf.sort_id, sf.parent_sort_id, sf.parent_path,
sf.field_name, sf.namespace, sf.data_type, sf.data_size, sf.is_nullable, sf.default_value, now()
from stg_dict_field_detail sf join {dict_dataset} d
on sf.urn = d.urn
from stg_dict_field_detail sf
left join {dict_field_detail} t
on d.id = t.dataset_id
on sf.dataset_id = t.dataset_id
and sf.field_name = t.field_name
and sf.parent_path = t.parent_path
where db_id = {db_id} and t.field_id is null
@@ -242,13 +240,12 @@ class OracleLoad:

-- insert
insert into stg_dict_dataset_field_comment
select t.field_id field_id, fc.id comment_id, d.id dataset_id, {db_id}
from stg_dict_field_detail sf join {dict_dataset} d
on sf.urn = d.urn
select t.field_id field_id, fc.id comment_id, sf.dataset_id dataset_id, {db_id}
from stg_dict_field_detail sf
join {field_comments} fc
on sf.description = fc.comment
join {dict_field_detail} t
on d.id = t.dataset_id
on sf.dataset_id = t.dataset_id
and sf.field_name = t.field_name
and sf.parent_path = t.parent_path
where sf.db_id = {db_id};
@@ -290,7 +287,7 @@ class OracleLoad:
(urn,ref_urn,data)
SET db_id = {db_id};

-- update reference id in stagging table
-- update reference id in staging table
UPDATE stg_dict_dataset_sample s
LEFT JOIN {dict_dataset} d ON s.ref_urn = d.urn
SET s.ref_id = d.id
@@ -12,7 +12,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#

import sys
import sys, datetime
from com.ziclix.python.sql import zxJDBC
from distutils.util import strtobool
from wherehows.common import Constant
@@ -24,7 +24,6 @@ class TeradataLoad:
self.logger = LoggerFactory.getLogger('jython script : ' + self.__class__.__name__)

def load_metadata(self):
cursor = self.conn_mysql.cursor()
load_cmd = '''
DELETE FROM stg_dict_dataset WHERE db_id = {db_id};

@@ -173,14 +172,11 @@ class TeradataLoad:
;
'''.format(source_file=self.input_file, db_id=self.db_id, wh_etl_exec_id=self.wh_etl_exec_id)

for state in load_cmd.split(";"):
self.logger.debug(state)
cursor.execute(state)
self.conn_mysql.commit()
cursor.close()
self.executeCommands(load_cmd)
self.logger.info("Finish loading metadata")

def load_field(self):
cursor = self.conn_mysql.cursor()
load_cmd = '''
DELETE FROM stg_dict_field_detail where db_id = {db_id};

@@ -206,24 +202,25 @@ class TeradataLoad:
and (char_length(trim(description)) = 0
or description in ('null', 'N/A', 'nothing', 'empty', 'none'));

-- update stg_dict_field_detail dataset_id
update stg_dict_field_detail sf, dict_dataset d
set sf.dataset_id = d.id where sf.urn = d.urn
and sf.db_id = {db_id};
delete from stg_dict_field_detail
where db_id = {db_id} and dataset_id is null; -- remove if not match to dataset

-- delete old record if it does not exist in this load batch anymore (but have the dataset id)
create temporary table if not exists t_deleted_fields (primary key (field_id))
select x.field_id
from stg_dict_field_detail s
join dict_dataset i
on s.urn = i.urn
and s.db_id = {db_id}
right join dict_field_detail x
on i.id = x.dataset_id
and s.field_name = x.field_name
and s.parent_path = x.parent_path
where s.field_name is null
and x.dataset_id in (
select d.id dataset_id
from stg_dict_field_detail k join dict_dataset d
on k.urn = d.urn
and k.db_id = {db_id}
)
create temporary table if not exists t_deleted_fields (primary key (field_id)) ENGINE=MyISAM
SELECT x.field_id
FROM (select dataset_id, field_name, parent_path from stg_dict_field_detail where db_id = {db_id}) s
RIGHT JOIN
( select dataset_id, field_id, field_name, parent_path from dict_field_detail
where dataset_id in (select dataset_id from stg_dict_field_detail where db_id = {db_id})
) x
ON s.dataset_id = x.dataset_id
AND s.field_name = x.field_name
AND s.parent_path = x.parent_path
WHERE s.field_name is null
; -- run time : ~2min

delete from dict_field_detail where field_id in (select field_id from t_deleted_fields);
@@ -232,12 +229,11 @@ class TeradataLoad:
update dict_field_detail t join
(
select x.field_id, s.*
from stg_dict_field_detail s join dict_dataset d
on s.urn = d.urn
from stg_dict_field_detail s
join dict_field_detail x
on s.field_name = x.field_name
and coalesce(s.parent_path, '*') = coalesce(x.parent_path, '*')
and d.id = x.dataset_id
and s.dataset_id = x.dataset_id
where s.db_id = {db_id}
and (x.sort_id <> s.sort_id
or x.parent_sort_id <> s.parent_sort_id
@@ -274,7 +270,7 @@ class TeradataLoad:
modified
)
select
d.id,
s.dataset_id,
0 as fields_layout_id,
s.sort_id,
0 parent_sort_id,
@@ -285,10 +281,9 @@ class TeradataLoad:
s.data_scale,
s.is_nullable,
now()
from stg_dict_field_detail s join dict_dataset d
on s.urn = d.urn
from stg_dict_field_detail s
left join dict_field_detail f
on d.id = f.dataset_id
on s.dataset_id = f.dataset_id
and s.field_name = f.field_name
where db_id = {db_id} and f.field_id is null;

@@ -299,13 +294,12 @@ class TeradataLoad:

-- insert
insert into stg_dict_dataset_field_comment
select t.field_id field_id, fc.id comment_id, d.id dataset_id, {db_id}
from stg_dict_field_detail sf join dict_dataset d
on sf.urn = d.urn
select t.field_id field_id, fc.id comment_id, sf.dataset_id dataset_id, {db_id}
from stg_dict_field_detail sf
join field_comments fc
on sf.description = fc.comment
join dict_field_detail t
on d.id = t.dataset_id
on sf.dataset_id = t.dataset_id
and sf.field_name = t.field_name
and sf.parent_path = t.parent_path
where sf.db_id = {db_id};
@@ -328,11 +322,9 @@ class TeradataLoad:
and sd.db_id = {db_id};
'''.format(source_file=self.input_field_file, db_id=self.db_id)

for state in load_cmd.split(";"):
self.logger.debug(state)
cursor.execute(state)
self.conn_mysql.commit()
cursor.close()
self.executeCommands(load_cmd)
self.logger.info("Finish loading fields ")

def load_sample(self):
load_cmd = '''
@@ -343,11 +335,20 @@ class TeradataLoad:
(urn,ref_urn,data)
SET db_id = {db_id};

-- update reference id in stagging table
UPDATE stg_dict_dataset_sample s
LEFT JOIN dict_dataset d ON s.ref_urn = d.urn
SET s.ref_id = d.id
WHERE s.db_id = {db_id};
-- update dataset id in staging table
UPDATE stg_dict_dataset_sample s
JOIN dict_dataset d ON s.db_id = {db_id} and s.urn = d.urn
SET s.dataset_id = d.id;

-- update reference id in staging table
UPDATE stg_dict_dataset_sample s
JOIN (
select dataset_id, id from
(select dataset_id, ref_urn from stg_dict_dataset_sample
where db_id = 3 and ref_urn > '') st
join dict_dataset on urn = ref_urn) d
ON s.dataset_id = d.dataset_id
SET s.ref_id = d.id;

-- first insert ref_id as 0
INSERT INTO dict_dataset_sample
@@ -357,27 +358,24 @@ class TeradataLoad:
`data`,
created
)
select d.id as dataset_id, s.urn, s.ref_id, s.data, now()
from stg_dict_dataset_sample s left join dict_dataset d on d.urn = s.urn
where s.db_id = {db_id}
on duplicate key update
`data`=s.data, modified=now();

-- update reference id in final table
UPDATE dict_dataset_sample d
RIGHT JOIN stg_dict_dataset_sample s ON d.urn = s.urn
SET d.ref_id = s.ref_id
WHERE s.db_id = {db_id} AND d.ref_id = 0;

SELECT * from
(select dataset_id, urn, ref_id s_ref_id, `data` s_data, now()
from stg_dict_dataset_sample WHERE db_id = {db_id}) s
ON DUPLICATE KEY UPDATE
ref_id = COALESCE(s_ref_id, ref_id),
`data`=s_data,
modified=now();
'''.format(source_file=self.input_sampledata_file, db_id=self.db_id)

cursor = self.conn_mysql.cursor()
for state in load_cmd.split(";"):
self.logger.debug(state)
cursor.execute(state)
self.executeCommands(load_cmd)
self.logger.info("Finish loading samples ")

def executeCommands(self, commands):
for cmd in commands.split(";"):
self.logger.debug(cmd)
self.conn_cursor.execute(cmd)
self.conn_mysql.commit()
cursor.close()

if __name__ == "__main__":
@@ -405,10 +403,11 @@ if __name__ == "__main__":
l.db_id = args[Constant.DB_ID_KEY]
l.wh_etl_exec_id = args[Constant.WH_EXEC_ID_KEY]
l.conn_mysql = zxJDBC.connect(JDBC_URL, username, password, JDBC_DRIVER)
l.conn_cursor = l.conn_mysql.cursor()

if Constant.INNODB_LOCK_WAIT_TIMEOUT in args:
lock_wait_time = args[Constant.INNODB_LOCK_WAIT_TIMEOUT]
l.conn_mysql.cursor().execute("SET innodb_lock_wait_timeout = %s;" % lock_wait_time)
l.conn_cursor.execute("SET innodb_lock_wait_timeout = %s;" % lock_wait_time)

try:
l.load_metadata()
@@ -34,4 +34,18 @@ public class TeradataMetadataEtlTest {
TeradataMetadataEtl t = new TeradataMetadataEtl(3, 0L);
t.extract();
}

@Test(groups = {"needConfig"})
public void testTransform()
throws Exception {
TeradataMetadataEtl t = new TeradataMetadataEtl(3, 0L);
t.transform();
}

@Test(groups = {"needConfig"})
public void testLoad()
throws Exception {
TeradataMetadataEtl t = new TeradataMetadataEtl(3, 0L);
t.load();
}
}
@@ -19,7 +19,7 @@

<appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
<encoder>
<pattern>%coloredLevel - %logger - %message%n%xException</pattern>
<pattern>%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n</pattern>
</encoder>
</appender>