datahub/metadata-etl/src/main/resources/jython/EspressoTransform.py

#
# Copyright 2015 LinkedIn Corp. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#
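
# Transform the Espresso metadata extracted to a temp JSON file into the WhereHows staging
# tables: stg_dict_dataset, stg_dict_dataset_schema_history and stg_dataset_owner.
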
import datetime
import json
import os
import sys
from com.ziclix.python.sql import zxJDBC
from wherehows.common import Constant
from org.slf4j import LoggerFactory
import FileUtil


class EspressoTransform:
  def __init__(self, args):
    self.logger = LoggerFactory.getLogger('jython script : ' + self.__class__.__name__)

    username = args[Constant.WH_DB_USERNAME_KEY]
    password = args[Constant.WH_DB_PASSWORD_KEY]
    JDBC_DRIVER = args[Constant.WH_DB_DRIVER_KEY]
    JDBC_URL = args[Constant.WH_DB_URL_KEY]

    temp_dir = FileUtil.etl_temp_dir(args, "ESPRESSO")
    self.input_file = open(os.path.join(temp_dir, args[Constant.ESPRESSO_OUTPUT_KEY]), 'r')

    self.db_id = args[Constant.DB_ID_KEY]
    self.wh_etl_exec_id = args[Constant.WH_EXEC_ID_KEY]
    self.conn_mysql = zxJDBC.connect(JDBC_URL, username, password, JDBC_DRIVER)
    self.conn_cursor = self.conn_mysql.cursor()

    self.logger.info("Transform ESPRESSO metadata into {}, db_id {}, wh_exec_id {}"
                     .format(JDBC_URL, self.db_id, self.wh_etl_exec_id))

    self.schema_history_cmd = "INSERT IGNORE INTO stg_dict_dataset_schema_history (urn, modified_date, dataset_schema) " + \
                              "VALUES (?, current_date - ?, ?)"

    self.dataset_cmd = "INSERT IGNORE INTO stg_dict_dataset (`db_id`, `dataset_type`, `urn`, `name`, `schema`, schema_type, " \
                       "properties, `fields`, `source`, location_prefix, parent_name, storage_type, created_time, wh_etl_exec_id) " + \
                       "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, 'Table', UNIX_TIMESTAMP(), ?)"

    self.owner_cmd = "INSERT IGNORE INTO stg_dataset_owner (dataset_urn, namespace, owner_id, owner_type, is_group, " \
                     "db_name, db_id, app_id, is_active, sort_id) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"

  def convert_espresso(self, content):
    '''
    Convert the original content into insert statements.
    One json could contain several tables; one table could contain several versions.
    '''
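    # Expected shape of `content`, inferred from the fields accessed in this script
    # (illustrative only, not a complete schema):
    #   {"name": "<db_name>", "owners": [...], "servicesList": [...],
    #    "databaseSpec": {"com.linkedin.nuage.EspressoDatabase": {
    #        "tableSchemas": [{"name": ..., "description": ..., "key": "<json string>",
    #                          "documentSchema": [{"valueSchema": "<json string>"}, ...]}],
    #        "dbusBootstrap": ..., "etl": ...}}}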
    if 'tableSchemas' in content['databaseSpec']['com.linkedin.nuage.EspressoDatabase']:
      for doc in content['databaseSpec']['com.linkedin.nuage.EspressoDatabase']['tableSchemas']:
        if 'key' not in doc:
          self.logger.info("[key] not found in json [tableSchema] section in '%s': %s"
                           % (content['name'], json.dumps(doc)))
          continue
        # different documents inside the Espresso DB
        self.construct_espresso_dataset(doc, content)

  def construct_espresso_dataset(self, doc, content):
    EXCLUDED_ATTRS_IN_PROP = ['databaseSpec', 'owners', 'parentName', 'type', 'name', 'fabric']  # need transformation

    db_name = content['name']
    doc_name = doc['name']
    urn = 'espresso:///' + db_name + '/' + doc_name
    dataset_type = 'ESPRESSO'
    source = dataset_type
    location_prefix = '/' + db_name
    parent_name = db_name
    schema_type = 'JSON'

    key_list = json.loads(doc['key'])
    fields = []
    for key_field in key_list:
      key_field_with_prefix = key_field.copy()
      key_field_with_prefix["name"] = 'key:' + key_field_with_prefix["name"]
      fields.append(key_field_with_prefix)

    properties = {}
    for p_key in content.keys():
      if p_key not in EXCLUDED_ATTRS_IN_PROP:
        properties[p_key] = content[p_key]
    if 'dbusBootstrap' in content['databaseSpec']['com.linkedin.nuage.EspressoDatabase']:
      properties['databusBootstrap'] = content['databaseSpec']['com.linkedin.nuage.EspressoDatabase']['dbusBootstrap']
    if 'etl' in content['databaseSpec']['com.linkedin.nuage.EspressoDatabase']:
      properties['etlToHadoop'] = content['databaseSpec']['com.linkedin.nuage.EspressoDatabase']['etl']

    combined_schema = {
      'name': doc['name'],
      'doc': doc['description'],
      'keySchema': key_list,
      'valueSchema': None
    }

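    # Each value-schema version is written to stg_dict_dataset_schema_history with a pseudo
    # modified_date of (current_date - offset); older versions get larger offsets and thus
    # earlier dates. When there is no documentSchema, a single row is written with offset 1.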
    if 'documentSchema' in doc:
      # different versions of valueSchema
      pseudo_date_offset = len(doc['documentSchema'])
      for one_ver in doc['documentSchema']:
        combined_schema['valueSchema'] = json.loads(one_ver['valueSchema'])
        schema_string = json.dumps(combined_schema)
        self.conn_cursor.executemany(self.schema_history_cmd, [urn, pseudo_date_offset, schema_string])
        pseudo_date_offset -= 1
      # append fields defined in the last version of valueSchema
      fields.extend(combined_schema['valueSchema']['fields'])
    else:
      pseudo_date_offset = 1
      schema_string = json.dumps(combined_schema)
      self.conn_cursor.executemany(self.schema_history_cmd, [urn, pseudo_date_offset, schema_string])

    self.conn_cursor.executemany(self.dataset_cmd, [self.db_id, dataset_type, urn, doc_name, json.dumps(combined_schema),
                                                    schema_type, json.dumps(properties), json.dumps({'fields': fields}),
                                                    source, location_prefix, parent_name, self.wh_etl_exec_id])

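    # Owner strings are expected as "<namespace>:<owner_id>"; rfind(':') splits on the last
    # colon so URN-style namespaces that themselves contain colons stay intact. Entries from
    # servicesList are stored as group owners (is_group = 'Y') under the 'urn:li:service'
    # namespace, and sort_id preserves the original ordering.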
    owner_count = 1
    if "owners" in content:
      for owner in content['owners']:
        id_idx = owner.rfind(':')
        self.conn_cursor.executemany(self.owner_cmd, [urn, owner[:id_idx], owner[id_idx+1:], 'Delegate', 'N',
                                                      'espresso', self.db_id, 0, 'Y', owner_count])
        owner_count += 1
    if "servicesList" in content:
      for service in content['servicesList']:
        self.conn_cursor.executemany(self.owner_cmd, [urn, 'urn:li:service', service, 'Delegate', 'Y', 'espresso',
                                                      self.db_id, 0, 'Y', owner_count])
        owner_count += 1

    self.conn_mysql.commit()
    self.logger.debug('Transformed ' + urn)

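  # The input file holds one JSON object per line (one Espresso database per line), presumably
  # produced by the corresponding Espresso extract job; each line is transformed independently.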
  def load_espresso(self):
    for line in self.input_file:
      # print line
      one_table_info = json.loads(line)
      if len(one_table_info) > 0:
        self.convert_espresso(one_table_info)

  def clean_staging(self):
    self.conn_cursor.execute('DELETE FROM stg_dict_dataset WHERE db_id = {db_id}'.format(db_id=self.db_id))
    self.conn_cursor.execute('DELETE FROM stg_dataset_owner WHERE db_id = {db_id}'.format(db_id=self.db_id))
    self.conn_mysql.commit()

  def run(self):
    try:
      begin = datetime.datetime.now().strftime("%H:%M:%S")
      self.clean_staging()
      self.load_espresso()
      end = datetime.datetime.now().strftime("%H:%M:%S")
      self.logger.info("Transform ESPRESSO metadata [%s -> %s]" % (str(begin), str(end)))
    finally:
      self.conn_cursor.close()
      self.conn_mysql.close()


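# When launched by the WhereHows ETL job runner, sys.argv[1] is the dict-like job property
# object (it is indexed with the Constant.* keys above), not a plain command-line string.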
if __name__ == "__main__":
args = sys.argv[1]
t = EspressoTransform(args)
t.run()