mirror of
https://github.com/datahub-project/datahub.git
synced 2025-09-25 17:15:09 +00:00

* Rename web to wherehows-api and update README. * Rename backend-service to wherehows-backend * Rename metadata-etl to wherehows-etl * Rename hadoop-dataset-extractor-standalone to wherehows-hadoop
223 lines
9.0 KiB
Python
223 lines
9.0 KiB
Python
#
|
|
# Copyright 2015 LinkedIn Corp. All rights reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
#
|
|
|
|
from org.slf4j import LoggerFactory
|
|
from wherehows.common import Constant
|
|
from com.ziclix.python.sql import zxJDBC
|
|
import sys
|
|
|
|
|
|
class LdapTransform:
    """Load extracted LDAP CSV files into staging tables and enrich them.

    The job runs four steps in order (see ``run``):

    1. clear each staging table for this ``app_id`` and bulk-load its CSV file,
    2. turn placeholder values (empty strings, 0, zero dates) into NULL,
    3. fill in each user's manager user id / employee number,
    4. compute each user's org hierarchy path and depth.
    """

    # Number of users updated by one org-hierarchy UPDATE statement.
    _HIERARCHY_BATCH_SIZE = 1000

    # Per-dataset load configuration:
    #   columns        - column list used by LOAD DATA
    #   file           - CSV file name inside the metadata folder
    #   table          - staging table to load into
    #   nullif_columns - optional {column: SQL literal} pairs; rows whose column
    #                    equals the literal get that column set to NULL
    _tables = {
        "ldap_user": {
            "columns": "app_id, is_active, user_id, urn, full_name, display_name, title, employee_number, manager_urn, email, department_id, department_name, start_date, mobile_phone, wh_etl_exec_id",
            "file": "ldap_user_record.csv",
            "table": "stg_dir_external_user_info",
            "nullif_columns": {
                "department_id": "''",
                "employee_number": 0,
                "start_date": "'0000-00-00'",
                "manager_urn": "''",
                "department_name": "''",
                "mobile_phone": "''",
                "email": "''",
                "title": "''",
            },
        },
        "ldap_group": {
            "columns": "app_id, group_id, sort_id, user_app_id, user_id, wh_etl_exec_id",
            "file": "ldap_group_record.csv",
            "table": "stg_dir_external_group_user_map",
            "nullif_columns": {"user_id": "''"},
        },
        "ldap_group_flatten": {
            "columns": "app_id, group_id, sort_id, user_app_id, user_id, wh_etl_exec_id",
            "file": "ldap_group_flatten_record.csv",
            "table": "stg_dir_external_group_user_map_flatten",
        },
    }

    # The escapes below are deliberately NOT in a raw string: Python turns them
    # into the actual control characters before the statement is sent.
    _read_file_template = """
    LOAD DATA LOCAL INFILE '{folder}/{file}'
    INTO TABLE {table}
    FIELDS TERMINATED BY '\x1a' ESCAPED BY '\0'
    LINES TERMINATED BY '\n'
    ({columns});
    """

    _update_column_to_null_template = """
    UPDATE {table} stg
    SET {column} = NULL
    WHERE {column} = {column_value} and app_id = {app_id}
    """

    # Self-join on manager_urn = urn to copy the manager's ids onto each user row.
    _update_manager_info = """
    update {table} stg
    join (select t1.app_id, t1.user_id, t1.employee_number, t2.user_id as manager_user_id, t2.employee_number as manager_employee_number from
    {table} t1 join {table} t2 on t1.manager_urn = t2.urn and t1.app_id = t2.app_id
    where t1.app_id = {app_id}
    ) s on stg.app_id = s.app_id and stg.user_id = s.user_id
    set stg.manager_user_id = s.manager_user_id
    , stg.manager_employee_number = s.manager_employee_number
    WHERE stg.app_id = {app_id}
    """

    _get_manager_edge = """
    select user_id, manager_user_id from {table} stg
    where app_id = {app_id} and manager_user_id is not null and user_id <> manager_user_id
    """

    _update_hierarchy_info = """
    update {table} stg
    set org_hierarchy = CASE {org_hierarchy_long_string} END,
    org_hierarchy_depth = CASE {org_hierarchy_depth_long_string} END
    where app_id = {app_id} and user_id in ({user_ids})
    """

    # Not referenced by any method in this class; kept for external users.
    _update_hierarchy_info_per_row = """
    update {table} stg
    set org_hierarchy = '{org_hierarchy}',
    org_hierarchy_depth = {org_hierarchy_depth}
    where app_id = {app_id} and user_id = '{user_id}'
    """

    # The misspelled attribute name (sic) is kept so existing references still work.
    _clear_staging_tempalte = """
    DELETE FROM {table} where app_id = {app_id}
    """

    def __init__(self, args):
        """Open the warehouse DB connection and read job settings.

        :param args: object indexable by the ``Constant.*`` keys used below
            (DB url/username/password/driver, app ids, app folder, CEO user id).
        """
        self.logger = LoggerFactory.getLogger('jython script : ' + self.__class__.__name__)
        self.wh_con = zxJDBC.connect(args[Constant.WH_DB_URL_KEY],
                                     args[Constant.WH_DB_USERNAME_KEY],
                                     args[Constant.WH_DB_PASSWORD_KEY],
                                     args[Constant.WH_DB_DRIVER_KEY])
        self.wh_cursor = self.wh_con.cursor()
        self.app_id = int(args[Constant.APP_ID_KEY])
        self.group_app_id = int(args[Constant.LDAP_GROUP_APP_ID_KEY])
        self.app_folder = args[Constant.WH_APP_FOLDER_KEY]
        # CSV files are read from <app_folder>/<app_id>/.
        self.metadata_folder = self.app_folder + "/" + str(self.app_id)
        self.ceo_user_id = args[Constant.LDAP_CEO_USER_ID_KEY]

    def run(self):
        """Run all transform steps, always closing the cursor and connection."""
        try:
            self.read_file_to_stg()
            self.update_null_value()
            self.update_manager_info()
            self.update_hierarchy_info()
        finally:
            self.wh_cursor.close()
            self.wh_con.close()

    def read_file_to_stg(self):
        """Empty each staging table for this app_id, then bulk-load its CSV file."""
        for name in self._tables:
            t = self._tables[name]

            # Clear staging table
            query = self._clear_staging_tempalte.format(table=t.get("table"), app_id=self.app_id)
            # Logged through the logger (not printed) like every other statement.
            self.logger.debug(query)
            self.wh_cursor.execute(query)
            self.wh_con.commit()

            # Load file into staging table
            query = self._read_file_template.format(folder=self.metadata_folder, file=t.get("file"),
                                                    table=t.get("table"), columns=t.get("columns"))
            self.logger.debug(query)
            self.wh_cursor.execute(query)
            self.wh_con.commit()

    def update_null_value(self):
        """Replace configured placeholder values with NULL in the staging tables."""
        for name in self._tables:
            t = self._tables[name]
            nullif_columns = t.get("nullif_columns")
            if not nullif_columns:
                continue
            for column in nullif_columns:
                query = self._update_column_to_null_template.format(table=t.get("table"), column=column,
                                                                    column_value=nullif_columns[column],
                                                                    app_id=self.app_id)
                self.logger.debug(query)
                self.wh_cursor.execute(query)
                self.wh_con.commit()

    def update_manager_info(self):
        """Populate manager_user_id / manager_employee_number on the user staging table."""
        t = self._tables["ldap_user"]
        query = self._update_manager_info.format(table=t.get("table"), app_id=self.app_id)
        self.logger.debug(query)
        self.wh_cursor.execute(query)
        self.wh_con.commit()

    def update_hierarchy_info(self):
        """Compute and store org_hierarchy / org_hierarchy_depth for every user with a manager.

        Reads (user_id, manager_user_id) edges from the user staging table,
        derives each user's path with ``find_path_for_user`` and writes the
        results back in batches of ``_HIERARCHY_BATCH_SIZE`` users.
        """
        t = self._tables["ldap_user"]
        query = self._get_manager_edge.format(table=t.get("table"), app_id=self.app_id)
        self.logger.debug(query)
        self.wh_cursor.execute(query)

        user_mgr_map = dict()
        for row in self.wh_cursor:
            user_mgr_map[row[0]] = row[1]

        hierarchy = dict()
        for user in user_mgr_map:
            self.find_path_for_user(user, user_mgr_map, hierarchy)

        pending = []
        for user in hierarchy:
            entry = hierarchy[user]
            if entry is None:
                continue
            pending.append((user, entry[0], entry[1]))
            if len(pending) >= self._HIERARCHY_BATCH_SIZE:
                self._flush_hierarchy_batch(t.get("table"), pending)
                pending = []

        # Only flush a non-empty remainder: an empty batch would render
        # "CASE  END" and "user_id in ()", which is not valid SQL.
        if pending:
            self._flush_hierarchy_batch(t.get("table"), pending)
        self.wh_con.commit()

    def _flush_hierarchy_batch(self, table, batch):
        """Execute one UPDATE covering every (user_id, path, depth) tuple in ``batch``."""
        # NOTE(review): values are spliced into the SQL text unescaped, as before;
        # a user id containing a single quote would break the statement.
        user_ids = ",".join(["'" + entry[0] + "'" for entry in batch])
        hierarchy_cases = "".join(
            [" WHEN user_id = '{user_id}' THEN '{org_hierarchy}' ".format(user_id=entry[0], org_hierarchy=entry[1])
             for entry in batch])
        depth_cases = "".join(
            [" WHEN user_id = '{user_id}' THEN {org_hierarchy_depth} ".format(user_id=entry[0],
                                                                             org_hierarchy_depth=entry[2])
             for entry in batch])
        query = self._update_hierarchy_info.format(table=table, app_id=self.app_id, user_ids=user_ids,
                                                   org_hierarchy_long_string=hierarchy_cases,
                                                   org_hierarchy_depth_long_string=depth_cases)
        # The statement is deliberately not logged (it can be very large).
        self.wh_cursor.execute(query)

    def find_path_for_user(self, start, user_mgr_map, hierarchy):
        """Return ``(path, depth)`` for ``start`` and cache it in ``hierarchy``.

        ``path`` looks like ``/top/.../manager/start`` and ``depth`` is the
        number of manager hops walked. The walk stops at the CEO user id, at a
        user with no manager entry, at a self-managed user, or when a manager
        already on the path is met again (a cycle in the data).

        :param start: user id to resolve; ``None`` or ``''`` yields ``None``.
        :param user_mgr_map: dict of user id -> manager user id.
        :param hierarchy: dict used as cache; updated in place.
        :return: ``(path, depth)`` tuple, or ``None`` for an empty ``start``.
        """
        if start in hierarchy:
            return hierarchy[start]

        if start is None or start == '':
            return None

        path = "/" + start
        depth = 0
        user = start
        visited = set([start])
        while user != self.ceo_user_id and user in user_mgr_map:
            manager = user_mgr_map[user]
            if manager in visited:
                # A self-managed user is expected to end the chain quietly;
                # any other repeat is a cycle that would otherwise loop forever.
                if manager != user:
                    self.logger.warn("Cycle detected in manager chain of [%s] at [%s]; path truncated"
                                     % (start, user))
                break
            visited.add(manager)
            user = manager
            path = "/" + user + path
            depth += 1

        hierarchy[start] = (path, depth)
        if len(hierarchy) % 1000 == 0:
            self.logger.info("%d hierarchy path created in cache so far. [%s]" % (len(hierarchy), start))

        return (path, depth)
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: the first command-line argument is handed to the
    # transform unchanged (presumably the job properties supplied by the
    # launcher - confirm against the caller), then the job is run.
    job_args = sys.argv[1]
    LdapTransform(job_args).run()
|