# Source: datahub/metadata-etl/src/main/resources/jython/SchedulerTransform.py
# Snapshot: 2015-12-17 16:26:15 -08:00 (440 lines, 17 KiB, Python/Jython)
#
# Copyright 2015 LinkedIn Corp. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#
from wherehows.common.enums import SchedulerType
__author__ = 'zechen'
from wherehows.common import Constant
from com.ziclix.python.sql import zxJDBC
import sys
class SchedulerTransform:
    """Load scheduler metadata CSV extracts into WhereHows staging tables.

    The extract step writes one CSV per entity (flows, jobs, dag edges,
    owners, schedules, flow/job executions) under
    ``<app_folder>/<scheduler_type>/<app_id>/``.  Each ``read_*`` method:

      1. clears the entity's staging table for this app_id,
      2. bulk-loads the CSV via ``LOAD DATA LOCAL INFILE``,
      3. resolves surrogate ids (flow_id / job_id) from the
         ``flow_source_id_map`` / ``job_source_id_map`` tables,

    plus any entity-specific post-processing.  Every statement is logged
    to stdout and committed individually.
    """

    # Per-entity load configuration: target staging table, CSV file name,
    # and the column list matching the CSV field order.
    _tables = {
        "flows": {"columns": "app_id, flow_name, flow_path, flow_level, source_version, source_created_time, source_modified_time, wh_etl_exec_id",
                  "file": "flow.csv",
                  "table": "stg_flow"},
        "jobs": {"columns": "app_id, flow_path, source_version, job_name, job_path, job_type, wh_etl_exec_id",
                 "file": "job.csv",
                 "table": "stg_flow_job"},
        "dags": {"columns": "app_id, flow_path, source_version, source_job_path, target_job_path, wh_etl_exec_id",
                 "file": "dag.csv",
                 "table": "stg_flow_dag_edge"},
        "owners": {"columns": "app_id, flow_path, owner_id, wh_etl_exec_id",
                   "file": "owner.csv",
                   "table": "stg_flow_owner_permission"},
        "schedules": {"columns": "app_id, flow_path, unit, frequency, effective_start_time, effective_end_time, ref_id, wh_etl_exec_id",
                      "file": "schedule.csv",
                      "table": "stg_flow_schedule"},
        "flow_execs": {"columns": "app_id, flow_name, flow_path, flow_exec_uuid, source_version, flow_exec_status, attempt_id, executed_by, start_time, end_time, wh_etl_exec_id",
                       "file": "flow_exec.csv",
                       "table": "stg_flow_execution"},
        "job_execs": {"columns": "app_id, flow_path, flow_exec_uuid, source_version, job_name, job_path, job_exec_uuid, job_exec_status, attempt_id, start_time, end_time, wh_etl_exec_id",
                      "file": "job_exec.csv",
                      "table": "stg_job_execution"}
    }

    # Bulk-load template.  Fields are separated by the 0x1A (SUB) control
    # character and escaped by NUL, matching the extract step's writer.
    _read_file_template = """
        LOAD DATA LOCAL INFILE '{folder}/{file}'
        INTO TABLE {table}
        FIELDS TERMINATED BY '\x1a' ESCAPED BY '\0'
        LINES TERMINATED BY '\n'
        ({columns});
        """

    # Resolve flow_id from the mapping table by (app_id, flow_path).
    _get_flow_id_template = """
        UPDATE {table} stg
        JOIN flow_source_id_map fm
        ON stg.app_id = fm.app_id AND stg.flow_path = fm.source_id_string
        SET stg.flow_id = fm.flow_id WHERE stg.app_id = {app_id}
        """

    # Resolve job_id from the mapping table by (app_id, job_path).
    _get_job_id_template = """
        UPDATE {table} stg
        JOIN job_source_id_map jm
        ON stg.app_id = jm.app_id AND stg.job_path = jm.source_id_string
        SET stg.job_id = jm.job_id WHERE stg.app_id = {app_id}
        """

    # Delete this app's rows from a staging table.
    _clear_staging_template = """
        DELETE FROM {table} WHERE app_id = {app_id}
        """
    # Backward-compatible alias: the original attribute name was
    # misspelled and subclasses may still reference it.
    _clear_staging_tempalte = _clear_staging_template

    def __init__(self, args, scheduler_type):
        """Open the WhereHows DB connection and locate the metadata folder.

        :param args: dict-like job properties containing DB url/user/
            password/driver, the app id and the app folder.
        :param scheduler_type: SchedulerType value; its string form names
            the sub-folder under the app folder.
        """
        self.app_id = int(args[Constant.APP_ID_KEY])
        self.wh_con = zxJDBC.connect(args[Constant.WH_DB_URL_KEY],
                                     args[Constant.WH_DB_USERNAME_KEY],
                                     args[Constant.WH_DB_PASSWORD_KEY],
                                     args[Constant.WH_DB_DRIVER_KEY])
        self.wh_cursor = self.wh_con.cursor()
        self.app_folder = args[Constant.WH_APP_FOLDER_KEY]
        self.metadata_folder = self.app_folder + "/" + str(scheduler_type) + "/" + str(self.app_id)

    def run(self):
        """Load every entity's CSV into staging; always release the DB."""
        try:
            self.read_flow_file_to_stg()
            self.read_job_file_to_stg()
            self.read_dag_file_to_stg()
            self.read_flow_owner_file_to_stg()
            self.read_flow_schedule_file_to_stg()
            self.read_flow_exec_file_to_stg()
            self.read_job_exec_file_to_stg()
        finally:
            self.wh_cursor.close()
            self.wh_con.close()

    # ---- shared statement helpers -------------------------------------

    def _execute(self, query):
        """Log a single SQL statement, execute it and commit."""
        print(query)
        self.wh_cursor.execute(query)
        self.wh_con.commit()

    def _clear_staging(self, t):
        """Clear this app's rows from the entity's staging table."""
        # Use the original (misspelled) attribute so subclass overrides
        # of it still take effect.
        self._execute(self._clear_staging_tempalte.format(
            table=t.get("table"), app_id=self.app_id))

    def _load_csv(self, t):
        """Bulk-load the entity's CSV file into its staging table."""
        self._execute(self._read_file_template.format(
            folder=self.metadata_folder, file=t.get("file"),
            table=t.get("table"), columns=t.get("columns")))

    def _update_flow_id(self, t):
        """Fill flow_id in the staging table from flow_source_id_map."""
        self._execute(self._get_flow_id_template.format(
            table=t.get("table"), app_id=self.app_id))

    def _update_job_id(self, t):
        """Fill job_id in the staging table from job_source_id_map."""
        self._execute(self._get_job_id_template.format(
            table=t.get("table"), app_id=self.app_id))

    # ---- per-entity loads ---------------------------------------------

    def read_flow_file_to_stg(self):
        """Load flows, minting new flow ids for unseen flow paths."""
        t = self._tables["flows"]
        self._clear_staging(t)
        self._load_csv(t)
        # Insert new flows into the mapping table to generate flow ids
        self._execute("""
            INSERT INTO flow_source_id_map (app_id, source_id_string)
            SELECT sf.app_id, sf.flow_path FROM {table} sf
            WHERE sf.app_id = {app_id}
            AND NOT EXISTS (SELECT * FROM flow_source_id_map where app_id = sf.app_id AND source_id_string = sf.flow_path)
            """.format(table=t.get("table"), app_id=self.app_id))
        self._update_flow_id(t)

    def read_job_file_to_stg(self):
        """Load jobs; resolve flow/sub-flow/job ids and job type ids."""
        t = self._tables["jobs"]
        self._clear_staging(t)
        self._load_csv(t)
        self._update_flow_id(t)
        # Ad hoc fix for the literal string 'null' loaded from the CSV;
        # a better solution would change the load script itself.
        self._execute("""
            UPDATE {table} stg
            SET stg.ref_flow_path = null
            WHERE stg.ref_flow_path = 'null' and stg.app_id = {app_id}
            """.format(table=t.get("table"), app_id=self.app_id))
        # Resolve sub-flow id from the flow mapping table
        self._execute("""
            UPDATE {table} stg
            JOIN flow_source_id_map fm
            ON stg.app_id = fm.app_id AND stg.ref_flow_path = fm.source_id_string
            SET stg.ref_flow_id = fm.flow_id WHERE stg.app_id = {app_id}
            """.format(table=t.get("table"), app_id=self.app_id))
        # Insert new jobs into the mapping table to generate job ids
        self._execute("""
            INSERT INTO job_source_id_map (app_id, source_id_string)
            SELECT sj.app_id, sj.job_path FROM {table} sj
            WHERE sj.app_id = {app_id}
            AND NOT EXISTS (SELECT * FROM job_source_id_map where app_id = sj.app_id AND source_id_string = sj.job_path)
            """.format(table=t.get("table"), app_id=self.app_id))
        self._update_job_id(t)
        # Resolve job type id via the job type reverse map
        self._execute("""
            UPDATE {table} sj
            JOIN cfg_job_type_reverse_map jtm
            ON sj.job_type = jtm.job_type_actual
            SET sj.job_type_id = jtm.job_type_id
            WHERE sj.app_id = {app_id}
            """.format(table=t.get("table"), app_id=self.app_id))

    def read_dag_file_to_stg(self):
        """Load dag edges; derive pre/post job lists and dag versions."""
        t = self._tables["dags"]
        self._clear_staging(t)
        self._load_csv(t)
        self._update_flow_id(t)
        # Resolve source_job_id
        self._execute("""
            UPDATE {table} sj JOIN job_source_id_map jm ON sj.app_id = jm.app_id AND sj.source_job_path = jm.source_id_string
            SET sj.source_job_id = jm.job_id
            WHERE sj.app_id = {app_id}
            """.format(app_id=self.app_id, table=t.get("table")))
        # Resolve target_job_id
        self._execute("""
            UPDATE {table} sj JOIN job_source_id_map jm ON sj.app_id = jm.app_id AND sj.target_job_path = jm.source_id_string
            SET sj.target_job_id = jm.job_id
            WHERE sj.app_id = {app_id}
            """.format(app_id=self.app_id, table=t.get("table")))
        # Update pre_jobs and post_jobs in the stg_flow_job table; raise
        # group_concat max length to avoid silent truncation of long lists
        self._execute("SET group_concat_max_len=40960")
        self._execute("""
            UPDATE stg_flow_job sj JOIN
            (SELECT source_job_id as job_id, source_version, GROUP_CONCAT(distinct target_job_id SEPARATOR ',') as post_jobs
            FROM {table} WHERE app_id = {app_id} AND source_job_id != target_job_id
            GROUP BY source_job_id, source_version) as d
            ON sj.job_id = d.job_id AND sj.source_version = d.source_version
            SET sj.post_jobs = d.post_jobs
            WHERE sj.app_id = {app_id};
            """.format(app_id=self.app_id, table=t.get("table")))
        self._execute("""
            UPDATE stg_flow_job sj JOIN
            (SELECT target_job_id as job_id, source_version, GROUP_CONCAT(distinct source_job_id SEPARATOR ',') as pre_jobs
            FROM {table} WHERE app_id = {app_id} AND source_job_id != target_job_id
            GROUP BY target_job_id, source_version) as d
            ON sj.job_id = d.job_id AND sj.source_version = d.source_version
            SET sj.pre_jobs = d.pre_jobs
            WHERE sj.app_id = {app_id};
            """.format(app_id=self.app_id, table=t.get("table")))
        # Jobs with no predecessors/successors are flow entry/exit points.
        # (These two statements are now logged like every other one; the
        # original skipped printing them, which looked like an oversight.)
        self._execute("""
            UPDATE stg_flow_job sj
            SET sj.is_first = 'Y'
            WHERE sj.pre_jobs IS NULL AND sj.app_id = {app_id}
            """.format(app_id=self.app_id))
        self._execute("""
            UPDATE stg_flow_job sj
            SET sj.is_last = 'Y'
            WHERE sj.post_jobs IS NULL AND sj.app_id = {app_id}
            """.format(app_id=self.app_id))
        # Rebuild stg_flow_dag: one row per (flow, version) with an md5
        # fingerprint of the edge list (0 for flows with no edges).
        self._execute(self._clear_staging_tempalte.format(
            table="stg_flow_dag", app_id=self.app_id))
        self._execute("""
            INSERT INTO stg_flow_dag (app_id, flow_id, source_version, wh_etl_exec_id, dag_md5)
            SELECT DISTINCT app_id, flow_id, source_version, wh_etl_exec_id, md5(group_concat(source_job_id, "-", target_job_id ORDER BY source_job_id,target_job_id SEPARATOR ",")) dag_md5
            FROM {table} t
            WHERE app_id = {app_id} GROUP BY app_id, flow_id, source_version, wh_etl_exec_id
            UNION
            SELECT DISTINCT app_id, flow_id, source_version, wh_etl_exec_id, 0
            FROM stg_flow sf
            WHERE sf.app_id = {app_id} AND NOT EXISTS (SELECT * FROM {table} t WHERE t.app_id = sf.app_id AND t.flow_id = sf.flow_id AND t.source_version = sf.source_version)
            """.format(app_id=self.app_id, table=t.get("table")))
        # Bump dag_version when the fingerprint changed vs. the current dag
        self._execute("""
            UPDATE stg_flow_dag s
            LEFT JOIN flow_dag f
            ON s.app_id = f.app_id AND s.flow_id = f.flow_id AND (f.is_current IS NULL OR f.is_current = 'Y')
            SET s.dag_version = CASE WHEN f.dag_md5 IS NULL THEN 0 WHEN s.dag_md5 != f.dag_md5 THEN f.dag_version + 1 ELSE f.dag_version END
            WHERE s.app_id = {app_id}
            """.format(app_id=self.app_id))
        # Propagate the dag version onto the staged jobs
        self._execute("""
            UPDATE stg_flow_job sj
            JOIN
            (SELECT DISTINCT app_id, flow_id, source_version, dag_version FROM stg_flow_dag) dag
            ON sj.app_id = dag.app_id AND sj.flow_id = dag.flow_id AND sj.source_version = dag.source_version
            SET sj.dag_version = dag.dag_version
            WHERE sj.app_id = {app_id}
            """.format(app_id=self.app_id))

    def read_flow_owner_file_to_stg(self):
        """Load flow owners and resolve flow ids."""
        t = self._tables["owners"]
        self._clear_staging(t)
        self._load_csv(t)
        self._update_flow_id(t)

    def read_flow_schedule_file_to_stg(self):
        """Load schedules; flag scheduled flows in stg_flow."""
        t = self._tables["schedules"]
        self._clear_staging(t)
        self._load_csv(t)
        self._update_flow_id(t)
        # A flow is scheduled iff it has at least one schedule row
        self._execute("""
            UPDATE stg_flow f
            LEFT JOIN {table} fs
            ON f.flow_id = fs.flow_id AND f.app_id = fs.app_id
            SET f.is_scheduled = CASE WHEN fs.flow_id IS NULL THEN 'N' ELSE 'Y' END
            WHERE f.app_id = {app_id}
            """.format(table=t.get("table"), app_id=self.app_id))

    def read_flow_exec_file_to_stg(self):
        """Load flow executions and resolve flow ids."""
        t = self._tables["flow_execs"]
        self._clear_staging(t)
        self._load_csv(t)
        self._update_flow_id(t)

    def read_job_exec_file_to_stg(self):
        """Load job executions and resolve flow and job ids."""
        t = self._tables["job_execs"]
        self._clear_staging(t)
        self._load_csv(t)
        self._update_flow_id(t)
        self._update_job_id(t)
if __name__ == "__main__":
    # Entry point for the generic scheduler transform.  The first script
    # argument is presumably a dict-like properties object supplied by
    # the ETL launcher (it is indexed by Constant.* keys in __init__) —
    # TODO confirm against the Java-side caller.
    SchedulerTransform(sys.argv[1], SchedulerType.GENERIC).run()