mirror of
https://github.com/datahub-project/datahub.git
synced 2025-10-16 03:18:45 +00:00
440 lines
17 KiB
Python
#
|
|
# Copyright 2015 LinkedIn Corp. All rights reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
#
|
|
|
|
from wherehows.common.enums import SchedulerType
|
|
|
|
|
|
__author__ = 'zechen'
|
|
|
|
from wherehows.common import Constant
|
|
from com.ziclix.python.sql import zxJDBC
|
|
import sys
|
|
|
|
|
|
class SchedulerTransform:
    """Load extracted scheduler metadata into the WhereHows staging tables.

    A scheduler extractor (Azkaban, Oozie, ...) writes CSV files for flows,
    jobs, DAG edges, owners, schedules and executions under
    ``<app_folder>/<scheduler_type>/<app_id>/``. This class bulk-loads each
    file into its ``stg_*`` table, then resolves surrogate flow/job ids from
    the ``flow_source_id_map`` / ``job_source_id_map`` mapping tables.

    Runs under Jython: the warehouse connection goes through zxJDBC
    (the JDBC bridge), and ``args`` is the dict-like job property map.
    """

    # Per-entity load descriptors: staging-table column list, CSV file name
    # and target staging table. Subclasses may override entries.
    _tables = {
        "flows": {
            "columns": "app_id, flow_name, flow_path, flow_level, source_version, source_created_time, source_modified_time, wh_etl_exec_id",
            "file": "flow.csv",
            "table": "stg_flow"},
        "jobs": {
            "columns": "app_id, flow_path, source_version, job_name, job_path, job_type, wh_etl_exec_id",
            "file": "job.csv",
            "table": "stg_flow_job"},
        "dags": {
            "columns": "app_id, flow_path, source_version, source_job_path, target_job_path, wh_etl_exec_id",
            "file": "dag.csv",
            "table": "stg_flow_dag_edge"},
        "owners": {
            "columns": "app_id, flow_path, owner_id, wh_etl_exec_id",
            "file": "owner.csv",
            "table": "stg_flow_owner_permission"},
        "schedules": {
            "columns": "app_id, flow_path, unit, frequency, effective_start_time, effective_end_time, ref_id, wh_etl_exec_id",
            "file": "schedule.csv",
            "table": "stg_flow_schedule"},
        "flow_execs": {
            "columns": "app_id, flow_name, flow_path, flow_exec_uuid, source_version, flow_exec_status, attempt_id, executed_by, start_time, end_time, wh_etl_exec_id",
            "file": "flow_exec.csv",
            "table": "stg_flow_execution"},
        "job_execs": {
            "columns": "app_id, flow_path, flow_exec_uuid, source_version, job_name, job_path, job_exec_uuid, job_exec_status, attempt_id, start_time, end_time, wh_etl_exec_id",
            "file": "job_exec.csv",
            "table": "stg_job_execution"},
    }

    # MySQL bulk load: fields are separated by the SUB control char (\x1a)
    # and escaped by NUL, matching what the extractors emit.
    _read_file_template = """
    LOAD DATA LOCAL INFILE '{folder}/{file}'
    INTO TABLE {table}
    FIELDS TERMINATED BY '\x1a' ESCAPED BY '\0'
    LINES TERMINATED BY '\n'
    ({columns});
    """

    # Resolve the surrogate flow_id for every staged row by flow_path.
    _get_flow_id_template = """
    UPDATE {table} stg
    JOIN flow_source_id_map fm
    ON stg.app_id = fm.app_id AND stg.flow_path = fm.source_id_string
    SET stg.flow_id = fm.flow_id WHERE stg.app_id = {app_id}
    """

    # Resolve the surrogate job_id for every staged row by job_path.
    _get_job_id_template = """
    UPDATE {table} stg
    JOIN job_source_id_map jm
    ON stg.app_id = jm.app_id AND stg.job_path = jm.source_id_string
    SET stg.job_id = jm.job_id WHERE stg.app_id = {app_id}
    """

    # NOTE: the attribute name keeps its historical typo ("tempalte")
    # because subclasses and siblings reference it by this name.
    _clear_staging_tempalte = """
    DELETE FROM {table} WHERE app_id = {app_id}
    """

    def __init__(self, args, scheduler_type):
        """
        :param args: dict-like job property map holding the WH_DB_*
            connection settings, APP_ID and the application folder
            (keys come from wherehows.common.Constant).
        :param scheduler_type: SchedulerType value; selects the sub-folder
            that holds this scheduler's extracted CSV files.
        """
        self.app_id = int(args[Constant.APP_ID_KEY])
        self.wh_con = zxJDBC.connect(args[Constant.WH_DB_URL_KEY],
                                     args[Constant.WH_DB_USERNAME_KEY],
                                     args[Constant.WH_DB_PASSWORD_KEY],
                                     args[Constant.WH_DB_DRIVER_KEY])
        self.wh_cursor = self.wh_con.cursor()
        self.app_folder = args[Constant.WH_APP_FOLDER_KEY]
        self.metadata_folder = self.app_folder + "/" + str(scheduler_type) + "/" + str(self.app_id)

    def _execute(self, query):
        """Echo, execute and commit a single SQL statement."""
        print(query)
        self.wh_cursor.execute(query)
        self.wh_con.commit()

    def _clear_staging(self, table):
        """Delete this app's previous rows from a staging table."""
        self._execute(self._clear_staging_tempalte.format(table=table, app_id=self.app_id))

    def _load_csv(self, t):
        """Bulk-load the CSV described by table descriptor ``t``."""
        self._execute(self._read_file_template.format(
            folder=self.metadata_folder, file=t.get("file"),
            table=t.get("table"), columns=t.get("columns")))

    def _resolve_flow_id(self, table):
        """Fill flow_id on staged rows from flow_source_id_map."""
        self._execute(self._get_flow_id_template.format(table=table, app_id=self.app_id))

    def _resolve_job_id(self, table):
        """Fill job_id on staged rows from job_source_id_map."""
        self._execute(self._get_job_id_template.format(table=table, app_id=self.app_id))

    def run(self):
        """Load every CSV into staging; always close cursor/connection."""
        try:
            self.read_flow_file_to_stg()
            self.read_job_file_to_stg()
            self.read_dag_file_to_stg()
            self.read_flow_owner_file_to_stg()
            self.read_flow_schedule_file_to_stg()
            self.read_flow_exec_file_to_stg()
            self.read_job_exec_file_to_stg()
        finally:
            self.wh_cursor.close()
            self.wh_con.close()

    def read_flow_file_to_stg(self):
        """Stage flow.csv and assign/resolve surrogate flow ids."""
        t = self._tables["flows"]

        self._clear_staging(t.get("table"))
        self._load_csv(t)

        # Insert any flow_path not yet in the mapping table so the
        # AUTO_INCREMENT key generates its surrogate flow id.
        self._execute("""
        INSERT INTO flow_source_id_map (app_id, source_id_string)
        SELECT sf.app_id, sf.flow_path FROM {table} sf
        WHERE sf.app_id = {app_id}
        AND NOT EXISTS (SELECT * FROM flow_source_id_map where app_id = sf.app_id AND source_id_string = sf.flow_path)
        """.format(table=t.get("table"), app_id=self.app_id))

        self._resolve_flow_id(t.get("table"))

    def read_job_file_to_stg(self):
        """Stage job.csv; resolve flow/sub-flow/job ids and job type ids."""
        t = self._tables["jobs"]

        self._clear_staging(t.get("table"))
        self._load_csv(t)
        self._resolve_flow_id(t.get("table"))

        # Ad hoc fix: the loader writes the literal string 'null' for
        # missing ref_flow_path values; turn it back into SQL NULL.
        # TODO: better solution is to fix the load script itself.
        self._execute("""
        UPDATE {table} stg
        SET stg.ref_flow_path = null
        WHERE stg.ref_flow_path = 'null' and stg.app_id = {app_id}
        """.format(table=t.get("table"), app_id=self.app_id))

        # Resolve the sub-flow id referenced by ref_flow_path.
        self._execute("""
        UPDATE {table} stg
        JOIN flow_source_id_map fm
        ON stg.app_id = fm.app_id AND stg.ref_flow_path = fm.source_id_string
        SET stg.ref_flow_id = fm.flow_id WHERE stg.app_id = {app_id}
        """.format(table=t.get("table"), app_id=self.app_id))

        # Insert any job_path not yet in the mapping table to generate
        # its surrogate job id.
        self._execute("""
        INSERT INTO job_source_id_map (app_id, source_id_string)
        SELECT sj.app_id, sj.job_path FROM {table} sj
        WHERE sj.app_id = {app_id}
        AND NOT EXISTS (SELECT * FROM job_source_id_map where app_id = sj.app_id AND source_id_string = sj.job_path)
        """.format(table=t.get("table"), app_id=self.app_id))

        self._resolve_job_id(t.get("table"))

        # Map the scheduler-specific job type string to its type id.
        self._execute("""
        UPDATE {table} sj
        JOIN cfg_job_type_reverse_map jtm
        ON sj.job_type = jtm.job_type_actual
        SET sj.job_type_id = jtm.job_type_id
        WHERE sj.app_id = {app_id}
        """.format(table=t.get("table"), app_id=self.app_id))

    def read_dag_file_to_stg(self):
        """Stage dag.csv edges; derive pre/post job lists and DAG versions."""
        t = self._tables["dags"]

        self._clear_staging(t.get("table"))
        self._load_csv(t)
        self._resolve_flow_id(t.get("table"))

        # Resolve source_job_id for every edge.
        self._execute("""
        UPDATE {table} sj JOIN job_source_id_map jm ON sj.app_id = jm.app_id AND sj.source_job_path = jm.source_id_string
        SET sj.source_job_id = jm.job_id
        WHERE sj.app_id = {app_id}
        """.format(app_id=self.app_id, table=t.get("table")))

        # Resolve target_job_id for every edge.
        self._execute("""
        UPDATE {table} sj JOIN job_source_id_map jm ON sj.app_id = jm.app_id AND sj.target_job_path = jm.source_id_string
        SET sj.target_job_id = jm.job_id
        WHERE sj.app_id = {app_id}
        """.format(app_id=self.app_id, table=t.get("table")))

        # Raise group_concat limit so long pre/post job lists don't truncate.
        self._execute("SET group_concat_max_len=40960")

        # Aggregate each job's downstream neighbours into post_jobs.
        self._execute("""
        UPDATE stg_flow_job sj JOIN
        (SELECT source_job_id as job_id, source_version, GROUP_CONCAT(distinct target_job_id SEPARATOR ',') as post_jobs
        FROM {table} WHERE app_id = {app_id} AND source_job_id != target_job_id
        GROUP BY source_job_id, source_version) as d
        ON sj.job_id = d.job_id AND sj.source_version = d.source_version
        SET sj.post_jobs = d.post_jobs
        WHERE sj.app_id = {app_id};
        """.format(app_id=self.app_id, table=t.get("table")))

        # Aggregate each job's upstream neighbours into pre_jobs.
        self._execute("""
        UPDATE stg_flow_job sj JOIN
        (SELECT target_job_id as job_id, source_version, GROUP_CONCAT(distinct source_job_id SEPARATOR ',') as pre_jobs
        FROM {table} WHERE app_id = {app_id} AND source_job_id != target_job_id
        GROUP BY target_job_id, source_version) as d
        ON sj.job_id = d.job_id AND sj.source_version = d.source_version
        SET sj.pre_jobs = d.pre_jobs
        WHERE sj.app_id = {app_id};
        """.format(app_id=self.app_id, table=t.get("table")))

        # Jobs with no predecessors are flow entry points.
        # (Originally these two updates were not echoed; now consistent.)
        self._execute("""
        UPDATE stg_flow_job sj
        SET sj.is_first = 'Y'
        WHERE sj.pre_jobs IS NULL AND sj.app_id = {app_id}
        """.format(app_id=self.app_id))

        # Jobs with no successors are flow exit points.
        self._execute("""
        UPDATE stg_flow_job sj
        SET sj.is_last = 'Y'
        WHERE sj.post_jobs IS NULL AND sj.app_id = {app_id}
        """.format(app_id=self.app_id))

        self._clear_staging("stg_flow_dag")

        # Fingerprint each flow version's edge set with an md5 over the
        # ordered edge list; flows with no edges get md5 = 0.
        self._execute("""
        INSERT INTO stg_flow_dag (app_id, flow_id, source_version, wh_etl_exec_id, dag_md5)
        SELECT DISTINCT app_id, flow_id, source_version, wh_etl_exec_id, md5(group_concat(source_job_id, "-", target_job_id ORDER BY source_job_id,target_job_id SEPARATOR ",")) dag_md5
        FROM {table} t
        WHERE app_id = {app_id} GROUP BY app_id, flow_id, source_version, wh_etl_exec_id
        UNION
        SELECT DISTINCT app_id, flow_id, source_version, wh_etl_exec_id, 0
        FROM stg_flow sf
        WHERE sf.app_id = {app_id} AND NOT EXISTS (SELECT * FROM {table} t WHERE t.app_id = sf.app_id AND t.flow_id = sf.flow_id AND t.source_version = sf.source_version)
        """.format(app_id=self.app_id, table=t.get("table")))

        # Bump dag_version only when the edge-set fingerprint changed.
        self._execute("""
        UPDATE stg_flow_dag s
        LEFT JOIN flow_dag f
        ON s.app_id = f.app_id AND s.flow_id = f.flow_id AND (f.is_current IS NULL OR f.is_current = 'Y')
        SET s.dag_version = CASE WHEN f.dag_md5 IS NULL THEN 0 WHEN s.dag_md5 != f.dag_md5 THEN f.dag_version + 1 ELSE f.dag_version END
        WHERE s.app_id = {app_id}
        """.format(app_id=self.app_id))

        # Propagate the computed dag_version down to the staged jobs.
        self._execute("""
        UPDATE stg_flow_job sj
        JOIN
        (SELECT DISTINCT app_id, flow_id, source_version, dag_version FROM stg_flow_dag) dag
        ON sj.app_id = dag.app_id AND sj.flow_id = dag.flow_id AND sj.source_version = dag.source_version
        SET sj.dag_version = dag.dag_version
        WHERE sj.app_id = {app_id}
        """.format(app_id=self.app_id))

    def read_flow_owner_file_to_stg(self):
        """Stage owner.csv and resolve flow ids."""
        t = self._tables["owners"]
        self._clear_staging(t.get("table"))
        self._load_csv(t)
        self._resolve_flow_id(t.get("table"))

    def read_flow_schedule_file_to_stg(self):
        """Stage schedule.csv, resolve flow ids and flag scheduled flows."""
        t = self._tables["schedules"]

        self._clear_staging(t.get("table"))
        self._load_csv(t)
        self._resolve_flow_id(t.get("table"))

        # A flow is scheduled iff it has at least one schedule row.
        self._execute("""
        UPDATE stg_flow f
        LEFT JOIN {table} fs
        ON f.flow_id = fs.flow_id AND f.app_id = fs.app_id
        SET f.is_scheduled = CASE WHEN fs.flow_id IS NULL THEN 'N' ELSE 'Y' END
        WHERE f.app_id = {app_id}
        """.format(table=t.get("table"), app_id=self.app_id))

    def read_flow_exec_file_to_stg(self):
        """Stage flow_exec.csv and resolve flow ids."""
        t = self._tables["flow_execs"]
        self._clear_staging(t.get("table"))
        self._load_csv(t)
        self._resolve_flow_id(t.get("table"))

    def read_job_exec_file_to_stg(self):
        """Stage job_exec.csv and resolve both flow and job ids."""
        t = self._tables["job_execs"]
        self._clear_staging(t.get("table"))
        self._load_csv(t)
        self._resolve_flow_id(t.get("table"))
        self._resolve_job_id(t.get("table"))
|
|
|
|
if __name__ == "__main__":
    # Entry point when launched by the WhereHows ETL job runner.
    # NOTE(review): sys.argv[1] is passed straight to SchedulerTransform,
    # whose __init__ indexes it with Constant.* keys — presumably the
    # Jython launcher injects a dict-like properties object here rather
    # than a plain string; confirm against the job launcher.
    props = sys.argv[1]
    st = SchedulerTransform(props, SchedulerType.GENERIC)
    st.run()
|