mirror of
https://github.com/datahub-project/datahub.git
synced 2025-10-15 10:57:58 +00:00
Update Azkaban_Execution job to fetch cronExpression in flow scheduling
This commit is contained in:
commit
c9f4f18d9c
@ -95,8 +95,8 @@ CREATE TABLE flow_job (
|
||||
job_type_id SMALLINT COMMENT 'type id of the job',
|
||||
job_type VARCHAR(63) COMMENT 'type of the job',
|
||||
ref_flow_id INT UNSIGNED NULL COMMENT 'the reference flow id of the job if the job is a subflow',
|
||||
pre_jobs VARCHAR(4096) COMMENT 'comma separated job ids that run before this job',
|
||||
post_jobs VARCHAR(4096) COMMENT 'comma separated job ids that run after this job',
|
||||
pre_jobs VARCHAR(20000) CHAR SET latin1 COMMENT 'comma separated job ids that run before this job',
|
||||
post_jobs VARCHAR(20000) CHAR SET latin1 COMMENT 'comma separated job ids that run after this job',
|
||||
is_current CHAR(1) COMMENT 'determine if it is a current job',
|
||||
is_first CHAR(1) COMMENT 'determine if it is the first job',
|
||||
is_last CHAR(1) COMMENT 'determine if it is the last job',
|
||||
@ -126,8 +126,8 @@ CREATE TABLE stg_flow_job (
|
||||
job_type VARCHAR(63) COMMENT 'type of the job',
|
||||
ref_flow_id INT UNSIGNED NULL COMMENT 'the reference flow id of the job if the job is a subflow',
|
||||
ref_flow_path VARCHAR(1024) COMMENT 'the reference flow path of the job if the job is a subflow',
|
||||
pre_jobs VARCHAR(4096) COMMENT 'comma separated job ids that run before this job',
|
||||
post_jobs VARCHAR(4096) COMMENT 'comma separated job ids that run after this job',
|
||||
pre_jobs VARCHAR(20000) CHAR SET latin1 COMMENT 'comma separated job ids that run before this job',
|
||||
post_jobs VARCHAR(20000) CHAR SET latin1 COMMENT 'comma separated job ids that run after this job',
|
||||
is_current CHAR(1) COMMENT 'determine if it is a current job',
|
||||
is_first CHAR(1) COMMENT 'determine if it is the first job',
|
||||
is_last CHAR(1) COMMENT 'determine if it is the last job',
|
||||
@ -366,6 +366,7 @@ CREATE TABLE flow_schedule (
|
||||
COMMENT 'flow id',
|
||||
unit VARCHAR(31) COMMENT 'unit of time',
|
||||
frequency INT COMMENT 'frequency of the unit',
|
||||
cron_expression VARCHAR(127) COMMENT 'cron expression',
|
||||
is_active CHAR(1) COMMENT 'determine if it is an active schedule',
|
||||
included_instances VARCHAR(127) COMMENT 'included instance',
|
||||
excluded_instances VARCHAR(127) COMMENT 'excluded instance',
|
||||
@ -389,6 +390,7 @@ CREATE TABLE stg_flow_schedule (
|
||||
flow_path VARCHAR(1024) COMMENT 'flow path from top level',
|
||||
unit VARCHAR(31) COMMENT 'unit of time',
|
||||
frequency INT COMMENT 'frequency of the unit',
|
||||
cron_expression VARCHAR(127) COMMENT 'cron expression',
|
||||
included_instances VARCHAR(127) COMMENT 'included instance',
|
||||
excluded_instances VARCHAR(127) COMMENT 'excluded instance',
|
||||
effective_start_time INT UNSIGNED COMMENT 'effective start time of the flow execution',
|
||||
@ -438,7 +440,7 @@ CREATE TABLE stg_flow_owner_permission (
|
||||
DEFAULT CHARSET = utf8
|
||||
COMMENT = 'Scheduler owner table' PARTITION BY HASH (app_id) PARTITIONS 8;
|
||||
|
||||
CREATE TABLE job_execution_ext_reference (
|
||||
CREATE TABLE job_execution_ext_reference (
|
||||
app_id smallint(5) UNSIGNED COMMENT 'application id of the flow' NOT NULL,
|
||||
job_exec_id bigint(20) UNSIGNED COMMENT 'job execution id either inherit or generated' NOT NULL,
|
||||
attempt_id smallint(6) COMMENT 'job execution attempt id' DEFAULT '0',
|
||||
@ -463,11 +465,11 @@ PARTITION BY HASH(app_id)
|
||||
PARTITION p7)
|
||||
;
|
||||
|
||||
CREATE INDEX idx_job_execution_ext_ref__ext_ref_id USING BTREE
|
||||
CREATE INDEX idx_job_execution_ext_ref__ext_ref_id USING BTREE
|
||||
ON job_execution_ext_reference(ext_ref_id);
|
||||
|
||||
|
||||
CREATE TABLE stg_job_execution_ext_reference (
|
||||
CREATE TABLE stg_job_execution_ext_reference (
|
||||
app_id smallint(5) UNSIGNED COMMENT 'application id of the flow' NOT NULL,
|
||||
job_exec_id bigint(20) UNSIGNED COMMENT 'job execution id either inherit or generated' NOT NULL,
|
||||
attempt_id smallint(6) COMMENT 'job execution attempt id' DEFAULT '0',
|
||||
|
@ -224,13 +224,18 @@ class AzkabanExtract:
|
||||
# print json.dumps(row[json_column], indent=4)
|
||||
|
||||
if row[json_column]["triggerCondition"]["checkers"][0]["checkerJson"]["isRecurring"] == 'true':
|
||||
unit = row[json_column]["triggerCondition"]["checkers"][0]["checkerJson"]["period"][-1:]
|
||||
unit = self._period_unit_table[unit]
|
||||
frequency = int(row[json_column]["triggerCondition"]["checkers"][0]["checkerJson"]["period"][:-1])
|
||||
unit, frequency, cron_expr = None, None, None
|
||||
period = row[json_column]["triggerCondition"]["checkers"][0]["checkerJson"]["period"]
|
||||
if period is not None and period != "null" and period[-1:] in self._period_unit_table:
|
||||
unit = self._period_unit_table[period[-1:]]
|
||||
frequency = int(row[json_column]["triggerCondition"]["checkers"][0]["checkerJson"]["period"][:-1])
|
||||
if "cronExpression" in row[json_column]["triggerCondition"]["checkers"][0]["checkerJson"]:
|
||||
cron_expr = row[json_column]["triggerCondition"]["checkers"][0]["checkerJson"]["cronExpression"]
|
||||
schedule_record = AzkabanFlowScheduleRecord(self.app_id,
|
||||
row[json_column]["actions"][0]["actionJson"]["projectName"] + ':' + row[json_column]["actions"][0]["actionJson"]["flowName"],
|
||||
unit,
|
||||
frequency,
|
||||
cron_expr,
|
||||
long(row[json_column]["triggerCondition"]["checkers"][0]["checkerJson"]["firstCheckTime"]) / 1000,
|
||||
int(time.mktime(datetime.date(2099,12,31).timetuple())),
|
||||
'0',
|
||||
|
@ -163,6 +163,7 @@ class OozieExtract:
|
||||
row['app_path'],
|
||||
row['time_unit'],
|
||||
int(row['frequency']),
|
||||
None,
|
||||
row['start_time'],
|
||||
row['end_time'],
|
||||
row['ref_id'],
|
||||
|
@ -136,15 +136,16 @@ class SchedulerLoad:
|
||||
self.wh_con.commit()
|
||||
|
||||
cmd = """
|
||||
INSERT INTO flow_schedule (app_id, flow_id, unit, frequency, included_instances, excluded_instances, effective_start_time, effective_end_time, is_active, ref_id,
|
||||
INSERT INTO flow_schedule (app_id, flow_id, unit, frequency, cron_expression, included_instances, excluded_instances, effective_start_time, effective_end_time, is_active, ref_id,
|
||||
created_time, modified_time, wh_etl_exec_id)
|
||||
SELECT app_id, flow_id, unit, frequency, included_instances, excluded_instances, effective_start_time, effective_end_time, 'Y', ref_id,
|
||||
SELECT app_id, flow_id, unit, frequency, cron_expression, included_instances, excluded_instances, effective_start_time, effective_end_time, 'Y', ref_id,
|
||||
unix_timestamp(NOW()) created_time, NULL modified_time, wh_etl_exec_id
|
||||
FROM stg_flow_schedule s
|
||||
WHERE s.app_id = {app_id} AND s.flow_id IS NOT NULL
|
||||
ON DUPLICATE KEY UPDATE
|
||||
unit = s.unit,
|
||||
frequency = s.frequency,
|
||||
cron_expression = s.cron_expression,
|
||||
is_active = 'Y',
|
||||
ref_id = s.ref_id,
|
||||
included_instances = s.included_instances,
|
||||
|
@ -33,7 +33,7 @@ class SchedulerTransform:
|
||||
"owners": {"columns": "app_id, flow_path, owner_id, wh_etl_exec_id",
|
||||
"file": "owner.csv",
|
||||
"table": "stg_flow_owner_permission"},
|
||||
"schedules": {"columns": "app_id, flow_path, unit, frequency, effective_start_time, effective_end_time, ref_id, wh_etl_exec_id",
|
||||
"schedules": {"columns": "app_id, flow_path, unit, frequency, cron_expression, effective_start_time, effective_end_time, ref_id, wh_etl_exec_id",
|
||||
"file": "schedule.csv",
|
||||
"table": "stg_flow_schedule"},
|
||||
"flow_execs": {"columns": "app_id, flow_name, flow_path, flow_exec_uuid, source_version, flow_exec_status, attempt_id, executed_by, start_time, end_time, wh_etl_exec_id",
|
||||
|
@ -25,17 +25,19 @@ public class AzkabanFlowScheduleRecord extends AbstractRecord {
|
||||
String flowPath;
|
||||
String unit;
|
||||
Integer frequency;
|
||||
String cronExpression;
|
||||
Long effectiveStartTime;
|
||||
Long effectiveEndTime;
|
||||
String refId;
|
||||
Long whExecId;
|
||||
|
||||
public AzkabanFlowScheduleRecord(Integer appId, String flowPath, String unit, Integer frequency,
|
||||
Long effectiveStartTime, Long effectiveEndTime, String refId, Long whExecId) {
|
||||
String cronExpression, Long effectiveStartTime, Long effectiveEndTime, String refId, Long whExecId) {
|
||||
this.appId = appId;
|
||||
this.flowPath = flowPath;
|
||||
this.unit = unit;
|
||||
this.frequency = frequency;
|
||||
this.cronExpression = cronExpression;
|
||||
this.effectiveStartTime = effectiveStartTime;
|
||||
this.effectiveEndTime = effectiveEndTime;
|
||||
this.refId = refId;
|
||||
@ -49,6 +51,7 @@ public class AzkabanFlowScheduleRecord extends AbstractRecord {
|
||||
allFields.add(flowPath);
|
||||
allFields.add(unit);
|
||||
allFields.add(frequency);
|
||||
allFields.add(cronExpression);
|
||||
allFields.add(effectiveStartTime);
|
||||
allFields.add(effectiveEndTime);
|
||||
allFields.add(refId);
|
||||
|
@ -18,7 +18,7 @@ package wherehows.common.schemas;
|
||||
*/
|
||||
public class OozieFlowScheduleRecord extends AzkabanFlowScheduleRecord {
|
||||
public OozieFlowScheduleRecord(Integer appId, String flowPath, String frequency, Integer interval,
|
||||
Long effectiveStartTime, Long effectiveEndTime, String refId, Long whExecId) {
|
||||
super(appId, flowPath, frequency, interval, effectiveStartTime, effectiveEndTime, refId, whExecId);
|
||||
String cronExpression, Long effectiveStartTime, Long effectiveEndTime, String refId, Long whExecId) {
|
||||
super(appId, flowPath, frequency, interval, cronExpression, effectiveStartTime, effectiveEndTime, refId, whExecId);
|
||||
}
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user