xxl-job 告警逻辑梳理

130 阅读2分钟

循环扫描失败记录,触发告警

JobFailMonitorHelper

// Background monitor thread: roughly every 10 seconds it scans for failed job
// logs, re-triggers those that still have retries left, and raises the alarm.
monitorThread = new Thread(new Runnable() {
   @Override
   public void run() {
      // monitor loop, runs until toStop is set
      while (!toStop) {
         try {
            // query up to 1000 failed execution log ids
            List<Long> failLogIds = XxlJobAdminConfig.getAdminConfig().getXxlJobLogDao().findFailJobLogIds(1000);
            if (failLogIds!=null && !failLogIds.isEmpty()) {
               for (long failLogId: failLogIds) {
                  // lock log: switch alarm status 0 -> -1; skip the record when nothing was updated
                  int lockRet = XxlJobAdminConfig.getAdminConfig().getXxlJobLogDao().updateAlarmStatus(failLogId, 0, -1);
                  if (lockRet < 1) {
                     continue;
                  }
                  // load the failed execution log
                  XxlJobLog log = XxlJobAdminConfig.getAdminConfig().getXxlJobLogDao().load(failLogId);
                  // load the job definition the log belongs to
                  XxlJobInfo info = XxlJobAdminConfig.getAdminConfig().getXxlJobInfoDao().loadById(log.getJobId());
                  // 1、fail retry monitor: only when the remaining retry count is greater than 0
                  if (log.getExecutorFailRetryCount() > 0) {
                     // trigger the retry with the remaining retry count decremented by one
                     JobTriggerPoolHelper.trigger(log.getJobId(), TriggerTypeEnum.RETRY, (log.getExecutorFailRetryCount()-1), log.getExecutorShardingParam(), log.getExecutorParam(), null);
                     // inner double quotes must be escaped, otherwise the literal ends early and the line does not compile
                     String retryMsg = "<br><br><span style=\"color:#F39C12;\" > >>>>>>>>>>>"+ I18nUtil.getString("jobconf_trigger_type_retry") +"<<<<<<<<<<< </span><br>";
                     log.setTriggerMsg(log.getTriggerMsg() + retryMsg);
                     // persist the log with the retry marker appended to its trigger message
                     XxlJobAdminConfig.getAdminConfig().getXxlJobLogDao().updateTriggerInfo(log);
                  }
                  // 2、fail alarm monitor
                  int newAlarmStatus = 0;       // alarm status: 0-default, -1-locked, 1-no alarm needed, 2-alarm succeeded, 3-alarm failed
                  if (info != null) {
                     // send the alarm and record whether it succeeded
                     boolean alarmResult = XxlJobAdminConfig.getAdminConfig().getJobAlarmer().alarm(info, log);
                     newAlarmStatus = alarmResult?2:3;
                  } else {
                     // job definition not found: mark as "no alarm needed"
                     newAlarmStatus = 1;
                  }
                  // release the lock by moving the alarm status from -1 to its final value
                  XxlJobAdminConfig.getAdminConfig().getXxlJobLogDao().updateAlarmStatus(failLogId, -1, newAlarmStatus);
               }
            }
         } catch (Throwable e) {
            if (!toStop) {
               logger.error(">>>>>>>>>>> xxl-job, job fail monitor thread error:{}", e);
            }
         }
         // pause 10 seconds between scans
         try {
            TimeUnit.SECONDS.sleep(10);
         } catch (Throwable e) {
            if (!toStop) {
               logger.error(e.getMessage(), e);
            }
         }
      }
      logger.info(">>>>>>>>>>> xxl-job, job fail monitor thread stop");
   }
});

失败记录从哪里来

任务执行时会创建日志数据

XxlJobTrigger.processTrigger

/**
 * Creates and saves the execution-log record for one trigger of a job.
 * Abridged: only the log-creation step is shown here.
 */
private static void processTrigger(XxlJobGroup group, XxlJobInfo jobInfo, int finalFailRetryCount, TriggerTypeEnum triggerType, int index, int total){
    // 1. build the log record for this trigger: owning group, job id and trigger time
    XxlJobLog triggerLog = new XxlJobLog();
    triggerLog.setJobGroup(jobInfo.getJobGroup());
    triggerLog.setJobId(jobInfo.getId());
    triggerLog.setTriggerTime(new Date());

    // save it through the log DAO
    XxlJobAdminConfig.getAdminConfig().getXxlJobLogDao().save(triggerLog);

    // note: the value logged under "jobId" is the log record's own id
    logger.debug(">>>>>>>>>>> xxl-job trigger start, jobId:{}", triggerLog.getId());

}

执行器端(executor,即本文所说的“服务端”)任务执行完成后,会推送回调以更新日志状态

com.xxl.job.core.thread.JobThread#run

// Abridged view of JobThread#run: the job-execution body and the exception
// handler are elided. What is shown is the callback pushed in the finally
// block, followed by the handling of triggers still left in triggerQueue.
public void run() {
       // job execution ... (elided)
      } catch (Throwable e) {
         // exception handling ... (elided)
      } finally {
               if(triggerParam != null) {
                   // callback handler info
                   if (!toStop) {
                       // common case: report the handle code and message held by XxlJobContext
                       TriggerCallbackThread.pushCallBack(new HandleCallbackParam(
                              triggerParam.getLogId(),
                     triggerParam.getLogDateTime(),
                     XxlJobContext.getXxlJobContext().getHandleCode(),
                     XxlJobContext.getXxlJobContext().getHandleMsg() )
               );
                   } else {
                       // is killed: report HANDLE_CODE_FAIL together with the stop reason
                       TriggerCallbackThread.pushCallBack(new HandleCallbackParam(
                              triggerParam.getLogId(),
                     triggerParam.getLogDateTime(),
                     XxlJobContext.HANDLE_CODE_FAIL,
                     stopReason + " [job running, killed]" )
               );
                   }
               }
           }
       }

   // callback trigger request in queue: every trigger still queued is polled
   // and reported as failed, since it was never executed
   while(triggerQueue !=null && triggerQueue.size()>0){
      TriggerParam triggerParam = triggerQueue.poll();
      if (triggerParam!=null) {
         // is killed
         TriggerCallbackThread.pushCallBack(new HandleCallbackParam(
               triggerParam.getLogId(),
               triggerParam.getLogDateTime(),
               XxlJobContext.HANDLE_CODE_FAIL,
               stopReason + " [job not executed, in the job queue, killed.]")
         );
      }
   }

}

执行器端(服务端)执行回调

TriggerCallbackThread

// Callback worker thread: takes callback params from callBackQueue and hands
// them to doCallback in batches; after toStop is set it flushes whatever is
// still queued one last time.
triggerCallbackThread = new Thread(new Runnable() {
    @Override
    public void run() {
        // normal callback
        while (!toStop) {
            try {
                HandleCallbackParam head = getInstance().callBackQueue.take();
                if (head == null) {
                    continue;
                }

                // callback list param: drainTo moves every element currently in the
                // queue into the list; the element taken above is appended last,
                // so one pass carries all pending callbacks
                List<HandleCallbackParam> batch = new ArrayList<HandleCallbackParam>();
                getInstance().callBackQueue.drainTo(batch);
                batch.add(head);

                // callback, will retry if error
                if (!batch.isEmpty()) {
                    doCallback(batch);
                }
            } catch (Throwable e) {
                if (!toStop) {
                    logger.error(e.getMessage(), e);
                }
            }
        }

        // last callback: one more flush after the loop has been stopped
        try {
            List<HandleCallbackParam> remaining = new ArrayList<HandleCallbackParam>();
            getInstance().callBackQueue.drainTo(remaining);
            if (!remaining.isEmpty()) {
                doCallback(remaining);
            }
        } catch (Throwable e) {
            if (!toStop) {
                logger.error(e.getMessage(), e);
            }
        }
        logger.info(">>>>>>>>>>> xxl-job, executor callback thread destroy.");

    }
});

调度中心(admin,即本文所说的“客户端”)接收回调,更新日志数据

com.xxl.job.admin.core.thread.JobCompleteHelper#callback(com.xxl.job.core.biz.model.HandleCallbackParam)

/**
 * Applies one callback to its job-log record.
 *
 * @param handleCallbackParam callback data carrying the log id, handle code and handle message
 * @return {@code ReturnT.SUCCESS} once the log has been updated; a {@code ReturnT.FAIL_CODE}
 *         result when the log record is not found or already has a handle code greater than 0
 */
private ReturnT<String> callback(HandleCallbackParam handleCallbackParam) {
   // valid log item
   XxlJobLog log = XxlJobAdminConfig.getAdminConfig().getXxlJobLogDao().load(handleCallbackParam.getLogId());
   if (log == null) {
      return new ReturnT<String>(ReturnT.FAIL_CODE, "log item not found.");
   }
   if (log.getHandleCode() > 0) {
      return new ReturnT<String>(ReturnT.FAIL_CODE, "log repeate callback.");     // avoid repeat callback, trigger child job etc
   }

   // handle msg: the message already on the log (followed by "<br>"), then the callback's message.
   // StringBuilder is enough here because the buffer never leaves this method.
   StringBuilder handleMsg = new StringBuilder();
   if (log.getHandleMsg()!=null) {
      handleMsg.append(log.getHandleMsg()).append("<br>");
   }
   if (handleCallbackParam.getHandleMsg() != null) {
      handleMsg.append(handleCallbackParam.getHandleMsg());
   }

   // success, save log
   log.setHandleTime(new Date());
   log.setHandleCode(handleCallbackParam.getHandleCode());
   log.setHandleMsg(handleMsg.toString());
   XxlJobCompleter.updateHandleInfoAndFinish(log);

   return ReturnT.SUCCESS;
}