Saturn Source Code Analysis


saturn-console

Startup

The Saturn console itself is a Spring Boot project; all of its beans are defined in applicationContext.xml under saturn-console. Startup therefore mainly consists of initializing the beans responsible for the various operations.
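As a side note, here is a minimal sketch (illustrative class and file names, not saturn-console's actual bootstrap code) of how such an XML bean file is typically pulled into a Spring Boot application:

import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.context.annotation.ImportResource;

// Illustrative only: a Spring Boot entry point that imports an XML bean definition file,
// the same pattern saturn-console uses for applicationContext.xml.
@SpringBootApplication
@ImportResource("classpath:applicationContext.xml")
public class ConsoleBootstrap {
	public static void main(String[] args) {
		SpringApplication.run(ConsoleBootstrap.class, args);
	}
}

The beans defined in saturn-console's applicationContext.xml are listed below.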

    <!-- Authorization management, mainly for jobs -->
    <bean id="authorizationService"
          class="com.vip.saturn.job.console.service.impl.AuthorizationServiceImpl"/>
    <bean id="authorizationManageServiceImpl"
          class="com.vip.saturn.job.console.service.impl.AuthorizationManageServiceImpl"/>

    <!-- System configuration management: console-related system settings -->
    <bean id="systemConfigService"
          class="com.vip.saturn.job.console.service.impl.SystemConfigServiceImpl"/>

    <!-- Alarms -->
    <bean id="alarmStatisticsService"
          class="com.vip.saturn.job.console.service.impl.AlarmStatisticsServiceImpl"/>
    <!-- Home page and statistics -->
    <bean id="dashboardService"
          class="com.vip.saturn.job.console.service.impl.DashboardServiceImpl"/>
    <!-- Executor management -->
    <bean id="executorService"
          class="com.vip.saturn.job.console.service.impl.ExecutorServiceImpl"/>
    <!-- Jobs -->
    <bean id="jobService"
          class="com.vip.saturn.job.console.service.impl.JobServiceImpl"/>
    <!-- Namespace and ZK cluster management -->
    <bean id="namespaceZkClusterMappingService"
          class="com.vip.saturn.job.console.service.impl.NamespaceZkClusterMappingServiceImpl"/>
    <!-- Alarm reporting -->
    <bean id="reportAlarmService"
          class="com.vip.saturn.job.console.service.impl.ReportAlarmServiceImpl"/>
    <!-- Cron validation -->
    <bean id="utilsService"
          class="com.vip.saturn.job.console.service.impl.UtilsServiceImpl"/>
    <!-- Diff between ZK and the database -->
    <bean id="zkDBDiffService"
          class="com.vip.saturn.job.console.service.impl.ZkDBDiffServiceImpl"/>
    <!-- Viewing and managing paths in ZK -->
    <bean id="zkTreeService"
          class="com.vip.saturn.job.console.service.impl.ZkTreeServiceImpl"/>
    <!-- Updating job configuration -->
    <bean id="updateJobConfigService"
          class="com.vip.saturn.job.console.service.impl.UpdateJobConfigServiceImpl"/>
    <!-- REST API: HTTP requests for jobs -->
    <bean id="restApiService" class="com.vip.saturn.job.console.service.impl.RestApiServiceImpl"/>
    <!-- Persist execution results (daily execution counts, failures, etc.) to the database -->
    <bean id="statisticPersistence"
          class="com.vip.saturn.job.console.service.impl.statistics.StatisticsPersistence"/>
    <!-- Dashboard statistics (execution counts, failures, etc.) -->
    <bean id="statisticRefreshService"
          class="com.vip.saturn.job.console.service.impl.statistics.StatisticsRefreshServiceImpl"/>
    <!-- Registry center management -->
    <bean id="registryCenterService"
          class="com.vip.saturn.job.console.service.impl.RegistryCenterServiceImpl"/>
    <!-- Authentication, login -->
    <bean id="authenticationService"
          class="com.vip.saturn.job.console.service.impl.AuthenticationServiceImpl"/>

Most of the beans above are backed by scheduled tasks of one kind or another. The focus here is on namespace and job management, including creation, sharding, and so on.

Creating a namespace

To create a domain (namespace) from the console, the namespace and the zkCluster must be provided; the backend creates it through createNamespace in RegistryCenterController. The core logic is as follows:

  @Transactional(rollbackFor = {Exception.class})
	@Override
	public void createNamespace(NamespaceDomainInfo namespaceDomainInfo) throws SaturnJobConsoleException {
		String namespace = namespaceDomainInfo.getNamespace();
		String zkClusterKey = namespaceDomainInfo.getZkCluster();
		// look up whether this ZK cluster exists in the registry center
		ZkCluster currentCluster = getZkCluster(zkClusterKey);

		if (currentCluster == null) {
			throw new SaturnJobConsoleHttpException(HttpStatus.BAD_REQUEST.value(),
					String.format(ERR_MSG_TEMPLATE_FAIL_TO_CREATE, namespace, "not found zkcluster" + zkClusterKey));
		}
		// check whether the namespace already exists under any cluster; in other words, a namespace is unique across all clusters
		if (checkNamespaceExists(namespace)) {
			throw new SaturnJobConsoleHttpException(HttpStatus.BAD_REQUEST.value(),
					String.format(ERR_MSG_NS_ALREADY_EXIST, namespace));
		}

		try {
			// create the namespaceInfo
			NamespaceInfo namespaceInfo = constructNamespaceInfo(namespaceDomainInfo);
			namespaceInfoService.create(namespaceInfo);
			// create the mapping between the zkcluster and the namespaceInfo and write it to the database; if the namespace already exists there, update it, otherwise insert a new row
			namespaceZkClusterMapping4SqlService.insert(namespace, "", zkClusterKey, NAMESPACE_CREATOR_NAME);
			// refresh the data to the registry center; this does not write to ZK directly, but refreshes asynchronously by updating the uid in sys_config
			notifyRefreshRegCenter();
		} catch (Exception e) {
			log.error(e.getMessage(), e);
			throw new SaturnJobConsoleHttpException(HttpStatus.INTERNAL_SERVER_ERROR.value(),
					String.format(ERR_MSG_TEMPLATE_FAIL_TO_CREATE, namespace, e.getMessage()));
		}
	}


The code above interacts with RegistryCenterServiceImpl.

	public void init() {
		getConsoleClusterId();
		localRefresh();
		initLocalRefreshThreadPool();
		startLocalRefreshTimer();
		startLocalRefreshIfNecessaryTimer();
	}

	private void initLocalRefreshThreadPool() {
		localRefreshThreadPool = Executors
				.newSingleThreadExecutor(new ConsoleThreadFactory("refresh-RegCenter-thread", false));
	}

	private void startLocalRefreshTimer() {

		// run localRefresh every 5 minutes

	}

	private void startLocalRefreshIfNecessaryTimer() {
		/* check every second; when the system configuration changes and the current uuid differs from the latest uuid, run localRefresh */
	}

	private synchronized void localRefresh() {
		// trimmed
		// refresh the registry center: mainly comparing whether the ZK clusters have changed, including namespace shutdown and migration
			refreshRegistryCenter();
		// console leader election and data refresh; only the ZK clusters set in the console's system configuration can be managed by this console.
		// that data lives in MySQL; the dashboard leader election registers $SaturnSelf/saturn-console/dashboard/leader in ZK
			refreshDashboardLeaderTreeCache();
		// decide whether a NamespaceShardingManager needs to be created or migrated; every namespace has its own NamespaceShardingManager
			refreshNamespaceShardingListenerManagerMap();

	}

The initialization of that class is essentially the code above. So creating a namespace means writing to the database first and then pushing the data to ZK via a refresh. At that point the namespace is created: a row is written to the database (namespace_zkcluster_mapping) and the uid in sys_config is updated so that the refresh happens asynchronously. After the refresh, the registry center has a node for the namespace, plus the dashboard leader election and the sharding management: every namespace gets a NamespaceShardingManager, which manages executor online/offline events, sharding, and so on.
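As a minimal sketch of the uid-based asynchronous refresh just described (not Saturn's actual code): a timer compares the uid it last saw against the one stored in sys_config and runs localRefresh when they differ. loadRefreshUuidFromDb() and localRefresh() are hypothetical stand-ins.

import java.util.Objects;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;

public class RefreshIfNecessaryTimer {
	private volatile String lastSeenUuid;
	private final ScheduledExecutorService timer = Executors.newSingleThreadScheduledExecutor();

	public void start() {
		// check once per second; when the uid stored in sys_config changes, trigger a local refresh
		timer.scheduleWithFixedDelay(() -> {
			String latest = loadRefreshUuidFromDb();   // assumed helper: reads the uid from sys_config
			if (!Objects.equals(latest, lastSeenUuid)) {
				lastSeenUuid = latest;
				localRefresh();                        // assumed helper: the refresh described above
			}
		}, 1, 1, TimeUnit.SECONDS);
	}

	private String loadRefreshUuidFromDb() { return "uuid"; } // placeholder
	private void localRefresh() { /* refresh registry center, dashboard leader, sharding managers */ }
}

Back to the NamespaceShardingManager: its start0 method is shown below.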

	private void start0() throws Exception {
		//
		shardingTreeCacheService.start();
		// create ephemeral node $SaturnExecutors/leader/host & $Jobs
		// leader election for this namespace's executor sharding; the participants are console nodes
		namespaceShardingService.leaderElection();
		// add a JobServersTriggerShardingListener for each existing job, used to reshard when an executor goes online/offline; node: $Jobs/${jobName}/servers
		// JobServersTriggerShardingListener
		addJobListenersService.addExistJobPathListener();
		// online/offline listeners; node: /$SaturnExecutors/executors
		// ExecutorOnlineOfflineTriggerShardingListener, ExecutorTrafficTriggerShardingListener
		addOnlineOfflineListener();
		// sharding; node: /$SaturnExecutors/sharding
		// SaturnExecutorsShardingTriggerShardingListener
		addExecutorShardingListener();
		// leader election
		// /$SaturnExecutors/leader
		addLeaderElectionListener();
		// job addition and removal under /$Jobs
		// AddOrRemoveJobListener
		addNewOrRemoveJobListener();
	}

All of the listeners above ultimately end up in the run method of AbstractAsyncShardingTask. Once we have created a namespace and sharding is needed, its NamespaceShardingManager registers several nodes in ZK and listens for events such as executors going online/offline and jobs being created. Sharding is performed by the node that currently holds the namespace's leadership; in other words, when a new namespace is created, the sharding of all its executors is carried out by whoever acquired that namespace's leadership, and every step checks whether the current instance is still the leader (whether the host node has been created and whether that host is the current node).

Election:

	public void leaderElection() throws Exception {
		lock.lockInterruptibly();
		try {
			if (hasLeadership()) {
				return;
			}
			log.info("{}-{} leadership election start", namespace, hostValue);
			try (LeaderLatch leaderLatch = new LeaderLatch(curatorFramework,
					SaturnExecutorsNode.LEADER_LATCHNODE_PATH)) {
				leaderLatch.start();
				int timeoutSeconds = 60;
				if (leaderLatch.await(timeoutSeconds, TimeUnit.SECONDS)) {
					if (!hasLeadership()) {
						becomeLeader();
					} else {
						log.info("{}-{} becomes a follower", namespace, hostValue);
					}
				} else {
					log.error("{}-{} leadership election is timeout({}s)", namespace, hostValue, timeoutSeconds);
				}
			} catch (InterruptedException e) {
				log.info("{}-{} leadership election is interrupted", namespace, hostValue);
				throw e;
			} catch (Exception e) {
				log.error(namespace + "-" + hostValue + " leadership election error", e);
				throw e;
			}
		} finally {
			lock.unlock();
		}
	}

The election simply writes a latch node under $SaturnExecutors/leader; after becoming leader, it persists the $Jobs node in ZK and then writes the host into $SaturnExecutors/leader/host.

If the current instance becomes the leader, it takes over job and executor management, including online/offline handling, sharding, and so on.

All of these console-side operations are ultimately executed asynchronously through the run method of AbstractAsyncShardingTask:

	public void run() {
		logStartInfo();
		boolean isAllShardingTask = this instanceof ExecuteAllShardingTask;
		try {
			// if this node is no longer the leader, return immediately
			if (!namespaceShardingService.isLeadershipOnly()) {
				return;
			}

			// If a full resharding is needed and this task is not the full-sharding task, return; there is no point in sharding, because the console
			// sets the full-sharding flag to true right after winning the election and immediately submits the full-sharding task to the thread pool,
			// so a full resharding always runs first.
			if (namespaceShardingService.isNeedAllSharding() && !isAllShardingTask) {
				log.info("the {} will be ignored, because there will be {}", this.getClass().getSimpleName(),
						ExecuteAllShardingTask.class.getSimpleName());
				return;
			}
			// get all jobs from ZK
			List<String> allJobs = getAllJobs();
			// get all enabled jobs
			List<String> allEnableJobs = getAllEnableJobs(allJobs);
			// the last known online executors, stored at $SaturnExecutors/sharding/content; each entry contains the executor's ip, shard items, load, etc.
			List<Executor> oldOnlineExecutorList = getLastOnlineExecutorList();
			// for a full resharding, pull all online executors from /$SaturnExecutors/executors; otherwise this is null
			List<Executor> customLastOnlineExecutorList = customLastOnlineExecutorList();
			// if this is not a full resharding, copy the last known online executors
			List<Executor> lastOnlineExecutorList = customLastOnlineExecutorList == null
					? copyOnlineExecutorList(oldOnlineExecutorList) : customLastOnlineExecutorList;
			// the online executors whose traffic has not been drained; $SaturnExecutors/executors/xx/noTraffic: true means traffic drained, false otherwise
			List<Executor> lastOnlineTrafficExecutorList = getTrafficExecutorList(lastOnlineExecutorList);
			List<Shard> shardList = new ArrayList<>();
			// pick shards to move; this is an abstract method
			if (pick(allJobs, allEnableJobs, shardList, lastOnlineExecutorList, lastOnlineTrafficExecutorList)) {
				// put the picked shards back, balancing the load
				putBackBalancing(allEnableJobs, shardList, lastOnlineExecutorList, lastOnlineTrafficExecutorList);
				// if this node is no longer the leader, return
				if (!namespaceShardingService.isLeadershipOnly()) {
					return;
				}
				// persist the sharding result
				if (shardingContentIsChanged(oldOnlineExecutorList, lastOnlineExecutorList)) {
					namespaceShardingContentService.persistDirectly(lastOnlineExecutorList);
				}
				// notify the shards-changed jobs of all enable jobs.
				Map<String, Map<String, List<Integer>>> enabledAndShardsChangedJobShardContent = getEnabledAndShardsChangedJobShardContent(
						isAllShardingTask, allEnableJobs, oldOnlineExecutorList, lastOnlineExecutorList);
				namespaceShardingContentService
						.persistJobsNecessaryInTransaction(enabledAndShardsChangedJobShardContent);
				// sharding count ++
				increaseShardingCount();
			}
		} catch (InterruptedException e) {
			log.info("{}-{} {} is interrupted", namespaceShardingService.getNamespace(),
					namespaceShardingService.getHostValue(), this.getClass().getSimpleName());
			Thread.currentThread().interrupt();
		} catch (Throwable t) {
			log.error(t.getMessage(), t);
			if (!isAllShardingTask) { // if this is not the full-sharding task, a full resharding is needed to recover from the error
				namespaceShardingService.setNeedAllSharding(true);
				namespaceShardingService.shardingCountIncrementAndGet();
				executorService.submit(new ExecuteAllShardingTask(namespaceShardingService));
			} else { // if this is the full-sharding task, raise an alarm and shut down this service so that a re-elected leader can take over
				raiseAlarm();
				shutdownNamespaceShardingService();
			}
		} finally {
			if (isAllShardingTask) { // after a full resharding, clear the need-full-sharding flag
				namespaceShardingService.setNeedAllSharding(false);
			}
			namespaceShardingService.shardingCountDecrementAndGet();
		}
	}

The pick method above has the following implementations:

ExecuteAllShardingTask: reshards the whole namespace; removes all existing executors, re-fetches the executors and recalculates the shards of every job.

ExecuteExtractTrafficShardingTask: drains an executor's traffic; marks its noTraffic as true and removes all of its job shards (only non-local job shards are picked), setting its totalLoadLevel to 0.

ExecuteJobDisableShardingTask: a job is disabled; picks that job's shards from every executor, decreasing the loadLevel accordingly; nothing is put back.

ExecuteJobEnableShardingTask: a job is enabled; fetches that job's shards, filtering out executors that cannot run the job.

ExecuteJobForceShardShardingTask: forced resharding of a job; removes that job's shards from all executors, recalculates the job's shards, and finally deletes the forceShard node.

ExecuteJobServerOfflineShardingTask: a job's server goes offline on an executor; picks all of that job's shards running on the executor, or removes them if the job is local-mode.

ExecuteJobServerOnlineShardingTask: a job's server comes online on an executor; rebalances at executor level, but only that job's shards may be picked; new shards are added.

ExecuteOfflineShardingTask: an executor goes offline; picks all non-local-mode job shards running on that executor and removes the executor.

In short, the goal is to fill in the content under $SaturnExecutors/sharding/content. That content is an array, numbered from 0, and each node records an object such as:

[{"executorName":"aaa","ip":"0.0.0.0","noTraffic":false,"jobNameList":["job1","job2"],"shardList":[{"jobName":"job1","item":0,loadlevel:1},{"jobName":"job2","item":0,loadlevel:1}]}]

The shardList at the end is the result of sharding.
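To make the structure concrete, here is a small sketch that parses that content with Gson, assuming Gson is on the classpath; the POJO classes simply mirror the JSON fields above and are not Saturn's own classes.

import com.google.gson.Gson;
import com.google.gson.reflect.TypeToken;
import java.util.List;

class ShardEntry {
	String jobName;
	int item;
	int loadLevel;
}

class ExecutorContent {
	String executorName;
	String ip;
	boolean noTraffic;
	List<String> jobNameList;
	List<ShardEntry> shardList;
}

public class ShardingContentDemo {
	public static void main(String[] args) {
		String json = "[{\"executorName\":\"aaa\",\"ip\":\"0.0.0.0\",\"noTraffic\":false,"
				+ "\"jobNameList\":[\"job1\",\"job2\"],"
				+ "\"shardList\":[{\"jobName\":\"job1\",\"item\":0,\"loadLevel\":1}]}]";
		// deserialize the sharding content into the mirror POJOs
		List<ExecutorContent> content = new Gson().fromJson(json,
				new TypeToken<List<ExecutorContent>>() {}.getType());
		System.out.println(content.get(0).executorName + " holds " + content.get(0).shardList.size() + " shard(s)");
	}
}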

That is the logic involved in creating a namespace.

Namespace migration

The console can migrate a namespace to another ZK cluster. Migration essentially modifies the data in the database and in the registry center. During migration, the numbers of successes and failures are recorded in the temporary_shared_status table. The update first reads the namespace's current ZK cluster name from the database, then moves the nodes to the target ZK cluster, deletes the corresponding nodes from the current cluster, and finally refreshes the data in the database.

If the ZK nodes were deleted during migration but the namespace has not yet been refreshed into the database, the data has to be repaired via the diff feature; no other way has been found so far.

job

Creating a job

Creating and copying a job are essentially the same; one takes a hand-written configuration, the other pulls the data from the database. The code lives in createJob in JobServiceImpl.

private void addOrCopyJob(String namespace, JobConfig jobConfig, String jobNameCopied, String createdBy)
			throws SaturnJobConsoleException {
		// generally empty
		List<JobConfig> unSystemJobs = getUnSystemJobs(namespace);
		Set<JobConfig> streamChangedJobs = new HashSet<>();
		validateJobConfig(namespace, jobConfig, unSystemJobs, streamChangedJobs);
		// if a job with the same name already exists in the database, throw an exception
		// query again directly instead of using unSystemJobs, because the name must not clash with system job names either
		String jobName = jobConfig.getJobName();
		if (currentJobConfigService.findConfigByNamespaceAndJobName(namespace, jobName) != null) {
			throw new SaturnJobConsoleException(ERROR_CODE_BAD_REQUEST, String.format("该作业(%s)已经存在", jobName));
		}
		// if the job still exists in ZK, try to remove it
		CuratorRepository.CuratorFrameworkOp curatorFrameworkOp = registryCenterService
				.getCuratorFrameworkOp(namespace);
		if (curatorFrameworkOp.checkExists(JobNodePath.getJobNodePath(jobName))) {
			if (!removeJobFromZk(jobName, curatorFrameworkOp)) {
				throw new SaturnJobConsoleException(ERROR_CODE_BAD_REQUEST,
						String.format("该作业(%s)正在删除中,请稍后再试", jobName));
			}
		}
		// the total number of jobs in the namespace must not exceed a limit: the MAX_JOB_NUM field in the sys_config table, 100 by default
		int maxJobNum = getMaxJobNum();
		if (jobIncExceeds(namespace, maxJobNum, 1)) {
			throw new SaturnJobConsoleException(ERROR_CODE_BAD_REQUEST,
					String.format("总作业数超过最大限制(%d),作业名%s创建失败", maxJobNum, jobName));
		}
		// for a copied job, copy the source job's configuration from the database into the new job's configuration
		JobConfig myJobConfig = jobConfig;
		if (jobNameCopied != null) {
			myJobConfig = currentJobConfigService.findConfigByNamespaceAndJobName(namespace, jobNameCopied);
			SaturnBeanUtils.copyPropertiesIgnoreNull(jobConfig, myJobConfig);
		}
		// fill in default values for the job configuration and force-correct certain fields
		correctConfigValueWhenAddJob(myJobConfig);
		// save the job to the database
		currentJobConfigService.create(constructJobConfig4DB(namespace, myJobConfig, createdBy, createdBy));
		// update the upstream/downstream of related jobs
		for (JobConfig streamChangedJob : streamChangedJobs) {
			currentJobConfigService.updateStream(constructJobConfig4DB(namespace, streamChangedJob, null, createdBy));
		}
		// write the job configuration to ZK and update the upstream/downstream of related jobs accordingly
		createJobConfigToZk(myJobConfig, streamChangedJobs, curatorFrameworkOp);
	}

Finally, the job's various configuration items are written to ZK:

		// add the job
private void createJobConfigToZk(JobConfig jobConfig, Set<JobConfig> streamChangedJobs,
			CuratorRepository.CuratorFrameworkOp curatorFrameworkOp) throws SaturnJobConsoleException {
			curatorTransactionOp
					.create(JobNodePath.getConfigNodePath(jobName, CONFIG_ITEM_ENABLED), jobConfig.getEnabled())
					.create(JobNodePath.getConfigNodePath(jobName, CONFIG_ITEM_DESCRIPTION), jobConfig.getDescription())
					.create(JobNodePath.getConfigNodePath(jobName, CONFIG_ITEM_CUSTOM_CONTEXT),
							jobConfig.getCustomContext())
					.create(JobNodePath.getConfigNodePath(jobName, CONFIG_ITEM_JOB_TYPE), jobConfig.getJobType())
					.create(JobNodePath.getConfigNodePath(jobName, CONFIG_ITEM_JOB_MODE), jobConfig.getJobMode())
					.create(JobNodePath.getConfigNodePath(jobName, CONFIG_ITEM_SHARDING_ITEM_PARAMETERS),
							jobConfig.getShardingItemParameters())
					.create(JobNodePath.getConfigNodePath(jobName, CONFIG_ITEM_JOB_PARAMETER),
							jobConfig.getJobParameter())
					.create(JobNodePath.getConfigNodePath(jobName, CONFIG_ITEM_QUEUE_NAME), jobConfig.getQueueName())
					.create(JobNodePath.getConfigNodePath(jobName, CONFIG_ITEM_CHANNEL_NAME),
							jobConfig.getChannelName())
					.create(JobNodePath.getConfigNodePath(jobName, CONFIG_ITEM_FAILOVER), jobConfig.getFailover())
					.create(JobNodePath.getConfigNodePath(jobName, CONFIG_ITEM_MONITOR_EXECUTION), "true")
					.create(JobNodePath.getConfigNodePath(jobName, CONFIG_ITEM_TIMEOUT_4_ALARM_SECONDS),
							jobConfig.getTimeout4AlarmSeconds())
					.create(JobNodePath.getConfigNodePath(jobName, CONFIG_ITEM_TIMEOUT_SECONDS),
							jobConfig.getTimeoutSeconds())
					.create(JobNodePath.getConfigNodePath(jobName, CONFIG_ITEM_TIME_ZONE), jobConfig.getTimeZone())
					.create(JobNodePath.getConfigNodePath(jobName, CONFIG_ITEM_CRON), jobConfig.getCron())
					.create(JobNodePath.getConfigNodePath(jobName, CONFIG_ITEM_PAUSE_PERIOD_DATE),
							jobConfig.getPausePeriodDate())
					.create(JobNodePath.getConfigNodePath(jobName, CONFIG_ITEM_PAUSE_PERIOD_TIME),
							jobConfig.getPausePeriodTime())
					.create(JobNodePath.getConfigNodePath(jobName, CONFIG_ITEM_PROCESS_COUNT_INTERVAL_SECONDS),
							jobConfig.getProcessCountIntervalSeconds())
					.create(JobNodePath.getConfigNodePath(jobName, CONFIG_ITEM_SHARDING_TOTAL_COUNT),
							jobConfig.getShardingTotalCount())
					.create(JobNodePath.getConfigNodePath(jobName, CONFIG_ITEM_SHOW_NORMAL_LOG),
							jobConfig.getShowNormalLog())
					.create(JobNodePath.getConfigNodePath(jobName, CONFIG_ITEM_LOAD_LEVEL), jobConfig.getLoadLevel())
					.create(JobNodePath.getConfigNodePath(jobName, CONFIG_ITEM_JOB_DEGREE), jobConfig.getJobDegree())
					.create(JobNodePath.getConfigNodePath(jobName, CONFIG_ITEM_ENABLED_REPORT),
							jobConfig.getEnabledReport())
					.create(JobNodePath.getConfigNodePath(jobName, CONFIG_ITEM_PREFER_LIST), jobConfig.getPreferList())
					.create(JobNodePath.getConfigNodePath(jobName, CONFIG_ITEM_USE_DISPREFER_LIST),
							jobConfig.getUseDispreferList())
					.create(JobNodePath.getConfigNodePath(jobName, CONFIG_ITEM_LOCAL_MODE), jobConfig.getLocalMode())
					.create(JobNodePath.getConfigNodePath(jobName, CONFIG_ITEM_USE_SERIAL), jobConfig.getUseSerial())
					.create(JobNodePath.getConfigNodePath(jobName, CONFIG_ITEM_DEPENDENCIES),
							jobConfig.getDependencies())
					.create(JobNodePath.getConfigNodePath(jobName, CONFIG_ITEM_GROUPS), jobConfig.getGroups())
					.create(JobNodePath.getConfigNodePath(jobName, CONFIG_ITEM_RERUN), jobConfig.getRerun())
					.create(JobNodePath.getConfigNodePath(jobName, CONFIG_ITEM_UPSTREAM), jobConfig.getUpStream())
					.create(JobNodePath.getConfigNodePath(jobName, CONFIG_ITEM_DOWNSTREAM), jobConfig.getDownStream())
					.create(JobNodePath.getConfigNodePath(jobName, CONFIG_ITEM_JOB_CLASS), jobConfig.getJobClass());
}

As you can see, a lot of data is written to ZK, essentially the entire configuration, with defaults used for anything not set. Normally, at initial creation only the job name, the fully qualified class name and the sharding info are provided; everything else is configured later on the job detail page.

Executor

Executor startup

This article only covers starting the executor externally; the embedded mode is not discussed.

The executor itself is also just a Java process, so finding the corresponding main method is enough to understand the startup logic.

If you start it locally with saturn:run -Dnamespace=demo1 -DexecutorName=exec01 -DVIP_SATURN_CONSOLE_URI=http://127.0.0.1:9088 -f pom.xml,

then the two required settings, the namespace and the console URL, are already provided. In the plugin's SaturnJobRunMojo we can see that it downloads the executor zip and unpacks it, and finally passes in saturnLibDir and appLibDir: the former is used to load Saturn's own dependency jars, the latter is the unpacked zip of the business code produced by saturn:zip. Why the Spring Boot packaging cannot be used will become clear later.

In the saturn-executor startup code, the first step is initializing the class loaders. There are two: a SaturnClassLoader for the executor and a JobClassLoader for the jobs. Their implementations are almost identical; both extend URLClassLoader, whose main purpose is to load classes from jars or URLs:

public class SaturnClassLoader extends URLClassLoader {

	public SaturnClassLoader(URL[] urls, ClassLoader parent) {
		super(urls, parent);
	}
	@Override
	protected Class<?> loadClass(String name, boolean resolve) throws ClassNotFoundException {
		synchronized (getClassLoadingLock(name)) {
			Class<?> findClass = findLoadedClass(name);

			if (findClass == null) {
				findClass = super.loadClass(name, resolve);
			}
			return findClass;
		}
	}
}

The class loader extends URLClassLoader. Earlier we raised the question of why the Spring Boot packaging cannot be used. When a jar is executed, its META-INF/MANIFEST.MF contains a Main-Class attribute. When the Spring Boot plugin repackages the jar, that attribute is changed to org.springframework.boot.loader.JarLauncher, and the class containing our own main method ends up in Start-Class instead. The class loader JarLauncher ultimately uses is org.springframework.boot.loader.LaunchedURLClassLoader, which knows how to scan BOOT-INF/classes and BOOT-INF/lib inside the jar. If the JobClassLoader were used to load the original main class, its loadClass would never look inside BOOT-INF/classes or BOOT-INF/lib. This is the class-loading trick Spring Boot performs at startup.
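A quick way to verify this is to read the manifest of a Spring Boot fat jar: Main-Class points at JarLauncher and the real entry point sits in Start-Class, while the business classes live under BOOT-INF/classes. The jar path below is a placeholder.

import java.util.jar.Attributes;
import java.util.jar.JarFile;

public class ManifestPeek {
	public static void main(String[] args) throws Exception {
		try (JarFile jar = new JarFile("app.jar")) { // hypothetical fat jar
			Attributes attrs = jar.getManifest().getMainAttributes();
			System.out.println("Main-Class:  " + attrs.getValue("Main-Class"));   // org.springframework.boot.loader.JarLauncher
			System.out.println("Start-Class: " + attrs.getValue("Start-Class"));  // your own main class
		}
	}
}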

The saturn maven plugin's packaging is essentially just putting all of the current dependencies' jars into one zip.

At startup, -saturnLibDir, -appLibDir and -namespace are passed in. saturnLibDir, i.e. the saturn executor's /lib directory, is loaded by SaturnClassLoader, while appLibDir, the unpacked zip mentioned above, is loaded by JobClassLoader. This separation is an effective way to avoid jar conflicts.
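A rough sketch of how the two class loaders could be assembled from those directories (the directory paths, the helper method and the parent relationship are illustrative assumptions, not the exact Saturn implementation):

import java.io.File;
import java.net.URL;
import java.net.URLClassLoader;
import java.util.ArrayList;
import java.util.List;

public class ClassLoaderSetup {

	// collect all *.jar files in a directory as URLs
	static URL[] jarsIn(String dir) throws Exception {
		List<URL> urls = new ArrayList<>();
		File[] files = new File(dir).listFiles();
		if (files != null) {
			for (File f : files) {
				if (f.getName().endsWith(".jar")) {
					urls.add(f.toURI().toURL());
				}
			}
		}
		return urls.toArray(new URL[0]);
	}

	public static void main(String[] args) throws Exception {
		// Saturn's own jars and the business jars live in separate loaders, so their dependencies cannot clash
		ClassLoader executorClassLoader = new URLClassLoader(jarsIn("/path/to/saturnLibDir"),
				ClassLoaderSetup.class.getClassLoader());
		ClassLoader jobClassLoader = new URLClassLoader(jarsIn("/path/to/appLibDir"),
				ClassLoaderSetup.class.getClassLoader()); // the parent choice here is illustrative
		System.out.println(executorClassLoader + " / " + jobClassLoader);
	}
}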

The runtime entry point is the Main class under executor, and the core startup is SaturnExecutor.buildExecutor. It first initializes some logging-related configuration, then initializes the job: when writing a job we put a saturn.properties under resources, which contains app.class.

private static Object validateAndLoadSaturnApplication(ClassLoader jobClassLoader) {
		try {
			Properties properties = getSaturnProperty(jobClassLoader); // read saturn.properties
			if (properties == null) {
				return null;
			}
			String appClassStr = properties.getProperty("app.class"); // read app.class
			if (StringUtils.isBlank(appClassStr)) {
				return null;
			}
			appClassStr = appClassStr.trim();
			ClassLoader oldCL = Thread.currentThread().getContextClassLoader();
			try {
				Thread.currentThread().setContextClassLoader(jobClassLoader);
				Class<?> appClass = jobClassLoader.loadClass(appClassStr); // load the configured app class
				Class<?> saturnApplicationClass = jobClassLoader.loadClass(SATURN_APPLICATION_CLASS); // load the SaturnApplication interface
				if (saturnApplicationClass.isAssignableFrom(appClass)) { // check that the app class implements Saturn's SaturnApplication interface
					Object saturnApplication = appClass.newInstance(); // instantiate it
					appClass.getMethod("init").invoke(saturnApplication); // call its init method
					return saturnApplication; // declared as Object, although it could really be typed as SaturnApplication
				} else {
					throw new RuntimeException(
							"the app.class " + appClassStr + " must be instance of " + SATURN_APPLICATION_CLASS);
				}
			} finally {
				Thread.currentThread().setContextClassLoader(oldCL);
			}
		} catch (RuntimeException e) {
			LogUtils.error(log, LogEvents.ExecutorEvent.INIT, "Fail to load SaturnApplication", e);
			throw e;
		} catch (Exception e) {
			LogUtils.error(log, LogEvents.ExecutorEvent.INIT, "Fail to load SaturnApplication", e);
			throw new RuntimeException(e);
		}
	}

For a Spring Boot project, Saturn's Spring Boot support can be used directly:

public class GenericSpringBootSaturnApplication extends AbstractSpringSaturnApplication {
	@Override
	public void init() {
		if (applicationContext != null) {
			destroy();
		}
		applicationContext = run();
	}
	@Override
	public void destroy() {
		if (applicationContext != null) {
			SpringApplication.exit(applicationContext);
			applicationContext = null;
		}
	}
	/**
	 * Start Spring Boot. The default is SpringApplication.run(source()), where source() is described in {@link #source()}.
	 */
	protected ApplicationContext run() {
		return SpringApplication.run(source());
	}
	/**
	 * The source loaded when Spring Boot is started in the default way.
	 */
	protected Object source() {
		return this.getClass();
	}

}

As you can see, for a Spring Boot project the whole point is obtaining this ApplicationContext. Later on, the Saturn job is fetched directly from the applicationContext as a bean, so our job needs to be registered as a Spring bean.
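A minimal sketch of what that looks like on the business side, with assumed class and package names: saturn.properties points app.class at a subclass of GenericSpringBootSaturnApplication, which in turn tells Saturn which Spring Boot class to run; the job beans then come out of that context.

// src/main/resources/saturn.properties (contents shown as a comment):
//   app.class=com.example.MySaturnApplication
import org.springframework.boot.autoconfigure.SpringBootApplication;
import com.vip.saturn.job.application.GenericSpringBootSaturnApplication; // package assumed; adjust to your Saturn version

@SpringBootApplication
class MyBootApplication {
}

public class MySaturnApplication extends GenericSpringBootSaturnApplication {
	@Override
	protected Object source() {
		// the class whose Spring context will provide the job beans
		return MyBootApplication.class;
	}
}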

Finally, the SaturnExecutor object is initialized:

private SaturnExecutor(String namespace, String executorName, ClassLoader executorClassLoader,
			ClassLoader jobClassLoader, Object saturnApplication) {
		this.executorName = executorName;
		this.namespace = namespace;
		this.executorClassLoader = executorClassLoader;
		this.jobClassLoader = jobClassLoader;
		this.saturnApplication = saturnApplication;
		this.raiseAlarmExecutorService = Executors
				.newSingleThreadExecutor(new SaturnThreadFactory(executorName + "-raise-alarm-thread", false));
		this.shutdownJobsExecutorService = Executors
				.newCachedThreadPool(new SaturnThreadFactory(executorName + "-shutdownJobSchedulers-thread", true));
		initRestartThread();
		registerShutdownHandler();
	}

Within it, initRestartThread eventually calls:

	public void execute() throws Exception {
		shutdownLock.lockInterruptibly();
		try {
			if (isShutdown) {
				return;
			}
			long startTime = System.currentTimeMillis();
			shutdown0();
			try {
				StartCheckUtil.add2CheckList(StartCheckItem.ZK, StartCheckItem.UNIQUE, StartCheckItem.JOBKILL);

				LogUtils.info(log, LogEvents.ExecutorEvent.INIT, "start to discover from saturn console");
				// use the console URL to fetch the namespace's ZK connection info via /rest/v1/discovery?namespace
				Map<String, String> discoveryInfo = discover();
				String zkConnectionString = discoveryInfo.get(DISCOVER_INFO_ZK_CONN_STR);
				if (StringUtils.isBlank(zkConnectionString)) {
					LogUtils.error(log, LogEvents.ExecutorEvent.INIT, "zk connection string is blank!");
					throw new RuntimeException("zk connection string is blank!");
				}
				saturnExecutorExtension.postDiscover(discoveryInfo);
				// initialize the registry center
				initRegistryCenter(zkConnectionString.trim());
				// check whether there are still SHELL jobs running, and kill them
				checkAndKillExistedShellJobs();
				// initialize the timeout scheduler
				TimeoutSchedulerExecutor.createScheduler(executorName);
				// register the executor before starting any job, so that if the executor aborts because of some configuration limit, no job thread has been started and no job runs for a while
				// register this executor under $SaturnExecutors/executors, including {lastBeginTime, clean, version, ip}
				registerExecutor();
				// start the thread that periodically truncates the nohup output file
				periodicTruncateNohupOutService = new PeriodicTruncateNohupOutService(executorName);
				periodicTruncateNohupOutService.start();
				// start the thread that resets the success/error counters to zero at midnight
				resetCountService = new ResetCountService(executorName);
				resetCountService.startRestCountTimer();
				// add the callback for newly added jobs and start the jobs that already exist
				saturnExecutorService.registerJobsWatcher();
			} catch (Throwable t) {
				saturnExecutorExtension.handleExecutorStartError(t);
				shutdown0();
				throw t;
			}
		} finally {
			shutdownLock.unlock();
		}
	}

First the current executor registers itself in ZK, then it watches the $Jobs node under the current namespace.

	public void registerJobsWatcher() throws Exception {
		if (initNewJobService != null) {
			initNewJobService.shutdown();
		}
		initNewJobService = new InitNewJobService(this);
		initNewJobService.start();
	}
//InitNewJobService
public void start() throws Exception {
		treeCache = TreeCache.newBuilder((CuratorFramework) regCenter.getRawClient(), JobNodePath.ROOT).setExecutor(
				new CloseableExecutorService(Executors
						.newSingleThreadExecutor(new SaturnThreadFactory(executorName + "-$Jobs-watcher", false)),
						true)).setMaxDepth(1).build();
		executorService = Executors
				.newSingleThreadExecutor(new SaturnThreadFactory(executorName + "-initNewJob-thread", false));
		treeCache.getListenable().addListener(new InitNewJobListener(), executorService);
		treeCache.start();
	}

InitNewJobListener is responsible for watching the node, using ZK's TreeCache. Curator provides three kinds of cache: Node Cache, Path Cache and Tree Cache. Node Cache watches a single ZNode, Path Cache watches a ZNode's children, and Tree Cache is an enhanced Path Cache that watches not only the children but also the ZNode itself.
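For reference, here is a standalone Curator TreeCache example in the same spirit as InitNewJobService above: watch only the direct children of a root node (maxDepth 1) and react to NODE_ADDED events. The connection string and watched path are placeholders.

import org.apache.curator.framework.CuratorFramework;
import org.apache.curator.framework.CuratorFrameworkFactory;
import org.apache.curator.framework.recipes.cache.TreeCache;
import org.apache.curator.framework.recipes.cache.TreeCacheEvent;
import org.apache.curator.retry.ExponentialBackoffRetry;

public class JobsWatcherDemo {
	public static void main(String[] args) throws Exception {
		CuratorFramework client = CuratorFrameworkFactory.newClient("127.0.0.1:2181",
				new ExponentialBackoffRetry(1000, 3));
		client.start();

		// depth 1: only the direct children of /demo/$Jobs, i.e. the job names
		TreeCache cache = TreeCache.newBuilder(client, "/demo/$Jobs").setMaxDepth(1).build();
		cache.getListenable().addListener((CuratorFramework c, TreeCacheEvent event) -> {
			if (event.getType() == TreeCacheEvent.Type.NODE_ADDED && event.getData() != null) {
				System.out.println("new job node: " + event.getData().getPath());
			}
		});
		cache.start();

		Thread.sleep(Long.MAX_VALUE); // keep watching
	}
}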

The core logic for handling the watched events is:

			String jobName = StringUtils.substringAfterLast(path, "/");
			// /${namespace}/$Jobs/${jobName}/${jobClass}
			String jobClassPath = JobNodePath.getNodeFullPath(jobName, ConfigurationNode.JOB_CLASS);
			// wait 5 seconds at most until jobClass is created; WAIT_JOBCLASS_ADDED_COUNT = 25
			for (int i = 0; i < WAIT_JOBCLASS_ADDED_COUNT; i++) {
				// check whether the job's full class name has been written yet
				if (!regCenter.isExisted(jobClassPath)) {
					Thread.sleep(200L);
					continue;
				}
				// has the job already been created?
				if (!jobNames.contains(jobName)) {
					if (canInitTheJob(jobName) && initJobScheduler(jobName)) {
						// add it to the list of created jobs (curiously a List rather than a Set)
						jobNames.add(jobName);
					}
				} else {
					// log
				}
				break;
			}

Creation comes in two flavors: if the executor has VIP_SATURN_INIT_JOB_BY_GROUPS configured, i.e. groups are set, only the jobs in those groups are initialized; otherwise all jobs are initialized. The actual job initialization happens in initJobScheduler:

private boolean initJobScheduler(String jobName) {
			try {
				// records of jobs that failed to initialize
				JOB_INIT_FAILED_RECORDS.get(executorName).putIfAbsent(jobName, new HashSet<Integer>());
				// all of the parameters set when the job was created or updated in the console, pulled from the registry center
				JobConfiguration jobConfig = new JobConfiguration(regCenter, jobName);
				if (jobConfig.getSaturnJobClass() == null) {
					throw new JobException(
							"unexpected error, the saturnJobClass cannot be null, jobName is %s, jobType is %s",
							jobName, jobConfig.getJobType());
				}
				if (jobConfig.isDeleting()) {
					String serverNodePath = JobNodePath.getServerNodePath(jobName, executorName);
					regCenter.remove(serverNodePath);
					LogUtils.warn(log, jobName, "the job is on deleting");
					return false;
				}
				// create the JobScheduler object
				JobScheduler scheduler = new JobScheduler(regCenter, jobConfig);
				// set saturnExecutorService, which mainly holds the executor's registration info and relationships
				scheduler.setSaturnExecutorService(saturnExecutorService);
				// initialize
				scheduler.init();
				// if creation succeeded, clear the job's failure records
				JOB_INIT_FAILED_RECORDS.get(executorName).get(jobName).clear();
				return true;
			} catch (JobInitAlarmException e) {
				if (!SystemEnvProperties.VIP_SATURN_DISABLE_JOB_INIT_FAILED_ALARM) {
					// no need to log exception stack as it should be logged in the original happen place
					raiseAlarmForJobInitFailed(jobName, e);
				}
			} catch (Throwable t) {
				LogUtils.warn(log, jobName, "job initialize failed, but will not stop the init process", t);
			}

			return false;
		}

As you can see, everything eventually converges on the JobScheduler class, which has a one-to-one relationship with a job.

	public JobScheduler(final CoordinatorRegistryCenter coordinatorRegistryCenter,
			final JobConfiguration jobConfiguration) {
		this.jobName = jobConfiguration.getJobName();
		this.executorName = coordinatorRegistryCenter.getExecutorName();
		this.currentConf = jobConfiguration;
		this.coordinatorRegistryCenter = coordinatorRegistryCenter;
		this.jobNodeStorage = new JobNodeStorage(coordinatorRegistryCenter, jobConfiguration);
		initExecutorService();
		JobRegistry.addJobScheduler(executorName, jobName, this);

		zkCacheManager = new ZkCacheManager((CuratorFramework) coordinatorRegistryCenter.getRawClient(), jobName,
				executorName);
		// holds the job's config, including the cron expression and other settings
		configService = new ConfigurationService(this);
		// leader election: every job has its own leader, at /${namespace}/$Jobs/${job}/leader/election/
		leaderElectionService = new LeaderElectionService(this);
		// mainly persists the executor info under /${namespace}/$Jobs/${job}/servers, including ip, success/failure counts, etc.
		// it also holds the run-at-once marker; that node is ephemeral
		serverService = new ServerService(this);
		// sharding information
		shardingService = new ShardingService(this);
		// execution context
		executionContextService = new ExecutionContextService(this);
		// the service that runs the job, mainly updating the next fire time, registering job completion, etc.
		executionService = new ExecutionService(this);
		// failover
		failoverService = new FailoverService(this);
		// job execution statistics
		statisticsService = new StatisticsService(this);
		// execution analysis
		analyseService = new AnalyseService(this);
		// each namespace may run at most 500 jobs: VIP_SATURN_MAX_NUMBER_OF_JOBS = 500
		limitMaxJobsService = new LimitMaxJobsService(this);
		// every service above has a corresponding ZK listener; this registers all of the services with their ZK nodes
		listenerManager = new ListenerManager(this);
		// reporting of execution results
		reportService = new ReportService(this);

	}

The roles these services play during execution will be expanded on later; for now let's move on to JobScheduler.init().

	public void init() {
		try {
			startAll();
			createJob();
			serverService.persistServerOnline(job);
			// Notify job enabled or disabled after that all are ready, include job was initialized.
			configService.notifyJobEnabledOrNot();
		} catch (Throwable t) {
			shutdown(false);
			throw t;
		}
	}

Initialization involves two steps: first start all of the services above, then create the job. Since the services follow the job's lifecycle, the focus is the job lifecycle itself: initialization, execution, state transitions and shutdown. createJob covers the job's initialization:

	private void createJob() {
		// get the current job type from the type manager; this article mainly deals with Java jobs
		Class<?> jobClass = currentConf.getSaturnJobClass();
		try {
			job = (AbstractElasticJob) jobClass.newInstance();
		} catch (Exception e) {
			LogUtils.error(log, jobName, "unexptected error", e);
			throw new JobException(e);
		}
		// wire the JobScheduler's services into the job
		job.setJobScheduler(this);
		job.setConfigService(configService);
		job.setShardingService(shardingService);
		job.setExecutionContextService(executionContextService);
		job.setExecutionService(executionService);
		job.setFailoverService(failoverService);
		job.setServerService(serverService);
		job.setExecutorName(executorName);
		job.setReportService(reportService);
		job.setJobName(jobName);
		job.setNamespace(coordinatorRegistryCenter.getNamespace());
		job.setSaturnExecutorService(saturnExecutorService);
		// initialize the job
		job.init();
	}

All of the job types:

public void registerJobType() {
   JobTypeManager.register( //
         JobTypeBuilder.newBuilder().name("JAVA_JOB").cron().java().allowedShutdownGracefully()
               .triggerClass(CronTrigger.class).handlerClass(SaturnJavaJob.class).build());
   JobTypeManager.register( //
         JobTypeBuilder.newBuilder().name("SHELL_JOB").cron().shell().allowedShutdownGracefully()
               .triggerClass(CronTrigger.class).handlerClass(SaturnScriptJob.class).build());
   JobTypeManager.register( //
         JobTypeBuilder.newBuilder().name("PASSIVE_JAVA_JOB").passive().java().allowedShutdownGracefully()
               .triggerClass(PassiveTrigger.class).handlerClass(SaturnJavaJob.class).build());
   JobTypeManager.register( //
         JobTypeBuilder.newBuilder().name("PASSIVE_SHELL_JOB").passive().shell().allowedShutdownGracefully()
               .triggerClass(PassiveTrigger.class).handlerClass(SaturnScriptJob.class).build());
}

For Java-related jobs, the object ultimately used is SaturnJavaJob. First, here is what a concrete business job looks like:

// AbstractSaturnJavaJob extends AbstractElasticJob
@Component
public class DemoJob extends AbstractSaturnJavaJob {

   private static final Logger log = LoggerFactory.getLogger(DemoJob.class);

   @Resource
   private DemoService demoService;

   @Override
   public SaturnJobReturn handleJavaJob(String jobName, Integer shardItem, String shardParam,
         SaturnJobExecutionContext shardingContext) throws InterruptedException {
      log.info("{} is running, item is {}", jobName, shardItem);
      demoService.doing();
      return new SaturnJobReturn();
   }

}
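A slightly fuller handleJavaJob sketch showing how a failure could be reported back to Saturn. It reuses only types that appear in this article (SaturnJobReturn, SaturnSystemReturnCode, SaturnSystemErrorGroup); whether SYSTEM_FAIL is the most appropriate code for business failures depends on your Saturn version, doBusiness is a placeholder, and imports are omitted as in the snippet above.

@Override
public SaturnJobReturn handleJavaJob(String jobName, Integer shardItem, String shardParam,
		SaturnJobExecutionContext shardingContext) throws InterruptedException {
	try {
		doBusiness(shardItem, shardParam); // placeholder for the real business logic
		return new SaturnJobReturn();      // success by default
	} catch (Exception e) {
		// report this shard item as failed so the console counts it as an error
		return new SaturnJobReturn(SaturnSystemReturnCode.SYSTEM_FAIL, e.getMessage(),
				SaturnSystemErrorGroup.FAIL);
	}
}

private void doBusiness(Integer shardItem, String shardParam) {
	// hypothetical business call
}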

Back to SaturnJavaJob's init:

public void init() {
	// AbstractSaturnJavaJob.init
	// super.init() is called first; the block below is its body
	{
   	Class<? extends Trigger> triggerClass = configService.getJobType().getTriggerClass();
		Trigger trigger = null;
		try {
			trigger = triggerClass.newInstance();
			trigger.init(this);
		} catch (Exception e) {
			LogUtils.error(log, jobName, "Trigger init failed", e);
			throw new JobException(e);
		}
		scheduler = new SaturnScheduler(this, trigger);
		scheduler.start();
		getExecutorService();
   }
	// create the job's business instance
	createJobBusinessInstanceIfNecessary();
	// get the job's version
	getJobVersionIfNecessary();
}

Based on the job type it obtains the Trigger class, then initializes the Trigger and the SaturnScheduler, starts the SaturnScheduler, and finally hands the task over to a SaturnWorker object for execution:

// new SaturnWorker(job, trigger.createTriggered(false, null), trigger.createQuartzTrigger()); created by SaturnScheduler. The Triggered object is mainly used to indicate whether a run has been triggered.
public SaturnWorker(AbstractElasticJob job, Triggered notTriggered, Trigger trigger) {
   this.job = job;
   this.notTriggered = notTriggered;
   this.triggered = notTriggered;
   initTrigger(trigger); // this.triggerObj = (OperableTrigger) trigger; initialize triggerObj
}
SaturnWorker.run() 
while (!halted.get()) {
			try {
				synchronized (sigLock) {
					while (paused && !halted.get()) {
						try {
							sigLock.wait(1000L);
						} catch (InterruptedException ignore) {
						}
					}
					if (halted.get()) {
						break;
					}
				}
				boolean noFireTime = false; // whether there is no next fire time; initialized to false
				long timeUntilTrigger = 1000;
				// i.e. the trigger passed in above
				if (triggerObj != null) {
					triggerObj.updateAfterMisfire(null);
					long now = System.currentTimeMillis();
					Date nextFireTime = triggerObj.getNextFireTime();
					if (nextFireTime != null) {
						timeUntilTrigger = nextFireTime.getTime() - now;
					} else {
						noFireTime = true;
					}
				}

				while (!noFireTime && timeUntilTrigger > 2) {
					synchronized (sigLock) {
						// halted?
						if (halted.get()) {
							break;
						}
						// already triggered (run-at-once)?
						if (triggered.isYes()) {
							break;
						}
						try {
							sigLock.wait(timeUntilTrigger);
						} catch (InterruptedException ignore) {
						}
						// recalculate the time until the next fire
						if (triggerObj != null) {
							long now = System.currentTimeMillis();
							Date nextFireTime = triggerObj.getNextFireTime();
							if (nextFireTime != null) {
								timeUntilTrigger = nextFireTime.getTime() - now;
							} else {
								noFireTime = true;
							}
						}
					}
				}
				boolean goAhead;
				Triggered currentTriggered = notTriggered; // notTriggered = trigger.createTriggered(false, null)
				// execution is triggered in only two cases: 1. the fire time has arrived  2. run-at-once was clicked
				synchronized (sigLock) {
					// halted or paused?
					goAhead = !halted.get() && !paused;
					// reset the run-at-once flag and take over the current run-at-once data
					if (triggered.isYes()) { // initialized to false
						currentTriggered = triggered;
						triggered = notTriggered;
					} else if (goAhead) { // not run-at-once: either the fire time has arrived, or there is no next fire time
						goAhead = goAhead && !noFireTime; // run the job only if there is a next fire time, i.e. the fire time has arrived
						if (goAhead) { // the fire time has arrived, so update the fire time
							if (triggerObj != null) {
								triggerObj.triggered(null); // quartz is used here; null is passed in to compute the next cron fire time
							}
						} else { // no next fire time: sleep one second to keep the loop from spinning and burning CPU (in case the cron is not changed back to periodic execution)
							try {
								sigLock.wait(1000L);
							} catch (InterruptedException ignore) {
							}
						}
					}
				}
				// the job starts executing
				if (goAhead) {
					job.execute(currentTriggered);
				}

			} catch (RuntimeException e) {
				LogUtils.error(log, job.getJobName(), e.getMessage(), e);
			}
		}

Because the SaturnWorker is submitted to a thread pool and runs asynchronously, the main thread has already moved on to the next step of init:

private void createJobBusinessInstanceIfNecessary() {
   // read the job's class name from the configuration, i.e. from ZK
   String jobClassStr = configService.getJobConfiguration().getJobClass();
   if (StringUtils.isBlank(jobClassStr)) {
      LogUtils.error(log, jobName, "jobClass is not set");
      throw new JobInitAlarmException("jobClass is not set");
   }
   jobClassStr = jobClassStr.trim();
   LogUtils.info(log, jobName, "start to create job business instance, jobClass is {}", jobClassStr);
   if (jobBusinessInstance == null) {
      ClassLoader oldClassLoader = Thread.currentThread().getContextClassLoader();
      ClassLoader jobClassLoader = saturnExecutorService.getJobClassLoader(); // still the JobClassLoader mentioned above
      Thread.currentThread().setContextClassLoader(jobClassLoader);
      try {
         // if a Spring Boot fat jar were used, /BOOT-INF/classes inside the jar could not be reached here, causing a ClassNotFoundException
         Class<?> jobClass = jobClassLoader.loadClass(jobClassStr);
         // as noted above, for a Spring project the job bean is obtained from the applicationContext; this call actually goes to getJobInstance in AbstractSaturnApplication. In other words, if you do not want Spring to manage your job, you can implement getJobInstance in your application class and return a job instance of the given class. The Spring implementation is applicationContext.getBean(jobClass).
         jobBusinessInstance = tryToGetSaturnBusinessInstanceFromSaturnApplication(jobClassLoader, jobClass);
         if (jobBusinessInstance == null) {
            try {
               // if nothing came from the application, the class's static getObject method can be called to create the job instance
               jobBusinessInstance = jobClass.getMethod("getObject").invoke(null);
               if (jobBusinessInstance != null) {
                  LogUtils.info(log, jobName, "get job instance from getObject");
               }
            } catch (NoSuchMethodException e) {
               LogUtils.info(log, jobName,
                     "the jobClass hasn't the static getObject method, will initialize job by default no arguments constructor method");
            }
         }
         // if the business code does not override getObject, BaseSaturnJob returns null by default
         if (jobBusinessInstance == null) {
            jobBusinessInstance = jobClass.newInstance();
            LogUtils.info(log, jobName, "get job instance from newInstance");
         }
         // no real use has been found for this class yet; setSaturnApi is an empty implementation as well
         SaturnApi saturnApi = new SaturnApi(getNamespace(), executorName);
         jobClass.getMethod("setSaturnApi", Object.class).invoke(jobBusinessInstance, saturnApi);
      } catch (Throwable t) {
         throw new JobInitAlarmException(logBusinessExceptionIfNecessary(jobName, t));
      } finally {
         Thread.currentThread().setContextClassLoader(oldClassLoader);
      }
   }
   if (jobBusinessInstance == null) {
      LogUtils.error(log, jobName, "job instance is null");
      throw new JobInitAlarmException("job instance is null");
   }
}
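As a sketch of the static getObject fallback mentioned in the comments above (class names are illustrative and imports are omitted), a job that is not managed by Spring can expose its instance like this:

public class LegacyJob extends AbstractSaturnJavaJob {

	private static final LegacyJob INSTANCE = buildWithDependencies();

	// looked up reflectively, as described above: jobClass.getMethod("getObject").invoke(null)
	public static Object getObject() {
		return INSTANCE;
	}

	private static LegacyJob buildWithDependencies() {
		LegacyJob job = new LegacyJob();
		// wire whatever the job needs here, without a Spring context
		return job;
	}

	@Override
	public SaturnJobReturn handleJavaJob(String jobName, Integer shardItem, String shardParam,
			SaturnJobExecutionContext shardingContext) {
		return new SaturnJobReturn();
	}
}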

At this point the SaturnWorker thread is running and the job has been fully initialized.

Job execution

The job has been initialized above, and job.execute(currentTriggered) has in fact already been called, which means the job has entered its execution phase. Let's look at the concrete execution flow:

//AbstractElasticJob 
public final void execute(final Triggered triggered) {
   LogUtils.debug(log, jobName, "Saturn start to execute job [{}]", jobName);
   // each JobScheduler has only one job object, reused across runs, so it must be reset before every execution
   reset();
   if (configService == null) {
      LogUtils.warn(log, jobName, "configService is null");
      return;
   }
   JobExecutionMultipleShardingContext shardingContext = null;
   try {
     // if the job reports its status (cron and passive jobs report by default) and this host has no failover items assigned, perform sharding
     // while a job executes, it registers itself under the execution node
      if (!configService.isEnabledReport() || failoverService.getLocalHostFailoverItems().isEmpty()) {
        // if reporting is disabled, or this executor's failover items are empty, shard if necessary
         shardingService.shardingIfNecessary();
      }
			// check whether the job is enabled
      if (!configService.isJobEnabled()) {
         LogUtils.debug(log, jobName, "{} is disabled, cannot be continued, do nothing about business.",
               jobName);
         return;
      }
		  // the sharding context
      shardingContext = executionContextService.getJobExecutionShardingContext(triggered);
      if (shardingContext.getShardingItems() == null || shardingContext.getShardingItems().isEmpty()) {
         LogUtils.debug(log, jobName, "{} 's items of the executor is empty, do nothing about business.",
               jobName);
         callbackWhenShardingItemIsEmpty(shardingContext);
         return;
      }

      if (configService.isInPausePeriod()) {
         LogUtils.info(log, jobName,
               "the job {} current running time is in pausePeriod, do nothing about business.", jobName);
         return;
      }

      executeJobInternal(shardingContext);

      if (isFailoverSupported() && configService.isFailover() && !stopped && !forceStopped && !aborted) {
         failoverService.failoverIfNecessary();
      }

      LogUtils.debug(log, jobName, "Saturn finish to execute job [{}], sharding context:{}.", jobName,
            shardingContext);
   } catch (Exception e) {
      LogUtils.warn(log, jobName, e.getMessage(), e);
   } finally {
      running = false;
   }
}

When deciding whether to shard, it first checks whether there are failover items assigned to this job server:

	/**
	 * Get the failover items running on this job server.
	 *
	 * @return the failover items running on this job server
	 */
	public List<Integer> getLocalHostFailoverItems() {
		// the ZK path is /${namespace}/$Jobs/${jobName}/execution, whose children are the shard items
		List<String> items = getJobNodeStorage().getJobNodeChildrenKeys(ExecutionNode.ROOT);
		List<Integer> result = new ArrayList<>(items.size());
		for (String each : items) {
			int item = Integer.parseInt(each);
			// the ZK path is /${namespace}/$Jobs/${jobName}/execution/${item}/failover
			String node = FailoverNode.getExecutionFailoverNode(item);
			// check whether /${namespace}/$Jobs/${jobName}/execution/${item}/failover exists
			// that path is an ephemeral node; if it exists, its failover value is an executorName
			if (getJobNodeStorage().isJobNodeExisted(node) && executorName
					.equals(getJobNodeStorage().getJobNodeDataDirectly(node))) {
				result.add(item);
			}
		}
		Collections.sort(result);
		return result;
	}

Since cron jobs need sharding by default, in a sense a sharding check happens on every execution:

	/**
	 * If sharding is necessary and the current node is the leader, shard the job.
	 */
	public synchronized void shardingIfNecessary() throws JobShuttingDownException {
		if (isShutdown) {
			return;
		}
		GetDataStat getDataStat = null;
		// read the data from /${namespace}/$Jobs/${jobName}/leader/sharding/necessary
		if (getJobNodeStorage().isJobNodeExisted(ShardingNode.NECESSARY)) {
			getDataStat = getNecessaryDataStat();
		}
		// if the sharding necessary content is empty or "0", return; otherwise sharding is needed
		if (getDataStat == null || SHARDING_UN_NECESSARY.equals(getDataStat.getData())) {
			return;
		}
		// if this node is not the leader, wait until the leader finishes (another loop; it exits only when 1. shut down, or 2. sharding is no longer necessary and not processing)
		if (blockUntilShardingComplatedIfNotLeader()) {
			return;
		}
		// if any shard of the job is still running, wait (indefinitely)
		waitingOtherJobCompleted();
		// create an ephemeral node to mark that sharding is in progress
		// processing
		getJobNodeStorage().fillEphemeralJobNode(ShardingNode.PROCESSING, "");
		try {
			// remove the sharding nodes of all JobServers under this job
			clearShardingInfo();

			int retryCount = 3;
			while (!isShutdown) {
				boolean needRetry = false;
				int version = getDataStat.getVersion();
				// first try to read from the job's leader/sharding/necessary node; on failure, fall back to $SaturnExecutors/sharding/content
				// key is executor, value is sharding items
				Map<String, List<Integer>> shardingItems = namespaceShardingContentService
						.getShardContent(jobName, getDataStat.getData());
				try {
					// the check+create for every jobserver, plus setting the sharding necessary content to 0, all happen in one transaction
					CuratorTransactionFinal curatorTransactionFinal = getJobNodeStorage().getClient().inTransaction()
							.check().forPath("/").and();
					for (Entry<String, List<Integer>> entry : shardingItems.entrySet()) {
						// create /${namespace}/$Jobs/${jobName}/servers/${executorName}/sharding and write the necessary data
						curatorTransactionFinal.create().forPath(
								JobNodePath.getNodeFullPath(jobName, ShardingNode.getShardingNode(entry.getKey())),
								ItemUtils.toItemsString(entry.getValue()).getBytes(StandardCharsets.UTF_8)).and();
					}
					curatorTransactionFinal.setData().withVersion(version)
							.forPath(JobNodePath.getNodeFullPath(jobName, ShardingNode.NECESSARY),
									SHARDING_UN_NECESSARY.getBytes(StandardCharsets.UTF_8)).and();
					curatorTransactionFinal.commit();
				} catch (BadVersionException e) {
					LogUtils.warn(log, jobName, "zookeeper bad version exception happens.", e);
					needRetry = true;
					retryCount--;
				} catch (Exception e) {
					// multiple sharding tasks may make the computed result lag while the server node has already been removed, causing the commit to fail
					// this may not affect the final outcome: shards can still be assigned correctly because another resharding event will be handled
					// the log level is lowered to warn to avoid unnecessary alarms
					LogUtils.warn(log, jobName, "Commit shards failed", e);
				}
				if (needRetry) {
					if (retryCount >= 0) {
						LogUtils.info(log, jobName,
								"Bad version because of concurrency, will retry to get shards later");
						Thread.sleep(200L); // NOSONAR
						getDataStat = getNecessaryDataStat();
					} else {
						LogUtils.warn(log, jobName, "Bad version because of concurrency, give up to retry");
						break;
					}
				} else {
					break;
				}
			}
		} catch (Exception e) {
			LogUtils.error(log, jobName, e.getMessage(), e);
		} finally {
			getJobNodeStorage().removeJobNodeIfExisted(ShardingNode.PROCESSING);
		}
	}

After sharding completes, the executor fetches its own shard items before executing:

/**
	 * Get the runtime sharding context of the current job server.
	 *
	 * @return the runtime sharding context of the current job server
	 */
	public JobExecutionMultipleShardingContext getJobExecutionShardingContext(final Triggered triggered) {
		SaturnExecutionContext result = new SaturnExecutionContext();
		result.setJobName(configService.getJobName());
		result.setShardingTotalCount(configService.getShardingTotalCount());
		result.setTriggered(triggered);
		// read this executor's shard items from /${namespace}/$Jobs/${jobName}/servers/${executorName}/sharding; this value was written during the sharding step above
		List<Integer> shardingItems = getShardingItems();
		boolean isEnabledReport = configService.isEnabledReport();
		if (isEnabledReport) {
			// read the items that are still running from /${namespace}/$Jobs/${jobName}/execution/${item}/running and remove them
			removeRunningItems(shardingItems);
		}
		// set the shard items to execute this time
		result.setShardingItems(shardingItems);
		// the job parameter passed in
		result.setJobParameter(configService.getJobParameter());
		// the business custom context
		result.setCustomContext(configService.getCustomContext());
		result.setJobConfiguration(jobConfiguration);

		if (coordinatorRegistryCenter != null) {
			result.setNamespace(coordinatorRegistryCenter.getNamespace());
			result.setExecutorName(coordinatorRegistryCenter.getExecutorName());
		}

		if (result.getShardingItems().isEmpty()) {
			return result;
		}
		// get the shard parameter table
		Map<Integer, String> shardingItemParameters = configService.getShardingItemParameters();
		if (shardingItemParameters.containsKey(-1)) { // local mode
			for (int each : result.getShardingItems()) {
				result.getShardingItemParameters().put(each, shardingItemParameters.get(-1));
			}
		} else {
			for (int each : result.getShardingItems()) {
				if (shardingItemParameters.containsKey(each)) {
					result.getShardingItemParameters().put(each, shardingItemParameters.get(each));
				}
			}
		}
		if (jobConfiguration.getTimeoutSeconds() > 0) {
			result.setTimetoutSeconds(jobConfiguration.getTimeoutSeconds());
		}
		// return the execution context
		return result;
	}

With the shard items in hand, execution begins:

private void executeJobInternal(final JobExecutionMultipleShardingContext shardingContext) throws Exception {
	// register the shard items' execution status as running
   executionService.registerJobBegin(shardingContext);

   try {
      // enter the actual execution logic
      executeJob(shardingContext);
   } finally {
      List<Integer> shardingItems = shardingContext.getShardingItems();
      if (!shardingItems.isEmpty()) {
         Date nextFireTimePausePeriodEffected = jobScheduler.getNextFireTimePausePeriodEffected();
         boolean isEnabledReport = configService.isEnabledReport();
         for (int item : shardingItems) {
            if (isEnabledReport && !checkIfZkLostAfterExecution(item)) {
               continue;
            }
            if (!aborted) {
           // register the job-completion info, used by the console UI. Note that it is registered regardless of whether the job reports status (the /config/enabledReport node).
               executionService
                     .registerJobCompletedByItem(shardingContext, item, nextFireTimePausePeriodEffected);
            }
           // if this item was a failover item, remove it from the failover list
            if (isFailoverSupported() && configService.isFailover()) {
               failoverService.updateFailoverComplete(item);
            }
         }
      }
     // run downstream jobs via the HTTP endpoint consoleUri + "/rest/v1/" + namespace + "/jobs/" + jobName + "/runDownStream"
      runDownStream(shardingContext);
   }
}
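The downstream trigger in the finally block above is just an HTTP call to the console. Below is a minimal sketch using HttpURLConnection; the console URI, namespace and job name are placeholders, and the HTTP method, body and headers are assumptions rather than the exact request Saturn sends.

import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.StandardCharsets;

public class RunDownStreamDemo {
	public static void main(String[] args) throws Exception {
		String consoleUri = "http://127.0.0.1:9088";
		String namespace = "demo1";
		String jobName = "demoJob";
		URL url = new URL(consoleUri + "/rest/v1/" + namespace + "/jobs/" + jobName + "/runDownStream");

		HttpURLConnection conn = (HttpURLConnection) url.openConnection();
		conn.setRequestMethod("POST");          // assumed method
		conn.setDoOutput(true);
		conn.setRequestProperty("Content-Type", "application/json");
		try (OutputStream out = conn.getOutputStream()) {
			out.write("{}".getBytes(StandardCharsets.UTF_8)); // assumed (empty) body
		}
		System.out.println("runDownStream response code: " + conn.getResponseCode());
		conn.disconnect();
	}
}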

Finally, executeJob runs:

protected final void executeJob(final JobExecutionMultipleShardingContext shardingContext) {
		if (!(shardingContext instanceof SaturnExecutionContext)) {
			LogUtils.error(log, jobName, "!!! The context must be instance of SaturnJobExecutionContext !!!");
			return;
		}
		long start = System.currentTimeMillis();

		SaturnExecutionContext saturnContext = (SaturnExecutionContext) shardingContext;
		saturnContext.setSaturnJob(true);
		// the return results per shard item
		Map<Integer, SaturnJobReturn> retMap = new HashMap<Integer, SaturnJobReturn>();

		// shardingItemParameters holds the key/value pairs parsed from the parameter table
		Map<Integer, String> shardingItemParameters = saturnContext.getShardingItemParameters();

		// items are the shard items this execution needs to process
		List<Integer> items = saturnContext.getShardingItems();

		LogUtils.info(log, jobName, "Job {} handle items: {}", jobName, items);

		for (Integer item : items) {
			// tolerate misconfiguration: e.g. with 3 shards configured but a parameter table of 0=*, 2=*, shard 1 will not be executed
			if (!shardingItemParameters.containsKey(item)) {
				LogUtils.error(log, jobName,
						"The {} item's parameter is not valid, will not execute the business code, please check shardingItemParameters",
						items);
				SaturnJobReturn errRet = new SaturnJobReturn(SaturnSystemReturnCode.SYSTEM_FAIL,
						"Config of parameter is not valid, check shardingItemParameters", SaturnSystemErrorGroup.FAIL);
				retMap.put(item, errRet);
			}
		}

		Map<Integer, SaturnJobReturn> handleJobMap = handleJob(saturnContext);
		if (handleJobMap != null) {
			retMap.putAll(handleJobMap);
		}

		// merge the results
		for (Integer item : items) {
			if (item == null) {
				continue;
			}
			SaturnJobReturn saturnJobReturn = retMap.get(item);
			if (saturnJobReturn == null) {
				saturnJobReturn = new SaturnJobReturn(SaturnSystemReturnCode.SYSTEM_FAIL,
						"Can not find the corresponding SaturnJobReturn", SaturnSystemErrorGroup.FAIL);
				retMap.put(item, saturnJobReturn);
			}
      // report the result to ProcessCountStatistics, which is eventually written to ZK
			updateExecuteResult(saturnJobReturn, saturnContext, item);
		}
		long end = System.currentTimeMillis();
		LogUtils.info(log, jobName, "{} finished, totalCost={}ms, return={}", jobName, (end - start), retMap);
	}

如果当前的jobjava,则进入:

//SaturnJavaJob
protected Map<Integer, SaturnJobReturn> handleJob(final SaturnExecutionContext shardingContext) {
		final Map<Integer, SaturnJobReturn> retMap = new HashMap<Integer, SaturnJobReturn>();

		synchronized (futureTaskMap) {
			futureTaskMap.clear();

			final String jobName = shardingContext.getJobName();
			final int timeoutSeconds = getTimeoutSeconds();

			ExecutorService executorService = getExecutorService();

			// handle the custom job parameter
			String jobParameter = shardingContext.getJobParameter();

			// shardingItemParameters holds the key/value pairs parsed from the parameter table
			Map<Integer, String> shardingItemParameters = shardingContext.getShardingItemParameters();

			for (final Entry<Integer, String> shardingItem : shardingItemParameters.entrySet()) {
				final Integer key = shardingItem.getKey();
				try {
					String jobValue = shardingItem.getValue();
					final String itemVal = getRealItemValue(jobParameter, jobValue); // 作业分片的对应值
					// submit the work to an async thread pool so that the shard items can run almost concurrently
					ShardingItemFutureTask shardingItemFutureTask = new ShardingItemFutureTask(
							createCallable(jobName, key, itemVal, timeoutSeconds, shardingContext, this), null);
					Future<?> callFuture = executorService.submit(shardingItemFutureTask);
					if (timeoutSeconds > 0) {
						TimeoutSchedulerExecutor.scheduleTimeoutJob(shardingContext.getExecutorName(), timeoutSeconds,
								shardingItemFutureTask);
					}
					shardingItemFutureTask.setCallFuture(callFuture);
					futureTaskMap.put(key, shardingItemFutureTask);
				} catch (Throwable t) {
					LogUtils.error(log, jobName, t.getMessage(), t);
					retMap.put(key, new SaturnJobReturn(SaturnSystemReturnCode.SYSTEM_FAIL, t.getMessage(),
							SaturnSystemErrorGroup.FAIL));
				}
			}
		}
	// collect the execution results
		for (Entry<Integer, ShardingItemFutureTask> entry : futureTaskMap.entrySet()) {
			Integer item = entry.getKey();
			ShardingItemFutureTask futureTask = entry.getValue();
			try {
				futureTask.getCallFuture().get();
			} catch (Exception e) {
				LogUtils.error(log, jobName, e.getMessage(), e);
				retMap.put(item, new SaturnJobReturn(SaturnSystemReturnCode.SYSTEM_FAIL, e.getMessage(),
						SaturnSystemErrorGroup.FAIL));
				continue;
			}
			retMap.put(item, futureTask.getCallable().getSaturnJobReturn());
		}

		synchronized (futureTaskMap) {
			futureTaskMap.clear();
		}

		return retMap;
	}

In the end, execution happens inside JavaShardingItemCallable:

public SaturnJobReturn call() {
		reset();

		SaturnSystemOutputStream.initLogger();
		currentThread = Thread.currentThread();
		SaturnJobReturn temp = null;
		try {

			beforeExecution();
			// call Saturn's abstract execution method
			temp = doExecution();

			// after this point the thread can no longer be force-stopped
			breakForceStop = true;
		} catch (Throwable t) {
			// after this point the thread can no longer be force-stopped
			breakForceStop = true;

			// neither a timeout nor a force stop: log the error and set the SaturnJobReturn
			if (status.get() != TIMEOUT && status.get() != FORCE_STOP) {
				LogUtils.error(log, jobName, t.toString(), t);
				temp = new SaturnJobReturn(SaturnSystemReturnCode.SYSTEM_FAIL, t.getMessage(),
						SaturnSystemErrorGroup.FAIL);
			}

		} finally {
			if (status.compareAndSet(INIT, SUCCESS)) {
				saturnJobReturn = temp;
			}

			if (saturnJob != null && saturnJob.getConfigService().showNormalLog()) {
				String jobLog = SaturnSystemOutputStream.clearAndGetLog();
				if (jobLog != null && jobLog.length() > SaturnConstant.MAX_JOB_LOG_DATA_LENGTH) {
					LogUtils.info(log, jobName,
							"As the job log exceed max length, only the previous {} characters will be reported",
							SaturnConstant.MAX_JOB_LOG_DATA_LENGTH);
					jobLog = jobLog.substring(0, SaturnConstant.MAX_JOB_LOG_DATA_LENGTH);
				}

				this.shardingContext.putJobLog(this.item, jobLog);
			}
		}

		return saturnJobReturn;
	}

A closer look at state

We have now walked through a complete job run. Next, let's look at the states of executors and jobs in more detail, combining the source code with the Saturn wiki. Architecture diagram:

(figure: logic.png)

Saturn relies heavily on ZK: executor management, pushing job configuration, and reporting results are all done through ZK.

Executor state:

Going online:

As mentioned above, when an executor comes online it registers itself at the corresponding /${namespace}/$SaturnExecutors/executors/${executorName} node.

In the console, every namespace has its own NamespaceShardingManager, which watches the /${namespace}/$SaturnExecutors/executors/ path and reacts with ExecuteOnlineShardingTask. When the event is TreeCacheEvent.Type.NODE_ADDED, the online logic runs; it goes through the AbstractAsyncShardingTask.run method described above, applies the corresponding sharding algorithm (an implementation of AbstractAsyncShardingTask.pick), and after resharding writes the result into /${namespace}/$Jobs/${jobName}/leader/sharding/necessary. As described earlier, before every execution the job checks whether resharding is needed, and the criterion is whether this necessary value is 0, because after each sharding completes, the executor-side leader writes this value back to 0.
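A small Curator sketch of that necessary handshake, following the paths quoted in this article (the namespace, job name and client setup are illustrative, not Saturn's actual code):

import org.apache.curator.framework.CuratorFramework;
import org.apache.curator.framework.CuratorFrameworkFactory;
import org.apache.curator.retry.ExponentialBackoffRetry;

public class ShardingNecessaryDemo {
	public static void main(String[] args) throws Exception {
		CuratorFramework client = CuratorFrameworkFactory.builder()
				.connectString("127.0.0.1:2181")
				.namespace("demo1") // the Saturn namespace
				.retryPolicy(new ExponentialBackoffRetry(1000, 3))
				.build();
		client.start();

		String necessary = "/$Jobs/demoJob/leader/sharding/necessary";
		String value = new String(client.getData().forPath(necessary), "UTF-8");
		if (!"0".equals(value)) {
			System.out.println("resharding needed, content: " + value);
			// the executor-side leader would persist servers/${executor}/sharding here, then reset the flag:
			client.setData().forPath(necessary, "0".getBytes("UTF-8"));
		}
		client.close();
	}
}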

Going offline: much the same as going online, except the watched event is TreeCacheEvent.Type.NODE_REMOVED.

Job state:

(figure: job-status.png)

Job state falls into two categories. The first is manual operations: disabling, adding, enabling, or deleting a job from the console. These operations are done from the console's side, i.e. by the NamespaceShardingManager above; when it picks, it removes the job from the scheduling list, and the executor, when applying the sharding data, then either schedules the job or does not.

For example:

Enabling and disabling a job: the console watches the enable value under the job's /config node and ultimately implements it by adding or removing the job on the executors.

The other category is problems that occur while the job is running, such as shards needing failover; these state transitions happen on the executor. JobScheduler.init() calls startAll, which starts all of the path listeners:

	@Override
	public void start() {
		electionListenerManager = new ElectionListenerManager(jobScheduler);
		failoverListenerManager = new FailoverListenerManager(jobScheduler);
		jobOperationListenerManager = new JobOperationListenerManager(jobScheduler);
		configurationListenerManager = new ConfigurationListenerManager(jobScheduler);
		shardingListenerManager = new ShardingListenerManager(jobScheduler);
		analyseResetListenerManager = new AnalyseResetListenerManager(jobScheduler);
		controlListenerManager = new ControlListenerManager(jobScheduler);

		electionListenerManager.start();
		failoverListenerManager.start();
		jobOperationListenerManager.start();
		configurationListenerManager.start();
		shardingListenerManager.start();
		analyseResetListenerManager.start();
		controlListenerManager.start();
	}

Some of these listeners exist mainly so the client becomes aware of the job's current state. For example, clicking run-at-once is picked up by JobOperationListenerManager, which triggers an immediate run. When a value such as enabled is changed to false, i.e. the job is disabled, ConfigurationListenerManager updates the current config; during execution, AbstractElasticJob.execute then checks whether the job is enabled, and the job's onEnabled method is also called.

Failover after a job starts running:

As mentioned above, when a shard starts running, an ephemeral running node is registered under the corresponding execution/${item}/; when it completes, completed info is registered as well. While the job runs, FailoverListenerManager watches the execution path. There are two kinds of listeners:

	class RunningPathListener implements NodeCacheListener {
		private int item;
		public RunningPathListener(int item) {
			this.item = item;
		}
		@Override
		public void nodeChanged() throws Exception {
      // this listens on execution/${item}/; while running, the executor name is written under that node, so a change means one of two things: the shard has already completed, or the shard shut down abnormally and needs failover.
      //getJobNodeStorage().fillEphemeralJobNode(ExecutionNode.getRunningNode(item), executorName);
			zkCacheManager.getExecutorService().execute(new Runnable() {
				@Override
				public void run() {
					try {
						if (isShutdown) {
							return;
						}
						if (!executionService.isRunning(item)) {
							failover(item);
						}
					} catch (Throwable t) {
						LogUtils.error(log, jobName, t.getMessage(), t);
					}
				}
			});
		}
	}

A change to the running node's data falls into a few cases: a running node was added, meaning the shard started running; or a running node was removed, which in turn means one of two things: either the run finished normally, or it failed abnormally:

private synchronized void failover(final Integer item) {
   if (jobScheduler == null || jobScheduler.getJob() == null) {
      return;
   }
   if (!jobScheduler.getJob().isFailoverSupported() || !configService.isFailover() || executionService
         .isCompleted(item)) {
      return;
   }

   failoverService.createCrashedFailoverFlag(item);

   if (!executionService.hasRunningItems(jobScheduler.getShardingService().getLocalHostShardingItems())) {
      failoverService.failoverIfNecessary();
   }
}

So failover first checks whether failover is allowed and whether the item completed normally; if neither rules it out, it creates the crashed-failover flag and then runs the failover logic:

class FailoverLeaderExecutionCallback implements LeaderExecutionCallback {

   @Override
   public void execute() {
      if (!needFailover()) {
         return;
      }
      if (jobScheduler == null) {
         return;
      }
      if (coordinatorRegistryCenter.isExisted(SaturnExecutorsNode.getExecutorNoTrafficNodePath(executorName))) {
         return;
      }
      if (!jobScheduler.getConfigService().getPreferList().contains(executorName) && !jobScheduler
            .getConfigService().isUseDispreferList()) {
         return;
      }
      List<String> items = getJobNodeStorage().getJobNodeChildrenKeys(FailoverNode.ITEMS_ROOT);
      if (items != null && !items.isEmpty()) {
         int crashedItem = Integer
               .parseInt(getJobNodeStorage().getJobNodeChildrenKeys(FailoverNode.ITEMS_ROOT).get(0));
         LogUtils.debug(log, jobName, "Elastic job: failover job begin, crashed item:{}.", crashedItem);
         getJobNodeStorage()
               .fillEphemeralJobNode(FailoverNode.getExecutionFailoverNode(crashedItem), executorName);
         getJobNodeStorage().removeJobNodeIfExisted(FailoverNode.getItemsNode(crashedItem));
         jobScheduler.triggerJob(null);
      }
   }
}

First a leader is elected. Then, in FailoverLeaderExecutionCallback, it checks whether the current node's traffic has been drained, whether a prefer list exists, and whether the job is restricted to the prefer list only. If a prefer list is configured and the current executor is not on it, but the job is not restricted to the prefer list only (useDispreferList is allowed), failover can still proceed; if a prefer list is configured, only-prefer-list is enforced, and the current executor is not on the prefer list, no failover happens on this executor. It then fetches the failover item list, takes the crashed item, marks it with its executorName, and triggers the job, completing the failover.