Reading the Flink Source Code (Part 1): The Runtime Mechanism


Since the Flink version we currently run in production is 1.11, this blog series walks through the Flink 1.11 codebase.

Corresponding documentation section: ci.apache.org/projects/fl…

First, we need to build the entire Flink project (e.g. with mvn clean install -DskipTests).

The overall workflow of the Flink Runtime

Job submission

Suppose we have the following Flink CEP job:


public class AlertsGeoHashRepSubJob {
	private static final int PAUSE = 1000;
	private static final int NUMBER_OF_EVENTS_STD = 100;
	private static final int NUMBER_OF_EVENTS_MEAN = 180;
	private static final int NUMBER_OF_ZONES = 1;
	private static final int DELTA_LIMIT = 70;
	
	public static void main(String[] args) throws Exception {
		//final StreamExecutionEnvironment env = new BobStreamExecutionEnvironmentFactory().createExecutionEnvironment();
		StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // setting Parallelism to 1 
        env.setParallelism(1);
        //env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
		AlertsGeoHashRepSubJob s = new AlertsGeoHashRepSubJob();
		s.addJob(env);
	>>	env.execute(AlertsGeoHashRepSubJob.class.getCanonicalName());
	}

	@SuppressWarnings("serial")
	public void addJob(StreamExecutionEnvironment env) throws Exception {
		DataStream<GeoHashEvent> inputEventStream = env.addSource(new GeoHashEventsGenerator(PAUSE, NUMBER_OF_EVENTS_STD, NUMBER_OF_EVENTS_MEAN, NUMBER_OF_ZONES));
		KeyedStream<GeoHashEvent, String> inputEventStreamKeyed = inputEventStream.keyBy(new KeySelector<GeoHashEvent, String>() {
			@Override
			public String getKey(GeoHashEvent value) throws Exception {
				return value.getGeohash();
			}
		});


		Pattern<GeoHashEvent, ?> warningPattern = Pattern.<GeoHashEvent>begin("first")
				.where(new IterativeCondition<GeoHashEvent>() {
					@Override
					public boolean filter(GeoHashEvent value, Context<GeoHashEvent> ctx) throws Exception {
					int a = Math.abs(value.getDeltaGPRSEvents()) * 100;
						if (a == 0 || value.getTotalGPRSEvents() == 0) {
							return false;
						}
						// cast before dividing so the percentage keeps its fractional part
						double b = (double) a / value.getTotalGPRSEvents();
						int delta = (int) Math.abs(b);
						return delta > DELTA_LIMIT;
					}
				})
				.next("second")
                .where(new IterativeCondition<GeoHashEvent>() {
					@Override
					public boolean filter(GeoHashEvent value, Context<GeoHashEvent> ctx) throws Exception {
					int a = Math.abs(value.getDeltaGPRSEvents()) * 100;
						if (a == 0 || value.getTotalGPRSEvents() == 0) {
							return false;
						}
						// cast before dividing so the percentage keeps its fractional part
						double b = (double) a / value.getTotalGPRSEvents();
						int delta = (int) Math.abs(b);
						return delta > DELTA_LIMIT;
					}
				})
                .within(Time.minutes(15));


		DataStream<GeoHashEvent> inputStream = inputEventStreamKeyed.countWindow(2,1).apply(new WindowFunction<GeoHashEvent, GeoHashEvent, String, GlobalWindow>() {
			@Override
			public void apply(String key, GlobalWindow window, Iterable<GeoHashEvent> input, Collector<GeoHashEvent> out) throws Exception {
				// there should be two events here; compute the delta between them
				List<GeoHashEvent> l = new ArrayList<>();
				input.forEach(l::add);
				if(l.size() == 2) {
					l.get(1).setDeltaGPRSEvents(l.get(0).getTotalGPRSEvents() - l.get(1).getTotalGPRSEvents());
					out.collect(l.get(1));
				} else { // this is the first event for this particular key ... so the delta is 0
					l.get(0).setDeltaGPRSEvents(0);
					out.collect(l.get(0));
				}
				
			}
		});
		
		DataStream<Tuple2<GeoHashEvent,GeoHashEvent>> result = CEP.pattern(inputStream.keyBy(new KeySelector<GeoHashEvent, String>() {
			@Override
			public String getKey(GeoHashEvent value) throws Exception {
				return value.getGeohash();
			}}), warningPattern)
				.select(
					new PatternSelectFunction<GeoHashEvent, Tuple2<GeoHashEvent,GeoHashEvent>>() {
						@Override
						public Tuple2<GeoHashEvent,GeoHashEvent>  select(Map<String, List<GeoHashEvent>> pattern) throws Exception {
							return new Tuple2<GeoHashEvent, GeoHashEvent>((GeoHashEvent)pattern.get("first").get(0), (GeoHashEvent)pattern.get("second").get(0));
						}
					}
				);
		
		result.print("ALARM");


		PatternStream<GeoHashEvent> result1 = CEP.pattern(inputStream.keyBy(new KeySelector<GeoHashEvent, String>() {
			@Override
			public String getKey(GeoHashEvent value) throws Exception {
				return value.getGeohash();
			}}), warningPattern);
	}
}


We start from StreamExecutionEnvironment#execute:

public JobExecutionResult execute(String jobName) throws Exception {
		Preconditions.checkNotNull(jobName, "Streaming Job name should not be null.");

		return execute(getStreamGraph(jobName));
	}
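Before diving into the two phases, it helps to see what the StreamGraph built here actually looks like. Below is a minimal, hedged sketch against the public Flink 1.11 API (the toy pipeline is mine, not the CEP job above); getExecutionPlan() builds the StreamGraph from the recorded transformations and dumps it as JSON:

StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.fromElements(1, 2, 3).print();
// getExecutionPlan() internally builds the StreamGraph and renders its nodes/edges as JSON
System.out.println(env.getExecutionPlan());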

Execution flow of the environment

1. Generate the StreamGraph
2. Translate the StreamGraph into a JobGraph

Generating the StreamGraph

1. Initialize the StreamGraph's configuration

public StreamGraph generate() {
		//apply the ExecutionConfig (e.g. the job restart/recovery strategy) and the
		//checkpoint config (crash-recovery strategy, how often snapshots are taken)
		streamGraph = new StreamGraph(executionConfig, checkpointConfig, savepointRestoreSettings);
		streamGraph.setStateBackend(stateBackend);
		streamGraph.setChaining(chaining);
		streamGraph.setScheduleMode(scheduleMode);
		//register the artifacts (jars/files) the job needs to load; user-defined jars can be added here as well
		streamGraph.setUserArtifacts(userArtifacts);

		//set the time characteristic (event/ingestion/processing time) used by this dataflow graph
		streamGraph.setTimeCharacteristic(timeCharacteristic);
		streamGraph.setJobName(jobName);

		//set the data exchange mode, i.e. whether TaskManagers shuffle: after an operator's
		//subtask finishes on one node, data is pulled from another node so CPUs idle less
		streamGraph.setGlobalDataExchangeMode(globalDataExchangeMode);

		alreadyTransformed = new HashMap<>();

		//start translating the transformations defined in the user program
		for (Transformation<?> transformation: transformations) {
			transform(transformation);
		}

		final StreamGraph builtStreamGraph = streamGraph;

		//clear the bookkeeping of already-transformed operators
		alreadyTransformed.clear();
		alreadyTransformed = null;
		streamGraph = null;

		return builtStreamGraph;
	}
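For orientation, here is roughly where the fields copied into the StreamGraph above originate on the user-facing API. This is a hedged sketch assuming env is the StreamExecutionEnvironment of the job (paths and values are illustrative):

env.setStateBackend(new FsStateBackend("file:///tmp/flink-checkpoints")); // -> streamGraph.setStateBackend(...)
env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);            // -> streamGraph.setTimeCharacteristic(...)
env.setBufferTimeout(100);                                                // -> defaultBufferTimeout used later in transform()
env.getCheckpointConfig().setCheckpointInterval(60_000);                  // -> checkpointConfig: how often snapshots are taken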

2. Expand the transform function and look at the line marked with >>. Since the Flink job we submitted has only one input source and performs no broadcast or similar operations in between, execution enters the single-input branch.

/**
	 * Transforms one {@code Transformation}.
	 *
	 * <p>This checks whether we already transformed it and exits early in that case. If not it
	 * delegates to one of the transformation specific methods.
	 */
	private Collection<Integer> transform(Transformation<?> transform) {

		if (alreadyTransformed.containsKey(transform)) {
			return alreadyTransformed.get(transform);
		}

		LOG.debug("Transforming " + transform);

		if (transform.getMaxParallelism() <= 0) {

			// if the max parallelism hasn't been set, then first use the job wide max parallelism
			// from the ExecutionConfig.
			int globalMaxParallelismFromConfig = executionConfig.getMaxParallelism();
			if (globalMaxParallelismFromConfig > 0) {
				transform.setMaxParallelism(globalMaxParallelismFromConfig);
			}
		}

		// call at least once to trigger exceptions about MissingTypeInfo
		transform.getOutputType();
		
        //recursively invoke transform on each Transformation to set the partitioning
        //strategy and the output data type after every transformation
		Collection<Integer> transformedIds;
        //single-input transformation
		if (transform instanceof OneInputTransformation<?, ?>) {
	>>		transformedIds = transformOneInputTransform((OneInputTransformation<?, ?>) transform);
		} else if (transform instanceof SourceTransformation) {
			transformedIds = transformSource((SourceTransformation<?>) transform);
		}
        // ... other branches elided ...
        //sink transformation
        else if (transform instanceof SinkTransformation<?>) {
			transformedIds = transformSink((SinkTransformation<?>) transform);
		} else if (transform instanceof UnionTransformation<?>) {
			transformedIds = transformUnion((UnionTransformation<?>) transform);
		} else if (transform instanceof SplitTransformation<?>) {
			transformedIds = transformSplit((SplitTransformation<?>) transform);
		} else if (transform instanceof SelectTransformation<?>) {
			transformedIds = transformSelect((SelectTransformation<?>) transform);
		} else if (transform instanceof FeedbackTransformation<?>) {
			transformedIds = transformFeedback((FeedbackTransformation<?>) transform);
		} else if (transform instanceof SideOutputTransformation<?>) {
			transformedIds = transformSideOutput((SideOutputTransformation<?>) transform);
		} else {
			throw new IllegalStateException("Unknown transformation: " + transform);
		}

		//since transformations are resolved by recursive calls, record the ones already
		//transformed so the recursion does not process them again
		if (!alreadyTransformed.containsKey(transform)) {
			alreadyTransformed.put(transform, transformedIds);
		}

		if (transform.getBufferTimeout() >= 0) {
			streamGraph.setBufferTimeout(transform.getId(), transform.getBufferTimeout());
		} else {
			streamGraph.setBufferTimeout(transform.getId(), defaultBufferTimeout);
		}

        //set the uid of each transformation, i.e. the operator id
		if (transform.getUid() != null) {
			streamGraph.setTransformationUID(transform.getId(), transform.getUid());
		}

        //set the user-provided hash for each node of the dataflow graph (the uid above is the per-operator uid)
		if (transform.getUserProvidedNodeHash() != null) {
			streamGraph.setTransformationUserHash(transform.getId(), transform.getUserProvidedNodeHash());
		}

		if (!streamGraph.getExecutionConfig().hasAutoGeneratedUIDsEnabled()) {
			if (transform instanceof PhysicalTransformation &&
					transform.getUserProvidedNodeHash() == null &&
					transform.getUid() == null) {
				throw new IllegalStateException("Auto generated UIDs have been disabled " +
					"but no UID or hash has been assigned to operator " + transform.getName());
			}
		}

		if (transform.getMinResources() != null && transform.getPreferredResources() != null) {
			streamGraph.setResources(transform.getId(), transform.getMinResources(), transform.getPreferredResources());
		}

		streamGraph.setManagedMemoryWeight(transform.getId(), transform.getManagedMemoryWeight());

		return transformedIds;
	}
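The uid branch near the end of transform() is what backs a common operational practice: pinning operator IDs so savepoints stay compatible across job changes. A minimal sketch (the uid and name strings are mine), assuming env and inputEventStream from the job above:

// with auto-generated UIDs disabled, transform() throws unless every physical
// operator carries an explicit uid or a user-provided node hash
env.getConfig().disableAutoGeneratedUIDs();

inputEventStream
	.filter(event -> true)          // stand-in operator
	.uid("geohash-passthrough")     // -> streamGraph.setTransformationUID(...)
	.name("GeoHash passthrough");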
Translating the StreamGraph into a JobGraph

Open the execute method; as its Javadoc says, this function triggers execution of the Flink program. The environment executes all parts of the program that have resulted in a sink operation (for example printing results or forwarding them to a message queue), so once the dataflow graph contains a sink operator, the whole dataflow starts executing.

/**
	 * Triggers the program execution. The environment will execute all parts of
	 * the program that have resulted in a "sink" operation. Sink operations are
	 * for example printing results or forwarding them to a message queue.
	 *
	 * @param streamGraph the stream graph representing the transformations
	 * @return The result of the job execution, containing elapsed time and accumulators.
	 * @throws Exception which occurs during job execution.
	 */
	@Internal
	public JobExecutionResult execute(StreamGraph streamGraph) throws Exception {
        //execute the dataflow graph asynchronously
		final JobClient jobClient = executeAsync(streamGraph);

		try {
			final JobExecutionResult jobExecutionResult;
			//in attached mode, block on the job result future (deserialized with the user classloader);
            //otherwise wrap the JobID of this submission in a DetachedJobExecutionResult
			if (configuration.getBoolean(DeploymentOptions.ATTACHED)) {
				jobExecutionResult = jobClient.getJobExecutionResult(userClassloader).get();
			} else {
				jobExecutionResult = new DetachedJobExecutionResult(jobClient.getJobID());
			}

			jobListeners.forEach(jobListener -> jobListener.onJobExecuted(jobExecutionResult, null));

			return jobExecutionResult;
		} catch (Throwable t) {
			// get() on the JobExecutionResult Future will throw an ExecutionException. This
			// behaviour was largely not there in Flink versions before the PipelineExecutor
			// refactoring so we should strip that exception.
			Throwable strippedException = ExceptionUtils.stripExecutionException(t);
			//notify the registered job listeners of the failure
			jobListeners.forEach(jobListener -> {
				jobListener.onJobExecuted(null, strippedException);
			});
			ExceptionUtils.rethrowException(strippedException);

			// never reached, only make javac happy
			return null;
		}
	}
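The jobListeners iterated above are registered through a public hook on the environment. A sketch of hooking in (the listener body is illustrative):

env.registerJobListener(new JobListener() {
	@Override
	public void onJobSubmitted(JobClient jobClient, Throwable throwable) {
		if (throwable == null) {
			System.out.println("submitted job " + jobClient.getJobID());
		}
	}

	@Override
	public void onJobExecuted(JobExecutionResult jobExecutionResult, Throwable throwable) {
		if (jobExecutionResult != null) {
			System.out.println("job finished in " + jobExecutionResult.getNetRuntime() + " ms");
		}
	}
});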

According to Flink's official workflow diagram, when the dataflow graph is executed, the Flink TaskManager should assign tasks to execute the individual operators. To verify this, let's set a breakpoint in the Task class. With the default parallelism of 1 for all operators, after the transformations have been traversed, execution starts with the operators that were added to transformedIds during StreamGraph generation, i.e. the thread stack Source: Custom Source (1/1) @5964 shown in the figure.

PS: the concepts of Future and Promise are covered in the book "Reactive Design Patterns".

/** Partition producer state checker to request partition states from. */
	private final PartitionProducerStateChecker partitionProducerStateChecker;

	/** Executor to run future callbacks. */
    //executor that runs the future callbacks
	private final Executor executor;

	/** Future that is completed once {@link #run()} exits. */
    //CompletableFuture is a JDK 8 asynchronous callback class. Its internals resemble fork-join (the difference being that a ForkJoinTask must be submitted to a ForkJoinPool, whereas a CompletableFuture is a promise usable with any Executor and does not depend on ForkJoinPool)
	private final CompletableFuture<ExecutionState> terminationFuture = new CompletableFuture<>();

	// ------------------------------------------------------------------------
	//  Fields that control the task execution. All these fields are volatile
	//  (which means that they introduce memory barriers), to establish
	//  proper happens-before semantics on parallel modification
	// ------------------------------------------------------------------------

	/** atomic flag that makes sure the invokable is canceled exactly once upon error. */
    //atomic flag ensuring the invokable is canceled exactly once upon error
	private final AtomicBoolean invokableHasBeenCanceled;

	/** The invokable of this task, if initialized. All accesses must copy the reference and
	 * check for null, as this field is cleared as part of the disposal logic. */
	@Nullable
	private volatile AbstractInvokable invokable;

	/** The current execution state of the task. */
    //the current execution state of this task
	private volatile ExecutionState executionState = ExecutionState.CREATED;

/**
	 * The core work method that bootstraps the task and executes its code.
	 */
	@Override
	public void run() {
		try {
			doRun();
		} finally {
        	//complete the future when the task exits, triggering any registered callbacks
			terminationFuture.complete(executionState);
		}
	}
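The terminationFuture pattern in run() is plain JDK 8 CompletableFuture usage. Here is a self-contained toy version of the same idea (all names are mine):

import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

public class TerminationFutureDemo {
	public static void main(String[] args) {
		CompletableFuture<String> termination = new CompletableFuture<>();
		ExecutorService worker = Executors.newSingleThreadExecutor();

		worker.execute(() -> {
			try {
				// ... the task's actual work would run here ...
			} finally {
				// mirror Task#run: always publish the final state, even on failure
				termination.complete("FINISHED");
			}
		});

		// callers chain callbacks instead of blocking on the result
		termination.thenAccept(state -> System.out.println("task reached state " + state));
		worker.shutdown();
	}
}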

Task scheduling

Next, step into the doRun method.

Flink uses the Akka framework for scheduling between nodes. From the book "Reactive Design Patterns" we know that after an actor launches multiple tasks, a Future is first returned as an asynchronous submission notification, and the returned computation results are then handled through promises. The work here breaks down into five stages:

1. During asynchronous execution, the TaskExecutor reports the task's submission state back to the TaskManager (the section of doRun marked "TaskExecutor reports the submission state to the TaskManager").

2. In the StreamGraph generation above, recall the step that registers the artifacts (jars) the job needs to load. Because a Flink cluster is distributed, each task must load the corresponding jars on the node where it executes.

3. Install the context classloader needed by the executing thread, configure the corresponding runtime environment, and reflectively instantiate the operator in preparation for initialization (if you need to invoke customized operators based on metadata of the upstream source, consider overriding the invoke and beforeInvoke methods under StreamTask).

4. After operator initialization, write data through the partition array consumableNotifyingPartitionWriters.

5. Perform fault-tolerant handling for tasks whose state in step 1 ended up failed or canceled.

Task.java

private void doRun() {
		// ----------------------------
		//  Initial State transition
		// ----------------------------
        ---------------------------------------     TaskExecutor reports the submission state to the TaskManager     -------------------------------------------------
		while (true) {
			ExecutionState current = this.executionState;
			if (current == ExecutionState.CREATED) {
				if (transitionState(ExecutionState.CREATED, ExecutionState.DEPLOYING)) {
					// success, we can start our work
					break;
				}
			}
			else if (current == ExecutionState.FAILED) {
				// we were immediately failed. tell the TaskManager that we reached our final state
				notifyFinalState();
				if (metrics != null) {
					metrics.close();
				}
				return;
			}
			else if (current == ExecutionState.CANCELING) {
				if (transitionState(ExecutionState.CANCELING, ExecutionState.CANCELED)) {
					// we were immediately canceled. tell the TaskManager that we reached our final state
					notifyFinalState();
					if (metrics != null) {
						metrics.close();
					}
					return;
				}
			}
			else {
				if (metrics != null) {
					metrics.close();
				}
				throw new IllegalStateException("Invalid state for beginning of operation of task " + this + '.');
			}
		}
  ---------------------------------------      TaskExecutor reports the submission state to the TaskManager     -------------------------------------------------
  
  
  -------------------------------------  load the required jars on the node where each task runs -------------------------------
		// all resource acquisitions and registrations from here on
		// need to be undone in the end
		Map<String, Future<Path>> distributedCacheEntries = new HashMap<>();
		AbstractInvokable invokable = null;

		try {
			// ----------------------------
			//  Task Bootstrap - We periodically
			//  check for canceling as a shortcut
			// ----------------------------

			// activate safety net for task thread
			LOG.info("Creating FileSystem stream leak safety net for task {}", this);
			FileSystemSafetyNet.initializeSafetyNetForThread();

			blobService.getPermanentBlobService().registerJob(jobId);

			// first of all, get a user-code classloader
			// this may involve downloading the job's JAR files and/or classes
			LOG.info("Loading JAR files for task {}.", this);

			userCodeClassLoader = createUserCodeClassloader();
			final ExecutionConfig executionConfig = serializedExecutionConfig.deserializeValue(userCodeClassLoader);

			if (executionConfig.getTaskCancellationInterval() >= 0) {
				// override task cancellation interval from Flink config if set in ExecutionConfig
				taskCancellationInterval = executionConfig.getTaskCancellationInterval();
			}

			if (executionConfig.getTaskCancellationTimeout() >= 0) {
				// override task cancellation timeout from Flink config if set in ExecutionConfig
				taskCancellationTimeout = executionConfig.getTaskCancellationTimeout();
			}

			if (isCanceledOrFailed()) {
				throw new CancelTaskException();
			}

			// ----------------------------------------------------------------
			// register the task with the network stack
			// this operation may fail if the system does not have enough
			// memory to run the necessary data exchanges
			// the registration must also strictly be undone
			// ----------------------------------------------------------------

			LOG.info("Registering task at network: {}.", this);

			setupPartitionsAndGates(consumableNotifyingPartitionWriters, inputGates);

			for (ResultPartitionWriter partitionWriter : consumableNotifyingPartitionWriters) {
				taskEventDispatcher.registerPartition(partitionWriter.getPartitionId());
			}

			// next, kick off the background copying of files for the distributed cache
			try {
				for (Map.Entry<String, DistributedCache.DistributedCacheEntry> entry :
						DistributedCache.readFileInfoFromConfig(jobConfiguration)) {
					LOG.info("Obtaining local cache file for '{}'.", entry.getKey());
					Future<Path> cp = fileCache.createTmpFile(entry.getKey(), entry.getValue(), jobId, executionId);
					distributedCacheEntries.put(entry.getKey(), cp);
				}
			}
			catch (Exception e) {
				throw new Exception(
					String.format("Exception while adding files to distributed cache of task %s (%s).", taskNameWithSubtask, executionId), e);
			}

			if (isCanceledOrFailed()) {
				throw new CancelTaskException();
			}
            
             -------------------------------------  load the required jars on the node where each task runs -------------------------------

			// ----------------------------------------------------------------
			//  call the user code initialization methods
			// ----------------------------------------------------------------
            
            
-------------------------------------            install the context classloader, set up the runtime environment, and reflectively load the operator for initialization          ------------------------------------
			TaskKvStateRegistry kvStateRegistry = kvStateService.createKvStateTaskRegistry(jobId, getJobVertexId());

			Environment env = new RuntimeEnvironment(
				jobId,
				vertexId,
				executionId,
				executionConfig,
				taskInfo,
				jobConfiguration,
				taskConfiguration,
				userCodeClassLoader,
				memoryManager,
				ioManager,
				broadcastVariableManager,
				taskStateManager,
				aggregateManager,
				accumulatorRegistry,
				kvStateRegistry,
				inputSplitProvider,
				distributedCacheEntries,
				consumableNotifyingPartitionWriters,
				inputGates,
				taskEventDispatcher,
				checkpointResponder,
				operatorCoordinatorEventGateway,
				taskManagerConfig,
				metrics,
				this);

			// Make sure the user code classloader is accessible thread-locally.
			// We are setting the correct context class loader before instantiating the invokable
			// so that it is available to the invokable during its entire lifetime.
			executingThread.setContextClassLoader(userCodeClassLoader);

			// now load and instantiate the task's invokable code
			invokable = loadAndInstantiateInvokable(userCodeClassLoader, nameOfInvokableClass, env);

			// ----------------------------------------------------------------
			//  actual task core work
			// ----------------------------------------------------------------

			// we must make strictly sure that the invokable is accessible to the cancel() call
			// by the time we switched to running.
			this.invokable = invokable;

			// switch to the RUNNING state, if that fails, we have been canceled/failed in the meantime
			if (!transitionState(ExecutionState.DEPLOYING, ExecutionState.RUNNING)) {
				throw new CancelTaskException();
			}

			//the running task notifies its TaskManager that it is now in the RUNNING state
			taskManagerActions.updateTaskExecutionState(new TaskExecutionState(jobId, executionId, ExecutionState.RUNNING));

			// make sure the user code classloader is accessible thread-locally
			executingThread.setContextClassLoader(userCodeClassLoader);

			//invoke the reflectively instantiated task; operator initialization happens inside invoke()
			invokable.invoke();

			// make sure, we enter the catch block if the task leaves the invoke() method due
			// to the fact that it has been canceled
			if (isCanceledOrFailed()) {
				throw new CancelTaskException();
			}
-------------------------------------            install the context classloader, set up the runtime environment, and reflectively load the operator for initialization          ------------------------------------
			// ----------------------------------------------------------------
			//  finalization of a successful execution
			// ----------------------------------------------------------------

			------------- write data into the partition array --------------
			for (ResultPartitionWriter partitionWriter : consumableNotifyingPartitionWriters) {
				if (partitionWriter != null) {
					partitionWriter.finish();
				}
			}
            ------------------ write data into the partition array -----------------

			// try to mark the task as finished
			// if that fails, the task was canceled/failed in the meantime
			if (!transitionState(ExecutionState.RUNNING, ExecutionState.FINISHED)) {
				throw new CancelTaskException();
			}
		}
        
		catch (Throwable t) {

			// unwrap wrapped exceptions to make stack traces more compact
			if (t instanceof WrappingRuntimeException) {
				t = ((WrappingRuntimeException) t).unwrap();
			}

			// ----------------------------------------------------------------
			// the execution failed. either the invokable code properly failed, or
			// an exception was thrown as a side effect of cancelling
			// ----------------------------------------------------------------

			t = ExceptionUtils.enrichTaskManagerOutOfMemoryError(t);

			try {
				// check if the exception is unrecoverable
				if (ExceptionUtils.isJvmFatalError(t) ||
						(t instanceof OutOfMemoryError && taskManagerConfig.shouldExitJvmOnOutOfMemoryError())) {

					// terminate the JVM immediately
					// don't attempt a clean shutdown, because we cannot expect the clean shutdown to complete
					try {
						LOG.error("Encountered fatal error {} - terminating the JVM", t.getClass().getName(), t);
					} finally {
						Runtime.getRuntime().halt(-1);
					}
				}

				--------------------------------------  fault-tolerant handling for tasks that failed or were canceled in step 1  -------------------------------------
				while (true) {
					ExecutionState current = this.executionState;

					if (current == ExecutionState.RUNNING || current == ExecutionState.DEPLOYING) {
						if (t instanceof CancelTaskException) {
							if (transitionState(current, ExecutionState.CANCELED)) {
								cancelInvokable(invokable);
								break;
							}
						}
						else {
							if (transitionState(current, ExecutionState.FAILED, t)) {
								// proper failure of the task. record the exception as the root cause
								failureCause = t;
								cancelInvokable(invokable);

								break;
							}
						}
					}
					else if (current == ExecutionState.CANCELING) {
						if (transitionState(current, ExecutionState.CANCELED)) {
							break;
						}
					}
					else if (current == ExecutionState.FAILED) {
						// in state failed already, no transition necessary any more
						break;
					}
					// unexpected state, go to failed
					else if (transitionState(current, ExecutionState.FAILED, t)) {
						LOG.error("Unexpected state in task {} ({}) during an exception: {}.", taskNameWithSubtask, executionId, current);
						break;
					}
					// else fall through the loop and
				}
			}
			catch (Throwable tt) {
				String message = String.format("FATAL - exception in exception handler of task %s (%s).", taskNameWithSubtask, executionId);
				LOG.error(message, tt);
				notifyFatalError(message, tt);
			}
		}
		finally {
			try {
				LOG.info("Freeing task resources for {} ({}).", taskNameWithSubtask, executionId);

				// clear the reference to the invokable. this helps guard against holding references
				// to the invokable and its structures in cases where this Task object is still referenced
				this.invokable = null;

				// free the network resources
				releaseResources();

				// free memory resources
				if (invokable != null) {
					memoryManager.releaseAll(invokable);
				}

				// remove all of the tasks library resources
				libraryCache.unregisterTask(jobId, executionId);
				fileCache.releaseJob(jobId, executionId);
				blobService.getPermanentBlobService().releaseJob(jobId);

				// close and de-activate safety net for task thread
				LOG.info("Ensuring all FileSystem streams are closed for task {}", this);
				FileSystemSafetyNet.closeSafetyNetAndGuardedResourcesForThread();

				notifyFinalState();
			}
			catch (Throwable t) {
				// an error in the resource cleanup is fatal
				String message = String.format("FATAL - exception in resource cleanup of task %s (%s).", taskNameWithSubtask, executionId);
				LOG.error(message, t);
				notifyFatalError(message, t);
			}

			// un-register the metrics at the end so that the task may already be
			// counted as finished when this happens
			// errors here will only be logged
			try {
				metrics.close();
			}
			catch (Throwable t) {
				LOG.error("Error during metrics de-registration of task {} ({}).", taskNameWithSubtask, executionId, t);
			}
		}
        --------------------------------------  fault-tolerant handling for tasks that failed or were canceled in step 1  -------------------------------------
	}
Step 1: The TaskExecutor reports the submission state to the TaskManager

private void notifyFinalState() {
		//check that the execution state is terminal
		checkState(executionState.isTerminal());
        //the TaskExecutor reports the final state to the TaskManager
		taskManagerActions.updateTaskExecutionState(new TaskExecutionState(jobId, executionId, executionState, failureCause));
	}

TaskExecutor.java

@Override
		public void updateTaskExecutionState(final TaskExecutionState taskExecutionState) {
			if (taskExecutionState.getExecutionState().isTerminal()) {
            	//if the task has reached a terminal state (e.g. it was interrupted mid-execution), unregister it and notify the final execution state
				runAsync(() -> unregisterTaskAndNotifyFinalState(jobMasterGateway, taskExecutionState.getID()));
			} else {
            	//otherwise forward the state update directly
				TaskExecutor.this.updateTaskExecutionState(jobMasterGateway, taskExecutionState);
			}
		}
        
        
        private void unregisterTaskAndNotifyFinalState(
			final JobMasterGateway jobMasterGateway,
			final ExecutionAttemptID executionAttemptID) {

		Task task = taskSlotTable.removeTask(executionAttemptID);
		if (task != null) {
			if (!task.getExecutionState().isTerminal()) {
				try {
					task.failExternally(new IllegalStateException("Task is being remove from TaskManager."));
				} catch (Exception e) {
					log.error("Could not properly fail task.", e);
				}
			}

			log.info("Un-registering task and sending final execution state {} to JobManager for task {} {}.",
				task.getExecutionState(), task.getTaskInfo().getTaskNameWithSubtasks(), task.getExecutionId());

			AccumulatorSnapshot accumulatorSnapshot = task.getAccumulatorRegistry().getSnapshot();

			updateTaskExecutionState(
					jobMasterGateway,
					new TaskExecutionState(
						task.getJobID(),
						task.getExecutionId(),
						task.getExecutionState(),
						task.getFailureCause(),
						accumulatorSnapshot,
						task.getMetricGroup().getIOMetricGroup().createSnapshot()));
		} else {
			log.error("Cannot find task with ID {} to unregister.", executionAttemptID);
		}
	}

updateTaskExecutionState is then invoked, and the TaskManager reports the job's execution information to the JobManager.

private void updateTaskExecutionState(
			final JobMasterGateway jobMasterGateway,
			final TaskExecutionState taskExecutionState) {
		final ExecutionAttemptID executionAttemptID = taskExecutionState.getID();
		//asynchronously report the task's execution state to the JobMaster
		CompletableFuture<Acknowledge> futureAcknowledge = jobMasterGateway.updateTaskExecutionState(taskExecutionState);

		futureAcknowledge.whenCompleteAsync(
			(ack, throwable) -> {
				if (throwable != null) {
                    //when the callback completes, fail the task if the report failed
					failTask(executionAttemptID, throwable);
				}
			},
            //run on the TaskExecutor's main scheduling thread, i.e. the thread on which state is reported to the JobMaster
			getMainThreadExecutor());
	}
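The whenCompleteAsync call above is again standard CompletableFuture usage. A toy equivalent fragment (the messages and the executor are mine; supplyAsync stands in for the RPC to the JobMaster, assuming java.util.concurrent imports):

CompletableFuture<String> futureAcknowledge =
		CompletableFuture.supplyAsync(() -> "ACK"); // stand-in for jobMasterGateway.updateTaskExecutionState(...)

futureAcknowledge.whenCompleteAsync(
	(ack, throwable) -> {
		if (throwable != null) {
			System.err.println("reporting failed: " + throwable); // analogue of failTask(...)
		} else {
			System.out.println("JobMaster answered: " + ack);
		}
	},
	Executors.newSingleThreadExecutor()); // analogue of getMainThreadExecutor()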

Asynchronously reporting the task's execution state to the JobMaster:

@Override
	public CompletableFuture<Acknowledge> updateTaskExecutionState(
			final TaskExecutionState taskExecutionState) {
		checkNotNull(taskExecutionState, "taskExecutionState");

		if (schedulerNG.updateTaskExecutionState(taskExecutionState)) {
			return CompletableFuture.completedFuture(Acknowledge.get());
		} else {
			return FutureUtils.completedExceptionally(
				new ExecutionGraphException("The execution attempt " +
					taskExecutionState.getID() + " was not found."));
		}
	}

getMainThreadExecutor() supplies the executor on which the TaskExecutor thread reports to the JobMaster, running Runnable bodies asynchronously and on a schedule:

/**
	 * Returns a main thread executor which is bound to the currently valid fencing token.
	 * This means that runnables which are executed with this executor fail after the fencing
	 * token has changed. This allows to scope operations by the fencing token.
	 *
	 * @return MainThreadExecutor bound to the current fencing token
	 */
	@Override
	protected MainThreadExecutor getMainThreadExecutor() {
		return fencedMainThreadExecutor;
	}
    
    
    /**
	 * Executor which executes runnables in the main thread context.
	 */
	protected static class MainThreadExecutor implements ComponentMainThreadExecutor {

		private final MainThreadExecutable gateway;
		private final Runnable mainThreadCheck;

		MainThreadExecutor(MainThreadExecutable gateway, Runnable mainThreadCheck) {
			this.gateway = Preconditions.checkNotNull(gateway);
			this.mainThreadCheck = Preconditions.checkNotNull(mainThreadCheck);
		}

		public void runAsync(Runnable runnable) {
			gateway.runAsync(runnable);
		}
		//run the Runnable body asynchronously on the main thread, after a delay
		public void scheduleRunAsync(Runnable runnable, long delayMillis) {
			gateway.scheduleRunAsync(runnable, delayMillis);
		}

		public void execute(@Nonnull Runnable command) {
			runAsync(command);
		}
		//schedule a one-shot task on the main thread
		@Override
		public ScheduledFuture<?> schedule(Runnable command, long delay, TimeUnit unit) {
			final long delayMillis = TimeUnit.MILLISECONDS.convert(delay, unit);
			FutureTask<Void> ft = new FutureTask<>(command, null);
			scheduleRunAsync(ft, delayMillis);
			return new ScheduledFutureAdapter<>(ft, delayMillis, TimeUnit.MILLISECONDS);
		}

		@Override
		public <V> ScheduledFuture<V> schedule(Callable<V> callable, long delay, TimeUnit unit) {
			throw new UnsupportedOperationException("Not implemented because the method is currently not required.");
		}
		//would schedule a periodic task at a fixed rate (not implemented here)
		@Override
		public ScheduledFuture<?> scheduleAtFixedRate(Runnable command, long initialDelay, long period, TimeUnit unit) {
			throw new UnsupportedOperationException("Not implemented because the method is currently not required.");
		}
		//would schedule a periodic task with a fixed delay (not implemented here)
		@Override
		public ScheduledFuture<?> scheduleWithFixedDelay(Runnable command, long initialDelay, long delay, TimeUnit unit) {
			throw new UnsupportedOperationException("Not implemented because the method is currently not required.");
		}

		@Override
		public void assertRunningInMainThread() {
			mainThreadCheck.run();
		}
	}

Step 2: Load the required jars on the node where each task runs

Task.java

private void doRun() {
	// ...
	userCodeClassLoader = createUserCodeClassloader();
	// ...
}


private ClassLoader createUserCodeClassloader() throws Exception {
		long startDownloadTime = System.currentTimeMillis();
		//trigger the download of the jars this task needs from the JobManager's blob cache
		// triggers the download of all missing jar files from the job manager
		libraryCache.registerTask(jobId, executionId, requiredJarFiles, requiredClasspaths);

		LOG.debug("Getting user code class loader for task {} at library cache manager took {} milliseconds",
				executionId, System.currentTimeMillis() - startDownloadTime);
		//obtain the classloader for this job's jars by jobId (i.e. the userArtifacts set on the StreamGraph)
		ClassLoader userCodeClassLoader = libraryCache.getClassLoader(jobId);
		if (userCodeClassLoader == null) {
			throw new Exception("No user code classloader available.");
		}
		return userCodeClassLoader;
	}
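User files and jars travel a similar per-node path through the distributed cache that doRun() copies in the background. A hedged sketch of the public API (the path and name are mine):

// registered on the client side; each task later finds a local copy on its node
env.registerCachedFile("hdfs:///libs/geohash-zones.dat", "zone-data");

// read back inside any RichFunction once the task is running:
// File zones = getRuntimeContext().getDistributedCache().getFile("zone-data");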
    
Step 3: Initialize the operator, defining each operator task's life cycle in an aspect-like way

Install the context classloader needed by the executing thread, configure the corresponding runtime environment, and reflectively instantiate the operator in preparation for initialization (if you need to invoke customized operators based on metadata of the upstream source, consider overriding the invoke and beforeInvoke methods under StreamTask).

Judging from the class diagram below and the call cycle of the invoke method, the this pointer refers to the Source: Custom Source (1/1) task, and events for the different operator tasks are handled once runMailboxLoop() is called. This means we can override the invoke life-cycle hooks (init, cancelTask, cleanUpInvoke) in the corresponding task classes (BatchTask, SourceStreamTask, OneInputStreamTask) to control how configuration is parsed before and after the operator is reflectively initialized, somewhat like Spring bean life-cycle events.

StreamTask.java

@Override
	public final void invoke() throws Exception {
		try {
			beforeInvoke();

			// final check to exit early before starting to run
			if (canceled) {
				throw new CancelTaskException();
			}

			// let the task do its work
			runMailboxLoop();

			// if this left the run() method cleanly despite the fact that this was canceled,
			// make sure the "clean shutdown" is not attempted
			if (canceled) {
				throw new CancelTaskException();
			}

			afterInvoke();
		}
		finally {
			cleanUpInvoke();
		}
	}
    
    
 // ------------------------------------------------------------------------
	//  Life cycle methods for specific implementations
	// ------------------------------------------------------------------------

	protected abstract void init() throws Exception;

	protected void cancelTask() throws Exception {
	}

	protected void cleanup() throws Exception {
		if (inputProcessor != null) {
			inputProcessor.close();
		}
	}
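A hedged sketch of the customization route suggested above: subclass a concrete task and override the init() hook. The class name and body are mine, not part of Flink; note that Flink instantiates whatever class is recorded as nameOfInvokableClass in loadAndInstantiateInvokable, so actually wiring this in means controlling that class name.

import org.apache.flink.runtime.execution.Environment;
import org.apache.flink.streaming.runtime.tasks.OneInputStreamTask;

public class MetaAwareOneInputTask<IN, OUT> extends OneInputStreamTask<IN, OUT> {

	public MetaAwareOneInputTask(Environment env) throws Exception {
		super(env);
	}

	@Override
	protected void init() throws Exception {
		// inspect upstream/source metadata from the task configuration before
		// the input processor and the operator chain are wired up
		getEnvironment().getTaskConfiguration();
		super.init();
	}
}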
Step 4: Writing data into the partition array

Task#run

private final ResultPartitionWriter[] consumableNotifyingPartitionWriters;

//set up each task's input and output (inputGates are the operator's inputs, result partitions its outputs)
setupPartitionsAndGates(consumableNotifyingPartitionWriters, inputGates);

for (ResultPartitionWriter partitionWriter : consumableNotifyingPartitionWriters) {
	taskEventDispatcher.registerPartition(partitionWriter.getPartitionId());
}
@VisibleForTesting
	public static void setupPartitionsAndGates(
		ResultPartitionWriter[] producedPartitions, InputGate[] inputGates) throws IOException, InterruptedException {
		
        //set up each produced ResultPartition (the operator's output), registering its buffer pool
		for (ResultPartitionWriter partition : producedPartitions) {
			partition.setup();
		}
		 //then set up the input gates (single or union) through which this operator consumes the upstream partitions
		// InputGates must be initialized after the partitions, since during InputGate#setup
		// we are requesting partitions
		for (InputGate gate : inputGates) {
			gate.setup();
		}
	}
/**
	 * Registers a buffer pool with this result partition.
	 *
	 * <p>There is one pool for each result partition, which is shared by all its sub partitions.
	 *
	 * <p>The pool is registered with the partition *after* it as been constructed in order to conform
	 * to the life-cycle of task registrations in the {@link TaskExecutor}.
	 */
	@Override
	public void setup() throws IOException {
		checkState(this.bufferPool == null, "Bug in result partition setup logic: Already registered buffer pool.");
		//create the buffer pool for this result partition via the factory (it must not already be registered)
		BufferPool bufferPool = checkNotNull(bufferPoolFactory.apply(this));
		checkArgument(bufferPool.getNumberOfRequiredMemorySegments() >= getNumberOfSubpartitions(),
			"Bug in result partition setup logic: Buffer pool has not enough guaranteed buffers for this result partition.");

		this.bufferPool = bufferPool;
        //register this partition with the local partition manager
		partitionManager.registerResultPartition(this);
	}

SingleInputGate.java


/**
	 * The index of the consumed subpartition of each consumed partition. This index depends on the
	 * {@link DistributionPattern} and the subtask indices of the producing and consuming task.
	 */
	private final int consumedSubpartitionIndex;

@Override
	public void setup() throws IOException, InterruptedException {
		checkState(this.bufferPool == null, "Bug in input gate setup logic: Already registered buffer pool.");
		// assign exclusive buffers to input channels directly and use the rest for floating buffers
		assignExclusiveSegments();
		
		BufferPool bufferPool = bufferPoolFactory.get();
        //assign the buffer pool to this input gate
		setBufferPool(bufferPool);
		//request the upstream operator's output subpartitions for this gate
		requestPartitions();
	}
    
    
@VisibleForTesting
	void requestPartitions() throws IOException, InterruptedException {
    	//synchronize on the shared request lock (a global Object reference) while requesting partitions
		synchronized (requestLock) {
			if (!requestedPartitionsFlag) {
				if (closeFuture.isDone()) {
					throw new IllegalStateException("Already released.");
				}

				// Sanity checks
                //the configured number of input channels must match the channels actually set up
				if (numberOfInputChannels != inputChannels.size()) {
					throw new IllegalStateException(String.format(
						"Bug in input gate setup logic: mismatch between " +
						"number of total input channels [%s] and the currently set number of input " +
						"channels [%s].",
						inputChannels.size(),
						numberOfInputChannels));
				}
				
                //with parallelism > 1, several consuming tasks read the same result partition, so each input channel requests only the subpartition (consumedSubpartitionIndex) it is responsible for
				for (InputChannel inputChannel : inputChannels.values()) {
					inputChannel.requestSubpartition(consumedSubpartitionIndex);
				}
			}

			requestedPartitionsFlag = true;
		}
	}
Step 5: The job fault-tolerance flow

The official test cases TaskTest#testFailExternallyRightAway and TaskTest#testLibraryCacheRegistrationFailed only return the expected execution result and give no insight into how the task actually runs, so I set a breakpoint in Task#run: if the TaskManager does not report to the JobManager for a long time, the task obviously fails, triggering the Task's fault-tolerance mechanism.

Here the JobManager's main worker thread, flink-akka.actor.default-dispatcher, keeps performing CAS operations and retrying the failed task. From the task information below we can see that a task that runs too long and times out is marked as failed. The fault-tolerance mechanism is as follows:

1. Tasks are continually resubmitted and retried on top of CompletableFuture. If a restart strategy is configured (see the sketch below), the whole Flink program is restarted once the retries fail.
2. By default, recovery falls back to the per-job MiniCluster JobClient (created via PerJobMiniClusterFactory, shown below) with a minimal cluster configuration.
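The restart strategy referenced in point 1 is configured on the environment. A minimal sketch (the attempt count and delay are illustrative), using org.apache.flink.api.common.restartstrategy.RestartStrategies and org.apache.flink.api.common.time.Time:

env.setRestartStrategy(RestartStrategies.fixedDelayRestart(
		3,                                // at most 3 restart attempts
		Time.of(10, TimeUnit.SECONDS)));  // wait 10 s between attempts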

Besides the restart strategy, execution can also fail when the configured cluster resources exceed what the machine running the program actually has, so the program cannot even start. If we do not want to restart everything whenever a task fails, can we swap out the JobClient used for each execution? Tracing the stack backwards shows this can be done through the initialization configuration of the StreamExecutionEnvironment; the relevant call stack is as follows:

StreamExecutionEnvironment#execute

@Override
	public JobExecutionResult execute(StreamGraph streamGraph) throws Exception {
		JobClient jobClient = executeAsync(streamGraph);

		JobExecutionResult jobExecutionResult;
		if (getConfiguration().getBoolean(DeploymentOptions.ATTACHED)) {
			CompletableFuture<JobExecutionResult> jobExecutionResultFuture =
					jobClient.getJobExecutionResult(getUserClassloader());

			if (getConfiguration().getBoolean(DeploymentOptions.SHUTDOWN_IF_ATTACHED)) {
				Thread shutdownHook = ShutdownHookUtil.addShutdownHook(
					.........................
					..............................
			}

			..............................
		} else {
			jobExecutionResult = new DetachedJobExecutionResult(jobClient.getJobID());
		}

		return jobExecutionResult;
	}

This calls the StreamExecutionEnvironment#executeAsync method:

@Internal
	public JobClient executeAsync(StreamGraph streamGraph) throws Exception {
		checkNotNull(streamGraph, "StreamGraph cannot be null.");
		checkNotNull(configuration.get(DeploymentOptions.TARGET), "No execution.target specified in your configuration file.");

		final PipelineExecutorFactory executorFactory =
			executorServiceLoader.getExecutorFactory(configuration);

		checkNotNull(
			executorFactory,
			"Cannot find compatible factory for specified execution.target (=%s)",
			configuration.get(DeploymentOptions.TARGET));
	// note this line: it obtains the Executor assigned for this configuration
	>>	CompletableFuture<JobClient> jobClientFuture = executorFactory
			.getExecutor(configuration)
			.execute(streamGraph, configuration);

		try {
			JobClient jobClient = jobClientFuture.get();
			jobListeners.forEach(jobListener -> jobListener.onJobSubmitted(jobClient, null));
			return jobClient;
		} catch (Throwable t) {
			jobListeners.forEach(jobListener -> jobListener.onJobSubmitted(null, t));
			ExceptionUtils.rethrow(t);

			// make javac happy, this code path will not be reached
			return null;
		}
	}

LocalExecutor#execute

@Override
	public CompletableFuture<JobClient> execute(Pipeline pipeline, Configuration configuration) throws Exception {
		checkNotNull(pipeline);
		checkNotNull(configuration);

		Configuration effectiveConfig = new Configuration();
		effectiveConfig.addAll(this.configuration);
		effectiveConfig.addAll(configuration);

		// we only support attached execution with the local executor.
		checkState(configuration.getBoolean(DeploymentOptions.ATTACHED));

		final JobGraph jobGraph = getJobGraph(pipeline, effectiveConfig);

		return PerJobMiniClusterFactory.createWithFactory(effectiveConfig, miniClusterFactory).submitJob(jobGraph);
	}

In StreamExecutionEnvironment#executeAsync above, the configuration is consulted to select the matching Executor type. Suppose we write a new LocalExecutor that supports our own custom JobClient; presumably it can be selected by setting certain parameters in the configuration.

Restarting the program confirms this: when creating the StreamExecutionEnvironment, it suffices to specify the custom executor name. Additionally, to support creation through a factory, the factory class's fully qualified name must be declared via SPI, as in the sketch below:
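A hedged sketch of that factory route (everything prefixed My is mine, not shipped with Flink): implement PipelineExecutorFactory, list its fully qualified name in the SPI services file, and select it via execution.target.

import org.apache.flink.configuration.Configuration;
import org.apache.flink.configuration.DeploymentOptions;
import org.apache.flink.core.execution.PipelineExecutor;
import org.apache.flink.core.execution.PipelineExecutorFactory;

public class MyExecutorFactory implements PipelineExecutorFactory {

	@Override
	public String getName() {
		return "my-local";
	}

	@Override
	public boolean isCompatibleWith(Configuration configuration) {
		// executeAsync() asks each factory on the classpath whether it matches execution.target
		return "my-local".equals(configuration.get(DeploymentOptions.TARGET));
	}

	@Override
	public PipelineExecutor getExecutor(Configuration configuration) {
		return new MyExecutor(); // a hypothetical executor that returns your custom JobClient
	}
}

// Register it by putting the line
//   com.example.MyExecutorFactory
// into META-INF/services/org.apache.flink.core.execution.PipelineExecutorFactory,
// then select it with configuration.set(DeploymentOptions.TARGET, "my-local").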
