Flink Checkpoint恢复

334 阅读3分钟

1.1 org.apache.flink.streaming.runtime.tasks.StreamTask

恢复流程的入口是 org.apache.flink.streaming.runtime.tasks.StreamTask,它的特点如下:

  1. 所有流任务(task)的基类。
  2. task 是 Task Manager 部署和执行的基本单元。
  3. 每个 task 执行一个或者多个 StreamOperator (对应 chain)。
  4. chain 在一起的 operator 在同一个线程内同步执行,因此也处于同一个流分区上。

被调用的入口是: StreamTask#invoke 方法。

但是在执行具体逻辑之前会先执行 StreamTask#beforeInvoke 进行算子状态的初始化和 open 操作:

/**
 * Excerpt of {@code StreamTask} showing the parts that take part in state recovery:
 * {@link #invoke()} first calls {@link #beforeInvoke()}, which initializes the state of the
 * operators in the chain and opens them, and only then runs the mailbox loop.
 */
@Internal
public abstract class StreamTask<OUT, OP extends StreamOperator<OUT>> extends AbstractInvokable
        implements AsyncExceptionHandler {
    /**
     * Creates the state initializer that is handed to the operator chain in
     * {@link #beforeInvoke()}.
     *
     * <p>The timer service provider is read from the task configuration; when that lookup
     * returns {@code null}, {@code InternalTimeServiceManagerImpl::create} is used instead.
     *
     * @return a new {@code StreamTaskStateInitializerImpl} built from the environment, the
     *     state backend, the default TTL time provider and the chosen timer service provider
     */
    public StreamTaskStateInitializer createStreamTaskStateInitializer() {
        InternalTimeServiceManager.Provider timerServiceProvider =
                configuration.getTimerServiceProvider(getUserCodeClassLoader());
        return new StreamTaskStateInitializerImpl(
                getEnvironment(),
                stateBackend,
                TtlTimeProvider.DEFAULT,
                timerServiceProvider != null
                        ? timerServiceProvider
                        : InternalTimeServiceManagerImpl::create);
    }
    /**
     * Prepares the task before the mailbox loop starts: builds the operator chain, runs
     * {@code init()}, restores channel state, initializes operator state and opens the
     * operators.
     *
     * @throws CancelTaskException if the task was canceled by the time {@code init()} returned
     * @throws Exception if any of the initialization steps fails
     */
    protected void beforeInvoke() throws Exception {
        disposedOperators = false;
        LOG.debug("Initializing {}.", getName());
        operatorChain = new OperatorChain<>(this, recordWriter);
        mainOperator = operatorChain.getMainOperator();
        init();
        // Stop early if cancellation was requested while initializing.
        if (canceled) {
            throw new CancelTaskException();
        }
        LOG.debug("Invoking {}", getName());
        actionExecutor.runThrowing(
                () -> {
                    SequentialChannelStateReader reader =
                            getEnvironment()
                                    .getTaskStateManager()
                                    .getSequentialChannelStateReader();
                    // Output channel state is read first, before the operators are opened.
                    reader.readOutputData(getEnvironment().getAllWriters(), false);

                    // Entry point for initializing the state of the StreamOperators.
                    operatorChain.initializeStateAndOpenOperators(
                            createStreamTaskStateInitializer());

                    // Input channel state is read on the channel IO executor; a failure is
                    // reported through the async exception handler instead of being thrown here.
                    channelIOExecutor.execute(
                            () -> {
                                try {
                                    reader.readInputData(getEnvironment().getAllInputGates());
                                } catch (Exception e) {
                                    asyncExceptionHandler.handleAsyncException(
                                            "Unable to read channel state", e);
                                }
                            });

                    // Once a gate's state-consumed future completes, schedule its partition
                    // request through the main mailbox executor.
                    for (InputGate inputGate : getEnvironment().getAllInputGates()) {
                        inputGate
                                .getStateConsumedFuture()
                                .thenRun(
                                        () ->
                                                mainMailboxExecutor.execute(
                                                        inputGate::requestPartitions,
                                                        "Input gate request partitions"));
                    }
                });

        isRunning = true;
    }

    /**
     * Runs the task: {@link #beforeInvoke()}, the mailbox loop, then {@code afterInvoke()},
     * followed by {@code cleanUpInvoke()} on both the success and the failure path.
     *
     * @throws Exception the failure from the invoke phase, or the result of combining it with
     *     a failure from the clean-up phase
     */
    @Override
    public final void invoke() throws Exception {
        try {
            // Pre-invoke step that performs the state recovery.
            beforeInvoke();

            // final check to exit early before starting to run
            if (canceled) {
                throw new CancelTaskException();
            }

            // let the task do its work
            runMailboxLoop();

            // if this left the run() method cleanly despite the fact that this was canceled,
            // make sure the "clean shutdown" is not attempted
            if (canceled) {
                throw new CancelTaskException();
            }

            afterInvoke();
        } catch (Throwable invokeException) {
            // Flag the task as failing unless the throwable arrived while it was canceled.
            failing = !canceled;
            try {
                cleanUpInvoke();
            }
            // TODO: investigate why Throwable instead of Exception is used here.
            catch (Throwable cleanUpException) {
                // Clean-up failed as well: combine both throwables and rethrow the result.
                Throwable throwable =
                        ExceptionUtils.firstOrSuppressed(cleanUpException, invokeException);
                ExceptionUtils.rethrowException(throwable);
            }
            ExceptionUtils.rethrowException(invokeException);
        }
        // Success path: clean up after a normal run.
        cleanUpInvoke();
    }
}
/**
 * Initializes the state of each operator of the chain and then opens it, one operator at a
 * time.
 *
 * @param streamTaskStateInitializer initializer handed to every operator's
 *     {@code initializeState} call
 * @throws Exception if initializing or opening any operator fails
 */
protected void initializeStateAndOpenOperators(
  StreamTaskStateInitializer streamTaskStateInitializer) throws Exception {
  // Walk over the operators in reverse topological order.
  for (StreamOperatorWrapper<?, ?> wrapper : getAllOperators(true)) {
    final StreamOperator<?> current = wrapper.getStreamOperator();
    current.initializeState(streamTaskStateInitializer);
    current.open();
  }
}

1.2 org.apache.flink.streaming.api.operators.AbstractStreamOperator

AbstractStreamOperator 是所有 Stream Operator 的基类,执行状态初始化的方法为 AbstractStreamOperator#initializeState:

/**
 * Initializes the state of this operator: obtains the operator's state context from the
 * given initializer, wraps it in a {@code StreamOperatorStateHandler}, and lets the handler
 * initialize the operator state.
 *
 * @param streamTaskStateManager initializer used to create the operator's state context
 * @throws Exception if the state context cannot be created or state initialization fails
 */
public final void initializeState(StreamTaskStateInitializer streamTaskStateManager)
  throws Exception {

  final TypeSerializer<?> stateKeySerializer =
    config.getStateKeySerializer(getUserCodeClassloader());

  final StreamTask<?, ?> owningTask = Preconditions.checkNotNull(getContainingTask());
  final CloseableRegistry cancelables =
    Preconditions.checkNotNull(owningTask.getCancelables());

  // The state context bundles what the operator needs to reach its state: the backends,
  // the raw state inputs and the timer service manager.
  // Its concrete type is
  // org.apache.flink.streaming.api.operators.StreamTaskStateInitializerImpl.StreamOperatorStateContextImpl
  final StreamOperatorStateContext stateContext =
    streamTaskStateManager.streamOperatorStateContext(
    getOperatorID(),
    getClass().getSimpleName(),
    getProcessingTimeService(),
    this,
    stateKeySerializer,
    cancelables,
    metrics,
    config.getManagedMemoryFractionOperatorUseCaseOfSlot(
      ManagedMemoryUseCase.STATE_BACKEND,
      runtimeContext.getTaskManagerRuntimeInfo().getConfiguration(),
      runtimeContext.getUserCodeClassLoader()),
    isUsingCustomRawKeyedState());

  stateHandler =
    new StreamOperatorStateHandler(stateContext, getExecutionConfig(), cancelables);
  timeServiceManager = stateContext.internalTimerServiceManager();

  // This call is where the operator state and keyed state of the operator get restored:
  // StreamOperatorStateHandler#initializeOperatorState
  //   -> CheckpointedStreamOperator#initializeState
  //      (AbstractStreamOperator#initializeState(StateInitializationContext))
  stateHandler.initializeOperatorState(this);
  runtimeContext.setKeyedStateStore(stateHandler.getKeyedStateStore().orElse(null));
}

在这里调用了 StreamTaskStateInitializerImpl#streamOperatorStateContext 创建了 StreamOperatorStateContext 对象。

这个对象包含了 Stream Operator 用来连接状态所需的上下文,包括:backends,raw state 和 time service manager 等。

/**
 * Builds the {@code StreamOperatorStateContext} for one operator subtask.
 *
 * <p>Creates, in this order, the keyed state backend, the operator state backend, the raw
 * keyed and raw operator state inputs, and (only when a keyed state backend exists) the
 * internal timer service manager. If any step throws, whatever was created so far is
 * unregistered from the closeable registry, closed and disposed before the failure is
 * rethrown.
 *
 * @param operatorID id used to look up the prioritized operator state and to build the
 *     operator description text
 * @param operatorClassName class name used in the operator description text
 * @param processingTimeService passed to the timer service manager provider
 * @param keyContext passed to the timer service manager provider
 * @param keySerializer passed to the keyed state backend creation; may be {@code null}
 * @param streamTaskCloseableRegistry registry the raw state inputs are registered with, and
 *     that the created resources are unregistered from on failure
 * @param metricGroup passed to the keyed state backend creation
 * @param managedMemoryFraction passed to the keyed state backend creation
 * @param isUsingCustomRawKeyedState when {@code true}, timers are not restored from the raw
 *     keyed state inputs
 * @return the state context holding the backends, the timer service manager and the raw
 *     state inputs
 * @throws Exception wrapping any exception raised while the context is being created
 */
public StreamOperatorStateContext streamOperatorStateContext(
  @Nonnull OperatorID operatorID,
  @Nonnull String operatorClassName,
  @Nonnull ProcessingTimeService processingTimeService,
  @Nonnull KeyContext keyContext,
  @Nullable TypeSerializer<?> keySerializer,
  @Nonnull CloseableRegistry streamTaskCloseableRegistry,
  @Nonnull MetricGroup metricGroup,
  double managedMemoryFraction,
  boolean isUsingCustomRawKeyedState)
  throws Exception {

  TaskInfo taskInfo = environment.getTaskInfo();
  OperatorSubtaskDescriptionText operatorSubtaskDescription =
    new OperatorSubtaskDescriptionText(
    operatorID,
    operatorClassName,
    taskInfo.getIndexOfThisSubtask(),
    taskInfo.getNumberOfParallelSubtasks());

  final String operatorIdentifierText = operatorSubtaskDescription.toString();
  // Wrapper around several OperatorSubtaskState instances that can (partially) substitute
  // for each other; it establishes the priority order in which the alternatives for each
  // kind of state are tried during restore.
  final PrioritizedOperatorSubtaskState prioritizedOperatorSubtaskStates =
    taskStateManager.prioritizedOperatorState(operatorID);

  CheckpointableKeyedStateBackend<?> keyedStatedBackend = null;
  OperatorStateBackend operatorStateBackend = null;
  CloseableIterable<KeyGroupStatePartitionStreamProvider> rawKeyedStateInputs = null;
  CloseableIterable<StatePartitionStreamProvider> rawOperatorStateInputs = null;
  InternalTimeServiceManager<?> timeServiceManager;

  try {

    // -------------- Keyed State Backend --------------
    keyedStatedBackend =
      keyedStatedBackend(
      keySerializer,
      operatorIdentifierText,
      prioritizedOperatorSubtaskStates,
      streamTaskCloseableRegistry,
      metricGroup,
      managedMemoryFraction);

    // -------------- Operator State Backend --------------
    operatorStateBackend =
      operatorStateBackend(
      operatorIdentifierText,
      prioritizedOperatorSubtaskStates,
      streamTaskCloseableRegistry);

    // -------------- Raw State Streams --------------
    rawKeyedStateInputs =
      rawKeyedStateInputs(
      prioritizedOperatorSubtaskStates
      .getPrioritizedRawKeyedState()
      .iterator());
    streamTaskCloseableRegistry.registerCloseable(rawKeyedStateInputs);

    rawOperatorStateInputs =
      rawOperatorStateInputs(
      prioritizedOperatorSubtaskStates
      .getPrioritizedRawOperatorState()
      .iterator());
    streamTaskCloseableRegistry.registerCloseable(rawOperatorStateInputs);

    // -------------- Internal Timer Service Manager --------------
    // A timer service manager is only created when there is a keyed state backend.
    if (keyedStatedBackend != null) {

      // if the operator indicates that it is using custom raw keyed state,
      // then whatever was written in the raw keyed state snapshot was NOT written
      // by the internal timer services (because there is only ever one user of raw keyed
      // state);
      // in this case, timers should not attempt to restore timers from the raw keyed
      // state.
      final Iterable<KeyGroupStatePartitionStreamProvider> restoredRawKeyedStateTimers =
        (prioritizedOperatorSubtaskStates.isRestored()
         && !isUsingCustomRawKeyedState)
        ? rawKeyedStateInputs
        : Collections.emptyList();

      timeServiceManager =
        timeServiceManagerProvider.create(
        keyedStatedBackend,
        environment.getUserCodeClassLoader().asClassLoader(),
        keyContext,
        processingTimeService,
        restoredRawKeyedStateTimers);
    } else {
      timeServiceManager = null;
    }

    // -------------- Preparing return value --------------

    return new StreamOperatorStateContextImpl(
      prioritizedOperatorSubtaskStates.isRestored(),
      operatorStateBackend,
      keyedStatedBackend,
      timeServiceManager,
      rawOperatorStateInputs,
      rawKeyedStateInputs);
  } catch (Exception ex) {

    // cleanup if something went wrong before results got published.
    if (keyedStatedBackend != null) {
      // Close only if the backend was still registered with the registry.
      if (streamTaskCloseableRegistry.unregisterCloseable(keyedStatedBackend)) {
        IOUtils.closeQuietly(keyedStatedBackend);
      }
      // release resource (e.g native resource)
      keyedStatedBackend.dispose();
    }

    if (operatorStateBackend != null) {
      if (streamTaskCloseableRegistry.unregisterCloseable(operatorStateBackend)) {
        IOUtils.closeQuietly(operatorStateBackend);
      }
      operatorStateBackend.dispose();
    }

    // NOTE(review): the raw inputs can still be null here if the failure happened before
    // they were created; presumably unregisterCloseable tolerates null - confirm.
    if (streamTaskCloseableRegistry.unregisterCloseable(rawKeyedStateInputs)) {
      IOUtils.closeQuietly(rawKeyedStateInputs);
    }

    if (streamTaskCloseableRegistry.unregisterCloseable(rawOperatorStateInputs)) {
      IOUtils.closeQuietly(rawOperatorStateInputs);
    }

    // Rethrow with the original exception kept as the cause.
    throw new Exception("Exception while creating StreamOperatorStateContext.", ex);
  }
}

至此,我们已经追溯清楚:Stream Task 中的每一个 Stream Operator 在执行当前 subTask 时,是如何确定自己要访问的状态的。

Stream Operator 在访问状态的时候,是通过自身的 OperatorID(即 taskStateManager.prioritizedOperatorState(operatorID))找到对应状态的。