Spark源码阅读篇-rpc通信-ExecutorRunner

上一节我们介绍到worker在启动executor的时候创建了ExecutorRunner，接下来我们来看一下ExecutorRunner是怎么启动进程执行具体的任务的。

ExecutorRunner源码

package org.apache.spark.deploy.worker

import java.io._
import java.nio.charset.StandardCharsets

import scala.collection.JavaConverters._

import com.google.common.io.Files

import org.apache.spark.{SecurityManager, SparkConf}
import org.apache.spark.deploy.{ApplicationDescription, ExecutorState}
import org.apache.spark.deploy.DeployMessages.ExecutorStateChanged
import org.apache.spark.deploy.StandaloneResourceUtils.prepareResourcesFile
import org.apache.spark.internal.Logging
import org.apache.spark.internal.config.SPARK_EXECUTOR_PREFIX
import org.apache.spark.internal.config.UI._
import org.apache.spark.resource.ResourceInformation
import org.apache.spark.rpc.RpcEndpointRef
import org.apache.spark.util.{ShutdownHookManager, Utils}
import org.apache.spark.util.logging.FileAppender

/**
 * Manages the execution of one executor process.
 * This is currently only used in standalone mode.
 *
 * The executor is launched as a child process from a dedicated runner thread, and each state
 * change made here is reported to the worker endpoint as an `ExecutorStateChanged` message.
 *
 * @param appId application id; part of the full executor id, the log URLs and `{{APP_ID}}`
 * @param execId executor id; substituted for `{{EXECUTOR_ID}}`
 * @param appDesc application description supplying the command (arguments, Java options)
 * @param cores substituted for `{{CORES}}` in command arguments
 * @param memory passed through to `CommandUtils.buildProcessBuilder`
 * @param worker endpoint that receives the `ExecutorStateChanged` messages
 * @param workerId used to build the log URL when the UI reverse proxy is enabled
 * @param webUiScheme scheme prefix of the direct (non-proxied) log URL
 * @param host substituted for `{{HOSTNAME}}`
 * @param webUiPort port of the direct (non-proxied) log URL
 * @param publicAddress address of the direct (non-proxied) log URL
 * @param sparkHome its absolute path is passed to `CommandUtils.buildProcessBuilder`
 * @param executorDir working directory of the process; also holds its stdout/stderr files
 * @param workerUrl substituted for `{{WORKER_URL}}`
 * @param conf Spark configuration used for redaction, UI settings and the file appenders
 * @param appLocalDirs exported to the process as `SPARK_EXECUTOR_DIRS`
 * @param state current executor state; updated by this runner as the process progresses
 * @param resources resources handed to `prepareResourcesFile`
 */
private[deploy] class ExecutorRunner(
    val appId: String,
    val execId: Int,
    val appDesc: ApplicationDescription,
    val cores: Int,
    val memory: Int,
    val worker: RpcEndpointRef,
    val workerId: String,
    val webUiScheme: String,
    val host: String,
    val webUiPort: Int,
    val publicAddress: String,
    val sparkHome: File,
    val executorDir: File,
    val workerUrl: String,
    conf: SparkConf,
    val appLocalDirs: Seq[String],
    @volatile var state: ExecutorState.Value,
    val resources: Map[String, ResourceInformation] = Map.empty)
  extends Logging {

  // Full executor id in the form "<appId>/<execId>".
  private val fullId = s"$appId/$execId"
  // Thread that launches the executor process and waits for it; null until start() is called
  // and again after kill().
  private var workerThread: Thread = null
  // The launched executor process; null until the process builder has been started.
  private var process: Process = null
  // Appender that copies the process's stdout into the "stdout" file under executorDir.
  private var stdoutAppender: FileAppender = null
  // Appender that copies the process's stderr into the "stderr" file under executorDir.
  private var stderrAppender: FileAppender = null

  // Timeout to wait for when trying to terminate an executor (10 seconds, in milliseconds).
  private val EXECUTOR_TERMINATE_TIMEOUT_MS = 10000

  // NOTE: This is now redundant with the automated shut-down enforced by the Executor. It might
  // make sense to remove this in the future.
  private var shutdownHook: AnyRef = null

  /**
   * Start the runner thread, which executes `fetchAndRunExecutor`, then register a shutdown
   * hook that kills the executor process with the message "Worker shutting down".
   */
  private[worker] def start(): Unit = {
    val runnerThread = new Thread(s"ExecutorRunner for $fullId") {
      override def run(): Unit = fetchAndRunExecutor()
    }
    workerThread = runnerThread
    runnerThread.start()
    shutdownHook = ShutdownHookManager.addShutdownHook { () =>
      // It's possible that we arrive here before calling `fetchAndRunExecutor`, then `state` will
      // be `ExecutorState.LAUNCHING`. In this case, we should set `state` to `FAILED`.
      val stillActive = state == ExecutorState.LAUNCHING || state == ExecutorState.RUNNING
      if (stillActive) {
        state = ExecutorState.FAILED
      }
      killProcess(Some("Worker shutting down"))
    }
  }

  /**
   * Kill executor process, wait for exit and notify worker to update resource status.
   *
   * @param message the exception message which caused the executor's death
   */
  private def killProcess(message: Option[String]): Unit = {
    // Exit code of the terminated process; None when no process was started or when
    // termination did not yield an exit code.
    val exitCode: Option[Int] =
      if (process == null) {
        None
      } else {
        logInfo("Killing process!")
        Option(stdoutAppender).foreach(_.stop())
        Option(stderrAppender).foreach(_.stop())
        val terminated = Utils.terminateProcess(process, EXECUTOR_TERMINATE_TIMEOUT_MS)
        if (terminated.isEmpty) {
          logWarning(s"Failed to terminate process: ${process}. This process will likely be orphaned.")
        }
        terminated
      }
    // The worker is notified even when there was no process to kill.
    try {
      worker.send(ExecutorStateChanged(appId, execId, state, message, exitCode))
    } catch {
      case ise: IllegalStateException => logWarning(ise.getMessage(), ise)
    }
  }

  /** Stop this executor runner, including killing the process it launched */
  private[worker] def kill(): Unit = {
    Option(workerThread).foreach { runnerThread =>
      // the workerThread will kill the child process when interrupted
      runnerThread.interrupt()
      workerThread = null
      state = ExecutorState.KILLED
      try {
        ShutdownHookManager.removeShutdownHook(shutdownHook)
      } catch {
        // Deliberately ignored: failing to remove the hook is not treated as an error here.
        case _: IllegalStateException => ()
      }
    }
  }

  /** Replace variables such as {{EXECUTOR_ID}} and {{CORES}} in a command argument passed to us */
  private[worker] def substituteVariables(argument: String): String = {
    val substitutions = Map(
      "{{WORKER_URL}}" -> workerUrl,
      "{{EXECUTOR_ID}}" -> execId.toString,
      "{{HOSTNAME}}" -> host,
      "{{CORES}}" -> cores.toString,
      "{{APP_ID}}" -> appId)
    // Arguments that are not one of the known placeholders pass through unchanged.
    substitutions.getOrElse(argument, argument)
  }

  /**
   * Download and run the executor described in our ApplicationDescription.
   *
   * Builds the launch command, starts the process, redirects its output to files under
   * `executorDir`, reports RUNNING to the worker, then blocks until the process exits and
   * reports EXITED. An interrupt leads to KILLED and any other exception to FAILED; in both
   * cases the process is killed via `killProcess`.
   */
  private def fetchAndRunExecutor(): Unit = {
    try {
      // Resources file for this executor, if `prepareResourcesFile` produced one.
      val resourceFileOpt = prepareResourcesFile(SPARK_EXECUTOR_PREFIX, resources, executorDir)

      // Launch the process: append the resources file path (when present) to the arguments.
      val resourceArgs = resourceFileOpt.fold(Seq.empty[String]) { file =>
        Seq("--resourcesFile", file.getAbsolutePath)
      }
      val launchArgs = appDesc.command.arguments ++ resourceArgs
      val launchOpts = appDesc.command.javaOpts.map { opt =>
        Utils.substituteAppNExecIds(opt, appId, execId.toString)
      }
      val launchCommand = appDesc.command.copy(arguments = launchArgs, javaOpts = launchOpts)

      // Create the process builder from the application's command and the environment settings.
      val builder = CommandUtils.buildProcessBuilder(launchCommand, new SecurityManager(conf),
        memory, sparkHome.getAbsolutePath, substituteVariables)
      val commandParts = builder.command().asScala.toSeq
      val redactedCommand = Utils.redactCommandLineArgs(conf, commandParts)
        .mkString("\"", "\" \"", "\"")
      logInfo(s"Launch command: $redactedCommand")

      // Run the process inside the executor directory.
      builder.directory(executorDir)
      val env = builder.environment
      env.put("SPARK_EXECUTOR_DIRS", appLocalDirs.mkString(File.pathSeparator))
      // In case we are running this from within the Spark Shell, avoid creating a "scala"
      // parent process for the executor command
      env.put("SPARK_LAUNCH_WITH_SCALA", "0")

      // Add webUI log urls: both variants share the same log page query string and differ
      // only in the prefix (reverse proxy versus direct worker address).
      val logPageQuery = s"/logPage/?appId=$appId&executorId=$execId&logType="
      val urlPrefix =
        if (conf.get(UI_REVERSE_PROXY)) {
          val proxyRoot = conf.get(UI_REVERSE_PROXY_URL.key, "").stripSuffix("/")
          s"$proxyRoot/proxy/$workerId"
        } else {
          s"$webUiScheme$publicAddress:$webUiPort"
        }
      val baseUrl = urlPrefix + logPageQuery
      env.put("SPARK_LOG_URL_STDERR", baseUrl + "stderr")
      env.put("SPARK_LOG_URL_STDOUT", baseUrl + "stdout")

      // Start the process described by the builder.
      process = builder.start()
      val separator = "=" * 40
      val header = s"Spark Executor Command: $redactedCommand\n$separator\n\n"

      // Redirect its stdout and stderr to files
      val stdoutFile = new File(executorDir, "stdout")
      stdoutAppender = FileAppender(process.getInputStream, stdoutFile, conf, true)

      // The stderr file starts with a header recording the (redacted) launch command.
      val stderrFile = new File(executorDir, "stderr")
      Files.write(header, stderrFile, StandardCharsets.UTF_8)
      stderrAppender = FileAppender(process.getErrorStream, stderrFile, conf, true)

      state = ExecutorState.RUNNING
      worker.send(ExecutorStateChanged(appId, execId, state, None, None))
      // Wait for it to exit; executor may exit with code 0 (when driver instructs it to shutdown)
      // or with nonzero exit code
      val exitCode = process.waitFor()
      state = ExecutorState.EXITED
      val exitMessage = s"Command exited with code $exitCode"
      // Tell the worker that the executor state has changed.
      worker.send(ExecutorStateChanged(appId, execId, state, Some(exitMessage), Some(exitCode)))
    } catch {
      case _: InterruptedException =>
        logInfo(s"Runner thread for executor $fullId interrupted")
        state = ExecutorState.KILLED
        killProcess(None)
      case failure: Exception =>
        logError("Error running executor", failure)
        state = ExecutorState.FAILED
        killProcess(Some(failure.toString))
    }
  }
}

ExecutorRunner的start方法主要做两件事：创建一个线程并在该线程中执行fetchAndRunExecutor方法，然后注册一个关闭钩子，在worker关闭时终止executor进程：

/**
 * Start the runner thread, which executes `fetchAndRunExecutor`, then register a shutdown
 * hook that kills the executor process with the message "Worker shutting down".
 */
private[worker] def start(): Unit = {
  val runnerThread = new Thread(s"ExecutorRunner for $fullId") {
    override def run(): Unit = fetchAndRunExecutor()
  }
  workerThread = runnerThread
  // Start the thread.
  runnerThread.start()
  shutdownHook = ShutdownHookManager.addShutdownHook { () =>
    // It's possible that we arrive here before calling `fetchAndRunExecutor`, then `state` will
    // be `ExecutorState.LAUNCHING`. In this case, we should set `state` to `FAILED`.
    val stillActive = state == ExecutorState.LAUNCHING || state == ExecutorState.RUNNING
    if (stillActive) {
      state = ExecutorState.FAILED
    }
    killProcess(Some("Worker shutting down"))
  }
}

接下来进一步看一下fetchAndRunExecutor方法具体执行什么操作：

/**
 * Download and run the executor described in our ApplicationDescription.
 *
 * Builds the launch command, starts the process, redirects its output to files under
 * `executorDir`, reports RUNNING to the worker, then blocks until the process exits and
 * reports EXITED. An interrupt leads to KILLED and any other exception to FAILED; in both
 * cases the process is killed via `killProcess`.
 */
private def fetchAndRunExecutor(): Unit = {
  try {
    // Resources file for this executor, if `prepareResourcesFile` produced one.
    val resourceFileOpt = prepareResourcesFile(SPARK_EXECUTOR_PREFIX, resources, executorDir)

    // Launch the process: append the resources file path (when present) to the arguments.
    val resourceArgs = resourceFileOpt.fold(Seq.empty[String]) { file =>
      Seq("--resourcesFile", file.getAbsolutePath)
    }
    val launchArgs = appDesc.command.arguments ++ resourceArgs
    val launchOpts = appDesc.command.javaOpts.map { opt =>
      Utils.substituteAppNExecIds(opt, appId, execId.toString)
    }
    val launchCommand = appDesc.command.copy(arguments = launchArgs, javaOpts = launchOpts)

    // Create the process builder from the application's command and the environment settings.
    val builder = CommandUtils.buildProcessBuilder(launchCommand, new SecurityManager(conf),
      memory, sparkHome.getAbsolutePath, substituteVariables)
    val commandParts = builder.command().asScala.toSeq
    val redactedCommand = Utils.redactCommandLineArgs(conf, commandParts)
      .mkString("\"", "\" \"", "\"")
    logInfo(s"Launch command: $redactedCommand")

    // Run the process inside the executor directory.
    builder.directory(executorDir)
    val env = builder.environment
    env.put("SPARK_EXECUTOR_DIRS", appLocalDirs.mkString(File.pathSeparator))
    // In case we are running this from within the Spark Shell, avoid creating a "scala"
    // parent process for the executor command
    env.put("SPARK_LAUNCH_WITH_SCALA", "0")

    // Add webUI log urls: both variants share the same log page query string and differ
    // only in the prefix (reverse proxy versus direct worker address).
    val logPageQuery = s"/logPage/?appId=$appId&executorId=$execId&logType="
    val urlPrefix =
      if (conf.get(UI_REVERSE_PROXY)) {
        val proxyRoot = conf.get(UI_REVERSE_PROXY_URL.key, "").stripSuffix("/")
        s"$proxyRoot/proxy/$workerId"
      } else {
        s"$webUiScheme$publicAddress:$webUiPort"
      }
    val baseUrl = urlPrefix + logPageQuery
    env.put("SPARK_LOG_URL_STDERR", baseUrl + "stderr")
    env.put("SPARK_LOG_URL_STDOUT", baseUrl + "stdout")

    // Start the process described by the builder.
    process = builder.start()
    val separator = "=" * 40
    val header = s"Spark Executor Command: $redactedCommand\n$separator\n\n"

    // Redirect its stdout and stderr to files
    val stdoutFile = new File(executorDir, "stdout")
    stdoutAppender = FileAppender(process.getInputStream, stdoutFile, conf, true)

    // The stderr file starts with a header recording the (redacted) launch command.
    val stderrFile = new File(executorDir, "stderr")
    Files.write(header, stderrFile, StandardCharsets.UTF_8)
    stderrAppender = FileAppender(process.getErrorStream, stderrFile, conf, true)

    state = ExecutorState.RUNNING
    worker.send(ExecutorStateChanged(appId, execId, state, None, None))
    // Wait for it to exit; executor may exit with code 0 (when driver instructs it to shutdown)
    // or with nonzero exit code
    val exitCode = process.waitFor()
    state = ExecutorState.EXITED
    val exitMessage = s"Command exited with code $exitCode"
    // Tell the worker that the executor state has changed.
    worker.send(ExecutorStateChanged(appId, execId, state, Some(exitMessage), Some(exitCode)))
  } catch {
    case _: InterruptedException =>
      logInfo(s"Runner thread for executor $fullId interrupted")
      state = ExecutorState.KILLED
      killProcess(None)
    case failure: Exception =>
      logError("Error running executor", failure)
      state = ExecutorState.FAILED
      killProcess(Some(failure.toString))
  }
}

fetchAndRunExecutor主要是拼接命令参数，创建ProcessBuilder并调用其start方法启动进程。可以看出，命令及其参数主要来自appDesc（其中封装了要执行的命令）。ProcessBuilder的start方法（JDK源码）负责真正执行该命令：

public Process start() throws IOException {
    // Must convert to array first -- a malicious user-supplied
    // list might try to circumvent the security check.
    String[] cmdarray = command.toArray(new String[command.size()]);
    cmdarray = cmdarray.clone();

    for (String arg : cmdarray)
        if (arg == null)
            throw new NullPointerException();
    // Throws IndexOutOfBoundsException if command is empty
    String prog = cmdarray[0];

    SecurityManager security = System.getSecurityManager();
    if (security != null)
        security.checkExec(prog);

    String dir = directory == null ? null : directory.toString();

    for (int i = 1; i < cmdarray.length; i++) {
        if (cmdarray[i].indexOf('\u0000') >= 0) {
            throw new IOException("invalid null character in command");
        }
    }

    try {
        return ProcessImpl.start(cmdarray,
                                 environment,
                                 dir,
                                 redirects,
                                 redirectErrorStream);
    } catch (IOException | IllegalArgumentException e) {
        String exceptionInfo = ": " + e.getMessage();
        Throwable cause = e;
        if ((e instanceof IOException) && security != null) {
            // Can not disclose the fail reason for read-protected files.
            try {
                security.checkRead(prog);
            } catch (SecurityException se) {
                exceptionInfo = "";
                cause = se;
            }
        }
        // It's much easier for us to create a high-quality error
        // message than the low-level C code which found the problem.
        throw new IOException(
            "Cannot run program "" + prog + """
            + (dir == null ? "" : " (in directory "" + dir + "")")
            + exceptionInfo,
            cause);
    }
}

该方法只负责执行命令，至于具体是什么命令、起到什么作用，从这里暂时看不出来。这是因为SparkContext在初始化的时候已经将相关实例初始化、参数也设置好了，这里只需要传入具体的命令和参数执行即可。下一节我们进一步看一下具体执行任务的进程CoarseGrainedExecutorBackend是怎么创建并通过命令启动的。

总结：这里有点逐步递进，逐渐深入，可能会有些云里雾里的感觉，但是顺着代码一步步往下看就好了。