上一节我们介绍到worker在启动executor的时候创建了ExecutorRunner,接下来我们来看一下ExecutorRunner是怎么启动进程执行具体的任务的。
ExecutorRunner源码
package org.apache.spark.deploy.worker
import java.io._
import java.nio.charset.StandardCharsets
import scala.collection.JavaConverters._
import com.google.common.io.Files
import org.apache.spark.{SecurityManager, SparkConf}
import org.apache.spark.deploy.{ApplicationDescription, ExecutorState}
import org.apache.spark.deploy.DeployMessages.ExecutorStateChanged
import org.apache.spark.deploy.StandaloneResourceUtils.prepareResourcesFile
import org.apache.spark.internal.Logging
import org.apache.spark.internal.config.SPARK_EXECUTOR_PREFIX
import org.apache.spark.internal.config.UI._
import org.apache.spark.resource.ResourceInformation
import org.apache.spark.rpc.RpcEndpointRef
import org.apache.spark.util.{ShutdownHookManager, Utils}
import org.apache.spark.util.logging.FileAppender
/**
 * Manages the execution of one executor process.
 * This is currently only used in standalone mode.
 */
private[deploy] class ExecutorRunner(
    val appId: String,
    val execId: Int,
    val appDesc: ApplicationDescription,
    val cores: Int,
    val memory: Int,
    val worker: RpcEndpointRef,
    val workerId: String,
    val webUiScheme: String,
    val host: String,
    val webUiPort: Int,
    val publicAddress: String,
    val sparkHome: File,
    val executorDir: File,
    val workerUrl: String,
    conf: SparkConf,
    val appLocalDirs: Seq[String],
    @volatile var state: ExecutorState.Value,
    val resources: Map[String, ResourceInformation] = Map.empty)
  extends Logging {

  // Fully qualified executor id: "<appId>/<execId>".
  private val fullId = appId + "/" + execId
  // Thread that fetches, launches and waits on the executor process.
  private var workerThread: Thread = null
  // Handle of the launched executor OS process.
  private var process: Process = null
  // Copies the process's stdout into a file under executorDir.
  private var stdoutAppender: FileAppender = null
  // Copies the process's stderr into a file under executorDir.
  private var stderrAppender: FileAppender = null

  // Timeout to wait for when trying to terminate an executor.
  private val EXECUTOR_TERMINATE_TIMEOUT_MS = 10 * 1000

  // NOTE: This is now redundant with the automated shut-down enforced by the Executor. It might
  // make sense to remove this in the future.
  private var shutdownHook: AnyRef = null

  private[worker] def start(): Unit = {
    // Do the blocking fetch/launch/wait work on a dedicated thread so the caller
    // (the worker's message loop) is not blocked.
    workerThread = new Thread("ExecutorRunner for " + fullId) {
      override def run(): Unit = { fetchAndRunExecutor() }
    }
    workerThread.start()
    // Shutdown hook that kills actors on shutdown.
    shutdownHook = ShutdownHookManager.addShutdownHook { () =>
      // It's possible that we arrive here before calling `fetchAndRunExecutor`, then `state` will
      // be `ExecutorState.LAUNCHING`. In this case, we should set `state` to `FAILED`.
      if (state == ExecutorState.LAUNCHING || state == ExecutorState.RUNNING) {
        state = ExecutorState.FAILED
      }
      killProcess(Some("Worker shutting down")) }
  }

  /**
   * Kill executor process, wait for exit and notify worker to update resource status.
   *
   * @param message the exception message which caused the executor's death
   */
  private def killProcess(message: Option[String]): Unit = {
    var exitCode: Option[Int] = None
    if (process != null) {
      logInfo("Killing process!")
      // Stop the log appenders before killing the process so they flush cleanly.
      if (stdoutAppender != null) {
        stdoutAppender.stop()
      }
      if (stderrAppender != null) {
        stderrAppender.stop()
      }
      // Returns None if the process could not be terminated within the timeout.
      exitCode = Utils.terminateProcess(process, EXECUTOR_TERMINATE_TIMEOUT_MS)
      if (exitCode.isEmpty) {
        logWarning("Failed to terminate process: " + process +
          ". This process will likely be orphaned.")
      }
    }
    try {
      // Report the (possibly missing) exit code to the worker so it can update its bookkeeping.
      worker.send(ExecutorStateChanged(appId, execId, state, message, exitCode))
    } catch {
      // The RPC endpoint may already be stopped (e.g. during worker shutdown).
      case e: IllegalStateException => logWarning(e.getMessage(), e)
    }
  }

  /** Stop this executor runner, including killing the process it launched */
  private[worker] def kill(): Unit = {
    if (workerThread != null) {
      // the workerThread will kill the child process when interrupted
      workerThread.interrupt()
      workerThread = null
      state = ExecutorState.KILLED
      try {
        ShutdownHookManager.removeShutdownHook(shutdownHook)
      } catch {
        // Thrown if the JVM is already shutting down; safe to ignore.
        case e: IllegalStateException => None
      }
    }
  }

  /** Replace variables such as {{EXECUTOR_ID}} and {{CORES}} in a command argument passed to us */
  private[worker] def substituteVariables(argument: String): String = argument match {
    case "{{WORKER_URL}}" => workerUrl
    case "{{EXECUTOR_ID}}" => execId.toString
    case "{{HOSTNAME}}" => host
    case "{{CORES}}" => cores.toString
    case "{{APP_ID}}" => appId
    case other => other
  }

  /**
   * Download and run the executor described in our ApplicationDescription
   */
  private def fetchAndRunExecutor(): Unit = {
    try {
      // prepareResourcesFile saves the resources allocated to this executor into a JSON-formatted
      // resources file under executorDir. Only used in standalone mode.
      val resourceFileOpt = prepareResourcesFile(SPARK_EXECUTOR_PREFIX, resources, executorDir)
      // Launch the process.
      // Append "--resourcesFile <path>" when a resources file was written.
      val arguments = appDesc.command.arguments ++ resourceFileOpt.map(f =>
        Seq("--resourcesFile", f.getAbsolutePath)).getOrElse(Seq.empty)
      // Substitute app/executor id placeholders inside the JVM options.
      val subsOpts = appDesc.command.javaOpts.map {
        Utils.substituteAppNExecIds(_, appId, execId.toString)
      }
      val subsCommand = appDesc.command.copy(arguments = arguments, javaOpts = subsOpts)
      // Build a ProcessBuilder from the application's command, memory and environment settings.
      val builder = CommandUtils.buildProcessBuilder(subsCommand, new SecurityManager(conf),
        memory, sparkHome.getAbsolutePath, substituteVariables)
      val command = builder.command()
      // Redact sensitive arguments before logging the launch command.
      val redactedCommand = Utils.redactCommandLineArgs(conf, command.asScala.toSeq)
        .mkString("\"", "\" \"", "\"")
      logInfo(s"Launch command: $redactedCommand")
      // Run the process from the executor's working directory.
      builder.directory(executorDir)
      builder.environment.put("SPARK_EXECUTOR_DIRS", appLocalDirs.mkString(File.pathSeparator))
      // In case we are running this from within the Spark Shell, avoid creating a "scala"
      // parent process for the executor command
      builder.environment.put("SPARK_LAUNCH_WITH_SCALA", "0")

      // Add webUI log urls: via the reverse proxy when enabled, otherwise directly
      // to this worker's web UI.
      val baseUrl =
        if (conf.get(UI_REVERSE_PROXY)) {
          conf.get(UI_REVERSE_PROXY_URL.key, "").stripSuffix("/") +
            s"/proxy/$workerId/logPage/?appId=$appId&executorId=$execId&logType="
        } else {
          s"$webUiScheme$publicAddress:$webUiPort/logPage/?appId=$appId&executorId=$execId&logType="
        }
      builder.environment.put("SPARK_LOG_URL_STDERR", s"${baseUrl}stderr")
      builder.environment.put("SPARK_LOG_URL_STDOUT", s"${baseUrl}stdout")

      // Launch the executor process (per the surrounding article, this is the
      // CoarseGrainedExecutorBackend instance).
      process = builder.start()
      val header = "Spark Executor Command: %s\n%s\n\n".format(
        redactedCommand, "=" * 40)

      // Redirect its stdout and stderr to files
      val stdout = new File(executorDir, "stdout")
      stdoutAppender = FileAppender(process.getInputStream, stdout, conf, true)

      val stderr = new File(executorDir, "stderr")
      Files.write(header, stderr, StandardCharsets.UTF_8)
      stderrAppender = FileAppender(process.getErrorStream, stderr, conf, true)

      state = ExecutorState.RUNNING
      worker.send(ExecutorStateChanged(appId, execId, state, None, None))
      // Wait for it to exit; executor may exit with code 0 (when driver instructs it to shutdown)
      // or with nonzero exit code
      val exitCode = process.waitFor()
      state = ExecutorState.EXITED
      val message = "Command exited with code " + exitCode
      // Notify the worker that this executor's state has changed.
      worker.send(ExecutorStateChanged(appId, execId, state, Some(message), Some(exitCode)))
    } catch {
      case interrupted: InterruptedException =>
        // kill() interrupts workerThread; treat as an intentional kill.
        logInfo("Runner thread for executor " + fullId + " interrupted")
        state = ExecutorState.KILLED
        killProcess(None)
      case e: Exception =>
        logError("Error running executor", e)
        state = ExecutorState.FAILED
        killProcess(Some(e.toString))
    }
  }
}
ExecutorRunner中初始化start方法中主要是创建一个线程并在线程中执行fetchAndRunExecutor方法:
// Creates a named thread that runs fetchAndRunExecutor(), starts it, and registers
// a shutdown hook so the child process is killed when the worker JVM exits.
private[worker] def start(): Unit = {
  workerThread = new Thread("ExecutorRunner for " + fullId) {
    override def run(): Unit = { fetchAndRunExecutor() }
  }
  // Launch the runner thread; the heavy lifting happens off the caller's thread.
  workerThread.start()
  // Shutdown hook that kills actors on shutdown.
  shutdownHook = ShutdownHookManager.addShutdownHook { () =>
    // It's possible that we arrive here before calling `fetchAndRunExecutor`, then `state` will
    // be `ExecutorState.LAUNCHING`. In this case, we should set `state` to `FAILED`.
    if (state == ExecutorState.LAUNCHING || state == ExecutorState.RUNNING) {
      state = ExecutorState.FAILED
    }
    killProcess(Some("Worker shutting down")) }
}
接下来进一步看一下fetchAndRunExecutor方法具体执行什么操作:
/**
 * Download and run the executor described in our ApplicationDescription
 */
private def fetchAndRunExecutor(): Unit = {
  try {
    // prepareResourcesFile saves the resources allocated to this executor into a JSON-formatted
    // resources file under executorDir. Only used in standalone mode.
    val resourceFileOpt = prepareResourcesFile(SPARK_EXECUTOR_PREFIX, resources, executorDir)
    // Launch the process.
    // Append "--resourcesFile <path>" when a resources file was written.
    val arguments = appDesc.command.arguments ++ resourceFileOpt.map(f =>
      Seq("--resourcesFile", f.getAbsolutePath)).getOrElse(Seq.empty)
    // Substitute app/executor id placeholders inside the JVM options.
    val subsOpts = appDesc.command.javaOpts.map {
      Utils.substituteAppNExecIds(_, appId, execId.toString)
    }
    val subsCommand = appDesc.command.copy(arguments = arguments, javaOpts = subsOpts)
    // Build a ProcessBuilder from the application's command, memory and environment settings.
    val builder = CommandUtils.buildProcessBuilder(subsCommand, new SecurityManager(conf),
      memory, sparkHome.getAbsolutePath, substituteVariables)
    val command = builder.command()
    // Redact sensitive arguments before logging the launch command.
    val redactedCommand = Utils.redactCommandLineArgs(conf, command.asScala.toSeq)
      .mkString("\"", "\" \"", "\"")
    logInfo(s"Launch command: $redactedCommand")
    // Run the process from the executor's working directory.
    builder.directory(executorDir)
    builder.environment.put("SPARK_EXECUTOR_DIRS", appLocalDirs.mkString(File.pathSeparator))
    // In case we are running this from within the Spark Shell, avoid creating a "scala"
    // parent process for the executor command
    builder.environment.put("SPARK_LAUNCH_WITH_SCALA", "0")

    // Add webUI log urls: via the reverse proxy when enabled, otherwise directly
    // to this worker's web UI.
    val baseUrl =
      if (conf.get(UI_REVERSE_PROXY)) {
        conf.get(UI_REVERSE_PROXY_URL.key, "").stripSuffix("/") +
          s"/proxy/$workerId/logPage/?appId=$appId&executorId=$execId&logType="
      } else {
        s"$webUiScheme$publicAddress:$webUiPort/logPage/?appId=$appId&executorId=$execId&logType="
      }
    builder.environment.put("SPARK_LOG_URL_STDERR", s"${baseUrl}stderr")
    builder.environment.put("SPARK_LOG_URL_STDOUT", s"${baseUrl}stdout")

    // Launch the executor process (per the surrounding article, this is the
    // CoarseGrainedExecutorBackend instance).
    process = builder.start()
    val header = "Spark Executor Command: %s\n%s\n\n".format(
      redactedCommand, "=" * 40)

    // Redirect its stdout and stderr to files
    val stdout = new File(executorDir, "stdout")
    stdoutAppender = FileAppender(process.getInputStream, stdout, conf, true)

    val stderr = new File(executorDir, "stderr")
    Files.write(header, stderr, StandardCharsets.UTF_8)
    stderrAppender = FileAppender(process.getErrorStream, stderr, conf, true)

    state = ExecutorState.RUNNING
    worker.send(ExecutorStateChanged(appId, execId, state, None, None))
    // Wait for it to exit; executor may exit with code 0 (when driver instructs it to shutdown)
    // or with nonzero exit code
    val exitCode = process.waitFor()
    state = ExecutorState.EXITED
    val message = "Command exited with code " + exitCode
    // Notify the worker that this executor's state has changed.
    worker.send(ExecutorStateChanged(appId, execId, state, Some(message), Some(exitCode)))
  } catch {
    case interrupted: InterruptedException =>
      // kill() interrupts workerThread; treat as an intentional kill.
      logInfo("Runner thread for executor " + fullId + " interrupted")
      state = ExecutorState.KILLED
      killProcess(None)
    case e: Exception =>
      logError("Error running executor", e)
      state = ExecutorState.FAILED
      killProcess(Some(e.toString))
  }
}
fetchAndRunExecutor主要是拼接命令参数,创建ProcessBuilder并调用其start方法启动子进程,而且可以看出命令的参数主要来自appDesc,这里面封装了要执行的命令,ProcessBuilder的start方法负责真正执行该命令:
public Process start() throws IOException {
// Must convert to array first -- a malicious user-supplied
// list might try to circumvent the security check.
String[] cmdarray = command.toArray(new String[command.size()]);
cmdarray = cmdarray.clone();
for (String arg : cmdarray)
if (arg == null)
throw new NullPointerException();
// Throws IndexOutOfBoundsException if command is empty
String prog = cmdarray[0];
SecurityManager security = System.getSecurityManager();
if (security != null)
security.checkExec(prog);
String dir = directory == null ? null : directory.toString();
for (int i = 1; i < cmdarray.length; i++) {
if (cmdarray[i].indexOf('\u0000') >= 0) {
throw new IOException("invalid null character in command");
}
}
try {
return ProcessImpl.start(cmdarray,
environment,
dir,
redirects,
redirectErrorStream);
} catch (IOException | IllegalArgumentException e) {
String exceptionInfo = ": " + e.getMessage();
Throwable cause = e;
if ((e instanceof IOException) && security != null) {
// Can not disclose the fail reason for read-protected files.
try {
security.checkRead(prog);
} catch (SecurityException se) {
exceptionInfo = "";
cause = se;
}
}
// It's much easier for us to create a high-quality error
// message than the low-level C code which found the problem.
throw new IOException(
"Cannot run program "" + prog + """
+ (dir == null ? "" : " (in directory "" + dir + "")")
+ exceptionInfo,
cause);
}
}
该方法只是执行命令,那么具体是什么命令,起到什么作用暂时看不出来,这是因为SparkContext在初始化的时候已经将实例初始化了,参数也设置好了,接下来只需要传入具体的参数命令执行即可,下一节我们进一步看一下具体执行任务的进程CoarseGrainedExecutorBackend是怎么创建并通过命令启动的。
总结:这里有点逐步递进,逐渐深入,可能会有些云里雾里的感觉,但是顺着代码一步步往下看就好了。