上一节我们介绍了fetchAndRunExecutor方法中执行命令启动CoarseGrainedExecutorBackend,但是看不出是怎么启动的,这一节就从前往后梳理一遍CoarseGrainedExecutorBackend的创建过程,为什么要单独梳理CoarseGrainedExecutorBackend呢?因为这是Worker上具体执行task任务的Executor进程,是执行任务的核心。
(1)SparkContext初始化
// SparkContext.createTaskScheduler: branch taken when the master URL matches
// the standalone pattern "spark://...".
case SPARK_REGEX(sparkUrl) =>
// TaskSchedulerImpl is the task-scheduling implementation shared by all cluster managers.
val scheduler = new TaskSchedulerImpl(sc)
// The master URL may list several masters (HA setup), comma-separated.
val masterUrls = sparkUrl.split(",").map("spark://" + _)
val backend = new StandaloneSchedulerBackend(scheduler, sc, masterUrls)
scheduler.initialize(backend)
// Returned as the (backend, scheduler) pair used by SparkContext.
(backend, scheduler)
SparkContext中会根据匹配如果是Standalone模式创建StandaloneSchedulerBackend类。
(2)StandaloneSchedulerBackend创建
// Driver-side scheduler backend for standalone deploy mode. It extends
// CoarseGrainedSchedulerBackend (the common RPC-based backend) and receives
// cluster events through StandaloneAppClientListener.
private[spark] class StandaloneSchedulerBackend(
scheduler: TaskSchedulerImpl,
sc: SparkContext,
masters: Array[String])
extends CoarseGrainedSchedulerBackend(scheduler, sc.env.rpcEnv)
with StandaloneAppClientListener
with Logging {
// Client used to register this application with the standalone Master(s);
// created in start(), hence initially null.
private[spark] var client: StandaloneAppClient = null
StandaloneSchedulerBackend继承自CoarseGrainedSchedulerBackend,并且会在内部创建StandaloneAppClient,同时StandaloneSchedulerBackend在start方法中会拼接启动CoarseGrainedExecutorBackend类的命令:
override def start(): Unit = {
super.start()
// SPARK-21159. The scheduler backend should only try to connect to the launcher
// in client mode. In cluster mode, the code that submits the application to the
// Master handles the launcher communication instead.
if (sc.deployMode == "client") {
launcherBackend.connect()
}
// The RPC endpoint executors will connect back to (the driver's address).
val driverUrl = RpcEndpointAddress(
sc.conf.get(config.DRIVER_HOST_ADDRESS),
sc.conf.get(config.DRIVER_PORT),
CoarseGrainedSchedulerBackend.ENDPOINT_NAME).toString
// Launch arguments for the executor process. The {{...}} placeholders are
// filled in later by the Worker (ExecutorRunner.substituteVariables).
val args = Seq(
"--driver-url", driverUrl,
"--executor-id", "{{EXECUTOR_ID}}",
"--hostname", "{{HOSTNAME}}",
"--cores", "{{CORES}}",
"--app-id", "{{APP_ID}}",
"--worker-url", "{{WORKER_URL}}")
// Extra JVM options configured for executors (spark.executor.extraJavaOptions).
val extraJavaOpts = sc.conf.get(config.EXECUTOR_JAVA_OPTIONS)
.map(Utils.splitCommandString).getOrElse(Seq.empty)
// Executor classpath entries (spark.executor.extraClassPath).
val classPathEntries = sc.conf.get(config.EXECUTOR_CLASS_PATH)
.map(_.split(java.io.File.pathSeparator).toSeq).getOrElse(Nil)
// Executor native-library path entries (spark.executor.extraLibraryPath).
val libraryPathEntries = sc.conf.get(config.EXECUTOR_LIBRARY_PATH)
.map(_.split(java.io.File.pathSeparator).toSeq).getOrElse(Nil)
// When testing, expose the parent class path to the child so that all jars
// needed by assemblies built with "*-provided" profiles are visible to the
// child process.
val testingClassPath =
if (sys.props.contains(IS_TESTING.key)) {
sys.props("java.class.path").split(java.io.File.pathSeparator).toSeq
} else {
Nil
}
// Start executors with a few necessary configs so they can register with the scheduler.
val sparkJavaOpts = Utils.sparkJavaOpts(conf, SparkConf.isExecutorStartupConf)
val javaOpts = sparkJavaOpts ++ extraJavaOpts
// The class actually launched on the Worker: CoarseGrainedExecutorBackend.
val command = Command("org.apache.spark.executor.CoarseGrainedExecutorBackend",
args, sc.executorEnvs, classPathEntries ++ testingClassPath, libraryPathEntries, javaOpts)
// Driver web UI address (empty string if the UI is disabled).
val webUrl = sc.ui.map(_.webUrl).getOrElse("")
val coresPerExecutor = conf.getOption(config.EXECUTOR_CORES.key).map(_.toInt)
// If dynamic allocation is enabled, set the initial executor limit to 0;
// ExecutorAllocationManager will send the real initial limit to the Master later.
val initialExecutorLimit =
if (Utils.isDynamicAllocationEnabled(conf)) {
Some(0)
} else {
None
}
val executorResourceReqs = ResourceUtils.parseResourceRequirements(conf,
config.SPARK_EXECUTOR_PREFIX)
// Application description. NOTE: this appDesc (including the assembled command)
// is what ExecutorRunner later receives on the Worker.
val appDesc = ApplicationDescription(sc.appName, maxCores, sc.executorMemory, command,
webUrl, sc.eventLogDir, sc.eventLogCodec, coresPerExecutor, initialExecutorLimit,
resourceReqsPerExecutor = executorResourceReqs)
// Create the StandaloneAppClient that talks to the Master.
client = new StandaloneAppClient(sc.env.rpcEnv, masters, appDesc, this, conf)
// Start the client (triggers registration with the Master).
client.start()
launcherBackend.setState(SparkAppHandle.State.SUBMITTED)
// Block until the application is registered with a Master.
waitForRegistration()
launcherBackend.setState(SparkAppHandle.State.RUNNING)
}
(3)StandaloneAppClient向Master注册
// RpcEndpoint lifecycle hook: called once when StandaloneAppClient starts.
override def onStart(): Unit = {
try {
// First registration attempt with the Master(s); retries are scheduled internally.
registerWithMaster(1)
} catch {
case e: Exception =>
logWarning("Failed to connect to master", e)
markDisconnected()
stop()
}
}
StandaloneAppClient默认执行的启动方法onStart方法,该方法会向Master进行注册。
// Register with all masters asynchronously. It will call `registerWithMaster` every
// REGISTRATION_TIMEOUT_SECONDS until exceeding REGISTRATION_RETRIES attempts.
// Once we successfully connect to a master, all scheduling work and futures are cancelled.
// nthRetry means this is the nth attempt to register with the master.
private def registerWithMaster(nthRetry: Int): Unit = {
// Submit a registration attempt to every master and keep the futures for cancellation.
registerMasterFutures.set(tryRegisterAllMasters())
registrationRetryTimer.set(registrationRetryThread.schedule(new Runnable {
override def run(): Unit = {
// On success the Master replies with RegisteredApplication (handled in receive),
// which sets `registered` to true.
if (registered.get) {
// Already registered: cancel the outstanding registration futures...
registerMasterFutures.get.foreach(_.cancel(true))
// ...and shut down the registration thread pool.
registerMasterThreadPool.shutdownNow()
} else if (nthRetry >= REGISTRATION_RETRIES) {// Give up after the retry limit (3).
markDead("All masters are unresponsive! Giving up.")
} else {
// Cancel the current attempts...
registerMasterFutures.get.foreach(_.cancel(true))
// ...and try again.
registerWithMaster(nthRetry + 1)
}
}
}, REGISTRATION_TIMEOUT_SECONDS, TimeUnit.SECONDS))
}
registerWithMaster会通过异步的方式向Master提交注册消息,具体的提交由tryRegisterAllMasters完成:
// Register with all masters asynchronously and return an array of `Future`s for cancellation.
private def tryRegisterAllMasters(): Array[JFuture[_]] = {
// Iterate over every configured master address — in an HA cluster there are
// usually standby masters as well, and we must try all of them.
// `for ... yield` collects one result per address, i.e. an Array[JFuture[_]].
// Each JFuture represents a cancellable registration attempt.
for (masterAddress <- masterRpcAddresses) yield {
registerMasterThreadPool.submit(new Runnable {
override def run(): Unit = try {
// Bail out early if some other attempt already succeeded.
if (registered.get) {
return
}
logInfo("Connecting to master " + masterAddress.toSparkURL + "...")
// Obtain a reference to the Master endpoint...
val masterRef = rpcEnv.setupEndpointRef(masterAddress, Master.ENDPOINT_NAME)
// ...and send it the registration message.
masterRef.send(RegisterApplication(appDescription, self))
} catch {
case ie: InterruptedException => // Cancelled
case NonFatal(e) => logWarning(s"Failed to connect to master $masterAddress", e)
}
})
}
}
tryRegisterAllMasters中主要是获取Master的引用masterRef,并向Master发送RegisterApplication消息,Master在收到消息后进行处理。
(4)Master注册应用
// Master.receive: handle an application registration request from a driver.
case RegisterApplication(description, driver) =>
// TODO Prevent repeated registrations from some driver
// A standby Master ignores the request (no response is sent).
if (state == RecoveryState.STANDBY) {
// ignore, don't send response
} else {
logInfo("Registering app " + description.name)
// Build the ApplicationInfo from the description...
val app = createApplication(description, driver)
// ...and register it with this Master.
registerApplication(app)
logInfo("Registered app " + description.name + " with ID " + app.id)
// Persist the application so it survives Master failover.
persistenceEngine.addApplication(app)
// Tell the submitting driver that registration succeeded.
driver.send(RegisteredApplication(app.id, self))
// Kick off scheduling of drivers/executors.
schedule()
}
Master在收到StandaloneAppClient发送过来的注册消息之后根据参数创建应用,并且注册应用,最后向StandaloneAppClient(也就是driver)发消息RegisteredApplication表示已经注册成功,然后开始调度任务和资源schedule。
(5)StandaloneAppClient设置应用id
// StandaloneAppClient.receive: the Master confirmed our registration.
case RegisteredApplication(appId_, masterRef) =>
// FIXME How to handle the following cases?
// 1. A master receives multiple registrations and sends back multiple
// RegisteredApplications due to an unstable network.
// 2. Receive multiple RegisteredApplication from different masters because the master is
// changing.
// Record the application ID assigned by the Master.
appId.set(appId_)
// Mark registration as successful (stops the retry timer's re-attempts).
registered.set(true)
// Keep a reference to the Master that accepted us.
master = Some(masterRef)
// Notify the listener (StandaloneSchedulerBackend) that we are connected.
listener.connected(appId.get)
StandaloneAppClient在收到Master返回的RegisteredApplication消息之后就设置应用注册成功,同时设置Master引用。
(6)Master调度任务和资源
// Schedule the currently available resources among waiting apps. This method is
// called every time a new app joins or resource availability changes.
private def schedule(): Unit = {
// Only an ALIVE Master schedules; standby/recovering Masters return immediately.
if (state != RecoveryState.ALIVE) {
return
}
// Drivers take strict precedence over executors.
// Random.shuffle randomizes the worker order so drivers spread across the cluster.
// Only ALIVE workers are considered.
val shuffledAliveWorkers = Random.shuffle(workers.toSeq.filter(_.state == WorkerState.ALIVE))
// Number of live workers.
val numWorkersAlive = shuffledAliveWorkers.size
// Cursor into the shuffled worker list (round-robin position).
var curPos = 0
// Walk the waiting drivers (over a copy of waitingDrivers).
for (driver <- waitingDrivers.toList) { // iterate over a copy of waitingDrivers
// We assign workers to each waiting driver in a round-robin fashion. For each driver, we
// start from the last worker that was assigned a driver, and continue onwards until we have
// explored all alive workers.
// Whether this driver has been launched yet.
var launched = false
// Whether the cluster looks completely idle (used only for the warning below).
var isClusterIdle = true
// How many workers we have examined for this driver.
var numWorkersVisited = 0
// Keep probing workers until all alive workers were visited or the driver launched.
while (numWorkersVisited < numWorkersAlive && !launched) {
// Worker at the current round-robin position.
val worker = shuffledAliveWorkers(curPos)
// The cluster counts as idle only if this worker runs no drivers and no executors.
isClusterIdle = worker.drivers.isEmpty && worker.executors.isEmpty
// One more worker visited.
numWorkersVisited += 1
// Launch here if this worker has enough free resources for the driver.
if (canLaunchDriver(worker, driver.desc)) {
// Reserve the worker's resources for this driver...
val allocated = worker.acquireResources(driver.desc.resourceReqs)
// ...record them on the driver...
driver.withResources(allocated)
// ...and actually launch the driver on the worker.
launchDriver(worker, driver)
// Remove the driver from the waiting list.
waitingDrivers -= driver
// Mark it as launched so the while-loop ends.
launched = true
}
// Advance the round-robin cursor.
curPos = (curPos + 1) % numWorkersAlive
}
// Not launched while the cluster is idle: no single worker can satisfy this driver.
if (!launched && isClusterIdle) {
// Warn that the driver needs more resources than any worker has.
logWarning(s"Driver ${driver.id} requires more resource than any of Workers could have.")
}
}
// Finally, start executors for waiting apps on the workers.
startExecutorsOnWorkers()
}
schedule方法每次都会在有应用注册或者资源变动的时候被调用,该方法主要是检查释放资源,并根据任务先进先出的策略配置可执行的任务资源,即给应用分配相应满足其执行的内存空间、核数等等,最后在Worker上启动Executor来执行任务startExecutorsOnWorkers。
// Schedule and launch executors on workers.
private def startExecutorsOnWorkers(): Unit = {
// Right now this is a very simple FIFO scheduler. We keep trying to fit in the first app
// in the queue, then the second app, etc.
// Walk the waiting applications in FIFO order.
for (app <- waitingApps) {
// Cores per executor for this app; defaults to 1 when not specified.
val coresPerExecutor = app.desc.coresPerExecutor.getOrElse(1)
// If the cores left is less than the coresPerExecutor, the cores left will not be allocated.
if (app.coresLeft >= coresPerExecutor) {
// Filter out workers that don't have enough resources to launch an executor:
// keep only ALIVE workers that can launch one, sorted by free cores descending.
val usableWorkers = workers.toArray.filter(_.state == WorkerState.ALIVE)
.filter(canLaunchExecutor(_, app.desc))
.sortBy(_.coresFree).reverse
// The app may hang when it is the only waiting app, has no executors yet,
// and no worker can host it.
val appMayHang = waitingApps.length == 1 &&
waitingApps.head.executors.isEmpty && usableWorkers.isEmpty
// Warn in that case so the user sees why nothing is starting.
if (appMayHang) {
// The app needs more resources than any single worker has.
logWarning(s"App ${app.id} requires more resource than any of Workers could have.")
}
// Number of cores each usable worker contributes to this app.
val assignedCores = scheduleExecutorsOnWorkers(app, usableWorkers, spreadOutApps)
// Now that we've decided how many cores to allocate on each worker, let's allocate them.
for (pos <- 0 until usableWorkers.length if assignedCores(pos) > 0) {
// Turn each worker's assigned cores into concrete executors.
allocateWorkerResourceToExecutors(
app, assignedCores(pos), app.desc.coresPerExecutor, usableWorkers(pos))
}
}
}
}
startExecutorsOnWorkers中根据应用需要的资源找到满足条件的Worker,并将Worker的资源分给将要执行该任务的Executor--allocateWorkerResourceToExecutors。
// Allocate a worker's resources to one or more executors.
// app            - information about the application the executors belong to
// assignedCores  - number of cores on this worker assigned to the application
// coresPerExecutor - cores per executor, if specified
// worker         - the worker's information
private def allocateWorkerResourceToExecutors(
app: ApplicationInfo,
assignedCores: Int,
coresPerExecutor: Option[Int],
worker: WorkerInfo): Unit = {
// If the number of cores per executor is specified, we divide the cores assigned
// to this worker evenly among the executors with no remainder.
// Otherwise, we launch a single executor that grabs all the assignedCores on this worker.
// Number of executors: assignedCores / coresPerExecutor, or 1 when unspecified.
val numExecutors = coresPerExecutor.map { assignedCores / _ }.getOrElse(1)
// Cores given to each executor: the configured per-executor value, else all assigned cores.
val coresToAssign = coresPerExecutor.getOrElse(assignedCores)
// Launch each executor in turn.
for (i <- 1 to numExecutors) {
// Reserve this worker's custom resources (GPUs etc.) for the executor.
val allocated = worker.acquireResources(app.desc.resourceReqsPerExecutor)
// Record the executor on the application...
val exec = app.addExecutor(worker, coresToAssign, allocated)
// ...and launch it on the worker.
launchExecutor(worker, exec)
// Mark the application as running.
app.state = ApplicationState.RUNNING
}
}
allocateWorkerResourceToExecutors中确定分配给一个Executor的资源,启动Executor并将应用信息传过去--launchExecutor。
// Launch an executor on a worker.
private def launchExecutor(worker: WorkerInfo, exec: ExecutorDesc): Unit = {
// Log which executor is being launched on which worker.
logInfo("Launching executor " + exec.fullId + " on worker " + worker.id)
// Record the executor on the worker's bookkeeping.
worker.addExecutor(exec)
// Tell the Worker to start the executor, passing the app description and resources.
worker.endpoint.send(LaunchExecutor(masterUrl, exec.application.id, exec.id,
exec.application.desc, exec.cores, exec.memory, exec.resources))
// Tell the driver that submitted this app that an executor was added.
exec.application.driver.send(
ExecutorAdded(exec.id, worker.id, worker.hostPort, exec.cores, exec.memory))
}
launchExecutor实际上就是给Worker发消息LaunchExecutor,并附带应用信息和资源信息,同时给StandaloneAppClient发消息告知Executor已经添加--ExecutorAdded。
(7)Worker启动Executor
// Worker.receive: the Master asked this Worker to launch an executor.
case LaunchExecutor(masterUrl, appId, execId, appDesc, cores_, memory_, resources_) =>
// Ignore requests that do not come from the currently active Master.
if (masterUrl != activeMasterUrl) {
// Log that an invalid Master tried to launch an executor.
logWarning("Invalid Master (" + masterUrl + ") attempted to launch executor.")
} else if (decommissioned) {// Worker has been decommissioned; refuse new executors.
// Asked to launch while decommissioned — do not launch.
logWarning("Asked to launch an executor while decommissioned. Not launching executor.")
} else {
try {
// Log the launch request.
logInfo("Asked to launch executor %s/%d for %s".format(appId, execId, appDesc.name))
// Create the executor's working directory
// (one directory per appId/execId under the Worker's work dir).
val executorDir = new File(workDir, appId + "/" + execId)
if (!executorDir.mkdirs()) {
// Fail fast if the directory cannot be created.
throw new IOException("Failed to create directory " + executorDir)
}
// Create local dirs for the executor. These are passed to the executor via the
// SPARK_EXECUTOR_DIRS environment variable, and deleted by the Worker when the
// application finishes.
// Reuse the app's local dirs if they already exist, otherwise create them.
val appLocalDirs = appDirectories.getOrElse(appId, {
// Create the configured local root directories.
val localRootDirs = Utils.getOrCreateLocalRootDirs(conf)
// Under each root, try to create an executor-specific directory.
val dirs = localRootDirs.flatMap { dir =>
try {
// Create the per-executor directory...
val appDir = Utils.createDirectory(dir, namePrefix = "executor")
// ...restrict its permissions to the owner (0700)...
Utils.chmod700(appDir)
// ...and keep its absolute path.
Some(appDir.getAbsolutePath())
} catch {
// Creation failed for this root directory.
case e: IOException =>
// Skip this directory and continue with the others.
logWarning(s"${e.getMessage}. Ignoring this directory.")
None
}
}.toSeq
// If every root failed, give up.
if (dirs.isEmpty) {
// No subfolder could be created anywhere.
throw new IOException("No subfolder can be created in " +
s"${localRootDirs.mkString(",")}.")
}
dirs
})
// Remember the local dirs for this application.
appDirectories(appId) = appLocalDirs
// ExecutorRunner manages the execution of one executor process.
// This is currently only used in standalone mode.
val manager = new ExecutorRunner(
appId,
execId,
appDesc.copy(command = Worker.maybeUpdateSSLSettings(appDesc.command, conf)),
cores_,
memory_,
self,
workerId,
webUi.scheme,
host,
webUi.boundPort,
publicAddress,
sparkHome,
executorDir,
workerUri,
conf,
appLocalDirs,
ExecutorState.LAUNCHING,
resources_)
// Track the runner under "appId/execId".
executors(appId + "/" + execId) = manager
// Start the runner (spawns the executor process).
manager.start()
// Account for the cores used by this executor...
coresUsed += cores_
// ...and the memory...
memoryUsed += memory_
// ...and any custom resources.
addResourcesUsed(resources_)
} catch {
case e: Exception =>
// Launching the executor failed.
logError(s"Failed to launch executor $appId/$execId for ${appDesc.name}.", e)
// If a runner was already registered for this executor...
if (executors.contains(appId + "/" + execId)) {
// ...kill it...
executors(appId + "/" + execId).kill()
// ...and forget it.
executors -= appId + "/" + execId
}
// Report the FAILED state back to the Master.
sendToMaster(ExecutorStateChanged(appId, execId, ExecutorState.FAILED,
Some(e.toString), None))
}
}
Worker在收到Master发过来的LaunchExecutor消息之后会创建应用执行的目录进行初始化,然后根据应用信息创建一个ExecutorRunner--负责管理Executor的执行(只有在Standalone模式下才有),创建过程中就有应用信息appDesc参数,包含了启动CoarseGrainedExecutorBackend类的命令。ExecutorRunner在启动--manager.start的时候会执行该命令。
(8)ExecutorRunner启动
// Start the runner: spawn a thread that fetches and runs the executor process,
// and register a shutdown hook that kills it when the Worker JVM exits.
private[worker] def start(): Unit = {
workerThread = new Thread("ExecutorRunner for " + fullId) {
override def run(): Unit = { fetchAndRunExecutor() }
}
// Start the worker thread.
workerThread.start()
// Shutdown hook that kills actors on shutdown.
shutdownHook = ShutdownHookManager.addShutdownHook { () =>
// It's possible that we arrive here before calling `fetchAndRunExecutor`, then `state` will
// be `ExecutorState.LAUNCHING`. In this case, we should set `state` to `FAILED`.
if (state == ExecutorState.LAUNCHING || state == ExecutorState.RUNNING) {
state = ExecutorState.FAILED
}
killProcess(Some("Worker shutting down")) }
}
ExecutorRunner在启动的时候主要会执行一个fetchAndRunExecutor方法,该方法负责创建CoarseGrainedExecutorBackend进程。
/**
 * Download and run the executor described in our ApplicationDescription
 */
// Builds the executor's command line, spawns the process, redirects its output,
// and waits for it to exit, reporting state changes to the Worker.
private def fetchAndRunExecutor(): Unit = {
try {
// prepareResourcesFile saves the resources allocated to the driver (cluster mode only)
// or executor into a JSON-formatted resources file. Used only in standalone mode.
val resourceFileOpt = prepareResourcesFile(SPARK_EXECUTOR_PREFIX, resources, executorDir)
// Launch the process.
// Append the resources-file path to the command arguments when present.
val arguments = appDesc.command.arguments ++ resourceFileOpt.map(f =>
Seq("--resourcesFile", f.getAbsolutePath)).getOrElse(Seq.empty)
val subsOpts = appDesc.command.javaOpts.map {
Utils.substituteAppNExecIds(_, appId, execId.toString)
}
val subsCommand = appDesc.command.copy(arguments = arguments, javaOpts = subsOpts)
// Build a ProcessBuilder from the application's command and environment.
val builder = CommandUtils.buildProcessBuilder(subsCommand, new SecurityManager(conf),
memory, sparkHome.getAbsolutePath, substituteVariables)
val command = builder.command()
val redactedCommand = Utils.redactCommandLineArgs(conf, command.asScala.toSeq)
.mkString("\"", "\" \"", "\"")
logInfo(s"Launch command: $redactedCommand")
// Run the process inside the executor's working directory.
builder.directory(executorDir)
builder.environment.put("SPARK_EXECUTOR_DIRS", appLocalDirs.mkString(File.pathSeparator))
// In case we are running this from within the Spark Shell, avoid creating a "scala"
// parent process for the executor command.
builder.environment.put("SPARK_LAUNCH_WITH_SCALA", "0")
// Add webUI log urls
// so the Worker UI can link to this executor's stdout/stderr pages.
val baseUrl =
if (conf.get(UI_REVERSE_PROXY)) {
conf.get(UI_REVERSE_PROXY_URL.key, "").stripSuffix("/") +
s"/proxy/$workerId/logPage/?appId=$appId&executorId=$execId&logType="
} else {
s"$webUiScheme$publicAddress:$webUiPort/logPage/?appId=$appId&executorId=$execId&logType="
}
builder.environment.put("SPARK_LOG_URL_STDERR", s"${baseUrl}stderr")
builder.environment.put("SPARK_LOG_URL_STDOUT", s"${baseUrl}stdout")
// Start the process — this runs the assembled command, which launches the
// CoarseGrainedExecutorBackend JVM.
process = builder.start()
val header = "Spark Executor Command: %s\n%s\n\n".format(
redactedCommand, "=" * 40)
// Redirect its stdout and stderr to files
// in the executor directory so its output is preserved.
val stdout = new File(executorDir, "stdout")
stdoutAppender = FileAppender(process.getInputStream, stdout, conf, true)
val stderr = new File(executorDir, "stderr")
Files.write(header, stderr, StandardCharsets.UTF_8)
stderrAppender = FileAppender(process.getErrorStream, stderr, conf, true)
state = ExecutorState.RUNNING
worker.send(ExecutorStateChanged(appId, execId, state, None, None))
// Wait for it to exit; executor may exit with code 0 (when driver instructs it to shutdown)
// or with nonzero exit code.
val exitCode = process.waitFor()
state = ExecutorState.EXITED
val message = "Command exited with code " + exitCode
// Tell the Worker the executor's state has changed (it exited).
worker.send(ExecutorStateChanged(appId, execId, state, Some(message), Some(exitCode)))
} catch {
case interrupted: InterruptedException =>
logInfo("Runner thread for executor " + fullId + " interrupted")
state = ExecutorState.KILLED
killProcess(None)
case e: Exception =>
logError("Error running executor", e)
state = ExecutorState.FAILED
killProcess(Some(e.toString))
}
}
该方法会拼接参数,配置目录信息,创建构造器buildProcessBuilder,然后启动构造器创建CoarseGrainedExecutorBackend进程。
(9)ProcessBuilder启动命令
public Process start() throws IOException {
// Must convert to array first -- a malicious user-supplied
// list might try to circumvent the security check.
String[] cmdarray = command.toArray(new String[command.size()]);
cmdarray = cmdarray.clone();
for (String arg : cmdarray)
if (arg == null)
throw new NullPointerException();
// Throws IndexOutOfBoundsException if command is empty
String prog = cmdarray[0];
SecurityManager security = System.getSecurityManager();
if (security != null)
security.checkExec(prog);
String dir = directory == null ? null : directory.toString();
for (int i = 1; i < cmdarray.length; i++) {
if (cmdarray[i].indexOf('\u0000') >= 0) {
throw new IOException("invalid null character in command");
}
}
try {
return ProcessImpl.start(cmdarray,
environment,
dir,
redirects,
redirectErrorStream);
} catch (IOException | IllegalArgumentException e) {
String exceptionInfo = ": " + e.getMessage();
Throwable cause = e;
if ((e instanceof IOException) && security != null) {
// Can not disclose the fail reason for read-protected files.
try {
security.checkRead(prog);
} catch (SecurityException se) {
exceptionInfo = "";
cause = se;
}
}
// It's much easier for us to create a high-quality error
// message than the low-level C code which found the problem.
throw new IOException(
"Cannot run program "" + prog + """
+ (dir == null ? "" : " (in directory "" + dir + "")")
+ exceptionInfo,
cause);
}
}
}
该方法主要是执行应用信息中已经拼接好的启动命令,启动CoarseGrainedExecutorBackend类所在的进程。
总结:本篇文章从前到后梳理了应用提交的过程,一步一步深挖直到CoarseGrainedExecutorBackend创建的全部过程,该过程涉及到rpc通信,应用提交,资源分配,任务执行等等,收益颇丰,建议读者对照源码一步一步跟着看。
参考:dongkelun.com/2020/12/26/… blog.csdn.net/i6015/artic… blog.csdn.net/pre_tender/…