The previous section walked through how the driver submits an application in standalone mode. In this section we look at how the Master then schedules resources and launches executors to run the job. During this process the Master may send messages to the driver or to a worker; jump to the corresponding page when that happens (the previous section covers the driver, the next section covers the worker). We read the code line by line and step into every function we meet, so we sometimes jump several levels deep and then suddenly pop back to where we started; it is best to keep the source open and follow along. Reading the file from the first line downwards with no thread to follow quickly gets boring, so instead we start from the driver submitting the application and read forward from there, which keeps the train of thought much clearer.
Reading the Master source code
package org.apache.spark.deploy.master
import java.text.SimpleDateFormat
import java.util.{Date, Locale}
import java.util.concurrent.{ScheduledFuture, TimeUnit}
import scala.collection.mutable.{ArrayBuffer, HashMap, HashSet}
import scala.util.Random
import org.apache.spark.{SecurityManager, SparkConf, SparkException}
import org.apache.spark.deploy.{ApplicationDescription, DriverDescription, ExecutorState, SparkHadoopUtil}
import org.apache.spark.deploy.DeployMessages._
import org.apache.spark.deploy.master.DriverState.DriverState
import org.apache.spark.deploy.master.MasterMessages._
import org.apache.spark.deploy.master.ui.MasterWebUI
import org.apache.spark.deploy.rest.StandaloneRestServer
import org.apache.spark.internal.Logging
import org.apache.spark.internal.config._
import org.apache.spark.internal.config.Deploy._
import org.apache.spark.internal.config.UI._
import org.apache.spark.internal.config.Worker._
import org.apache.spark.metrics.{MetricsSystem, MetricsSystemInstances}
import org.apache.spark.resource.{ResourceRequirement, ResourceUtils}
import org.apache.spark.rpc._
import org.apache.spark.serializer.{JavaSerializer, Serializer}
import org.apache.spark.util.{SparkUncaughtExceptionHandler, ThreadUtils, Utils}
private[deploy] class Master(
override val rpcEnv: RpcEnv,
address: RpcAddress,
webUiPort: Int,
val securityMgr: SecurityManager,
val conf: SparkConf)
extends ThreadSafeRpcEndpoint with Logging with LeaderElectable {
//转发消息线程
private val forwardMessageThread =
ThreadUtils.newDaemonSingleThreadScheduledExecutor("master-forward-message-thread")
//获取hadoop配置
private val hadoopConf = SparkHadoopUtil.get.newConfiguration(conf)
// For application IDs
//为创建应用设置的时间格式
private def createDateFormat = new SimpleDateFormat("yyyyMMddHHmmss", Locale.US)
//获取Worker超时时间
private val workerTimeoutMs = conf.get(WORKER_TIMEOUT) * 1000
//Number of completed applications to retain (older ones are dropped from the UI)
private val retainedApplications = conf.get(RETAINED_APPLICATIONS)
//Number of completed drivers to retain
private val retainedDrivers = conf.get(RETAINED_DRIVERS)
//Number of timeout intervals to keep a DEAD worker around before it is finally culled
private val reaperIterations = conf.get(REAPER_ITERATIONS)
//恢复模式
private val recoveryMode = conf.get(RECOVERY_MODE)
//最大执行器重试次数
private val maxExecutorRetries = conf.get(MAX_EXECUTOR_RETRIES)
//所有的Worker
val workers = new HashSet[WorkerInfo]
//应用与ID的映射
val idToApp = new HashMap[String, ApplicationInfo]
//等待执行的应用
private val waitingApps = new ArrayBuffer[ApplicationInfo]
//Master中有哪些应用
val apps = new HashSet[ApplicationInfo]
//Worker与ID的映射
private val idToWorker = new HashMap[String, WorkerInfo]
//Worker与地址的映射
private val addressToWorker = new HashMap[RpcAddress, WorkerInfo]
//Mapping from RPC endpoint (usually the application's driver) to application
private val endpointToApp = new HashMap[RpcEndpointRef, ApplicationInfo]
//应用与地址的映射
private val addressToApp = new HashMap[RpcAddress, ApplicationInfo]
//已经完成的应用
private val completedApps = new ArrayBuffer[ApplicationInfo]
//下一个应用的序号
private var nextAppNumber = 0
//所有的driver的信息
private val drivers = new HashSet[DriverInfo]
//Drivers whose applications have already completed
private val completedDrivers = new ArrayBuffer[DriverInfo]
// Drivers currently spooled for scheduling
//当前已放到后台处理的driver 等待被执行的driver
private val waitingDrivers = new ArrayBuffer[DriverInfo]
//下一个被执行的driver的序号
private var nextDriverNumber = 0
//检查主机是否只包含有效的主机名/ip而不包含端口
Utils.checkHost(address.host)
//MetricsSystem is created per instance (Master, client, worker, etc.) and consists of sources and sinks: sources collect metrics, sinks deliver them to a destination
//Create the Master's metrics system
private val masterMetricsSystem =
MetricsSystem.createMetricsSystem(MetricsSystemInstances.MASTER, conf, securityMgr)
//创建应用的metrics系统
private val applicationMetricsSystem =
MetricsSystem.createMetricsSystem(MetricsSystemInstances.APPLICATIONS, conf, securityMgr)
//指定Master的source为自身
private val masterSource = new MasterSource(this)
// After onStart, webUi will be set
//一旦启动 webUi将会被设置
private var webUi: MasterWebUI = null
//master的公开地址
private val masterPublicAddress = {
//获取DNS地址
val envVar = conf.getenv("SPARK_PUBLIC_DNS")
//如果获取到DNS地址则返回该地址 没有获取到则返回Master主机ip
if (envVar != null) envVar else address.host
}
//master的url
private val masterUrl = address.toSparkURL
//master的webui的地址
private var masterWebUiUrl: String = _
//设置状态 默认是备用状态
private var state = RecoveryState.STANDBY
//PersistenceEngine--允许Master保持从故障中恢复所需的任何状态。
//持久化引擎
private var persistenceEngine: PersistenceEngine = _
//LeaderElectionAgent--LeaderElectionAgent跟踪当前master,是所有选举代理的通用接口。
//leader选举代理
private var leaderElectionAgent: LeaderElectionAgent = _
//ScheduledFuture: a cancellable, delayed result-bearing action, typically returned when scheduling a task with a ScheduledExecutorService
//恢复完成任务
private var recoveryCompletionTask: ScheduledFuture[_] = _
//检查Worker超时任务
private var checkForWorkerTimeOutTask: ScheduledFuture[_] = _
// As a temporary workaround before better ways of configuring memory, we allow users to set
// a flag that will perform round-robin scheduling across the nodes (spreading out each app
// among all the nodes) instead of trying to consolidate each app onto a small # of nodes.
//作为配置内存的更好方法之前的临时解决方案,我们允许用户设置一个标志,
//在节点之间执行循环调度(将每个应用程序分散在所有节点中),而不是试图将每个应用整合到一小部分节点上。
//Whether to spread each application's executors out across workers
private val spreadOutApps = conf.get(SPREAD_OUT_APPS)
// Default maxCores for applications that don't specify it (i.e. pass Int.MaxValue)
//未指定的应用程序的默认maxCores(即传递Int.MaxValue)
private val defaultCores = conf.get(DEFAULT_CORES)
//UI反向代理 默认不设置反向代理
val reverseProxy = conf.get(UI_REVERSE_PROXY)
//如果设置的默认核数小于1 则抛出异常
if (defaultCores < 1) {
throw new SparkException(s"${DEFAULT_CORES.key} must be positive")
}
// Alternative application submission gateway that is stable across Spark versions
//Whether the REST submission server is enabled
private val restServerEnabled = conf.get(MASTER_REST_SERVER_ENABLED)
//The REST submission server, if enabled
private var restServer: Option[StandaloneRestServer] = None
//The port the REST server is bound to once started
private var restServerBoundPort: Option[Int] = None
{
val authKey = SecurityManager.SPARK_AUTH_SECRET_CONF
require(conf.getOption(authKey).isEmpty || !restServerEnabled,
//RestSubmissionServer不支持通过{authKey}进行身份验证。
//使用spark.master.rest.enabled=false关闭RestSubmissionServer,或者不使用身份验证。
s"The RestSubmissionServer does not support authentication via ${authKey}. Either turn " +
"off the RestSubmissionServer with spark.master.rest.enabled=false, or do not use " +
"authentication.")
}
//启动Master
override def onStart(): Unit = {
logInfo("Starting Spark master at " + masterUrl)
logInfo(s"Running Spark version ${org.apache.spark.SPARK_VERSION}")
//设置webui地址
webUi = new MasterWebUI(this, webUiPort)
webUi.bind()
masterWebUiUrl = webUi.webUrl
//如果设置反向代理
if (reverseProxy) {
//获取webui地址url
val uiReverseProxyUrl = conf.get(UI_REVERSE_PROXY_URL).map(_.stripSuffix("/"))
//如果url不为空
if (uiReverseProxyUrl.nonEmpty) {
System.setProperty("spark.ui.proxyBase", uiReverseProxyUrl.get)
// If the master URL has a path component, it must end with a slash.
// Otherwise the browser generates incorrect relative links
//如果主URL具有路径组件,则它必须以斜杠结尾。
//否则,浏览器会生成不正确的相对链接
masterWebUiUrl = uiReverseProxyUrl.get + "/"
}
//添加代理
webUi.addProxy()
//Spark Master充当反向代理。Master、Workers和Applications UI可在$masterWebUiUrl上获得
logInfo(s"Spark Master is acting as a reverse proxy. Master, Workers and " +
s"Applications UIs are available at $masterWebUiUrl")
}
//检查Worker超时任务
checkForWorkerTimeOutTask = forwardMessageThread.scheduleAtFixedRate(
() => Utils.tryLogNonFatalError { self.send(CheckForWorkerTimeOut) },
0, workerTimeoutMs, TimeUnit.MILLISECONDS)
//If the REST submission server is enabled
if (restServerEnabled) {
//Get the REST server port
val port = conf.get(MASTER_REST_SERVER_PORT)
//Create the REST submission server
restServer = Some(new StandaloneRestServer(address.host, port, conf, self, masterUrl))
}
//Start the REST server (if any) and record its bound port
restServerBoundPort = restServer.map(_.start())
//指定Master的source为自身
masterMetricsSystem.registerSource(masterSource)
//Master Metrics启动
masterMetricsSystem.start()
//应用 Metrics启动
applicationMetricsSystem.start()
// Attach the master and app metrics servlet handler to the web ui after the metrics systems are
// started.
//在Metrics系统启动后,将master和app度量servlet处理程序附加到web ui。
masterMetricsSystem.getServletHandlers.foreach(webUi.attachHandler)
applicationMetricsSystem.getServletHandlers.foreach(webUi.attachHandler)
val serializer = new JavaSerializer(conf)
//设置Master故障之后恢复模式
val (persistenceEngine_, leaderElectionAgent_) = recoveryMode match {
case "ZOOKEEPER" =>
//将恢复状态持续到ZooKeeper
logInfo("Persisting recovery state to ZooKeeper")
val zkFactory =
new ZooKeeperRecoveryModeFactory(conf, serializer)
//Master设置自己作为leader选举候选人
(zkFactory.createPersistenceEngine(), zkFactory.createLeaderElectionAgent(this))
//文件系统模式
case "FILESYSTEM" =>
val fsFactory =
new FileSystemRecoveryModeFactory(conf, serializer)
//Master设置自己作为leader选举候选人
(fsFactory.createPersistenceEngine(), fsFactory.createLeaderElectionAgent(this))
//自定义模式
case "CUSTOM" =>
val clazz = Utils.classForName(conf.get(RECOVERY_MODE_FACTORY))
val factory = clazz.getConstructor(classOf[SparkConf], classOf[Serializer])
.newInstance(conf, serializer)
.asInstanceOf[StandaloneRecoveryModeFactory]
(factory.createPersistenceEngine(), factory.createLeaderElectionAgent(this))
//单节点模式 即Master一直是leader
case _ =>
//MonarchyLeaderAgent: the single-node implementation of LeaderElectionAgent; this Master is the leader from the start
(new BlackHolePersistenceEngine(), new MonarchyLeaderAgent(this))
}
//设置Master状态持久化模式
persistenceEngine = persistenceEngine_
//设置Master故障后选举代理
leaderElectionAgent = leaderElectionAgent_
}
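Which persistence engine and election agent get built above is driven entirely by configuration. As a minimal sketch (not part of Master.scala), assuming the standard standalone HA keys documented for Spark (spark.deploy.recoveryMode, spark.deploy.zookeeper.url, spark.deploy.zookeeper.dir), enabling ZooKeeper-based recovery would look roughly like this:

import org.apache.spark.SparkConf

//Illustrative only: with these settings the recoveryMode match above takes the "ZOOKEEPER" branch,
//so the master persists workers/apps/drivers to ZooKeeper and competes for leadership there.
val conf = new SparkConf()
  .set("spark.deploy.recoveryMode", "ZOOKEEPER")
  .set("spark.deploy.zookeeper.url", "zk1:2181,zk2:2181,zk3:2181") //hypothetical quorum
  .set("spark.deploy.zookeeper.dir", "/spark")                     //znode prefix for persisted state
//Leaving spark.deploy.recoveryMode unset falls into the default branch:
//BlackHolePersistenceEngine plus MonarchyLeaderAgent, i.e. a single master with no real recovery.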
override def onStop(): Unit = {
//Flush the Master metrics to their sinks
masterMetricsSystem.report()
//Flush the application metrics to their sinks
applicationMetricsSystem.report()
// prevent the CompleteRecovery message sending to restarted master
//阻止将CompleteRecovery消息发送到重新启动的主机
if (recoveryCompletionTask != null) {
//Cancel the pending recovery-completion task
recoveryCompletionTask.cancel(true)
}
if (checkForWorkerTimeOutTask != null) {
//检查Worker超时任务停止
checkForWorkerTimeOutTask.cancel(true)
}
//转发消息线程关闭
forwardMessageThread.shutdownNow()
//ui停止
webUi.stop()
//Stop the REST submission server
restServer.foreach(_.stop())
//Stop the Master metrics system
masterMetricsSystem.stop()
//应用的Metrics系统停止
applicationMetricsSystem.stop()
//Master的持久化状态引擎关闭
persistenceEngine.close()
//选举代理关闭
leaderElectionAgent.stop()
}
//Callback from the election agent: this Master has been elected leader
override def electedLeader(): Unit = {
//Notify ourselves so the transition is handled on the message loop
self.send(ElectedLeader)
}
//被撤销的领导权
override def revokedLeadership(): Unit = {
self.send(RevokedLeadership)
}
override def receive: PartialFunction[Any, Unit] = {
//被选举的leader
case ElectedLeader =>
//持久化引擎从rpcEnv中读取应用信息 Driver信息 Worker信息
val (storedApps, storedDrivers, storedWorkers) = persistenceEngine.readPersistedData(rpcEnv)
//如果没有应用 没有注册的Driver 没有Worker 则状态置为存活 否则置为恢复中
state = if (storedApps.isEmpty && storedDrivers.isEmpty && storedWorkers.isEmpty) {
RecoveryState.ALIVE
} else {
RecoveryState.RECOVERING
}
//打印消息 已经当选为leader
logInfo("I have been elected leader! New state: " + state)
//如果状态是恢复中
if (state == RecoveryState.RECOVERING) {
//开始恢复应用 重新注册应用
beginRecovery(storedApps, storedDrivers, storedWorkers)
//恢复完成任务
recoveryCompletionTask = forwardMessageThread.schedule(new Runnable {
override def run(): Unit = Utils.tryLogNonFatalError {
//发送消息已经完全恢复
self.send(CompleteRecovery)
}
}, workerTimeoutMs, TimeUnit.MILLISECONDS)
}
//完成恢复
case CompleteRecovery => completeRecovery()
//被撤销的领导地位
case RevokedLeadership =>
logError("Leadership has been revoked -- master shutting down.")
System.exit(0)
//worker退役
case WorkerDecommissioning(id, workerRef) =>
//如果master的状态是standby
if (state == RecoveryState.STANDBY) {
//向worker发消息master是standby状态
workerRef.send(MasterInStandby)
} else {
// We use foreach since get gives us an option and we can skip the failures.
//get returns an Option, so foreach lets us silently skip an unknown worker id
//Look up the worker by id and decommission it if present
idToWorker.get(id).foreach(decommissionWorker)
}
//Decommission the given set of workers
case DecommissionWorkers(ids) =>
// The caller has already checked the state when handling DecommissionWorkersOnHosts,
// so it should not be the STANDBY
//调用方在处理DecommissionWorkersOnHosts时已经检查了状态,因此它不应该是STANDBY
//断言 如果状态是standby则抛出异常
assert(state != RecoveryState.STANDBY)
ids.foreach ( id =>
// We use foreach since get gives us an option and we can skip the failures.
idToWorker.get(id).foreach { w =>
//使该worker退役
decommissionWorker(w)
// Also send a message to the worker node to notify.
//还要向工作节点发送消息以进行通知。
w.endpoint.send(DecommissionWorker)
}
)
//注册worker
case RegisterWorker(
id, workerHost, workerPort, workerRef, cores, memory, workerWebUiUrl,
masterAddress, resources) =>
//正在注册worker
logInfo("Registering worker %s:%d with %d cores, %s RAM".format(
workerHost, workerPort, cores, Utils.megabytesToString(memory)))
//如果master的状态是standby
if (state == RecoveryState.STANDBY) {
//告知该worker状态是standby
workerRef.send(MasterInStandby)
//如果master中包含worker的id
} else if (idToWorker.contains(id)) {
//告知worker已经注册了
workerRef.send(RegisteredWorker(self, masterWebUiUrl, masterAddress, true))
} else {
//worker的资源
val workerResources = resources.map(r => r._1 -> WorkerResourceInfo(r._1, r._2.addresses))
//初始化worker的信息
val worker = new WorkerInfo(id, workerHost, workerPort, cores, memory,
workerRef, workerWebUiUrl, workerResources)
//如果worker注册成功
if (registerWorker(worker)) {
//在持久化引擎中添加该worker
persistenceEngine.addWorker(worker)
//向该worker发送消息已经注册成功 并带上master的url和地址
workerRef.send(RegisteredWorker(self, masterWebUiUrl, masterAddress, false))
//重新调度
schedule()
} else {
//初始化worker的地址
val workerAddress = worker.endpoint.address
//提示worker注册失败 尝试在该地址上重新注册
logWarning("Worker registration failed. Attempted to re-register worker at same " +
"address: " + workerAddress)
//向该worker发送消息注册失败
workerRef.send(RegisterWorkerFailed("Attempted to re-register worker at same address: "
+ workerAddress))
}
}
//注册应用
case RegisterApplication(description, driver) =>
// TODO Prevent repeated registrations from some driver
//如果是备用的Master 则不做处理
if (state == RecoveryState.STANDBY) {
// ignore, don't send response
} else {
logInfo("Registering app " + description.name)
//创建应用
val app = createApplication(description, driver)
//注册应用
registerApplication(app)
logInfo("Registered app " + description.name + " with ID " + app.id)
//持久化引擎中加入该应用
persistenceEngine.addApplication(app)
//向提交给该应用的driver发消息应用注册
driver.send(RegisteredApplication(app.id, self))
//开始调度
schedule()
}
//executor状态已改变
case ExecutorStateChanged(appId, execId, state, message, exitStatus) =>
//Look up the ExecutorDesc for this executor id within the application
val execOption = idToApp.get(appId).flatMap(app => app.executors.get(execId))
//executor id 匹配
execOption match {
case Some(exec) =>
//根据id获取到引用
val appInfo = idToApp(appId)
//获取executor状态
val oldState = exec.state
//设置executor状态
exec.state = state
//如果executor状态是运行中
if (state == ExecutorState.RUNNING) {
//Assert the previous state was LAUNCHING; any other transition to RUNNING is illegal
assert(oldState == ExecutorState.LAUNCHING,
s"executor $execId state transfer from $oldState to RUNNING is illegal")
//应用重试次数置0
appInfo.resetRetryCount()
}
//向该executor执行的任务driver发送executor状态已经改变的消息
exec.application.driver.send(ExecutorUpdated(execId, state, message, exitStatus, None))
//如果executor状态是已经完成
if (ExecutorState.isFinished(state)) {
// Remove this executor from the worker and app
//从worker和应用中移除该executor
logInfo(s"Removing executor ${exec.fullId} because it is $state")
// If an application has already finished, preserve its
// state to display its information properly on the UI
//如果应用程序已经完成,请保留其状态以在UI上正确显示其信息
if (!appInfo.isFinished) {
//应用中移除executor
appInfo.removeExecutor(exec)
}
//在该executor所在的worker中移除该executor
exec.worker.removeExecutor(exec)
//判断是否是正常退出 0则是正常退出
val normalExit = exitStatus == Some(0)
// Only retry certain number of times so we don't go into an infinite loop.
// Important note: this code path is not exercised by tests, so be very careful when
// changing this `if` condition.
// We also don't count failures from decommissioned workers since they are "expected."
//只重试一定次数,这样我们就不会进入无限循环。
//重要提示:测试不使用此代码路径,因此在更改此“if”条件时要非常小心。我们也不计算退役worker的故障,因为它们是“意料之中的”
//If the exit was abnormal, the executor was not decommissioned, and the app's retry count has reached maxExecutorRetries (a negative value disables this path)
if (!normalExit
&& oldState != ExecutorState.DECOMMISSIONED
&& appInfo.incrementRetryCount() >= maxExecutorRetries
&& maxExecutorRetries >= 0) { // < 0 disables this application-killing path
//获取所有executor
val execs = appInfo.executors.values
//If none of the application's executors is still RUNNING, give up on the application
if (!execs.exists(_.state == ExecutorState.RUNNING)) {
//提示应用已经失败了多次 移除该任务
logError(s"Application ${appInfo.desc.name} with ID ${appInfo.id} failed " +
s"${appInfo.retryCount} times; removing it")
//移除应用
removeApplication(appInfo, ApplicationState.FAILED)
}
}
}
//重新调度
schedule()
case None =>
//已获取未知executor的状态更新
logWarning(s"Got status update for unknown executor $appId/$execId")
}
//driver状态已经更新
case DriverStateChanged(driverId, state, exception) =>
//状态匹配
state match {
//如果状态是错误/结束/被杀掉/失败
case DriverState.ERROR | DriverState.FINISHED | DriverState.KILLED | DriverState.FAILED =>
//移除driver
removeDriver(driverId, state, exception)
//如果是其他状态则抛出异常
case _ =>
throw new Exception(s"Received unexpected state update for driver $driverId: $state")
}
//接收到worker的心跳
case Heartbeat(workerId, worker) =>
//进行worker匹配
idToWorker.get(workerId) match {
//如果是已经注册的worker
case Some(workerInfo) =>
//记录当前worker的上一次心跳时间是当前系统时间
workerInfo.lastHeartbeat = System.currentTimeMillis()
//如果不是已经注册的worker
case None =>
//如果能找到worker对应的id
if (workers.map(_.id).contains(workerId)) {
//提示收到未注册的worker的心跳 要求重新注册
logWarning(s"Got heartbeat from unregistered worker $workerId." +
" Asking it to re-register.")
//向worker发消息重新连接到master
worker.send(ReconnectWorker(masterUrl))
} else {
//提示收到未注册的worker的心跳并且该worker从未注册过 可以忽略该心跳
logWarning(s"Got heartbeat from unregistered worker $workerId." +
" This worker was never registered, so ignoring the heartbeat.")
}
}
//master收到变更已确认
case MasterChangeAcknowledged(appId) =>
//通过应用id找到应用并进行匹配
idToApp.get(appId) match {
//如果找到对应得应用
case Some(app) =>
//提示应用已经重新注册
logInfo("Application has been re-registered: " + appId)
//设置应用的状态是等待中
app.state = ApplicationState.WAITING
case None =>
//提示未知应用程序的master更改确认
logWarning("Master change ack from unknown app: " + appId)
}
//如果worker和应用状态正常 则完成恢复
if (canCompleteRecovery) { completeRecovery() }
//worker调度状态回应
case WorkerSchedulerStateResponse(workerId, execResponses, driverResponses) =>
idToWorker.get(workerId) match {
//如果匹配到worker
case Some(worker) =>
logInfo("Worker has been re-registered: " + workerId)
//设置worker的状态是存活
worker.state = WorkerState.ALIVE
//Keep only the executors whose application is still registered with this master
val validExecutors = execResponses.filter(
exec => idToApp.get(exec.desc.appId).isDefined)
//遍历可用的executor
for (exec <- validExecutors) {
//executor资源信息
val (execDesc, execResources) = (exec.desc, exec.resources)
//应用信息
val app = idToApp(execDesc.appId)
//应用添加executor信息
val execInfo = app.addExecutor(
worker, execDesc.cores, execResources, Some(execDesc.execId))
//worker添加executor
worker.addExecutor(execInfo)
//worker添加executor资源
worker.recoverResources(execResources)
//Copy the recovered executor's state
execInfo.copyState(execDesc)
}
//遍历driver
for (driver <- driverResponses) {
//初始化driver信息
val (driverId, driverResource) = (driver.driverId, driver.resources)
//遍历drivers列表找到该driver id的driver
drivers.find(_.id == driverId).foreach { driver =>
//添加driver的worker
driver.worker = Some(worker)
//设置driver的状态是运行中
driver.state = DriverState.RUNNING
//添加driver的资源
driver.withResources(driverResource)
//添加worker的driver资源
worker.recoverResources(driverResource)
//添加worker的driver
worker.addDriver(driver)
}
}
case None =>
//提示未知worker的计划程序状态
logWarning("Scheduler state from unknown worker: " + workerId)
}
//If no worker or app is still in UNKNOWN state, complete the recovery
if (canCompleteRecovery) { completeRecovery() }
//worker最新状态
case WorkerLatestState(workerId, executors, driverIds) =>
//通过worker id得到对应的worker
idToWorker.get(workerId) match {
//如果是已经注册的worker
case Some(worker) =>
//遍历所有的executor
for (exec <- executors) {
//判断executor是否存在
val executorMatches = worker.executors.exists {
//如果应用id和executor id都能匹配上则返回true
case (_, e) => e.application.id == exec.appId && e.id == exec.execId
}
//如果没匹配上
if (!executorMatches) {
// master doesn't recognize this executor. So just tell worker to kill it.
//master不认识这个executor。所以,只要告诉worker杀死它。
//向worker发消息杀死executor
worker.endpoint.send(KillExecutor(masterUrl, exec.appId, exec.execId))
}
}
//遍历driver
for (driverId <- driverIds) {
//判断driver是否存在
val driverMatches = worker.drivers.exists { case (id, _) => id == driverId }
//如果driver不存在
if (!driverMatches) {
// master doesn't recognize this driver. So just tell worker to kill it.
//向worker发消息杀死该driver
worker.endpoint.send(KillDriver(driverId))
}
}
case None =>
//提示worker的状态来自未知worker
logWarning("Worker state from unknown worker: " + workerId)
}
//注销应用
case UnregisterApplication(applicationId) =>
//提示收到来自应用程序的注销请求
logInfo(s"Received unregister request from application $applicationId")
//通过id找到应用并结束该应用
idToApp.get(applicationId).foreach(finishApplication)
//检查worker超时
case CheckForWorkerTimeOut =>
//检查并移除任何超时worker
timeOutDeadWorkers()
}
override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = {
//请求提交driver
case RequestSubmitDriver(description) =>
//如果状态不是存活
if (state != RecoveryState.ALIVE) {
//提示 只能接受处于“活动”状态的驱动程序提交。
val msg = s"${Utils.BACKUP_STANDALONE_MASTER_PREFIX}: $state. " +
"Can only accept driver submissions in ALIVE state."
//回复提交失败
context.reply(SubmitDriverResponse(self, false, None, msg))
} else {
logInfo("Driver submitted " + description.command.mainClass)
//创建driver
val driver = createDriver(description)
//将driver添加到持久化引擎中
persistenceEngine.addDriver(driver)
//将该driver添加到等待列表中
waitingDrivers += driver
//driver添加到drivers列表中
drivers.add(driver)
//调度资源
schedule()
// TODO: It might be good to instead have the submission client poll the master to determine
// the current status of the driver. For now it's simply "fire and forget".
//相反,最好让提交客户端轮询master以确定驱动程序的当前状态。现在,它只是“点燃并忘记”。
//回复提交driver成功
context.reply(SubmitDriverResponse(self, true, Some(driver.id),
s"Driver successfully submitted as ${driver.id}"))
}
//请求杀死driver
case RequestKillDriver(driverId) =>
//如果状态不是存活
if (state != RecoveryState.ALIVE) {
//提示只能杀死存活的driver
val msg = s"${Utils.BACKUP_STANDALONE_MASTER_PREFIX}: $state. " +
s"Can only kill drivers in ALIVE state."
//回复杀死driver失败
context.reply(KillDriverResponse(self, driverId, success = false, msg))
} else {
//提示请求杀死driver
logInfo("Asked to kill driver " + driverId)
//根据id找到需要杀死的driver
val driver = drivers.find(_.id == driverId)
//driver匹配
driver match {
//如果找到对应的driver
case Some(d) =>
//如果是等待执行的driver
if (waitingDrivers.contains(d)) {
//将该driver从等待列表中移除
waitingDrivers -= d
//Send ourselves a DriverStateChanged(KILLED) message for this driver
self.send(DriverStateChanged(driverId, DriverState.KILLED, None))
} else {
// We just notify the worker to kill the driver here. The final bookkeeping occurs
// on the return path when the worker submits a state change back to the master
// to notify it that the driver was successfully killed.
//我们只是通知worker杀死这里的driver。最终记账发生
//在返回路径上,当worker向master提交状态更改以通知其driver已成功终止时。
d.worker.foreach { w =>
//向worker发消息杀死driver
w.endpoint.send(KillDriver(driverId))
}
}
// TODO: It would be nice for this to be a synchronous response
//如果这是一个同步响应,那就太好了
//提示请求杀死driver
val msg = s"Kill request for $driverId submitted"
logInfo(msg)
//Reply that the kill request has been submitted
context.reply(KillDriverResponse(self, driverId, success = true, msg))
//如果没有匹配到对应的driver
case None =>
//提示driver已经结束或者不存在
val msg = s"Driver $driverId has already finished or does not exist"
logWarning(msg)
//回复杀死driver失败
context.reply(KillDriverResponse(self, driverId, success = false, msg))
}
}
//请求driver状态
case RequestDriverStatus(driverId) =>
//如果状态不是存活
if (state != RecoveryState.ALIVE) {
//Driver status can only be requested while the master is ALIVE
val msg = s"${Utils.BACKUP_STANDALONE_MASTER_PREFIX}: $state. " +
"Can only request driver status in ALIVE state."
//回复返回driver状态失败
context.reply(
DriverStatusResponse(found = false, None, None, None, Some(new Exception(msg))))
} else {
//在drivers列表中加上已经完成的driver列表中找
(drivers ++ completedDrivers).find(_.id == driverId) match {
//找到driver
case Some(driver) =>
//返回driver的状态
context.reply(DriverStatusResponse(found = true, Some(driver.state),
driver.worker.map(_.id), driver.worker.map(_.hostPort), driver.exception))
//没找到driver
case None =>
context.reply(DriverStatusResponse(found = false, None, None, None, None))
}
}
//请求master状态
case RequestMasterState =>
//回复master状态
context.reply(MasterStateResponse(
address.host, address.port, restServerBoundPort,
workers.toArray, apps.toArray, completedApps.toArray,
drivers.toArray, completedDrivers.toArray, state))
//Request for the ports this master has bound
case BoundPortsRequest =>
//Reply with the RPC port, web UI port and REST server port
context.reply(BoundPortsResponse(address.port, webUi.boundPort, restServerBoundPort))
//应用程序请求executor
case RequestExecutors(appId, requestedTotal) =>
context.reply(handleRequestExecutors(appId, requestedTotal))
//杀死executor
case KillExecutors(appId, executorIds) =>
//将executor id转换为整型
val formattedExecutorIds = formatExecutorIds(executorIds)
//回复 杀死executor是否成功
context.reply(handleKillExecutors(appId, formattedExecutorIds))
//使hosts上的worker退役
case DecommissionWorkersOnHosts(hostnames) =>
//如果状态不是STANDBY
if (state != RecoveryState.STANDBY) {
//回复 是否使worker退役
context.reply(decommissionWorkersOnHosts(hostnames))
} else {
//否则回复0
context.reply(0)
}
}
//已经断开连接
override def onDisconnected(address: RpcAddress): Unit = {
// The disconnected client could've been either a worker or an app; remove whichever it was
//断开连接的客户端可能是工作程序或应用程序;删除它
//提示 该地址已经断开连接
logInfo(s"$address got disassociated, removing it.")
//通过地址找到对应的worker并移除该worker
addressToWorker.get(address).foreach(removeWorker(_, s"${address} got disassociated"))
//通过地址找到对应的应用并结束应用
addressToApp.get(address).foreach(finishApplication)
//If we are RECOVERING and every worker and app has reported back, finish the recovery
if (state == RecoveryState.RECOVERING && canCompleteRecovery) { completeRecovery() }
}
//能完成恢复 如果worker中没有状态是unknown的并且应用没有状态是unknown的 则返回true 否则返回false
private def canCompleteRecovery =
workers.count(_.state == WorkerState.UNKNOWN) == 0 &&
apps.count(_.state == ApplicationState.UNKNOWN) == 0
//开始恢复
private def beginRecovery(storedApps: Seq[ApplicationInfo], storedDrivers: Seq[DriverInfo],
storedWorkers: Seq[WorkerInfo]): Unit = {
//遍历存储的应用信息
for (app <- storedApps) {
logInfo("Trying to recover app: " + app.id)
try {
//重新注册应用
registerApplication(app)
//将该应用的状态设置成unknown
app.state = ApplicationState.UNKNOWN
//向提交给该应用的driver发消息 告知其Master已经改为当前Master
app.driver.send(MasterChanged(self, masterWebUiUrl))
} catch {
//If re-registering the app fails, just log it
case e: Exception => logInfo("App " + app.id + " had exception on reconnect")
}
}
//遍历存储的driver
for (driver <- storedDrivers) {
// Here we just read in the list of drivers. Any drivers associated with now-lost workers
// will be re-launched when we detect that the worker is missing.
//在这里,我们刚刚阅读了driver列表。当我们检测到worker失踪时,任何与现在失踪的worker相关的driver都将重新启动。
drivers += driver
}
//遍历存储的worker信息
for (worker <- storedWorkers) {
//尝试恢复worker
logInfo("Trying to recover worker: " + worker.id)
try {
//worker注册
registerWorker(worker)
//worker的状态设置成unknown
worker.state = WorkerState.UNKNOWN
//向worker所在的端点发送Master已经改变的消息
worker.endpoint.send(MasterChanged(self, masterWebUiUrl))
} catch {
//If reconnecting the worker fails, just log it
case e: Exception => logInfo("Worker " + worker.id + " had exception on reconnect")
}
}
}
//完成恢复
private def completeRecovery(): Unit = {
// Ensure "only-once" recovery semantics using a short synchronization period.
//确保使用短同步期的“仅一次”恢复语义。
//如果状态不是在恢复中 则返回
if (state != RecoveryState.RECOVERING) { return }
//Mark that we are in the process of completing recovery
state = RecoveryState.COMPLETING_RECOVERY
// Kill off any workers and apps that didn't respond to us.
//杀死所有没有回应我们的worker和应用程序。
workers.filter(_.state == WorkerState.UNKNOWN).foreach(
//移除所有状态是unknown的worker
removeWorker(_, "Not responding for recovery"))
//如果应用状态是unknown 则结束应用
apps.filter(_.state == ApplicationState.UNKNOWN).foreach(finishApplication)
// Update the state of recovered apps to RUNNING
//将恢复的应用程序的状态更新为RUNNING
apps.filter(_.state == ApplicationState.WAITING).foreach(_.state = ApplicationState.RUNNING)
// Reschedule drivers which were not claimed by any workers
//重新安排没有任何worker认领的driver
drivers.filter(_.worker.isEmpty).foreach { d =>
//Warn that the driver was not claimed by any worker after master recovery
logWarning(s"Driver ${d.id} was not found after master recovery")
//如果driver被管理了
if (d.desc.supervise) {
logWarning(s"Re-launching ${d.id}")
//重新启动该driver
relaunchDriver(d)
} else {
//移除driver
removeDriver(d.id, DriverState.ERROR, None)
//打印 没有重启该driver 因为它没有被管理
logWarning(s"Did not re-launch ${d.id} because it was not supervised")
}
}
//状态设置成存活
state = RecoveryState.ALIVE
//调度资源
schedule()
//恢复完成
logInfo("Recovery complete - resuming operations!")
}
/**
* Schedule executors to be launched on the workers.
* Returns an array containing number of cores assigned to each worker.
*
* There are two modes of launching executors. The first attempts to spread out an application's
* executors on as many workers as possible, while the second does the opposite (i.e. launch them
* on as few workers as possible). The former is usually better for data locality purposes and is
* the default.
*
* The number of cores assigned to each executor is configurable. When this is explicitly set,
* multiple executors from the same application may be launched on the same worker if the worker
* has enough cores and memory. Otherwise, each executor grabs all the cores available on the
* worker by default, in which case only one executor per application may be launched on each
* worker during one single schedule iteration.
* Note that when `spark.executor.cores` is not set, we may still launch multiple executors from
* the same application on the same worker. Consider appA and appB both have one executor running
* on worker1, and appA.coresLeft > 0, then appB is finished and release all its cores on worker1,
* thus for the next schedule iteration, appA launches a new executor that grabs all the free
* cores on worker1, therefore we get multiple executors from appA running on worker1.
*
* It is important to allocate coresPerExecutor on each worker at a time (instead of 1 core
* at a time). Consider the following example: cluster has 4 workers with 16 cores each.
* User requests 3 executors (spark.cores.max = 48, spark.executor.cores = 16). If 1 core is
* allocated at a time, 12 cores from each worker would be assigned to each executor.
* Since 12 < 16, no executors would launch [SPARK-8881].
*/
//计划executor将启动到worker身上。返回一个数组,该数组包含分配给每个worker的核数。
//有两种启动执行器的模式。第一种方法试图将应用程序的executor分配给尽可能多的worker,
//而第二种方法则相反(即在尽可能少的worker上启动它们)。
//前者通常更适合用于数据本地化,并且是默认值。
//分配给每个executor的内核数量是可配置的。
//如果显式设置了这一点,则如果同一个worker具有足够的核心和内存,则可以在该worker上启动来自同一应用程序的多个executor。
//否则,默认情况下,每个executor都会获取worker上可用的所有核心,
//在这种情况下,在一次调度迭代期间,每个应用程序只能在每个worker上启动一个executor。
//请注意,当未设置“spark.executor.cores”时,我们仍然可以在同一worker上从同一应用程序启动多个executor。
//假设appA和appB都有一个executor在worker1上运行,并且appA.coresLeft>0,那么appB就完成了,
//并释放了worker1上的所有核心,因此在下一次计划迭代中,appA启动了一个新的executor,
//该executor会获取worker1中的所有空闲核心,因此我们从运行在worker1的appA中获得多个executor。
//一次在每个工作线程上分配coresPerExecutor是很重要的(而不是一次分配一个core)。
//考虑以下示例:集群有4个worker,每个worker有16个核心。
//用户请求3个executor(spark.cores.max=48,spark.executor.cores=16)。
//如果一次分配1个核心,则每个worker的12个核心将被分配给每个executor。由于12<16,没有executor会启动[SPARK-8881]。
//在worker上启动executor
private def scheduleExecutorsOnWorkers(
app: ApplicationInfo,
//可用的worker
usableWorkers: Array[WorkerInfo],
//是否展开应用程序
spreadOutApps: Boolean): Array[Int] = {
//获取每个executor分配的core
val coresPerExecutor = app.desc.coresPerExecutor
//每个executor最少需要的核数
val minCoresPerExecutor = coresPerExecutor.getOrElse(1)
//每个worker上是否有一个executor
val oneExecutorPerWorker = coresPerExecutor.isEmpty
//每个executor需要的内存
val memoryPerExecutor = app.desc.memoryPerExecutorMB
//每个executor需要的资源
val resourceReqsPerExecutor = app.desc.resourceReqsPerExecutor
//可用的worker数
val numUsable = usableWorkers.length
//worker已经分配的核数
val assignedCores = new Array[Int](numUsable) // Number of cores to give to each worker
//每个worker上已经分配的executor数
val assignedExecutors = new Array[Int](numUsable) // Number of new executors on each worker
//需要分配的核数 取应用剩下的和可用worker空闲的之中小的
var coresToAssign = math.min(app.coresLeft, usableWorkers.map(_.coresFree).sum)
/** Return whether the specified worker can launch an executor for this app. */
//返回指定的worker是否可以启动此应用程序的executor。
//判断worker能否为该应用启动executor
def canLaunchExecutorForApp(pos: Int): Boolean = {
//判断能否调度 如果需要分配的核数大于每个executor最少拥有的核数 则为true
val keepScheduling = coresToAssign >= minCoresPerExecutor
//判断是否有足够的核数 如果pos位置的worker有的空闲核数减去已经分配的核数大于每个executor最少需要的核数 则为true
val enoughCores = usableWorkers(pos).coresFree - assignedCores(pos) >= minCoresPerExecutor
//获取worker上已经分配的executor数
val assignedExecutorNum = assignedExecutors(pos)
// If we allow multiple executors per worker, then we can always launch new executors.
// Otherwise, if there is already an executor on this worker, just give it more cores.
//如果我们允许每个worker有多个executor,那么我们总是可以启动新的executor。
//否则,如果这个worker上已经有一个executor,只需给它更多的core。
//True if multiple executors per worker are allowed (spark.executor.cores is set), or if no executor has been assigned on this worker yet
val launchingNewExecutor = !oneExecutorPerWorker || assignedExecutorNum == 0
if (launchingNewExecutor) {
//已经分配的内存 已经分配的executor乘以每个executor分配的内存
val assignedMemory = assignedExecutorNum * memoryPerExecutor
//判断是否有足够的内存 worker可用的空闲内存减去已经分配的内存大于每个executor需要的内存
val enoughMemory = usableWorkers(pos).memoryFree - assignedMemory >= memoryPerExecutor
//已经分配的资源
val assignedResources = resourceReqsPerExecutor.map {
req => req.resourceName -> req.amount * assignedExecutorNum
}.toMap
//空闲资源
val resourcesFree = usableWorkers(pos).resourcesAmountFree.map {
case (rName, free) => rName -> (free - assignedResources.getOrElse(rName, 0))
}
//判断资源是否满足要求 如果可用资源多于需要的资源
val enoughResources = ResourceUtils.resourcesMeetRequirements(
resourcesFree, resourceReqsPerExecutor)
//判断是否低于限制 如果已经分配的executor加上应用目前已有的executor数小于应用executor限制数 则为true
val underLimit = assignedExecutors.sum + app.executors.size < app.executorLimit
//只有当各种资源都符合要求时才会返回true
keepScheduling && enoughCores && enoughMemory && enoughResources && underLimit
} else {
// We're adding cores to an existing executor, so no need
// to check memory and executor limits
//我们正在向现有的executor添加核心,因此无需检查内存和执行器限制
//只需要有足够的核数并且核数多于executor最小需求数
keepScheduling && enoughCores
}
}
// Keep launching executors until no more workers can accommodate any
// more executors, or if we have reached this application's limits
//继续启动executor,直到没有更多的worker可以容纳任何更多的executor,或者如果我们已经达到了申请的限制
//获取可用的worker
var freeWorkers = (0 until numUsable).filter(canLaunchExecutorForApp)
//当有可用的worker时
while (freeWorkers.nonEmpty) {
//遍历每个可用的worker
freeWorkers.foreach { pos =>
//初始化默认可以继续调度
var keepScheduling = true
//当可以继续调度并且可以为应用启动executor
while (keepScheduling && canLaunchExecutorForApp(pos)) {
//需要分配的核数中减去executor启动需要的最少核数
coresToAssign -= minCoresPerExecutor
//已经分配的核数中加上executor启动需要的最少核数
assignedCores(pos) += minCoresPerExecutor
// If we are launching one executor per worker, then every iteration assigns 1 core
// to the executor. Otherwise, every iteration assigns cores to a new executor.
//如果我们为每个worker启动一个executor,那么每次迭代都会为executor分配一个核心。否则,每次迭代都会将核心分配给一个新的executor。
//With one executor per worker (spark.executor.cores unset), the single executor simply absorbs the extra cores
if (oneExecutorPerWorker) {
assignedExecutors(pos) = 1
} else {
//Otherwise every iteration starts a new executor
assignedExecutors(pos) += 1
}
// Spreading out an application means spreading out its executors across as
// many workers as possible. If we are not spreading out, then we should keep
// scheduling executors on this worker until we use all of its resources.
// Otherwise, just move on to the next worker.
//分散应用程序意味着将其executor分散为尽可能多的worker。
//如果我们不分散,那么我们应该继续为这个worker安排executor,直到我们使用了它的所有资源。否则,请转到下一个worker。
//In spread-out mode, allocate on this worker only once per pass, then move on to the next worker
if (spreadOutApps) {
//Stop scheduling on this worker for now
keepScheduling = false
}
}
}
//Re-filter the free workers, keeping only those that can still accept another executor for this app
freeWorkers = freeWorkers.filter(canLaunchExecutorForApp)
}
//返回已经分配的核数
assignedCores
}
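To make the SPARK-8881 example from the comment above concrete, here is a small, self-contained sketch (not part of Master.scala; the worker and core counts come straight from that comment) that replays the spread-out allocation loop with two different step sizes:

object ScheduleSketch {
  //Mimics the spread-out branch of scheduleExecutorsOnWorkers: on every pass each eligible
  //worker receives `step` cores, until the cores run out or no worker can fit another step.
  def allocate(freeCores: Array[Int], coresLeft: Int, step: Int): Array[Int] = {
    val assigned = Array.fill(freeCores.length)(0)
    var toAssign = math.min(coresLeft, freeCores.sum)
    def canUse(i: Int): Boolean = toAssign >= step && freeCores(i) - assigned(i) >= step
    var free = freeCores.indices.filter(canUse)
    while (free.nonEmpty) {
      free.foreach { i => if (canUse(i)) { toAssign -= step; assigned(i) += step } }
      free = free.filter(canUse)
    }
    assigned
  }

  def main(args: Array[String]): Unit = {
    val workers = Array(16, 16, 16, 16) //4 workers with 16 free cores each
    //spark.cores.max = 48, spark.executor.cores = 16: hand out 16 cores at a time,
    //so three workers each end up hosting one full 16-core executor.
    println(allocate(workers, coresLeft = 48, step = 16).mkString(", ")) //16, 16, 16, 0
    //Handing out 1 core at a time instead strands 12 cores on every worker;
    //since 12 < 16, no executor could ever launch, which is exactly the bug SPARK-8881 fixed.
    println(allocate(workers, coresLeft = 48, step = 1).mkString(", "))  //12, 12, 12, 12
  }
}

This is why the real loop above always subtracts minCoresPerExecutor per iteration rather than a single core.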
/**
* Schedule and launch executors on workers
*/
//在worker中安排和启动executor
private def startExecutorsOnWorkers(): Unit = {
// Right now this is a very simple FIFO scheduler. We keep trying to fit in the first app
// in the queue, then the second app, etc.
//现在这是一个非常简单的FIFO调度器。我们一直在努力适应队列中的第一个应用程序,然后是第二个应用程序等。
//遍历等待执行的应用列表
for (app <- waitingApps) {
//Cores required per executor; defaults to 1 when spark.executor.cores is not set
val coresPerExecutor = app.desc.coresPerExecutor.getOrElse(1)
// If the cores left is less than the coresPerExecutor,the cores left will not be allocated
//如果剩余的内核小于coresPerExecutor,则不会分配剩余的内核
if (app.coresLeft >= coresPerExecutor) {
// Filter out workers that don't have enough resources to launch an executor
//筛选出没有足够资源启动executor的worker
//找出符合条件的worker 即资源满足条件的
val usableWorkers = workers.toArray.filter(_.state == WorkerState.ALIVE)
.filter(canLaunchExecutor(_, app.desc))
.sortBy(_.coresFree).reverse
//应用程序是否可能挂起 等待应用只有一个且唯一的应用的executor是空的且可用的worker是空的
val appMayHang = waitingApps.length == 1 &&
waitingApps.head.executors.isEmpty && usableWorkers.isEmpty
//如果应用可能挂起
if (appMayHang) {
//提示应用程序需要的资源超过任何一个worker拥有的
logWarning(s"App ${app.id} requires more resource than any of Workers could have.")
}
//worker分配给该应用程序的核数
val assignedCores = scheduleExecutorsOnWorkers(app, usableWorkers, spreadOutApps)
// Now that we've decided how many cores to allocate on each worker, let's allocate them
//现在我们已经决定给每个worker分配多少core了,接下来开始分配
for (pos <- 0 until usableWorkers.length if assignedCores(pos) > 0) {
//将worker的资源分配给executor
allocateWorkerResourceToExecutors(
app, assignedCores(pos), app.desc.coresPerExecutor, usableWorkers(pos))
}
}
}
}
/**
* Allocate a worker's resources to one or more executors.
* @param app the info of the application which the executors belong to
* @param assignedCores number of cores on this worker for this application
* @param coresPerExecutor number of cores per executor
* @param worker the worker info
*/
//分配worker的资源给一个或者多个executor
//app执行者所属的应用程序的信息
//assignedCores表示此工作线程上用于此应用程序的核心数
//coresPerExecutor表示每个executor的核数
//worker表示worker的信息
private def allocateWorkerResourceToExecutors(
app: ApplicationInfo,
assignedCores: Int,
coresPerExecutor: Option[Int],
worker: WorkerInfo): Unit = {
// If the number of cores per executor is specified, we divide the cores assigned
// to this worker evenly among the executors with no remainder.
// Otherwise, we launch a single executor that grabs all the assignedCores on this worker.
//每个executor的内核数如果指定了
//我们将分配给该worker的内核在executor中平均分配,没有余数。
//否则,我们将启动一个单独的executor来获取该worker上所有的assignedCores。
//If coresPerExecutor is set, the number of executors is assignedCores / coresPerExecutor
val numExecutors = coresPerExecutor.map { assignedCores / _ }.getOrElse(1)
//Each executor gets coresPerExecutor cores, or all of the assignedCores when it is not set
val coresToAssign = coresPerExecutor.getOrElse(assignedCores)
//遍历每个executor
for (i <- 1 to numExecutors) {
//获取worker分配给该应用程序的资源
val allocated = worker.acquireResources(app.desc.resourceReqsPerExecutor)
//在应用中加入executor
val exec = app.addExecutor(worker, coresToAssign, allocated)
//启动worker上的executor
launchExecutor(worker, exec)
//设置应用程序的状态是运行中
app.state = ApplicationState.RUNNING
}
}
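A quick worked example of the split above, evaluated in a Scala REPL (the numbers are made up for illustration):

val assignedCores = 12                      //decided by scheduleExecutorsOnWorkers for this worker
val coresPerExecutor: Option[Int] = Some(4) //spark.executor.cores, if the application set it
val numExecutors = coresPerExecutor.map(assignedCores / _).getOrElse(1) //12 / 4 = 3 executors
val coresToAssign = coresPerExecutor.getOrElse(assignedCores)           //4 cores each
//With coresPerExecutor = None the same worker instead gets a single executor holding all 12 cores.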
//能否启动
private def canLaunch(
worker: WorkerInfo,
memoryReq: Int,
coresReq: Int,
resourceRequirements: Seq[ResourceRequirement])
: Boolean = {
//判断worker的内存是否大于给定的内存
val enoughMem = worker.memoryFree >= memoryReq
//判断worker的核数是否大于给定的核数
val enoughCores = worker.coresFree >= coresReq
//判断worker空闲空间能否装载需要的资源
val enoughResources = ResourceUtils.resourcesMeetRequirements(
worker.resourcesAmountFree, resourceRequirements)
//条件都满足则可以启动
enoughMem && enoughCores && enoughResources
}
/**
* @return whether the worker could launch the driver represented by DriverDescription
*/
//worker是否可以启动driver DriverDescription表示的驱动程序
private def canLaunchDriver(worker: WorkerInfo, desc: DriverDescription): Boolean = {
canLaunch(worker, desc.mem, desc.cores, desc.resourceReqs)
}
/**
* @return whether the worker could launch the executor according to application's requirement
*/
//worker是否可以根据应用程序的要求启动executor
private def canLaunchExecutor(worker: WorkerInfo, desc: ApplicationDescription): Boolean = {
canLaunch(
worker,
desc.memoryPerExecutorMB,
desc.coresPerExecutor.getOrElse(1),
desc.resourceReqsPerExecutor)
}
/**
* Schedule the currently available resources among waiting apps. This method will be called
* every time a new app joins or resource availability changes.
*/
//在等待的应用程序中安排当前可用的资源。每当有新应用加入或资源可用性发生变化时,都会调用此方法。
private def schedule(): Unit = {
//如果状态不是存活 则返回
if (state != RecoveryState.ALIVE) {
return
}
// Drivers take strict precedence over executors
//driver优先于executor
//Random.shuffle--打乱组列表中元素的位置
//打乱worker信息列表中存活的worker的位置
val shuffledAliveWorkers = Random.shuffle(workers.toSeq.filter(_.state == WorkerState.ALIVE))
//获取存活的worker的数量
val numWorkersAlive = shuffledAliveWorkers.size
//当前位置0
var curPos = 0
//遍历等待中的driver列表
for (driver <- waitingDrivers.toList) { // iterate over a copy of waitingDrivers
// We assign workers to each waiting driver in a round-robin fashion. For each driver, we
// start from the last worker that was assigned a driver, and continue onwards until we have
// explored all alive workers.
//迭代遍历waitingDrivers的副本。我们以循环的方式为每个等候的driver分配worker。
//对于每个driver,我们从最后一个被分配driver的worker开始,然后继续,直到我们探索了所有活着的worker。
//设置启动状态为false
var launched = false
//Assume the cluster is idle until we see a worker that is running something
var isClusterIdle = true
//Number of workers visited so far
var numWorkersVisited = 0
//While we have not visited every alive worker and the driver has not been launched
while (numWorkersVisited < numWorkersAlive && !launched) {
//取打乱位置的组中curpos位置的worker
val worker = shuffledAliveWorkers(curPos)
//The cluster counts as idle only if this worker runs no drivers and no executors
isClusterIdle = worker.drivers.isEmpty && worker.executors.isEmpty
//可访问的worker数量加1
numWorkersVisited += 1
//判断如果worker能启动driver
if (canLaunchDriver(worker, driver.desc)) {
//Acquire the driver's required resources from this worker
val allocated = worker.acquireResources(driver.desc.resourceReqs)
//设置driver的资源
driver.withResources(allocated)
//在worker上启动driver
launchDriver(worker, driver)
//等待的driver列表中去掉已经启动的driver
waitingDrivers -= driver
//设置该driver的启动状态为true
launched = true
}
//Advance the position, wrapping around the list of alive workers
curPos = (curPos + 1) % numWorkersAlive
}
//如果没有启动且集群还是空闲
if (!launched && isClusterIdle) {
//打印消息 driver需要比任何worker更多的资源
logWarning(s"Driver ${driver.id} requires more resource than any of Workers could have.")
}
}
//启动worker上的executor
startExecutorsOnWorkers()
}
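The driver placement above is plain round-robin over a shuffled list of alive workers. Stripped of the Master's bookkeeping, the loop boils down to the following sketch (the worker sizes and the driver's demands are invented for illustration):

import scala.util.Random

object DriverPlacementSketch {
  case class FakeWorker(name: String, freeMemMb: Int, freeCores: Int)

  def fits(w: FakeWorker, memMb: Int, cores: Int): Boolean =
    w.freeMemMb >= memMb && w.freeCores >= cores

  def main(args: Array[String]): Unit = {
    val alive = Random.shuffle(Seq(
      FakeWorker("worker-1", 1024, 1),
      FakeWorker("worker-2", 4096, 4),
      FakeWorker("worker-3", 2048, 2)))
    var curPos = 0
    var launched = false
    var visited = 0
    //Visit each alive worker at most once, starting from curPos, until one can host the driver.
    while (visited < alive.size && !launched) {
      val w = alive(curPos)
      visited += 1
      if (fits(w, memMb = 2048, cores = 2)) {
        println(s"launching driver on ${w.name}")
        launched = true
      }
      curPos = (curPos + 1) % alive.size
    }
    if (!launched) println("driver requires more resources than any alive worker has")
  }
}

In the real code curPos is kept across drivers, so each waiting driver starts probing from where the previous one left off, which is what spreads drivers across workers.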
//在worker上启动executor
private def launchExecutor(worker: WorkerInfo, exec: ExecutorDesc): Unit = {
//提示 在worker上启动executor
logInfo("Launching executor " + exec.fullId + " on worker " + worker.id)
//在worker上加入executor
worker.addExecutor(exec)
//向worker发消息启动了新的executor
worker.endpoint.send(LaunchExecutor(masterUrl, exec.application.id, exec.id,
exec.application.desc, exec.cores, exec.memory, exec.resources))
//向提交该应用的driver发消息executor已经添加
exec.application.driver.send(
ExecutorAdded(exec.id, worker.id, worker.hostPort, exec.cores, exec.memory))
}
//注册worker
private def registerWorker(worker: WorkerInfo): Boolean = {
// There may be one or more refs to dead workers on this same node (w/ different ID's),
// remove them.
//同一节点上可能有一个或多个引用指向已死亡的工作人员(具有不同的ID),请删除它们。
//在workers中移除之前的引用
workers.filter { w =>
(w.host == worker.host && w.port == worker.port) && (w.state == WorkerState.DEAD)
}.foreach { w =>
workers -= w
}
//获取worker的地址
val workerAddress = worker.endpoint.address
//如果包含该worker的地址
if (addressToWorker.contains(workerAddress)) {
//获取旧worker
val oldWorker = addressToWorker(workerAddress)
//如果旧worker的状态未知
if (oldWorker.state == WorkerState.UNKNOWN) {
// A worker registering from UNKNOWN implies that the worker was restarted during recovery.
// The old worker must thus be dead, so we will remove it and accept the new worker.
//从UNKNOWN注册的工作程序意味着该worker在恢复过程中已重新启动。
//因此,旧worker肯定已经死了,所以我们将把它移走,接受新worker。
//移除worker
removeWorker(oldWorker, "Worker replaced by a new worker with same address")
} else {
logInfo("Attempted to re-register worker at same address: " + workerAddress)
return false
}
}
//Add the worker to the workers set
workers += worker
//添加worker和id的映射
idToWorker(worker.id) = worker
//添加worker和地址的映射
addressToWorker(workerAddress) = worker
true
}
/**
* Decommission all workers that are active on any of the given hostnames. The decommissioning is
* asynchronously done by enqueueing WorkerDecommission messages to self. No checks are done about
* the prior state of the worker. So an already decommissioned worker will match as well.
*
* @param hostnames: A list of hostnames without the ports. Like "localhost", "foo.bar.com" etc
*
* Returns the number of workers that matched the hostnames.
*/
//停用任何给定主机名上活动的所有worker。停用是通过将WorkerDecommission消息排队到self异步完成的。
//不检查worker先前的状态。因此,一个已经退役的工人也会匹配。
//hostnames:不带端口的主机名列表。像“localhost”、“foo.bar.com”等
//返回与主机名匹配的worker数。
private def decommissionWorkersOnHosts(hostnames: Seq[String]): Integer = {
//将hostname转化为小写
val hostnamesSet = hostnames.map(_.toLowerCase(Locale.ROOT)).toSet
//找到需要移除的worker
val workersToRemove = addressToWorker
.filterKeys(addr => hostnamesSet.contains(addr.host.toLowerCase(Locale.ROOT)))
.values
//找到需要移除的worker的端口
val workersToRemoveHostPorts = workersToRemove.map(_.hostPort)
//提示 正在使该host:port退役
logInfo(s"Decommissioning the workers with host:ports ${workersToRemoveHostPorts}")
// The workers are removed async to avoid blocking the receive loop for the entire batch
//异步移除工作进程,以避免阻塞整个批处理的接收循环
//Send ourselves a DecommissionWorkers message for these workers (handled asynchronously)
self.send(DecommissionWorkers(workersToRemove.map(_.id).toSeq))
// Return the count of workers actually removed
//返回实际移除的worker数量
workersToRemove.size
}
//使某个worker退役
private def decommissionWorker(worker: WorkerInfo): Unit = {
//如果worker不是退役状态
if (worker.state != WorkerState.DECOMMISSIONED) {
//正在退役的worker是..
logInfo("Decommissioning worker %s on %s:%d".format(worker.id, worker.host, worker.port))
//标记worker状态是退役
worker.setState(WorkerState.DECOMMISSIONED)
//遍历该worker中的executor
for (exec <- worker.executors.values) {
logInfo("Telling app of decommission executors")
//向executor正在执行的应用的driver发消息 该executor要退役了
exec.application.driver.send(ExecutorUpdated(
exec.id, ExecutorState.DECOMMISSIONED,
Some("worker decommissioned"), None,
// worker host is being set here to let the driver know that the host (aka. worker)
// is also being decommissioned. So the driver can unregister all the shuffle map
// statues located at this host when it receives the executor lost event.
//The worker host is set here so the driver knows that the host (i.e. the worker) is also being decommissioned,
//and can unregister all shuffle map statuses located on that host when it receives the executor-lost event.
Some(worker.host)))
//设置executor退役
exec.state = ExecutorState.DECOMMISSIONED
//通知应用程序移除executor
exec.application.removeExecutor(exec)
}
// On recovery do not add a decommissioned executor
//恢复时不要添加已停用的执行器
persistenceEngine.removeWorker(worker)
} else {
//如果该worker已经退役
logWarning("Skipping decommissioning worker %s on %s:%d as worker is already decommissioned".
format(worker.id, worker.host, worker.port))
}
}
//移除worker
private def removeWorker(worker: WorkerInfo, msg: String): Unit = {
logInfo("Removing worker " + worker.id + " on " + worker.host + ":" + worker.port)
//设置worker的状态是死亡
worker.setState(WorkerState.DEAD)
//移除worker的id
idToWorker -= worker.id
//移除worker的地址
addressToWorker -= worker.endpoint.address
//遍历该worker中的executor
for (exec <- worker.executors.values) {
logInfo("Telling app of lost executor: " + exec.id)
//Tell the driver of the application that owns this executor that the executor is lost
exec.application.driver.send(ExecutorUpdated(
exec.id, ExecutorState.LOST, Some("worker lost"), None, Some(worker.host)))
//设置executor的状态是丢失
exec.state = ExecutorState.LOST
//应用程序移除该executor
exec.application.removeExecutor(exec)
}
//遍历该worker中的driver
for (driver <- worker.drivers.values) {
//如果该driver被管理
if (driver.desc.supervise) {
logInfo(s"Re-launching ${driver.id}")
//重新启动该driver
relaunchDriver(driver)
} else {
//提示不是重启该driver 因为该driver没有被管理
logInfo(s"Not re-launching ${driver.id} because it was not supervised")
//移除该状态异常的driver
removeDriver(driver.id, DriverState.ERROR, None)
}
}
logInfo(s"Telling app of lost worker: " + worker.id)
//遍历未完成的应用列表
apps.filterNot(completedApps.contains(_)).foreach { app =>
//向提交该应用的driver发消息该worker已经被移除了
app.driver.send(WorkerRemoved(worker.id, worker.host, msg))
}
//从持久化引擎中移除该worker
persistenceEngine.removeWorker(worker)
//重新调度
schedule()
}
//重新启动该driver
private def relaunchDriver(driver: DriverInfo): Unit = {
// We must setup a new driver with a new driver id here, because the original driver may
// be still running. Consider this scenario: a worker is network partitioned with master,
// the master then relaunches driver driverID1 with a driver id driverID2, then the worker
// reconnects to master. From this point on, if driverID2 is equal to driverID1, then master
// can not distinguish the statusUpdate of the original driver and the newly relaunched one,
// for example, when DriverStateChanged(driverID1, KILLED) arrives at master, master will
// remove driverID1, so the newly relaunched driver disappears too. See SPARK-19900 for details.
//We must set up a new driver with a new driver id here, because the original driver may still be running.
//Consider this scenario: a worker becomes network-partitioned from the master, the master then relaunches
//driver driverID1 under a new id driverID2, and the worker later reconnects. From that point on, if driverID2
//were equal to driverID1, the master could not tell the status updates of the original driver apart from those
//of the relaunched one: when DriverStateChanged(driverID1, KILLED) arrives, the master would remove driverID1,
//and the newly relaunched driver would disappear with it. See SPARK-19900 for details.
//移除driver
removeDriver(driver.id, DriverState.RELAUNCHING, None)
//新建driver
val newDriver = createDriver(driver.desc)
//在持久化引擎中添加该driver
persistenceEngine.addDriver(newDriver)
//在drivers列表中添加driver
drivers.add(newDriver)
//将该driver添加到等待列表中
waitingDrivers += newDriver
//重新调度driver
schedule()
}
//创建应用
private def createApplication(desc: ApplicationDescription, driver: RpcEndpointRef):
ApplicationInfo = {
val now = System.currentTimeMillis()
val date = new Date(now)
val appId = newApplicationId(date)
new ApplicationInfo(now, appId, desc, date, driver, defaultCores)
}
//注册应用
private def registerApplication(app: ApplicationInfo): Unit = {
//初始化应用地址
val appAddress = app.driver.address
//如果应用地址列表中包含该地址
if (addressToApp.contains(appAddress)) {
//提示在同一个地址上尝试重新注册应用
logInfo("Attempted to re-register application at same address: " + appAddress)
return
}
//Register the application's source with the application metrics system
applicationMetricsSystem.registerSource(app.appSource)
//在apps应用列表中加入该应用
apps += app
//添加应用id
idToApp(app.id) = app
//添加应用程序的端点
endpointToApp(app.driver) = app
//添加应用程序的地址
addressToApp(appAddress) = app
//将该应用添加到等待执行的应用列表中
waitingApps += app
}
//结束应用
private def finishApplication(app: ApplicationInfo): Unit = {
//移除应用
removeApplication(app, ApplicationState.FINISHED)
}
//移除应用
def removeApplication(app: ApplicationInfo, state: ApplicationState.Value): Unit = {
//如果应用列表中包含该应用
if (apps.contains(app)) {
//提示正在移除该应用
logInfo("Removing app " + app.id)
//应用列表中去掉该应用
apps -= app
//移除该应用id
idToApp -= app.id
//移除提交该应用的driver
endpointToApp -= app.driver
//移除提交该应用的driver的地址
addressToApp -= app.driver.address
//If the number of completed applications has reached the retention limit (retainedApplications), drop the oldest ones
if (completedApps.size >= retainedApplications) {
//待移除的应用数量
val toRemove = math.max(retainedApplications / 10, 1)
//在已经完成的应用列表中选取指定数量的应用
completedApps.take(toRemove).foreach { a =>
//Remove the application's source from the application metrics system
applicationMetricsSystem.removeSource(a.appSource)
}
//已经完成的应用列表中移除前toRemove个
completedApps.trimStart(toRemove)
}
//将该应用添加到已经完成的应用列表中
completedApps += app // Remember it in our history
//在应用等待列表中去掉该应用
waitingApps -= app
//遍历该应用下的所有executor
for (exec <- app.executors.values) {
//杀死executor
killExecutor(exec)
}
//标记应用的状态
app.markFinished(state)
//如果应用的状态不是已经结束
if (state != ApplicationState.FINISHED) {
//向提交该应用的driver发消息该应用已经被移除了
app.driver.send(ApplicationRemoved(state.toString))
}
//从持久化引擎中移除该应用
persistenceEngine.removeApplication(app)
//重新调度
schedule()
// Tell all workers that the application has finished, so they can clean up any app state.
//告诉所有worker应用程序已完成,以便他们可以清除任何应用程序状态。
workers.foreach { w =>
//向worker发消息应用已经结束
w.endpoint.send(ApplicationFinished(app.id))
}
}
}
/**
* Handle a request to set the target number of executors for this application.
*
* If the executor limit is adjusted upwards, new executors will be launched provided
* that there are workers with sufficient resources. If it is adjusted downwards, however,
* we do not kill existing executors until we explicitly receive a kill request.
*
* @return whether the application has previously registered with this Master.
*/
//处理设置此应用程序的executor目标数量的请求。
//如果executor限额上调,只要有足够资源的worker,就会推出新的executor。
//然而,如果它向下调整,我们不会杀死现有的executor,直到我们明确地收到一个杀死请求。
private def handleRequestExecutors(appId: String, requestedTotal: Int): Boolean = {
//通过id找到应用
idToApp.get(appId) match {
//匹配应用
case Some(appInfo) =>
//提示应用程序请求将executor总数设置为$requestedTotal。
logInfo(s"Application $appId requested to set total executors to $requestedTotal.")
//应用的executor总数设置为requestedTotal
appInfo.executorLimit = requestedTotal
//重新调度
schedule()
//返回true
true
//如果没有匹配到
case None =>
//提示未知应用程序请求了$requestedTotal个执行程序。
logWarning(s"Unknown application $appId requested $requestedTotal total executors.")
false
}
}
/**
* Handle a kill request from the given application.
*
* This method assumes the executor limit has already been adjusted downwards through
* a separate [[RequestExecutors]] message, such that we do not launch new executors
* immediately after the old ones are removed.
*
* @return whether the application has previously registered with this Master.
*/
//处理来自给定应用程序的终止请求。
//该方法假设executor限制已经通过单独的[[RequestExecutions]]消息向下调整,
//这样我们就不会在删除旧的executor后立即启动新的executor。
private def handleKillExecutors(appId: String, executorIds: Seq[Int]): Boolean = {
//通过id找到应用
idToApp.get(appId) match {
//匹配应用
case Some(appInfo) =>
//提示应用程序请求终止执行程序
logInfo(s"Application $appId requests to kill executors: " + executorIds.mkString(", "))
//将该应用的executor分类
val (known, unknown) = executorIds.partition(appInfo.executors.contains)
//已知的executor遍历
known.foreach { executorId =>
//Get the executor's description
val desc = appInfo.executors(executorId)
//应用中移除executor
appInfo.removeExecutor(desc)
//杀死该executor
killExecutor(desc)
}
//如果未知的executor存在
if (unknown.nonEmpty) {
//提示 应用尝试杀死不存在的executor
logWarning(s"Application $appId attempted to kill non-existent executors: "
+ unknown.mkString(", "))
}
//重新调度
schedule()
true
//如果没有匹配到
case None =>
//提示 没有注册的应用请求杀死executor
logWarning(s"Unregistered application $appId requested us to kill executors!")
false
}
}
/**
* Cast the given executor IDs to integers and filter out the ones that fail.
*
* All executors IDs should be integers since we launched these executors. However,
* the kill interface on the driver side accepts arbitrary strings, so we need to
* handle non-integer executor IDs just to be safe.
*/
//将给定的executorID强制转换为整数,并过滤掉失败的ID。
//所有executorID都应该是整数,因为我们启动了这些executor。
//然而,驱动程序端的kill接口接受任意字符串,因此为了安全起见,我们需要处理非整数执行器ID。
private def formatExecutorIds(executorIds: Seq[String]): Seq[Int] = {
//executor遍历
executorIds.flatMap { executorId =>
try {
//转化为Int类型
Some(executorId.toInt)
} catch {
case e: NumberFormatException =>
//提示 遇到具有非整数ID的executor:$executorId。忽略
logError(s"Encountered executor with a non-integer ID: $executorId. Ignoring")
None
}
}
}
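For example, formatExecutorIds(Seq("3", "7", "oops")) would log an error for "oops" and return Seq(3, 7).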
/**
* Ask the worker on which the specified executor is launched to kill the executor.
*/
//询问启动指定executor的worker以终止该executor。
private def killExecutor(exec: ExecutorDesc): Unit = {
//worker移除该executor
exec.worker.removeExecutor(exec)
//向worker发消息杀死executor
exec.worker.endpoint.send(KillExecutor(masterUrl, exec.application.id, exec.id))
//设置executor的状态是被杀死
exec.state = ExecutorState.KILLED
}
/** Generate a new app ID given an app's submission date */
private def newApplicationId(submitDate: Date): String = {
val appId = "app-%s-%04d".format(createDateFormat.format(submitDate), nextAppNumber)
nextAppNumber += 1
appId
}
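For example, an application whose submission date formats to "20230701102030" and that arrives while nextAppNumber is 3 gets the ID "app-20230701102030-0003"; newDriverId below builds driver IDs the same way, just with a "driver-" prefix.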
/** Check for, and remove, any timed-out workers */
//检查并移除任何超时worker
private def timeOutDeadWorkers(): Unit = {
// Copy the workers into an array so we don't modify the hashset while iterating through it
//将worker复制到一个数组中,这样我们在迭代时就不会修改哈希集
//获取当前系统时间
val currentTime = System.currentTimeMillis()
//过滤出上一次心跳时间小于当前时间减去超时时间的差值
val toRemove = workers.filter(_.lastHeartbeat < currentTime - workerTimeoutMs).toArray
//遍历需要移除的worker
for (worker <- toRemove) {
//如果worker的状态不是死亡
if (worker.state != WorkerState.DEAD) {
//获取Worker超时时间
val workerTimeoutSecs = TimeUnit.MILLISECONDS.toSeconds(workerTimeoutMs)
//提示移除worker 因为未在指定时间内收到心跳
logWarning("Removing %s because we got no heartbeat in %d seconds".format(
worker.id, workerTimeoutSecs))
//移除worker
removeWorker(worker, s"Not receiving heartbeat for $workerTimeoutSecs seconds")
} else {
if (worker.lastHeartbeat < currentTime - ((reaperIterations + 1) * workerTimeoutMs)) {
workers -= worker // we've seen this DEAD worker in the UI, etc. for long enough; cull it
//我们已经在UI等中看到这个DEAD worker足够长的时间了;剔除它
}
}
}
}
//生成新的driver id
private def newDriverId(submitDate: Date): String = {
val appId = "driver-%s-%04d".format(createDateFormat.format(submitDate), nextDriverNumber)
//下一个被执行的driver的序号
nextDriverNumber += 1
appId
}
//创建新的driver
private def createDriver(desc: DriverDescription): DriverInfo = {
//获取系统时间
val now = System.currentTimeMillis()
//获取日期
val date = new Date(now)
//初始化driver信息
new DriverInfo(now, newDriverId(date), desc, date)
}
//启动driver
private def launchDriver(worker: WorkerInfo, driver: DriverInfo): Unit = {
//打印 正在worker上启动driver
logInfo("Launching driver " + driver.id + " on worker " + worker.id)
//worker中添加driver
worker.addDriver(driver)
//设置driver的worker是该worker
driver.worker = Some(worker)
//向该worker发消息启动driver
worker.endpoint.send(LaunchDriver(driver.id, driver.desc, driver.resources))
//设置driver的状态是运行中
driver.state = DriverState.RUNNING
}
//移除driver
private def removeDriver(
driverId: String,
finalState: DriverState,
exception: Option[Exception]): Unit = {
//在drivers列表中找到需要移除的driver
drivers.find(d => d.id == driverId) match {
case Some(driver) =>
logInfo(s"Removing driver: $driverId")
//移除driver
drivers -= driver
//如果已完成的driver数量达到了保留上限retainedDrivers,则先裁剪最旧的一批
if (completedDrivers.size >= retainedDrivers) {
//需要移除的driver数量
val toRemove = math.max(retainedDrivers / 10, 1)
//在已经完成的driver中从前往后开始移除
completedDrivers.trimStart(toRemove)
}
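//[walkthrough note, not in the Spark source] with the default spark.deploy.retainedDrivers = 200,
//once 200 completed drivers have accumulated, the oldest max(200 / 10, 1) = 20 are dropped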
//将该driver加到完成的driver列表
completedDrivers += driver
//持久化状态中移除该driver
persistenceEngine.removeDriver(driver)
//设置该driver的状态是完成状态
driver.state = finalState
//设置driver的异常
driver.exception = exception
//遍历该driver所对应的worker并在worker中移除该driver
driver.worker.foreach(w => w.removeDriver(driver))
//调度任务
schedule()
case None =>
logWarning(s"Asked to remove unknown driver: $driverId")
}
}
}
//Master对象
private[deploy] object Master extends Logging {
val SYSTEM_NAME = "sparkMaster"
val ENDPOINT_NAME = "Master"
def main(argStrings: Array[String]): Unit = {
Thread.setDefaultUncaughtExceptionHandler(new SparkUncaughtExceptionHandler(
exitOnUncaughtException = false))
Utils.initDaemon(log)
val conf = new SparkConf
val args = new MasterArguments(argStrings, conf)
//启动环境和端点
val (rpcEnv, _, _) = startRpcEnvAndEndpoint(args.host, args.port, args.webUiPort, conf)
rpcEnv.awaitTermination()
}
/**
* Start the Master and return a three tuple of:
* (1) The Master RpcEnv
* (2) The web UI bound port
* (3) The REST server bound port, if any
*/
def startRpcEnvAndEndpoint(
host: String,
port: Int,
webUiPort: Int,
conf: SparkConf): (RpcEnv, Int, Option[Int]) = {
//初始化安全管理器
val securityMgr = new SecurityManager(conf)
//初始化rpcenv环境
val rpcEnv = RpcEnv.create(SYSTEM_NAME, host, port, conf, securityMgr)
//初始化master
val masterEndpoint = rpcEnv.setupEndpoint(ENDPOINT_NAME,
new Master(rpcEnv, rpcEnv.address, webUiPort, securityMgr, conf))
//端口请求响应
val portsResponse = masterEndpoint.askSync[BoundPortsResponse](BoundPortsRequest)
(rpcEnv, portsResponse.webUIPort, portsResponse.restPort)
}
}
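As a side note, the entry point above can also be exercised in-process. The following is only a hypothetical sketch, not something from the source: MasterPlayground is a made-up object, the host and ports are arbitrary, and because Master is private[deploy] the code would have to live under the org.apache.spark.deploy package. It simply mirrors what main() does and prints the ports reported back via BoundPortsResponse.
package org.apache.spark.deploy
import org.apache.spark.SparkConf
import org.apache.spark.deploy.master.Master
object MasterPlayground {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    //create the RpcEnv, register the Master endpoint and ask it for its bound ports
    val (rpcEnv, webUiPort, restPort) =
      Master.startRpcEnvAndEndpoint("localhost", 7077, 8080, conf)
    //restPort is an Option[Int]; the REST server may be disabled
    println(s"Master web UI bound to port $webUiPort, REST server port: $restPort")
    rpcEnv.awaitTermination()
  }
}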
In the previous section we followed the master up to the point where it replies to the driver that the application has been registered. Next it starts scheduling resources with schedule():
//注册应用
case RegisterApplication(description, driver) =>
// TODO Prevent repeated registrations from some driver
//如果是备用的Master 则不做处理
if (state == RecoveryState.STANDBY) {
// ignore, don't send response
} else {
logInfo("Registering app " + description.name)
//创建应用
val app = createApplication(description, driver)
//注册应用
registerApplication(app)
logInfo("Registered app " + description.name + " with ID " + app.id)
//持久化引擎中加入该应用
persistenceEngine.addApplication(app)
//向提交给该应用的driver发消息应用注册
driver.send(RegisteredApplication(app.id, self))
//开始调度
schedule()
}
Let's take a closer look at how schedule() allocates the available resources:
/**
* Schedule the currently available resources among waiting apps. This method will be called
* every time a new app joins or resource availability changes.
*/
//在等待的应用程序中安排当前可用的资源。每当有新应用加入或资源可用性发生变化时,都会调用此方法。
private def schedule(): Unit = {
//如果状态不是存活 则返回
if (state != RecoveryState.ALIVE) {
return
}
// Drivers take strict precedence over executors
//driver优先于executor
//Random.shuffle--打乱组列表中元素的位置
//打乱worker信息列表中存活的worker的位置
val shuffledAliveWorkers = Random.shuffle(workers.toSeq.filter(_.state == WorkerState.ALIVE))
//获取存活的worker的数量
val numWorkersAlive = shuffledAliveWorkers.size
//当前位置0
var curPos = 0
//遍历等待中的driver列表
for (driver <- waitingDrivers.toList) { // iterate over a copy of waitingDrivers
// We assign workers to each waiting driver in a round-robin fashion. For each driver, we
// start from the last worker that was assigned a driver, and continue onwards until we have
// explored all alive workers.
//迭代遍历waitingDrivers的副本。我们以循环的方式为每个等候的driver分配worker。
//对于每个driver,我们从最后一个被分配driver的worker开始,然后继续,直到我们探索了所有活着的worker。
//设置启动状态为false
var launched = false
//集群是否空闲,初始为true
var isClusterIdle = true
//已访问的worker数量
var numWorkersVisited = 0
//当已访问的worker数量少于存活的worker数量并且driver尚未启动时循环
while (numWorkersVisited < numWorkersAlive && !launched) {
//取打乱位置的组中curpos位置的worker
val worker = shuffledAliveWorkers(curPos)
//如果该worker上既没有driver也没有executor,则视为空闲
isClusterIdle = worker.drivers.isEmpty && worker.executors.isEmpty
//可访问的worker数量加1
numWorkersVisited += 1
//判断如果worker能启动driver
if (canLaunchDriver(worker, driver.desc)) {
//从该worker上获取分配给driver的资源
val allocated = worker.acquireResources(driver.desc.resourceReqs)
//设置driver的资源
driver.withResources(allocated)
//在worker上启动driver
launchDriver(worker, driver)
//等待的driver列表中去掉已经启动的driver
waitingDrivers -= driver
//设置该driver的启动状态为true
launched = true
}
//位置指针向后移动一位,对存活worker数取模(轮询)
curPos = (curPos + 1) % numWorkersAlive
}
//如果没有启动且集群还是空闲
if (!launched && isClusterIdle) {
//打印消息 driver需要比任何worker更多的资源
logWarning(s"Driver ${driver.id} requires more resource than any of Workers could have.")
}
}
//启动worker上的executor
startExecutorsOnWorkers()
}
schedule() is called whenever resource availability changes or a new application is submitted. It first checks that the master is ALIVE and returns immediately if it is not. It then takes the ALIVE workers out of workers (the set holding every known worker) and shuffles their order, most likely so that drivers are not always placed on the first few workers, and records how many alive workers there are. Next it walks waitingDrivers, the list of drivers whose applications are waiting to run, and assigns each driver a worker in round-robin fashion: for each driver it visits the alive workers one by one, and as soon as canLaunchDriver says a worker has enough resources, the driver is launched there and removed from the waiting list. A standalone sketch of this round-robin placement follows.
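A minimal sketch of that placement loop, with made-up names (W, placeDrivers) and free cores standing in for the full canLaunchDriver check:
import scala.util.Random

//a worker here is just an id plus its free cores; the real code also checks memory and custom resources
case class W(id: String, var coresFree: Int)

//drivers is a list of (driverId, coresWanted) pairs; returns driverId -> workerId
def placeDrivers(drivers: Seq[(String, Int)], workers: Seq[W]): Map[String, String] = {
  //shuffle so drivers are not always piled onto the same few workers
  val alive = Random.shuffle(workers)
  var curPos = 0
  var placement = Map.empty[String, String]
  for ((driverId, coresWanted) <- drivers) {
    var visited = 0
    var launched = false
    while (visited < alive.size && !launched) {
      val w = alive(curPos)
      if (w.coresFree >= coresWanted) {      //stand-in for canLaunchDriver
        w.coresFree -= coresWanted
        placement += driverId -> w.id
        launched = true
      }
      visited += 1
      curPos = (curPos + 1) % alive.size     //the next driver starts where this one stopped
    }
  }
  placement
}

//e.g. placeDrivers(Seq("driver-0" -> 2, "driver-1" -> 4), Seq(W("w1", 4), W("w2", 4)))
//places both drivers, each on whichever shuffled worker first has enough free cores
Once every waiting driver that can be placed has been placed, schedule() finishes by calling startExecutorsOnWorkers to launch executors on the workers: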
/**
* Schedule and launch executors on workers
*/
//在worker中安排和启动executor
private def startExecutorsOnWorkers(): Unit = {
// Right now this is a very simple FIFO scheduler. We keep trying to fit in the first app
// in the queue, then the second app, etc.
//现在这是一个非常简单的FIFO调度器。我们一直在努力适应队列中的第一个应用程序,然后是第二个应用程序等。
//遍历等待执行的应用列表
for (app <- waitingApps) {
//获取每个executor需要的核数,未指定时默认为1
val coresPerExecutor = app.desc.coresPerExecutor.getOrElse(1)
// If the cores left is less than the coresPerExecutor,the cores left will not be allocated
//如果剩余的内核小于coresPerExecutor,则不会分配剩余的内核
if (app.coresLeft >= coresPerExecutor) {
// Filter out workers that don't have enough resources to launch an executor
//筛选出没有足够资源启动executor的worker
//找出符合条件的worker 即资源满足条件的
val usableWorkers = workers.toArray.filter(_.state == WorkerState.ALIVE)
.filter(canLaunchExecutor(_, app.desc))
.sortBy(_.coresFree).reverse
//应用是否可能挂起:等待队列中只有这一个应用、它还没有任何executor、且没有可用的worker
val appMayHang = waitingApps.length == 1 &&
waitingApps.head.executors.isEmpty && usableWorkers.isEmpty
//如果应用可能挂起
if (appMayHang) {
//提示应用程序需要的资源超过任何一个worker拥有的
logWarning(s"App ${app.id} requires more resource than any of Workers could have.")
}
//worker分配给该应用程序的核数
val assignedCores = scheduleExecutorsOnWorkers(app, usableWorkers, spreadOutApps)
// Now that we've decided how many cores to allocate on each worker, let's allocate them
//现在我们已经决定给每个worker分配多少core了,接下来开始分配
for (pos <- 0 until usableWorkers.length if assignedCores(pos) > 0) {
//将worker的资源分配给executor
allocateWorkerResourceToExecutors(
app, assignedCores(pos), app.desc.coresPerExecutor, usableWorkers(pos))
}
}
}
}
Submitted applications are served in FIFO order: the loop walks waitingApps from the front. For each app it reads coresPerExecutor (defaulting to 1). If the app's remaining core demand (coresLeft) is smaller than coresPerExecutor, the leftover cores are simply not allocated; otherwise the ALIVE workers are filtered down to those that can launch an executor for this app and sorted by free cores, scheduleExecutorsOnWorkers computes how many cores each of those workers should contribute, and allocateWorkerResourceToExecutors finally hands the workers' resources to executors. A quick worked example of the coresLeft gate follows.
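As a quick illustration of the coresLeft gate (the numbers here are made up):
//the app still needs 5 cores and each executor needs 4, so one more executor can be scheduled
val coresLeft = 5
val coresPerExecutor = 4
coresLeft >= coresPerExecutor   //true -> keep scheduling
//once only 3 cores remain, 3 >= 4 is false and the leftover cores stay unallocated
scheduleExecutorsOnWorkers is the method that works out how many cores each usable worker should contribute: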
//计划executor将启动到worker身上。返回一个数组,该数组包含分配给每个worker的核数。
//有两种启动执行器的模式。第一种方法试图将应用程序的executor分配给尽可能多的worker,
//而第二种方法则相反(即在尽可能少的worker上启动它们)。
//前者通常更适合用于数据本地化,并且是默认值。
//分配给每个executor的内核数量是可配置的。
//如果显式设置了这一点,则如果同一个worker具有足够的核心和内存,则可以在该worker上启动来自同一应用程序的多个executor。
//否则,默认情况下,每个executor都会获取worker上可用的所有核心,
//在这种情况下,在一次调度迭代期间,每个应用程序只能在每个worker上启动一个executor。
//请注意,当未设置“spark.executor.cores”时,我们仍然可以在同一worker上从同一应用程序启动多个executor。
//假设appA和appB都有一个executor在worker1上运行,并且appA.coresLeft>0,那么appB就完成了,
//并释放了worker1上的所有核心,因此在下一次计划迭代中,appA启动了一个新的executor,
//该executor会获取worker1中的所有空闲核心,因此我们从运行在worker1的appA中获得多个executor。
//一次在每个worker上分配coresPerExecutor个核是很重要的(而不是一次分配一个核)。
//考虑以下示例:集群有4个worker,每个worker有16个核心。
//用户请求3个executor(spark.cores.max=48,spark.executor.cores=16)。
//如果一次分配1个核心,则每个worker的12个核心将被分配给每个executor。由于12<16,没有executor会启动[SPARK-8881]。
//在worker上启动executor
private def scheduleExecutorsOnWorkers(
app: ApplicationInfo,
//可用的worker
usableWorkers: Array[WorkerInfo],
//是否将应用的executor分散到尽可能多的worker上
spreadOutApps: Boolean): Array[Int] = {
//获取每个executor分配的core
val coresPerExecutor = app.desc.coresPerExecutor
//每个executor最少需要的核数
val minCoresPerExecutor = coresPerExecutor.getOrElse(1)
//每个worker上是否有一个executor
val oneExecutorPerWorker = coresPerExecutor.isEmpty
//每个executor需要的内存
val memoryPerExecutor = app.desc.memoryPerExecutorMB
//每个executor需要的资源
val resourceReqsPerExecutor = app.desc.resourceReqsPerExecutor
//可用的worker数
val numUsable = usableWorkers.length
//worker已经分配的核数
val assignedCores = new Array[Int](numUsable) // Number of cores to give to each worker
//每个worker上已经分配的executor数
val assignedExecutors = new Array[Int](numUsable) // Number of new executors on each worker
//需要分配的核数 取应用剩下的和可用worker空闲的之中小的
var coresToAssign = math.min(app.coresLeft, usableWorkers.map(_.coresFree).sum)
/** Return whether the specified worker can launch an executor for this app. */
//返回指定的worker是否可以启动此应用程序的executor。
//判断worker能否为该应用启动executor
def canLaunchExecutorForApp(pos: Int): Boolean = {
//判断能否调度 如果需要分配的核数大于每个executor最少拥有的核数 则为true
val keepScheduling = coresToAssign >= minCoresPerExecutor
//判断是否有足够的核数 如果pos位置的worker有的空闲核数减去已经分配的核数大于每个executor最少需要的核数 则为true
val enoughCores = usableWorkers(pos).coresFree - assignedCores(pos) >= minCoresPerExecutor
//获取worker上已经分配的executor数
val assignedExecutorNum = assignedExecutors(pos)
// If we allow multiple executors per worker, then we can always launch new executors.
// Otherwise, if there is already an executor on this worker, just give it more cores.
//如果我们允许每个worker有多个executor,那么我们总是可以启动新的executor。
//否则,如果这个worker上已经有一个executor,只需给它更多的core。
//如果允许每个worker上有多个executor,或者该worker上尚未为该应用分配executor,则为true
val launchingNewExecutor = !oneExecutorPerWorker || assignedExecutorNum == 0
if (launchingNewExecutor) {
//已经分配的内存 已经分配的executor乘以每个executor分配的内存
val assignedMemory = assignedExecutorNum * memoryPerExecutor
//判断是否有足够的内存 worker可用的空闲内存减去已经分配的内存大于每个executor需要的内存
val enoughMemory = usableWorkers(pos).memoryFree - assignedMemory >= memoryPerExecutor
//已经分配的资源
val assignedResources = resourceReqsPerExecutor.map {
req => req.resourceName -> req.amount * assignedExecutorNum
}.toMap
//空闲资源
val resourcesFree = usableWorkers(pos).resourcesAmountFree.map {
case (rName, free) => rName -> (free - assignedResources.getOrElse(rName, 0))
}
//判断资源是否满足要求 如果可用资源多于需要的资源
val enoughResources = ResourceUtils.resourcesMeetRequirements(
resourcesFree, resourceReqsPerExecutor)
//判断是否低于限制 如果已经分配的executor加上应用目前已有的executor数小于应用executor限制数 则为true
val underLimit = assignedExecutors.sum + app.executors.size < app.executorLimit
//只有当各种资源都符合要求时才会返回true
keepScheduling && enoughCores && enoughMemory && enoughResources && underLimit
} else {
// We're adding cores to an existing executor, so no need
// to check memory and executor limits
//我们正在向现有的executor添加核心,因此无需检查内存和执行器限制
//只需要有足够的核数并且核数多于executor最小需求数
keepScheduling && enoughCores
}
}
// Keep launching executors until no more workers can accommodate any
// more executors, or if we have reached this application's limits
//继续启动executor,直到没有worker能再容纳更多的executor,或者已经达到该应用的限制
//获取可用的worker
var freeWorkers = (0 until numUsable).filter(canLaunchExecutorForApp)
//当有可用的worker时
while (freeWorkers.nonEmpty) {
//遍历每个可用的worker
freeWorkers.foreach { pos =>
//初始化默认可以继续调度
var keepScheduling = true
//当可以继续调度并且可以为应用启动executor
while (keepScheduling && canLaunchExecutorForApp(pos)) {
//需要分配的核数中减去executor启动需要的最少核数
coresToAssign -= minCoresPerExecutor
//已经分配的核数中加上executor启动需要的最少核数
assignedCores(pos) += minCoresPerExecutor
// If we are launching one executor per worker, then every iteration assigns 1 core
// to the executor. Otherwise, every iteration assigns cores to a new executor.
//如果我们为每个worker启动一个executor,那么每次迭代都会为executor分配一个核心。否则,每次迭代都会将核心分配给一个新的executor。
//如果每个worker只启动一个executor(未设置spark.executor.cores)
if (oneExecutorPerWorker) {
//则该worker上始终只记一个executor,新增的核数都累加给它
assignedExecutors(pos) = 1
} else {
//否则每次迭代都新增一个executor
assignedExecutors(pos) += 1
}
}
// Spreading out an application means spreading out its executors across as
// many workers as possible. If we are not spreading out, then we should keep
// scheduling executors on this worker until we use all of its resources.
// Otherwise, just move on to the next worker.
//分散应用程序意味着将其executor分散为尽可能多的worker。
//如果我们不分散,那么我们应该继续为这个worker安排executor,直到我们使用了它的所有资源。否则,请转到下一个worker。
//如果开启了分散模式,则不在当前worker上继续调度,转到下一个worker
if (spreadOutApps) {
//停止在当前worker上继续调度
keepScheduling = false
}
}
}
//更新可用worker列表,去掉已经无法再为该应用启动executor的worker
freeWorkers = freeWorkers.filter(canLaunchExecutorForApp)
}
//返回已经分配的核数
assignedCores
}
Note the spreadOutApps parameter: when it is true the application's executors are spread across as many workers as possible, otherwise as many executors as possible are packed onto each worker before moving on to the next one. The nested helper canLaunchExecutorForApp decides whether a given worker can still launch (or grow) an executor for this app, checking cores, memory, custom resources and the executor limit. The outer loop then keeps visiting the usable workers, granting minCoresPerExecutor cores at a time, until no worker can take any more or the application's demand is met, as the sketch below illustrates.
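Here is a minimal, self-contained sketch of that loop, assuming cores are the only resource (assignCores and all numbers are made up; the memory, custom-resource and executor-limit checks are omitted, and the real code additionally sorts usable workers by free cores):
//returns the number of cores assigned on each worker, as the real method does
def assignCores(coresLeft: Int, coresFree: Array[Int],
                coresPerExecutor: Int, spreadOut: Boolean): Array[Int] = {
  val assigned = Array.fill(coresFree.length)(0)
  var toAssign = math.min(coresLeft, coresFree.sum)
  //simplified canLaunchExecutorForApp: enough demand left and enough free cores on this worker
  def canAssign(pos: Int) =
    toAssign >= coresPerExecutor && coresFree(pos) - assigned(pos) >= coresPerExecutor
  var free = coresFree.indices.filter(canAssign)
  while (free.nonEmpty) {
    free.foreach { pos =>
      var keep = true
      while (keep && canAssign(pos)) {
        toAssign -= coresPerExecutor
        assigned(pos) += coresPerExecutor
        if (spreadOut) keep = false   //move on to the next worker instead of filling this one
      }
    }
    free = free.filter(canAssign)
  }
  assigned
}

//three 16-core workers, the app wants 8 cores in total, 2 cores per executor:
//assignCores(8, Array(16, 16, 16), 2, spreadOut = true)  == Array(4, 2, 2)
//assignCores(8, Array(16, 16, 16), 2, spreadOut = false) == Array(8, 0, 0)
Granting minCoresPerExecutor per step, rather than a single core at a time, is what avoids the SPARK-8881 situation described in the comment above.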
allocateWorkerResourceToExecutors is what then hands those cores to actual executors:
//分配worker的资源给一个或者多个executor
//app执行者所属的应用程序的信息
//assignedCores表示该worker上分配给此应用的核心数
//coresPerExecutor表示每个executor的核数
//worker表示worker的信息
private def allocateWorkerResourceToExecutors(
app: ApplicationInfo,
assignedCores: Int,
coresPerExecutor: Option[Int],
worker: WorkerInfo): Unit = {
// If the number of cores per executor is specified, we divide the cores assigned
// to this worker evenly among the executors with no remainder.
// Otherwise, we launch a single executor that grabs all the assignedCores on this worker.
//每个executor的内核数如果指定了
//我们将分配给该worker的内核在executor中平均分配,没有余数。
//否则,我们将启动一个单独的executor来获取该worker上所有的assignedCores。
//用分配到的核数除以每个executor的核数,得到要启动的executor个数(未指定时只启动一个)
val numExecutors = coresPerExecutor.map { assignedCores / _ }.getOrElse(1)
//每个executor分到的核数:指定了spark.executor.cores就用指定值,否则单个executor拿走该worker上分配的全部核数
val coresToAssign = coresPerExecutor.getOrElse(assignedCores)
//遍历每个executor
for (i <- 1 to numExecutors) {
//获取worker分配给该应用程序的资源
val allocated = worker.acquireResources(app.desc.resourceReqsPerExecutor)
//在应用中加入executor
val exec = app.addExecutor(worker, coresToAssign, allocated)
//启动worker上的executor
launchExecutor(worker, exec)
//设置应用程序的状态是运行中
app.state = ApplicationState.RUNNING
}
}
The worker's cores are handed to executors in even shares: if spark.executor.cores is set, each executor gets exactly that many cores and the number of executors is assignedCores divided by it; otherwise a single executor grabs all of the cores assigned on that worker. For each executor the method acquires the worker's resources, registers the executor on the application with addExecutor, launches it, and marks the application as RUNNING. A quick worked example of the division follows.
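A worked example of that division (the numbers are made up):
val assignedCores = 12                     //cores this worker was assigned for the app
val explicitCores: Option[Int] = Some(4)   //spark.executor.cores = 4
val unsetCores: Option[Int] = None         //spark.executor.cores not set

explicitCores.map(assignedCores / _).getOrElse(1)   //3 executors ...
explicitCores.getOrElse(assignedCores)              //... with 4 cores each
unsetCores.map(assignedCores / _).getOrElse(1)      //1 executor ...
unsetCores.getOrElse(assignedCores)                 //... that grabs all 12 cores
Next, launchExecutor is what actually starts each executor on its worker: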
//在worker上启动executor
private def launchExecutor(worker: WorkerInfo, exec: ExecutorDesc): Unit = {
//提示 在worker上启动executor
logInfo("Launching executor " + exec.fullId + " on worker " + worker.id)
//在worker上加入executor
worker.addExecutor(exec)
//向worker发消息启动了新的executor
worker.endpoint.send(LaunchExecutor(masterUrl, exec.application.id, exec.id,
exec.application.desc, exec.cores, exec.memory, exec.resources))
//向提交该应用的driver发消息executor已经添加
exec.application.driver.send(
ExecutorAdded(exec.id, worker.id, worker.hostPort, exec.cores, exec.memory))
}
launchExecutor first records the executor on the worker with addExecutor, then sends the worker a LaunchExecutor message to actually start it, and finally sends the driver that submitted the application an ExecutorAdded message. In the next section we will look at how the worker handles these messages.