这是草稿
state machine
controller中有两个主要的状态机,分别是replicaStateMachine和partitionStateMachine。我们先看replicaStateMachine
replicaStateMachine
replica state machine保存了replica的状态,并且定义了一个replica所有可能的状态和转化的方式。一个replica能够拥有的状态有以下几种
- NewReplica:处于这个状态的replica只能接受become follower的状态改变请求
- OnlineReplica:处于这个状态的replica只能接受become leader或者become follower的状态改变请求。
- OfflineReplica:如果replica宕机了进入此状态
- ReplicaDeletionStarted:开始删除replica(向broker发出删除该副本数据的请求)时进入此状态
- ReplicaDeletionSuccessful:如果replica删除成功,进入此状态
- ReplicaDeletionIneligible:如果replica删除失败,进入此状态
- NonExistentReplica:如果replica被成功删除,进入此状态
replicaStateMachine的一个主要方法是handleStateChanges,它将指定的一批replica转换到目标状态,并向相应的broker发送请求;必要时还可以根据broker的响应执行相应的回调函数。
/**
 * Transitions a batch of replicas to `targetState`, queueing the required
 * LeaderAndIsr / StopReplica requests and flushing them to the brokers in a
 * single batch at the end.
 *
 * Replicas are grouped by broker id so each broker's partitions are handled
 * in one doHandleStateChanges call.
 *
 * @param replicas    the (partition, replica) pairs to transition
 * @param targetState the state every given replica should move to
 * @param callbacks   optional response callbacks (e.g. for StopReplica)
 */
def handleStateChanges(replicas: Seq[PartitionAndReplica], targetState: ReplicaState,
                       callbacks: Callbacks = new Callbacks()): Unit = {
  if (replicas.nonEmpty) {
    try {
      controllerBrokerRequestBatch.newBatch()
      // foreach, not map: we only need the side effect; the original built and
      // immediately discarded a Map of Unit values.
      replicas.groupBy(_.replica).foreach { case (replicaId, replicas) =>
        val partitions = replicas.map(_.topicPartition)
        doHandleStateChanges(replicaId, partitions, targetState, callbacks)
      }
      controllerBrokerRequestBatch.sendRequestsToBrokers(controllerContext.epoch)
    } catch {
      case e: ControllerMovedException =>
        // Another broker has become controller: rethrow so the caller can resign.
        error(s"Controller moved to another broker when moving some replicas to $targetState state", e)
        throw e
      case e: Throwable => error(s"Error while moving some replicas to $targetState state", e)
    }
  }
}
handleStateChanges对接受到的replicas按replica进行分组,对每个replica执行doHandleStateChanges方法
/**
 * Applies the transition to `targetState` for every given partition hosted on
 * broker `replicaId`, after filtering out transitions that are illegal from
 * each replica's current state. Required broker requests are queued in
 * controllerBrokerRequestBatch; the caller is responsible for sending them.
 */
private def doHandleStateChanges(replicaId: Int, partitions: Seq[TopicPartition], targetState: ReplicaState,
                                 callbacks: Callbacks): Unit = {
  val replicas = partitions.map(partition => PartitionAndReplica(partition, replicaId))
  // Replicas seen for the first time start out as NonExistentReplica.
  replicas.foreach(replica => replicaState.getOrElseUpdate(replica, NonExistentReplica))
  // Check whether the requested state change is legal for each replica's current state.
  val (validReplicas, invalidReplicas) = replicas.partition(replica => isValidTransition(replica, targetState))
  invalidReplicas.foreach(replica => logInvalidTransition(replica, targetState))
  targetState match {
    case NewReplica =>
      validReplicas.foreach { replica =>
        val partition = replica.topicPartition
        controllerContext.partitionLeadershipInfo.get(partition) match {
          case Some(leaderIsrAndControllerEpoch) =>
            if (leaderIsrAndControllerEpoch.leaderAndIsr.leader == replicaId) {
              // A replica in NewReplica state may only become a follower; refuse
              // the transition if it is currently the partition leader.
              val exception = new StateChangeFailedException(s"Replica $replicaId for partition $partition cannot be moved to NewReplica state as it is being requested to become leader")
              // NOTE(review): OfflineReplica is logged as the target state here even
              // though the requested target is NewReplica — looks like a slip; confirm
              // against the upstream source.
              logFailedStateChange(replica, replicaState(replica), OfflineReplica, exception)
            } else {
              // Tell the broker to become a follower for this new replica.
              controllerBrokerRequestBatch.addLeaderAndIsrRequestForBrokers(Seq(replicaId),
                replica.topicPartition,
                leaderIsrAndControllerEpoch,
                controllerContext.partitionReplicaAssignment(replica.topicPartition),
                isNew = true)
              logSuccessfulTransition(replicaId, partition, replicaState(replica), NewReplica)
              replicaState.put(replica, NewReplica)
            }
          case None =>
            // No leader elected yet for this partition: nothing to send, just record the state.
            logSuccessfulTransition(replicaId, partition, replicaState(replica), NewReplica)
            replicaState.put(replica, NewReplica)
        }
      }
    case OnlineReplica =>
      validReplicas.foreach { replica =>
        val partition = replica.topicPartition
        replicaState(replica) match {
          case NewReplica =>
            // Coming from NewReplica: make sure the replica is included in the
            // partition's assignment before marking it online.
            val assignment = controllerContext.partitionReplicaAssignment(partition)
            if (!assignment.contains(replicaId)) {
              controllerContext.updatePartitionReplicaAssignment(partition, assignment :+ replicaId)
            }
          case _ =>
            // Already-known replica: re-send the current leader/ISR so it can resume.
            controllerContext.partitionLeadershipInfo.get(partition) match {
              case Some(leaderIsrAndControllerEpoch) =>
                controllerBrokerRequestBatch.addLeaderAndIsrRequestForBrokers(Seq(replicaId),
                  replica.topicPartition,
                  leaderIsrAndControllerEpoch,
                  controllerContext.partitionReplicaAssignment(partition), isNew = false)
              case None =>
            }
        }
        logSuccessfulTransition(replicaId, partition, replicaState(replica), OnlineReplica)
        replicaState.put(replica, OnlineReplica)
      }
    case OfflineReplica =>
      // Ask the broker to stop the replica without deleting its data.
      validReplicas.foreach { replica =>
        controllerBrokerRequestBatch.addStopReplicaRequestForBrokers(Seq(replicaId), replica.topicPartition,
          deletePartition = false, (_, _) => ())
      }
      val (replicasWithLeadershipInfo, replicasWithoutLeadershipInfo) = validReplicas.partition { replica =>
        controllerContext.partitionLeadershipInfo.contains(replica.topicPartition)
      }
      // Shrink the ISR in zookeeper to exclude the offline replica.
      val updatedLeaderIsrAndControllerEpochs = removeReplicasFromIsr(replicaId, replicasWithLeadershipInfo.map(_.topicPartition))
      updatedLeaderIsrAndControllerEpochs.foreach { case (partition, leaderIsrAndControllerEpoch) =>
        if (!topicDeletionManager.isTopicQueuedUpForDeletion(partition.topic)) {
          // Propagate the shrunken ISR to all remaining replicas of the partition.
          val recipients = controllerContext.partitionReplicaAssignment(partition).filterNot(_ == replicaId)
          controllerBrokerRequestBatch.addLeaderAndIsrRequestForBrokers(recipients,
            partition,
            leaderIsrAndControllerEpoch,
            controllerContext.partitionReplicaAssignment(partition), isNew = false)
        }
        val replica = PartitionAndReplica(partition, replicaId)
        logSuccessfulTransition(replicaId, partition, replicaState(replica), OfflineReplica)
        replicaState.put(replica, OfflineReplica)
      }
      // Partitions without leadership info only need the local state updated.
      replicasWithoutLeadershipInfo.foreach { replica =>
        logSuccessfulTransition(replicaId, replica.topicPartition, replicaState(replica), OfflineReplica)
        replicaState.put(replica, OfflineReplica)
      }
    case ReplicaDeletionStarted =>
      validReplicas.foreach { replica =>
        logSuccessfulTransition(replicaId, replica.topicPartition, replicaState(replica), ReplicaDeletionStarted)
        replicaState.put(replica, ReplicaDeletionStarted)
        // Stop the replica AND delete its data; the callback reports the outcome.
        controllerBrokerRequestBatch.addStopReplicaRequestForBrokers(Seq(replicaId),
          replica.topicPartition,
          deletePartition = true,
          callbacks.stopReplicaResponseCallback)
      }
    case ReplicaDeletionIneligible =>
      validReplicas.foreach { replica =>
        logSuccessfulTransition(replicaId, replica.topicPartition, replicaState(replica), ReplicaDeletionIneligible)
        replicaState.put(replica, ReplicaDeletionIneligible)
      }
    case ReplicaDeletionSuccessful =>
      validReplicas.foreach { replica =>
        logSuccessfulTransition(replicaId, replica.topicPartition, replicaState(replica), ReplicaDeletionSuccessful)
        replicaState.put(replica, ReplicaDeletionSuccessful)
      }
    case NonExistentReplica =>
      validReplicas.foreach { replica =>
        // Remove the replica from the partition's assignment and forget its state.
        val currentAssignedReplicas = controllerContext.partitionReplicaAssignment(replica.topicPartition)
        controllerContext.updatePartitionReplicaAssignment(replica.topicPartition, currentAssignedReplicas.filterNot(_ == replica.replica))
        logSuccessfulTransition(replicaId, replica.topicPartition, replicaState(replica), NonExistentReplica)
        replicaState.remove(replica)
      }
  }
}
这个很长的方法这里不准备一一解读了,可以直接看代码理解。
PartitionStateMachine
PartitionStateMachine维护了partition的状态,定义了一个partition可能存在的所有状态
- NonExistentPartition:如果一个partition从未存在,或者后来被删除,位于此状态
- NewPartition:当partition创建完以后位于此状态,此时等待replica的分配
- OnlinePartition:一旦对partition进行了leader的选举,就位于此状态
- OfflinePartition:一旦partition完成了leader的选举,但是leader后来挂掉了,就进入此状态
partitionStateMachine处理partition状态变化的方法是doHandleStateChange
/**
 * Applies the transition to `targetState` for every given partition whose
 * current state allows it; illegal transitions are only logged.
 *
 * NOTE(review): when targetState is OnlinePartition and some partitions need a
 * leader election, `partitionLeaderElectionStrategyOpt.get` below throws if the
 * strategy is absent — callers must supply it in that case; confirm all call
 * sites do.
 */
private def doHandleStateChanges(partitions: Seq[TopicPartition], targetState: PartitionState,
                                 partitionLeaderElectionStrategyOpt: Option[PartitionLeaderElectionStrategy]): Unit = {
  val stateChangeLog = stateChangeLogger.withControllerEpoch(controllerContext.epoch)
  // Partitions seen for the first time start out as NonExistentPartition.
  partitions.foreach(partition => partitionState.getOrElseUpdate(partition, NonExistentPartition))
  // Check whether the requested state change is legal for each partition's current state.
  val (validPartitions, invalidPartitions) = partitions.partition(partition => isValidTransition(partition, targetState))
  invalidPartitions.foreach(partition => logInvalidTransition(partition, targetState))
  targetState match {
    case NewPartition =>
      validPartitions.foreach { partition =>
        stateChangeLog.trace(s"Changed partition $partition state from ${partitionState(partition)} to $targetState with " +
          s"assigned replicas ${controllerContext.partitionReplicaAssignment(partition).mkString(",")}")
        changeStateTo(partition, partitionState(partition), NewPartition)
      }
    case OnlinePartition =>
      // NewPartition -> OnlinePartition needs an initial leader/ISR written to zk;
      // OfflinePartition/OnlinePartition -> OnlinePartition needs a leader election.
      val uninitializedPartitions = validPartitions.filter(partition => partitionState(partition) == NewPartition)
      val partitionsToElectLeader = validPartitions.filter(partition => partitionState(partition) == OfflinePartition || partitionState(partition) == OnlinePartition)
      if (uninitializedPartitions.nonEmpty) {
        val successfulInitializations = initializeLeaderAndIsrForPartitions(uninitializedPartitions)
        successfulInitializations.foreach { partition =>
          stateChangeLog.trace(s"Changed partition $partition from ${partitionState(partition)} to $targetState with state " +
            s"${controllerContext.partitionLeadershipInfo(partition).leaderAndIsr}")
          changeStateTo(partition, partitionState(partition), OnlinePartition)
        }
      }
      if (partitionsToElectLeader.nonEmpty) {
        val successfulElections = electLeaderForPartitions(partitionsToElectLeader, partitionLeaderElectionStrategyOpt.get)
        successfulElections.foreach { partition =>
          stateChangeLog.trace(s"Changed partition $partition from ${partitionState(partition)} to $targetState with state " +
            s"${controllerContext.partitionLeadershipInfo(partition).leaderAndIsr}")
          changeStateTo(partition, partitionState(partition), OnlinePartition)
        }
      }
    case OfflinePartition =>
      validPartitions.foreach { partition =>
        stateChangeLog.trace(s"Changed partition $partition state from ${partitionState(partition)} to $targetState")
        changeStateTo(partition, partitionState(partition), OfflinePartition)
      }
    case NonExistentPartition =>
      validPartitions.foreach { partition =>
        stateChangeLog.trace(s"Changed partition $partition state from ${partitionState(partition)} to $targetState")
        changeStateTo(partition, partitionState(partition), NonExistentPartition)
      }
  }
}
我们只看targetState为OnlinePartition的处理。要想将一个partition的状态转移为online,那么它开始的状态要么为new要么为online或者offline。对于那些初始状态为new的partition,执行initializeLeaderAndIsrForPartitions方法
/**
 * Elects the initial leader for newly created partitions and persists the
 * result to zookeeper: the first live assigned replica becomes leader, all
 * live replicas form the ISR.
 *
 * @return the partitions whose leader/ISR znode was created successfully
 */
private def initializeLeaderAndIsrForPartitions(partitions: Seq[TopicPartition]): Seq[TopicPartition] = {
  val successfulInitializations = mutable.Buffer.empty[TopicPartition]
  // Keep only the assigned replicas whose broker is currently online.
  val replicasPerPartition = partitions.map(partition => partition -> controllerContext.partitionReplicaAssignment(partition))
  val liveReplicasPerPartition = replicasPerPartition.map { case (partition, replicas) =>
    val liveReplicasForPartition = replicas.filter(replica => controllerContext.isReplicaOnline(replica, partition))
    partition -> liveReplicasForPartition
  }
  // A partition with no live replica cannot be initialized; log the failure.
  val (partitionsWithoutLiveReplicas, partitionsWithLiveReplicas) = liveReplicasPerPartition.partition { case (_, liveReplicas) => liveReplicas.isEmpty }
  partitionsWithoutLiveReplicas.foreach { case (partition, replicas) =>
    val failMsg = s"Controller $controllerId epoch ${controllerContext.epoch} encountered error during state change of " +
      s"partition $partition from New to Online, assigned replicas are " +
      s"[${replicas.mkString(",")}], live brokers are [${controllerContext.liveBrokerIds}]. No assigned " +
      "replica is alive."
    logFailedStateChange(partition, NewPartition, OnlinePartition, new StateChangeFailedException(failMsg))
  }
  // Initial election: first live replica is leader, all live replicas are the ISR.
  val leaderIsrAndControllerEpochs = partitionsWithLiveReplicas.map { case (partition, liveReplicas) =>
    val leaderAndIsr = LeaderAndIsr(liveReplicas.head, liveReplicas.toList)
    val leaderIsrAndControllerEpoch = LeaderIsrAndControllerEpoch(leaderAndIsr, controllerContext.epoch)
    partition -> leaderIsrAndControllerEpoch
  }.toMap
  // Persist the elected leader/ISR to the topic-partition state znodes.
  val createResponses = try {
    zkClient.createTopicPartitionStatesRaw(leaderIsrAndControllerEpochs, controllerContext.epochZkVersion)
  } catch {
    case e: ControllerMovedException =>
      error("Controller moved to another broker when trying to create the topic partition state znode", e)
      throw e
    case e: Exception =>
      // NOTE(review): the failure is logged with NewPartition as the target state,
      // unlike the OnlinePartition used elsewhere in this method — confirm upstream.
      partitionsWithLiveReplicas.foreach { case (partition,_) => logFailedStateChange(partition, partitionState(partition), NewPartition, e) }
      Seq.empty
  }
  createResponses.foreach { createResponse =>
    val code = createResponse.resultCode
    val partition = createResponse.ctx.get.asInstanceOf[TopicPartition]
    val leaderIsrAndControllerEpoch = leaderIsrAndControllerEpochs(partition)
    if (code == Code.OK) {
      // Record the new leadership and tell the ISR brokers about the new partition.
      controllerContext.partitionLeadershipInfo.put(partition, leaderIsrAndControllerEpoch)
      controllerBrokerRequestBatch.addLeaderAndIsrRequestForBrokers(leaderIsrAndControllerEpoch.leaderAndIsr.isr,
        partition, leaderIsrAndControllerEpoch, controllerContext.partitionReplicaAssignment(partition), isNew = true)
      successfulInitializations += partition
    } else {
      logFailedStateChange(partition, NewPartition, OnlinePartition, code)
    }
  }
  successfulInitializations
}
- 先通过controller context获取partition的所有replica,然后过滤出broker仍然online的replica
- 将所有的partition分成两类,一类是具有online replica的partition:partitionsWithLiveReplicas,一类是没有online replica的partition
- 对partitionsWithLiveReplicas中的每个partition进行选举,replica中第一个broker为leader,剩余的为isr
- 通过第三步产生的leader,isr和controller context epoch,构造LeaderIsrAndControllerEpoch
- 将LeaderIsrAndControllerEpoch保存在znode当中
- 更新controller context中partitionLeadershipInfo
对那些初始状态为offline或者online的partition,进行partition leader的选举
/**
 * Repeatedly attempts leader election for `partitions` until every partition
 * has either been elected successfully or failed definitively.
 *
 * doElectLeaderForPartitions returns a third category, updatesToRetry, for
 * partitions whose zookeeper leader/ISR update hit a transient conflict; only
 * those are retried on the next iteration.
 *
 * @return the partitions for which a leader was successfully elected
 */
private def electLeaderForPartitions(partitions: Seq[TopicPartition], partitionLeaderElectionStrategy: PartitionLeaderElectionStrategy): Seq[TopicPartition] = {
  val successfulElections = mutable.Buffer.empty[TopicPartition]
  var remaining = partitions
  while (remaining.nonEmpty) {
    // BUG FIX: must elect for `remaining`, not the original `partitions`;
    // otherwise already-elected partitions are re-processed every round and the
    // loop never terminates once updatesToRetry is non-empty.
    val (success, updatesToRetry, failedElections) = doElectLeaderForPartitions(remaining, partitionLeaderElectionStrategy)
    remaining = updatesToRetry
    successfulElections ++= success
    failedElections.foreach { case (partition, e) =>
      logFailedStateChange(partition, partitionState(partition), OnlinePartition, e)
    }
  }
  successfulElections
}
这个方法不断重试partition的选举,直到全部重试完成。具体的选举的方法是doElectLeaderForPartitions
/**
 * Attempts a single round of leader election for `partitions`:
 *  1. read each partition's current leader/ISR state from zookeeper;
 *  2. drop partitions whose znode carries a higher controller epoch (a newer
 *     controller has taken over);
 *  3. pick the new leader according to the given election strategy;
 *  4. write the new leader/ISR back to zookeeper and queue LeaderAndIsr
 *     requests for the affected brokers.
 *
 * @return (successfully elected partitions,
 *          partitions whose zk update hit a transient conflict and should be retried,
 *          partitions that failed definitively, with the cause)
 */
private def doElectLeaderForPartitions(partitions: Seq[TopicPartition], partitionLeaderElectionStrategy: PartitionLeaderElectionStrategy):
  (Seq[TopicPartition], Seq[TopicPartition], Map[TopicPartition, Exception]) = {
  val getDataResponses = try {
    zkClient.getTopicPartitionStatesRaw(partitions)
  } catch {
    case e: Exception =>
      // Could not read any partition state: every partition fails this round.
      return (Seq.empty, Seq.empty, partitions.map(_ -> e).toMap)
  }
  val failedElections = mutable.Map.empty[TopicPartition, Exception]
  val leaderIsrAndControllerEpochPerPartition = mutable.Buffer.empty[(TopicPartition, LeaderIsrAndControllerEpoch)]
  getDataResponses.foreach { getDataResponse =>
    val partition = getDataResponse.ctx.get.asInstanceOf[TopicPartition]
    val currState = partitionState(partition)
    if (getDataResponse.resultCode == Code.OK) {
      // BUG FIX: the original called .get on the decoded Option unconditionally
      // after an isEmpty check with no else, throwing NoSuchElementException when
      // the znode could not be decoded. Pattern match instead.
      TopicPartitionStateZNode.decode(getDataResponse.data, getDataResponse.stat) match {
        case Some(leaderIsrAndControllerEpoch) =>
          leaderIsrAndControllerEpochPerPartition += partition -> leaderIsrAndControllerEpoch
        case None =>
          val exception = new StateChangeFailedException(s"LeaderAndIsr information doesn't exist for partition $partition in $currState state")
          failedElections.put(partition, exception)
      }
    } else if (getDataResponse.resultCode == Code.NONODE) {
      val exception = new StateChangeFailedException(s"LeaderAndIsr information doesn't exist for partition $partition in $currState state")
      failedElections.put(partition, exception)
    } else {
      failedElections.put(partition, getDataResponse.resultException.get)
    }
  }
  // A znode written with a higher controller epoch means another controller has
  // taken over; abort the election for those partitions.
  val (invalidPartitionsForElection, validPartitionsForElection) = leaderIsrAndControllerEpochPerPartition.partition { case (_, leaderIsrAndControllerEpoch) =>
    leaderIsrAndControllerEpoch.controllerEpoch > controllerContext.epoch
  }
  invalidPartitionsForElection.foreach { case (partition, leaderIsrAndControllerEpoch) =>
    val failMsg = s"aborted leader election for partition $partition since the LeaderAndIsr path was " +
      s"already written by another controller. This probably means that the current controller $controllerId went through " +
      s"a soft failure and another controller was elected with epoch ${leaderIsrAndControllerEpoch.controllerEpoch}."
    failedElections.put(partition, new StateChangeFailedException(failMsg))
  }
  if (validPartitionsForElection.isEmpty) {
    return (Seq.empty, Seq.empty, failedElections.toMap)
  }
  val shuttingDownBrokers = controllerContext.shuttingDownBrokerIds.toSet
  // Run the configured election strategy; partitions for which no leader could
  // be chosen come back with an empty newLeaderAndIsrOpt.
  val (partitionsWithoutLeaders, partitionsWithLeaders) = partitionLeaderElectionStrategy match {
    case OfflinePartitionLeaderElectionStrategy =>
      leaderForOffline(validPartitionsForElection).partition { case (_, newLeaderAndIsrOpt, _) => newLeaderAndIsrOpt.isEmpty }
    case ReassignPartitionLeaderElectionStrategy =>
      leaderForReassign(validPartitionsForElection).partition { case (_, newLeaderAndIsrOpt, _) => newLeaderAndIsrOpt.isEmpty }
    case PreferredReplicaPartitionLeaderElectionStrategy =>
      leaderForPreferredReplica(validPartitionsForElection).partition { case (_, newLeaderAndIsrOpt, _) => newLeaderAndIsrOpt.isEmpty }
    case ControlledShutdownPartitionLeaderElectionStrategy =>
      leaderForControlledShutdown(validPartitionsForElection, shuttingDownBrokers).partition { case (_, newLeaderAndIsrOpt, _) => newLeaderAndIsrOpt.isEmpty }
  }
  partitionsWithoutLeaders.foreach { case (partition, _, _) =>
    val failMsg = s"Failed to elect leader for partition $partition under strategy $partitionLeaderElectionStrategy"
    failedElections.put(partition, new StateChangeFailedException(failMsg))
  }
  val recipientsPerPartition = partitionsWithLeaders.map { case (partition, _, recipients) => partition -> recipients }.toMap
  val adjustedLeaderAndIsrs = partitionsWithLeaders.map { case (partition, leaderAndIsrOpt, _) => partition -> leaderAndIsrOpt.get }.toMap
  // Persist the new leader/ISR; zk-version conflicts come back as updatesToRetry.
  val UpdateLeaderAndIsrResult(successfulUpdates, updatesToRetry, failedUpdates) = zkClient.updateLeaderAndIsr(
    adjustedLeaderAndIsrs, controllerContext.epoch, controllerContext.epochZkVersion)
  successfulUpdates.foreach { case (partition, leaderAndIsr) =>
    val replicas = controllerContext.partitionReplicaAssignment(partition)
    val leaderIsrAndControllerEpoch = LeaderIsrAndControllerEpoch(leaderAndIsr, controllerContext.epoch)
    controllerContext.partitionLeadershipInfo.put(partition, leaderIsrAndControllerEpoch)
    controllerBrokerRequestBatch.addLeaderAndIsrRequestForBrokers(recipientsPerPartition(partition), partition,
      leaderIsrAndControllerEpoch, replicas, isNew = false)
  }
  (successfulUpdates.keys.toSeq, updatesToRetry, failedElections.toMap ++ failedUpdates)
}
- 先通过zk获取到所有partitions的topic partition state
- 从zk的响应中获取partition的LeaderIsrAndControllerEpoch信息
- 如果partition在zk中记录的leaderIsrAndControllerEpoch.controllerEpoch > controllerContext.epoch,就将这个partition标记为invalidPartitionsForElection,否则标记为validPartitionsForElection。invalid的意思是:zk中的controller epoch比当前controller的epoch还要大,说明已经有一个epoch更新的controller写过这个partition的LeaderAndIsr,因此当前controller不应再对这个partition做选举
- 根据选举策略的不同,对partition leader进行选举
- OfflinePartitionLeaderElectionStrategy
- ReassignPartitionLeaderElectionStrategy
- PreferredReplicaPartitionLeaderElectionStrategy
- ControlledShutdownPartitionLeaderElectionStrategy
- 根据选举结果,将partitions分成successful,retry,fail三类并返回