Lab page: MIT 6.824 Lab 2 Raft
Chinese translation: MIT 6.824 Lab 2 (translated)
Raft paper: In Search of an Understandable Consensus Algorithm
Raft paper translation (Chinese): (zhuanlan.zhihu.com/p/524885008)
Introduction
Lab 2D asks us to implement log compaction. Raft is a consensus algorithm built on log replication: every executed command appends a new log entry, and a log that grows without bound will eventually drag the server down, so log compaction is a must.
Compaction here means truncating the log held in memory and handing the discarded prefix over as a snapshot, stored as a compact binary blob.
As with the earlier parts, 2D follows the description in the Raft paper closely. The relevant figure from the paper is shown below:
2D introduces a new RPC, InstallSnapshot. This lab does not require the offset-based (chunked) snapshot transfer described in the paper.
Interaction between the Raft layer and the Server layer
The Raft layer and the Server layer communicate through applyCh. Within the cluster, the Server layers on different machines never talk to each other directly; all cross-machine traffic goes through the Raft layer. So a command travels from the client to the Server layer, the Server hands it to Raft, Raft replicates it asynchronously, and only once the entry has been replicated on a majority and committed (and applied) does the Server report success back to the client.
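Every message on applyCh is an ApplyMsg. For reference, the type from the course skeleton looks roughly like this (field names may differ slightly between course versions); it carries either a committed command or an installed snapshot:

// ApplyMsg is what Raft pushes into applyCh: either a committed command
// or an installed snapshot (sketch of the lab skeleton type).
type ApplyMsg struct {
    CommandValid bool
    Command      interface{}
    CommandIndex int

    SnapshotValid bool
    Snapshot      []byte
    SnapshotTerm  int
    SnapshotIndex int
}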
Log compaction is also initiated from the Server layer. The Server has a maxraftstate field (you will meet it again in Lab 3); when the Raft state grows too large, the Server calls the Raft layer's Snapshot function.
// The layer above Raft decides when to take a snapshot.
func (rf *Raft) Snapshot(index int, snapshot []byte) {
// Your code here (2D).
// The upper layer passes in a snapshot covering the log up through index.
rf.mu.Lock()
defer rf.mu.Unlock()
// Only committed entries may be snapshotted, and never anything behind the current snapshot.
if index > rf.commitIndex || index <= rf.lastIncludedIndex {
return
}
DPrintf("Server %v received snapshot from upper layer, index %v", rf.me, index)
rf.snapshot = snapshot
// Truncate entries 1..index.
// index is a virtual (global) index and must be converted first.
ridx := rf.ToRealIndex(index)
// Note: the entry at ridx is kept as the placeholder at logs[0].
rf.lastIncludedIndex = index
old := len(rf.logs)
rf.lastIncludedTerm = rf.logs[ridx].Term
rf.logs = append([]LogEntry{}, rf.logs[ridx:]...)
DPrintf("Server %v cut old %v To len(logs) %v", rf.me, old, len(rf.logs))
if rf.lastApplied < index {
rf.lastApplied = index
}
rf.persist()
}
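Looking ahead to Lab 3, here is a minimal sketch of how a service layer might decide to call Snapshot. Everything in it (KVServer, data, lastAppliedIndex, maybeSnapshot) is a hypothetical illustration, not code from this lab; only persister.RaftStateSize(), the maxraftstate convention, and rf.Snapshot come from the course skeleton and the function above.

// A hypothetical Lab 3 service that snapshots once the persisted Raft state
// grows past maxraftstate; all field names here are illustrative.
type KVServer struct {
    mu               sync.Mutex
    rf               *Raft
    persister        *Persister
    maxraftstate     int               // -1 means snapshotting is disabled
    data             map[string]string // application state
    lastAppliedIndex int               // last log index applied to data
}

func (kv *KVServer) maybeSnapshot() {
    kv.mu.Lock()
    defer kv.mu.Unlock()
    if kv.maxraftstate == -1 || kv.persister.RaftStateSize() < kv.maxraftstate {
        return
    }
    w := new(bytes.Buffer)
    e := labgob.NewEncoder(w)
    e.Encode(kv.data)             // serialize the application state
    e.Encode(kv.lastAppliedIndex) // the index this snapshot covers
    kv.rf.Snapshot(kv.lastAppliedIndex, w.Bytes())
}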
Reading the snapshot
Called from the Make function (when the service starts up):
func (rf *Raft) readSnapshot(data []byte) {
if len(data) == 0 {
return
}
rf.snapshot = data
DPrintf("server %v 读取快照c成功", rf.me)
}
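For reference, the corresponding lines in Make look roughly like this (a sketch; the rest of Make is omitted):

// Inside Make(peers, me, persister, applyCh), after initializing the struct:
rf.readPersist(persister.ReadRaftState()) // restore 2C state (term, votedFor, logs, snapshot indices)
rf.readSnapshot(persister.ReadSnapshot()) // restore the snapshot bytes themselves (2D)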
Persistence: also save the snapshot
func (rf *Raft) persist() {
// Your code here (2C).
// Example:
w := new(bytes.Buffer)
e := labgob.NewEncoder(w)
e.Encode(rf.currentTerm)
e.Encode(rf.votedFor)
e.Encode(rf.logs)
e.Encode(rf.lastIncludedIndex)
e.Encode(rf.lastIncludedTerm)
raftstate := w.Bytes()
rf.persister.Save(raftstate, rf.snapshot)
}
// restore previously persisted state.
func (rf *Raft) readPersist(data []byte) {
if data == nil || len(data) <= 1 { // bootstrap without any state?
return
}
// Your code here (2C).
// Example:
r := bytes.NewBuffer(data)
d := labgob.NewDecoder(r)
var currentTerm int
var votedFor int
var logs []LogEntry
var lastIncludedIndex int
var lastIncludedTerm int
if d.Decode(&currentTerm) != nil ||
d.Decode(&votedFor) != nil ||
d.Decode(&logs) != nil ||
d.Decode(&lastIncludedIndex) != nil ||
d.Decode(&lastIncludedTerm) != nil {
DPrintf("Server %v readPersist Fail", rf.me)
} else {
rf.currentTerm = currentTerm
rf.votedFor = votedFor
rf.logs = logs
rf.lastIncludedIndex = lastIncludedIndex
rf.lastIncludedTerm = lastIncludedTerm
rf.commitIndex = lastIncludedIndex
rf.lastApplied = lastIncludedIndex
DPrintf("server %v readPersist succeeded\n", rf.me)
}
}
Index translation
Once the log is truncated, there is a distinction between the real (in-memory) index and the virtual (global, monotonically increasing) index. Every access to logs[] therefore has to go through a conversion.
// convert a virtual (global) index into a real index in the in-memory log slice
func (rf *Raft) ToRealIndex(vidx int) int {
return vidx - rf.lastIncludedIndex
}

// convert a real slice index back into a virtual (global) index
func (rf *Raft) ToVirtualIndex(ridx int) int {
return ridx + rf.lastIncludedIndex
}
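A quick worked example of the convention (logs[0] is the placeholder for the entry at lastIncludedIndex, so the last virtual index is always ToVirtualIndex(len(rf.logs)-1)); the concrete values below are made up for illustration:

func exampleIndexConversion() {
    // Suppose a snapshot covers entries 1..5 and entries 6 and 7 are still in memory.
    rf := &Raft{lastIncludedIndex: 5, logs: []LogEntry{{Term: 2}, {Term: 3}, {Term: 3}}}
    _ = rf.ToRealIndex(7)                    // == 2: virtual index 7 lives at logs[2]
    _ = rf.ToVirtualIndex(len(rf.logs) - 1)  // == 7: the last virtual log index
    _ = rf.ToRealIndex(rf.lastIncludedIndex) // == 0: the placeholder entry
}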
RPC argument definitions
type InstallSnapshotArgs struct {
Term int
LeaderId int
LastIncludedIndex int
LastIncludedTerm int
Data []byte
}
type InstallSnapshotReply struct {
Term int
}
Sending the InstallSnapshot RPC
func (rf *Raft) SendInstallSnapshot(server int) {
rf.mu.Lock()
if rf.state != Leader {
rf.mu.Unlock()
return
}
args := InstallSnapshotArgs{
LeaderId: rf.me,
Term: rf.currentTerm,
LastIncludedIndex: rf.lastIncludedIndex,
LastIncludedTerm: rf.lastIncludedTerm,
Data: rf.snapshot,
}
reply := InstallSnapshotReply{}
DPrintf("Server %v Send IS Args %v To %v", rf.me, args, server)
rf.mu.Unlock()
ok := rf.sendInstallSnapshot(server, &args, &reply)
if !ok {
return
}
rf.mu.Lock()
// All Servers: If RPC request or response contains term T > currentTerm: set currentTerm = T, convert to follower
if reply.Term > rf.currentTerm {
rf.currentTerm = reply.Term
rf.state = Follower
rf.votedFor = None
rf.voteNum = 0
rf.persist()
rf.mu.Unlock()
return
}
// advance this peer's nextIndex to the entry right after the snapshot
rf.nextIndex[server] = rf.ToVirtualIndex(1)
rf.mu.Unlock()
}
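The rf.sendInstallSnapshot stub called above is not shown in this post; it is just the usual labrpc wrapper, sketched below under the assumption that the RPC is registered under the handler's name, HandlerInstallSnapshot:

func (rf *Raft) sendInstallSnapshot(server int, args *InstallSnapshotArgs, reply *InstallSnapshotReply) bool {
    // rf.peers[server] is a *labrpc.ClientEnd; Call returns false on timeout or failure
    return rf.peers[server].Call("Raft.HandlerInstallSnapshot", args, reply)
}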
The tricky part: when should the InstallSnapshot RPC be sent?
- When building an AppendEntries RPC and the follower's nextIndex falls inside the snapshot
- When an AppendEntries reply reports a conflicting term or XIndex that falls inside the snapshot
// periodic heartbeat / log-replication loop
func (rf *Raft) cycleAppendEntries() {
rf.nextBeatTime = time.Now()
for !rf.killed() {
rf.mu.Lock()
// if the server is dead or is not the leader, just return
if rf.state != Leader {
// no longer the leader: stop sending heartbeats
rf.mu.Unlock()
return
}
if !time.Now().After(rf.nextBeatTime) {
rf.mu.Unlock()
continue
}
for i := 0; i < len(rf.peers); i++ {
if i == rf.me {
continue
}
reply := AppendEntriesReply{}
args := AppendEntriesArgs{
LeaderId : rf.me,
Term : rf.currentTerm,
LeaderCommit: rf.commitIndex,
PrevLogIndex: rf.nextIndex[i] - 1,
}
flag := false
// the entries this follower needs have already been compacted into the snapshot
if args.PrevLogIndex < rf.lastIncludedIndex {
flag = true
// If last log index >= nextIndex for a follower: send AppendEntries RPC with log entries starting at nextIndex
} else if rf.ToVirtualIndex(len(rf.logs)-1) > args.PrevLogIndex {
args.Entries = rf.logs[rf.ToRealIndex(args.PrevLogIndex+1):]
DPrintf("Server %v Send AE Args %v To %v", rf.me, args, i)
} else {
args.Entries = make([]LogEntry, 0)
}
if flag {
go rf.SendInstallSnapshot(i)
} else {
args.PrevLogTerm = rf.logs[rf.ToRealIndex(args.PrevLogIndex)].Term
go rf.SendAppendEntries(i, &args, &reply)
}
}
rf.nextBeatTime = time.Now().Add(time.Duration(HeartBeatInterval)*time.Millisecond)
rf.mu.Unlock()
}
}
func (rf *Raft) SendAppendEntries(server int, args *AppendEntriesArgs, reply *AppendEntriesReply) {
ok := rf.sendAppendEntries(server, args, reply)
if !ok {
return
}
rf.mu.Lock()
// a reply for an RPC from an older term: drop it
if rf.currentTerm != args.Term {
DPrintf("Old RPC")
rf.mu.Unlock()
return
}
// All Servers: If RPC request or response contains term T > currentTerm: set currentTerm = T, convert to follower
if reply.Term > rf.currentTerm {
rf.currentTerm = reply.Term
rf.state = Follower
rf.votedFor = None
rf.voteNum = 0
rf.persist()
rf.mu.Unlock()
return
}
//if successful: update nextIndex and matchIndex for follower
if reply.Success {
rf.matchIndex[server] = args.PrevLogIndex + len(args.Entries)
rf.nextIndex[server] = args.PrevLogIndex + len(args.Entries) + 1
// How do we make sure a majority commits the same entry without committing anything twice?
// Each reply only advances commitIndex up to the newest index this leader can prove is replicated, so entries are never re-committed
// A leader must not commit entries from previous terms by counting replicas; only entries from its own term count
//If there exists an N such that N > commitIndex, a majority of matchIndex[i] >= N, and log[N].Term == currentTerm: set commitIndex = N
N := rf.ToVirtualIndex(len(rf.logs) - 1)
for N > rf.commitIndex {
cnt := 1
for i:= 0; i < len(rf.peers); i++{
if i == rf.me {
continue
}
if rf.matchIndex[i] >= N && rf.logs[rf.ToRealIndex(N)].Term == rf.currentTerm {
cnt++
}
}
if cnt > len(rf.peers) / 2{
rf.commitIndex = N
DPrintf("update commitIndex to %v",N)
break
}
N-= 1
}
rf.mu.Unlock()
return
}
if reply.Term == rf.currentTerm && rf.state == Leader {
//if fails because of log inconsistency: decrement nextIndex and retry
//Upon receiving a conflict response, the leader should first search its log for conflictTerm.
// If it finds an entry in its log with that term, it should set nextIndex to be the one beyond the index of the last entry in that term in its log.
DPrintf("Server %v reply: conflictTerm %v conflictIndex %v", server, reply.XTerm, reply.XIndex)
// XTerm == -1: the follower's log is too short to contain PrevLogIndex
if reply.XTerm == -1 {
// the whole missing range is already covered by the leader's snapshot
if rf.lastIncludedIndex >= reply.XLen {
go rf.SendInstallSnapshot(server)
} else {
rf.nextIndex[server] = reply.XLen
}
DPrintf(" ConflictIndex By Too Short Follower Len . Leader logs %v",rf.logs)
rf.mu.Unlock()
return
}
}
// otherwise, search the leader's log for the follower's conflicting term (XTerm)
i := rf.nextIndex[server] - 1
if i < rf.lastIncludedIndex {
i = rf.lastIncludedIndex
}
for i > rf.lastIncludedIndex && rf.logs[rf.ToRealIndex(i)].Term > reply.XTerm {
i -= 1
}
// the conflicting term does not exist anywhere in the leader's current log
if i == rf.lastIncludedIndex && rf.logs[rf.ToRealIndex(i)].Term > reply.XTerm {
go rf.SendInstallSnapshot(server)
// the leader's log does contain the conflicting term
} else if rf.logs[rf.ToRealIndex(i)].Term == reply.XTerm {
rf.nextIndex[server] = i + 1
} else {
// the follower's term at the conflict position is one the leader does not have
DPrintf("leader %v got a back-up request from server %v: conflicting term %v starts at index %v on the follower, but the leader has no entry with that term; nextIndex[%v] before=%v, after=%v\n", rf.me, server, reply.XTerm, reply.XIndex, server, rf.nextIndex[server], reply.XIndex)
if reply.XIndex <= rf.lastIncludedIndex {
// XIndex has already been compacted into the snapshot
// fall back to sending an InstallSnapshot
go rf.SendInstallSnapshot(server)
} else {
rf.nextIndex[server] = reply.XIndex
}
}
//If it does not find an entry with that term, it should set nextIndex = conflictIndex
rf.mu.Unlock()
return
}
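For reference, the fast-backup fields used above (XTerm, XIndex, XLen) come from the 2B/2C AppendEntriesReply. That struct is not shown in this post, but based on how the code uses the fields it is assumed to look roughly like this:

type AppendEntriesReply struct {
    Term    int
    Success bool
    XTerm   int // term of the conflicting entry on the follower, -1 if the follower's log is too short
    XIndex  int // first index the follower holds for XTerm
    XLen    int // length of the follower's log, used when XTerm == -1
}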
Handling the InstallSnapshot RPC
func (rf *Raft) HandlerInstallSnapshot(args *InstallSnapshotArgs, reply *InstallSnapshotReply) {
rf.mu.Lock()
DPrintf("Server %v received InstallSnapshot RPC from Server %v, args %v", rf.me, args.LeaderId, args)
DPrintf("Server %v lastApplied %v commitIndex %v lastIncludedIndex %v lastIncludedTerm %v log %v", rf.me, rf.lastApplied, rf.commitIndex, rf.lastIncludedIndex, rf.lastIncludedTerm, rf.logs)
defer func() {
// an InstallSnapshot from the current leader should reset the election timer
rf.stamp = time.Now()
rf.mu.Unlock()
DPrintf("server %v handled InstallSnapshot from leader %v, reset election timer", rf.me, args.LeaderId)
}()
// 1. Reply immediately if term < currentTerm
if args.Term < rf.currentTerm {
reply.Term = rf.currentTerm
return
}
if args.Term > rf.currentTerm {
rf.currentTerm = args.Term
rf.votedFor = -1
DPrintf("server %v 接受来自 %v 的 InstallSnapshot, 且发现了更大的Term\n", rf.me, args.LeaderId)
}
rf.state = Follower
logIndex := args.LastIncludedIndex - rf.lastIncludedIndex // compare the leader's snapshot with the local one
if logIndex < 0 {
return
}
if logIndex < len(rf.logs) && rf.logs[logIndex].Term == args.LastIncludedTerm { // step 5
DPrintf("same log entry. server id:%v lastIncludedIndex:%v log len:%v\n", rf.me, rf.lastIncludedIndex, len(rf.logs))
return
}
// If existing log entry has same index and term as snapshot's last
// included entry, retain log entries following it and reply
hasEntry := false
idx := 0
for ; idx < len(rf.logs); idx++ {
if rf.ToVirtualIndex(idx) == args.LastIncludedIndex && rf.logs[idx].Term == args.LastIncludedTerm {
hasEntry = true
break
}
}
// found a matching entry: retain the entries that follow it
if hasEntry {
rf.logs = rf.logs[idx:]
} else {
// Discard the entire log
rf.logs = make([]LogEntry, 0)
rf.logs = append(rf.logs, LogEntry{Term: args.LastIncludedTerm}) // placeholder at index 0 for the snapshot's last included entry
}
// Reset state machine using snapshot contents
msg := &ApplyMsg{
SnapshotValid : true,
Snapshot : args.Data,
SnapshotTerm: args.LastIncludedTerm,
SnapshotIndex: args.LastIncludedIndex,
}
// Save snapshot
rf.snapshot = args.Data
// update the snapshot metadata
rf.lastIncludedIndex = args.LastIncludedIndex
rf.lastIncludedTerm = args.LastIncludedTerm
if rf.commitIndex < args.LastIncludedIndex {
rf.commitIndex = args.LastIncludedIndex
}
if rf.lastApplied < args.LastIncludedIndex {
rf.lastApplied = args.LastIncludedIndex
}
rf.applyCh <- *msg
rf.persist()
return
}
Applying committed entries to the state machine
func (rf *Raft) appliyToState() {
//All Servers
//if commitIndex > lastApplied:increment lastApplied,apply log[lastApplied] to state machine
for !rf.killed() {
rf.mu.Lock()
sendMsg := false
var msg ApplyMsg
if rf.commitIndex > rf.lastApplied {
rf.lastApplied += 1
sendMsg = true
msg = ApplyMsg{
Command: rf.logs[rf.ToRealIndex(rf.lastApplied)].Command,
CommandIndex: rf.lastApplied,
CommandValid:true,
}
DPrintf("server %v appliy %v to state index %v",rf.me,rf.logs[rf.ToRealIndex(rf.lastApplied)].Command, rf.lastApplied)
}
rf.mu.Unlock()
if sendMsg { // 向service发送消息
rf.applyCh <- msg
} else { // 此次未发送消息,休眠
time.Sleep(ApplyInterval * time.Millisecond)
}
}
}
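On the other end of applyCh the service has to tell the two kinds of messages apart. A hypothetical sketch of that consumer loop (drainApplyCh, apply and restore are illustrative names, not code from this lab):

// Hypothetical service-side loop draining applyCh; apply and restore stand in for
// whatever the service does with committed commands and installed snapshots.
func drainApplyCh(applyCh <-chan ApplyMsg, apply func(cmd interface{}, index int), restore func(snapshot []byte)) {
    for msg := range applyCh {
        if msg.CommandValid {
            apply(msg.Command, msg.CommandIndex) // apply a committed command to the state machine
        } else if msg.SnapshotValid {
            restore(msg.Snapshot) // reset the state machine from the snapshot
        }
    }
}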
About testing
The fiddly index conversions in 2D lead to plenty of out-of-range bugs that you have to dig out of large amounts of logging, and the 2D tests also expose latent defects from 2A, 2B and 2C. 2D is the last part of the Raft lab and the most painful one.
- You can run the tests one at a time, e.g.:
rm outD
go test -run TestSnapshotInstallUnreliable2D > outD
Test (2D): snapshots basic ...
... Passed -- 4.6 3 138 48082 228
Test (2D): install snapshots (disconnect) ...
... Passed -- 41.7 3 1071 482506 331
Test (2D): install snapshots (disconnect+unreliable) ...
... Passed -- 62.5 3 1537 598130 339
Test (2D): install snapshots (crash) ...
... Passed -- 31.7 3 734 387263 330
Test (2D): install snapshots (unreliable+crash) ...
... Passed -- 37.5 3 824 395189 296
Test (2D): crash and restart all servers ...
... Passed -- 8.0 3 236 67112 53
Test (2D): snapshot initialization after crash ...
... Passed -- 2.5 3 68 18886 14
PASS
ok 6.5840/raft 188.568s
Full Lab 2 test run:
vv@ubuntu:~/6.5840/src/raft$ time go test
Test (2A): initial election ...
... Passed -- 3.1 3 62 16820 0
Test (2A): election after network failure ...
... Passed -- 4.5 3 132 25502 0
Test (2A): multiple elections ...
... Passed -- 5.5 7 720 138482 0
Test (2B): basic agreement ...
... Passed -- 0.6 3 16 4324 3
Test (2B): RPC byte count ...
... Passed -- 1.5 3 48 113712 11
Test (2B): test progressive failure of followers ...
... Passed -- 4.7 3 130 26194 3
Test (2B): test failure of leaders ...
... Passed -- 5.0 3 195 41140 3
Test (2B): agreement after follower reconnects ...
... Passed -- 5.6 3 134 34653 8
Test (2B): no agreement if too many followers disconnect ...
... Passed -- 3.4 5 224 43394 4
Test (2B): concurrent Start()s ...
... Passed -- 0.6 3 16 4356 6
Test (2B): rejoin of partitioned leader ...
... Passed -- 4.1 3 146 32962 4
Test (2B): leader backs up quickly over incorrect follower logs ...
... Passed -- 19.0 5 1932 1305957 103
Test (2B): RPC counts aren't too high ...
... Passed -- 2.2 3 44 12384 12
Test (2C): basic persistence ...
... Passed -- 3.6 3 86 21664 6
Test (2C): more persistence ...
... Passed -- 15.9 5 980 200948 16
Test (2C): partitioned leader and one follower crash, leader restarts ...
... Passed -- 1.6 3 34 8632 4
Test (2C): Figure 8 ...
... Passed -- 30.1 5 1088 229143 58
Test (2C): unreliable agreement ...
... Passed -- 1.7 5 560 187381 246
Test (2C): Figure 8 (unreliable) ...
... Passed -- 37.2 5 9816 17454663 330
Test (2C): churn ...
... Passed -- 16.2 5 9512 28084585 2402
Test (2C): unreliable churn ...
... Passed -- 16.3 5 3628 4791630 874
Test (2D): snapshots basic ...
... Passed -- 4.4 3 136 46146 201
Test (2D): install snapshots (disconnect) ...
... Passed -- 43.2 3 1114 522180 365
Test (2D): install snapshots (disconnect+unreliable) ...
... Passed -- 63.6 3 1612 637606 343
Test (2D): install snapshots (crash) ...
... Passed -- 30.7 3 725 365511 332
Test (2D): install snapshots (unreliable+crash) ...
... Passed -- 35.6 3 800 412641 326
Test (2D): crash and restart all servers ...
... Passed -- 8.3 3 240 68212 54
Test (2D): snapshot initialization after crash ...
... Passed -- 2.5 3 68 18852 14
PASS
ok 6.5840/raft 370.873s
real 6m11.360s
user 5m40.683s
sys 1m17.473s