MIT 6.824 Lab 2D: Raft Implementation Notes

Lab handout: MIT 6.824 Lab 2 Raft

Chinese translation: MIT 6.824 Lab 2 (Chinese translation)

Raft paper: In Search of an Understandable Consensus Algorithm

Raft paper (Chinese translation): zhuanlan.zhihu.com/p/524885008

Introduction

Lab 2D asks us to implement log compaction. Raft is a consensus algorithm built on log replication: every executed command appends a new log entry, and once the log grows too long it drags the server down, so log compaction is a must.

Log compaction here boils down to truncating the log held in memory and handing the truncated prefix off as a snapshot, stored in binary form.
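
For reference while reading the snippets below, this is a minimal sketch of the snapshot-related state each Raft peer keeps in my implementation; the field names match the later code, and all of the 2A-2C fields are omitted:

type LogEntry struct {
	Term    int
	Command interface{}
}

type Raft struct {
	// ... all the usual 2A-2C fields (mu, peers, me, currentTerm, votedFor, commitIndex, lastApplied, ...) ...

	logs              []LogEntry // logs[0] is a placeholder for the last entry covered by the snapshot
	lastIncludedIndex int        // global index of the last entry covered by the snapshot
	lastIncludedTerm  int        // term of that entry
	snapshot          []byte     // raw snapshot bytes handed down by the service layer
}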

As before, 2D is implemented strictly following the description in the Raft paper. The reference figure is below:

[Figure: reference diagram from the Raft paper]

2D adds a new RPC: InstallSnapshot. This lab does not require implementing offset-based (chunked) snapshot transfer.

Interaction between the Raft layer and the Server layer

[Figure: interaction between the Raft layer and the Server layer]

The Raft layer and the Server layer communicate through the apply channel (applyCh). Within the cluster, the Server layers on different machines never talk to each other directly; all coordination goes through the Raft layer. So when a command travels from a client to the Server, the Server hands it to Raft, which replicates it asynchronously; only after the entry has been replicated to a majority and committed does the Server apply it and report success back to the client.
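
The messages flowing over applyCh are the lab skeleton's ApplyMsg, reproduced here because the snapshot-related fields are exactly what HandlerInstallSnapshot and the apply loop below fill in:

type ApplyMsg struct {
	CommandValid bool
	Command      interface{}
	CommandIndex int

	// For 2D: deliver an installed snapshot to the service layer.
	SnapshotValid bool
	Snapshot      []byte
	SnapshotTerm  int
	SnapshotIndex int
}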

Log compaction is also initiated from the Server layer: the Server has a maxraftstate field (which we will meet again in Lab 3) and calls the Raft layer's Snapshot function when the persisted state grows too large (a rough sketch of that trigger appears after the Snapshot code below).

// The layer above Raft decides when to take a snapshot.
func (rf *Raft) Snapshot(index int, snapshot []byte) {
	// Your code here (2D).
	// The service hands down a snapshot covering all state up to and including index.
	rf.mu.Lock()
	defer rf.mu.Unlock()
	// Only committed entries may be snapshotted, and never behind the current snapshot.
	if index > rf.commitIndex || index <= rf.lastIncludedIndex {
		return
	}
	DPrintf("Server %v received snapshot from the service, index %v", rf.me, index)
	rf.snapshot = snapshot
	// Truncate entries 1..index.
	// index is a global (virtual) index and must be converted first.
	ridx := rf.ToRealIndex(index)
	// Note: the entry at index stays in logs[0] as the placeholder.
	rf.lastIncludedIndex = index
	old := len(rf.logs)
	rf.lastIncludedTerm = rf.logs[ridx].Term
	rf.logs = append([]LogEntry{}, rf.logs[ridx:]...)
	DPrintf("Server %v cut logs: old len %v, new len %v", rf.me, old, len(rf.logs))
	if rf.lastApplied < index {
		rf.lastApplied = index
	}
	rf.persist()
}
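
For context, a rough sketch of how the service layer might trigger this in Lab 3; the KVServer type, its kvStore field, and the maybeSnapshot helper are hypothetical names of mine, while persister.RaftStateSize() is the skeleton's way to measure the persisted Raft state:

func (kv *KVServer) maybeSnapshot(appliedIndex int) {
	// maxraftstate == -1 means snapshotting is disabled.
	if kv.maxraftstate == -1 {
		return
	}
	// Only snapshot once the persisted Raft state has grown past the threshold.
	if kv.persister.RaftStateSize() < kv.maxraftstate {
		return
	}
	// Serialize the service state and hand it down to Raft.
	w := new(bytes.Buffer)
	e := labgob.NewEncoder(w)
	e.Encode(kv.kvStore)
	kv.rf.Snapshot(appliedIndex, w.Bytes())
}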

Reading the snapshot

Called from Make (at service startup):

func (rf *Raft) readSnapshot(data []byte) {
	if len(data) == 0 {
		return
	}
	rf.snapshot = data
	DPrintf("server %v read snapshot successfully", rf.me)
}
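
For completeness, a sketch of how Make wires recovery together at startup; only the two recovery lines matter here, the rest is the usual skeleton flow:

func Make(peers []*labrpc.ClientEnd, me int, persister *Persister, applyCh chan ApplyMsg) *Raft {
	rf := &Raft{}
	rf.peers = peers
	rf.persister = persister
	rf.me = me
	rf.applyCh = applyCh
	// ... initialize the usual 2A-2C state ...

	// Restore state that survived a crash: the encoded Raft state first,
	// then the snapshot bytes that were saved alongside it.
	rf.readPersist(persister.ReadRaftState())
	rf.readSnapshot(persister.ReadSnapshot())

	// ... start the election ticker, heartbeat loop, and apply goroutine ...
	return rf
}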

When persisting, also persist the snapshot:

func (rf *Raft) persist() {
	// Your code here (2C).
	w := new(bytes.Buffer)
	e := labgob.NewEncoder(w)
	e.Encode(rf.currentTerm)
	e.Encode(rf.votedFor)
	e.Encode(rf.logs)
	e.Encode(rf.lastIncludedIndex)
	e.Encode(rf.lastIncludedTerm)
	raftstate := w.Bytes()
	rf.persister.Save(raftstate, rf.snapshot)
}

// restore previously persisted state.
func (rf *Raft) readPersist(data []byte) {
	if data == nil || len(data) <= 1 { // bootstrap without any state?
		return
	}
	// Your code here (2C).
	r := bytes.NewBuffer(data)
	d := labgob.NewDecoder(r)
	var currentTerm int
	var votedFor int
	var logs []LogEntry
	var lastIncludedIndex int
	var lastIncludedTerm int
	if d.Decode(&currentTerm) != nil ||
		d.Decode(&votedFor) != nil ||
		d.Decode(&logs) != nil ||
		d.Decode(&lastIncludedIndex) != nil ||
		d.Decode(&lastIncludedTerm) != nil {
		DPrintf("Server %v readPersist failed", rf.me)
	} else {
		rf.currentTerm = currentTerm
		rf.votedFor = votedFor
		rf.logs = logs
		rf.lastIncludedIndex = lastIncludedIndex
		rf.lastIncludedTerm = lastIncludedTerm
		rf.commitIndex = lastIncludedIndex
		rf.lastApplied = lastIncludedIndex
		DPrintf("server %v readPersist succeeded\n", rf.me)
	}
}

Index bookkeeping

Once the log is truncated, we have to distinguish the real (in-memory) log index from the virtual (global) log index. Every place that indexes into logs[] therefore has to go through a conversion.

// Convert a virtual (global) index into a real index into rf.logs.
func (rf *Raft) ToRealIndex(vidx int) int {
	return vidx - rf.lastIncludedIndex
}

// Convert a real index into rf.logs back into a virtual (global) index.
func (rf *Raft) ToVirtualIndex(ridx int) int {
	return ridx + rf.lastIncludedIndex
}
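
A quick worked example of the mapping (numbers made up): suppose the snapshot covers everything up to global index 5.

// With rf.lastIncludedIndex == 5:
//   rf.ToRealIndex(5) == 0                 // logs[0] is the placeholder for entry 5
//   rf.ToRealIndex(7) == 2                 // global index 7 lives at logs[2]
//   rf.ToVirtualIndex(len(rf.logs) - 1)    // global index of the last entry still held in memory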

RPC argument definitions

type InstallSnapshotArgs struct {
	Term              int
	LeaderId          int
	LastIncludedIndex int
	LastIncludedTerm  int
	Data              []byte
}

type InstallSnapshotReply struct {
	Term int
}

Sending the InstallSnapshot RPC

func (rf *Raft) SendInstallSnapshot(server int) {
	rf.mu.Lock()
	if rf.state != Leader {
		rf.mu.Unlock()
		return
	}
	args := InstallSnapshotArgs{
		LeaderId:          rf.me,
		Term:              rf.currentTerm,
		LastIncludedIndex: rf.lastIncludedIndex,
		LastIncludedTerm:  rf.lastIncludedTerm,
		Data:              rf.snapshot,
	}
	reply := InstallSnapshotReply{}
	DPrintf("Server %v sends InstallSnapshot args %v to %v", rf.me, args, server)
	rf.mu.Unlock()
	ok := rf.sendInstallSnapshot(server, &args, &reply)
	if !ok {
		return
	}
	rf.mu.Lock()

	// All Servers: if an RPC request or response contains term T > currentTerm,
	// set currentTerm = T and convert to follower.
	if reply.Term > rf.currentTerm {
		rf.currentTerm = reply.Term
		rf.state = Follower
		rf.votedFor = None
		rf.voteNum = 0
		rf.persist()
		rf.mu.Unlock()
		return
	}
	// Advance this follower's matchIndex and nextIndex past the snapshot we just sent.
	rf.matchIndex[server] = args.LastIncludedIndex
	rf.nextIndex[server] = args.LastIncludedIndex + 1
	rf.mu.Unlock()
}
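
SendInstallSnapshot calls a lower-case sendInstallSnapshot helper that just issues the labrpc call, in the same style as the skeleton's sendRequestVote; roughly:

func (rf *Raft) sendInstallSnapshot(server int, args *InstallSnapshotArgs, reply *InstallSnapshotReply) bool {
	// labrpc dispatches this to the peer's HandlerInstallSnapshot method defined below.
	ok := rf.peers[server].Call("Raft.HandlerInstallSnapshot", args, reply)
	return ok
}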

The tricky part: when do we send the InstallSnapshot RPC?

  • When building an AppendEntries RPC and the follower's nextIndex falls inside the snapshot (the entries it needs have already been compacted).
  • When an AppendEntries reply reports a conflicting term or XIndex that falls inside the snapshot.
// Periodic heartbeat loop.
func (rf *Raft) cycleAppendEntries() {
	rf.nextBeatTime = time.Now()
	for !rf.killed() {
		rf.mu.Lock()
		// If this server is no longer the leader, stop sending heartbeats.
		if rf.state != Leader {
			rf.mu.Unlock()
			return
		}
		if !time.Now().After(rf.nextBeatTime) {
			rf.mu.Unlock()
			continue
		}
		for i := 0; i < len(rf.peers); i++ {
			if i == rf.me {
				continue
			}
			reply := AppendEntriesReply{}
			args := AppendEntriesArgs{
				LeaderId:     rf.me,
				Term:         rf.currentTerm,
				LeaderCommit: rf.commitIndex,
				PrevLogIndex: rf.nextIndex[i] - 1,
			}
			flag := false
			// The entries this follower needs have already been compacted into the snapshot.
			if args.PrevLogIndex < rf.lastIncludedIndex {
				flag = true
			} else if rf.ToVirtualIndex(len(rf.logs)-1) > args.PrevLogIndex {
				// If last log index >= nextIndex for a follower: send an
				// AppendEntries RPC with log entries starting at nextIndex.
				args.Entries = rf.logs[rf.ToRealIndex(args.PrevLogIndex+1):]
				DPrintf("Server %v sends AE args %v to %v", rf.me, args, i)
			} else {
				args.Entries = make([]LogEntry, 0)
			}
			if flag {
				go rf.SendInstallSnapshot(i)
			} else {
				args.PrevLogTerm = rf.logs[rf.ToRealIndex(args.PrevLogIndex)].Term
				go rf.SendAppendEntries(i, &args, &reply)
			}
		}
		rf.nextBeatTime = time.Now().Add(time.Duration(HeartBeatInterval) * time.Millisecond)
		rf.mu.Unlock()
	}
}
func (rf *Raft) SendAppendEntries(server int, args *AppendEntriesArgs, reply *AppendEntriesReply) {
	ok := rf.sendAppendEntries(server, args, reply)
	if !ok {
		return
	}
	rf.mu.Lock()
	// Drop replies that belong to an older term's RPC.
	if rf.currentTerm != args.Term {
		DPrintf("Old RPC")
		rf.mu.Unlock()
		return
	}

	// All Servers: if an RPC request or response contains term T > currentTerm,
	// set currentTerm = T and convert to follower.
	if reply.Term > rf.currentTerm {
		rf.currentTerm = reply.Term
		rf.state = Follower
		rf.votedFor = None
		rf.voteNum = 0
		rf.persist()
		rf.mu.Unlock()
		return
	}

	// If successful: update nextIndex and matchIndex for the follower.
	if reply.Success {
		rf.matchIndex[server] = args.PrevLogIndex + len(args.Entries)
		rf.nextIndex[server] = args.PrevLogIndex + len(args.Entries) + 1
		// How do we get a majority to commit the same entry without committing it twice?
		// Each reply only advances commitIndex to the highest N it can justify, and a leader
		// never commits entries from a previous term by counting replicas.
		// If there exists an N such that N > commitIndex, a majority of matchIndex[i] >= N,
		// and log[N].Term == currentTerm: set commitIndex = N.
		N := rf.ToVirtualIndex(len(rf.logs) - 1)
		for N > rf.commitIndex {
			cnt := 1
			for i := 0; i < len(rf.peers); i++ {
				if i == rf.me {
					continue
				}
				if rf.matchIndex[i] >= N && rf.logs[rf.ToRealIndex(N)].Term == rf.currentTerm {
					cnt++
				}
			}
			if cnt > len(rf.peers)/2 {
				rf.commitIndex = N
				DPrintf("update commitIndex to %v", N)
				break
			}
			N -= 1
		}
		rf.mu.Unlock()
		return
	}

	if reply.Term == rf.currentTerm && rf.state == Leader {
		// The AppendEntries failed because of log inconsistency: decrement nextIndex and retry.
		// Upon receiving a conflict response, the leader first searches its log for the conflicting
		// term (XTerm). If it finds an entry with that term, it sets nextIndex to one beyond the
		// index of the last entry of that term in its own log.
		DPrintf("Server %v reply: conflictTerm %v conflictIndex %v", server, reply.XTerm, reply.XIndex)
		// XTerm == -1 means the follower's log is too short to contain PrevLogIndex at all.
		if reply.XTerm == -1 {
			if rf.lastIncludedIndex >= reply.XLen {
				// Even the follower's last index is already inside our snapshot: send the snapshot.
				go rf.SendInstallSnapshot(server)
			} else {
				rf.nextIndex[server] = reply.XLen
			}

			DPrintf("conflict caused by a too-short follower log. Leader logs %v", rf.logs)
			rf.mu.Unlock()
			return
		}
		// Otherwise the follower reported a conflicting term: look for that term in our own log.
		i := rf.nextIndex[server] - 1
		if i < rf.lastIncludedIndex {
			i = rf.lastIncludedIndex
		}
		for i > rf.lastIncludedIndex && rf.logs[rf.ToRealIndex(i)].Term > reply.XTerm {
			i -= 1
		}
		if i == rf.lastIncludedIndex && rf.logs[rf.ToRealIndex(i)].Term > reply.XTerm {
			// The term would live inside our snapshot: send the snapshot instead.
			go rf.SendInstallSnapshot(server)
		} else if rf.logs[rf.ToRealIndex(i)].Term == reply.XTerm {
			// We do have XTerm: back nextIndex up to just past our last entry of that term.
			rf.nextIndex[server] = i + 1
		} else {
			// We do not have XTerm at all: set nextIndex to the follower's first index of that term (XIndex).
			DPrintf("leader %v got a backup request from server %v: conflicting term %v starts at index %v on the follower, leader has no such term, nextIndex[%v] %v -> %v\n", rf.me, server, reply.XTerm, reply.XIndex, server, rf.nextIndex[server], reply.XIndex)
			if reply.XIndex <= rf.lastIncludedIndex {
				// XIndex has been compacted away as well: send the snapshot.
				go rf.SendInstallSnapshot(server)
			} else {
				rf.nextIndex[server] = reply.XIndex
			}
		}
		rf.mu.Unlock()
		return
	}
	rf.mu.Unlock()
}
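
For reference, the AppendEntries argument and reply types used above look roughly like this in my implementation; the field set (including the fast-backup fields XTerm/XIndex/XLen) is inferred from the code in this post:

type AppendEntriesArgs struct {
	Term         int
	LeaderId     int
	PrevLogIndex int
	PrevLogTerm  int
	Entries      []LogEntry
	LeaderCommit int
}

type AppendEntriesReply struct {
	Term    int
	Success bool
	XTerm   int // term of the conflicting entry on the follower; -1 if the follower's log is too short
	XIndex  int // first index the follower holds for XTerm
	XLen    int // (virtual) length of the follower's log, used when XTerm == -1
}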

Handling the InstallSnapshot RPC

func (rf *Raft) HandlerInstallSnapshot(args *InstallSnapshotArgs, reply *InstallSnapshotReply) {
	// 1. Reply immediately if term < currentTerm.
	rf.mu.Lock()
	DPrintf("Server %v received InstallSnapshot from Server %v, args %v", rf.me, args.LeaderId, args)
	DPrintf("Server %v lastApplied %v commitIndex %v lastIncludedIndex %v lastIncludedTerm %v log %v", rf.me, rf.lastApplied, rf.commitIndex, rf.lastIncludedIndex, rf.lastIncludedTerm, rf.logs)
	defer func() {
		// Hearing an InstallSnapshot from the leader resets the election timer.
		rf.stamp = time.Now()
		rf.mu.Unlock()
		DPrintf("server %v received InstallSnapshot from leader %v, reset election timer", rf.me, args.LeaderId)
	}()
	if args.Term < rf.currentTerm {
		reply.Term = rf.currentTerm
		return
	}
	if args.Term > rf.currentTerm {
		rf.currentTerm = args.Term
		rf.votedFor = -1
		DPrintf("server %v accepted InstallSnapshot from %v and saw a larger term\n", rf.me, args.LeaderId)
	}

	rf.state = Follower

	// Compare the leader's snapshot against our own.
	logIndex := args.LastIncludedIndex - rf.lastIncludedIndex
	if logIndex < 0 {
		// The leader's snapshot is older than ours: nothing to do.
		return
	}
	if logIndex < len(rf.logs) && rf.logs[logIndex].Term == args.LastIncludedTerm { // step 5
		// We already hold that entry with the same term: keep our log as is.
		DPrintf("same log entry. server id:%v lastIncludedIndex:%v  log len:%v\n", rf.me, rf.lastIncludedIndex, len(rf.logs))
		return
	}
	// If an existing log entry has the same index and term as the snapshot's last
	// included entry, retain the log entries following it and reply.
	hasEntry := false
	idx := 0
	for ; idx < len(rf.logs); idx++ {
		if rf.ToVirtualIndex(idx) == args.LastIncludedIndex && rf.logs[idx].Term == args.LastIncludedTerm {
			hasEntry = true
			break
		}
	}
	if hasEntry {
		// Found: keep the suffix; the matching entry becomes the new placeholder at logs[0].
		rf.logs = rf.logs[idx:]
	} else {
		// Discard the entire log; logs[0] becomes a placeholder for the snapshot's last entry.
		rf.logs = make([]LogEntry, 0)
		rf.logs = append(rf.logs, LogEntry{Term: args.LastIncludedTerm})
	}

	// Reset the state machine using the snapshot contents.
	msg := &ApplyMsg{
		SnapshotValid: true,
		Snapshot:      args.Data,
		SnapshotTerm:  args.LastIncludedTerm,
		SnapshotIndex: args.LastIncludedIndex,
	}
	// Save the snapshot and update the snapshot indices.
	rf.snapshot = args.Data
	rf.lastIncludedIndex = args.LastIncludedIndex
	rf.lastIncludedTerm = args.LastIncludedTerm

	if rf.commitIndex < args.LastIncludedIndex {
		rf.commitIndex = args.LastIncludedIndex
	}

	if rf.lastApplied < args.LastIncludedIndex {
		rf.lastApplied = args.LastIncludedIndex
	}

	rf.applyCh <- *msg
	rf.persist()
}

Applying to the state machine

func (rf *Raft) appliyToState() {
	// All Servers:
	// if commitIndex > lastApplied: increment lastApplied, apply log[lastApplied] to the state machine.
	for !rf.killed() {
		rf.mu.Lock()
		sendMsg := false
		var msg ApplyMsg
		if rf.commitIndex > rf.lastApplied {
			rf.lastApplied += 1
			sendMsg = true
			msg = ApplyMsg{
				Command:      rf.logs[rf.ToRealIndex(rf.lastApplied)].Command,
				CommandIndex: rf.lastApplied,
				CommandValid: true,
			}
			DPrintf("server %v applies %v to the state machine at index %v", rf.me, msg.Command, rf.lastApplied)
		}
		rf.mu.Unlock()
		if sendMsg {
			// Deliver the message to the service layer.
			rf.applyCh <- msg
		} else {
			// Nothing to apply this round: sleep for a bit.
			time.Sleep(ApplyInterval * time.Millisecond)
		}
	}
}
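
The snippets above also reference a few names defined elsewhere in my code (None, HeartBeatInterval, ApplyInterval). Plausible definitions look like this; the exact values are assumptions, chosen so heartbeats stay within the tester's limit of roughly ten per second:

const (
	None = -1 // votedFor value meaning "voted for nobody"

	HeartBeatInterval = 100 // ms between leader heartbeat rounds (assumed; tester allows ~10 heartbeats/s)
	ApplyInterval     = 10  // ms the apply goroutine sleeps when there is nothing to apply (assumed)
)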

About testing

2D's fiddly index conversions produce plenty of out-of-range bugs that you have to hunt down in piles of log output, and the 2D tests also expose latent defects in 2A, 2B, and 2C. 2D is the last part of the whole Raft lab, and the most painful one.

  • You can run the tests one at a time to narrow things down, for example:
rm outD
go test -run TestSnapshotInstallUnreliable2D > outD
Test (2D): snapshots basic ...
  ... Passed --   4.6  3  138   48082  228
Test (2D): install snapshots (disconnect) ...
  ... Passed --  41.7  3 1071  482506  331
Test (2D): install snapshots (disconnect+unreliable) ...
  ... Passed --  62.5  3 1537  598130  339
Test (2D): install snapshots (crash) ...
  ... Passed --  31.7  3  734  387263  330
Test (2D): install snapshots (unreliable+crash) ...
  ... Passed --  37.5  3  824  395189  296
Test (2D): crash and restart all servers ...
  ... Passed --   8.0  3  236   67112   53
Test (2D): snapshot initialization after crash ...
  ... Passed --   2.5  3   68   18886   14
PASS
ok  	6.5840/raft	188.568s

Full Lab 2 test run

vv@ubuntu:~/6.5840/src/raft$ time go test
Test (2A): initial election ...
  ... Passed --   3.1  3   62   16820    0
Test (2A): election after network failure ...
  ... Passed --   4.5  3  132   25502    0
Test (2A): multiple elections ...
  ... Passed --   5.5  7  720  138482    0
Test (2B): basic agreement ...
  ... Passed --   0.6  3   16    4324    3
Test (2B): RPC byte count ...
  ... Passed --   1.5  3   48  113712   11
Test (2B): test progressive failure of followers ...
  ... Passed --   4.7  3  130   26194    3
Test (2B): test failure of leaders ...
  ... Passed --   5.0  3  195   41140    3
Test (2B): agreement after follower reconnects ...
  ... Passed --   5.6  3  134   34653    8
Test (2B): no agreement if too many followers disconnect ...
  ... Passed --   3.4  5  224   43394    4
Test (2B): concurrent Start()s ...
  ... Passed --   0.6  3   16    4356    6
Test (2B): rejoin of partitioned leader ...
  ... Passed --   4.1  3  146   32962    4
Test (2B): leader backs up quickly over incorrect follower logs ...
  ... Passed --  19.0  5 1932 1305957  103
Test (2B): RPC counts aren't too high ...
  ... Passed --   2.2  3   44   12384   12
Test (2C): basic persistence ...
  ... Passed --   3.6  3   86   21664    6
Test (2C): more persistence ...
  ... Passed --  15.9  5  980  200948   16
Test (2C): partitioned leader and one follower crash, leader restarts ...
  ... Passed --   1.6  3   34    8632    4
Test (2C): Figure 8 ...
  ... Passed --  30.1  5 1088  229143   58
Test (2C): unreliable agreement ...
  ... Passed --   1.7  5  560  187381  246
Test (2C): Figure 8 (unreliable) ...
  ... Passed --  37.2  5 9816 17454663  330
Test (2C): churn ...
  ... Passed --  16.2  5 9512 28084585 2402
Test (2C): unreliable churn ...
  ... Passed --  16.3  5 3628 4791630  874
Test (2D): snapshots basic ...
  ... Passed --   4.4  3  136   46146  201
Test (2D): install snapshots (disconnect) ...
  ... Passed --  43.2  3 1114  522180  365
Test (2D): install snapshots (disconnect+unreliable) ...
  ... Passed --  63.6  3 1612  637606  343
Test (2D): install snapshots (crash) ...
  ... Passed --  30.7  3  725  365511  332
Test (2D): install snapshots (unreliable+crash) ...
  ... Passed --  35.6  3  800  412641  326
Test (2D): crash and restart all servers ...
  ... Passed --   8.3  3  240   68212   54
Test (2D): snapshot initialization after crash ...
  ... Passed --   2.5  3   68   18852   14
PASS
ok      6.5840/raft     370.873s

real    6m11.360s
user    5m40.683s
sys     1m17.473s