概述
开启 Redis 集群模式后,各节点之间会定期通过 ping/pong 消息交换信息。当发给某个节点的 ping 在超时时间内没有收到 pong 时,该节点会被标记为 pfail;当交换信息后发现对该节点的 pfail/fail 报告达到多数派时,则将其标记为 fail,并广播 fail 消息。随后,满足条件的从节点发起 failover 认证请求,主节点处理认证请求并发送认证响应;当从节点收到的来自主节点的认证响应达到多数派时,它把自己提升为主节点,接管 slots,并进行广播。
前置代码
int main(int argc, char **argv) {
...
initServer()
...
}
void initServer(void) {
...
if (aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL) == AE_ERR) {
serverPanic("Can't create event loop timers.");
exit(1);
}
...
}
int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
...
run_with_period(100) {
if (server.cluster_enabled) clusterCron();
}
...
}
void clusterCron(void) {
...
每迭代10次,我们就随机选一个节点发送ping,也就是我们通常每秒随机给一个节点发送ping
if (!(iteration % 10)) {
int j;
随机抽取5次节点,从中挑选最早收到pong(即距今最久没有收到pong)的节点发送ping
if (node->link == NULL) {
...
aeCreateFileEvent(server.el,link->fd,AE_READABLE,
clusterReadHandler,link);
...
}
for (j = 0; j < 5; j++) {
de = dictGetRandomKey(server.cluster->nodes);
clusterNode *this = dictGetVal(de);
不要挑选断开连接的或者存在当前活跃ping的节点
if (this->link == NULL || this->ping_sent != 0) continue;
if (this->flags & (CLUSTER_NODE_MYSELF|CLUSTER_NODE_HANDSHAKE))
continue;
if (min_pong_node == NULL || min_pong > this->pong_received) {
min_pong_node = this;
min_pong = this->pong_received;
}
}
if (min_pong_node) {
发送ping
clusterSendPing(min_pong_node->link, CLUSTERMSG_TYPE_PING);
}
}
while((de = dictNext(di)) != NULL) {
如果收到pong 则 node->ping_sent为0
delay = now - node->ping_sent;
超时标记为pfail
if (delay > server.cluster_node_timeout) {
如果不是fail或者pfail状态
if (!(node->flags & (CLUSTER_NODE_PFAIL|CLUSTER_NODE_FAIL))) {
node->flags |= CLUSTER_NODE_PFAIL;
update_state = 1;
}
}
}
如果自己是从节点
if (nodeIsSlave(myself)) {
处理手动failover,满足条件则设置标识
clusterHandleManualFailover();
if (!(server.cluster_module_flags & CLUSTER_MODULE_FLAG_NO_FAILOVER))
处理failover
clusterHandleSlaveFailover();
if (orphaned_masters && max_slaves >= 2 && this_slaves == max_slaves)
clusterHandleSlaveMigration(max_slaves);
}
...
}
/* Decide whether a pending manual failover is allowed to start.
 *
 * The function returns without doing anything when:
 *   - server.cluster->mf_end is 0 (presumably: no manual failover is in
 *     progress -- confirm against the definition of mf_end);
 *   - server.cluster->mf_can_start is already non-zero;
 *   - server.cluster->mf_master_offset is still 0 (offset not known yet).
 *
 * Otherwise, if the value returned by replicationGetSlaveOffset() equals
 * mf_master_offset, mf_can_start is set to 1. */
void clusterHandleManualFailover(void) {
    if (server.cluster->mf_end == 0) return;
    if (server.cluster->mf_can_start) return;
    if (server.cluster->mf_master_offset == 0) return; /* Wait for offset... */

    if (server.cluster->mf_master_offset == replicationGetSlaveOffset()) {
        server.cluster->mf_can_start = 1;
    }
}
gossip相关
void clusterSendPing(clusterLink *link, int type) {
...
把pfail状态的节点添加到要发送的gossip信息中
if (pfail_wanted) {
dictIterator *di;
dictEntry *de;
di = dictGetSafeIterator(server.cluster->nodes);
while((de = dictNext(di)) != NULL && pfail_wanted > 0) {
clusterNode *node = dictGetVal(de);
if (node->flags & CLUSTER_NODE_HANDSHAKE) continue;
if (node->flags & CLUSTER_NODE_NOADDR) continue;
if (!(node->flags & CLUSTER_NODE_PFAIL)) continue;
clusterSetGossipEntry(hdr,gossipcount,node);
}
...
clusterSendMessage(link,buf,totlen);
}
void clusterReadHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
...
if (rcvbuflen >= 8 && rcvbuflen == ntohl(hdr->totlen)) {
if (clusterProcessPacket(link)) {
...
}
int clusterProcessPacket(clusterLink *link) {
...
if (type == CLUSTERMSG_TYPE_PING || type == CLUSTERMSG_TYPE_PONG ||
type == CLUSTERMSG_TYPE_MEET)
{
...
if (sender) clusterProcessGossipSection(hdr,link);
...
}
...
} else if (type == CLUSTERMSG_TYPE_FAILOVER_AUTH_REQUEST) {
if (!sender) return 1; /* We don't know that node. */
clusterSendFailoverAuthIfNeeded(sender,hdr);
...
} else if (type == CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK) {
if (!sender) return 1; /* We don't know that node. */
/* We consider this vote only if the sender is a master serving
* a non zero number of slots, and its currentEpoch is greater or
* equal to epoch where this node started the election. */
if (nodeIsMaster(sender) && sender->numslots > 0 &&
senderCurrentEpoch >= server.cluster->failover_auth_epoch)
{
server.cluster->failover_auth_count++;
/* Maybe we reached a quorum here, set a flag to make sure
* we check ASAP. */
clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_FAILOVER);
}
} else if (type == CLUSTERMSG_TYPE_FAIL) {
...
failing = clusterLookupNode(hdr->data.fail.about.nodename);
if (failing &&
!(failing->flags & (CLUSTER_NODE_FAIL|CLUSTER_NODE_MYSELF)))
{
...
}
...
}
/* Merge the bits in `flags` into server.cluster->todo_before_sleep.
 * Bits that are already set stay set; nothing is ever cleared here. */
void clusterDoBeforeSleep(int flags) {
    server.cluster->todo_before_sleep =
        server.cluster->todo_before_sleep | flags;
}
void clusterHandleSlaveFailover(void) {
...
条件(必须全部满足才会继续failover,否则直接返回)
1)自身是从节点,并且有主节点
2)主节点是fail状态,或者是手动failover
3)没有配置no failover,或者是手动failover
4)主节点负责了slots
if (nodeIsMaster(myself) ||
myself->slaveof == NULL ||
(!nodeFailed(myself->slaveof) && !manual_failover) ||
(server.cluster_slave_no_failover && !manual_failover) ||
myself->slaveof->numslots == 0)
{
/* There are no reasons to failover, so we set the reason why we
* are returning without failing over to NONE. */
server.cluster->cant_failover_reason = CLUSTER_CANT_FAILOVER_NONE;
return;
}
...
检查我们的数据是否够新
if (server.cluster_slave_validity_factor &&
data_age >
(((mstime_t)server.repl_ping_slave_period * 1000) +
(server.cluster_node_timeout * server.cluster_slave_validity_factor)))
{
if (!manual_failover) {
clusterLogCantFailover(CLUSTER_CANT_FAILOVER_DATA_AGE);
return;
}
}
...
...
/* If the previous failover attempt timedout and the retry time has
* elapsed, we can setup a new one. */
if (auth_age > auth_retry_time) {
...
发送failover认证的请求
if (server.cluster->failover_auth_sent == 0) {
server.cluster->currentEpoch++;
server.cluster->failover_auth_epoch = server.cluster->currentEpoch;
clusterRequestFailoverAuth();
server.cluster->failover_auth_sent = 1;
clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG|
CLUSTER_TODO_UPDATE_STATE|
CLUSTER_TODO_FSYNC_CONFIG);
return; /* Wait for replies. */
}
检查认证是否达到多数派
if (server.cluster->failover_auth_count >= needed_quorum) {
...
clusterFailoverReplaceYourMaster();
...
}
...
}
void clusterRequestFailoverAuth(void) {
unsigned char buf[sizeof(clusterMsg)];
clusterMsg *hdr = (clusterMsg*) buf;
uint32_t totlen;
clusterBuildMessageHdr(hdr,CLUSTERMSG_TYPE_FAILOVER_AUTH_REQUEST);
if (server.cluster->mf_end) hdr->mflags[0] |= CLUSTERMSG_FLAG0_FORCEACK;
totlen = sizeof(clusterMsg)-sizeof(union clusterMsgData);
hdr->totlen = htonl(totlen);
clusterBroadcastMessage(buf,totlen);
}
void clusterFailoverReplaceYourMaster(void) {
...
把自己提升为master
clusterSetNodeAsMaster(myself);
replicationUnsetMaster();
...
发送广播
clusterBroadcastPong(CLUSTER_BROADCAST_ALL);
}
void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) {
...
如果自身是从节点或者没有服务slots则不处理
if (nodeIsSlave(myself) || myself->numslots == 0) return;
...
如果请求节点是主节点,或者它没有主节点,或者它的主节点不是fail状态且请求不是强制确认(force_ack),则不投票(此处省略了日志输出和return)
if (nodeIsMaster(node) || master == NULL ||
(!nodeFailed(master) && !force_ack))
{
}
...
发送failover认证响应
clusterSendFailoverAuth(node);
}
/* Send a CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK message to `node`.
 *
 * Nothing is sent when node->link is NULL. Only the header is sent: the
 * length is sizeof(clusterMsg) minus sizeof(union clusterMsgData), stored
 * in the header in network byte order and passed to clusterSendMessage(). */
void clusterSendFailoverAuth(clusterNode *node) {
    /* Back the message with a union so the storage is suitably aligned for
     * clusterMsg. A plain unsigned char array has no such guarantee, and
     * writing to it through a clusterMsg pointer may be a misaligned
     * access. The callee still receives an unsigned char pointer. */
    union {
        clusterMsg msg;
        unsigned char bytes[sizeof(clusterMsg)];
    } buf;
    clusterMsg *hdr = &buf.msg;
    uint32_t totlen;

    if (!node->link) return;

    clusterBuildMessageHdr(hdr,CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK);

    /* Header only: the data union is not part of this message. */
    totlen = sizeof(clusterMsg)-sizeof(union clusterMsgData);
    hdr->totlen = htonl(totlen);
    clusterSendMessage(node->link,buf.bytes,totlen);
}
void clusterProcessGossipSection(clusterMsg *hdr, clusterLink *link) {
node = clusterLookupNode(g->nodename);
if (node) {
如果发送者是主节点,并且gossip中描述的节点不是自身
if (sender && nodeIsMaster(sender) && node != myself) {
如果是fail或者pfail状态
if (flags & (CLUSTER_NODE_FAIL|CLUSTER_NODE_PFAIL)) {
检查是否需要标识此节点为fail
markNodeAsFailingIfNeeded(node);
} else {
if (clusterNodeDelFailureReport(node,sender)) {
serverLog(LL_VERBOSE,
"Node %.40s reported node %.40s is back online.",
sender->name, node->name);
}
}
}
}
void markNodeAsFailingIfNeeded(clusterNode *node) {
检查我们自身是否可达这个节点
if (!nodeTimedOut(node)) return;
检查这个节点已经是fail状态
if (nodeFailed(node)) return;
...
自身是master才会进行增加
if (nodeIsMaster(myself)) failures++;
无论自身是主节点还是从节点,都会判断故障报告数(failures)是否达到多数派,未达到则直接返回
if (failures < needed_quorum) return;
...
如果自身是主节点,则发送此节点fail的广播
if (nodeIsMaster(myself)) clusterSendFail(node->name);
...
}
void clusterSendFail(char *nodename) {
unsigned char buf[sizeof(clusterMsg)];
clusterMsg *hdr = (clusterMsg*) buf;
clusterBuildMessageHdr(hdr,CLUSTERMSG_TYPE_FAIL);
memcpy(hdr->data.fail.about.nodename,nodename,CLUSTER_NAMELEN);
clusterBroadcastMessage(buf,ntohl(hdr->totlen));
}