从源码看redis cluster

504 阅读4分钟

概述

开启redis集群模式后,各个节点之间定期互相发送ping、pong来交换信息。当发送给某个节点的ping在超时时间内没有收到pong时,该节点会被标记为pfail。当交换信息后发现认为该节点pfail的报告达到多数派时,则将其标识为fail,并发送广播。满足条件的从节点发起failover认证请求。主节点处理认证请求,并发送认证响应。当从节点收到的来自主节点的认证响应达到多数派时,则提升自己为主节点,分配slots,并进行广播。

前置代码

int main(int argc, char **argv) {
...
initServer()
...
}
void initServer(void) {
...
    if (aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL) == AE_ERR) {
        serverPanic("Can't create event loop timers.");
        exit(1);
    }
...
}
int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
...    
run_with_period(100) {
        if (server.cluster_enabled) clusterCron();
    }
...
}

void clusterCron(void) {
...
每迭代10次,我们就随机选一个节点发送ping,也就是我们通常每秒随机给一个节点发送ping
if (!(iteration % 10)) {
        int j;
         随机挑选5个节点,给其中收到pong消息到现在最久的节点发送ping


        if (node->link == NULL) {
...
      aeCreateFileEvent(server.el,link->fd,AE_READABLE,
                    clusterReadHandler,link);
...
}
        for (j = 0; j < 5; j++) {
            de = dictGetRandomKey(server.cluster->nodes);
            clusterNode *this = dictGetVal(de);
            不要挑选断开连接的或者存在当前活跃ping的节点
      
            if (this->link == NULL || this->ping_sent != 0) continue;
            if (this->flags & (CLUSTER_NODE_MYSELF|CLUSTER_NODE_HANDSHAKE))
                continue;
            if (min_pong_node == NULL || min_pong > this->pong_received) {
                min_pong_node = this;
                min_pong = this->pong_received;
            }
        }
        if (min_pong_node) {
发送ping
            clusterSendPing(min_pong_node->link, CLUSTERMSG_TYPE_PING);
        }
    }



    while((de = dictNext(di)) != NULL) {
如果收到pong 则 node->ping_sent为0
        delay = now - node->ping_sent;
超时标记为pfail
        if (delay > server.cluster_node_timeout) {
      如果不是fail或者pfail状态
            if (!(node->flags & (CLUSTER_NODE_PFAIL|CLUSTER_NODE_FAIL))) {
                node->flags |= CLUSTER_NODE_PFAIL;
                update_state = 1;
            }
        }
}
如果自己是从节点
   if (nodeIsSlave(myself)) {
        处理手动failover,满足条件则设置标识
        clusterHandleManualFailover();
        if (!(server.cluster_module_flags & CLUSTER_MODULE_FLAG_NO_FAILOVER))
处理failover
            clusterHandleSlaveFailover();
     
        if (orphaned_masters && max_slaves >= 2 && this_slaves == max_slaves)
            clusterHandleSlaveMigration(max_slaves);
    }
...
}

/* Manual failover progress check. clusterCron() calls this when the local
 * node is a slave. Once the local replication offset equals the recorded
 * master offset (mf_master_offset), mf_can_start is set to 1.
 *
 * Takes no arguments and returns nothing; it only reads and writes fields of
 * server.cluster. */
void clusterHandleManualFailover(void) {
    /* mf_end == 0: nothing to do (presumably no manual failover is in
     * progress -- TODO confirm against the code that sets mf_end). */
    if (server.cluster->mf_end == 0) return;

    /* The start flag is already set, so there is nothing left to check. */
    if (server.cluster->mf_can_start) return;

    if (server.cluster->mf_master_offset == 0) return; /* Wait for offset... */

    if (server.cluster->mf_master_offset == replicationGetSlaveOffset()) {
        /* Our replication offset matches the recorded master offset:
         * flag the manual failover as ready to start. */
        server.cluster->mf_can_start = 1;
    }
}

gossip相关

void clusterSendPing(clusterLink *link, int type) {
...
把pfail状态的节点添加到发送信息中
 if (pfail_wanted) {
        dictIterator *di;
        dictEntry *de;

        di = dictGetSafeIterator(server.cluster->nodes);
        while((de = dictNext(di)) != NULL && pfail_wanted > 0) {
            clusterNode *node = dictGetVal(de);
            if (node->flags & CLUSTER_NODE_HANDSHAKE) continue;
            if (node->flags & CLUSTER_NODE_NOADDR) continue;
            if (!(node->flags & CLUSTER_NODE_PFAIL)) continue;
            clusterSetGossipEntry(hdr,gossipcount,node);

    }
...
    clusterSendMessage(link,buf,totlen);


}

void clusterReadHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
...
if (rcvbuflen >= 8 && rcvbuflen == ntohl(hdr->totlen)) {
            if (clusterProcessPacket(link)) {
...

}
int clusterProcessPacket(clusterLink *link) {

...
    if (type == CLUSTERMSG_TYPE_PING || type == CLUSTERMSG_TYPE_PONG ||
        type == CLUSTERMSG_TYPE_MEET)
    {
...
        if (sender) clusterProcessGossipSection(hdr,link);

...
}
...
    } else if (type == CLUSTERMSG_TYPE_FAILOVER_AUTH_REQUEST) {
if (!sender) return 1;  /* We don't know that node. */
        clusterSendFailoverAuthIfNeeded(sender,hdr);
...
} else if (type == CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK) {
        if (!sender) return 1;  /* We don't know that node. */
        /* We consider this vote only if the sender is a master serving
         * a non zero number of slots, and its currentEpoch is greater or
         * equal to epoch where this node started the election. */
        if (nodeIsMaster(sender) && sender->numslots > 0 &&
            senderCurrentEpoch >= server.cluster->failover_auth_epoch)
        {
            server.cluster->failover_auth_count++;
            /* Maybe we reached a quorum here, set a flag to make sure
             * we check ASAP. */
            clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_FAILOVER);
        }
    } else if (type == CLUSTERMSG_TYPE_FAIL) {
...
       failing = clusterLookupNode(hdr->data.fail.about.nodename);
            if (failing &&
                !(failing->flags & (CLUSTER_NODE_FAIL|CLUSTER_NODE_MYSELF)))
            {
...
}
...
}

/* Accumulate deferred cluster work: the bits in 'flags' (callers pass
 * CLUSTER_TODO_* values) are OR-ed into server.cluster->todo_before_sleep.
 * Bits that are already set stay set. */
void clusterDoBeforeSleep(int flags) {
    server.cluster->todo_before_sleep =
        server.cluster->todo_before_sleep | flags;
}

void clusterHandleSlaveFailover(void) {

...
继续执行failover需要同时满足的条件(任意一条不满足则直接返回)
1)自身是从节点,并且有主节点
2)主节点是fail状态,或者是手动failover
3)没有no failover配置,或者是手动failover
4)主节点服务了slots
 if (nodeIsMaster(myself) ||
        myself->slaveof == NULL ||
        (!nodeFailed(myself->slaveof) && !manual_failover) ||
        (server.cluster_slave_no_failover && !manual_failover) ||
        myself->slaveof->numslots == 0)
    {
        /* There are no reasons to failover, so we set the reason why we
         * are returning without failing over to NONE. */
        server.cluster->cant_failover_reason = CLUSTER_CANT_FAILOVER_NONE;
        return;
    }
...

  检查我们的数据是否够新
    if (server.cluster_slave_validity_factor &&
        data_age >
        (((mstime_t)server.repl_ping_slave_period * 1000) +
         (server.cluster_node_timeout * server.cluster_slave_validity_factor)))
    {
        if (!manual_failover) {
            clusterLogCantFailover(CLUSTER_CANT_FAILOVER_DATA_AGE);
            return;
        }
    }

...

...
 /* If the previous failover attempt timedout and the retry time has
     * elapsed, we can setup a new one. */
    if (auth_age > auth_retry_time) {
...
发送failover认证的请求
if (server.cluster->failover_auth_sent == 0) {
        server.cluster->currentEpoch++;
        server.cluster->failover_auth_epoch = server.cluster->currentEpoch;
  
        clusterRequestFailoverAuth();
        server.cluster->failover_auth_sent = 1;
        clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG|
                             CLUSTER_TODO_UPDATE_STATE|
                             CLUSTER_TODO_FSYNC_CONFIG);
        return; /* Wait for replies. */
    }

检查认证是否达到多数派
    if (server.cluster->failover_auth_count >= needed_quorum) {
...
        clusterFailoverReplaceYourMaster();
...
}
...
}

void clusterRequestFailoverAuth(void) {
    unsigned char buf[sizeof(clusterMsg)];
    clusterMsg *hdr = (clusterMsg*) buf;
    uint32_t totlen;

    clusterBuildMessageHdr(hdr,CLUSTERMSG_TYPE_FAILOVER_AUTH_REQUEST);
  
    if (server.cluster->mf_end) hdr->mflags[0] |= CLUSTERMSG_FLAG0_FORCEACK;
    totlen = sizeof(clusterMsg)-sizeof(union clusterMsgData);
    hdr->totlen = htonl(totlen);
    clusterBroadcastMessage(buf,totlen);
}

void clusterFailoverReplaceYourMaster(void) {

...
把自己提升为master
    clusterSetNodeAsMaster(myself);
    replicationUnsetMaster();
...
发送广播
    clusterBroadcastPong(CLUSTER_BROADCAST_ALL);

}

void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) {
...
如果自身是从节点或者没有服务slots则不处理
    if (nodeIsSlave(myself) || myself->numslots == 0) return;
...
如果此节点是主节点,或者此节点没有主节点,或者他的主节点不是fail状态并且不是强制ack(force_ack)
if (nodeIsMaster(node) || master == NULL ||
        (!nodeFailed(master) && !force_ack))
    {
}
...
发送failover认证响应
    clusterSendFailoverAuth(node);

}

/* Send a CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK message to 'node' over its link.
 * Only the message header is sent: the length excludes the clusterMsgData
 * union.
 *
 * node: the destination node. If node->link is NULL the function returns
 *       without building or sending anything. */
void clusterSendFailoverAuth(clusterNode *node) {
    /* Back the message with a real clusterMsg object instead of an
     * 'unsigned char' array: a byte array only guarantees alignment 1, so
     * accessing it through a clusterMsg pointer could be misaligned. */
    clusterMsg msg;
    clusterMsg *hdr = &msg;
    uint32_t totlen;

    if (!node->link) return;
    clusterBuildMessageHdr(hdr,CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK);

    /* Header-only message: leave out the data union. */
    totlen = sizeof(clusterMsg)-sizeof(union clusterMsgData);
    hdr->totlen = htonl(totlen);
    clusterSendMessage(node->link,(unsigned char*)hdr,totlen);
}


void clusterProcessGossipSection(clusterMsg *hdr, clusterLink *link) {
node = clusterLookupNode(g->nodename);
        if (node) {
            如果发送者是主节点,并且gossip中提到的节点不是自身
            if (sender && nodeIsMaster(sender) && node != myself) {
如果是fail或者pfail状态
                if (flags & (CLUSTER_NODE_FAIL|CLUSTER_NODE_PFAIL)) {
                   检查是否需要标识此节点为fail
                    markNodeAsFailingIfNeeded(node);
                } else {
                    if (clusterNodeDelFailureReport(node,sender)) {
                        serverLog(LL_VERBOSE,
                            "Node %.40s reported node %.40s is back online.",
                            sender->name, node->name);
                    }
                }
            }
}

void markNodeAsFailingIfNeeded(clusterNode *node) {
如果我们自身能够到达这个节点(没有超时),则直接返回
    if (!nodeTimedOut(node)) return;
如果这个节点已经是fail状态,则直接返回
    if (nodeFailed(node)) return;
...
只有自身是master时,才把自己的一票计入failures
    if (nodeIsMaster(myself)) failures++;
无论自身是主节点还是从节点,都会判断failures是否达到多数派,未达到则直接返回
    if (failures < needed_quorum) return; 
...
如果自身是主节点,则发送此节点fail的广播
    if (nodeIsMaster(myself)) clusterSendFail(node->name);
...
}

void clusterSendFail(char *nodename) {
    unsigned char buf[sizeof(clusterMsg)];
    clusterMsg *hdr = (clusterMsg*) buf;

    clusterBuildMessageHdr(hdr,CLUSTERMSG_TYPE_FAIL);
    memcpy(hdr->data.fail.about.nodename,nodename,CLUSTER_NAMELEN);
    clusterBroadcastMessage(buf,ntohl(hdr->totlen));
}