Event loop definition
aeEventLoop is the structure defining the Redis event loop, declared in ae.h:
typedef struct aeEventLoop {
    int maxfd;   /* highest file descriptor currently registered */
    int setsize; /* max number of file descriptors tracked */
    long long timeEventNextId; // id for the next aeTimeEvent; monotonically increasing
    time_t lastTime;     /* Used to detect system clock skew */
    aeFileEvent *events; /* Registered events, indexed by fd */
    aeFiredEvent *fired; /* Fired events */
    aeTimeEvent *timeEventHead; // head of the time event list
    int stop;            // set to stop the event loop
    void *apidata; /* This is used for polling API specific data */
    aeBeforeSleepProc *beforesleep; // callback run before sleeping in the poll
    aeBeforeSleepProc *aftersleep;  // callback run after the poll returns
    int flags;
} aeEventLoop;
In this definition, events (the registered file events) and fired (the ready events) are arrays indexed by fd, holding respectively the registered file events and the events that became readable/writable. timeEventHead is the head of a doubly linked list of time events, each node carrying the event's callback and scheduling information.
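A minimal usage sketch of this structure (not Redis source; sockfd and myReadHandler are hypothetical, the ae calls are the real API):

/* create a loop sized for 1024 fds, register a read event, run it */
aeEventLoop *el = aeCreateEventLoop(1024);
aeCreateFileEvent(el, sockfd, AE_READABLE, myReadHandler, NULL); /* stored in el->events[sockfd] */
aeMain(el);            /* loops until el->stop is set */
aeDeleteEventLoop(el);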
Event definitions
aeFileEvent is the file event structure; aeTimeEvent defines a time event.
aeTimeEvent nodes form a doubly linked list. The return value of timeProc is the interval in milliseconds until its next execution; when it returns -1 (AE_NOMORE), the event's id is set to -1 and the event is deleted, never to run again.
/* File event structure */
typedef struct aeFileEvent {
    int mask; /* one of AE_(READABLE|WRITABLE|BARRIER) */
    aeFileProc *rfileProc; // read event callback
    aeFileProc *wfileProc; // write event callback; called as (eventLoop, fd, fe->clientData, mask)
    void *clientData;
} aeFileEvent;
/* Time event structure */
typedef struct aeTimeEvent {
    long long id; /* time event identifier; -1 marks it for deletion */
    long when_sec; /* when to fire, seconds */
    long when_ms;  /* milliseconds */
    aeTimeProc *timeProc; // called as (eventLoop, id, te->clientData)
    aeEventFinalizerProc *finalizerProc; // run before the time event is freed, as (eventLoop, te->clientData)
    void *clientData;
    struct aeTimeEvent *prev;
    struct aeTimeEvent *next;
    int refcount; /* refcount to prevent timer events from being
                   * freed in recursive time event calls */
} aeTimeEvent;
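A sketch of a time event callback (hypothetical myTimerProc; the aeTimeProc signature and AE_NOMORE are the real API):

/* fires every 100 ms until the counter runs out, then asks to be deleted */
static int myTimerProc(struct aeEventLoop *eventLoop, long long id, void *clientData) {
    int *remaining = clientData;
    if (--(*remaining) <= 0) return AE_NOMORE; /* -1: delete this event */
    return 100;                                /* re-fire in 100 ms */
}
/* registered with: aeCreateTimeEvent(el, 100, myTimerProc, &counter, NULL); */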
The Redis main function calls aeMain, which runs a while loop repeatedly invoking aeProcessEvents; that function handles both time events and file events.
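aeMain itself is a thin wrapper (as in Redis 6.x):

void aeMain(aeEventLoop *eventLoop) {
    eventLoop->stop = 0;
    while (!eventLoop->stop) {
        aeProcessEvents(eventLoop, AE_ALL_EVENTS|
                                   AE_CALL_BEFORE_SLEEP|
                                   AE_CALL_AFTER_SLEEP);
    }
}

aeProcessEvents, the main body of the event loop: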
int aeProcessEvents(aeEventLoop *eventLoop, int flags)
{
int processed = 0, numevents;
/* Nothing to do? return ASAP */
// neither time events nor file events requested: return immediately
if (!(flags & AE_TIME_EVENTS) && !(flags & AE_FILE_EVENTS)) return 0;
/* Note that we want call select() even if there are no
* file events to process as long as we want to process time
* events, in order to sleep until the next time event is ready
* to fire. */
// maxfd is the highest fd with a registered file event; != -1 means at least one fd is registered
// even with no fds registered we still enter here when time events are enabled
// and AE_DONT_WAIT is not set, so the poll can sleep until the next timer is due
if (eventLoop->maxfd != -1 ||
((flags & AE_TIME_EVENTS) && !(flags & AE_DONT_WAIT))) {
int j;
aeTimeEvent *shortest = NULL;
struct timeval tv, *tvp;
if (flags & AE_TIME_EVENTS && !(flags & AE_DONT_WAIT)) // time events wanted and we may block
shortest = aeSearchNearestTimer(eventLoop); // find the timer in eventLoop->timeEventHead due soonest
// compute how many ms we must wait until that timer is due
if (shortest) { // a nearest timer exists
long now_sec, now_ms;
aeGetTime(&now_sec, &now_ms); // current time, split into seconds and milliseconds
tvp = &tv;
/* How many milliseconds we need to wait for the next
* time event to fire? */
long long ms =
(shortest->when_sec - now_sec)*1000 +
shortest->when_ms - now_ms;
if (ms > 0) {
tvp->tv_sec = ms/1000;
tvp->tv_usec = (ms % 1000)*1000;
} else {
tvp->tv_sec = 0;
tvp->tv_usec = 0;
}
} else { // no timer to wait for
/* If we have to check for events but need to return
* ASAP because of AE_DONT_WAIT we need to set the timeout
* to zero */
if (flags & AE_DONT_WAIT) {
tv.tv_sec = tv.tv_usec = 0;
tvp = &tv;
} else {
/* Otherwise we can block */
tvp = NULL; /* wait forever */
}
}
if (eventLoop->flags & AE_DONT_WAIT) {
tv.tv_sec = tv.tv_usec = 0;
tvp = &tv;
}
if (eventLoop->beforesleep != NULL && flags & AE_CALL_BEFORE_SLEEP)
eventLoop->beforesleep(eventLoop);
/* Call the multiplexing API, will return only on timeout or when
* some event fires. */
// call the multiplexing API; it returns only on timeout or when an event fires
// tvp only serves as the epoll_wait timeout, so we return in time for the next timer
// with HAVE_EPOLL defined this is the epoll implementation
// returns: numevents, the number of ready events
//          eventLoop->fired, the list of ready events
numevents = aeApiPoll(eventLoop, tvp);
/* After sleep callback. */
if (eventLoop->aftersleep != NULL && flags & AE_CALL_AFTER_SLEEP)
eventLoop->aftersleep(eventLoop);
for (j = 0; j < numevents; j++) {
aeFileEvent *fe = &eventLoop->events[eventLoop->fired[j].fd];
int mask = eventLoop->fired[j].mask;
int fd = eventLoop->fired[j].fd;
int fired = 0; /* Number of events fired for current fd. */
// when an fd is both readable and writable we normally fire the read first and
// the write second, since we usually want to process the query before sending
// the reply back in the same iteration
int invert = fe->mask & AE_BARRIER; // AE_BARRIER inverts the order: write first, then read
/* Note the "fe->mask & mask & ..." code: maybe an already
 * processed event removed an element that fired and we still
 * haven't processed, so we check if the event is still valid.
*
* Fire the readable event if the call sequence is not
* inverted. */
if (!invert && fe->mask & mask & AE_READABLE) {
fe->rfileProc(eventLoop,fd,fe->clientData,mask);
fired++;
fe = &eventLoop->events[fd]; /* Refresh in case of resize. */
}
/* Fire the writable event. */
if (fe->mask & mask & AE_WRITABLE) {
if (!fired || fe->wfileProc != fe->rfileProc) {
fe->wfileProc(eventLoop,fd,fe->clientData,mask);
fired++;
}
}
/* If we have to invert the call, fire the readable event now
* after the writable one. */
// if inverted, fire the readable event now, after the writable one
if (invert) {
fe = &eventLoop->events[fd]; /* Refresh in case of resize. */
if ((fe->mask & mask & AE_READABLE) &&
(!fired || fe->wfileProc != fe->rfileProc))
{
fe->rfileProc(eventLoop,fd,fe->clientData,mask);
fired++;
}
}
processed++;
}
}
/* Check time events */
if (flags & AE_TIME_EVENTS)
processed += processTimeEvents(eventLoop); // process time events
return processed; /* return the number of processed file/time events */
}
Multiplexing API
Redis provides several multiplexing implementations (epoll/select/kqueue/evport); the backend is selected in ae.c via compile-time macros:
#ifdef HAVE_EVPORT
#include "ae_evport.c"
#else
#ifdef HAVE_EPOLL
#include "ae_epoll.c"
#else
#ifdef HAVE_KQUEUE
#include "ae_kqueue.c"
#else
#include "ae_select.c"
#endif
#endif
#endif
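Every backend implements the same set of static functions, so ae.c is backend-agnostic (signatures as in ae_epoll.c):

static int aeApiCreate(aeEventLoop *eventLoop);
static int aeApiResize(aeEventLoop *eventLoop, int setsize);
static void aeApiFree(aeEventLoop *eventLoop);
static int aeApiAddEvent(aeEventLoop *eventLoop, int fd, int mask);
static void aeApiDelEvent(aeEventLoop *eventLoop, int fd, int delmask);
static int aeApiPoll(aeEventLoop *eventLoop, struct timeval *tvp);
static char *aeApiName(void);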
We will only look at the epoll backend here.
epoll
// multiplexing API state, the data exchanged with ae
// struct epoll_event {
//     __uint32_t events;   /* Epoll events */
//     epoll_data_t data;   /* User data variable */
// };
typedef struct aeApiState {
    int epfd; // the epoll instance's file descriptor
    struct epoll_event *events; // array the kernel copies ready events into
} aeApiState;
Adding an event
// register a file event given an fd and a mask
static int aeApiAddEvent(aeEventLoop *eventLoop, int fd, int mask) {
aeApiState *state = eventLoop->apidata;
struct epoll_event ee = {0}; /* avoid valgrind warning */
/* If the fd was already monitored for some event, we need a MOD
* operation. Otherwise we need an ADD operation. */
int op = eventLoop->events[fd].mask == AE_NONE ?
EPOLL_CTL_ADD : EPOLL_CTL_MOD; // MOD if the fd is already monitored, ADD otherwise
ee.events = 0;
mask |= eventLoop->events[fd].mask; /* Merge old events*/
if (mask & AE_READABLE) ee.events |= EPOLLIN;
if (mask & AE_WRITABLE) ee.events |= EPOLLOUT;
ee.data.fd = fd;
if (epoll_ctl(state->epfd,op,fd,&ee) == -1) return -1;
return 0;
}
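To illustrate the mask merge (hypothetical fd and handlers; aeCreateFileEvent is the real entry point that ends up in aeApiAddEvent):

aeCreateFileEvent(el, fd, AE_READABLE, readHandler, NULL);  /* EPOLL_CTL_ADD, EPOLLIN */
aeCreateFileEvent(el, fd, AE_WRITABLE, writeHandler, NULL); /* EPOLL_CTL_MOD, EPOLLIN|EPOLLOUT */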
The poll entry point of the multiplexing API:
// calls the multiplexing API; returns only on timeout or when an event fires
// tvp is used solely as the epoll_wait timeout
// this implementation is compiled when HAVE_EPOLL is defined
// returns: the number of ready events
//          eventLoop->fired holds the ready events
static int aeApiPoll(aeEventLoop *eventLoop, struct timeval *tvp) {
aeApiState *state = eventLoop->apidata;
int retval, numevents = 0;
// state->epfd: the epoll instance fd
// state->events: preallocated epoll_event array; the kernel only copies ready
//   events into it, it does not allocate user-space memory for us
// eventLoop->setsize: the maximum number of file descriptors (events) tracked
// last argument: maximum wait time in ms when no event occurs (-1 blocks forever)
// returns the number of ready events; 0 means timeout, -1 means an error
//   occurred and errno must be checked
retval = epoll_wait(state->epfd,state->events,eventLoop->setsize,
tvp ? (tvp->tv_sec*1000 + tvp->tv_usec/1000) : -1);
if (retval > 0) {
int j;
numevents = retval;
for (j = 0; j < numevents; j++) {
int mask = 0;
struct epoll_event *e = state->events+j; // j-th element of state->events
// map epoll's events bits (a uint32_t) back onto the ae mask
if (e->events & EPOLLIN) mask |= AE_READABLE; // fd readable
if (e->events & EPOLLOUT) mask |= AE_WRITABLE; // fd writable
if (e->events & EPOLLERR) mask |= AE_WRITABLE|AE_READABLE; // error on the fd
if (e->events & EPOLLHUP) mask |= AE_WRITABLE|AE_READABLE; // fd hang-up
eventLoop->fired[j].fd = e->data.fd;
eventLoop->fired[j].mask = mask;
}
}
return numevents;
}
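The symmetric removal path, aeApiDelEvent from the same file (Redis 6.x), downgrades to EPOLL_CTL_MOD while some mask bits remain and to EPOLL_CTL_DEL once none do:

static void aeApiDelEvent(aeEventLoop *eventLoop, int fd, int delmask) {
    aeApiState *state = eventLoop->apidata;
    struct epoll_event ee = {0};
    int mask = eventLoop->events[fd].mask & (~delmask); // mask bits that remain

    ee.events = 0;
    if (mask & AE_READABLE) ee.events |= EPOLLIN;
    if (mask & AE_WRITABLE) ee.events |= EPOLLOUT;
    ee.data.fd = fd;
    if (mask != AE_NONE) {
        epoll_ctl(state->epfd,EPOLL_CTL_MOD,fd,&ee);
    } else {
        /* kernels < 2.6.9 require a non-NULL event pointer even for EPOLL_CTL_DEL */
        epoll_ctl(state->epfd,EPOLL_CTL_DEL,fd,&ee);
    }
}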
Event examples
Time events
serverCron is Redis's main time event handler; client timeout handling, statistics updates and other periodic work are implemented inside it. It is registered with:
aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL)
serverCron returns 1000/server.hz (in ms), the interval until its next execution; server.hz is either a fixed value or, with dynamic hz, adjusted according to the number of clients.
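The tail of serverCron shows the rescheduling return value (Redis 6.x, abridged):

/* end of serverCron */
server.cronloops++;
return 1000/server.hz; /* ms until the next invocation */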
File events
For file events, let's trace the handling of a TCP request.
Initialization:
At server startup, initServer creates file events for the listening fds listenFd0~listenFdn, registering the read callback acceptTcpHandler for each read event, and then starts the event loop.
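The registration in initServer (Redis 6.x, abridged):

/* register acceptTcpHandler for every listening socket */
for (j = 0; j < server.ipfd_count; j++) {
    if (aeCreateFileEvent(server.el, server.ipfd[j], AE_READABLE,
        acceptTcpHandler,NULL) == AE_ERR)
    {
        serverPanic("Unrecoverable error creating server.ipfd file event.");
    }
}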
The event loop steps for handling a request:
- the event loop runs
- beforeSleep calls handleClientsWithPendingWritesUsingThreads, which scans the server.clients_pending_write queue and installs, for each client, a write event with the callback sendReplyToClient
- the event loop polls, waiting for events (this can block until the timeout, hence the "sleep" terminology)
- when events fire or the timeout expires, the fired events are processed; the relevant cases are:
  - a read event on one of listenFd0~listenFdn fires acceptTcpHandler, which accepts the connection to obtain connectFd and registers a read event with the callback readQueryFromClient
  - a file event on connectFd fires the read callback readQueryFromClient, which reads the data and processes the request; once processed, the client is pushed onto the server.clients_pending_write queue
  - a write event fires the callback sendReplyToClient
The corresponding code:
Handling client write events: walk the server.clients_pending_write queue and, for each client's connection:
- register a write event with the event loop
- register the sendReplyToClient callback, with mask AE_WRITABLE
int handleClientsWithPendingWritesUsingThreads(void) {
int processed = listLength(server.clients_pending_write);
if (processed == 0) return 0; /* Return ASAP if there are no clients. */
/* If I/O threads are disabled or we have few clients to serve, don't
 * use I/O threads, but the boring synchronous code. */
// with I/O threads disabled, or only a few clients, fall back to the synchronous path
if (server.io_threads_num == 1 || stopThreadedIOIfNeeded()) {
return handleClientsWithPendingWrites();
}
/* Start threads if needed. */
if (!server.io_threads_active) startThreadedIO(); // start the I/O threads
if (tio_debug) printf("%d TOTAL WRITE pending clients\n", processed);
// threaded I/O path: split clients_pending_write into several lists, each handled by a different thread
/* Distribute the clients across N different lists. */
listIter li;
listNode *ln;
listRewind(server.clients_pending_write,&li);
int item_id = 0;
while((ln = listNext(&li))) {
client *c = listNodeValue(ln);
c->flags &= ~CLIENT_PENDING_WRITE;
/* Remove clients from the list of pending writes since
* they are going to be closed ASAP. */
if (c->flags & CLIENT_CLOSE_ASAP) {
listDelNode(server.clients_pending_write, ln); // remove the node from server.clients_pending_write
continue;
}
int target_id = item_id % server.io_threads_num;
listAddNodeTail(io_threads_list[target_id],c);
item_id++;
}
/* Give the start condition to the waiting threads, by setting the
* start condition atomic var. */
io_threads_op = IO_THREADS_OP_WRITE;
for (int j = 1; j < server.io_threads_num; j++) {
int count = listLength(io_threads_list[j]);
io_threads_pending[j] = count;
}
/* Also use the main thread to process a slice of clients. */
listRewind(io_threads_list[0],&li);
while((ln = listNext(&li))) {
client *c = listNodeValue(ln);
writeToClient(c,0);
}
listEmpty(io_threads_list[0]);
// busy-wait until all I/O threads have finished their slice
/* Wait for all the other threads to end their work. */
while(1) {
unsigned long pending = 0;
for (int j = 1; j < server.io_threads_num; j++)
pending += io_threads_pending[j];
if (pending == 0) break;
}
if (tio_debug) printf("I/O WRITE All threads finshed\n");
/* Run the list of clients again to install the write handler where
* needed. */
// walk server.clients_pending_write once more: install the write event for any
// client that still has pending replies (the threads may not have flushed everything)
listRewind(server.clients_pending_write,&li);
while((ln = listNext(&li))) {
client *c = listNodeValue(ln);
/* Install the write handler if there are pending writes in some
* of the clients. */
if (clientHasPendingReplies(c) && // i.e. c->bufpos || listLength(c->reply)
connSetWriteHandler(c->conn, sendReplyToClient) == AE_ERR) // register the write event; sendReplyToClient flushes the reply buffers to the client
{
freeClientAsync(c);
}
}
listEmpty(server.clients_pending_write);
/* Update processed count on server */
server.stat_io_writes_processed += processed;
return processed;
}
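For reference, sendReplyToClient itself is a thin wrapper (Redis 6.x):

void sendReplyToClient(connection *conn) {
    client *c = connGetPrivateData(conn);
    writeToClient(c,1); /* 1 = handler_installed: the write handler may be removed once the buffers drain */
}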
Listen read event / connection establishment / connectFd read event registration. acceptTcpHandler is the read callback registered in main for the listening sockets:
void acceptTcpHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
int cport, cfd, max = MAX_ACCEPTS_PER_CALL;
char cip[NET_IP_STR_LEN];
UNUSED(el);
UNUSED(mask);
UNUSED(privdata);
while(max--) { // handle at most MAX_ACCEPTS_PER_CALL (1000) connections per call
// fd:    the listening file descriptor
// cip:   out parameter, the client IP
// cport: out parameter, the peer port
// returns: cfd, the connected socket (connectFd)
// accepts one connection and fetches the client's ip/port; connectFd is used to
// talk to the client while listenFd keeps listening
cfd = anetTcpAccept(server.neterr, fd, cip, sizeof(cip), &cport);
if (cfd == ANET_ERR) {
if (errno != EWOULDBLOCK)
serverLog(LL_WARNING,
"Accepting client connection: %s", server.neterr);
return;
}
serverLog(LL_VERBOSE,"Accepted %s:%d", cip, cport);
// connCreateAcceptedSocket: create a connection, bind the ConnectionType and fd,
//   set the state to CONN_STATE_ACCEPTING; returns the new connection
// acceptCommonHandler: create the client and register the read callback
acceptCommonHandler(connCreateAcceptedSocket(cfd),0,cip);
}
}
Creating the client and registering the connection's read event and read callback:
static void acceptCommonHandler(connection *conn, int flags, char *ip) {
......
if ((c = createClient(conn)) == NULL) {
......
return;
}
......
}
createClient implementation
// create the client and set the socket state of the current connection
// register the read callback readQueryFromClient, creating the associated file read event
client *createClient(connection *conn) {
client *c = zmalloc(sizeof(client));
if (conn) {
connNonBlock(conn);
connEnableTcpNoDelay(conn);
if (server.tcpkeepalive)
connKeepAlive(conn,server.tcpkeepalive);
connSetReadHandler(conn, readQueryFromClient); // register the read callback, creating the connection's read event
connSetPrivateData(conn, c);
}
selectDb(c,0); // c->db = &server.db[0]
uint64_t client_id = ++server.next_client_id;
c->id = client_id;
......
if (conn) linkClient(c);
initClientMultiState(c);
return c;
}
The connection read callback readQueryFromClient reads from the socket, parses and executes the request, and pushes the result onto the server.clients_pending_write queue:
void readQueryFromClient(connection *conn) {
client *c = connGetPrivateData(conn);
int nread, readlen;
size_t qblen;
/* Check if we want to read from the client later when exiting from
* the event loop. This is the case if threaded I/O is enabled. */
// if the read must be postponed until after this event loop iteration (the
// threaded I/O case), push the client onto server.clients_pending_read and return
if (postponeClientRead(c)) return;
......
c->querybuf = sdsMakeRoomFor(c->querybuf, readlen);
// read data from the connection
nread = connRead(c->conn, c->querybuf+qblen, readlen);
......
// eventually calls processCommandAndResetClient(c) to execute the command
processInputBuffer(c);
}
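The postponement check mentioned above, postponeClientRead (Redis 6.x, abridged):

int postponeClientRead(client *c) {
    if (server.io_threads_active && server.io_threads_do_reads &&
        !ProcessingEventsWhileBlocked &&
        !(c->flags & (CLIENT_MASTER|CLIENT_SLAVE|CLIENT_PENDING_READ)))
    {
        c->flags |= CLIENT_PENDING_READ;
        listAddNodeHead(server.clients_pending_read,c); // drained after the poll returns
        return 1;
    } else {
        return 0;
    }
}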
Call chain: processInputBuffer -> processCommandAndResetClient(c) -> processCommand(c)
int processCommand(client *c) {
......
/* Exec the command */
if (c->flags & CLIENT_MULTI &&
c->cmd->proc != execCommand && c->cmd->proc != discardCommand &&
c->cmd->proc != multiCommand && c->cmd->proc != watchCommand)
{
queueMultiCommand(c);
addReply(c,shared.queued);
} else {
// call() invokes the concrete command handler c->cmd->proc(c)
// inside the proc, addReply and friends write the reply buffer, push c onto the
// head of server.clients_pending_write, and set the CLIENT_PENDING_WRITE flag
call(c,CMD_CALL_FULL);
c->woff = server.master_repl_offset;
if (listLength(server.ready_keys))
handleClientsBlockedOnKeys();
}
return C_OK;
}
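The queueing performed by addReply lives in clientInstallWriteHandler (Redis 6.x, abridged):

void clientInstallWriteHandler(client *c) {
    /* only queue the client once, and never for replicas that are not yet online */
    if (!(c->flags & CLIENT_PENDING_WRITE) &&
        (c->replstate == REPL_STATE_NONE ||
         (c->replstate == SLAVE_STATE_ONLINE && !c->repl_put_online_on_ack)))
    {
        c->flags |= CLIENT_PENDING_WRITE;
        listAddNodeHead(server.clients_pending_write,c);
    }
}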
The above is the single-threaded TCP request path. With threaded I/O the difference is that the read callback pushes clients onto the server.clients_pending_read queue, the write path pushes clients onto server.clients_pending_write, and multiple I/O threads run the callbacks.