[redis]eventloop

eventloop definition

aeEventLoop is the structure that defines the redis event loop; it lives in ae.h:

typedef struct aeEventLoop {
    int maxfd;   /* highest file descriptor currently registered */
    int setsize; /* max number of file descriptors tracked */
    long long timeEventNextId;  // id for the next aeTimeEvent; monotonically increasing
    time_t lastTime;     /* Used to detect system clock skew */
    aeFileEvent *events; /* Registered events, indexed by fd */
    aeFiredEvent *fired; /* Fired events */
    aeTimeEvent *timeEventHead;  // head of the time event list
    int stop; // set to 1 to stop the event loop
    void *apidata; /* This is used for polling API specific data */
    aeBeforeSleepProc *beforesleep;  // callback run before sleeping
    aeBeforeSleepProc *aftersleep;   // callback run after waking up
    int flags;
} aeEventLoop;

In this definition, events (registered file events) and fired (ready readable/writable events) are arrays; timeEventHead is the head of a doubly linked list of time events, each node holding the timer's callback and scheduling information.
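
For completeness, the element type of the fired array is a tiny struct, also from ae.h:

/* A fired event */
typedef struct aeFiredEvent {
    int fd;   /* fd that became ready */
    int mask; /* AE_READABLE / AE_WRITABLE flags that fired */
} aeFiredEvent;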

Event definitions

aeFileEvent is the file event structure; aeTimeEvent is the time event structure.

aeTimeEvent nodes form a doubly linked list. The return value of timeProc is the interval in milliseconds until the next execution; when it returns -1 (AE_NOMORE), the event's id is set to -1 and the event is deleted so it never runs again, as the sketch after the struct definitions shows.

/* File event structure */
typedef struct aeFileEvent {
    int mask; /* one of AE_(READABLE|WRITABLE|BARRIER) */
    aeFileProc *rfileProc; // read event callback
    aeFileProc *wfileProc; // write event callback; both are called as (eventLoop, fd, fe->clientData, mask)
    void *clientData;
} aeFileEvent;


/* Time event structure */
typedef struct aeTimeEvent {
    long long id; /* time event identifier; -1 marks the event for deletion */
    long when_sec; /* when to fire: seconds */
    long when_ms; /* milliseconds */
    aeTimeProc *timeProc;  // called as (eventLoop, id, te->clientData)
    aeEventFinalizerProc *finalizerProc; // called right before the time event is freed, as (eventLoop, te->clientData)
    void *clientData;
    struct aeTimeEvent *prev;
    struct aeTimeEvent *next;
    int refcount; /* refcount to prevent timer events from being
           * freed in recursive time event calls. */
} aeTimeEvent;
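
A minimal sketch of the timeProc contract (exampleTimerProc and runs are hypothetical names; AE_NOMORE is the real -1 constant from ae.h):

/* A periodic task that reschedules itself ten times, then asks to be
 * deleted. Registered with:
 *   aeCreateTimeEvent(eventLoop, 100, exampleTimerProc, &runs, NULL); */
static int exampleTimerProc(struct aeEventLoop *eventLoop, long long id,
                            void *clientData) {
    int *runs = clientData;
    if (++(*runs) >= 10)
        return AE_NOMORE; /* -1: the event loop deletes this event */
    return 100;           /* fire again in 100 ms */
}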

The redis main function enters the event loop through aeMain, which loops calling aeProcessEvents until eventLoop->stop is set.
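
aeMain itself is just this loop (quoted from the Redis 6.0-era ae.c this post follows; older versions differ slightly):

void aeMain(aeEventLoop *eventLoop) {
    eventLoop->stop = 0;
    while (!eventLoop->stop) {
        aeProcessEvents(eventLoop, AE_ALL_EVENTS|
                                   AE_CALL_BEFORE_SLEEP|
                                   AE_CALL_AFTER_SLEEP);
    }
}

aeProcessEvents is the body of one iteration; it handles both time events and file events: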

int aeProcessEvents(aeEventLoop *eventLoop, int flags)
{
    int processed = 0, numevents;


    /* Nothing to do? return ASAP */
    // Neither time events nor file events requested: return immediately
    if (!(flags & AE_TIME_EVENTS) && !(flags & AE_FILE_EVENTS)) return 0;


    /* Note that we want call select() even if there are no
     * file events to process as long as we want to process time
     * events, in order to sleep until the next time event is ready
     * to fire. */
    // maxfd is the highest registered fd; != -1 means at least one file event exists
    // Even with no registered fds we still enter this branch when there are time
    // events and blocking is allowed (AE_DONT_WAIT not set), so that we can sleep
    // until the next timer is due
    if (eventLoop->maxfd != -1 ||
        ((flags & AE_TIME_EVENTS) && !(flags & AE_DONT_WAIT))) {
        int j;
        aeTimeEvent *shortest = NULL;
        struct timeval tv, *tvp;


        if (flags & AE_TIME_EVENTS && !(flags & AE_DONT_WAIT))  // time events enabled and blocking allowed
            shortest = aeSearchNearestTimer(eventLoop); // find the timer in eventLoop->timeEventHead that fires soonest
        // compute how many ms to wait until that time event fires
        if (shortest) { // such a timer exists
            long now_sec, now_ms;


            aeGetTime(&now_sec, &now_ms); // current time, split into seconds and milliseconds
            tvp = &tv;


            /* How many milliseconds we need to wait for the next
             * time event to fire? */
            long long ms =
                (shortest->when_sec - now_sec)*1000 +
                shortest->when_ms - now_ms;


            if (ms > 0) {
                tvp->tv_sec = ms/1000;
                tvp->tv_usec = (ms % 1000)*1000;
            } else {
                tvp->tv_sec = 0;
                tvp->tv_usec = 0;
            }
        } else { // eventLoop->timeEventHead is empty
            /* If we have to check for events but need to return
             * ASAP because of AE_DONT_WAIT we need to set the timeout
             * to zero */
            if (flags & AE_DONT_WAIT) {
                tv.tv_sec = tv.tv_usec = 0;
                tvp = &tv;
            } else {
                /* Otherwise we can block */
                tvp = NULL; /* wait forever */
            }
        }
        if (eventLoop->flags & AE_DONT_WAIT) {
            tv.tv_sec = tv.tv_usec = 0;
            tvp = &tv;
        }


        if (eventLoop->beforesleep != NULL && flags & AE_CALL_BEFORE_SLEEP)
            eventLoop->beforesleep(eventLoop);


        /* Call the multiplexing API, will return only on timeout or when
         * some event fires. */
        // Call the multiplexing API; it returns only on timeout or when events fire
        // tvp only bounds the wait (the epoll_wait timeout) so time events run on schedule
        // with HAVE_EPOLL defined, the epoll implementation is compiled in
        // returns: numevents, the number of ready events;
        //          eventLoop->fired, the list of ready fd/mask pairs
        numevents = aeApiPoll(eventLoop, tvp);


        /* After sleep callback. */
        if (eventLoop->aftersleep != NULL && flags & AE_CALL_AFTER_SLEEP)
            eventLoop->aftersleep(eventLoop);


        for (j = 0; j < numevents; j++) {
            aeFileEvent *fe = &eventLoop->events[eventLoop->fired[j].fd];
            int mask = eventLoop->fired[j].mask;
            int fd = eventLoop->fired[j].fd;
            int fired = 0; /* Number of events fired for current fd. */
            // A single fd can be both readable and writable. Normally the read
            // handler runs first and the write handler second, since we often
            // want to reply right after processing a query. AE_BARRIER inverts
            // that order: the write handler runs first, so a reply produced by
            // this iteration's read is only sent after the next beforeSleep
            // (e.g. after fsyncing the AOF).
            int invert = fe->mask & AE_BARRIER;


            /* Note the "fe->mask & mask & ..." code: maybe an already
             * processed event removed an element that fired and we still
             * didn't processed, so we check if the event is still valid.
             *
             * Fire the readable event if the call sequence is not
             * inverted. */
            if (!invert && fe->mask & mask & AE_READABLE) {
                fe->rfileProc(eventLoop,fd,fe->clientData,mask);
                fired++;
                fe = &eventLoop->events[fd]; /* Refresh in case of resize. */
            }


            /* Fire the writable event. */
            if (fe->mask & mask & AE_WRITABLE) {
                if (!fired || fe->wfileProc != fe->rfileProc) { // avoid calling the same handler twice
                    fe->wfileProc(eventLoop,fd,fe->clientData,mask);
                    fired++;
                }
            }


            /* If we have to invert the call, fire the readable event now
             * after the writable one. */
            // when inverted, fire the read handler now, after the write handler
            if (invert) {
                fe = &eventLoop->events[fd]; /* Refresh in case of resize. */
                if ((fe->mask & mask & AE_READABLE) &&
                    (!fired || fe->wfileProc != fe->rfileProc))
                {
                    fe->rfileProc(eventLoop,fd,fe->clientData,mask);
                    fired++;
                }
            }


            processed++;
        }
    }
    /* Check time events */
    if (flags & AE_TIME_EVENTS)
        processed += processTimeEvents(eventLoop); // process time events


    return processed; /* return the number of processed file/time events */
}
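
aeSearchNearestTimer, used above to compute the poll timeout, is a plain O(N) scan over the timer list (from ae.c):

static aeTimeEvent *aeSearchNearestTimer(aeEventLoop *eventLoop)
{
    aeTimeEvent *te = eventLoop->timeEventHead;
    aeTimeEvent *nearest = NULL;

    while(te) {
        if (!nearest || te->when_sec < nearest->when_sec ||
                (te->when_sec == nearest->when_sec &&
                 te->when_ms < nearest->when_ms))
            nearest = te;
        te = te->next;
    }
    return nearest;
}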

Multiplexing API

redis ships several multiplexing implementations (evport/epoll/kqueue/select); which one is compiled in is selected by macros in ae.c:

#ifdef HAVE_EVPORT
#include "ae_evport.c"
#else
    #ifdef HAVE_EPOLL
    #include "ae_epoll.c"
    #else
        #ifdef HAVE_KQUEUE
        #include "ae_kqueue.c"
        #else
        #include "ae_select.c"
        #endif
    #endif
#endif

Here we look at just one of them: epoll.

epoll

// the state the epoll layer shares with ae
// struct epoll_event {
//     __uint32_t events; /* Epoll events */
//     epoll_data_t data; /* User data variable */
// };
typedef struct aeApiState {
    int epfd;  // epoll instance fd
    struct epoll_event *events;  // buffer the kernel copies ready events into
} aeApiState;
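
aeApiCreate allocates this state together with the epoll instance (from ae_epoll.c; details may vary slightly across versions):

static int aeApiCreate(aeEventLoop *eventLoop) {
    aeApiState *state = zmalloc(sizeof(aeApiState));

    if (!state) return -1;
    state->events = zmalloc(sizeof(struct epoll_event)*eventLoop->setsize);
    if (!state->events) {
        zfree(state);
        return -1;
    }
    state->epfd = epoll_create(1024); /* 1024 is just a hint for the kernel */
    if (state->epfd == -1) {
        zfree(state->events);
        zfree(state);
        return -1;
    }
    eventLoop->apidata = state;
    return 0;
}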

Adding an event

// register a file event: given fd and mask, tell epoll what to watch
static int aeApiAddEvent(aeEventLoop *eventLoop, int fd, int mask) {
    aeApiState *state = eventLoop->apidata;
    struct epoll_event ee = {0}; /* avoid valgrind warning */
    /* If the fd was already monitored for some event, we need a MOD
     * operation. Otherwise we need an ADD operation. */
    int op = eventLoop->events[fd].mask == AE_NONE ?
            EPOLL_CTL_ADD : EPOLL_CTL_MOD;  // ADD if the fd is not monitored yet, MOD otherwise


    ee.events = 0;
    mask |= eventLoop->events[fd].mask; /* Merge old events*/
    if (mask & AE_READABLE) ee.events |= EPOLLIN;
    if (mask & AE_WRITABLE) ee.events |= EPOLLOUT;
    ee.data.fd = fd;
    if (epoll_ctl(state->epfd,op,fd,&ee) == -1) return -1;
    return 0;
}
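
Its counterpart aeApiDelEvent removes a mask and either modifies or deletes the epoll registration (also from ae_epoll.c):

static void aeApiDelEvent(aeEventLoop *eventLoop, int fd, int delmask) {
    aeApiState *state = eventLoop->apidata;
    struct epoll_event ee = {0}; /* avoid valgrind warning */
    int mask = eventLoop->events[fd].mask & (~delmask);

    ee.events = 0;
    if (mask & AE_READABLE) ee.events |= EPOLLIN;
    if (mask & AE_WRITABLE) ee.events |= EPOLLOUT;
    ee.data.fd = fd;
    if (mask != AE_NONE) {
        epoll_ctl(state->epfd,EPOLL_CTL_MOD,fd,&ee);
    } else {
        /* Note, Kernel < 2.6.9 requires a non null event pointer
         * even for EPOLL_CTL_DEL. */
        epoll_ctl(state->epfd,EPOLL_CTL_DEL,fd,&ee);
    }
}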

The polling entry point that the event loop calls:

// calls the multiplexing API; returns only on timeout or when events are ready
// tvp is used purely as the epoll_wait timeout
// with HAVE_EPOLL defined, this epoll version is the one compiled in
// returns: the number of ready events;
//          eventLoop->fired holds the ready fd/mask pairs
static int aeApiPoll(aeEventLoop *eventLoop, struct timeval *tvp) {
    aeApiState *state = eventLoop->apidata;
    int retval, numevents = 0;


    // state->epfd: the epoll instance fd
    // state->events: preallocated array; the kernel only copies ready events
    //                into it, it never allocates user-space memory for us
    // eventLoop->setsize: the maximum number of fds (events) tracked
    // timeout: max wait in ms when nothing is ready (-1 blocks forever)
    // returns the number of ready events; 0 means timeout, -1 means an error
    // occurred and errno should be checked
    retval = epoll_wait(state->epfd,state->events,eventLoop->setsize,
            tvp ? (tvp->tv_sec*1000 + tvp->tv_usec/1000) : -1);
    if (retval > 0) {
        int j;


        numevents = retval;
        for (j = 0; j < numevents; j++) {
            int mask = 0;
            struct epoll_event *e = state->events+j; // the j-th ready event


            // epoll_event.events (a uint32_t) is tested flag by flag with bitwise AND
            if (e->events & EPOLLIN) mask |= AE_READABLE;   // fd is readable
            if (e->events & EPOLLOUT) mask |= AE_WRITABLE;  // fd is writable
            if (e->events & EPOLLERR) mask |= AE_WRITABLE|AE_READABLE; // error on the fd
            if (e->events & EPOLLHUP) mask |= AE_WRITABLE|AE_READABLE; // the fd was hung up
            eventLoop->fired[j].fd = e->data.fd;
            eventLoop->fired[j].mask = mask;
        }
    }
    return numevents;
}

Event examples

Time events

serverCron is redis's main time event handler; client timeout handling, statistics updates and more are all driven from it. It is registered as:

aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL)

serverCron returns 1000/server.hz (ms), i.e. the interval until its next run; server.hz is either a fixed configuration value or, with dynamic-hz enabled, adapted to the number of connected clients.
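
The adaptation sits at the top of serverCron (server.c, approximately; constant names are from the real source):

    /* Adapt server.hz to the number of clients when dynamic-hz is on,
     * so many clients still get timely per-client cron work. */
    if (server.dynamic_hz) {
        while (listLength(server.clients) / server.hz >
               MAX_CLIENTS_PER_CLOCK_TICK)
        {
            server.hz *= 2;
            if (server.hz > CONFIG_MAX_HZ) {
                server.hz = CONFIG_MAX_HZ;
                break;
            }
        }
    }
    ...
    return 1000/server.hz; /* next run in 1000/hz milliseconds */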

File events

For file events, let's walk through how one tcp request is handled.

Initialization:

At server startup, initServer creates file events for the listening sockets listenFd0~listenFdn, registering the read callback acceptTcpHandler on each fd, and then starts the eventloop.

The eventloop steps that serve one request:

  • the eventloop runs
  • beforeSleep calls handleClientsWithPendingWritesUsingThreads, which scans the server.clients_pending_write queue and registers a write event with callback sendReplyToClient for each client
  • the eventloop waits for events (this can last until the timeout, hence the "sleep" wording)
  • when events fire or the wait times out, the fired events are processed; they fall into these classes:
    • a read event on one of listenFd0~listenFdn runs acceptTcpHandler, which accepts the connection, obtains connectFd, and registers a read event with callback readQueryFromClient;
    • a file event on connectFd runs readQueryFromClient, which reads and processes the request; once done, the client is pushed onto the server.clients_pending_write queue;
    • a write event runs its callback sendReplyToClient

The corresponding code:

Handling client write events: iterate the server.clients_pending_write queue and, for each client's connection:

  • register a write event with the eventloop
  • register the sendReplyToClient callback, setting the mask to AE_WRITABLE
int handleClientsWithPendingWritesUsingThreads(void) {
    int processed = listLength(server.clients_pending_write);
    if (processed == 0) return 0; /* Return ASAP if there are no clients. */


    /* If I/O threads are disabled or we have few clients to serve, don't
     * use I/O threads, but the boring synchronous code. */
     // if I/O threads are disabled or there are few clients to serve, skip the threaded path
    if (server.io_threads_num == 1 || stopThreadedIOIfNeeded()) {
        return handleClientsWithPendingWrites();
    }


    /* Start threads if needed. */
    if (!server.io_threads_active) startThreadedIO();


    if (tio_debug) printf("%d TOTAL WRITE pending clients\n", processed);


    // Threaded path: spread clients_pending_write across N lists, one per I/O thread
    /* Distribute the clients across N different lists. */
    listIter li;
    listNode *ln;
    listRewind(server.clients_pending_write,&li);
    int item_id = 0;
    while((ln = listNext(&li))) {
        client *c = listNodeValue(ln);
        c->flags &= ~CLIENT_PENDING_WRITE;


        /* Remove clients from the list of pending writes since
         * they are going to be closed ASAP. */
        if (c->flags & CLIENT_CLOSE_ASAP) {
            listDelNode(server.clients_pending_write, ln); // remove the node from server.clients_pending_write
            continue;
        }


        int target_id = item_id % server.io_threads_num;
        listAddNodeTail(io_threads_list[target_id],c);
        item_id++;
    }


    /* Give the start condition to the waiting threads, by setting the
     * start condition atomic var. */
    io_threads_op = IO_THREADS_OP_WRITE;
    for (int j = 1; j < server.io_threads_num; j++) {
        int count = listLength(io_threads_list[j]);
        io_threads_pending[j] = count;
    }


    /* Also use the main thread to process a slice of clients. */
    listRewind(io_threads_list[0],&li);
    while((ln = listNext(&li))) {
        client *c = listNodeValue(ln);
        writeToClient(c,0);
    }
    listEmpty(io_threads_list[0]);


    /* Wait for all the other threads to end their work. */
    while(1) {
        unsigned long pending = 0;
        for (int j = 1; j < server.io_threads_num; j++)
            pending += io_threads_pending[j];
        if (pending == 0) break;
    }
    if (tio_debug) printf("I/O WRITE All threads finshed\n");


    /* Run the list of clients again to install the write handler where
     * needed. */
    // after the threaded writes, walk server.clients_pending_write again and
    // install a write handler wherever replies are still pending
    listRewind(server.clients_pending_write,&li);
    while((ln = listNext(&li))) {
        client *c = listNodeValue(ln);


        /* Install the write handler if there are pending writes in some
         * of the clients. */
        if (clientHasPendingReplies(c) &&  // c->bufpos || listLength(c->reply)
                connSetWriteHandler(c->conn, sendReplyToClient) == AE_ERR)  // register a write event with the eventloop; its callback sendReplyToClient flushes the buffers to the client
        {
            freeClientAsync(c);
        }
    }
    listEmpty(server.clients_pending_write);


    /* Update processed count on server */
    server.stat_io_writes_processed += processed;


    return processed;
}
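
connSetWriteHandler ends up in the socket connection type's handler, which maps directly onto ae file events; connSocketSetWriteHandler from connection.c looks roughly like this (the barrier argument is 0 here, so no AE_BARRIER):

static int connSocketSetWriteHandler(connection *conn, ConnectionCallbackFunc func, int barrier) {
    if (func == conn->write_handler) return C_OK;

    conn->write_handler = func;
    if (barrier)
        conn->flags |= CONN_FLAG_WRITE_BARRIER;
    else
        conn->flags &= ~CONN_FLAG_WRITE_BARRIER;
    if (!conn->write_handler)
        aeDeleteFileEvent(server.el,conn->fd,AE_WRITABLE);
    else
        if (aeCreateFileEvent(server.el,conn->fd,AE_WRITABLE,
                    conn->type->ae_handler,conn) == AE_ERR) return C_ERR;
    return C_OK;
}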

The listening socket's read event, connection establishment, and registration of the connectFd read event. acceptTcpHandler is the read callback registered on the listening sockets during startup:

void acceptTcpHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
    int cport, cfd, max = MAX_ACCEPTS_PER_CALL;
    char cip[NET_IP_STR_LEN];
    UNUSED(el);
    UNUSED(mask);
    UNUSED(privdata);


    while(max--) { // accept at most MAX_ACCEPTS_PER_CALL (1000) connections per call
        // fd: the listening fd
        // cip: the client ip
        // cport: the client port
        // returns: cfd, the connected fd
        // accepts one connection and fills in the client's ip/port; connectFd is
        // used to talk to the client while listenFd keeps listening
        cfd = anetTcpAccept(server.neterr, fd, cip, sizeof(cip), &cport);
        if (cfd == ANET_ERR) {
            if (errno != EWOULDBLOCK)
                serverLog(LL_WARNING,
                    "Accepting client connection: %s", server.neterr);
            return;
        }
        serverLog(LL_VERBOSE,"Accepted %s:%d", cip, cport);
        // connCreateAcceptedSocket: creates a connection, binds the ConnectionType
        //                           and fd, sets state CONN_STATE_ACCEPTING;
        //                           returns the new connection
        // acceptCommonHandler: creates the client and registers the read callback
        acceptCommonHandler(connCreateAcceptedSocket(cfd),0,cip);
    }
}

Creating the client and registering the connection's read event and its callback:

static void acceptCommonHandler(connection *conn, int flags, char *ip) {
    ......
    if ((c = createClient(conn)) == NULL) {
        ......
        return;
    }
    ......
}

The createClient implementation:

// create the client and configure the connection's socket state
// register the read callback readQueryFromClient, creating the corresponding file read event
client *createClient(connection *conn) {
    client *c = zmalloc(sizeof(client));


    if (conn) {
        connNonBlock(conn);
        connEnableTcpNoDelay(conn);
        if (server.tcpkeepalive)
            connKeepAlive(conn,server.tcpkeepalive);
        connSetReadHandler(conn, readQueryFromClient); // register readQueryFromClient as the read callback, creating the connection's read event
        connSetPrivateData(conn, c);
    }


    selectDb(c,0); // c->db = server.db[id]
    uint64_t client_id = ++server.next_client_id;
    c->id = client_id;
    ......
    if (conn) linkClient(c);
    initClientMultiState(c);
    return c;
}

The connection read callback readQueryFromClient reads from the socket, parses and executes the request, and pushes the resulting client onto the server.clients_pending_write queue:

void readQueryFromClient(connection *conn) {
    client *c = connGetPrivateData(conn);
    int nread, readlen;
    size_t qblen;


    /* Check if we want to read from the client later when exiting from
     * the event loop. This is the case if threaded I/O is enabled. */
     // if threaded I/O is enabled and the read should be postponed until after
     // this event loop iteration, the client is pushed onto
     // server.clients_pending_read and we return immediately
    if (postponeClientRead(c)) return;
    ......
    c->querybuf = sdsMakeRoomFor(c->querybuf, readlen);
    // read data from the connection
    nread = connRead(c->conn, c->querybuf+qblen, readlen);
    ......
    // command execution happens inside, via processCommandAndResetClient(c)
    processInputBuffer(c);
}
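
postponeClientRead is the gate for threaded reads; approximately (networking.c, Redis 6.0-era, flag details vary by version):

int postponeClientRead(client *c) {
    if (server.io_threads_active &&
        server.io_threads_do_reads &&
        !(c->flags & (CLIENT_MASTER|CLIENT_SLAVE|CLIENT_PENDING_READ)))
    {
        c->flags |= CLIENT_PENDING_READ;
        listAddNodeHead(server.clients_pending_read,c);
        return 1;
    } else {
        return 0;
    }
}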

Call chain: processInputBuffer -> processCommandAndResetClient(c) -> processCommand(c)

int processCommand(client *c) {
    ......
    /* Exec the command */
    if (c->flags & CLIENT_MULTI &&
        c->cmd->proc != execCommand && c->cmd->proc != discardCommand &&
        c->cmd->proc != multiCommand && c->cmd->proc != watchCommand)
    {
        queueMultiCommand(c);
        addReply(c,shared.queued);
    } else {
        // call() invokes c->cmd->proc(c), the concrete command handler
        // inside proc, addReply and friends write into the output buffer and push c
        // onto the head of server.clients_pending_write, setting CLIENT_PENDING_WRITE
        call(c,CMD_CALL_FULL);
        c->woff = server.master_repl_offset;
        if (listLength(server.ready_keys))
            handleClientsBlockedOnKeys();
    }
    return C_OK;
}
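
The push onto server.clients_pending_write mentioned above happens in clientInstallWriteHandler, reached from addReply via prepareClientToWrite (networking.c):

void clientInstallWriteHandler(client *c) {
    /* Schedule the client to write the output buffers to the socket only
     * if not already done and, for slaves, if the slave can actually
     * receive writes at this stage. */
    if (!(c->flags & CLIENT_PENDING_WRITE) &&
        (c->replstate == REPL_STATE_NONE ||
         (c->replstate == SLAVE_STATE_ONLINE && !c->repl_put_online_on_ack)))
    {
        c->flags |= CLIENT_PENDING_WRITE;
        listAddNodeHead(server.clients_pending_write,c);
    }
}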

The above is how redis serves a tcp request in the single-threaded case. With threaded I/O, the difference is that the connection read callback pushes clients onto the server.clients_pending_read queue and the write path pushes them onto server.clients_pending_write, and multiple I/O threads run the callbacks.