Redis源码分析12——AOF持久化下(重写)

882 阅读11分钟

AOF重写

原理

AOF持久化是通过保存被执行的写命令来记录数据库状态的,所以随着服务器运行时间的增长,AOF文件会越来越大,这样导致使用大文件还原所需的时间也就越多。重写并不是一条条分析aof文件中的日志,而是从数据库中读取现在的值,然后用一条命令来记录键值对,代替之前记录这个键值对的多条命令。

调用链:rewriteAppendOnlyFile ——> rewriteAppendOnlyFileRio

重写后的aof具备如下特点:
  1. 过期的键不会写入
  2. 重写使用进程内数据直接生成,这样新的AOF文件只保留最终数据的写入命令。
  3. 多条写命令可以合并为一个
  4. 单独开辟一个子进程执行rewrite
触发条件
手动触发

手动触发是通过bgrewriteaof命令。需要注意的是一次只能有一个子进程(无论是RDB子进程还是已有的AOF重写子进程)

123456789101112
/* BGREWRITEAOF command handler.
 *
 * Replies with an error when an AOF rewrite child is already running.
 * When an RDB child is running, the rewrite is only flagged as scheduled
 * (server.aof_rewrite_scheduled) and a status reply is sent. Otherwise the
 * background rewrite is started right away; shared.err is sent back if
 * starting it fails. */
void bgrewriteaofCommand(client *c) {
    /* An AOF rewrite child already exists: refuse. */
    if (server.aof_child_pid != -1) {
        addReplyError(c,"Background append only file rewriting already in progress");
        return;
    }

    /* An RDB child is running: defer the rewrite instead of starting it. */
    if (server.rdb_child_pid != -1) {
        server.aof_rewrite_scheduled = 1;
        addReplyStatus(c,"Background append only file rewriting scheduled");
        return;
    }

    /* No child at all: try to start the rewrite now. */
    if (rewriteAppendOnlyFileBackground() == C_OK) {
        addReplyStatus(c,"Background append only file rewriting started");
        return;
    }
    addReply(c,shared.err);
}
自动触发

根据配置文件中的auto-aof-rewrite-min-size和auto-aof-rewrite-percentage参数确定自动触发时机。即同时满足 aof_current_size > auto-aof-rewrite-min-size 和 (aof_current_size - aof_base_size) / aof_base_size × 100 >= auto-aof-rewrite-percentage(增长率按百分比计算,对应下面代码中的growth)

检测时间是在serverCron函数中

123456789101112131415
/* Trigger an AOF rewrite if needed. */       if (server.aof_state == AOF_ON &&           server.rdb_child_pid == -1 &&           server.aof_child_pid == -1 &&           server.aof_rewrite_perc &&           server.aof_current_size > server.aof_rewrite_min_size)       {           long long base = server.aof_rewrite_base_size ?               server.aof_rewrite_base_size : 1;           long long growth = (server.aof_current_size*100/base) - 100;           if (growth >= server.aof_rewrite_perc) {               serverLog(LL_NOTICE,"Starting automatic rewriting of AOF on %lld%% growth",growth);               rewriteAppendOnlyFileBackground();           }       }

当然,由于redis要求同一时刻只能有一个子进程,所以AOF重写子进程可能被延迟

1234
if (server.rdb_child_pid != -1) {       server.aof_rewrite_scheduled = 1;       serverLog(LL_WARNING,"AOF was enabled but there is already a child process saving an RDB file on disk. An AOF background was scheduled to start when possible.");   }

延迟了总得被唤醒吧,

在serverCron函数中也会执行检查、唤醒他的逻辑

12345
if (server.rdb_child_pid == -1 && server.aof_child_pid == -1 &&    server.aof_rewrite_scheduled){    rewriteAppendOnlyFileBackground();}
重写流程图

1591671922414

重写缓冲区

在执行重写时,redis是可以继续对外服务的,你可能会想,如果此时有数据过来,怎么办呢?redis使用了一个新的数据结构aof_rewrite_buf_blocks,叫做重写缓冲区,是一个链表,位于redisServer中,每个节点大小为10M

  1. 数据结构——redisServer结构中
1
list *aof_rewrite_buf_blocks;   /* Hold changes during an AOF rewrite. */
  1. 里面具体的内容为aofrwblock
123456789101112131415161718192021
#define AOF_RW_BUF_BLOCK_SIZE (1024*1024*10)    /* 10 MB per block */typedef struct aofrwblock {    unsigned long used, free;    char buf[AOF_RW_BUF_BLOCK_SIZE];} aofrwblock;//返回当前aof重写缓冲区的大小/* Return the current size of the AOF rewrite buffer. */unsigned long aofRewriteBufferSize(void) {    listNode *ln;    listIter li;    unsigned long size = 0;    listRewind(server.aof_rewrite_buf_blocks,&li);    while((ln = listNext(&li))) {        aofrwblock *block = listNodeValue(ln);        size += block->used;    }    return size;}
  1. 数据过来会将数据拷贝一份到aof重写缓冲区,见下面的aofRewriteBufferAppend函数
刷盘模式
  1. 开启混合模式

如果开启混合模式,则先进行rdb持久化,然后将增量aof数据追加在文件末尾

12345678910111213141516171819202122232425262728293031323334353637383940414243444546
int rewriteAppendOnlyFile(char *filename) {    rio aof;...    //这里先进行rdb持久化    if (server.aof_use_rdb_preamble) {        int error;        if (rdbSaveRio(&aof,&error,RDB_SAVE_AOF_PREAMBLE,NULL) == C_ERR) {            errno = error;            goto werr;        }    } else {        if (rewriteAppendOnlyFileRio(&aof) == C_ERR) goto werr;    }     if (fflush(fp) == EOF) goto werr;    if (fsync(fileno(fp)) == -1) goto werr;    /* 增量数据 */    int nodata = 0;    mstime_t start = mstime();    while(mstime()-start < 1000 && nodata < 20) {        if (aeWait(server.aof_pipe_read_data_from_parent, AE_READABLE, 1) <= 0)        {            nodata++;            continue;        }        nodata = 0; /* Start counting from zero, we stop on N *contiguous*                       timeouts. */        aofReadDiffFromParent();    }    /* Ask the master to stop sending diffs. */    if (write(server.aof_pipe_write_ack_to_parent,"!",1) != 1) goto werr;    if (anetNonBlock(NULL,server.aof_pipe_read_ack_from_parent) != ANET_OK)        goto werr;    /* We read the ACK from the server using a 10 seconds timeout. Normally     * it should reply ASAP, but just in case we lose its reply, we are sure     * the child will eventually get terminated. */    if (syncRead(server.aof_pipe_read_ack_from_parent,&byte,1,5000) != 1 ||        byte != '!') goto werr;    serverLog(LL_NOTICE,"Parent agreed to stop sending diffs. Finalizing AOF...");    /* Read the final diff if any. */    aofReadDiffFromParent();....}
  1. 仅开启AOF持久化(未开启混合模式),则直接进行AOF重写,然后追加增量数据
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667
int rewriteAppendOnlyFile(char *filename) {    rio aof;    FILE *fp;   //这里为false了    if (server.aof_use_rdb_preamble) {        int error;        if (rdbSaveRio(&aof,&error,RDB_SAVE_AOF_PREAMBLE,NULL) == C_ERR) {            errno = error;            goto werr;        }    } else {        if (rewriteAppendOnlyFileRio(&aof) == C_ERR) goto werr;    }        if (fflush(fp) == EOF) goto werr;    if (fsync(fileno(fp)) == -1) goto werr;     int nodata = 0;    mstime_t start = mstime();    while(mstime()-start < 1000 && nodata < 20) {        if (aeWait(server.aof_pipe_read_data_from_parent, AE_READABLE, 1) <= 0)        {            nodata++;            continue;        }        nodata = 0; /* Start counting from zero, we stop on N *contiguous*                       timeouts. */        aofReadDiffFromParent();    }    /* Ask the master to stop sending diffs. */    if (write(server.aof_pipe_write_ack_to_parent,"!",1) != 1) goto werr;    if (anetNonBlock(NULL,server.aof_pipe_read_ack_from_parent) != ANET_OK)        goto werr;    /* We read the ACK from the server using a 10 seconds timeout. Normally     * it should reply ASAP, but just in case we lose its reply, we are sure     * the child will eventually get terminated. */    if (syncRead(server.aof_pipe_read_ack_from_parent,&byte,1,5000) != 1 ||        byte != '!') goto werr;    serverLog(LL_NOTICE,"Parent agreed to stop sending diffs. Finalizing AOF...");    /* Read the final diff if any. */    aofReadDiffFromParent();    /* Write the received diff to the file. 
*/    serverLog(LL_NOTICE,        "Concatenating %.2f MB of AOF diff received from parent.",        (double) sdslen(server.aof_child_diff) / (1024*1024));    if (rioWrite(&aof,server.aof_child_diff,sdslen(server.aof_child_diff)) == 0)        goto werr;    /* Make sure data will not remain on the OS's output buffers */    if (fflush(fp) == EOF) goto werr;    if (fsync(fileno(fp)) == -1) goto werr;    if (fclose(fp) == EOF) goto werr;    /* Use RENAME to make sure the DB file is changed atomically only     * if the generate DB file is ok. */    if (rename(tmpfile,filename) == -1) {        serverLog(LL_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));        unlink(tmpfile);        return C_ERR;    }    }
  1. 若开启aof_rewrite_incremental_fsync选项,那么会每32M执行一次阻塞的fsync
1234567891011
int rewriteAppendOnlyFile(char *filename) {  ....    server.aof_child_diff = sdsempty();    rioInitWithFile(&aof,fp);    //如果配置了server.aof_rewrite_incremental_fsync,则在写aof时会增量地进行fsync,    //这里配置的是每写入32M就sync一次。避免集中sync导致磁盘跑满。    if (server.aof_rewrite_incremental_fsync)        rioSetAutoSync(&aof,REDIS_AUTOSYNC_BYTES); ....}
管道

在执行重写时,redis是可以继续对外服务的,那此时如何保证数据的一致性呢?答案就是:管道。通过管道,父进程会将重写缓冲区中的数据通过管道发送给子进程。

1591672177336

  1. 主进程

feedAppendOnlyFile函数:

12
if (server.aof_child_pid != -1)       aofRewriteBufferAppend((unsigned char*)buf,sdslen(buf));

aofRewriteBufferAppend函数:

12345678910111213141516171819202122232425262728293031323334353637383940414243
/* 将数据拷贝一份到aof重写缓冲区 */void aofRewriteBufferAppend(unsigned char *s, unsigned long len) {    listNode *ln = listLast(server.aof_rewrite_buf_blocks);    aofrwblock *block = ln ? ln->value : NULL;    while(len) {        if (block) {            unsigned long thislen = (block->free < len) ? block->free : len;            if (thislen) {  /* The current block is not already full. */                memcpy(block->buf+block->used, s, thislen);                block->used += thislen;                block->free -= thislen;                s += thislen;                len -= thislen;            }        }        if (len) { /* First block to allocate, or need another block. */            int numblocks;            block = zmalloc(sizeof(*block));            block->free = AOF_RW_BUF_BLOCK_SIZE;            block->used = 0;            listAddNodeTail(server.aof_rewrite_buf_blocks,block);                         numblocks = listLength(server.aof_rewrite_buf_blocks);            if (((numblocks+1) % 10) == 0) {                int level = ((numblocks+1) % 100) == 0 ? LL_WARNING :                                                         LL_NOTICE;                serverLog(level,"Background AOF buffer size: %lu MB",                    aofRewriteBufferSize()/(1024*1024));            }        }    }    /* 看到没注册了可写事件 */    if (aeGetFileEvents(server.el,server.aof_pipe_write_data_to_child) == 0) {        aeCreateFileEvent(server.el, server.aof_pipe_write_data_to_child,            AE_WRITABLE, aofChildWriteDiffData, NULL);    }}
  1. 子进程
12345678910111213141516171819202122232425
if ((childpid = fork()) == 0) {        char tmpfile[256];        /* Child         关闭监听套接字,不在接受新的连接        */        closeListeningSockets(0);        redisSetProcTitle("redis-aof-rewrite");        snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());        if (rewriteAppendOnlyFile(tmpfile) == C_OK) {            size_t private_dirty = zmalloc_get_private_dirty(-1);            if (private_dirty) {                serverLog(LL_NOTICE,                    "AOF rewrite: %zu MB of memory used by copy-on-write",                    private_dirty/(1024*1024));            }            server.child_info_data.cow_size = private_dirty;            sendChildInfo(CHILD_INFO_TYPE_AOF);            exitFromChild(0);        } else {            exitFromChild(1);        }    }

rewriteAppendOnlyFile函数:

123456789101112131415161718192021222324252627282930313233
int nodata = 0;  mstime_t start = mstime();  while(mstime()-start < 1000 && nodata < 20) {      if (aeWait(server.aof_pipe_read_data_from_parent, AE_READABLE, 1) <= 0)      {          nodata++;          continue;      }      nodata = 0; /* Start counting from zero, we stop on N *contiguous*                     timeouts. */      aofReadDiffFromParent();  }  /* Ask the master to stop sending diffs. */  if (write(server.aof_pipe_write_ack_to_parent,"!",1) != 1) goto werr;  if (anetNonBlock(NULL,server.aof_pipe_read_ack_from_parent) != ANET_OK)      goto werr;  /* We read the ACK from the server using a 10 seconds timeout. Normally   * it should reply ASAP, but just in case we lose its reply, we are sure   * the child will eventually get terminated. */  if (syncRead(server.aof_pipe_read_ack_from_parent,&byte,1,5000) != 1 ||      byte != '!') goto werr;  serverLog(LL_NOTICE,"Parent agreed to stop sending diffs. Finalizing AOF...");  /* Read the final diff if any. */  aofReadDiffFromParent();  /* Write the received diff to the file. */  serverLog(LL_NOTICE,      "Concatenating %.2f MB of AOF diff received from parent.",      (double) sdslen(server.aof_child_diff) / (1024*1024));  if (rioWrite(&aof,server.aof_child_diff,sdslen(server.aof_child_diff)) == 0)      goto werr;
AOF重写时临时文件
可以思考一下为啥呢?
1234567891011121314151617181920212223
if ((childpid = fork()) == 0) {        char tmpfile[256];       //注意这里的临时文件        closeListeningSockets(0);        redisSetProcTitle("redis-aof-rewrite");        snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());        if (rewriteAppendOnlyFile(tmpfile) == C_OK) {            size_t private_dirty = zmalloc_get_private_dirty(-1);            if (private_dirty) {                serverLog(LL_NOTICE,                    "AOF rewrite: %zu MB of memory used by copy-on-write",                    private_dirty/(1024*1024));            }            server.child_info_data.cow_size = private_dirty;            sendChildInfo(CHILD_INFO_TYPE_AOF);            exitFromChild(0);        } else {            exitFromChild(1);        }    }

有小伙伴可能会问到:在rewriteAppendOnlyFile函数中发现的临时文件并不是这个名字呀?首先肯定小伙伴阅读得很仔细。在这个函数中,子进程先写入的是temp-rewriteaof-%d.aof,虽然不是这个名字,但是在最后会调用rename,原子性地将其重命名为传入的filename(即上面的temp-rewriteaof-bg-%d.aof)。

123456789101112131415161718192021222324252627282930313233
int rewriteAppendOnlyFile(char *filename) {    rio aof;    FILE *fp;    char tmpfile[256];    char byte;   //临时文件    snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());    fp = fopen(tmpfile,"w");    if (!fp) {        serverLog(LL_WARNING, "Opening the temp file for AOF rewrite in rewriteAppendOnlyFile(): %s", strerror(errno));        return C_ERR;    }     .....    /* Make sure data will not remain on the OS's output buffers */    if (fflush(fp) == EOF) goto werr;    if (fsync(fileno(fp)) == -1) goto werr;    if (fclose(fp) == EOF) goto werr;    /* rename原子操作 */    if (rename(tmpfile,filename) == -1) {        serverLog(LL_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));        unlink(tmpfile);        return C_ERR;    }  ...werr:    serverLog(LL_WARNING,"Write error writing append only file on disk: %s", strerror(errno));    fclose(fp);    unlink(tmpfile);    return C_ERR;}
AOF重写时,也会尽可能避免rehash操作
12345678
//父进程中 updateDictResizePolicy();void updateDictResizePolicy(void) {    if (server.rdb_child_pid == -1 && server.aof_child_pid == -1)        dictEnableResize();    else        dictDisableResize();}
对key的限制

在aof重写的过程中,为了防止客户端输入缓冲区溢出,对于list、hash、set和zset这四种集合类型的键,会先检查其所包含的元素数量,如果元素数量超过了AOF_REWRITE_ITEMS_PER_CMD(默认为64),那么重写程序将使用多条命令来记录键值,而不是一条命令。

1591677673394

serverCron中wait3回收子进程
123456789101112131415161718192021222324252627282930313233343536
/* Check if a background saving or AOF rewrite in progress terminated. */  if (server.rdb_child_pid != -1 || server.aof_child_pid != -1 ||      ldbPendingChildren())  {      int statloc;      pid_t pid;      if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {          int exitcode = WEXITSTATUS(statloc);          int bysignal = 0;          if (WIFSIGNALED(statloc)) bysignal = WTERMSIG(statloc);          if (pid == -1) {              serverLog(LL_WARNING,"wait3() returned an error: %s. "                  "rdb_child_pid = %d, aof_child_pid = %d",                  strerror(errno),                  (int) server.rdb_child_pid,                  (int) server.aof_child_pid);          } else if (pid == server.rdb_child_pid) {              backgroundSaveDoneHandler(exitcode,bysignal);              if (!bysignal && exitcode == 0) receiveChildInfo();          } else if (pid == server.aof_child_pid) {              backgroundRewriteDoneHandler(exitcode,bysignal);              if (!bysignal && exitcode == 0) receiveChildInfo();          } else {              if (!ldbRemoveChild(pid)) {                  serverLog(LL_WARNING,                      "Warning, detected child with unmatched pid: %ld",                      (long)pid);              }          }          updateDictResizePolicy();          closeChildInfoPipe();      }  }
数据总可能存在差异,因为客户端是一直在发数据,因此在主进程回收子进程时,会调用backgroundRewriteDoneHandler函数中的aofRewriteBufferWrite将这些差异数据写入至aof临时文件(注意这个文件名是不是似曾相识)中。当然如果因为信号中断服务,可能造成数据的丢失

123456789101112131415161718192021222324
void backgroundRewriteDoneHandler(int exitcode, int bysignal) {    if (!bysignal && exitcode == 0) {        int newfd, oldfd;        char tmpfile[256];        long long now = ustime();        mstime_t latency;        /* Flush the differences accumulated by the parent to the         * rewritten AOF. */        latencyStartMonitor(latency);        snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof",            (int)server.aof_child_pid);        newfd = open(tmpfile,O_WRONLY|O_APPEND);        if (newfd == -1) {            serverLog(LL_WARNING,                "Unable to open the temporary AOF produced by the child: %s", strerror(errno));            goto cleanup;        }        //写入这个临时文件中。        if (aofRewriteBufferWrite(newfd) == -1) {                    close(newfd);            goto cleanup;        }

在父进程回收子进程时,会将剩余的差异数据全部写入到临时文件中,最后调用rename原子性地替换文件。

12345678910
if (rename(tmpfile,server.aof_filename) == -1) {           serverLog(LL_WARNING,               "Error trying to rename the temporary AOF file %s into %s: %s",               tmpfile,               server.aof_filename,               strerror(errno));           close(newfd);           if (oldfd != -1) close(oldfd);           goto cleanup;       }

接下来,该干啥干啥,如aof持久化:

12345678910111213141516171819202122
if (server.aof_fd == -1) {    /* AOF disabled, we don't need to set the AOF file descriptor     * to this new file, so we can close it. */    close(newfd);} else {    /* AOF enabled, replace the old fd with the new one. */    oldfd = server.aof_fd;    server.aof_fd = newfd;    if (server.aof_fsync == AOF_FSYNC_ALWAYS)        redis_fsync(newfd);    else if (server.aof_fsync == AOF_FSYNC_EVERYSEC)        aof_background_fsync(newfd);    server.aof_selected_db = -1; /* Make sure SELECT is re-issued */    aofUpdateCurrentSize();    server.aof_rewrite_base_size = server.aof_current_size;    /* Clear regular AOF buffer since its contents was just written to     * the new AOF from the background rewrite buffer. */    sdsfree(server.aof_buf);    server.aof_buf = sdsempty();}

至于丢失数据,因为发生错误直接跳转到这里

1234567891011
cleanup:    aofClosePipes();    aofRewriteBufferReset();    aofRemoveTempFile(server.aof_child_pid);    server.aof_child_pid = -1;    server.aof_rewrite_time_last = time(NULL)-server.aof_rewrite_time_start;    server.aof_rewrite_time_start = -1;    /* Schedule a new rewrite if we are waiting for it to switch the AOF ON. */    if (server.aof_state == AOF_WAIT_REWRITE)        server.aof_rewrite_scheduled = 1;
AOF重写对过期键处理

在前一篇文章中说到,AOF持久化中被惰性删除或者定期删除的键,会追加一条del指令至aof文件,并向客户端返回空;具体的清除则是在AOF重写时期完成。

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950
int rewriteAppendOnlyFileRio(rio *aof) {    dictIterator *di = NULL;    dictEntry *de;    size_t processed = 0;    int j;    for (j = 0; j < server.dbnum; j++) {        char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";        redisDb *db = server.db+j;        dict *d = db->dict;        if (dictSize(d) == 0) continue;        di = dictGetSafeIterator(d);        /* SELECT the new DB */        if (rioWrite(aof,selectcmd,sizeof(selectcmd)-1) == 0) goto werr;        if (rioWriteBulkLongLong(aof,j) == 0) goto werr;        /* Iterate this DB writing every entry */        while((de = dictNext(di)) != NULL) {            sds keystr;            robj key, *o;            long long expiretime;            keystr = dictGetKey(de);            o = dictGetVal(de);            initStaticStringObject(key,keystr);            //看看这里,他会查看是否过期了            expiretime = getExpire(db,&key);            ...            /* Save the expire time */            if (expiretime != -1) {                char cmd[]="*3\r\n$9\r\nPEXPIREAT\r\n";                if (rioWrite(aof,cmd,sizeof(cmd)-1) == 0) goto werr;                if (rioWriteBulkObject(aof,&key) == 0) goto werr;                if (rioWriteBulkLongLong(aof,expiretime) == 0) goto werr;            }            /* Read some diff from the parent process from time to time. */            if (aof->processed_bytes > processed+AOF_READ_DIFF_INTERVAL_BYTES) {                processed = aof->processed_bytes;                aofReadDiffFromParent();            }        }        dictReleaseIterator(di);        di = NULL;    }    return C_OK;     return C_ERR;}
info命令监控
123456789101112131415161718
127.0.0.1:6379> info Persistence
# Persistence
loading:0
rdb_changes_since_last_save:0
rdb_bgsave_in_progress:0
rdb_last_save_time:1591678222
rdb_last_bgsave_status:ok
rdb_last_bgsave_time_sec:-1
rdb_current_bgsave_time_sec:-1
rdb_last_cow_size:0
aof_enabled:0         ----->表示是否启用aof持久化
aof_rewrite_in_progress:0   ----->表示aof子进程是否在运行
aof_rewrite_scheduled:0   ----->表示aof重写是否被延迟调度
aof_last_rewrite_time_sec:-1   ----->表示上一次aof重写运行的时间,-1表示未运行
aof_current_rewrite_time_sec:-1   ----->表示当前aof重写运行的时间,-1表示未运行
aof_last_bgrewrite_status:ok  ----->表示最近一次aof重写进程状态
aof_last_write_status:ok   ----->表示最近一次aof持久化进程状态
aof_last_cow_size:0   ----->表示最近一次aof重写进程cow复制大小