AOF重写
原理
AOF持久化是通过保存被执行的写命令来记录数据库状态的,所以随着服务器运行时间的增长,AOF文件会越来越大,使用它还原数据所需的时间也就越长。重写并不是逐条分析现有aof文件中的日志,而是直接从数据库中读取键当前的值,然后用一条命令来记录这个键值对,代替之前记录这个键值对的多条命令。
rewriteAppendOnlyFile—–>rewriteAppendOnlyFileRio
重写后的aof具备如下特点:
- 过期的键不会写入
- 重写使用进程内数据直接生成,这样新的AOF文件只保留最终数据的写入命令。
- 多条写命令可以合并为一个
- 单独开辟一个子进程执行rewrite
触发条件
手动触发
手动触发是通过bgrewriteaof命令。需要注意的是一次只能有一个子进程(无论是RDB子进程还是已有的AOF重写子进程):如果已有AOF重写子进程在运行,命令直接返回错误;如果有RDB子进程在运行,则只设置aof_rewrite_scheduled标记,重写被延后执行。
123456789101112
|
void bgrewriteaofCommand(client *c) { if (server.aof_child_pid != -1) { addReplyError(c,"Background append only file rewriting already in progress"); } else if (server.rdb_child_pid != -1) { server.aof_rewrite_scheduled = 1; addReplyStatus(c,"Background append only file rewriting scheduled"); } else if (rewriteAppendOnlyFileBackground() == C_OK) { addReplyStatus(c,"Background append only file rewriting started"); } else { addReply(c,shared.err); }}
|
自动触发
根据配置文件auto-aof-rewrite-min-size和auto-aof-rewrite-percentage参数确定自动触发时机。即同时满足 aof_current_size > auto-aof-rewrite-min-size 和 (aof_current_size - aof_base_size) / aof_base_size × 100 >= auto-aof-rewrite-percentage(增长率按百分比计算)
检测时间是在serverCron函数中
123456789101112131415
|
/* Trigger an AOF rewrite if needed. */ if (server.aof_state == AOF_ON && server.rdb_child_pid == -1 && server.aof_child_pid == -1 && server.aof_rewrite_perc && server.aof_current_size > server.aof_rewrite_min_size) { long long base = server.aof_rewrite_base_size ? server.aof_rewrite_base_size : 1; long long growth = (server.aof_current_size*100/base) - 100; if (growth >= server.aof_rewrite_perc) { serverLog(LL_NOTICE,"Starting automatic rewriting of AOF on %lld%% growth",growth); rewriteAppendOnlyFileBackground(); } }
|
当然由于redis要求同一时刻只能有一个子进程,所以AOF重写子进程可能被延迟
1234
|
if (server.rdb_child_pid != -1) { server.aof_rewrite_scheduled = 1
|
延迟了总得被唤醒吧,在serverCron函数中也会执行检查、唤醒它的逻辑。
12345
|
if (server.rdb_child_pid == -1 && server.aof_child_pid == -1 && server.aof_rewrite_scheduled){ rewriteAppendOnlyFileBackground();}
|
重写流程图

重写缓冲区
在执行重写时,redis是可以继续对外服务的,你可能会想,如果此时有数据过来,怎么办呢?redis使用了一个新的数据结构aof_rewrite_buf_blocks,叫做重写缓冲区,是一个链表,位于redisServer中,每个节点大小为10M
- 数据结构——redisServer结构中
1
|
list *aof_rewrite_buf_blocks;
|
- 里面具体的内容为aofrwblock
123456789101112131415161718192021
|
#define AOF_RW_BUF_BLOCK_SIZE (1024*1024*10) typedef struct aofrwblock { unsigned long used, free; char buf[AOF_RW_BUF_BLOCK_SIZE];} aofrwblock;//返回当前aof重写缓冲区的大小unsigned long aofRewriteBufferSize(void) { listNode *ln; listIter li; unsigned long size = 0; listRewind(server.aof_rewrite_buf_blocks,&li); while((ln = listNext(&li))) { aofrwblock *block = listNodeValue(ln); size += block->used; } return size;}
|
- 数据过来时,会将数据拷贝一份到aof重写缓冲区,见下面的aofRewriteBufferAppend函数
刷盘模式
- 开启混合模式
如果开启混合模式(即aof_use_rdb_preamble为真),则先以RDB格式写入当前数据,然后将增量aof数据追加在文件末尾
12345678910111213141516171819202122232425262728293031323334353637383940414243444546
|
int rewriteAppendOnlyFile(char *filename) { rio aof;... //这里先进行rdb持久化 if (server.aof_use_rdb_preamble) { int error; if (rdbSaveRio(&aof,&error,RDB_SAVE_AOF_PREAMBLE,NULL) == C_ERR) { errno = error; goto werr; } } else { if (rewriteAppendOnlyFileRio(&aof) == C_ERR) goto werr; } if (fflush(fp) == EOF) goto werr; if (fsync(fileno(fp)) == -1) goto werr; /* 增量数据 */ int nodata = 0; mstime_t start = mstime(); while(mstime()-start < 1000 && nodata < 20) { if (aeWait(server.aof_pipe_read_data_from_parent, AE_READABLE, 1) <= 0) { nodata++; continue; } nodata = 0; /* Start counting from zero, we stop on N *contiguous* timeouts. */ aofReadDiffFromParent(); } /* Ask the master to stop sending diffs. */ if (write(server.aof_pipe_write_ack_to_parent,"!",1) != 1) goto werr; if (anetNonBlock(NULL,server.aof_pipe_read_ack_from_parent) != ANET_OK) goto werr; /* We read the ACK from the server using a 10 seconds timeout. Normally * it should reply ASAP, but just in case we lose its reply, we are sure * the child will eventually get terminated. */ if (syncRead(server.aof_pipe_read_ack_from_parent,&byte,1,5000) != 1 || byte !=
|
- 仅仅AOF持久化,则直接AOF持久化+增量数据
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667
|
int rewriteAppendOnlyFile(char *filename) { rio aof; FILE *fp; //这里为false了 if (server.aof_use_rdb_preamble) { int error; if (rdbSaveRio(&aof,&error,RDB_SAVE_AOF_PREAMBLE,NULL) == C_ERR) { errno = error; goto werr; } } else { if (rewriteAppendOnlyFileRio(&aof) == C_ERR) goto werr; } if (fflush(fp) == EOF) goto werr; if (fsync(fileno(fp)) == -1) goto werr; int nodata = 0; mstime_t start = mstime(); while(mstime()-start < 1000 && nodata < 20) { if (aeWait(server.aof_pipe_read_data_from_parent, AE_READABLE, 1) <= 0) { nodata++; continue; } nodata = 0; aofReadDiffFromParent(); } if (write(server.aof_pipe_write_ack_to_parent,"!",1) != 1) goto werr; if (anetNonBlock(NULL,server.aof_pipe_read_ack_from_parent) != ANET_OK) goto werr; if (syncRead(server.aof_pipe_read_ack_from_parent,&byte,1,5000) != 1 || byte != '!') goto werr; serverLog(LL_NOTICE,"Parent agreed to stop sending diffs. Finalizing AOF..."); aofReadDiffFromParent(); serverLog(LL_NOTICE, "Concatenating %.2f MB of AOF diff received from parent.", (double) sdslen(server.aof_child_diff) / (1024*1024)); if (rioWrite(&aof,server.aof_child_diff,sdslen(server.aof_child_diff)) == 0) goto werr; if (fflush(fp) == EOF) goto werr; if (fsync(fileno(fp)) == -1) goto werr; if (fclose(fp) == EOF) goto werr; if (rename(tmpfile,filename) == -1) { serverLog(LL_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno)); unlink(tmpfile); return C_ERR; } }
|
- 若开启aof_rewrite_incremental_fsync选项,那么会每32M执行一次阻塞的fsync
1234567891011
|
int rewriteAppendOnlyFile(char *filename) { .... server.aof_child_diff = sdsempty(); rioInitWithFile(&aof,fp);
|
管道
在执行重写时,redis是可以继续对外服务的,那此时如何保证数据的一致性呢?答案就是:管道。父进程会将重写缓冲区中的数据通过管道发送给子进程。

- 主进程
feedAppendOnlyFile函数:
12
|
if (server.aof_child_pid != -1) aofRewriteBufferAppend((unsigned char*)buf,sdslen(buf));
|
aofRewriteBufferAppend函数:
12345678910111213141516171819202122232425262728293031323334353637383940414243
|
void aofRewriteBufferAppend(unsigned char *s, unsigned long len) { listNode *ln = listLast(server.aof_rewrite_buf_blocks); aofrwblock *block = ln ? ln->value : NULL; while(len) { if (block) { unsigned long thislen = (block->free < len) ? block->free : len; if (thislen) { memcpy(block->buf+block->used, s, thislen); block->used += thislen; block->free -= thislen; s += thislen; len -= thislen; } } if (len) { int numblocks; block = zmalloc(sizeof(*block)); block->free = AOF_RW_BUF_BLOCK_SIZE; block->used = 0; listAddNodeTail(server.aof_rewrite_buf_blocks,block); numblocks = listLength(server.aof_rewrite_buf_blocks); if (((numblocks+1) % 10) == 0) { int level = ((numblocks+1) % 100) == 0 ? LL_WARNING : LL_NOTICE; serverLog(level,"Background AOF buffer size: %lu MB", aofRewriteBufferSize()/(1024*1024)); } } } if (aeGetFileEvents(server.el,server.aof_pipe_write_data_to_child) == 0) { aeCreateFileEvent(server.el, server.aof_pipe_write_data_to_child, AE_WRITABLE, aofChildWriteDiffData, NULL); }}
|
- 子进程
12345678910111213141516171819202122232425
|
if ((childpid = fork()) == 0) { char tmpfile[256]; closeListeningSockets(0); redisSetProcTitle("redis-aof-rewrite"); snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid()); if (rewriteAppendOnlyFile(tmpfile) == C_OK) { size_t private_dirty = zmalloc_get_private_dirty(-1); if (private_dirty) { serverLog(LL_NOTICE, "AOF rewrite: %zu MB of memory used by copy-on-write", private_dirty/(1024*1024)); } server.child_info_data.cow_size = private_dirty; sendChildInfo(CHILD_INFO_TYPE_AOF); exitFromChild(0); } else { exitFromChild(1); } }
|
rewriteAppendOnlyFile函数:
123456789101112131415161718192021222324252627282930313233
|
int nodata = 0; mstime_t start = mstime(); while(mstime()-start < 1000 && nodata < 20) { if (aeWait(server.aof_pipe_read_data_from_parent, AE_READABLE, 1) <= 0) { nodata++; continue; } nodata = 0; aofReadDiffFromParent(); } if (write(server.aof_pipe_write_ack_to_parent,"!",1) != 1) goto werr; if (anetNonBlock(NULL,server.aof_pipe_read_ack_from_parent) != ANET_OK) goto werr; if (syncRead(server.aof_pipe_read_ack_from_parent,&byte,1,5000) != 1 || byte != '!') goto werr; serverLog(LL_NOTICE,"Parent agreed to stop sending diffs. Finalizing AOF..."); aofReadDiffFromParent(); serverLog(LL_NOTICE, "Concatenating %.2f MB of AOF diff received from parent.", (double) sdslen(server.aof_child_diff) / (1024*1024)); if (rioWrite(&aof,server.aof_child_diff,sdslen(server.aof_child_diff)) == 0) goto werr;
|
AOF重写时临时文件
可以思考一下为啥呢?
1234567891011121314151617181920212223
|
if ((childpid = fork()) == 0) { char tmpfile[256]; //注意这里的临时文件 closeListeningSockets(0); redisSetProcTitle("redis-aof-rewrite"); snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid()); if (rewriteAppendOnlyFile(tmpfile) == C_OK) { size_t private_dirty = zmalloc_get_private_dirty(-1); if (private_dirty) { serverLog(LL_NOTICE, "AOF rewrite: %zu MB of memory used by copy-on-write", private_dirty/(1024*1024)); } server.child_info_data.cow_size = private_dirty; sendChildInfo(CHILD_INFO_TYPE_AOF); exitFromChild(0); } else { exitFromChild(1); } }
|
有小伙伴可能会问到:在rewriteAppendOnlyFile函数中发现的临时文件并不是这个名字呀?首先肯定小伙伴阅读得很仔细。在这个函数中使用的临时文件名是temp-rewriteaof-%d.aof,虽然不是上面的名字,但是在最后会调用rename将它原子性地重命名为传入的文件名(即temp-rewriteaof-bg-%d.aof)。
123456789101112131415161718192021222324252627282930313233
|
int rewriteAppendOnlyFile(char *filename) { rio aof; FILE *fp; char tmpfile[256]; char byte; //临时文件 snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid()); fp = fopen(tmpfile,"w"); if (!fp) { serverLog(LL_WARNING, "Opening the temp file for AOF rewrite in rewriteAppendOnlyFile(): %s", strerror(errno)); return C_ERR; } ..... if (fflush(fp) == EOF) goto werr; if (fsync(fileno(fp)) == -1) goto werr; if (fclose(fp) == EOF) goto werr; if (rename(tmpfile,filename) == -1) { serverLog(LL_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno)); unlink(tmpfile); return C_ERR; } ...werr: serverLog(LL_WARNING,"Write error writing append only file on disk: %s", strerror(errno)); fclose(fp); unlink(tmpfile); return C_ERR;}
|
AOF重写时,也会尽可能避免rehash操作
12345678
|
//父进程中 updateDictResizePolicy();void updateDictResizePolicy(void) { if (server.rdb_child_pid == -1 && server.aof_child_pid == -1) dictEnableResize(); else dictDisableResize();}
|
对key的限制
在aof重写的过程中,为了防止客户端输入缓冲区溢出,在处理list、hash、set和zset这四种集合类型的键时,会先检查其所包含的元素数量,如果元素数量超过了AOF_REWRITE_ITEMS_PER_CMD(默认为64),那么重写程序将使用多条命令来记录键值,而不是一条命令。

serverCron中wait3回收子进程
123456789101112131415161718192021222324252627282930313233343536
|
if (server.rdb_child_pid != -1 || server.aof_child_pid != -1 || ldbPendingChildren()) { int statloc; pid_t pid; if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) { int exitcode = WEXITSTATUS(statloc); int bysignal = 0; if (WIFSIGNALED(statloc)) bysignal = WTERMSIG(statloc); if (pid == -1) { serverLog(LL_WARNING,"wait3() returned an error: %s. " "rdb_child_pid = %d, aof_child_pid = %d", strerror(errno), (int) server.rdb_child_pid, (int) server.aof_child_pid); } else if (pid == server.rdb_child_pid) { backgroundSaveDoneHandler(exitcode,bysignal); if (!bysignal && exitcode == 0) receiveChildInfo(); } else if (pid == server.aof_child_pid) { backgroundRewriteDoneHandler(exitcode,bysignal); if (!bysignal && exitcode == 0) receiveChildInfo(); } else { if (!ldbRemoveChild(pid)) { serverLog(LL_WARNING, "Warning, detected child with unmatched pid: %ld", (long)pid); } } updateDictResizePolicy(); closeChildInfoPipe(); } }
|
数据总可能存在差异,因为客户端一直在发数据,因此主进程在回收子进程时,会调用backgroundRewriteDoneHandler函数,并在其中通过aofRewriteBufferWrite将这些差异数据写入aof临时文件(注意这个文件名是不是似曾相识)。当然,如果子进程因为信号而中断,则可能造成数据的丢失。
123456789101112131415161718192021222324
|
void backgroundRewriteDoneHandler(int exitcode, int bysignal) { if (!bysignal && exitcode == 0) { int newfd, oldfd; char tmpfile[256]; long long now = ustime(); mstime_t latency; latencyStartMonitor(latency); snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int)server.aof_child_pid); newfd = open(tmpfile,O_WRONLY|O_APPEND); if (newfd == -1) { serverLog(LL_WARNING, "Unable to open the temporary AOF produced by the child: %s", strerror(errno)); goto cleanup; }
|
父进程在回收子进程的过程中,会将剩余的差异数据全部写入到临时文件中,最后调用rename原子性地替换aof文件。
12345678910
|
if (rename(tmpfile,server.aof_filename) == -1) { serverLog(LL_WARNING, "Error trying to rename the temporary AOF file %s into %s: %s", tmpfile, server.aof_filename, strerror(errno)); close(newfd); if (oldfd != -1) close(oldfd); goto cleanup; }
|
接下来,该干啥干啥,如aof持久化:
12345678910111213141516171819202122
|
if (server.aof_fd == -1) { close(newfd);} else { oldfd = server.aof_fd; server.aof_fd = newfd; if (server.aof_fsync == AOF_FSYNC_ALWAYS) redis_fsync(newfd); else if (server.aof_fsync == AOF_FSYNC_EVERYSEC) aof_background_fsync(newfd); server.aof_selected_db = -1; aofUpdateCurrentSize(); server.aof_rewrite_base_size = server.aof_current_size; sdsfree(server.aof_buf); server.aof_buf = sdsempty();}
|
至于数据丢失,是因为发生错误时会直接跳转到下面的cleanup处:重写缓冲区被重置,临时文件被删除
1234567891011
|
cleanup: aofClosePipes(); aofRewriteBufferReset(); aofRemoveTempFile(server.aof_child_pid); server.aof_child_pid = -1; server.aof_rewrite_time_last = time(NULL)-server.aof_rewrite_time_start; server.aof_rewrite_time_start = -1; /* Schedule a new rewrite if we are waiting for it to switch the AOF ON. */ if (server.aof_state == AOF_WAIT_REWRITE) server.aof_rewrite_scheduled = 1;
|
AOF重写对过期键处理
在前一篇文章中说到,AOF持久化时,键被惰性删除或者定期删除后,会追加一条del指令至aof文件,并向客户端返回空;而这些过期键的具体清除是在AOF重写时期完成的。
1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950
|
int rewriteAppendOnlyFileRio(rio *aof) { dictIterator *di = NULL; dictEntry *de; size_t processed = 0; int j; for (j = 0; j < server.dbnum; j++) { char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n"; redisDb *db = server.db+j; dict *d = db->dict; if (dictSize(d) == 0) continue; di = dictGetSafeIterator(d); if (rioWrite(aof,selectcmd,sizeof(selectcmd)-1) == 0) goto werr; if (rioWriteBulkLongLong(aof,j) == 0) goto werr; while((de = dictNext(di)) != NULL) { sds keystr; robj key, *o; long long expiretime; keystr = dictGetKey(de); o = dictGetVal(de); initStaticStringObject(key,keystr);
|
info命令监控
123456789101112131415161718
|
127.0.0.1:6379> info Persistence
# Persistence
loading:0
rdb_changes_since_last_save:0
rdb_bgsave_in_progress:0
rdb_last_save_time:1591678222
rdb_last_bgsave_status:ok
rdb_last_bgsave_time_sec:-1
rdb_current_bgsave_time_sec:-1
rdb_last_cow_size:0
aof_enabled:0 ----->表示是否启用aof持久化
aof_rewrite_in_progress:0 ----->表示aof子进程是否在运行
aof_rewrite_scheduled:0 ----->表示aof重写是否被延迟调度
aof_last_rewrite_time_sec:-1 ----->表示上一次aof重写运行的时间(秒),-1表示未运行
aof_current_rewrite_time_sec:-1 ----->表示当前aof重写运行的时间(秒),-1表示未运行
aof_last_bgrewrite_status:ok ----->表示最近一次aof重写进程状态
aof_last_write_status:ok ----->表示最近一次aof持久化进程状态
aof_last_cow_size:0 ----->表示最近一次aof重写进程cow复制大小
|
