Redis源码分析11——AOF持久化上

847 阅读12分钟

AOF

思想

AOF(append only file):以独立日志的方式记录每次写命令,重启时再重新执行AOF文件中的命令,以达到恢复数据的目的。AOF的主要作用是解决了数据持久化的实时性问题,目前已经是Redis持久化的主流方式。

特点
  1. 被写入到AOF文件的所有命令都是以redis命令请求协议格式保存的,redis请求协议是纯文本的,具有很高的可读性,且可以直接修改。因为是纯文本协议,因此我们可以使用记事本这类工具打开,在centos上,使用vim打开。
12345678910111213141516171819
^M----->对应的就是\r\n$6^M  4 SELECT^M  5 $1^M  6 0^M  7 *3^M  8 $3^M  9 set^M 10 $1^M 11 a^M 12 $1^M 13 a^M 14 *5^M 15 $4^M 16 mset^M 17 $1^M 18 a^M 19 $1^M 20 a^M
  2. 只保存写命令(pubsub除外)
  3. 支持aof重写
  4. 支持RDB+AOF混合存储
配置文件

1591584644260

123456789101112131415161718192021222324252627282930313233343536373839404142
appendonly noappendfilename "appendonly.aof"# The fsync() call tells the Operating System to actually write data on disk# instead of waiting for more data in the output buffer. Some OS will really flush# data on disk, some other OS will just try to do it ASAP.## Redis supports three different modes:## no: don't fsync, just let the OS flush the data when it wants. Faster.# always: fsync after every write to the append only log. Slow, Safest.# everysec: fsync only one time every second. Compromise.## The default is "everysec", as that's usually the right compromise between# speed and data safety. It's up to you to understand if you can relax this to# "no" that will let the operating system flush the output buffer when# it wants, for better performances (but if you can live with the idea of# some data loss consider the default persistence mode that's snapshotting),# or on the contrary, use "always" that's very slow but a bit safer than# everysec.## More details please check the following article:# http://antirez.com/post/redis-persistence-demystified.html## If unsure, use "everysec".# appendfsync alwaysappendfsync everysec# appendfsync nono-appendfsync-on-rewrite noauto-aof-rewrite-percentage 100auto-aof-rewrite-min-size 64mb aof-load-truncated yes#   [RDB file][AOF tail] aof-use-rdb-preamble yes
AOF写入流程
数据结构

redis中的aof数据是先保存在aof缓冲区中的,数据结构见redisServer:

123456
// aof 缓冲区struct redisServer {    ...    sds aof_buf;      /* AOF buffer, written before entering the event loop */    ...}
持久化参数解析

redis中aof刷盘有三种策略,由参数appendfsync控制:

  • always:每次写入都要同步AOF文件,在一般的SATA硬盘上,Redis只能支持大约几百TPS写入,显然跟Redis高性能特性背道而驰,不建议配置。

  • no:由操作系统决定何时同步AOF文件,同步周期不可控(Linux下通常最长约30秒),而且会加大每次同步硬盘的数据量,虽然提升了性能,但数据安全性无法保证。

  • everysec:是建议的同步策略,也是默认配置,做到兼顾性能和数据安全性。理论上只有在系统突然宕机的情况下才会丢失约1秒的数据。需要注意的是,使用everysec选项时,fsync是交给后台线程异步执行的(还记得我们之前讨论的三类子线程吗)。

123456789101112131415161718192021222324252627
/* Define redis_fsync to fdatasync() in Linux and fsync() for all the rest */#ifdef __linux__#define redis_fsync fdatasync#else#define redis_fsync fsync#endifvoid flushAppendOnlyFile(int force) {    ...        //这个是条件    else if ((server.aof_fsync == AOF_FSYNC_EVERYSEC &&                server.unixtime > server.aof_last_fsync)) {        // 每秒刷新缓存到磁盘一次。        if (!sync_in_progress) {            // 添加任务到后台线程。            aof_background_fsync(server.aof_fd);            server.aof_fsync_offset = server.aof_current_size;        }        server.aof_last_fsync = server.unixtime;    }    ...}// 添加异步任务void aof_background_fsync(int fd) {    bioCreateBackgroundJob(BIO_AOF_FSYNC,(void*)(long)fd,NULL,NULL);}

关于更多IO,推荐看看这篇文章lrita.github.io/2019/03/13/…

具体流程

1591586726321

  1. feedAppendOnlyFile

开启aof持久化时,会先将命令追加到aof_buf缓冲区中,然后在进入事件循环之前以及serverCron定时任务中,将缓冲区中的数据写入磁盘。
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061
void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {    sds buf = sdsempty();    robj *tmpargv[3];    /* The DB this command was targeting is not the same as the last command     * we appended. To issue a SELECT command is needed. */    if (dictid != server.aof_selected_db) {        char seldb[64];        snprintf(seldb,sizeof(seldb),"%d",dictid);        buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",            (unsigned long)strlen(seldb),seldb);        server.aof_selected_db = dictid;    }    if (cmd->proc == expireCommand || cmd->proc == pexpireCommand ||        cmd->proc == expireatCommand) {        /* Translate EXPIRE/PEXPIRE/EXPIREAT into PEXPIREAT */        buf = catAppendOnlyExpireAtCommand(buf,cmd,argv[1],argv[2]);    } else if (cmd->proc == setexCommand || cmd->proc == psetexCommand) {        /* Translate SETEX/PSETEX to SET and PEXPIREAT */        tmpargv[0] = createStringObject("SET",3);        tmpargv[1] = argv[1];        tmpargv[2] = argv[3];        buf = catAppendOnlyGenericCommand(buf,3,tmpargv);        decrRefCount(tmpargv[0]);        buf = catAppendOnlyExpireAtCommand(buf,cmd,argv[1],argv[2]);    } else if (cmd->proc == setCommand && argc > 3) {        int i;        robj *exarg = NULL, *pxarg = NULL;        /* Translate SET [EX seconds][PX milliseconds] to SET and PEXPIREAT */        buf = catAppendOnlyGenericCommand(buf,3,argv);        for (i = 3; i < argc; i ++) {            if (!strcasecmp(argv[i]->ptr, "ex")) exarg = argv[i+1];            if (!strcasecmp(argv[i]->ptr, "px")) pxarg = argv[i+1];        }        serverAssert(!(exarg && pxarg));        if (exarg)            buf = catAppendOnlyExpireAtCommand(buf,server.expireCommand,argv[1],                                               exarg);        if (pxarg)            buf = catAppendOnlyExpireAtCommand(buf,server.pexpireCommand,argv[1],                                               pxarg);    } else {        /* All the other commands don't 
need translation or need the         * same translation already operated in the command vector         * for the replication itself. */        buf = catAppendOnlyGenericCommand(buf,argc,argv);    }    /* 重点在这里 */    if (server.aof_state == AOF_ON)        server.aof_buf = sdscatlen(server.aof_buf,buf,sdslen(buf));    /*如果正在rewrite重写,此时为了保证数据的一致性,需要将这个增量数据通过管道写给子进程 */    if (server.aof_child_pid != -1)        aofRewriteBufferAppend((unsigned char*)buf,sdslen(buf));    sdsfree(buf);}
  1. flushAppendOnlyFile
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157
#define AOF_WRITE_LOG_ERROR_RATE 30 /* Seconds between errors logging. */void flushAppendOnlyFile(int force) {    ssize_t nwritten;    int sync_in_progress = 0;    mstime_t latency;    if (sdslen(server.aof_buf) == 0) return;    //如果是AOF_FSYNC_EVERYSEC模式,因为aof写入操作是在单独线程完成的,所以要看是否有处于正在    //同步中    if (server.aof_fsync == AOF_FSYNC_EVERYSEC)        sync_in_progress = bioPendingJobsOfType(BIO_AOF_FSYNC) != 0;    if (server.aof_fsync == AOF_FSYNC_EVERYSEC && !force) {               if (sync_in_progress) {          //我们可以推迟1,2s,但是也不能一直推迟           // 前面没有推迟过 write 操作,这里将推迟写操作的起始时间记录下来            // 然后就返回,不执行 write 或者 fsync            if (server.aof_flush_postponed_start == 0) {                /* No previous write postponing, remember that we are                 * postponing the flush and return. */                server.aof_flush_postponed_start = server.unixtime;                return;            } else if (server.unixtime - server.aof_flush_postponed_start < 2) {                /* We were already waiting for fsync to finish, but for less                 * than two seconds this is still ok. Postpone again. */                return;            }            /* Otherwise fall trough, and go write since we can't wait             * over two seconds.             * 如果超过2s了,那么write将会阻塞             *              *  */            server.aof_delayed_fsync++;            serverLog(LL_NOTICE,"Asynchronous AOF fsync is taking too long (disk is busy?). 
Writing the AOF buffer without waiting for fsync to complete, this may slow down Redis.");        }    }      //计算延时,开启monitor下,才会启用    latencyStartMonitor(latency);    //具体刷盘操作    nwritten = aofWrite(server.aof_fd,server.aof_buf,sdslen(server.aof_buf));    latencyEndMonitor(latency);     //不同情况输出不同的内容引起的超时    if (sync_in_progress) {        latencyAddSampleIfNeeded("aof-write-pending-fsync",latency);    } else if (server.aof_child_pid != -1 || server.rdb_child_pid != -1) {        latencyAddSampleIfNeeded("aof-write-active-child",latency);    } else {        latencyAddSampleIfNeeded("aof-write-alone",latency);    }    latencyAddSampleIfNeeded("aof-write",latency);    //清零延迟 write 的时间记录    server.aof_flush_postponed_start = 0;    //写入的文件出错    if (nwritten != (ssize_t)sdslen(server.aof_buf)) {        static time_t last_write_error_log = 0;        int can_log = 0;        /* Limit logging rate to 1 line per AOF_WRITE_LOG_ERROR_RATE seconds. */        if ((server.unixtime - last_write_error_log) > AOF_WRITE_LOG_ERROR_RATE) {            can_log = 1;            last_write_error_log = server.unixtime;        }        /* Log the AOF write error and record the error code. 
*/        if (nwritten == -1) {            if (can_log) {                serverLog(LL_WARNING,"Error writing to the AOF file: %s",                    strerror(errno));                server.aof_last_write_errno = errno;            }        } else {            if (can_log) {                serverLog(LL_WARNING,"Short write while writing to "                                       "the AOF file: (nwritten=%lld, "                                       "expected=%lld)",                                       (long long)nwritten,                                       (long long)sdslen(server.aof_buf));            }            //如果写入的文件问题,我们将其移除,有点类似回滚机制            if (ftruncate(server.aof_fd, server.aof_current_size) == -1) {                if (can_log) {                    serverLog(LL_WARNING, "Could not remove short write "                             "from the append-only file.  Redis may refuse "                             "to load the AOF the next time it starts.  "                             "ftruncate: %s", strerror(errno));                }            } else {                /* If the ftruncate() succeeded we can set nwritten to                 * -1 since there is no longer partial data into the AOF. */                nwritten = -1;            }            server.aof_last_write_errno = ENOSPC;        }        //如果出问题了,是alway那么就直接退出,因为他无法回滚,已经写入到磁盘了        /* Handle the AOF write error. */        if (server.aof_fsync == AOF_FSYNC_ALWAYS) {            /* We can't recover when the fsync policy is ALWAYS since the             * reply for the client is already in the output buffers, and we             * have the contract with the user that on acknowledged write data             * is synced on disk. */            serverLog(LL_WARNING,"Can't recover from AOF write error when the AOF fsync policy is 'always'. Exiting...");            exit(1);        } else {            /* Recover from failed write leaving data into the buffer. 
However             * set an error to stop accepting writes as long as the error             * condition is not cleared. */            server.aof_last_write_status = C_ERR;            /* Trim the sds buffer if there was a partial write, and there             * was no way to undo it with ftruncate(2). */            if (nwritten > 0) {                server.aof_current_size += nwritten;                sdsrange(server.aof_buf,nwritten,-1);            }            return; /* We'll try again on the next call... */        }    } else {        /* Successful write(2). If AOF was in error state, restore the         * OK state and log the event. */        if (server.aof_last_write_status == C_ERR) {            serverLog(LL_WARNING,                "AOF write error looks solved, Redis can write again.");            server.aof_last_write_status = C_OK;        }    }    server.aof_current_size += nwritten;    //重复使用aof_buf,小于4K的话只是清空,如果大于4K直接释放再进行分配    if ((sdslen(server.aof_buf)+sdsavail(server.aof_buf)) < 4000) {        sdsclear(server.aof_buf);    } else {        sdsfree(server.aof_buf);        server.aof_buf = sdsempty();    }    // no-appendfsync-on-rewrite参数设置了,表示在rewrite截断不能进行fsync    /* Don't fsync if no-appendfsync-on-rewrite is set to yes and there are     * children doing I/O in the background. */    if (server.aof_no_fsync_on_rewrite &&        (server.aof_child_pid != -1 || server.rdb_child_pid != -1))            return;    //如果是always,那么执行redis_fsync,linux下是fdatasync    /* Perform the fsync if needed. */    if (server.aof_fsync == AOF_FSYNC_ALWAYS) {        /* redis_fsync is defined as fdatasync() for Linux in order to avoid         * flushing metadata. 
*/        latencyStartMonitor(latency);        redis_fsync(server.aof_fd); /* Let's try to get this data on the disk */        latencyEndMonitor(latency);        latencyAddSampleIfNeeded("aof-fsync-always",latency);        server.aof_last_fsync = server.unixtime;    } else if ((server.aof_fsync == AOF_FSYNC_EVERYSEC &&                server.unixtime > server.aof_last_fsync)) {        if (!sync_in_progress) aof_background_fsync(server.aof_fd);        server.aof_last_fsync = server.unixtime;    }}
  • flushAppendOnlyFile函数有两种模式,即参数force=0|1

    • force=0:表示可以启用一些优化操作
    • force=1:表示强制执行write写入,即使此时后台正有fsync任务在进行,也不会推迟写入。这种情况出现在关闭redis(prepareForShutdown)或者关闭aof(stopAppendOnly)时。
    12
    
    prepareForShutdown---->flushAppendOnlyFile(1);stopAppendOnly------>flushAppendOnlyFile(1);
    
    • force==0时的骚操作
      • 在AOF_FSYNC_EVERYSEC模式下,如果后台正有fsync在进行,那么本次write会被推迟(最多推迟2s),避免write被尚未完成的fsync拖慢。
1234567891011121314151617181920
if (server.aof_fsync == AOF_FSYNC_EVERYSEC)        sync_in_progress = bioPendingJobsOfType(BIO_AOF_FSYNC) != 0;    if (server.aof_fsync == AOF_FSYNC_EVERYSEC && !force) {        if (sync_in_progress) {            if (server.aof_flush_postponed_start == 0) {                /* No previous write postponing, remember that we are                 * postponing the flush and return. */                server.aof_flush_postponed_start = server.unixtime;                return;            } else if (server.unixtime - server.aof_flush_postponed_start < 2) {                /* We were already waiting for fsync to finish, but for less                 * than two seconds this is still ok. Postpone again. */                return;            }            server.aof_delayed_fsync++;            serverLog(LL_NOTICE,"Asynchronous AOF fsync is taking too long (disk is busy?). Writing the AOF buffer without waiting for fsync to complete, this may slow down Redis.");        }    }
  • 啥时候结束呢?如果距离第一次推迟已经达到2s,就不再等待,直接执行write,并将aof_delayed_fsync计数加1。
12345
else if (server.unixtime - server.aof_flush_postponed_start < 2) {                /* We were already waiting for fsync to finish, but for less                 * than two seconds this is still ok. Postpone again. */                return;            }
  • 写入文件:aofWrite内部是一个循环写的过程,正常情况下它写入的字节数应该等于aof_buf的长度;如果不相等(写入出错或者只写入了一部分),则按下面的步骤处理:

    • 第一步:如果只写入了一部分,使用ftruncate将文件裁剪至上一次成功写入的位置
    • 第二步:判断当前同步模式,如果是AOF_FSYNC_ALWAYS则直接退出redis
    • 第三步:如果是其他模式,则将aof_last_write_status置为C_ERR,保留aof_buf中尚未写入的数据,然后在serverCron函数中再尝试写入。
    123456789101112131415
    
    if (server.aof_fsync == AOF_FSYNC_ALWAYS) {                         serverLog(LL_WARNING,"Can't recover from AOF write error when the AOF fsync policy is 'always'. Exiting...");            exit(1);        } else {                         server.aof_last_write_status = C_ERR;            /* Trim the sds buffer if there was a partial write, and there             * was no way to undo it with ftruncate(2). */            if (nwritten > 0) {                server.aof_current_size += nwritten;                sdsrange(server.aof_buf,nwritten,-1);            }            return;         }
    
    • 如果设置了no_fsync_on_rewrite,那么在rewrite时,不进行fsync操作。
12345678910111213141516171819
 if (server.aof_no_fsync_on_rewrite &&        (server.aof_child_pid != -1 || server.rdb_child_pid != -1))            return;//那么下面这些就不会进行 */    if (server.aof_fsync == AOF_FSYNC_ALWAYS) {        /* redis_fsync is defined as fdatasync() for Linux in order to avoid         * flushing metadata. */        latencyStartMonitor(latency);        redis_fsync(server.aof_fd); /* Let's try to get this data on the disk */        latencyEndMonitor(latency);        latencyAddSampleIfNeeded("aof-fsync-always",latency);        server.aof_last_fsync = server.unixtime;    } else if ((server.aof_fsync == AOF_FSYNC_EVERYSEC &&                server.unixtime > server.aof_last_fsync)) {        if (!sync_in_progress) aof_background_fsync(server.aof_fd);        server.aof_last_fsync = server.unixtime;    }
  • 在serverCron定时任务中执行force=0写入
123456789101112131415161718192021
/* Using the following macro you can run code inside serverCron() with the * specified period, specified in milliseconds. * The actual resolution depends on server.hz. */#define run_with_period(_ms_)         \    if ((_ms_ <= 1000 / server.hz) || \        !(cronloops % ((_ms_) / (1000 / server.hz))))int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {    ...    // 如果延迟过    if (server.aof_flush_postponed_start) flushAppendOnlyFile(0);    // 刷新缓存到磁盘出现错误(例如:磁盘满了),定时检查回写。    run_with_period(1000) {        if (server.aof_last_write_status == C_ERR)            flushAppendOnlyFile(0);    }    ...    server.cronloops++;    return 1000/server.hz;}
AOF读取
Redis启动时加载aof文件
12345678910111213141516171819202122232425262728293031323334353637383940414243
int main(int argc, char **argv) {    ...    loadDataFromDisk();    ...}void loadDataFromDisk(void) {    ...   long long start = ustime();    if (server.aof_state == AOF_ON) {        if (loadAppendOnlyFile(server.aof_filename) == C_OK)            serverLog(LL_NOTICE,"DB loaded from append only file: %.3f seconds",(float)(ustime()-start)/1000000);    } else {        rdbSaveInfo rsi = RDB_SAVE_INFO_INIT;    }    ...}int loadAppendOnlyFile(char *filename) {    ...    // 程序模拟一个客户端执行从 aof 文件读出的命令。    fakeClient = createAOFClient();    ...    // 检查 aof 文件读取数据方式。    char sig[5];    if (fread(sig,1,5,fp) != 5 || memcmp(sig,"REDIS",5) != 0) {        // 通过 aof 方式加载数据。        if (fseek(fp,0,SEEK_SET) == -1) goto readerr;    } else {        ...        // 通过 rdb 方式加载数据。        if (rdbLoadRio(&rdb,RDBFLAGS_AOF_PREAMBLE,NULL) != C_OK) {            serverLog(LL_WARNING,"Error reading the RDB preamble of the AOF file, AOF loading aborted");            goto readerr;        }    }    /* Read the actual AOF file, in REPL format, command by command. */    while(1) {        // 根据 aof 文件数据结构,取出数据回写内存。        ...    }    ...}
debug loadaof加载aof

debug loadaof会先清空数据库,然后再加载aof文件。

1234567891011121314
else if (!strcasecmp(c->argv[1]->ptr,"loadaof")) {        if (server.aof_state != AOF_OFF) flushAppendOnlyFile(1);        emptyDb(-1,EMPTYDB_NO_FLAGS,NULL);        protectClient(c);        int ret = loadAppendOnlyFile(server.aof_filename);        unprotectClient(c);        if (ret != C_OK) {            addReply(c,shared.err);            return;        }        server.dirty = 0; /* Prevent AOF / replication */        serverLog(LL_WARNING,"Append Only File loaded by DEBUG LOADAOF");        addReply(c,shared.ok);    }
AOF文件结构

AOF文件真没啥可以说的,直接就是resp格式的命令,在最开始也有截图。需要注意的是,因为redis支持RDB+AOF混合存储,因此在读取aof文件时,会先读取前5个字节,判断是否是"REDIS":如果是,则为混合存储;否则就是纯AOF文件。

12345678910111213141516171819
char sig[5]; /* "REDIS" */if (fread(sig,1,5,fp) != 5 || memcmp(sig,"REDIS",5) != 0) {    /* No RDB preamble, seek back at 0 offset. */    if (fseek(fp,0,SEEK_SET) == -1) goto readerr;} else {    /* RDB preamble. Pass loading the RDB functions. */    rio rdb;    serverLog(LL_NOTICE,"Reading RDB preamble from AOF file...");    if (fseek(fp,0,SEEK_SET) == -1) goto readerr;    rioInitWithFile(&rdb,fp);    if (rdbLoadRio(&rdb,NULL,1) != C_OK) {        serverLog(LL_WARNING,"Error reading the RDB preamble of the AOF file, AOF loading aborted");        goto readerr;    } else {        serverLog(LL_NOTICE,"Reading the remaining AOF tail...");    }}
  • 你可能在想:什么时候表示RDB部分读取结束呢?

还记得上一节讲RDB持久化时,说到的表示RDB结束(EOF)的宏吗?就是RDB_OPCODE_EOF。

1234
else if (type == RDB_OPCODE_EOF) {            /* EOF: End of file, exit the main loop. */            break;        }
AOF一些注意点
AOF对过期键
  • 过期键被惰性删除或者定期删除后,会追加一条del指令至aof文件,并向客户端返回空;AOF重写时,则会直接忽略已经过期的键。
  • AOF+RDB混合加载时,针对过期键不会特殊处理,会全部加载
if (server.masterhost == NULL && !loading_aof && expiretime != -1 && expiretime < now) {           decrRefCount(key);           decrRefCount(val);       } else {           /* Add the new object in the hash table */           dbAdd(db,key,val);           /* Set the expire time if needed */           if (expiretime != -1) setExpire(NULL,db,key,expiretime);                      /* Set usage information (for eviction). */           objectSetLRUOrLFU(val,lfu_freq,lru_idle,lru_clock);           /* Decrement the key refcount since dbAdd() will take its            * own reference. */           decrRefCount(key);       }
优先加载AOF
123456789101112
void loadDataFromDisk(void) {    ...   long long start = ustime();    if (server.aof_state == AOF_ON) {        if (loadAppendOnlyFile(server.aof_filename) == C_OK)            serverLog(LL_NOTICE,"DB loaded from append only file: %.3f seconds",(float)(ustime()-start)/1000000);    } else {        rdbSaveInfo rsi = RDB_SAVE_INFO_INIT;    }    ...}
AOF优缺点
  1. 优点:
  • 提供更灵活的策略,来平衡性能和可靠性。
  • 追加模式,容错性强,写到一半宕机或者错误,可以快速恢复
  • 优先使用AOF
  2. 缺点:
  • 对于相同数量的数据集而言,AOF文件通常要大于RDB文件
  • 恢复速度慢于rdb