Redis源码分析10——RDB持久化

1,564 阅读12分钟

Redis相比memcached而言,最重要的一个特点是其支持持久化。而在redis中持久化分为两种:RDB和AOF。

RDB

原理

RDB的思想是把当前进程中的数据生成快照并保存到硬盘,保存的内容是数据库中的键值对。触发RDB持久化的方式分为手动触发和自动触发。

配置文件

1591346905153

自动触发

1591344434712

save参数引起的自动触发

Redis的rdb自动触发由配置文件中的save配置项控制(例如 save 900 1 表示距上次save超过900秒且至少发生了1次修改),当满足条件时,会触发rdb持久化。当然前提是我们打开了rdb持久化。

  1. 数据结构
1234
struct saveparam {    time_t seconds;    int changes;};

可能会想到,既然给了标准,那总得有个对象记录每次变化和上次save时间吧?没错,这两个变量记录在redisServer结构中(如果忘记了,请看前文)。

123456789101112131415161718192021222324
struct redisServer server; /* Server global state *//* RDB persistence RDB持久化相关的对象*///表示最后一次save之后数据变化    long long dirty;                /* Changes to DB from the last save */    long long dirty_before_bgsave;  /* Used to restore dirty on failed BGSAVE */    pid_t rdb_child_pid;            /* PID of RDB saving child */    struct saveparam *saveparams;   /* Save points array for RDB */    int saveparamslen;              /* Number of saving points */    char *rdb_filename;             /* Name of RDB file */    int rdb_compression;            /* Use compression in RDB? */    int rdb_checksum;               /* Use RDB checksum? *///表示上一次save时间    time_t lastsave;                /* Unix time of last successful save */    time_t lastbgsave_try;          /* Unix time of last attempted bgsave */    time_t rdb_save_time_last;      /* Time used by last RDB save run. */    time_t rdb_save_time_start;     /* Current RDB save start time. */    int rdb_bgsave_scheduled;       /* BGSAVE when possible if true. */    int rdb_child_type;             /* Type of save by active child. */    int lastbgsave_status;          /* C_OK or C_ERR */    int stop_writes_on_bgsave_err;  /* Don't allow writes if can't BGSAVE */    int rdb_pipe_write_result_to_parent; /* RDB pipes used to return the state */    int rdb_pipe_read_result_from_child; /* of each slave in diskless SYNC. */    /* Pipe and data structures for child -> parent info sharing. */
  1. 计算方式(见serverCron函数)
123456789101112131415161718192021
for (j = 0; j < server.saveparamslen; j++) {          struct saveparam *sp = server.saveparams+j;          /* Save if we reached the given amount of changes,           * the given amount of seconds, and if the latest bgsave was           * successful or if, in case of an error, at least           * CONFIG_BGSAVE_RETRY_DELAY seconds already elapsed. */          if (server.dirty >= sp->changes &&              server.unixtime-server.lastsave > sp->seconds &&              (server.unixtime-server.lastbgsave_try >               CONFIG_BGSAVE_RETRY_DELAY ||               server.lastbgsave_status == C_OK))          {              serverLog(LL_NOTICE,"%d changes in %d seconds. Saving...",                  sp->changes, (int)sp->seconds);              rdbSaveInfo rsi, *rsiptr;              rsiptr = rdbPopulateSaveInfo(&rsi);              rdbSaveBackground(server.rdb_filename,rsiptr);              break;          }      }
主从复制触发

见后文的主从复制

debug reload触发
12345678910111213141516171819
else if (!strcasecmp(c->argv[1]->ptr,"reload")) {    //看看这里        rdbSaveInfo rsi, *rsiptr;        rsiptr = rdbPopulateSaveInfo(&rsi);        if (rdbSave(server.rdb_filename,rsiptr) != C_OK) {            addReply(c,shared.err);            return;        }        emptyDb(-1,EMPTYDB_NO_FLAGS,NULL);        protectClient(c);        int ret = rdbLoad(server.rdb_filename,NULL);        unprotectClient(c);        if (ret != C_OK) {            addReplyError(c,"Error trying to load the RDB dump");            return;        }        serverLog(LL_WARNING,"DB reloaded by DEBUG RELOAD");        addReply(c,shared.ok);    }
shutdown+save参数触发
1234567891011121314151617181920212223242526272829303132333435363738
shutdownCommand--->prepareForShutdownint prepareForShutdown(int flags) {    int save = flags & SHUTDOWN_SAVE;    int nosave = flags & SHUTDOWN_NOSAVE;    serverLog(LL_WARNING,"User requested shutdown...");    ...    if (server.rdb_child_pid != -1) {        serverLog(LL_WARNING,"There is a child saving an .rdb. Killing it!");        kill(server.rdb_child_pid,SIGUSR1);        rdbRemoveTempFile(server.rdb_child_pid);    }    ...    /* Create a new RDB file before exiting. */    if ((server.saveparamslen > 0 && !nosave) || save) {        serverLog(LL_NOTICE,"Saving the final RDB snapshot before exiting.");        /* Snapshotting. Perform a SYNC SAVE and exit */        rdbSaveInfo rsi, *rsiptr;        rsiptr = rdbPopulateSaveInfo(&rsi);        if (rdbSave(server.rdb_filename,rsiptr) != C_OK) {            /* Ooops.. error saving! The best we can do is to continue             * operating. Note that if there was a background saving process,             * in the next cron() Redis will be notified that the background             * saving aborted, handling special stuff like slaves pending for             * synchronization... */            serverLog(LL_WARNING,"Error trying to save the DB, can't exit.");            return C_ERR;        }    }   ...     return C_OK;}
手动触发
save命令

需要注意的是save是阻塞操作,直到整个save完成,redis才会对外服务。

1234567891011121314
void saveCommand(client *c) {    if (server.rdb_child_pid != -1) {        addReplyError(c,"Background save already in progress");        return;    }    rdbSaveInfo rsi, *rsiptr;    rsiptr = rdbPopulateSaveInfo(&rsi);    if (rdbSave(server.rdb_filename,rsiptr) == C_OK) {        addReply(c,shared.ok);    } else {        addReply(c,shared.err);    }}
bgsave命令

bgsave和save不同,它会创建一个子进程来进行持久化。

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354
/* BGSAVE [SCHEDULE] */void bgsaveCommand(client *c) {    int schedule = 0;    /* The SCHEDULE option changes the behavior of BGSAVE when an AOF rewrite     * is in progress. Instead of returning an error a BGSAVE gets scheduled. */    if (c->argc > 1) {        if (c->argc == 2 && !strcasecmp(c->argv[1]->ptr,"schedule")) {            schedule = 1;        } else {            addReply(c,shared.syntaxerr);            return;        }    }    rdbSaveInfo rsi, *rsiptr;    rsiptr = rdbPopulateSaveInfo(&rsi);    if (server.rdb_child_pid != -1) {        addReplyError(c,"Background save already in progress");    } else if (server.aof_child_pid != -1) {        if (schedule) {            server.rdb_bgsave_scheduled = 1;            addReplyStatus(c,"Background saving scheduled");        } else {            addReplyError(c,                "An AOF log rewriting in progress: can't BGSAVE right now. "                "Use BGSAVE SCHEDULE in order to schedule a BGSAVE whenever "                "possible.");        }    } else if (rdbSaveBackground(server.rdb_filename,rsiptr) == C_OK) {        addReplyStatus(c,"Background saving started");    } else {        addReply(c,shared.err);    }}int rdbSaveBackground(char *filename, rdbSaveInfo *rsi) {    pid_t childpid;    start = ustime();    //看看这里    if ((childpid = fork()) == 0) {        int retval;        /* Child */       ...    } else {        /* Parent */      ...    }    return C_OK; /* unreached */}
bgsave命令执行时限制

命令:BGSAVE [SCHEDULE]

  1. ==当已经在进行bgsave时,不允许再进行==
  2. ==当正在进行aof rewrite时,如果schedule设置为1,则进行延迟执行;否则直接回复错误==。
1234567891011121314151617
if (server.rdb_child_pid != -1) {        addReplyError(c,"Background save already in progress");    } else if (server.aof_child_pid != -1) {        if (schedule) {            server.rdb_bgsave_scheduled = 1;            addReplyStatus(c,"Background saving scheduled");        } else {            addReplyError(c,                "An AOF log rewriting in progress: can't BGSAVE right now. "                "Use BGSAVE SCHEDULE in order to schedule a BGSAVE whenever "                "possible.");        }    } else if (rdbSaveBackground(server.rdb_filename,rsiptr) == C_OK) {        addReplyStatus(c,"Background saving started");    } else {        addReply(c,shared.err);    }

关于延迟,请看serverCron函数

12345678910
if (server.rdb_child_pid == -1 && server.aof_child_pid == -1 &&        server.rdb_bgsave_scheduled &&        (server.unixtime-server.lastbgsave_try > CONFIG_BGSAVE_RETRY_DELAY ||         server.lastbgsave_status == C_OK))    {        rdbSaveInfo rsi, *rsiptr;        rsiptr = rdbPopulateSaveInfo(&rsi);        if (rdbSaveBackground(server.rdb_filename,rsiptr) == C_OK)            server.rdb_bgsave_scheduled = 0;    }
bgsave执行流程

1591348825147

分析
  1. fork

Linux中fork采用写时复制思想:fork之后父子进程共享同一份物理内存页,只有当某个内存页被修改时,才会拷贝这一页(而不是拷贝整份内存)。rehash会搬移大量键值对、修改大量内存页,从而引发大量的页拷贝,因此这里我们回顾一下之前所说的:rdb持久化期间,尽量不要进行rehash。

1234567891011121314151617181920212223242526
else {        /* Parent */        server.stat_fork_time = ustime()-start;        server.stat_fork_rate = (double) zmalloc_used_memory() * 1000000 / server.stat_fork_time / (1024*1024*1024); /* GB per second. */        latencyAddSampleIfNeeded("fork",server.stat_fork_time/1000);        if (childpid == -1) {            closeChildInfoPipe();            server.lastbgsave_status = C_ERR;            serverLog(LL_WARNING,"Can't save in background: fork: %s",                strerror(errno));            return C_ERR;        }        serverLog(LL_NOTICE,"Background saving started by pid %d",childpid);        server.rdb_save_time_start = time(NULL);        server.rdb_child_pid = childpid;        server.rdb_child_type = RDB_CHILD_TYPE_DISK;        updateDictResizePolicy();        return C_OK;    }//看看这里,他禁止了rehash操作void updateDictResizePolicy(void) {    if (server.rdb_child_pid == -1 && server.aof_child_pid == -1)        dictEnableResize();    else        dictDisableResize();}

当然,不能一直禁止rehash,否则冲突链表太长,会影响查询性能。其打破条件为:used/size > 5(即下面代码中的dict_force_resize_ratio)。

123456789101112
static unsigned int dict_force_resize_ratio = 5;static int _dictExpandIfNeeded(dict *d){    ....    if (d->ht[0].used >= d->ht[0].size &&        (dict_can_resize ||         d->ht[0].used/d->ht[0].size > dict_force_resize_ratio))    {        return dictExpand(d, d->ht[0].used*2);    }    return DICT_OK;}
  1. 父子进程通信——管道

fork完后,生成一个子进程,而进程间是不共享内存空间的,加之这里是属于父子进程的关系,因此我们使用管道来进行通信。

1234567891011
void openChildInfoPipe(void) {    if (pipe(server.child_info_pipe) == -1) {        /* On error our two file descriptors should be still set to -1,         * but we call anyway cloesChildInfoPipe() since can't hurt. */        closeChildInfoPipe();    } else if (anetNonBlock(NULL,server.child_info_pipe[0]) != ANET_OK) {        closeChildInfoPipe();    } else {        memset(&server.child_info_data,0,sizeof(server.child_info_data));    }}

子进程向父进程传输的是子进程状态数据,其数据结构在redisServer数据结构中

1234567
/* Pipe and data structures for child -> parent info sharing. */    int child_info_pipe[2];         /* Pipe used to write the child_info_data. */    struct {        int process_type;           /* AOF or RDB child? */        size_t cow_size;            /* Copy on write size. */        unsigned long long magic;   /* Magic value to make sure data is valid. */    } child_info_data;

具体流程如下:

12345678910111213141516171819202122232425262728293031
  if ((childpid = fork()) == 0) {        int retval;        /* Child */        closeListeningSockets(0);        redisSetProcTitle("redis-rdb-bgsave");        retval = rdbSave(filename,rsi);        if (retval == C_OK) {            size_t private_dirty = zmalloc_get_private_dirty(-1);            if (private_dirty) {                serverLog(LL_NOTICE,                    "RDB: %zu MB of memory used by copy-on-write",                    private_dirty/(1024*1024));            }            server.child_info_data.cow_size = private_dirty;            sendChildInfo(CHILD_INFO_TYPE_RDB);        }        exitFromChild((retval == C_OK) ? 0 : 1);    }     //子进程信息    void sendChildInfo(int ptype) {    if (server.child_info_pipe[1] == -1) return;    server.child_info_data.magic = CHILD_INFO_MAGIC;    server.child_info_data.process_type = ptype;    ssize_t wlen = sizeof(server.child_info_data);    if (write(server.child_info_pipe[1],&server.child_info_data,wlen) != wlen) {        /* Nothing to do on error, this will be detected by the other side. */    }}
  1. 由于文件描述符会继承,为避免额外增加其生命周期,我们对其进行close
123456789101112131415
 /* Child */closeListeningSockets(0);void closeListeningSockets(int unlink_unix_socket) {    int j;    for (j = 0; j < server.ipfd_count; j++) close(server.ipfd[j]);    if (server.sofd != -1) close(server.sofd);    if (server.cluster_enabled)        for (j = 0; j < server.cfd_count; j++) close(server.cfd[j]);    if (unlink_unix_socket && server.unixsocket) {        serverLog(LL_NOTICE,"Removing the unix socket file.");        unlink(server.unixsocket); /* don't care if this fails */    }}
  1. 主进程在serverCron中使用wait3回收子进程,防止出现僵尸进程==(ps -elf指令时,显示Z)==.

wait3相比waitpid和wait函数而言,它多了一个资源汇总的功能。这些资源信息包括用户CPU时间总量、系统时间总量、页面出错次数、接收到信号的次数等。不过从下面的代码可以看到,Redis给rusage参数传的是NULL,并没有用到这个功能。

1
pid_t wait3 ( int *status, int option, struct rusage *ru );
  • 参数 options 提供一些额外的选项来控制 wait3():

  WNOHANG;如果没有任何已经结束了的子进程,则马上返回,不等待;

  WUNTRACED;如果子进程进入暂停执行的情况,则马上返回,但结束状态不予理会;

  • wait3() 的返回值,有三种:
    • 正常返回时,wait3() 返回收集到的子进程的PID;
    • 如果设置了 WNOHANG,而调用 wait3() 时,没有发现已退出的子进程可收集,则返回0;
    • 如果调用出错,则返回 -1,这时 errno 会被设置为相应的值以指示错误所在。(当调用进程没有任何可等待的子进程时,wait3() 就会返回出错,这时 errno 被设置为 ECHILD)
1234567891011121314151617181920212223242526272829
if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {           int exitcode = WEXITSTATUS(statloc);           int bysignal = 0;           if (WIFSIGNALED(statloc)) bysignal = WTERMSIG(statloc);           if (pid == -1) {               serverLog(LL_WARNING,"wait3() returned an error: %s. "                   "rdb_child_pid = %d, aof_child_pid = %d",                   strerror(errno),                   (int) server.rdb_child_pid,                   (int) server.aof_child_pid);           } else if (pid == server.rdb_child_pid) {               backgroundSaveDoneHandler(exitcode,bysignal);               if (!bysignal && exitcode == 0) receiveChildInfo();           } else if (pid == server.aof_child_pid) {               backgroundRewriteDoneHandler(exitcode,bysignal);               if (!bysignal && exitcode == 0) receiveChildInfo();           } else {               if (!ldbRemoveChild(pid)) {                   serverLog(LL_WARNING,                       "Warning, detected child with unmatched pid: %ld",                       (long)pid);               }           }           updateDictResizePolicy();           closeChildInfoPipe();       }   }
RDB文件结构

1591358253036

rsi可选项

主从复制下,节点状态信息(能避免全同步就避免)

某些操作的宏定义
  1. 数据库操作

为了区分接下来的数据是哪种类型,RDB文件中引入了占1个字节的操作码(opcode),它们在代码中以宏的形式定义。

123456789
#define RDB_OPCODE_MODULE_AUX 247   /* Module auxiliary data. */#define RDB_OPCODE_IDLE       248   /* LRU idle time. LRU信息*/#define RDB_OPCODE_FREQ       249   /* LFU frequency. LFU信息*/#define RDB_OPCODE_AUX        250   /* RDB aux field.扩展字段 */#define RDB_OPCODE_RESIZEDB   251   /* Hash table resize hint.提示可能需要进行哈希表的resize操作 */#define RDB_OPCODE_EXPIRETIME_MS 252    /* Expire time in milliseconds.表示过期键,单位毫秒 */#define RDB_OPCODE_EXPIRETIME 253       /* Old expire time in seconds.表示过期键 */#define RDB_OPCODE_SELECTDB   254   /* DB number of the following keys.表示接下来是要选的数据库的索引id */#define RDB_OPCODE_EOF        255   /* End of the RDB file.表示结尾 */
  1. 值类型
1234
#define RDB_LOAD_NONE   0#define RDB_LOAD_ENC    (1<<0)   ---->表示编码过#define RDB_LOAD_PLAIN  (1<<1)   ----->表示未处理#define RDB_LOAD_SDS    (1<<2)    ---->表示使用的是字符串

为啥这样区分呢?

因为我们知道整型要比字符串更省内存,因此在保存对象时,都是优先尝试是否可以INT编码

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566
/* Like rdbSaveRawString() gets a Redis object instead. */ssize_t rdbSaveStringObject(rio *rdb, robj *obj) {    /* Avoid to decode the object, then encode it again, if the     * object is already integer encoded. */    if (obj->encoding == OBJ_ENCODING_INT) {        return rdbSaveLongLongAsStringObject(rdb,(long)obj->ptr);    } else {        serverAssertWithInfo(NULL,obj,sdsEncodedObject(obj));        return rdbSaveRawString(rdb,obj->ptr,sdslen(obj->ptr));    }}/* Save a long long value as either an encoded string or a string. */ssize_t rdbSaveLongLongAsStringObject(rio *rdb, long long value) {    unsigned char buf[32];    ssize_t n, nwritten = 0;    int enclen = rdbEncodeInteger(value,buf);    if (enclen > 0) {        return rdbWriteRaw(rdb,buf,enclen);    } else {        /* Encode as string */        enclen = ll2string((char*)buf,32,value);        serverAssert(enclen < 32);        if ((n = rdbSaveLen(rdb,enclen)) == -1) return -1;        nwritten += n;        if ((n = rdbWriteRaw(rdb,buf,enclen)) == -1) return -1;        nwritten += n;    }    return nwritten;}/* Save a string object as [len][data] on disk. 
If the object is a string * representation of an integer value we try to save it in a special form */ssize_t rdbSaveRawString(rio *rdb, unsigned char *s, size_t len) {    int enclen;    ssize_t n, nwritten = 0;    /* Try integer encoding */    if (len <= 11) {        unsigned char buf[5];        if ((enclen = rdbTryIntegerEncoding((char*)s,len,buf)) > 0) {            if (rdbWriteRaw(rdb,buf,enclen) == -1) return -1;            return enclen;        }    }    /* Try LZF compression - under 20 bytes it's unable to compress even     * aaaaaaaaaaaaaaaaaa so skip it */    if (server.rdb_compression && len > 20) {        n = rdbSaveLzfStringObject(rdb,s,len);        if (n == -1) return -1;        if (n > 0) return n;        /* Return value of 0 means data can't be compressed, save the old way */    }    /* Store verbatim */    if ((n = rdbSaveLen(rdb,len)) == -1) return -1;    nwritten += n;    if (len > 0) {        if (rdbWriteRaw(rdb,s,len) == -1) return -1;        nwritten += len;    }    return nwritten;}
  1. 整型类型
1234
#define RDB_ENC_INT8 0        /* 8 bit signed integer */#define RDB_ENC_INT16 1       /* 16 bit signed integer */#define RDB_ENC_INT32 2       /* 32 bit signed integer */#define RDB_ENC_LZF 3         /* string compressed with FASTLZ */
  1. 对象类型
12345678910111213141516171819202122
/* Map object types to RDB object types. Macros starting with OBJ_ are for * memory storage and may change. Instead RDB types must be fixed because * we store them on disk. */#define RDB_TYPE_STRING 0#define RDB_TYPE_LIST   1#define RDB_TYPE_SET    2#define RDB_TYPE_ZSET   3#define RDB_TYPE_HASH   4#define RDB_TYPE_ZSET_2 5 /* ZSET version 2 with doubles stored in binary. */#define RDB_TYPE_MODULE 6#define RDB_TYPE_MODULE_2 7 /* Module value with annotations for parsing without                               the generating module being loaded. *//* NOTE: WHEN ADDING NEW RDB TYPE, UPDATE rdbIsObjectType() BELOW *//* Object types for encoded objects. */#define RDB_TYPE_HASH_ZIPMAP    9#define RDB_TYPE_LIST_ZIPLIST  10#define RDB_TYPE_SET_INTSET    11#define RDB_TYPE_ZSET_ZIPLIST  12#define RDB_TYPE_HASH_ZIPLIST  13#define RDB_TYPE_LIST_QUICKLIST 14#define RDB_TYPE_STREAM_LISTPACKS 15
对过期键的处理
  • 创建rdb时候会过滤掉过期的键
  • 读取时:
    • 主:只会载入未过期的;
    • 从:全部载入,因为后期主从会同步
1234567891011121314151617181920212223
核心函数rdbLoadRio: /* Read key */        if ((key = rdbLoadStringObject(rdb)) == NULL) goto eoferr;        /* Read value */        if ((val = rdbLoadObject(type,rdb)) == NULL) goto eoferr;        //看到没,主会过滤掉。而从的话。直接载入        if (server.masterhost == NULL && !loading_aof && expiretime != -1 && expiretime < now) {            decrRefCount(key);            decrRefCount(val);        } else {            /* Add the new object in the hash table */            dbAdd(db,key,val);            /* Set the expire time if needed */            if (expiretime != -1) setExpire(NULL,db,key,expiretime);                        /* Set usage information (for eviction). */            objectSetLRUOrLFU(val,lfu_freq,lru_idle,lru_clock);            /* Decrement the key refcount since dbAdd() will take its             * own reference. */            decrRefCount(key);        }
RDB优缺点
优点
  1. 文件紧凑,一个二进制文件,适用于备份,全量复制等场景。
  2. Redis启动时恢复速度快。
  3. 适合做冷备,比如定期将rdb文件复制到远程文件系统中。
缺点
  1. 无法做到实时持久化/秒级持久化。
  2. 可能会丢失数据。
  3. 兼容性的问题。