Redis相比memcache而言,最重要的一个特点是其支持持久化。而在redis中持久化分为两种:RDB和AOF。
RDB
原理
RDB的思想是把当前进程的数据生成快照并保存到硬盘,保存的内容是数据库中的键值对。触发RDB持久化的方式分为手动触发和自动触发。
配置文件

自动触发

save参数引起的自动触发
Redis的rdb受配置文件中save配置的限制:每条save配置给出一个时间(秒)和一个修改次数,当距离上次save超过该时间、且期间的修改次数达到该次数时,就会触发rdb持久化。当然前提是我们打开了rdb的持久化。
- 数据结构
1234
| struct saveparam { time_t seconds; int changes;};
|
可能会想到,既然给了标准,那总得有个对象记录每次变化和上次save时间吧?没错,这两个变量记录在redisServer结构中(如果忘记了,请看前文)。
123456789101112131415161718192021222324
| struct redisServer server; /* Server global state *//* RDB persistence RDB持久化相关的对象*///表示最后一次save之后数据变化 long long dirty; /* Changes to DB from the last save */ long long dirty_before_bgsave; /* Used to restore dirty on failed BGSAVE */ pid_t rdb_child_pid; /* PID of RDB saving child */ struct saveparam *saveparams; /* Save points array for RDB */ int saveparamslen; /* Number of saving points */ char *rdb_filename; /* Name of RDB file */ int rdb_compression; /* Use compression in RDB? */ int rdb_checksum; /* Use RDB checksum? *///表示上一次save时间 time_t lastsave; /* Unix time of last successful save */ time_t lastbgsave_try; /* Unix time of last attempted bgsave */ time_t rdb_save_time_last; /* Time used by last RDB save run. */ time_t rdb_save_time_start; /* Current RDB save start time. */ int rdb_bgsave_scheduled; /* BGSAVE when possible if true. */ int rdb_child_type; /* Type of save by active child. */ int lastbgsave_status; /* C_OK or C_ERR */ int stop_writes_on_bgsave_err; /* Don't allow writes if can't BGSAVE */ int rdb_pipe_write_result_to_parent; /* RDB pipes used to return the state */ int rdb_pipe_read_result_from_child; /* of each slave in diskless SYNC. */ /* Pipe and data structures for child -> parent info sharing. */
|
- 计算方式(见serverCron函数)
123456789101112131415161718192021
| for (j = 0; j < server.saveparamslen; j++) { struct saveparam *sp = server.saveparams+j; /* Save if we reached the given amount of changes, * the given amount of seconds, and if the latest bgsave was * successful or if, in case of an error, at least * CONFIG_BGSAVE_RETRY_DELAY seconds already elapsed. */ if (server.dirty >= sp->changes && server.unixtime-server.lastsave > sp->seconds && (server.unixtime-server.lastbgsave_try > CONFIG_BGSAVE_RETRY_DELAY || server.lastbgsave_status == C_OK)) { serverLog(LL_NOTICE,"%d changes in %d seconds. Saving...", sp->changes, (int)sp->seconds); rdbSaveInfo rsi, *rsiptr; rsiptr = rdbPopulateSaveInfo(&rsi); rdbSaveBackground(server.rdb_filename,rsiptr); break; } }
|
主从复制触发
见后文的主从复制
debug reload触发
12345678910111213141516171819
| else if (!strcasecmp(c->argv[1]->ptr,"reload")) { //看看这里 rdbSaveInfo rsi, *rsiptr; rsiptr = rdbPopulateSaveInfo(&rsi); if (rdbSave(server.rdb_filename,rsiptr) != C_OK) { addReply(c,shared.err); return; } emptyDb(-1,EMPTYDB_NO_FLAGS,NULL); protectClient(c); int ret = rdbLoad(server.rdb_filename,NULL); unprotectClient(c); if (ret != C_OK) { addReplyError(c,"Error trying to load the RDB dump"); return; } serverLog(LL_WARNING,"DB reloaded by DEBUG RELOAD"); addReply(c,shared.ok); }
|
shutdown+save参数触发
1234567891011121314151617181920212223242526272829303132333435363738
| shutdownCommand--->prepareForShutdownint prepareForShutdown(int flags) { int save = flags & SHUTDOWN_SAVE; int nosave = flags & SHUTDOWN_NOSAVE; serverLog(LL_WARNING,"User requested shutdown..."); ... if (server.rdb_child_pid != -1) { serverLog(LL_WARNING,"There is a child saving an .rdb. Killing it!"); kill(server.rdb_child_pid,SIGUSR1); rdbRemoveTempFile(server.rdb_child_pid); } ... /* Create a new RDB file before exiting. */ if ((server.saveparamslen > 0 && !nosave) || save) { serverLog(LL_NOTICE,"Saving the final RDB snapshot before exiting."); /* Snapshotting. Perform a SYNC SAVE and exit */ rdbSaveInfo rsi, *rsiptr; rsiptr = rdbPopulateSaveInfo(&rsi); if (rdbSave(server.rdb_filename,rsiptr) != C_OK) { /* Ooops.. error saving! The best we can do is to continue * operating. Note that if there was a background saving process, * in the next cron() Redis will be notified that the background * saving aborted, handling special stuff like slaves pending for * synchronization... */ serverLog(LL_WARNING,"Error trying to save the DB, can't exit."); return C_ERR; } } ... return C_OK;}
|
手动触发
save命令
需要注意的是save是阻塞操作,直到整个save完成,redis才会对外服务。
1234567891011121314
| void saveCommand(client *c) { if (server.rdb_child_pid != -1) { addReplyError(c,"Background save already in progress"); return; } rdbSaveInfo rsi, *rsiptr; rsiptr = rdbPopulateSaveInfo(&rsi); if (rdbSave(server.rdb_filename,rsiptr) == C_OK) { addReply(c,shared.ok); } else { addReply(c,shared.err); }}
|
bgsave命令
bgsave和save不同,它是通过fork创建一个子进程来进行持久化。
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354
| /* BGSAVE [SCHEDULE] */void bgsaveCommand(client *c) { int schedule = 0; /* The SCHEDULE option changes the behavior of BGSAVE when an AOF rewrite * is in progress. Instead of returning an error a BGSAVE gets scheduled. */ if (c->argc > 1) { if (c->argc == 2 && !strcasecmp(c->argv[1]->ptr,"schedule")) { schedule = 1; } else { addReply(c,shared.syntaxerr); return; } } rdbSaveInfo rsi, *rsiptr; rsiptr = rdbPopulateSaveInfo(&rsi); if (server.rdb_child_pid != -1) { addReplyError(c,"Background save already in progress"); } else if (server.aof_child_pid != -1) { if (schedule) { server.rdb_bgsave_scheduled = 1; addReplyStatus(c,"Background saving scheduled"); } else { addReplyError(c, "An AOF log rewriting in progress: can't BGSAVE right now. " "Use BGSAVE SCHEDULE in order to schedule a BGSAVE whenever " "possible."); } } else if (rdbSaveBackground(server.rdb_filename,rsiptr) == C_OK) { addReplyStatus(c,"Background saving started"); } else { addReply(c,shared.err); }}int rdbSaveBackground(char *filename, rdbSaveInfo *rsi) { pid_t childpid; start = ustime(); //看看这里 if ((childpid = fork()) == 0) { int retval; /* Child */ ... } else { /* Parent */ ... } return C_OK; /* unreached */}
|
bgsave命令执行时限制
命令:BGSAVE [SCHEDULE]
- ==当已经有bgsave在进行时,不允许再发起新的bgsave==
- ==当正在进行aof rewrite时,如果schedule设置为1,则延迟执行;否则直接回复错误==。
1234567891011121314151617
| if (server.rdb_child_pid != -1) { addReplyError(c,"Background save already in progress"); } else if (server.aof_child_pid != -1) { if (schedule) { server.rdb_bgsave_scheduled = 1; addReplyStatus(c,"Background saving scheduled"); } else { addReplyError(c, "An AOF log rewriting in progress: can't BGSAVE right now. " "Use BGSAVE SCHEDULE in order to schedule a BGSAVE whenever " "possible."); } } else if (rdbSaveBackground(server.rdb_filename,rsiptr) == C_OK) { addReplyStatus(c,"Background saving started"); } else { addReply(c,shared.err); }
|
关于延迟,请看serverCron函数
12345678910
| if (server.rdb_child_pid == -1 && server.aof_child_pid == -1 && server.rdb_bgsave_scheduled && (server.unixtime-server.lastbgsave_try > CONFIG_BGSAVE_RETRY_DELAY || server.lastbgsave_status == C_OK)) { rdbSaveInfo rsi, *rsiptr; rsiptr = rdbPopulateSaveInfo(&rsi); if (rdbSaveBackground(server.rdb_filename,rsiptr) == C_OK) server.rdb_bgsave_scheduled = 0; }
|
bgsave执行流程

分析
- fork
Linux中fork采用写时复制(copy-on-write)思想:fork之后父子进程共享同一份物理内存页,只有当某个内存页被修改时,内核才会单独拷贝这一页,而不是拷贝一份完整内存。rehash会搬动大量键值对、修改大量内存页,从而引发大量的页拷贝,因此这里我们回顾一下之前说的:rdb持久化时,尽量不要进行rehash。
1234567891011121314151617181920212223242526
| else { /* Parent */ server.stat_fork_time = ustime()-start; server.stat_fork_rate = (double) zmalloc_used_memory() * 1000000 / server.stat_fork_time / (1024*1024*1024); /* GB per second. */ latencyAddSampleIfNeeded("fork",server.stat_fork_time/1000); if (childpid == -1) { closeChildInfoPipe(); server.lastbgsave_status = C_ERR; serverLog(LL_WARNING,"Can't save in background: fork: %s", strerror(errno)); return C_ERR; } serverLog(LL_NOTICE,"Background saving started by pid %d",childpid); server.rdb_save_time_start = time(NULL); server.rdb_child_pid = childpid; server.rdb_child_type = RDB_CHILD_TYPE_DISK; updateDictResizePolicy(); return C_OK; }//看看这里,他禁止了rehash操作void updateDictResizePolicy(void) { if (server.rdb_child_pid == -1 && server.aof_child_pid == -1) dictEnableResize(); else dictDisableResize();}
|
当然,也不能一直禁止rehash,否则冲突链表太长,会影响查询性能。其打破条件为:在已用节点数不小于哈希表大小的前提下,ht[0].used/ht[0].size(整数除法)大于dict_force_resize_ratio,即 used/size > 5。
123456789101112
| static unsigned int dict_force_resize_ratio = 5;static int _dictExpandIfNeeded(dict *d){ .... if (d->ht[0].used >= d->ht[0].size && (dict_can_resize || d->ht[0].used/d->ht[0].size > dict_force_resize_ratio)) { return dictExpand(d, d->ht[0].used*2); } return DICT_OK;}
|
- 父子进程通信——管道
fork完后,生成一个子进程,而进程间是不共享内存空间的,加之这里是属于父子进程的关系,因此我们使用管道来进行通信。
1234567891011
| void openChildInfoPipe(void) { if (pipe(server.child_info_pipe) == -1) { /* On error our two file descriptors should be still set to -1, * but we call anyway cloesChildInfoPipe() since can't hurt. */ closeChildInfoPipe(); } else if (anetNonBlock(NULL,server.child_info_pipe[0]) != ANET_OK) { closeChildInfoPipe(); } else { memset(&server.child_info_data,0,sizeof(server.child_info_data)); }}
|
子进程向父进程传输的是子进程状态数据,其数据结构在redisServer数据结构中
1234567
| /* Pipe and data structures for child -> parent info sharing. */ int child_info_pipe[2]; /* Pipe used to write the child_info_data. */ struct { int process_type; /* AOF or RDB child? */ size_t cow_size; /* Copy on write size. */ unsigned long long magic; /* Magic value to make sure data is valid. */ } child_info_data;
|
具体流程如下:
12345678910111213141516171819202122232425262728293031
| if ((childpid = fork()) == 0) { int retval; /* Child */ closeListeningSockets(0); redisSetProcTitle("redis-rdb-bgsave"); retval = rdbSave(filename,rsi); if (retval == C_OK) { size_t private_dirty = zmalloc_get_private_dirty(-1); if (private_dirty) { serverLog(LL_NOTICE, "RDB: %zu MB of memory used by copy-on-write", private_dirty/(1024*1024)); } server.child_info_data.cow_size = private_dirty; sendChildInfo(CHILD_INFO_TYPE_RDB); } exitFromChild((retval == C_OK) ? 0 : 1); } //子进程信息 void sendChildInfo(int ptype) { if (server.child_info_pipe[1] == -1) return; server.child_info_data.magic = CHILD_INFO_MAGIC; server.child_info_data.process_type = ptype; ssize_t wlen = sizeof(server.child_info_data); if (write(server.child_info_pipe[1],&server.child_info_data,wlen) != wlen) { /* Nothing to do on error, this will be detected by the other side. */ }}
|
- 由于文件描述符会继承,为避免额外增加其生命周期,我们对其进行close
123456789101112131415
| /* Child */closeListeningSockets(0);void closeListeningSockets(int unlink_unix_socket) { int j; for (j = 0; j < server.ipfd_count; j++) close(server.ipfd[j]); if (server.sofd != -1) close(server.sofd); if (server.cluster_enabled) for (j = 0; j < server.cfd_count; j++) close(server.cfd[j]); if (unlink_unix_socket && server.unixsocket) { serverLog(LL_NOTICE,"Removing the unix socket file."); unlink(server.unixsocket); /* don't care if this fails */ }}
|
- 主进程在serverCron中使用wait3回收子进程,防止出现僵尸进程==(ps -elf指令时,显示Z)==.
wait3相比waitpid和wait函数而言,它多了一个资源汇总的功能。这些资源信息包括用户CPU时间总量、系统时间总量、页面出错次数、接收到信号的次数等。不过从下面的代码可以看到,Redis调用wait3时第三个参数传的是NULL,并没有用到这个功能。
1
| pid_t wait3 ( int *status, int option, struct rusage *ru );
|
- 参数 options 提供一些额外的选项来控制 wait3():
WNOHANG;如果没有任何已经结束了的子进程,则马上返回,不等待;
WUNTRACED;如果子进程进入暂停执行的情况,则马上返回,但结束状态不予理会;
- wait3() 的返回值,有三种:
- 正常返回时,wait3() 返回收集到的子进程的PID;
- 如果设置了 WNOHANG,而调用 wait3() 时,没有发现已退出的子进程可收集,则返回0;
- 如果调用出错,则返回 -1,这时 errno 会被设置为相应的值以指示错误所在。(例如调用进程没有任何可等待的子进程时,wait3() 就会返回出错,这时 errno 被设置为 ECHILD;对 waitpid() 而言,当 pid 所指示的子进程不存在,或此进程存在但不是调用进程的子进程时,也属于这种情况)
1234567891011121314151617181920212223242526272829
| if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) { int exitcode = WEXITSTATUS(statloc); int bysignal = 0; if (WIFSIGNALED(statloc)) bysignal = WTERMSIG(statloc); if (pid == -1) { serverLog(LL_WARNING,"wait3() returned an error: %s. " "rdb_child_pid = %d, aof_child_pid = %d", strerror(errno), (int) server.rdb_child_pid, (int) server.aof_child_pid); } else if (pid == server.rdb_child_pid) { backgroundSaveDoneHandler(exitcode,bysignal); if (!bysignal && exitcode == 0) receiveChildInfo(); } else if (pid == server.aof_child_pid) { backgroundRewriteDoneHandler(exitcode,bysignal); if (!bysignal && exitcode == 0) receiveChildInfo(); } else { if (!ldbRemoveChild(pid)) { serverLog(LL_WARNING, "Warning, detected child with unmatched pid: %ld", (long)pid); } } updateDictResizePolicy(); closeChildInfoPipe(); } }
|
RDB文件结构

rsi可选项
主从复制下,节点状态信息(能避免全同步就避免)
某些操作的宏定义
- 数据库操作
为了区分我们接下来的数据是哪种类型的,因此引入了1个字节的宏。
- 值类型
为啥这样区分呢?
因为我们知道整型要比字符串更省内存,因此在保存对象时,都是优先尝试是否可以INT编码
:
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566
| /* Like rdbSaveRawString() gets a Redis object instead. */ssize_t rdbSaveStringObject(rio *rdb, robj *obj) { /* Avoid to decode the object, then encode it again, if the * object is already integer encoded. */ if (obj->encoding == OBJ_ENCODING_INT) { return rdbSaveLongLongAsStringObject(rdb,(long)obj->ptr); } else { serverAssertWithInfo(NULL,obj,sdsEncodedObject(obj)); return rdbSaveRawString(rdb,obj->ptr,sdslen(obj->ptr)); }}/* Save a long long value as either an encoded string or a string. */ssize_t rdbSaveLongLongAsStringObject(rio *rdb, long long value) { unsigned char buf[32]; ssize_t n, nwritten = 0; int enclen = rdbEncodeInteger(value,buf); if (enclen > 0) { return rdbWriteRaw(rdb,buf,enclen); } else { /* Encode as string */ enclen = ll2string((char*)buf,32,value); serverAssert(enclen < 32); if ((n = rdbSaveLen(rdb,enclen)) == -1) return -1; nwritten += n; if ((n = rdbWriteRaw(rdb,buf,enclen)) == -1) return -1; nwritten += n; } return nwritten;}/* Save a string object as [len][data] on disk. If the object is a string * representation of an integer value we try to save it in a special form */ssize_t rdbSaveRawString(rio *rdb, unsigned char *s, size_t len) { int enclen; ssize_t n, nwritten = 0; /* Try integer encoding */ if (len <= 11) { unsigned char buf[5]; if ((enclen = rdbTryIntegerEncoding((char*)s,len,buf)) > 0) { if (rdbWriteRaw(rdb,buf,enclen) == -1) return -1; return enclen; } } /* Try LZF compression - under 20 bytes it's unable to compress even * aaaaaaaaaaaaaaaaaa so skip it */ if (server.rdb_compression && len > 20) { n = rdbSaveLzfStringObject(rdb,s,len); if (n == -1) return -1; if (n > 0) return n; /* Return value of 0 means data can't be compressed, save the old way */ } /* Store verbatim */ if ((n = rdbSaveLen(rdb,len)) == -1) return -1; nwritten += n; if (len > 0) { if (rdbWriteRaw(rdb,s,len) == -1) return -1; nwritten += len; } return nwritten;}
|
- 整型类型
- 对象类型
12345678910111213141516171819202122
| /* Map object types to RDB object types. Macros starting with OBJ_ are for * memory storage and may change. Instead RDB types must be fixed because * we store them on disk. */
|
对过期键的处理
- 创建rdb时候会过滤掉过期的键
- 读取时:
- 主:只会载入未过期的;
- 从:全部载入,因为后期主从会同步
1234567891011121314151617181920212223
| 核心函数rdbLoadRio: /* Read key */ if ((key = rdbLoadStringObject(rdb)) == NULL) goto eoferr; /* Read value */ if ((val = rdbLoadObject(type,rdb)) == NULL) goto eoferr; //看到没,主会过滤掉。而从的话。直接载入 if (server.masterhost == NULL && !loading_aof && expiretime != -1 && expiretime < now) { decrRefCount(key); decrRefCount(val); } else { /* Add the new object in the hash table */ dbAdd(db,key,val); /* Set the expire time if needed */ if (expiretime != -1) setExpire(NULL,db,key,expiretime); /* Set usage information (for eviction). */ objectSetLRUOrLFU(val,lfu_freq,lru_idle,lru_clock); /* Decrement the key refcount since dbAdd() will take its * own reference. */ decrRefCount(key); }
|
RDB优缺点
优点
- 文件紧凑,一个二进制文件,适用于备份,全量复制等场景。
- Redis启动时恢复速度快。
- 适合做冷备,比如定期将rdb文件复制到远程文件系统中。
缺点
- 无法做到实时持久化/秒级持久化。
- 可能会丢失数据。
- 兼容性的问题:RDB文件是二进制格式,不同Redis版本之间的RDB格式可能不一致,老版本的Redis可能无法读取新版本生成的RDB文件。
