Put Function Implementation
The Write/Put/Delete interfaces are all ultimately implemented through Status DBImpl::Write(const WriteOptions& options, WriteBatch* updates). In leveldb, a delete works the same way as a put: a record is appended, just tagged with the deletion type.
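For context, the convenience wrappers that funnel into Write look like this (condensed from leveldb's db_impl.cc):

Status DB::Put(const WriteOptions& opt, const Slice& key, const Slice& value) {
  WriteBatch batch;
  batch.Put(key, value);      // appended as a kTypeValue record
  return Write(opt, &batch);
}

Status DB::Delete(const WriteOptions& opt, const Slice& key) {
  WriteBatch batch;
  batch.Delete(key);          // appended as a kTypeDeletion record, not an in-place erase
  return Write(opt, &batch);
}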
First, look at one structure:
struct DBImpl::Writer {
  explicit Writer(port::Mutex* mu)
      : batch(nullptr), sync(false), done(false), cv(mu) {}

  Status status;
  WriteBatch* batch;  // the data to write
  bool sync;          // whether the log is flushed to disk immediately; without it, data may be lost on a crash
  bool done;          // marks whether this write has completed
  port::CondVar cv;   // condition variable for queue coordination
};
A Writer is the context of a single write operation: it holds the content to write, some write options, and the condition variable that coordinates the writer queue. The batch member holds the content to write:
WriteBatch layout (a per-batch header followed by the records):
// header: 8-byte sequence_number +
//         4-byte count +
// per record:
// 1-byte type, kTypeValue or kTypeDeletion (the last component of the internal key) +
// key_size (varint32) + key +
// value_size (varint32) + value (the value part is omitted for deletions)
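This layout is produced by the append helpers in write_batch.cc (condensed):

void WriteBatch::Put(const Slice& key, const Slice& value) {
  WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1);
  rep_.push_back(static_cast<char>(kTypeValue));
  PutLengthPrefixedSlice(&rep_, key);    // key_size (varint32) + key
  PutLengthPrefixedSlice(&rep_, value);  // value_size (varint32) + value
}

void WriteBatch::Delete(const Slice& key) {
  WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1);
  rep_.push_back(static_cast<char>(kTypeDeletion));
  PutLengthPrefixedSlice(&rep_, key);    // no value for a deletion
}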
The overall skeleton of the Write function is roughly:
Status DBImpl::Write(const WriteOptions& options, WriteBatch* updates) {
  Writer w(&mutex_);       // this write's context, on the caller's stack
  w.batch = updates;
  w.sync = options.sync;
  w.done = false;

  MutexLock l(&mutex_);    // this lock covers the whole write process
  writers_.push_back(&w);  // push onto the queue used for multi-thread cooperation
  // The condition variable turns the queue into serialized write execution.
  while (!w.done && &w != writers_.front()) {
    // w.done: another thread already handled this write, so just return
    // &w == writers_.front(): it is this thread's turn
    w.cv.Wait();
  }
  if (w.done) {  // another writer thread may have executed it on our behalf
    return w.status;
  }

  // Handle the current write.
  Writer* last_writer = &w;
  // .........
  // This handles more than just the current thread's writer: other writers
  // in writers_ are also taken out and packed into one batch. Packing stops
  // at the first sync writer if the current writer is async, and the batch
  // must not exceed a size limit.
  while (true) {
    // Pop writers from the head of the queue and signal those already handled.
    Writer* ready = writers_.front();
    writers_.pop_front();
    if (ready != &w) {
      ready->status = status;
      ready->done = true;
      ready->cv.Signal();
    }
    if (ready == last_writer) break;  // reached the last writer handled in this round
  }

  // Notify new head of write queue
  if (!writers_.empty()) {
    writers_.front()->cv.Signal();
  }
}
As you can see, at any given moment only one thread executes the write path, and writes from different requests are packed into a single batch and executed in one write pass.
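The packing rules live in DBImpl::BuildBatchGroup (condensed from db_impl.cc; asserts and minor details trimmed): stop at the first sync writer when the group leader is non-sync, and cap the group size, with a smaller cap when the leading write is small:

WriteBatch* DBImpl::BuildBatchGroup(Writer** last_writer) {
  mutex_.AssertHeld();
  Writer* first = writers_.front();
  WriteBatch* result = first->batch;
  size_t size = WriteBatchInternal::ByteSize(first->batch);

  // Allow the group to grow up to a maximum size, but if the original
  // write is small, limit the growth so the small write is not slowed
  // down too much by batching.
  size_t max_size = 1 << 20;
  if (size <= (128 << 10)) {
    max_size = size + (128 << 10);
  }

  *last_writer = first;
  std::deque<Writer*>::iterator iter = writers_.begin();
  ++iter;  // Advance past "first"
  for (; iter != writers_.end(); ++iter) {
    Writer* w = *iter;
    if (w->sync && !first->sync) {
      // Do not include a sync write into a batch handled by a non-sync write.
      break;
    }
    if (w->batch != nullptr) {
      size += WriteBatchInternal::ByteSize(w->batch);
      if (size > max_size) break;  // Do not make batch too big
      if (result == first->batch) {
        // Switch to tmp_batch_ instead of disturbing the caller's batch.
        result = tmp_batch_;
        WriteBatchInternal::Append(result, first->batch);
      }
      WriteBatchInternal::Append(result, w->batch);
    }
    *last_writer = w;
  }
  return result;
}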
What does the write itself mainly do?
// May temporarily unlock and wait.
Status status = MakeRoomForWrite(updates == nullptr);  // current thread is at the front of the queue; may start a compaction
uint64_t last_sequence = versions_->LastSequence();
Writer* last_writer = &w;
if (status.ok() && updates != nullptr) {  // nullptr batch is for compactions
  WriteBatch* write_batch = BuildBatchGroup(&last_writer);  // packs several writers into one WriteBatch, handling other threads' writers too
  WriteBatchInternal::SetSequence(write_batch, last_sequence + 1);  // the batch's starting sequence; each record inside consumes one sequence number
  last_sequence += WriteBatchInternal::Count(write_batch);  // last_sequence advances by 1 per kv

  // Add to log and apply to memtable.  We can release the lock
  // during this phase since &w is currently responsible for logging
  // and protects against concurrent loggers and concurrent writes
  // into mem_.
  {
    mutex_.Unlock();
    // Write the log first
    status = log_->AddRecord(WriteBatchInternal::Contents(write_batch));
    bool sync_error = false;
    if (status.ok() && options.sync) {
      status = logfile_->Sync();  // flush the log to disk synchronously
      if (!status.ok()) {
        sync_error = true;
      }
    }
    // Only after the log write succeeds is the memtable updated
    if (status.ok()) {
      status = WriteBatchInternal::InsertInto(write_batch, mem_);
    }
    mutex_.Lock();
    if (sync_error) {
      // The state of the log file is indeterminate: the log record we
      // just added may or may not show up when the DB is re-opened.
      // So we force the DB into a mode where all future writes fail.
      RecordBackgroundError(status);
    }
  }
  if (write_batch == tmp_batch_) tmp_batch_->Clear();

  versions_->SetLastSequence(last_sequence);
}
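As a closing detail, WriteBatchInternal::InsertInto (condensed from write_batch.cc) replays the batch into the memtable. Note how each record consumes its own sequence number, which is why last_sequence above advances by the record count:

namespace {
class MemTableInserter : public WriteBatch::Handler {
 public:
  SequenceNumber sequence_;  // starts at the batch's base sequence
  MemTable* mem_;

  void Put(const Slice& key, const Slice& value) override {
    mem_->Add(sequence_, kTypeValue, key, value);
    sequence_++;  // each record consumes one sequence number
  }
  void Delete(const Slice& key) override {
    mem_->Add(sequence_, kTypeDeletion, key, Slice());
    sequence_++;
  }
};
}  // namespace

Status WriteBatchInternal::InsertInto(const WriteBatch* b, MemTable* memtable) {
  MemTableInserter inserter;
  inserter.sequence_ = WriteBatchInternal::Sequence(b);
  inserter.mem_ = memtable;
  return b->Iterate(&inserter);  // walks the batch records, calling Put/Delete above
}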
When memory runs short, MakeRoomForWrite triggers a compaction that flushes the memtable to disk and merges files. A manually triggered compaction is forced through unconditionally.
// REQUIRES: mutex_ is held
// REQUIRES: this thread is currently at the front of the writer queue
// Input: force == true for a manual CompactRange, which must not be delayed
Status DBImpl::MakeRoomForWrite(bool force) {
  mutex_.AssertHeld();
  assert(!writers_.empty());
  bool allow_delay = !force;  // an ordinary write (not a manual CompactRange) may be delayed
  Status s;
  while (true) {
    if (!bg_error_.ok()) {  // a background error: return it right away
      // Yield previous error
      s = bg_error_;
      break;
    } else if (allow_delay && versions_->NumLevelFiles(0) >=
                                  config::kL0_SlowdownWritesTrigger) {
      // We are getting close to hitting a hard limit on the number of
      // L0 files.  Rather than delaying a single write by several
      // seconds when we hit the hard limit, start delaying each
      // individual write by 1ms to reduce latency variance.  Also,
      // this delay hands over some CPU to the compaction thread in
      // case it is sharing the same core as the writer.
      mutex_.Unlock();
      env_->SleepForMicroseconds(1000);
      allow_delay = false;  // Do not delay a single write more than once
      mutex_.Lock();
    } else if (!force &&
               (mem_->ApproximateMemoryUsage() <= options_.write_buffer_size)) {
      // There is room in the current memtable: nothing to do, just return
      break;
    } else if (imm_ != nullptr) {
      // We have filled up the current memtable, but the previous
      // one is still being compacted, so we wait.
      Log(options_.info_log, "Current memtable full; waiting...\n");
      background_work_finished_signal_.Wait();
    } else if (versions_->NumLevelFiles(0) >= config::kL0_StopWritesTrigger) {
      // There are too many level-0 files: wait for compaction to catch up.
      Log(options_.info_log, "Too many L0 files; waiting...\n");
      background_work_finished_signal_.Wait();
    } else {
      // Attempt to switch to a new memtable and trigger compaction of old
      assert(versions_->PrevLogNumber() == 0);
      uint64_t new_log_number = versions_->NewFileNumber();  // returns next_file_number_++
      WritableFile* lfile = nullptr;
      s = env_->NewWritableFile(LogFileName(dbname_, new_log_number), &lfile);
      if (!s.ok()) {
        // Avoid chewing through file number space in a tight loop.
        versions_->ReuseFileNumber(new_log_number);
        break;
      }
      delete log_;
      delete logfile_;
      logfile_ = lfile;
      logfile_number_ = new_log_number;
      log_ = new log::Writer(lfile);
      imm_ = mem_;  // the full memtable becomes the immutable memtable
      has_imm_.store(true, std::memory_order_release);
      mem_ = new MemTable(internal_comparator_);
      mem_->Ref();
      force = false;  // Do not force another compaction if have room
      MaybeScheduleCompaction();  // schedule the background compaction
    }
  }
  return s;
}
To sum up: if the memtable still has room, MakeRoomForWrite does nothing. If the L0 file count is approaching its limit, each write (unless it is a forced manual compaction) gets delayed by 1ms, at most once. If imm_ is still being flushed to disk, or the flush is done but L0 has already hit its file-count limit, writes stop and wait for compaction to finish. If the memtable is full and no compaction is in progress, it is swapped out and a compaction is scheduled.
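For reference, the thresholds involved are compile-time constants; the values below are from db/dbformat.h in current leveldb and may differ across versions (the annotations are mine):

static const int kL0_SlowdownWritesTrigger = 8;  // start the 1ms-per-write delays
static const int kL0_StopWritesTrigger = 12;     // block writes until compaction catches up
// options_.write_buffer_size defaults to 4 MB (include/leveldb/options.h)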