Data writes
- Writing into writeCh

```go
func (db *DB) sendToWriteCh(entries []*Entry) (*request, error) {
	if atomic.LoadInt32(&db.blockWrites) == 1 {
		return nil, ErrBlockedWrites
	}
	var count, size int64
	for _, e := range entries {
		size += e.estimateSizeAndSetThreshold(db.valueThreshold())
		count++
	}
	if count >= db.opt.maxBatchCount || size >= db.opt.maxBatchSize {
		return nil, ErrTxnTooBig
	}

	// We can only service one request because we need each txn to be stored in a contiguous section.
	// Txns should not interleave among other txns or rewrites.
	req := requestPool.Get().(*request)
	req.reset()
	req.Entries = entries
	req.Wg.Add(1)
	req.IncrRef()     // for db write
	db.writeCh <- req // Handled in doWrites.
	y.NumPutsAdd(db.opt.MetricsEnabled, int64(len(entries)))

	return req, nil
}
```
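Callers of sendToWriteCh see the maxBatchCount/maxBatchSize limit as ErrTxnTooBig. On the application side the usual way to cope is to commit what has accumulated and retry the offending key in a fresh transaction. A minimal sketch of that pattern, assuming the badger v4 import path and made-up keys, values and directory:

```go
package main

import (
	"log"

	badger "github.com/dgraph-io/badger/v4"
)

// writeMany sets every key/value pair, splitting into multiple transactions
// whenever the current one reports ErrTxnTooBig.
func writeMany(db *badger.DB, kvs map[string]string) error {
	txn := db.NewTransaction(true)   // read-write transaction
	defer func() { txn.Discard() }() // safe to call even after a successful Commit

	for k, v := range kvs {
		err := txn.Set([]byte(k), []byte(v))
		if err == badger.ErrTxnTooBig {
			// The batch hit maxBatchCount/maxBatchSize: commit what we have
			// and retry the current key in a fresh transaction.
			if err = txn.Commit(); err != nil {
				return err
			}
			txn = db.NewTransaction(true)
			err = txn.Set([]byte(k), []byte(v))
		}
		if err != nil {
			return err
		}
	}
	return txn.Commit()
}

func main() {
	db, err := badger.Open(badger.DefaultOptions("/tmp/badger-txn-demo"))
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()
	if err := writeMany(db, map[string]string{"k1": "v1", "k2": "v2"}); err != nil {
		log.Fatal(err)
	}
}
```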
- Writing into reqs

  Internally, doWrites is a producer-consumer pattern. pendingCh is a channel with a buffer of 1 that acts as the bridge between producer and consumer. When the buffer is empty, the producer can start a writeRequests goroutine to consume reqs. Once the consumer has finished a batch, it takes the buffered struct{}{} back out of pendingCh, letting the producer know the next batch can be consumed. If the consumer is not done yet and reqs reaches the threshold 3*kvWriteChCapacity, the current goroutine blocks on the send to pendingCh. Overall, writes to the db are single-threaded, because only one writeRequests goroutine at a time consumes the data written to the db.

```go
func (db *DB) doWrites(lc *z.Closer) {
	defer lc.Done()
	pendingCh := make(chan struct{}, 1)

	// Consume the buffered reqs.
	writeRequests := func(reqs []*request) {
		if err := db.writeRequests(reqs); err != nil {
			db.opt.Errorf("writeRequests: %v", err)
		}
		<-pendingCh
	}

	// This variable tracks the number of pending writes.
	reqLen := new(expvar.Int)
	y.PendingWritesSet(db.opt.MetricsEnabled, db.opt.Dir, reqLen)

	// Buffer of pending requests.
	reqs := make([]*request, 0, 10)
	for {
		var r *request
		select {
		case r = <-db.writeCh:
		case <-lc.HasBeenClosed():
			goto closedCase
		}

		for {
			reqs = append(reqs, r)
			reqLen.Set(int64(len(reqs)))

			// The buffer is over the threshold; force a write.
			if len(reqs) >= 3*kvWriteChCapacity {
				pendingCh <- struct{}{} // blocking.
				goto writeCase
			}

			select {
			// Either push to pending, or continue to pick from writeCh.
			case r = <-db.writeCh:
				// Keep appending to the reqs buffer.
			case pendingCh <- struct{}{}:
				// writeRequests finished the previous batch; start consuming the next one.
				goto writeCase
			case <-lc.HasBeenClosed():
				goto closedCase
			}
		}

	closedCase:
		// All the pending request are drained.
		// Don't close the writeCh, because it has be used in several places.
		for {
			select {
			case r = <-db.writeCh:
				reqs = append(reqs, r)
			default:
				pendingCh <- struct{}{} // Push to pending before doing a write.
				writeRequests(reqs)
				return
			}
		}

	writeCase:
		go writeRequests(reqs)
		reqs = make([]*request, 0, 10)
		reqLen.Set(0)
	}
}
```
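The pendingCh dance above is a one-slot semaphore built from a buffered channel: a send claims the slot before a consumer goroutine is launched, and the consumer frees it with a receive when it finishes, so at most one writeRequests batch is in flight at a time. Below is a stripped-down, self-contained sketch of the same pattern; the names and the lack of a blocking threshold are simplifications, not Badger's actual code:

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	in := make(chan int, 16)          // stands in for db.writeCh
	pending := make(chan struct{}, 1) // one-slot semaphore, like pendingCh

	// The single consumer: writes one batch, then frees the slot.
	writeBatch := func(batch []int) {
		fmt.Println("writing batch:", batch)
		time.Sleep(10 * time.Millisecond) // stands in for db.writeRequests
		<-pending                         // release the slot
	}

	go func() {
		for i := 0; i < 10; i++ {
			in <- i
		}
		close(in)
	}()

	var batch []int
	for v := range in {
		batch = append(batch, v)
		select {
		case pending <- struct{}{}:
			// Slot acquired: hand the current batch to a consumer goroutine
			// and start accumulating a new one.
			go writeBatch(batch)
			batch = nil
		default:
			// Consumer still busy: keep buffering (Badger additionally blocks
			// once the buffer reaches 3*kvWriteChCapacity).
		}
	}
	// Flush whatever is left; the send waits for the in-flight batch to finish.
	if len(batch) > 0 {
		pending <- struct{}{}
		writeBatch(batch)
	}
}
```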
- Preparing to write to the LSM (writeRequests)
  - Write the large values in the requests to the vlog file.
  - Send the changes to any subscribers.
  - Make sure there is enough room to write every entry.

```go
// writeRequests is called serially by only one goroutine.
func (db *DB) writeRequests(reqs []*request) error {
	if len(reqs) == 0 {
		return nil
	}

	done := func(err error) {
		for _, r := range reqs {
			r.Err = err
			r.Wg.Done()
		}
	}
	db.opt.Debugf("writeRequests called. Writing to value log")
	// 1. Write the large values in the requests to the vlog.
	err := db.vlog.write(reqs)
	if err != nil {
		done(err)
		return err
	}

	db.opt.Debugf("Sending updates to subscribers")
	// 2. If anyone subscribed to updates, send them the changes.
	db.pub.sendUpdates(reqs)
	db.opt.Debugf("Writing to memtable")
	var count int
	for _, b := range reqs {
		if len(b.Entries) == 0 {
			continue
		}
		count += len(b.Entries)
		var i uint64
		var err error
		// 3. Make sure there is enough room to write.
		for err = db.ensureRoomForWrite(); err == errNoRoom; err = db.ensureRoomForWrite() {
			i++
			if i%100 == 0 {
				db.opt.Debugf("Making room for writes")
			}
			// We need to poll a bit because both hasRoomForWrite and the flusher need access to s.imm.
			// When flushChan is full and you are blocked there, and the flusher is trying to update s.imm,
			// you will get a deadlock.
			time.Sleep(10 * time.Millisecond)
		}
		if err != nil {
			done(err)
			return y.Wrap(err, "writeRequests")
		}
		// Write the data into the LSM tree.
		if err := db.writeToLSM(b); err != nil {
			done(err)
			return y.Wrap(err, "writeRequests")
		}
	}
	done(nil)
	db.opt.Debugf("%d entries written", count)
	return nil
}
```
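The done(err) closure is how the single writer goroutine reports the outcome back to every caller in the batch: sendToWriteCh did req.Wg.Add(1), writeRequests stamps req.Err and calls Wg.Done, and the caller blocks on req.Wg.Wait() before reading the error. A generic sketch of this submit-and-wait pattern; the request type and channel below are illustrative, not Badger's:

```go
package main

import (
	"errors"
	"fmt"
	"sync"
)

// request mimics the shape of Badger's: callers block on wg and then read err.
type request struct {
	payload string
	err     error
	wg      sync.WaitGroup
}

func main() {
	ch := make(chan *request)

	// Single writer goroutine: processes requests and stamps the result.
	go func() {
		for r := range ch {
			if r.payload == "" {
				r.err = errors.New("empty payload")
			}
			r.wg.Done() // unblock the submitter
		}
	}()

	submit := func(p string) error {
		r := &request{payload: p}
		r.wg.Add(1)
		ch <- r
		r.wg.Wait() // wait until the writer has handled it
		return r.err
	}

	fmt.Println(submit("hello")) // <nil>
	fmt.Println(submit(""))      // empty payload
	close(ch)
}
```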
- Writing to the LSM

```go
func (db *DB) writeToLSM(b *request) error {
	db.lock.RLock()
	defer db.lock.RUnlock()

	for i, entry := range b.Entries {
		var err error
		if db.opt.managedTxns || entry.skipVlogAndSetThreshold(db.valueThreshold()) {
			// Will include deletion / tombstone case.
			err = db.mt.Put(entry.Key,
				y.ValueStruct{
					Value: entry.Value, // The actual value is stored inline.
					// Ensure value pointer flag is removed. Otherwise, the value will fail
					// to be retrieved during iterator prefetch. `bitValuePointer` is only
					// known to be set in write to LSM when the entry is loaded from a backup
					// with lower ValueThreshold and its value was stored in the value log.
					Meta:      entry.meta &^ bitValuePointer, // Clear the "value is a pointer" flag bit.
					UserMeta:  entry.UserMeta,
					ExpiresAt: entry.ExpiresAt,
				})
		} else {
			// Write pointer to Memtable.
			err = db.mt.Put(entry.Key,
				y.ValueStruct{
					Value:     b.Ptrs[i].Encode(),           // The stored value is the offset of the value in its vlog file.
					Meta:      entry.meta | bitValuePointer, // Set the "value is a pointer" flag bit.
					UserMeta:  entry.UserMeta,
					ExpiresAt: entry.ExpiresAt,
				})
		}
		if err != nil {
			return y.Wrapf(err, "while writing to memTable")
		}
	}
	if db.opt.SyncWrites {
		return db.mt.SyncWAL()
	}
	return nil
}
```
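The two branches differ only in one bit of Meta: `|` sets the bitValuePointer flag when a vlog pointer is stored, and `&^` (AND NOT) clears it when the value is stored inline. A tiny illustration of the two operators; the flag constant below is made up for the demo, not Badger's actual value:

```go
package main

import "fmt"

const bitValuePointer byte = 1 << 1 // illustrative flag value, not Badger's real constant

func main() {
	var meta byte = 0b0000_0101

	withPtr := meta | bitValuePointer    // set the flag: the memtable holds a vlog pointer
	inline := withPtr &^ bitValuePointer // clear the flag: the memtable holds the value inline

	fmt.Printf("meta=%08b set=%08b cleared=%08b\n", meta, withPtr, inline)
	fmt.Println("is pointer:", withPtr&bitValuePointer != 0) // true
	fmt.Println("is pointer:", inline&bitValuePointer != 0)  // false
}
```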
- Writing to the memTable
  - Write the data to the WAL.
  - Write the data to the skiplist.

```go
func (mt *memTable) Put(key []byte, value y.ValueStruct) error {
	entry := &Entry{
		Key:       key,
		Value:     value.Value,
		UserMeta:  value.UserMeta,
		meta:      value.Meta,
		ExpiresAt: value.ExpiresAt,
	}

	// 1. If a WAL is configured, write to the WAL first.
	// wal is nil only when badger in running in in-memory mode and we don't need the wal.
	if mt.wal != nil {
		// If WAL exceeds opt.ValueLogFileSize, we'll force flush the memTable. See logic in
		// ensureRoomForWrite.
		if err := mt.wal.writeEntry(mt.buf, entry, mt.opt); err != nil {
			return y.Wrapf(err, "cannot write entry to WAL file")
		}
	}
	// 2. A transaction-finish marker does not need to go into the skiplist.
	// We insert the finish marker in the WAL but not in the memtable.
	if entry.meta&bitFinTxn > 0 {
		return nil
	}

	// 3. Write the data into the skiplist.
	// Write to skiplist and update maxVersion encountered.
	mt.sl.Put(key, value)
	if ts := y.ParseTs(entry.Key); ts > mt.maxVersion {
		mt.maxVersion = ts
	}
	return nil
}
```
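Putting the pieces together, this is the pipeline that runs underneath an ordinary user write: Txn.Commit → sendToWriteCh → doWrites → writeRequests → writeToLSM → memTable.Put. A minimal usage sketch, assuming the badger v4 import path; the directory, key and value are arbitrary:

```go
package main

import (
	"log"

	badger "github.com/dgraph-io/badger/v4"
)

func main() {
	db, err := badger.Open(badger.DefaultOptions("/tmp/badger-demo"))
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	// Update opens a read-write transaction; on Commit the pending entries are
	// handed to sendToWriteCh and flow through the pipeline described above.
	err = db.Update(func(txn *badger.Txn) error {
		return txn.Set([]byte("answer"), []byte("42"))
	})
	if err != nil {
		log.Fatal(err)
	}
}
```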
Level management
- NumCompactors workers run compactions concurrently.

```go
func (s *levelsController) startCompact(lc *z.Closer) {
	n := s.kv.opt.NumCompactors
	lc.AddRunning(n - 1)
	for i := 0; i < n; i++ {
		go s.runCompactor(i, lc)
	}
}
```
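startCompact registers the workers on the z.Closer (one slot is presumably already accounted for by the caller, hence AddRunning(n - 1)) and runs each runCompactor in its own goroutine. As a rough, dependency-free approximation of that start/stop pattern, here is the same shape built from context and sync.WaitGroup; this is not z.Closer's real API:

```go
package main

import (
	"context"
	"fmt"
	"sync"
	"time"
)

func main() {
	const numCompactors = 4
	ctx, cancel := context.WithCancel(context.Background())
	var wg sync.WaitGroup

	runCompactor := func(id int) {
		defer wg.Done()
		ticker := time.NewTicker(50 * time.Millisecond)
		defer ticker.Stop()
		for {
			select {
			case <-ticker.C:
				fmt.Printf("worker %d: looking for a compaction\n", id)
			case <-ctx.Done(): // plays the role of lc.HasBeenClosed()
				return
			}
		}
	}

	wg.Add(numCompactors)
	for i := 0; i < numCompactors; i++ {
		go runCompactor(i)
	}

	time.Sleep(200 * time.Millisecond)
	cancel()  // ask the workers to stop...
	wg.Wait() // ...and wait for them to exit
}
```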
- compact: level compaction. A compaction can take one of four paths:
  - level0 → baseLevel, picking tables from level0
  - level0 → level0
  - level_n-1 → level_n
  - levelMax → levelMax

```go
func (s *levelsController) runCompactor(id int, lc *z.Closer) {
	defer lc.Done()

	randomDelay := time.NewTimer(time.Duration(rand.Int31n(1000)) * time.Millisecond)
	select {
	case <-randomDelay.C:
	case <-lc.HasBeenClosed():
		randomDelay.Stop()
		return
	}

	moveL0toFront := func(prios []compactionPriority) []compactionPriority {
		idx := -1
		for i, p := range prios {
			if p.level == 0 {
				idx = i
				break
			}
		}
		// If idx == -1, we didn't find L0.
		// If idx == 0, then we don't need to do anything. L0 is already at the front.
		if idx > 0 {
			out := append([]compactionPriority{}, prios[idx])
			out = append(out, prios[:idx]...)
			out = append(out, prios[idx+1:]...)
			return out
		}
		return prios
	}

	run := func(p compactionPriority) bool {
		// Run the compaction.
		err := s.doCompact(id, p)
		switch err {
		case nil:
			return true
		case errFillTables:
			// pass
		default:
			s.kv.opt.Warningf("While running doCompact: %v\n", err)
		}
		return false
	}

	runOnce := func() bool {
		// Sort the levels by priority.
		prios := s.pickCompactLevels()
		if id == 0 {
			// Worker ID zero prefers to compact L0 always.
			prios = moveL0toFront(prios)
		}
		// Walk the sorted levels from highest to lowest priority and try each one.
		for _, p := range prios {
			if id == 0 && p.level == 0 {
				// Allow worker zero to run level 0, irrespective of its adjusted score.
			} else if p.adjusted < 1.0 {
				break
			}
			if run(p) {
				return true
			}
		}

		return false
	}

	tryLmaxToLmaxCompaction := func() {
		p := compactionPriority{
			level: s.lastLevel().level,
			t:     s.levelTargets(),
		}
		run(p)
	}
	count := 0
	ticker := time.NewTicker(50 * time.Millisecond)
	defer ticker.Stop()
	for {
		select {
		// Can add a done channel or other stuff.
		case <-ticker.C:
			// Try a compaction every 50ms.
			count++
			// Each ticker is 50ms so 50*200=10seconds.
			if s.kv.opt.LmaxCompaction && id == 2 && count >= 200 {
				tryLmaxToLmaxCompaction()
				count = 0
			} else {
				runOnce()
			}
		case <-lc.HasBeenClosed():
			return
		}
	}
}
```
- Ranking the levels
  - Look at the current data and work out the target each level should reach.

```go
// Examine the current data and work out the target that each level should reach.
// levelTargets calculates the targets for levels in the LSM tree. The idea comes from Dynamic Level
// Sizes (https://rocksdb.org/blog/2015/07/23/dynamic-level.html) in RocksDB. The sizes of levels
// are calculated based on the size of the lowest level, typically L6. So, if L6 size is 1GB, then
// L5 target size is 100MB, L4 target size is 10MB and so on.
//
// L0 files don't automatically go to L1. Instead, they get compacted to Lbase, where Lbase is
// chosen based on the first level which is non-empty from top (check L1 through L6). For an empty
// DB, that would be L6. So, L0 compactions go to L6, then L5, L4 and so on.
//
// Lbase is advanced to the upper levels when its target size exceeds BaseLevelSize. For
// example, when L6 reaches 1.1GB, then L4 target sizes becomes 11MB, thus exceeding the
// BaseLevelSize of 10MB. L3 would then become the new Lbase, with a target size of 1MB <
// BaseLevelSize.
func (s *levelsController) levelTargets() targets {
	adjust := func(sz int64) int64 {
		if sz < s.kv.opt.BaseLevelSize {
			return s.kv.opt.BaseLevelSize
		}
		return sz
	}

	t := targets{
		targetSz: make([]int64, len(s.levels)),
		fileSz:   make([]int64, len(s.levels)),
	}
	// DB size is the size of the last level.
	dbSize := s.lastLevel().getTotalSize()
	for i := len(s.levels) - 1; i > 0; i-- {
		ltarget := adjust(dbSize)
		t.targetSz[i] = ltarget
		if t.baseLevel == 0 && ltarget <= s.kv.opt.BaseLevelSize {
			t.baseLevel = i
		}
		dbSize /= int64(s.kv.opt.LevelSizeMultiplier)
	}

	tsz := s.kv.opt.BaseTableSize
	for i := 0; i < len(s.levels); i++ {
		if i == 0 {
			// Use MemTableSize for Level 0. Because at Level 0, we stop compactions based on the
			// number of tables, not the size of the level. So, having a 1:1 size ratio between
			// memtable size and the size of L0 files is better than churning out 32 files per
			// memtable (assuming 64MB MemTableSize and 2MB BaseTableSize).
			t.fileSz[i] = s.kv.opt.MemTableSize
		} else if i <= t.baseLevel {
			t.fileSz[i] = tsz
		} else {
			tsz *= int64(s.kv.opt.TableSizeMultiplier)
			t.fileSz[i] = tsz
		}
	}

	// Bring the base level down to the last empty level.
	for i := t.baseLevel + 1; i < len(s.levels)-1; i++ {
		if s.levels[i].getTotalSize() > 0 {
			break
		}
		t.baseLevel = i
	}

	// If the base level is empty and the next level size is less than the
	// target size, pick the next level as the base level.
	b := t.baseLevel
	lvl := s.levels
	if b < len(lvl)-1 && lvl[b].getTotalSize() == 0 && lvl[b+1].getTotalSize() < t.targetSz[b+1] {
		t.baseLevel++
	}
	return t
}
```
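To make the target computation concrete, here is a small standalone distillation of the first loop above, with assumed option values (BaseLevelSize = 10MB, LevelSizeMultiplier = 10, 7 levels) and a made-up last-level size of 1000MB; it is an illustration, not the real function:

```go
package main

import "fmt"

func main() {
	const (
		mb                  = int64(1 << 20)
		baseLevelSize       = 10 * mb // assumed option value
		levelSizeMultiplier = int64(10)
		numLevels           = 7
	)
	lastLevelSize := 1000 * mb // suppose L6 currently holds ~1000MB

	targets := make([]int64, numLevels)
	baseLevel := 0
	dbSize := lastLevelSize
	for i := numLevels - 1; i > 0; i-- {
		if dbSize < baseLevelSize {
			targets[i] = baseLevelSize // floor every target at BaseLevelSize
		} else {
			targets[i] = dbSize
		}
		if baseLevel == 0 && targets[i] <= baseLevelSize {
			baseLevel = i // first level (from the bottom) whose target fits in BaseLevelSize
		}
		dbSize /= levelSizeMultiplier
	}

	for i := 1; i < numLevels; i++ {
		fmt.Printf("L%d target: %d MB\n", i, targets[i]/mb)
	}
	fmt.Println("baseLevel:", baseLevel)
	// With these numbers: L6=1000MB, L5=100MB, L4=10MB (and L3..L1 get the 10MB floor),
	// so Lbase = L4.
}
```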
- Priority ordering. There are three rules (a worked example follows the code below):
  - L0's priority is the number of its tables divided by the expected number of tables (NumLevelZeroTables).
  - A non-L0 level's priority is its actual size divided by its target size. A score >= 1 means the level has reached its threshold and should be shrunk; the larger the score, the further past the threshold it is and the more it needs compaction.
  - For adjacent levels L_{i-1} and L_i, the scores can be compared: if L_i's score is smaller, L_{i-1} can be compacted first, because relative to the space L_{i-1} has overflowed, L_i has overflowed less. So when a compaction is needed, the adjusted score is used to order the levels. Note that a high adjusted score alone does not mean a level must be compacted; candidates are still picked by the original score. Compared with rule 2, this rule covers more situations.

```go
// Sort the levels by compaction priority.
// pickCompactLevel determines which level to compact.
// Based on: https://github.com/facebook/rocksdb/wiki/Leveled-Compaction
func (s *levelsController) pickCompactLevels() (prios []compactionPriority) {
	// Work out the target for each level.
	t := s.levelTargets()
	addPriority := func(level int, score float64) {
		pri := compactionPriority{
			level:    level,
			score:    score,
			adjusted: score,
			t:        t,
		}
		prios = append(prios, pri)
	}

	// Add L0 priority based on the number of tables.
	addPriority(0, float64(s.levels[0].numTables())/float64(s.kv.opt.NumLevelZeroTables))

	// All other levels use size to calculate priority.
	for i := 1; i < len(s.levels); i++ {
		// Don't consider those tables that are already being compacted right now.
		delSize := s.cstatus.delSize(i)

		l := s.levels[i]
		sz := l.getTotalSize() - delSize
		addPriority(i, float64(sz)/float64(t.targetSz[i]))
	}
	y.AssertTrue(len(prios) == len(s.levels))

	// The following code is borrowed from PebbleDB and results in healthier LSM tree structure.
	// If Li-1 has score > 1.0, then we'll divide Li-1 score by Li. If Li score is >= 1.0, then Li-1
	// score is reduced, which means we'll prioritize the compaction of lower levels (L5, L4 and so
	// on) over the higher levels (L0, L1 and so on). On the other hand, if Li score is < 1.0, then
	// we'll increase the priority of Li-1.
	// Overall what this means is, if the bottom level is already overflowing, then de-prioritize
	// compaction of the above level. If the bottom level is not full, then increase the priority of
	// above level.
	var prevLevel int
	for level := t.baseLevel; level < len(s.levels); level++ {
		if prios[prevLevel].adjusted >= 1 {
			// Avoid absurdly large scores by placing a floor on the score that we'll
			// adjust a level by. The value of 0.01 was chosen somewhat arbitrarily
			const minScore = 0.01
			if prios[level].score >= minScore {
				prios[prevLevel].adjusted /= prios[level].adjusted
			} else {
				prios[prevLevel].adjusted /= minScore
			}
		}
		prevLevel = level
	}

	// Pick all the levels whose original score is >= 1.0, irrespective of their adjusted score.
	// We'll still sort them by their adjusted score below. Having both these scores allows us to
	// make better decisions about compacting L0. If we see a score >= 1.0, we can do L0->L0
	// compactions. If the adjusted score >= 1.0, then we can do L0->Lbase compactions.
	out := prios[:0]
	for _, p := range prios[:len(prios)-1] {
		if p.score >= 1.0 {
			out = append(out, p)
		}
	}
	prios = out

	// Sort by the adjusted score.
	sort.Slice(prios, func(i, j int) bool {
		return prios[i].adjusted > prios[j].adjusted
	})
	return prios
}
```
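A worked example with made-up numbers: say L0 holds 8 tables with NumLevelZeroTables = 5 (raw score 8/5 = 1.6), Lbase = L5 holds 300MB against a 100MB target (score 3.0), and L6 holds 800MB against a 1GB target (score 0.8). The PebbleDB-style pass divides a level's adjusted score by the level below it whenever it is >= 1: L0 becomes 1.6 / 3.0 ≈ 0.53 (L5 is already overflowing, so pushing L0 into it is de-prioritized; worker 0 and L0→L0 compactions ignore the adjusted score anyway), and L5 becomes 3.0 / 0.8 = 3.75 (L6 still has room, so draining L5 into it becomes more attractive). The candidate list then keeps only levels whose raw score is >= 1.0 (L0 and L5 here) and sorts them by adjusted score, so L5 is compacted first.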
- Walk the levels in the sorted priority order and check, one by one, which level can run a compaction (doCompact).

```go
// doCompact picks some table on level l and compacts it away to the next level.
func (s *levelsController) doCompact(id int, p compactionPriority) error {
	l := p.level
	y.AssertTrue(l < s.kv.opt.MaxLevels) // Sanity check.
	if p.t.baseLevel == 0 {
		p.t = s.levelTargets()
	}

	_, span := otrace.StartSpan(context.Background(), "Badger.Compaction")
	defer span.End()

	cd := compactDef{
		compactorId:  id,
		span:         span,
		p:            p,
		t:            p.t,
		thisLevel:    s.levels[l],
		dropPrefixes: p.dropPrefixes,
	}

	// 1. Find the tables that can be compacted.
	// While picking tables to be compacted, both levels' tables are expected to
	// remain unchanged.
	if l == 0 {
		// level_0 -> baseLevel
		cd.nextLevel = s.levels[p.t.baseLevel]
		if !s.fillTablesL0(&cd) {
			return errFillTables
		}
	} else {
		cd.nextLevel = cd.thisLevel
		// We're not compacting the last level so pick the next level.
		if !cd.thisLevel.isLastLevel() {
			// level_n-1 -> level_n
			cd.nextLevel = s.levels[l+1]
		} // else level_n -> level_n
		if !s.fillTables(&cd) {
			return errFillTables
		}
	}
	defer s.cstatus.delete(cd) // Remove the ranges from compaction status.

	span.Annotatef(nil, "Compaction: %+v", cd)
	// 2. Run the compaction.
	if err := s.runCompactDef(id, l, cd); err != nil {
		// This compaction couldn't be done successfully.
		s.kv.opt.Warningf("[Compactor: %d] LOG Compact FAILED with error: %+v: %+v", id, err, cd)
		return err
	}

	s.kv.opt.Debugf("[Compactor: %d] Compaction for level: %d DONE", id, cd.thisLevel.level)
	return nil
}
```
- Selecting tables
  - Sort the tables: order them by each table's max version, from smallest to largest.
  - Validate the tables: cstatus stores the key ranges currently being compacted at every level. cstatus.overlapsWith only takes an RLock to check for an overlap (cheaper than an exclusive Lock); the exclusive Lock is taken only in cstatus.compareAndAdd, which re-checks for overlaps and adds the range to cstatus only when there is none (a generic sketch of this locking pattern follows the code below).

```go
func (s *levelsController) fillTables(cd *compactDef) bool {
	cd.lockLevels()
	defer cd.unlockLevels()

	tables := make([]*table.Table, len(cd.thisLevel.tables))
	copy(tables, cd.thisLevel.tables)
	if len(tables) == 0 {
		return false
	}

	// We're doing a maxLevel to maxLevel compaction. Pick tables based on the stale data size.
	if cd.thisLevel.isLastLevel() {
		return s.fillMaxLevelTables(tables, cd)
	}
	// We pick tables, so we compact older tables first. This is similar to
	// kOldestLargestSeqFirst in RocksDB.
	s.sortByHeuristic(tables, cd)

	for _, t := range tables {
		cd.thisSize = t.Size()
		cd.thisRange = getKeyRange(t)
		// cstatus stores the tables currently being compacted; if this range overlaps with
		// one of them, move on to the next table.
		// If we're already compacting this range, don't do anything.
		if s.cstatus.overlapsWith(cd.thisLevel.level, cd.thisRange) {
			continue
		}
		cd.top = []*table.Table{t}
		left, right := cd.nextLevel.overlappingTables(levelHandlerRLocked{}, cd.thisRange)

		cd.bot = make([]*table.Table, right-left)
		copy(cd.bot, cd.nextLevel.tables[left:right])

		if len(cd.bot) == 0 {
			cd.bot = []*table.Table{}
			cd.nextRange = cd.thisRange
			if !s.cstatus.compareAndAdd(thisAndNextLevelRLocked{}, *cd) {
				continue
			}
			return true
		}
		cd.nextRange = getKeyRange(cd.bot...)

		// Check whether the next level has an overlapping compaction in progress.
		if s.cstatus.overlapsWith(cd.nextLevel.level, cd.nextRange) {
			continue
		}
		if !s.cstatus.compareAndAdd(thisAndNextLevelRLocked{}, *cd) {
			continue
		}
		return true
	}
	return false
}
```
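The RLock/Lock split described in the "validate the tables" bullet is a double-checked locking pattern: take the cheap read lock for the common "this range is already being compacted, skip it" case, and take the exclusive lock only when actually claiming a range, re-checking under it because another compactor may have claimed it in between. A generic sketch of the idea; the types below are illustrative, not Badger's compactStatus:

```go
package main

import (
	"fmt"
	"sync"
)

type rangeSet struct {
	mu      sync.RWMutex
	claimed map[string]bool // key range -> currently being compacted
}

// overlaps only needs a read lock; many compactors can check concurrently.
func (s *rangeSet) overlaps(r string) bool {
	s.mu.RLock()
	defer s.mu.RUnlock()
	return s.claimed[r]
}

// compareAndAdd re-checks under the exclusive lock before claiming the range,
// because another goroutine may have claimed it after our RLock check.
func (s *rangeSet) compareAndAdd(r string) bool {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.claimed[r] {
		return false
	}
	s.claimed[r] = true
	return true
}

func main() {
	s := &rangeSet{claimed: make(map[string]bool)}
	if !s.overlaps("a-m") && s.compareAndAdd("a-m") {
		fmt.Println("claimed a-m for compaction")
	}
	fmt.Println("second claim succeeds:", s.compareAndAdd("a-m")) // false
}
```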
- Running the compaction
  - merge: split the selected bottom tables into chunks, five of them, each with at least 3 tables (see the chunking sketch after the code below). Each data chunk can then be merged with the top tables independently, building the new tables concurrently.
  - fix: delete the top-level tables and replace the bottom-level tables with the newly merged ones.

```go
// addSplits can allow us to run multiple sub-compactions in parallel across the split key ranges.
func (s *levelsController) runCompactDef(id, l int, cd compactDef) (err error) {
	if len(cd.t.fileSz) == 0 {
		return errors.New("Filesizes cannot be zero. Targets are not set")
	}
	timeStart := time.Now()

	thisLevel := cd.thisLevel
	nextLevel := cd.nextLevel

	y.AssertTrue(len(cd.splits) == 0)
	if thisLevel.level == nextLevel.level {
		// don't do anything for L0 -> L0 and Lmax -> Lmax.
	} else {
		// For level_n-1 -> level_n, split the bottom tables into chunks and merge in parallel.
		s.addSplits(&cd)
	}
	if len(cd.splits) == 0 {
		cd.splits = append(cd.splits, keyRange{})
	}

	// Table should never be moved directly between levels, always be rewritten to allow discarding
	// invalid versions.
	// Build the merged tables.
	newTables, decr, err := s.compactBuildTables(l, cd)
	if err != nil {
		return err
	}
	defer func() {
		// Only assign to err, if it's not already nil.
		if decErr := decr(); err == nil {
			err = decErr
		}
	}()
	changeSet := buildChangeSet(&cd, newTables)

	// We write to the manifest _before_ we delete files (and after we created files)
	if err := s.kv.manifest.addChanges(changeSet.Changes); err != nil {
		return err
	}

	// Replace the tables in the bottom level with the newly built tables.
	// See comment earlier in this function about the ordering of these ops, and the order in which
	// we access levels when reading.
	if err := nextLevel.replaceTables(cd.bot, newTables); err != nil {
		return err
	}
	// Delete the tables in the top level.
	if err := thisLevel.deleteTables(cd.top); err != nil {
		return err
	}

	// Note: For level 0, while doCompact is running, it is possible that new tables are added.
	// However, the tables are added only to the end, so it is ok to just delete the first table.

	from := append(tablesToString(cd.top), tablesToString(cd.bot)...)
	to := tablesToString(newTables)

	if dur := time.Since(timeStart); dur > 2*time.Second {
		var expensive string
		if dur > time.Second {
			expensive = " [E]"
		}
		s.kv.opt.Infof("[%d]%s LOG Compact %d->%d (%d, %d -> %d tables with %d splits)."+
			" [%s] -> [%s], took %v\n",
			id, expensive, thisLevel.level, nextLevel.level, len(cd.top), len(cd.bot),
			len(newTables), len(cd.splits), strings.Join(from, " "), strings.Join(to, " "),
			dur.Round(time.Millisecond))
	}

	if cd.thisLevel.level != 0 && len(newTables) > 2*s.kv.opt.LevelSizeMultiplier {
		s.kv.opt.Debugf("This Range (numTables: %d)\nLeft:\n%s\nRight:\n%s\n",
			len(cd.top), hex.Dump(cd.thisRange.left), hex.Dump(cd.thisRange.right))
		s.kv.opt.Debugf("Next Range (numTables: %d)\nLeft:\n%s\nRight:\n%s\n",
			len(cd.bot), hex.Dump(cd.nextRange.left), hex.Dump(cd.nextRange.right))
	}
	return nil
}
```
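The "five chunks of at least three tables" rule from the merge bullet comes down to a width computation over cd.bot before the parallel sub-compactions are spawned. Here is a small standalone sketch of that grouping, written from the description above (the real addSplits also records the right-edge key of each chunk as a keyRange):

```go
package main

import (
	"fmt"
	"math"
)

// splitWidths groups n bottom tables into at most 5 chunks of at least 3 tables each,
// mirroring the rule described above.
func splitWidths(n int) []int {
	if n == 0 {
		return nil
	}
	width := int(math.Ceil(float64(n) / 5.0))
	if width < 3 {
		width = 3
	}
	var chunks []int
	for n > 0 {
		c := width
		if n < c {
			c = n
		}
		chunks = append(chunks, c)
		n -= c
	}
	return chunks
}

func main() {
	for _, n := range []int{4, 12, 23} {
		fmt.Printf("%2d bottom tables -> chunk sizes %v\n", n, splitWidths(n))
	}
	// 4 -> [3 1], 12 -> [3 3 3 3], 23 -> [5 5 5 5 3]
}
```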