Golang Badger Source Code Analysis

Writing Data

  1. Writing to writeCh

    func (db *DB) sendToWriteCh(entries []*Entry) (*request, error) {
    	if atomic.LoadInt32(&db.blockWrites) == 1 {
    		return nil, ErrBlockedWrites
    	}
    	var count, size int64
    	for _, e := range entries {
    		size += e.estimateSizeAndSetThreshold(db.valueThreshold())
    		count++
    	}
    	if count >= db.opt.maxBatchCount || size >= db.opt.maxBatchSize {
    		return nil, ErrTxnTooBig
    	}
    
    	// We can only service one request because we need each txn to be stored in a contiguous section.
    	// Txns should not interleave among other txns or rewrites.
    	req := requestPool.Get().(*request)
    	req.reset()
    	req.Entries = entries
    	req.Wg.Add(1)
    	req.IncrRef()     // for db write
    	db.writeCh <- req // Handled in doWrites.
    	y.NumPutsAdd(db.opt.MetricsEnabled, int64(len(entries)))
    
    	return req, nil
    }
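
    For context, callers typically wrap sendToWriteCh in a helper that blocks until the request has been flushed. A minimal sketch of such a helper (modeled on badger's batch-set path; the helper name is illustrative, and it assumes request.Wait, which waits on req.Wg and returns req.Err):

    // batchSetSketch sends entries to writeCh and blocks until the
    // writeRequests goroutine has persisted them. Illustrative sketch only.
    func (db *DB) batchSetSketch(entries []*Entry) error {
    	req, err := db.sendToWriteCh(entries)
    	if err != nil {
    		return err
    	}
    	// req.Wg was Add(1)'d in sendToWriteCh; writeRequests calls
    	// Wg.Done() and sets req.Err once the batch has been written.
    	return req.Wait()
    }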
    
  2. Batching requests into reqs

    Internally this is a producer-consumer pattern: pendingCh is a channel with a buffer of 1 that serves as the bridge between producer and consumer.

    When the buffer slot is free, the producer can start a writeRequests goroutine to consume reqs. After the consumer finishes a batch, it takes the buffered struct{}{} back out of pendingCh, letting the producer know another batch can be consumed. If the consumer has not finished yet and reqs reaches the threshold 3*kvWriteChCapacity, the current goroutine blocks on the send to pendingCh.

    In short, writes to the DB are single-threaded: only one writeRequests goroutine at a time consumes the data written to the DB.

    func (db *DB) doWrites(lc *z.Closer) {
    	defer lc.Done()
    	pendingCh := make(chan struct{}, 1)
    	// Consume the batched reqs.
    	writeRequests := func(reqs []*request) {
    		if err := db.writeRequests(reqs); err != nil {
    			db.opt.Errorf("writeRequests: %v", err)
    		}
    		<-pendingCh
    	}
    
    	// This variable tracks the number of pending writes.
    	reqLen := new(expvar.Int)
    	y.PendingWritesSet(db.opt.MetricsEnabled, db.opt.Dir, reqLen)
    	// Buffer of pending requests.
    	reqs := make([]*request, 0, 10)
    	for {
    		var r *request
    		select {
    		case r = <-db.writeCh:
    		case <-lc.HasBeenClosed():
    			goto closedCase
    		}
    
    		for {
    			reqs = append(reqs, r)
    			reqLen.Set(int64(len(reqs)))
    			// Buffer threshold exceeded; hand off for writing.
    			if len(reqs) >= 3*kvWriteChCapacity {
    				pendingCh <- struct{}{} // blocking.
    				goto writeCase
    			}
    
    			select {
    			// Either push to pending, or continue to pick from writeCh.
    			case r = <-db.writeCh:
    				// Keep appending to the reqs buffer.
    			case pendingCh <- struct{}{}:
    				// writeRequests finished the previous batch; the next one can start.
    				goto writeCase
    			case <-lc.HasBeenClosed():
    				goto closedCase
    			}
    		}
    
    	closedCase:
    		// All the pending requests are drained.
    		// Don't close the writeCh, because it has been used in several places.
    		for {
    			select {
    			case r = <-db.writeCh:
    				reqs = append(reqs, r)
    			default:
    				pendingCh <- struct{}{} // Push to pending before doing a write.
    				writeRequests(reqs)
    				return
    			}
    		}
    
    	writeCase:
    		go writeRequests(reqs)
    		reqs = make([]*request, 0, 10)
    		reqLen.Set(0)
    	}
    }
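
    Stripped of badger's specifics, the coordination pattern above reduces to the following minimal, self-contained sketch (a one-slot pendingCh gating a single in-flight consumer):

    package main

    import "fmt"

    func main() {
    	writeCh := make(chan int, 16)       // stands in for db.writeCh
    	pendingCh := make(chan struct{}, 1) // one slot: at most one consumer in flight

    	consume := func(batch []int) {
    		fmt.Println("writing batch:", batch)
    		<-pendingCh // release the slot: the next batch may be handed off
    	}

    	go func() {
    		for i := 0; i < 10; i++ {
    			writeCh <- i
    		}
    		close(writeCh)
    	}()

    	batch := make([]int, 0, 4)
    	for v := range writeCh {
    		batch = append(batch, v)
    		select {
    		case pendingCh <- struct{}{}: // slot free: hand the batch off
    			go consume(batch)
    			batch = make([]int, 0, 4)
    		default: // consumer still busy: keep batching
    		}
    	}
    	pendingCh <- struct{}{} // wait for the in-flight consumer to finish
    	consume(batch)          // flush whatever is left
    }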
    
  3. Preparing to write to the LSM

    1. Write the oversized values in the requests to the vlog file
    2. Send the changes to subscribers
    3. Make sure there is enough room to write each entry
    // writeRequests is called serially by only one goroutine.
    func (db *DB) writeRequests(reqs []*request) error {
    	if len(reqs) == 0 {
    		return nil
    	}
    
    	done := func(err error) {
    		for _, r := range reqs {
    			r.Err = err
    			r.Wg.Done()
    		}
    	}
    	db.opt.Debugf("writeRequests called. Writing to value log")
    	// 1. Write the oversized values in the requests to the vlog.
    	err := db.vlog.write(reqs)
    	if err != nil {
    		done(err)
    		return err
    	}
    
    	db.opt.Debugf("Sending updates to subscribers")
    	// 2. Updates can be subscribed to; send them to the subscribers.
    	db.pub.sendUpdates(reqs)
    	db.opt.Debugf("Writing to memtable")
    	var count int
    	for _, b := range reqs {
    		if len(b.Entries) == 0 {
    			continue
    		}
    		count += len(b.Entries)
    		var i uint64
    		var err error
    		// 3. Make sure there is enough room.
    		for err = db.ensureRoomForWrite(); err == errNoRoom; err = db.ensureRoomForWrite() {
    			i++
    			if i%100 == 0 {
    				db.opt.Debugf("Making room for writes")
    			}
    			// We need to poll a bit because both hasRoomForWrite and the flusher need access to s.imm.
    			// When flushChan is full and you are blocked there, and the flusher is trying to update s.imm,
    			// you will get a deadlock.
    			time.Sleep(10 * time.Millisecond)
    		}
    		if err != nil {
    			done(err)
    			return y.Wrap(err, "writeRequests")
    		}
    		// Write the data to the LSM.
    		if err := db.writeToLSM(b); err != nil {
    			done(err)
    			return y.Wrap(err, "writeRequests")
    		}
    	}
    	done(nil)
    	db.opt.Debugf("%d entries written", count)
    	return nil
    }
    
  4. Writing to the LSM

    func (db *DB) writeToLSM(b *request) error {
    	db.lock.RLock()
    	defer db.lock.RUnlock()
    	for i, entry := range b.Entries {
    		var err error
    		if db.opt.managedTxns || entry.skipVlogAndSetThreshold(db.valueThreshold()) {
    			// Will include deletion / tombstone case.
    			err = db.mt.Put(entry.Key,
    				y.ValueStruct{
    					Value: entry.Value, // The actual value is stored.
    					// Ensure value pointer flag is removed. Otherwise, the value will fail
    					// to be retrieved during iterator prefetch. `bitValuePointer` is only
    					// known to be set in write to LSM when the entry is loaded from a backup
    					// with lower ValueThreshold and its value was stored in the value log.
    					Meta:      entry.meta &^ bitValuePointer, // Clear this entry's value-pointer flag bit.
    					UserMeta:  entry.UserMeta,
    					ExpiresAt: entry.ExpiresAt,
    				})
    		} else {
    			// Write pointer to Memtable.
    			err = db.mt.Put(entry.Key,
    				y.ValueStruct{
    					Value:     b.Ptrs[i].Encode(), // Store the vlog file offset where the value lives.
    					Meta:      entry.meta | bitValuePointer, // Set this entry's value-pointer flag bit.
    					UserMeta:  entry.UserMeta,
    					ExpiresAt: entry.ExpiresAt,
    				})
    		}
    		if err != nil {
    			return y.Wrapf(err, "while writing to memTable")
    		}
    	}
    	if db.opt.SyncWrites {
    		return db.mt.SyncWAL()
    	}
    	return nil
    }
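
    The two branches differ only in whether the entry's value or a pointer into the vlog is stored, and the meta bit mirrors that choice: &^ (AND NOT) clears the bitValuePointer flag, while | sets it. A tiny self-contained illustration of the two operators (the flag constant below is local to this sketch):

    package main

    import "fmt"

    const bitValuePointer byte = 1 << 1 // illustrative; badger defines its own constant

    func main() {
    	meta := byte(0b0000_0111)
    	fmt.Printf("cleared: %08b\n", meta&^bitValuePointer) // AND NOT forces the flag bit to 0
    	fmt.Printf("set:     %08b\n", meta|bitValuePointer)  // OR forces the flag bit to 1
    }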
    
  5. Writing to the memTable

    1. Write the data to the WAL
    2. Write the data to the skiplist
    func (mt *memTable) Put(key []byte, value y.ValueStruct) error {
    	entry := &Entry{
    		Key:       key,
    		Value:     value.Value,
    		UserMeta:  value.UserMeta,
    		meta:      value.Meta,
    		ExpiresAt: value.ExpiresAt,
    	}
    	// 1. If a WAL is configured, write to it first.
    	// wal is nil only when badger in running in in-memory mode and we don't need the wal.
    	if mt.wal != nil {
    		// If WAL exceeds opt.ValueLogFileSize, we'll force flush the memTable. See logic in
    		// ensureRoomForWrite.
    		if err := mt.wal.writeEntry(mt.buf, entry, mt.opt); err != nil {
    			return y.Wrapf(err, "cannot write entry to WAL file")
    		}
    	}
    	// 2. A transaction-finish marker does not need to go into the skiplist.
    	// We insert the finish marker in the WAL but not in the memtable.
    	if entry.meta&bitFinTxn > 0 {
    		return nil
    	}
    	// 3. Write the data to the skiplist.
    	// Write to skiplist and update maxVersion encountered.
    	mt.sl.Put(key, value)
    	if ts := y.ParseTs(entry.Key); ts > mt.maxVersion {
    		mt.maxVersion = ts
    	}
    	return nil
    }
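
    The maxVersion bookkeeping relies on badger's internal key layout: the commit timestamp is appended to the user key as 8 big-endian bytes, stored inverted so that newer versions sort first. A hedged sketch of that layout and of what y.ParseTs recovers (paraphrased, not the verbatim implementation):

    package main

    import (
    	"encoding/binary"
    	"fmt"
    	"math"
    )

    // keyWithTs appends the inverted timestamp so newer versions sort first.
    func keyWithTs(key []byte, ts uint64) []byte {
    	out := make([]byte, len(key)+8)
    	copy(out, key)
    	binary.BigEndian.PutUint64(out[len(key):], math.MaxUint64-ts)
    	return out
    }

    // parseTs recovers the timestamp from the last 8 bytes of an internal key.
    func parseTs(key []byte) uint64 {
    	if len(key) <= 8 {
    		return 0
    	}
    	return math.MaxUint64 - binary.BigEndian.Uint64(key[len(key)-8:])
    }

    func main() {
    	k := keyWithTs([]byte("user-key"), 42)
    	fmt.Println(parseTs(k)) // 42
    }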
    

Level Management

  1. NumCompactors workers run compactions concurrently

    func (s *levelsController) startCompact(lc *z.Closer) {
    	n := s.kv.opt.NumCompactors
    	lc.AddRunning(n - 1)
    	for i := 0; i < n; i++ {
    		go s.runCompactor(i, lc)
    	}
    }
    
  2. Compaction between levels

    1. level0 → baseLevel, picking tables from level0
    2. level0 → level0
    3. level_n-1 → level_n
    4. levelMax → levelMax
    func (s *levelsController) runCompactor(id int, lc *z.Closer) {
    	defer lc.Done()
    
    	randomDelay := time.NewTimer(time.Duration(rand.Int31n(1000)) * time.Millisecond)
    	select {
    	case <-randomDelay.C:
    	case <-lc.HasBeenClosed():
    		randomDelay.Stop()
    		return
    	}
    
    	moveL0toFront := func(prios []compactionPriority) []compactionPriority {
    		idx := -1
    		for i, p := range prios {
    			if p.level == 0 {
    				idx = i
    				break
    			}
    		}
    		// If idx == -1, we didn't find L0.
    		// If idx == 0, then we don't need to do anything. L0 is already at the front.
    		if idx > 0 {
    			out := append([]compactionPriority{}, prios[idx])
    			out = append(out, prios[:idx]...)
    			out = append(out, prios[idx+1:]...)
    			return out
    		}
    		return prios
    	}
    
    	run := func(p compactionPriority) bool {
    		// Run the compaction.
    		err := s.doCompact(id, p)
    		switch err {
    		case nil:
    			return true
    		case errFillTables:
    			// pass
    		default:
    			s.kv.opt.Warningf("While running doCompact: %v\n", err)
    		}
    		return false
    	}
    	runOnce := func() bool {
    		// Rank the levels by compaction priority.
    		prios := s.pickCompactLevels()
    		if id == 0 {
    			// Worker ID zero prefers to compact L0 always.
    			prios = moveL0toFront(prios)
    		}
    		// Try each level in the sorted order, from highest priority to lowest.
    		for _, p := range prios {
    			if id == 0 && p.level == 0 {
    				// Allow worker zero to run level 0, irrespective of its adjusted score.
    			} else if p.adjusted < 1.0 {
    				break
    			}
    			if run(p) {
    				return true
    			}
    		}
    
    		return false
    	}
    
    	tryLmaxToLmaxCompaction := func() {
    		p := compactionPriority{
    			level: s.lastLevel().level,
    			t:     s.levelTargets(),
    		}
    		run(p)
    	}
    	count := 0
    	ticker := time.NewTicker(50 * time.Millisecond)
    	defer ticker.Stop()
    	for {
    		select {
    		// Can add a done channel or other stuff.
    		case <-ticker.C:
    			// Attempt a compaction every 50ms.
    			count++
    			// Each ticker is 50ms so 50*200=10seconds.
    			if s.kv.opt.LmaxCompaction && id == 2 && count >= 200 {
    				tryLmaxToLmaxCompaction()
    				count = 0
    			} else {
    				runOnce()
    			}
    		case <-lc.HasBeenClosed():
    			return
    		}
    	}
    }
    
  3. Ranking the levels

    1. Inspect the existing data and compute the target each level should reach

      // Inspect the current data and compute the target each level should reach.
      // levelTargets calculates the targets for levels in the LSM tree. The idea comes from Dynamic Level
      // Sizes ( <https://rocksdb.org/blog/2015/07/23/dynamic-level.html> ) in RocksDB. The sizes of levels
      // are calculated based on the size of the lowest level, typically L6. So, if L6 size is 1GB, then
      // L5 target size is 100MB, L4 target size is 10MB and so on.
      //
      // L0 files don't automatically go to L1. Instead, they get compacted to Lbase, where Lbase is
      // chosen based on the first level which is non-empty from top (check L1 through L6). For an empty
      // DB, that would be L6.  So, L0 compactions go to L6, then L5, L4 and so on.
      //
      // Lbase is advanced to the upper levels when its target size exceeds BaseLevelSize. For
      // example, when L6 reaches 1.1GB, then L4 target sizes becomes 11MB, thus exceeding the
      // BaseLevelSize of 10MB. L3 would then become the new Lbase, with a target size of 1MB <
      // BaseLevelSize.
      func (s *levelsController) levelTargets() targets {
      	adjust := func(sz int64) int64 {
      		if sz < s.kv.opt.BaseLevelSize {
      			return s.kv.opt.BaseLevelSize
      		}
      		return sz
      	}
      
      	t := targets{
      		targetSz: make([]int64, len(s.levels)),
      		fileSz:   make([]int64, len(s.levels)),
      	}
      	// DB size is the size of the last level.
      	dbSize := s.lastLevel().getTotalSize()
      	for i := len(s.levels) - 1; i > 0; i-- {
      		ltarget := adjust(dbSize)
      		t.targetSz[i] = ltarget
      		if t.baseLevel == 0 && ltarget <= s.kv.opt.BaseLevelSize {
      			t.baseLevel = i
      		}
      		dbSize /= int64(s.kv.opt.LevelSizeMultiplier)
      	}
      
      	tsz := s.kv.opt.BaseTableSize
      	for i := 0; i < len(s.levels); i++ {
      		if i == 0 {
      			// Use MemTableSize for Level 0. Because at Level 0, we stop compactions based on the
      			// number of tables, not the size of the level. So, having a 1:1 size ratio between
      			// memtable size and the size of L0 files is better than churning out 32 files per
      			// memtable (assuming 64MB MemTableSize and 2MB BaseTableSize).
      			t.fileSz[i] = s.kv.opt.MemTableSize
      		} else if i <= t.baseLevel {
      			t.fileSz[i] = tsz
      		} else {
      			tsz *= int64(s.kv.opt.TableSizeMultiplier)
      			t.fileSz[i] = tsz
      		}
      	}
      
      	// Bring the base level down to the last empty level.
      	for i := t.baseLevel + 1; i < len(s.levels)-1; i++ {
      		if s.levels[i].getTotalSize() > 0 {
      			break
      		}
      		t.baseLevel = i
      	}
      
      	// If the base level is empty and the next level size is less than the
      	// target size, pick the next level as the base level.
      	b := t.baseLevel
      	lvl := s.levels
      	if b < len(lvl)-1 && lvl[b].getTotalSize() == 0 && lvl[b+1].getTotalSize() < t.targetSz[b+1] {
      		t.baseLevel++
      	}
      	return t
      }
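
      A quick worked example, assuming BaseLevelSize = 10MB, LevelSizeMultiplier = 10 and an L6 holding 2GB: walking up from the last level gives targets of 2GB (L6), 200MB (L5) and 20MB (L4); L3's raw target of 2MB is adjusted up to the 10MB floor and, being <= BaseLevelSize, L3 becomes the base level (L1 and L2 also get the 10MB floor).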
      
    2. Priority ordering

      Priority ordering follows three rules:

      1. For Level_0, the priority is the current number of tables divided by the expected number of Level_0 tables:

        score_0 = Level_0.numTables / NumLevelZeroTables

      2. For every other level, the priority is computed from the actual size actualSize and the expected size targetSize. A score of at least 1 means the level has reached its size threshold and should be compacted; the larger the score, the further past the threshold it is and the more urgently it needs compacting:

        score_n = actualSize / targetSize

      3. For adjacent levels, compare the scores of Level_{n-1} and Level_n. If Level_n's score is smaller, compacting Level_{n-1} can take priority, since Level_n exceeds its target by less than Level_{n-1} does. So when Level_{n-1} needs compacting, its priority can be computed from the ratio of Level_{n-1} to Level_n. Note that Level_n itself does not necessarily need compacting. Compared with rule 2, this rule accounts for more scenarios (see the numeric sketch after this list):

        adjusted_{n-1} = adjusted_{n-1} / adjusted_n
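
      To make the adjustment concrete, a small self-contained sketch with made-up sizes (all numbers hypothetical; the real pickCompactLevels also floors the divisor at 0.01, which this sketch omits):

      package main

      import "fmt"

      func main() {
      	// Hypothetical actual/target sizes for Lbase..Lmax, in MB.
      	actual := []float64{25, 30, 180, 2200}
      	target := []float64{10, 20, 200, 2000}

      	score := make([]float64, len(actual))
      	adjusted := make([]float64, len(actual))
      	for i := range actual {
      		score[i] = actual[i] / target[i]
      		adjusted[i] = score[i]
      	}

      	// Rule 3: divide a level's adjusted score by the level below it.
      	// A full level below de-prioritizes this level; an underfull one
      	// boosts it.
      	for i := 0; i < len(adjusted)-1; i++ {
      		if adjusted[i] >= 1 {
      			adjusted[i] /= adjusted[i+1]
      		}
      	}
      	fmt.Println("score:   ", score)    // [2.5 1.5 0.9 1.1]
      	fmt.Println("adjusted:", adjusted) // the first level drops from 2.5 to ~1.67; the second rises to ~1.67
      }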

      // Rank the levels by compaction priority.
      // pickCompactLevel determines which level to compact.
      // Based on: <https://github.com/facebook/rocksdb/wiki/Leveled-Compaction>
      func (s *levelsController) pickCompactLevels() (prios []compactionPriority) {
      	// Compute the per-level targets.
      	t := s.levelTargets()
      	addPriority := func(level int, score float64) {
      		pri := compactionPriority{
      			level:    level,
      			score:    score,
      			adjusted: score,
      			t:        t,
      		}
      		prios = append(prios, pri)
      	}
      
      	// Add L0 priority based on the number of tables.
      	addPriority(0, float64(s.levels[0].numTables())/float64(s.kv.opt.NumLevelZeroTables))
      
      	// All other levels use size to calculate priority.
      	for i := 1; i < len(s.levels); i++ {
      		// Don't consider those tables that are already being compacted right now.
      		delSize := s.cstatus.delSize(i)
      
      		l := s.levels[i]
      		sz := l.getTotalSize() - delSize
      		addPriority(i, float64(sz)/float64(t.targetSz[i]))
      	}
      	y.AssertTrue(len(prios) == len(s.levels))
      
      	// The following code is borrowed from PebbleDB and results in healthier LSM tree structure.
      	// If Li-1 has score > 1.0, then we'll divide Li-1 score by Li. If Li score is >= 1.0, then Li-1
      	// score is reduced, which means we'll prioritize the compaction of lower levels (L5, L4 and so
      	// on) over the higher levels (L0, L1 and so on). On the other hand, if Li score is < 1.0, then
      	// we'll increase the priority of Li-1.
      	// Overall what this means is, if the bottom level is already overflowing, then de-prioritize
      	// compaction of the above level. If the bottom level is not full, then increase the priority of
      	// above level.
      	var prevLevel int
      	for level := t.baseLevel; level < len(s.levels); level++ {
      		if prios[prevLevel].adjusted >= 1 {
      			// Avoid absurdly large scores by placing a floor on the score that we'll
      			// adjust a level by. The value of 0.01 was chosen somewhat arbitrarily
      			const minScore = 0.01
      			if prios[level].score >= minScore {
      				prios[prevLevel].adjusted /= prios[level].adjusted
      			} else {
      				prios[prevLevel].adjusted /= minScore
      			}
      		}
      		prevLevel = level
      	}
      
      	// Pick all the levels whose original score is >= 1.0, irrespective of their adjusted score.
      	// We'll still sort them by their adjusted score below. Having both these scores allows us to
      	// make better decisions about compacting L0. If we see a score >= 1.0, we can do L0->L0
      	// compactions. If the adjusted score >= 1.0, then we can do L0->Lbase compactions.
      	out := prios[:0]
      	for _, p := range prios[:len(prios)-1] {
      		if p.score >= 1.0 {
      			out = append(out, p)
      		}
      	}
      	prios = out
      
      	// Sort by the adjusted score.
      	sort.Slice(prios, func(i, j int) bool {
      		return prios[i].adjusted > prios[j].adjusted
      	})
      	return prios
      }
      
  4. Walk the levels in priority order and find one where a compaction can run

    // doCompact picks some table on level l and compacts it away to the next level.
    func (s *levelsController) doCompact(id int, p compactionPriority) error {
    	l := p.level
    	y.AssertTrue(l < s.kv.opt.MaxLevels) // Sanity check.
    	if p.t.baseLevel == 0 {
    		p.t = s.levelTargets()
    	}
    
    	_, span := otrace.StartSpan(context.Background(), "Badger.Compaction")
    	defer span.End()
    
    	cd := compactDef{
    		compactorId:  id,
    		span:         span,
    		p:            p,
    		t:            p.t,
    		thisLevel:    s.levels[l],
    		dropPrefixes: p.dropPrefixes,
    	}
    	// 1. Find tables that can be compacted.
    	// While picking tables to be compacted, both levels' tables are expected to
    	// remain unchanged.
    	if l == 0 {
    		// level_0 -> baseLevel
    		cd.nextLevel = s.levels[p.t.baseLevel]
    		if !s.fillTablesL0(&cd) {
    			return errFillTables
    		}
    	} else {
    		cd.nextLevel = cd.thisLevel
    		// We're not compacting the last level so pick the next level.
    		if !cd.thisLevel.isLastLevel() {
    			// level_n-1 -> level_n
    			cd.nextLevel = s.levels[l+1]
    		} // else level_n -> level_n
    		if !s.fillTables(&cd) {
    			return errFillTables
    		}
    	}
    	defer s.cstatus.delete(cd) // Remove the ranges from compaction status.
    
    	span.Annotatef(nil, "Compaction: %+v", cd)
    	// 2. Run the compaction.
    	if err := s.runCompactDef(id, l, cd); err != nil {
    		// This compaction couldn't be done successfully.
    		s.kv.opt.Warningf("[Compactor: %d] LOG Compact FAILED with error: %+v: %+v", id, err, cd)
    		return err
    	}
    
    	s.kv.opt.Debugf("[Compactor: %d] Compaction for level: %d DONE", id, cd.thisLevel.level)
    	return nil
    }
    
    1. Selecting tables

      1. Sort the tables: order them by each table's max version, from smallest to largest.
      2. Validate the tables: cstatus stores the key ranges currently being compacted at each level. cstatus.overlapsWith only takes an RLock to check for overlap (cheaper than a full Lock); cstatus.compareAndAdd then takes the exclusive Lock, re-checks for overlap, and only adds the range to cstatus when there is none (a minimal sketch of this pattern follows the code below).
      func (s *levelsController) fillTables(cd *compactDef) bool {
      	cd.lockLevels()
      	defer cd.unlockLevels()
      
      	tables := make([]*table.Table, len(cd.thisLevel.tables))
      	copy(tables, cd.thisLevel.tables)
      	if len(tables) == 0 {
      		return false
      	}
      	// We're doing a maxLevel to maxLevel compaction. Pick tables based on the stale data size.
      	if cd.thisLevel.isLastLevel() {
      		return s.fillMaxLevelTables(tables, cd)
      	}
      	// We pick tables, so we compact older tables first. This is similar to
      	// kOldestLargestSeqFirst in RocksDB.
      	s.sortByHeuristic(tables, cd)
      
      	for _, t := range tables {
      		cd.thisSize = t.Size()
      		cd.thisRange = getKeyRange(t)
      		// cstatus holds the ranges being compacted; if this range overlaps an existing one, try the next table.
      		// If we're already compacting this range, don't do anything.
      		if s.cstatus.overlapsWith(cd.thisLevel.level, cd.thisRange) {
      			continue
      		}
      		cd.top = []*table.Table{t}
      		left, right := cd.nextLevel.overlappingTables(levelHandlerRLocked{}, cd.thisRange)
      
      		cd.bot = make([]*table.Table, right-left)
      		copy(cd.bot, cd.nextLevel.tables[left:right])
      
      		if len(cd.bot) == 0 {
      			cd.bot = []*table.Table{}
      			cd.nextRange = cd.thisRange
      			if !s.cstatus.compareAndAdd(thisAndNextLevelRLocked{}, *cd) {
      				continue
      			}
      			return true
      		}
      		cd.nextRange = getKeyRange(cd.bot...)
      		// Check whether the next level overlaps.
      		if s.cstatus.overlapsWith(cd.nextLevel.level, cd.nextRange) {
      			continue
      		}
      		if !s.cstatus.compareAndAdd(thisAndNextLevelRLocked{}, *cd) {
      			continue
      		}
      		return true
      	}
      	return false
      }
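
      A minimal sketch of the cheap-check / confirm-under-lock pattern described in step 2 above, with simplified types (illustrative, not badger's actual compactStatus):

      package main

      import (
      	"bytes"
      	"fmt"
      	"sync"
      )

      type keyRange struct{ left, right []byte }

      func (r keyRange) overlaps(o keyRange) bool {
      	return bytes.Compare(r.left, o.right) <= 0 && bytes.Compare(o.left, r.right) <= 0
      }

      type compactStatus struct {
      	mu     sync.RWMutex
      	ranges []keyRange
      }

      // overlapsWith takes only the read lock, so many goroutines can probe cheaply.
      func (cs *compactStatus) overlapsWith(r keyRange) bool {
      	cs.mu.RLock()
      	defer cs.mu.RUnlock()
      	for _, cur := range cs.ranges {
      		if cur.overlaps(r) {
      			return true
      		}
      	}
      	return false
      }

      // compareAndAdd re-checks under the write lock: the earlier RLock probe
      // may have raced with another goroutine adding an overlapping range.
      func (cs *compactStatus) compareAndAdd(r keyRange) bool {
      	cs.mu.Lock()
      	defer cs.mu.Unlock()
      	for _, cur := range cs.ranges {
      		if cur.overlaps(r) {
      			return false
      		}
      	}
      	cs.ranges = append(cs.ranges, r)
      	return true
      }

      func main() {
      	cs := &compactStatus{}
      	fmt.Println(cs.compareAndAdd(keyRange{[]byte("a"), []byte("f")})) // true: range registered
      	fmt.Println(cs.overlapsWith(keyRange{[]byte("d"), []byte("k")}))  // true: skip this table
      }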
      
    2. Running the compaction

      1. merge: split the tables selected at Level_n into chunks, up to five of them, each containing at least three tables (see the sketch after the code below). Each data chunk can then be merged with the top tables, building the new tables concurrently.
      2. fix: delete the top tables and replace the bottom tables with the newly merged ones.
      // addSplits can allow us to run multiple sub-compactions in parallel across the split key ranges.
      func (s *levelsController) runCompactDef(id, l int, cd compactDef) (err error) {
      	if len(cd.t.fileSz) == 0 {
      		return errors.New("Filesizes cannot be zero. Targets are not set")
      	}
      	timeStart := time.Now()
      
      	thisLevel := cd.thisLevel
      	nextLevel := cd.nextLevel
      
      	y.AssertTrue(len(cd.splits) == 0)
      	if thisLevel.level == nextLevel.level {
      		// don't do anything for L0 -> L0 and Lmax -> Lmax.
      	} else {
      		// For level_n-1 -> level_n, split the bottom tables into chunks for merging.
      		s.addSplits(&cd)
      	}
      	if len(cd.splits) == 0 {
      		cd.splits = append(cd.splits, keyRange{})
      	}
      
      	// Table should never be moved directly between levels, always be rewritten to allow discarding
      	// invalid versions.
      	// Build the merged tables.
      	newTables, decr, err := s.compactBuildTables(l, cd)
      	if err != nil {
      		return err
      	}
      	defer func() {
      		// Only assign to err, if it's not already nil.
      		if decErr := decr(); err == nil {
      			err = decErr
      		}
      	}()
      	changeSet := buildChangeSet(&cd, newTables)
      
      	// We write to the manifest _before_ we delete files (and after we created files)
      	if err := s.kv.manifest.addChanges(changeSet.Changes); err != nil {
      		return err
      	}
      	// Replace the bottom level's tables with the newly merged ones.
      	// See comment earlier in this function about the ordering of these ops, and the order in which
      	// we access levels when reading.
      	if err := nextLevel.replaceTables(cd.bot, newTables); err != nil {
      		return err
      	}
      	// Delete the top level's tables.
      	if err := thisLevel.deleteTables(cd.top); err != nil {
      		return err
      	}
      
      	// Note: For level 0, while doCompact is running, it is possible that new tables are added.
      	// However, the tables are added only to the end, so it is ok to just delete the first table.
      
      	from := append(tablesToString(cd.top), tablesToString(cd.bot)...)
      	to := tablesToString(newTables)
      	if dur := time.Since(timeStart); dur > 2*time.Second {
      		var expensive string
      		if dur > time.Second {
      			expensive = " [E]"
      		}
      		s.kv.opt.Infof("[%d]%s LOG Compact %d->%d (%d, %d -> %d tables with %d splits)."+
      			" [%s] -> [%s], took %v\n",
      			id, expensive, thisLevel.level, nextLevel.level, len(cd.top), len(cd.bot),
      			len(newTables), len(cd.splits), strings.Join(from, " "), strings.Join(to, " "),
      			dur.Round(time.Millisecond))
      	}
      
      	if cd.thisLevel.level != 0 && len(newTables) > 2*s.kv.opt.LevelSizeMultiplier {
      		s.kv.opt.Debugf("This Range (numTables: %d)\nLeft:\n%s\nRight:\n%s\n",
      			len(cd.top), hex.Dump(cd.thisRange.left), hex.Dump(cd.thisRange.right))
      		s.kv.opt.Debugf("Next Range (numTables: %d)\nLeft:\n%s\nRight:\n%s\n",
      			len(cd.bot), hex.Dump(cd.nextRange.left), hex.Dump(cd.nextRange.right))
      	}
      	return nil
      }
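
      For reference, a simplified sketch of the splitting rule from step 1 above (at most five chunks of bottom tables, never fewer than three tables per chunk); this paraphrases the idea behind addSplits rather than reproducing badger's implementation:

      package main

      import (
      	"fmt"
      	"math"
      )

      // splitWidth returns how many bottom-level tables each sub-compaction
      // covers: the tables are divided into at most five chunks, with a
      // minimum chunk size of three tables.
      func splitWidth(numBotTables int) int {
      	width := int(math.Ceil(float64(numBotTables) / 5.0))
      	if width < 3 {
      		width = 3
      	}
      	return width
      }

      func main() {
      	for _, n := range []int{4, 12, 40} {
      		w := splitWidth(n)
      		fmt.Printf("%d bottom tables -> chunks of %d (~%d splits)\n", n, w, (n+w-1)/w)
      	}
      }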