This article walks through the principles and implementation of etcd's MVCC.
1. Introduction
MVCC (multi-version concurrency control) is a concurrency-control technique whose core idea is to keep multiple historical versions of each piece of data instead of overwriting it in place.
etcd's MVCC is built on two layers: an in-memory B-tree index and the boltdb storage engine.
In the treeIndex module, etcd uses a B-tree to maintain the mapping from user keys to revisions (version numbers).
In the boltdb module, etcd uses the revision as the key and stores the corresponding key-value data as the value; the sketch below illustrates the resulting two-step lookup.
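The following deliberately simplified sketch uses plain Go maps as stand-ins for the B-tree index and the boltdb bucket (conceptualStore and its fields are hypothetical helpers, not etcd types); it only illustrates the key → revision → value indirection:
type conceptualStore struct {
    index  map[string][]int64 // user key -> revision history (stand-in for treeIndex)
    boltdb map[int64][]byte   // revision -> serialized KeyValue (stand-in for the bolt key bucket)
}

// get returns the value of key as of revision atRev.
func (s *conceptualStore) get(key string, atRev int64) ([]byte, bool) {
    // 1. in the index, find the newest revision of the key that is <= atRev
    var rev int64 = -1
    for _, r := range s.index[key] {
        if r <= atRev && r > rev {
            rev = r
        }
    }
    if rev == -1 {
        return nil, false
    }
    // 2. read the value stored under that revision
    v, ok := s.boltdb[rev]
    return v, ok
}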
2. treeIndex
2.1. treeIndex
treeIndex is essentially a B-tree whose elements are keyIndex objects:
type treeIndex struct {
    sync.RWMutex
    tree *btree.BTreeG[*keyIndex]
    lg   *zap.Logger
}
2.2. keyIndex
etcd uses keyIndex to store the mapping between a user key and its revisions:
type keyIndex struct {
    key         []byte       // the user key
    modified    Revision     // the revision of the last modification to this key
    generations []generation // one generation per lifecycle of the key; each holds the revisions of the modifications made during that lifecycle
}
2.3. generations
A generation represents one lifecycle of a key, from its creation to its deletion. Each time a key is deleted and later created again, a new generation starts, so a keyIndex that has gone through several create/delete cycles holds several generations:
type generation struct {
    ver     int64      // how many times the key was modified within this generation
    created Revision   // the revision at which this generation was created (the first put)
    revs    []Revision // the revision of every modification in this generation is appended here
}
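For intuition, the sequence put(1.0); put(2.0); tombstone(3.0); put(4.0); tombstone(5.0) on key "foo" (notation is main.sub, with (t) marking a tombstone) leaves a keyIndex that looks roughly like this, newest generation first:

key:        "foo"
modified:   5.0
generations:
    {empty}              <- current generation; the key is currently deleted
    {4.0, 5.0(t)}        <- second lifecycle: created at 4.0, tombstoned at 5.0
    {1.0, 2.0, 3.0(t)}   <- first lifecycle: created at 1.0, tombstoned at 3.0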
2.4. Revision
The version number in etcd is the Revision, which contains two fields, Main and Sub:
- Main is the globally increasing revision, etcd's logical clock; it increases with every put/txn/delete transaction.
- Sub is the sub-revision within one transaction; it starts at 0 and increases with each put/delete operation inside that transaction.
type Revision struct {
    // Main is the main revision of a set of changes that happen atomically,
    // i.e. the revision of the whole transaction.
    Main int64

    // Sub is the sub revision of a change in a set of changes that happen
    // atomically. Each change has a different, increasing sub revision in
    // that set, distinguishing the operations within one transaction.
    Sub int64
}
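For example, if the store's current revision is 10, a single transaction that performs put("a"), put("b") and then delete("c") assigns {Main: 11, Sub: 0}, {Main: 11, Sub: 1} and {Main: 11, Sub: 2} to the three changes (Sub comes from len(tw.changes), as the put code in section 4.1 shows); a later standalone put would then get {Main: 12, Sub: 0}.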
2.5. index
The treeIndex module implements the following index interface:
// server/storage/mvcc/index.go (around line 25)
type index interface {
    // Get returns the latest modification revision of key at or before atRev,
    // the revision that created the current generation, and the key's version
    // (how many times it has been modified within that generation).
    Get(key []byte, atRev int64) (rev, created Revision, ver int64, err error)
    // Range returns the keys in [key, end) together with the latest revision
    // of each key at or before atRev.
    Range(key, end []byte, atRev int64) ([][]byte, []Revision)
    // Revisions returns up to limit revisions of the keys in [key, end) at or
    // before atRev, plus the total number of matching keys.
    Revisions(key, end []byte, atRev int64, limit int) ([]Revision, int)
    // CountRevisions returns the number of keys in [key, end) that have a
    // revision at or before atRev.
    CountRevisions(key, end []byte, atRev int64) int
    // Put records a new revision for key.
    Put(key []byte, rev Revision)
    // Tombstone marks key as deleted at rev (soft delete).
    Tombstone(key []byte, rev Revision) error
    // RangeSince returns all revisions of the keys in [key, end) since rev.
    RangeSince(key, end []byte, rev int64) []Revision
    // Compact removes the obsolete versions at or below rev.
    Compact(rev int64) map[Revision]struct{}
    Keep(rev int64) map[Revision]struct{}
    Equal(b index) bool
    Insert(ki *keyIndex)
    KeyIndex(ki *keyIndex) *keyIndex
}
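A rough illustration of how the store drives this index. index and newTreeIndex are unexported, so this only runs inside the mvcc package itself, and the values shown are inferred from the interface above rather than from a test run:
ti := newTreeIndex(zap.NewExample())

ti.Put([]byte("foo"), Revision{Main: 2}) // "foo" created at revision 2
ti.Put([]byte("foo"), Revision{Main: 3}) // "foo" modified at revision 3

rev, created, ver, err := ti.Get([]byte("foo"), 3)
// rev     == Revision{Main: 3}  -- latest modification at or before revision 3
// created == Revision{Main: 2}  -- revision that created the current generation
// ver     == 2, err == nil      -- the key has two versions in this generation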
3. boltdb
boltdb itself was covered in an earlier article. For MVCC, the point that matters is that the key bucket in boltdb is not keyed by user keys but by encoded revisions.
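A rough sketch of that encoding, mirroring what NewRevBytes/RevToBytes produce (revToBytesSketch is a hypothetical helper written for illustration and uses encoding/binary; treat the exact layout as an approximation): 8 bytes of big-endian Main, a '_' separator, 8 bytes of big-endian Sub, and a trailing 't' marker for tombstone records.
func revToBytesSketch(r Revision, tombstone bool) []byte {
    b := make([]byte, 17, 18) // 8 + 1 + 8, plus room for the optional tombstone marker
    binary.BigEndian.PutUint64(b[0:8], uint64(r.Main))
    b[8] = '_'
    binary.BigEndian.PutUint64(b[9:17], uint64(r.Sub))
    if tombstone {
        b = append(b, 't')
    }
    return b
}
Because the encoding is big-endian, iterating the bucket returns revisions in ascending order, which is what scheduleCompaction later relies on when it ranges from last to end in batches.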
4. mvcc
4.1. Put
When a client issues a Put request through the API, etcd's KV server receives it and proposes it to the raft module. Once the proposal has been committed by raft, the apply module executes the write through the MVCC module's Put interface.
The MVCC put goes through the following steps:
- look up the key's keyIndex to get its previous created revision and version
- write the new revision/value pair into the write buffer; the backend later persists it to boltdb asynchronously
- update treeIndex with the new revision
func (tw *storeTxnWrite) Put(key, value []byte, lease lease.LeaseID) int64 {
    tw.put(key, value, lease)
    return tw.beginRev + 1
}

func (tw *storeTxnWrite) put(key, value []byte, leaseID lease.LeaseID) {
    rev := tw.beginRev + 1
    c := rev
    oldLease := lease.NoLease

    // if the key exists before, use its previous created and
    // get its previous leaseID
    // 1. look up the keyIndex of this key
    _, created, ver, err := tw.s.kvindex.Get(key, rev)
    if err == nil {
        c = created.Main
        oldLease = tw.s.le.GetLease(lease.LeaseItem{Key: string(key)})
        tw.trace.Step("get key's previous created_revision and leaseID")
    }
    ibytes := NewRevBytes()
    idxRev := Revision{Main: rev, Sub: int64(len(tw.changes))}
    ibytes = RevToBytes(idxRev, ibytes)

    ver = ver + 1
    kv := mvccpb.KeyValue{
        Key:            key,
        Value:          value,
        CreateRevision: c,
        ModRevision:    rev,
        Version:        ver,
        Lease:          int64(leaseID),
    }

    d, err := kv.Marshal()
    if err != nil {
        tw.storeTxnCommon.s.lg.Fatal(
            "failed to marshal mvccpb.KeyValue",
            zap.Error(err),
        )
    }

    tw.trace.Step("marshal mvccpb.KeyValue")
    // 2. write into the buffer; the backend module later flushes it to boltdb asynchronously
    tw.tx.UnsafeSeqPut(schema.Key, ibytes, d)
    // 3. update treeIndex
    tw.s.kvindex.Put(key, idxRev)
    tw.changes = append(tw.changes, kv)
    tw.trace.Step("store kv pair into bolt db")

    // the rest handles leases
    if oldLease == leaseID {
        tw.trace.Step("attach lease to kv pair")
        return
    }

    if oldLease != lease.NoLease {
        if tw.s.le == nil {
            panic("no lessor to detach lease")
        }
        err = tw.s.le.Detach(oldLease, []lease.LeaseItem{{Key: string(key)}})
        if err != nil {
            tw.storeTxnCommon.s.lg.Error(
                "failed to detach old lease from a key",
                zap.Error(err),
            )
        }
    }
    if leaseID != lease.NoLease {
        if tw.s.le == nil {
            panic("no lessor to attach lease")
        }
        err = tw.s.le.Attach(leaseID, []lease.LeaseItem{{Key: string(key)}})
        if err != nil {
            panic("unexpected error from lease Attach")
        }
    }
    tw.trace.Step("attach lease to kv pair")
}
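Viewed from the client side, the fields that put fills in (CreateRevision, ModRevision, Version) surface directly in the read response. A minimal clientv3 sketch; the endpoint is a placeholder:
package main

import (
    "context"
    "fmt"
    "log"

    clientv3 "go.etcd.io/etcd/client/v3"
)

func main() {
    cli, err := clientv3.New(clientv3.Config{Endpoints: []string{"127.0.0.1:2379"}})
    if err != nil {
        log.Fatal(err)
    }
    defer cli.Close()

    // Two puts on the same key: the second one only bumps ModRevision/Version.
    if _, err := cli.Put(context.TODO(), "foo", "v1"); err != nil {
        log.Fatal(err)
    }
    if _, err := cli.Put(context.TODO(), "foo", "v2"); err != nil {
        log.Fatal(err)
    }

    resp, err := cli.Get(context.TODO(), "foo")
    if err != nil {
        log.Fatal(err)
    }
    kv := resp.Kvs[0]
    // CreateRevision: main revision of the first put (start of this generation)
    // ModRevision:    main revision of the second put
    // Version:        2, the key has been modified twice in this generation
    fmt.Println(kv.CreateRevision, kv.ModRevision, kv.Version)
}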
4.2. Get
A Get (Range) request follows the same path through the KV server; inside the MVCC module it comes down to two steps:
- look up the matching revisions in treeIndex
- read the values for those revisions from boltdb (going through the buffer first)
func (tr *storeTxnCommon) Range(ctx context.Context, key, end []byte, ro RangeOptions) (r *RangeResult, err error) {
    // by default, read at the current (latest) revision
    return tr.rangeKeys(ctx, key, end, tr.Rev(), ro)
}

func (tr *storeTxnCommon) rangeKeys(ctx context.Context, key, end []byte, curRev int64, ro RangeOptions) (*RangeResult, error) {
    rev := ro.Rev
    if rev > curRev {
        return &RangeResult{KVs: nil, Count: -1, Rev: curRev}, ErrFutureRev
    }
    if rev <= 0 {
        rev = curRev
    }
    if rev < tr.s.compactMainRev {
        return &RangeResult{KVs: nil, Count: -1, Rev: 0}, ErrCompacted
    }
    if ro.Count {
        total := tr.s.kvindex.CountRevisions(key, end, rev)
        tr.trace.Step("count revisions from in-memory index tree")
        return &RangeResult{KVs: nil, Count: total, Rev: curRev}, nil
    }

    // 1. look up the matching revisions in treeIndex
    revpairs, total := tr.s.kvindex.Revisions(key, end, rev, int(ro.Limit))
    tr.trace.Step("range keys from in-memory index tree")
    if len(revpairs) == 0 {
        return &RangeResult{KVs: nil, Count: total, Rev: curRev}, nil
    }

    limit := int(ro.Limit)
    if limit <= 0 || limit > len(revpairs) {
        limit = len(revpairs)
    }

    kvs := make([]mvccpb.KeyValue, limit)
    revBytes := NewRevBytes()
    for i, revpair := range revpairs[:len(kvs)] {
        select {
        case <-ctx.Done():
            return nil, fmt.Errorf("rangeKeys: context cancelled: %w", ctx.Err())
        default:
        }
        revBytes = RevToBytes(revpair, revBytes)
        // 2. read from the backend: first from the buffer, then from boltdb
        _, vs := tr.tx.UnsafeRange(schema.Key, revBytes, nil, 0)
        if len(vs) != 1 {
            tr.s.lg.Fatal(
                "range failed to find revision pair",
                zap.Int64("revision-main", revpair.Main),
                zap.Int64("revision-sub", revpair.Sub),
                zap.Int64("revision-current", curRev),
                zap.Int64("range-option-rev", ro.Rev),
                zap.Int64("range-option-limit", ro.Limit),
                zap.Binary("key", key),
                zap.Binary("end", end),
                zap.Int("len-revpairs", len(revpairs)),
                zap.Int("len-values", len(vs)),
            )
        }
        if err := kvs[i].Unmarshal(vs[0]); err != nil {
            tr.s.lg.Fatal(
                "failed to unmarshal mvccpb.KeyValue",
                zap.Error(err),
            )
        }
    }
    tr.trace.Step("range keys from bolt db")
    return &RangeResult{KVs: kvs, Count: total, Rev: curRev}, nil
}
func (baseReadTx *baseReadTx) UnsafeRange(bucketType Bucket, key, endKey []byte, limit int64) ([][]byte, [][]byte) {
    if endKey == nil {
        // forbid duplicates for single keys
        limit = 1
    }
    if limit <= 0 {
        limit = math.MaxInt64
    }
    if limit > 1 && !bucketType.IsSafeRangeBucket() {
        panic("do not use unsafeRange on non-keys bucket")
    }

    // read from the buffer first
    keys, vals := baseReadTx.buf.Range(bucketType, key, endKey, limit)
    if int64(len(keys)) == limit {
        return keys, vals
    }

    // find/cache bucket
    bn := bucketType.ID()
    baseReadTx.txMu.RLock()
    bucket, ok := baseReadTx.buckets[bn]
    baseReadTx.txMu.RUnlock()
    lockHeld := false
    if !ok {
        baseReadTx.txMu.Lock()
        lockHeld = true
        bucket = baseReadTx.tx.Bucket(bucketType.Name())
        baseReadTx.buckets[bn] = bucket
    }

    // ignore missing bucket since may have been created in this batch
    if bucket == nil {
        if lockHeld {
            baseReadTx.txMu.Unlock()
        }
        return keys, vals
    }
    if !lockHeld {
        baseReadTx.txMu.Lock()
    }
    c := bucket.Cursor()
    baseReadTx.txMu.Unlock()

    // fetch whatever the buffer did not cover from boltdb itself
    k2, v2 := unsafeRange(c, key, endKey, limit-int64(len(keys)))
    return append(k2, keys...), append(v2, vals...)
}
4.3. Delete
For delete, etcd implements a tombstoning (soft delete) scheme; the mechanics are similar to put.
When a delete is executed, etcd writes a tombstone record into boltdb (a KeyValue carrying only the key, stored under a revision key marked as a tombstone), and the treeIndex module appends the tombstone revision to the key's keyIndex and then starts a new, empty generation to mark the key as deleted.
When the key is queried again, treeIndex locates the keyIndex; if it finds the empty generation and the requested revision is at or after the tombstone revision, it returns nothing.
Note: the real hard delete only happens during compaction, which prunes the key's version history.
func (tw *storeTxnWrite) DeleteRange(key, end []byte) (int64, int64) {
    if n := tw.deleteRange(key, end); n != 0 || len(tw.changes) > 0 {
        return n, tw.beginRev + 1
    }
    return 0, tw.beginRev
}

func (tw *storeTxnWrite) deleteRange(key, end []byte) int64 {
    rrev := tw.beginRev
    if len(tw.changes) > 0 {
        rrev++
    }
    // find the keys visible at rrev in treeIndex
    keys, _ := tw.s.kvindex.Range(key, end, rrev)
    if len(keys) == 0 {
        return 0
    }
    // tombstone each of them
    for _, key := range keys {
        tw.delete(key)
    }
    return int64(len(keys))
}

func (tw *storeTxnWrite) delete(key []byte) {
    ibytes := NewRevBytes()
    idxRev := newBucketKey(tw.beginRev+1, int64(len(tw.changes)), true)
    ibytes = BucketKeyToBytes(idxRev, ibytes)

    kv := mvccpb.KeyValue{Key: key}

    d, err := kv.Marshal()
    if err != nil {
        tw.storeTxnCommon.s.lg.Fatal(
            "failed to marshal mvccpb.KeyValue",
            zap.Error(err),
        )
    }

    tw.tx.UnsafeSeqPut(schema.Key, ibytes, d)
    err = tw.s.kvindex.Tombstone(key, idxRev.Revision)
    if err != nil {
        tw.storeTxnCommon.s.lg.Fatal(
            "failed to tombstone an existing key",
            zap.String("key", string(key)),
            zap.Error(err),
        )
    }
    tw.changes = append(tw.changes, kv)

    item := lease.LeaseItem{Key: string(key)}
    leaseID := tw.s.le.GetLease(item)

    if leaseID != lease.NoLease {
        err = tw.s.le.Detach(leaseID, []lease.LeaseItem{item})
        if err != nil {
            tw.storeTxnCommon.s.lg.Error(
                "failed to detach old lease from a key",
                zap.Error(err),
            )
        }
    }
}
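The soft-delete behaviour is easy to observe from a client: after a delete, a plain read no longer sees the key, but a read at an older revision still does, until a compaction passes that revision. A sketch, assuming a clientv3 client cli created as in the Put example above:
putResp, err := cli.Put(context.TODO(), "foo", "v1")
if err != nil {
    log.Fatal(err)
}
revBeforeDelete := putResp.Header.Revision

// Delete writes a tombstone; it does not physically remove the old versions.
if _, err := cli.Delete(context.TODO(), "foo"); err != nil {
    log.Fatal(err)
}

// A plain read sees the key as gone ...
cur, err := cli.Get(context.TODO(), "foo")
if err != nil {
    log.Fatal(err)
}
fmt.Println(len(cur.Kvs)) // 0

// ... but the old version is still readable until it is compacted away.
old, err := cli.Get(context.TODO(), "foo", clientv3.WithRev(revBeforeDelete))
if err != nil {
    log.Fatal(err)
}
fmt.Println(string(old.Kvs[0].Value)) // "v1"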
4.4. Compact
etcd reclaims historical versions through compaction; compaction always applies to the whole keyspace and cannot be limited to a single key.
etcd supports two compaction modes: periodic compaction and revision compaction.
- Periodic compaction: if you only want etcd to keep the versions written within a recent time window, configure the compaction mode as periodic with a retention of, say, 1h (a configuration example follows this list).
- Revision compaction: compact all versions older than a given revision.
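For periodic compaction the etcd server is typically started with --auto-compaction-mode=periodic --auto-compaction-retention=1h. Revision compaction can also be triggered from a client; a sketch, again assuming the cli from the Put example above:
resp, err := cli.Get(context.TODO(), "foo")
if err != nil {
    log.Fatal(err)
}
curRev := resp.Header.Revision

// Reclaim every version older than curRev. WithCompactPhysical makes the call
// block until the backend has actually removed the old data.
if _, err := cli.Compact(context.TODO(), curRev, clientv3.WithCompactPhysical()); err != nil {
    log.Fatal(err)
}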
The compact flow is as follows:
- The MVCC module's Compact interface first checks whether the requested revision rev has already been compacted; if so, it returns ErrCompacted to the client. It then checks whether rev is larger than the etcd server's current revision; if so, it returns ErrFutureRev.
- It then uses the boltdb API to record the scheduled compact revision (scheduledCompactRev) in the meta bucket, and appends the compaction job to the FIFO scheduler to be executed asynchronously.
func (s *store) Compact(trace *traceutil.Trace, rev int64) (<-chan struct{}, error) {
    s.mu.Lock()

    // check whether the previous compaction has completed
    prevCompactionCompleted := s.checkPrevCompactionCompleted()
    // validate rev and persist the new compact revision
    ch, prevCompactRev, err := s.updateCompactRev(rev)

    trace.Step("check and update compact revision")
    if err != nil {
        s.mu.Unlock()
        return ch, err
    }
    s.mu.Unlock()

    // append the compaction job to the FIFO scheduler
    return s.compact(trace, rev, prevCompactRev, prevCompactionCompleted), nil
}
// updateCompactRev validates rev and persists it as the scheduled compact revision
func (s *store) updateCompactRev(rev int64) (<-chan struct{}, int64, error) {
    s.revMu.Lock()
    if rev <= s.compactMainRev {
        ch := make(chan struct{})
        f := schedule.NewJob("kvstore_updateCompactRev_compactBarrier", func(ctx context.Context) { s.compactBarrier(ctx, ch) })
        s.fifoSched.Schedule(f)
        s.revMu.Unlock()
        return ch, 0, ErrCompacted
    }
    if rev > s.currentRev {
        s.revMu.Unlock()
        return nil, 0, ErrFutureRev
    }
    compactMainRev := s.compactMainRev
    s.compactMainRev = rev
    SetScheduledCompact(s.b.BatchTx(), rev)
    // ensure that desired compaction is persisted
    // gofail: var compactBeforeCommitScheduledCompact struct{}
    s.b.ForceCommit()
    // gofail: var compactAfterCommitScheduledCompact struct{}
    s.revMu.Unlock()

    return nil, compactMainRev, nil
}
// compact appends the compaction job to the FIFO scheduler so it runs asynchronously
func (s *store) compact(trace *traceutil.Trace, rev, prevCompactRev int64, prevCompactionCompleted bool) <-chan struct{} {
    ch := make(chan struct{})
    j := schedule.NewJob("kvstore_compact", func(ctx context.Context) {
        if ctx.Err() != nil {
            s.compactBarrier(ctx, ch)
            return
        }
        hash, err := s.scheduleCompaction(rev, prevCompactRev)
        if err != nil {
            s.lg.Warn("Failed compaction", zap.Error(err))
            s.compactBarrier(context.TODO(), ch)
            return
        }
        // Only store the hash value if the previous hash is completed, i.e. this compaction
        // hashes every revision from last compaction. For more details, see #15919.
        if prevCompactionCompleted {
            s.hashes.Store(hash)
        } else {
            s.lg.Info("previous compaction was interrupted, skip storing compaction hash value")
        }
        close(ch)
    })

    s.fifoSched.Schedule(j)
    trace.Step("schedule compaction")
    return ch
}
The actual compaction work happens in scheduleCompaction, which:
- compacts the historical and deleted versions of each key in the treeIndex module
- deletes the obsolete historical versions from boltdb
func (s *store) scheduleCompaction(compactMainRev, prevCompactRev int64) (KeyValueHash, error) {
    totalStart := time.Now()

    // compact the historical/deleted versions of each key in treeIndex;
    // the B-tree is cloned first and traversed on the clone, which limits
    // the impact of compaction on concurrent reads and writes
    keep := s.kvindex.Compact(compactMainRev)
    indexCompactionPauseMs.Observe(float64(time.Since(totalStart) / time.Millisecond))

    totalStart = time.Now()
    defer func() { dbCompactionTotalMs.Observe(float64(time.Since(totalStart) / time.Millisecond)) }()
    keyCompactions := 0
    defer func() { dbCompactionKeysCounter.Add(float64(keyCompactions)) }()
    defer func() { dbCompactionLast.Set(float64(time.Now().Unix())) }()

    end := make([]byte, 8)
    binary.BigEndian.PutUint64(end, uint64(compactMainRev+1))

    batchNum := s.cfg.CompactionBatchLimit
    batchTicker := time.NewTicker(s.cfg.CompactionSleepInterval)
    defer batchTicker.Stop()

    h := newKVHasher(prevCompactRev, compactMainRev, keep)
    last := make([]byte, 8+1+8)

    // obsolete revisions are deleted from boltdb in batches, pausing between batches
    for {
        var rev Revision

        start := time.Now()

        tx := s.b.BatchTx()
        tx.LockOutsideApply()
        keys, values := tx.UnsafeRange(schema.Key, last, end, int64(batchNum))
        for i := range keys {
            rev = BytesToRev(keys[i])
            if _, ok := keep[rev]; !ok {
                tx.UnsafeDelete(schema.Key, keys[i])
                keyCompactions++
            }
            h.WriteKeyValue(keys[i], values[i])
        }

        if len(keys) < batchNum {
            // gofail: var compactBeforeSetFinishedCompact struct{}
            UnsafeSetFinishedCompact(tx, compactMainRev)
            tx.Unlock()
            // gofail: var compactAfterSetFinishedCompact struct{}
            hash := h.Hash()
            size, sizeInUse := s.b.Size(), s.b.SizeInUse()
            s.lg.Info(
                "finished scheduled compaction",
                zap.Int64("compact-revision", compactMainRev),
                zap.Duration("took", time.Since(totalStart)),
                zap.Uint32("hash", hash.Hash),
                zap.Int64("current-db-size-bytes", size),
                zap.String("current-db-size", humanize.Bytes(uint64(size))),
                zap.Int64("current-db-size-in-use-bytes", sizeInUse),
                zap.String("current-db-size-in-use", humanize.Bytes(uint64(sizeInUse))),
            )
            return hash, nil
        }
        tx.Unlock()
        // update last
        last = RevToBytes(Revision{Main: rev.Main, Sub: rev.Sub + 1}, last)
        // Immediately commit the compaction deletes instead of letting them accumulate in the write buffer
        // gofail: var compactBeforeCommitBatch struct{}
        s.b.ForceCommit()
        // gofail: var compactAfterCommitBatch struct{}
        dbCompactionPauseMs.Observe(float64(time.Since(start) / time.Millisecond))

        select {
        case <-batchTicker.C:
        case <-s.stopc:
            return KeyValueHash{}, fmt.Errorf("interrupted due to stop signal")
        }
    }
}
The compaction logic on the treeIndex side:
func (ti *treeIndex) Compact(rev int64) map[Revision]struct{} {
    available := make(map[Revision]struct{})
    ti.lg.Info("compact tree index", zap.Int64("revision", rev))
    ti.Lock()
    // clone the tree so the traversal does not hold the lock for its whole duration
    clone := ti.tree.Clone()
    ti.Unlock()

    // walk every keyIndex in the clone
    clone.Ascend(func(keyi *keyIndex) bool {
        // Lock is needed here to prevent modification to the keyIndex while
        // compaction is going on or revision added to empty before deletion
        ti.Lock()
        // compact this key's versions at or below rev
        keyi.compact(ti.lg, rev, available)
        if keyi.isEmpty() {
            _, ok := ti.tree.Delete(keyi)
            if !ok {
                ti.lg.Panic("failed to delete during compaction")
            }
        }
        ti.Unlock()
        return true
    })

    return available
}
}