ETCD lease 实现

397 阅读1分钟

使用方式

lease 用于控制key的有效时间,一个lease可以和多个key绑定。在lease过期之后,lease会被撤销,相关联的key也会一并被删除。 客户端的使用方式如下:

  1. 创建lease

    hudahai@hudahai-PC:~/repo/src/etcd$ ./bin/etcdctl lease grant 3600
    lease 32698678be872f04 granted with TTL(3600s)
    
  2. 将key和lease绑定

    hudahai@hudahai-PC:~/repo/src/etcd$ ./bin/etcdctl put name lisi --lease=32698678be872f04
    OK
    
  3. 撤销lease

    hudahai@hudahai-PC:~/repo/src/etcd$ ./bin/etcdctl lease revoke 32698678be872f04
    lease 32698678be872f04 revoked
    

实现逻辑

后台线程

etcd集群每个实例后台goroutine都会有如下定时操作。此定时操作均只有leader才会有效果,细节后面两部分会分别介绍。

  1. 检查过期的lease,并同步到集群
  2. 检查可更新过期时间的lease,并同步到集群
// runLoop is the lessor's background loop. Every pass first looks for expired
// leases and then for lease checkpoints that are due, and afterwards waits for
// the next 500ms tick. The loop ends as soon as le.stopC becomes readable;
// le.doneC is closed on the way out.
func (le *lessor) runLoop() {
	defer close(le.doneC)

	tick := time.NewTicker(500 * time.Millisecond)
	defer tick.Stop()

	for {
		// Look for expired leases first.
		le.revokeExpiredLeases()
		// Then submit the lease checkpoints that are due.
		le.checkpointScheduledLeases()

		select {
		case <-le.stopC:
			return
		case <-tick.C:
			// Next pass.
		}
	}
}

撤销过期lease

etcd 使用小顶堆和map来维护所有lease的过期时间。通过 LeaseQueue 的 Less 方法可以知道,堆顶是过期时间最早(最先到期)的lease。

// LeaseWithTime contains lease object with a time.
// For the lessor's lease heap, time identifies the lease expiration time.
// For the lessor's lease checkpoint heap, the time identifies the next lease checkpoint time.
//
// Fields:
//   - id is the ID of the lease this entry refers to.
//   - time is the key LeaseQueue.Less orders entries by.
//   - index is the entry's current position inside its LeaseQueue. Push and
//     Swap keep it up to date, and Pop sets it to -1 once the entry has been
//     removed from the queue.
type LeaseWithTime struct {
	id    LeaseID
	time  time.Time
	index int
}

// LeaseQueue is a queue of LeaseWithTime entries ordered by their time field
// (earliest first, see Less). Its Len/Less/Swap/Push/Pop method set has the
// shape expected by container/heap.
type LeaseQueue []*LeaseWithTime

// Len reports how many entries the queue currently holds.
func (pq LeaseQueue) Len() int {
	return len(pq)
}

// Less reports whether the entry at position i has an earlier time than the
// entry at position j, which puts the earliest entry at the top of the heap.
func (pq LeaseQueue) Less(i, j int) bool {
	left, right := pq[i], pq[j]
	return left.time.Before(right.time)
}

// Swap exchanges the entries at positions i and j and updates the index
// stored in each entry to its new position.
func (pq LeaseQueue) Swap(i, j int) {
	atI, atJ := pq[i], pq[j]
	pq[i], pq[j] = atJ, atI
	atJ.index = i
	atI.index = j
}

// Push appends x, which must be a *LeaseWithTime, to the end of the queue and
// records that position in the entry's index field.
func (pq *LeaseQueue) Push(x interface{}) {
	entry := x.(*LeaseWithTime)
	entry.index = len(*pq)
	*pq = append(*pq, entry)
}

// Pop removes the last entry of the queue and returns it as a *LeaseWithTime.
// The queue must not be empty. The returned entry's index is set to -1 to
// mark it as no longer being part of the queue.
func (pq *LeaseQueue) Pop() interface{} {
	old := *pq
	n := len(old)
	item := old[n-1]
	// Clear the vacated slot: the backing array outlives the shortened slice,
	// and a leftover pointer would keep the popped entry reachable until a
	// later Push happens to overwrite it.
	old[n-1] = nil
	// for safety
	item.index = -1
	*pq = old[0 : n-1]
	return item
}

// LeaseExpiredNotifier is a queue used to notify lessor to revoke expired lease.
// Only save one item for a lease, `Register` will update time of the corresponding lease.
//
// Fields:
//   - m maps a lease ID to a LeaseWithTime entry.
//     NOTE(review): presumably the entry that is currently in queue for that
//     lease, which is how one-item-per-lease is enforced — the methods are
//     not visible here, confirm.
//   - queue holds the entries as a LeaseQueue, i.e. ordered by time.
type LeaseExpiredNotifier struct {
	m     map[LeaseID]*LeaseWithTime
	queue LeaseQueue
}

1. 检查过期的lease

通过下面查询过期的lease的函数,我们可以知道

  • 只有leader节点的lessor才会把过期的lease同步给集群。
  • 过期的lease并不一定会在本轮被server处理。因为select中有一个default分支:如果此时expiredC无法立即写入(接收方正忙或缓冲已满),本轮就不会发送,要等500ms后的下一轮再重试。
// revokeExpiredLeases collects leases that are past their expiry and offers
// them on le.expiredC so that they can be revoked.
//
// Leases are only collected while the lessor is primary, and at most
// leaseRevokeRate/2 of them per call. The send never blocks: when le.expiredC
// cannot accept the batch right now it is dropped, and the leases are picked
// up again by a later call.
func (le *lessor) revokeExpiredLeases() {
	// rate limit
	limit := leaseRevokeRate / 2

	var expired []*Lease
	le.mu.RLock()
	// Only the leader reports expired leases to the cluster.
	if le.isPrimary() {
		expired = le.findExpiredLeases(limit)
	}
	le.mu.RUnlock()

	if len(expired) == 0 {
		return
	}

	select {
	case <-le.stopC:
	case le.expiredC <- expired:
	default:
		// the receiver of expiredC is probably busy handling
		// other stuff
		// let's try this next time after 500ms
	}
}

上述操作只是查询过期的lease,并不会撤销lease。所以此时lessor中的lease还不能被删除,而是按照如下方式把它在堆中的时间向后推迟 expiredLeaseRetryInterval。一般场景下,集群会在这个重试时间到达之前达成一致并撤销此lease;如果当前时间段过期的lease实在太多,则会在重试时间到达的时候再次尝试通知集群此lease过期。

// expireExists inspects the head of the expired-lease notifier.
//
// It returns a non-nil "l" when the head entry's lease is still present in
// le.leaseMap and the entry's time has been reached. The entry is not removed
// in that case; it is rescheduled to now + le.expiredLeaseRetryInterval so
// that the lease is looked at again should it still exist by then.
//
// "next" is true only when the head entry had no matching lease and was
// unregistered, which tells the caller that another attempt may still find an
// expired lease. In every other case "next" is false.
func (le *lessor) expireExists() (l *Lease, next bool) {
	if le.leaseExpiredNotifier.Len() == 0 {
		return nil, false
	}

	head := le.leaseExpiredNotifier.Peek()
	lease := le.leaseMap[head.id]
	if lease == nil {
		// The lease is gone already (expired or revoked), so there is nothing
		// to revoke; just drop the stale entry.
		le.leaseExpiredNotifier.Unregister() // O(log N)
		return nil, true
	}

	current := time.Now()
	if !current.Before(head.time) /* head.time: expiration time */ {
		// recheck if revoke is complete after retry interval
		head.time = current.Add(le.expiredLeaseRetryInterval)
		le.leaseExpiredNotifier.RegisterOrUpdate(head)
		return lease, false
	}

	// The earliest entry is not due yet, so nothing has expired.
	return nil, false
}

2. 同步过期的lease到集群

// revokeExpiredLeases issues a LeaseRevoke request for each of the given
// leases. The work is started through s.GoAttach, and every single request
// runs in its own goroutine that is also started through s.GoAttach, using a
// context obtained from s.authStore.WithRoot(s.ctx).
//
// At most maxPendingRevokes requests are in flight at any time. Submitting
// further requests stops as soon as s.stopping becomes readable. A failed
// revoke is logged as a warning; a successful one increments leaseExpired.
func (s *EtcdServer) revokeExpiredLeases(leases []*lease.Lease) {
	s.GoAttach(func() {
		lg := s.Logger()
		// Increases throughput of expired leases deletion process through parallelization.
		// The buffered channel acts as a counting semaphore: a slot is taken
		// before a request is started and given back once it has finished.
		slots := make(chan struct{}, maxPendingRevokes)
		for _, curLease := range leases {
			select {
			case <-s.stopping:
				return
			case slots <- struct{}{}:
			}

			// Per-iteration copy, so every goroutine revokes its own lease.
			lid := int64(curLease.ID)
			s.GoAttach(func() {
				ctx := s.authStore.WithRoot(s.ctx)
				if _, lerr := s.LeaseRevoke(ctx, &pb.LeaseRevokeRequest{ID: lid}); lerr != nil {
					lg.Warn(
						"failed to revoke lease",
						zap.String("lease-id", fmt.Sprintf("%016x", lid)),
						zap.Error(lerr),
					)
				} else {
					leaseExpired.Inc()
				}

				<-slots
			})
		}
	})
}

3. 集群达成一致的时候,会撤销lease。

// LeaseRevoke revokes the lease named by lc.ID through a.lessor. The response
// always carries a freshly built header; the error returned by the lessor, if
// any, is passed through alongside it.
func (a *applierV3backend) LeaseRevoke(lc *pb.LeaseRevokeRequest) (*pb.LeaseRevokeResponse, error) {
	id := lease.LeaseID(lc.ID)
	revokeErr := a.lessor.Revoke(id)
	resp := &pb.LeaseRevokeResponse{Header: a.newHeader()}
	return resp, revokeErr
}

同步lease过期时间

通过Grant代码可以知道,只有leader节点才会维护过期lease的小顶堆,因为新增的lease只有在当前节点是leader的时候,才会添加到leaseExpiredNotifier。

// Grant creates a lease with the given id and ttl, records it in le.leaseMap
// and persists it to le.b.
//
// It returns ErrLeaseNotFound when id is NoLease, ErrLeaseTTLTooLarge when ttl
// exceeds MaxLeaseTTL, and ErrLeaseExists when id is already present in
// le.leaseMap. A ttl below le.minLeaseTTL is raised to le.minLeaseTTL.
//
// On a primary lessor the lease is refreshed and registered with the
// expired-lease notifier and the checkpoint scheduler; on any other lessor
// forever() is called on the lease and it is not registered anywhere.
func (le *lessor) Grant(id LeaseID, ttl int64) (*Lease, error) {
	switch {
	case id == NoLease:
		return nil, ErrLeaseNotFound
	case ttl > MaxLeaseTTL:
		return nil, ErrLeaseTTLTooLarge
	}

	// TODO: when lessor is under high load, it should give out lease
	// with longer TTL to reduce renew load.
	granted := &Lease{
		ID:      id,
		ttl:     ttl,
		itemSet: make(map[LeaseItem]struct{}),
		revokec: make(chan struct{}),
	}
	if le.minLeaseTTL > granted.ttl {
		granted.ttl = le.minLeaseTTL
	}

	le.mu.Lock()
	defer le.mu.Unlock()

	if _, exists := le.leaseMap[id]; exists {
		return nil, ErrLeaseExists
	}

	if !le.isPrimary() {
		granted.forever()
	} else {
		granted.refresh(0)
	}

	le.leaseMap[id] = granted
	granted.persistTo(le.b)

	leaseTotalTTLs.Observe(float64(granted.ttl))
	leaseGranted.Inc()

	// Only the leader maintains the heap of expiring leases.
	if le.isPrimary() {
		le.leaseExpiredNotifier.RegisterOrUpdate(&LeaseWithTime{id: granted.ID, time: granted.expiry})
		le.scheduleCheckpointIfNeeded(granted)
	}

	return granted, nil
}

这种情况下如果leader挂了,一个follower被选举为新的leader。新的leader会按照如下方式构建 leaseExpiredNotifier:先刷新所有lease的过期时间,再按照lease的撤销速率(leaseRevokeRate)把扎堆到期的lease的过期时间错开。

// Promote rebuilds the expiry bookkeeping for every lease in le.leaseMap.
// Each lease is refreshed with extend and (re)registered with the
// expired-lease notifier and the checkpoint scheduler.
//
// When the number of leases reaches leaseRevokeRate, a second pass walks the
// leases in expiry order and pushes back those that crowd into the same
// window, so that no more than 3/4 of leaseRevokeRate leases fall due within
// one window (a window starts out one second wide).
//
// The whole operation runs with le.mu held for writing.
func (le *lessor) Promote(extend time.Duration) {
	le.mu.Lock()
	defer le.mu.Unlock()

	// NOTE(review): a non-nil demotec is presumably what isPrimary() checks —
	// confirm against isPrimary/Demote, which are not visible here.
	le.demotec = make(chan struct{})

	// refresh the expiries of all leases.
	for _, l := range le.leaseMap {
		l.refresh(extend)
		item := &LeaseWithTime{id: l.ID, time: l.expiry}
		le.leaseExpiredNotifier.RegisterOrUpdate(item)
		le.scheduleCheckpointIfNeeded(l)
	}

	if len(le.leaseMap) < leaseRevokeRate {
		// no possibility of lease pile-up
		return
	}

	// adjust expiries in case of overlap
	// NOTE(review): the loop below relies on leasesByExpiry sorting in
	// ascending expiry order, so that leases[0] has the smallest remaining
	// time — confirm against its definition.
	leases := le.unsafeLeases()
	sort.Sort(leasesByExpiry(leases))

	// baseWindow is the remaining time of the first lease of the current
	// window, nextWindow is the upper bound of that window, and expires counts
	// the leases seen inside it so far.
	baseWindow := leases[0].Remaining()
	nextWindow := baseWindow + time.Second
	expires := 0
	// have fewer expires than the total revoke rate so piled up leases
	// don't consume the entire revoke limit
	targetExpiresPerSecond := (3 * leaseRevokeRate) / 4
	for _, l := range leases {
		remaining := l.Remaining()
		if remaining > nextWindow {
			// This lease lies beyond the current window: open a new window
			// that starts at it and count it as that window's first expiry.
			baseWindow = remaining
			nextWindow = baseWindow + time.Second
			expires = 1
			continue
		}
		expires++
		if expires <= targetExpiresPerSecond {
			// The window is still within its budget; leave the expiry alone.
			continue
		}
		// Over budget: the delay grows in proportion to how many leases have
		// already been counted in this window.
		rateDelay := float64(time.Second) * (float64(expires) / float64(targetExpiresPerSecond))
		// If leases are extended by n seconds, leases n seconds ahead of the
		// base window should be extended by only one second.
		rateDelay -= float64(remaining - baseWindow)
		delay := time.Duration(rateDelay)
		// Widen the current window to the pushed-back boundary so that the
		// following leases are compared against it.
		nextWindow = baseWindow + delay
		// Refresh once more, this time with the extra delay on top of extend,
		// and re-register the lease with its new expiry.
		l.refresh(delay + extend)
		item := &LeaseWithTime{id: l.ID, time: l.expiry}
		le.leaseExpiredNotifier.RegisterOrUpdate(item)
		le.scheduleCheckpointIfNeeded(l)
	}
}

这种调整是有问题的。对于过期时间较短的lease还好说,因为原本过期时间是1s的话,这个时候会新续上1s。而如果时间较长比如10min,leader在lease还有1min过期的时候挂了,此lease会再过10min才会过期。

为了解决这个问题,leader的lessor后台goroutine会把TTL较长的lease的剩余过期时间定期同步到集群。待同步的lease存储在leaseCheckpointHeap中,它也是一个小顶堆,按下一次checkpoint的时间排序,入堆的方法如下。

// scheduleCheckpointIfNeeded queues a checkpoint for lease when one is
// warranted. Nothing happens when no checkpointer (le.cp) is configured, or
// when the lease's remaining TTL does not exceed le.checkpointInterval
// (compared in whole seconds). Otherwise an entry that falls due one
// le.checkpointInterval from now is pushed onto le.leaseCheckpointHeap.
func (le *lessor) scheduleCheckpointIfNeeded(lease *Lease) {
	if le.cp == nil {
		return
	}
	if lease.getRemainingTTL() <= int64(le.checkpointInterval.Seconds()) {
		return
	}

	if le.lg != nil {
		le.lg.Debug("Scheduling lease checkpoint",
			zap.Int64("leaseID", int64(lease.ID)),
			zap.Duration("intervalSeconds", le.checkpointInterval),
		)
	}

	due := &LeaseWithTime{
		id:   lease.ID,
		time: time.Now().Add(le.checkpointInterval),
	}
	heap.Push(&le.leaseCheckpointHeap, due)
}

lessor 找到需要同步最新过期时间的lease,通过le.cp同步到集群(流程和lease过期时先同步到集群、然后再revoke一样:这里也是先同步到集群,待集群达成一致之后再执行Checkpoint)。

// checkpointScheduledLeases collects the scheduled lease checkpoints that are
// due and hands them to the checkpointer le.cp in batches.
//
// Checkpoints are only collected while the lessor is primary. Each batch holds
// up to maxLeaseCheckpointBatchSize checkpoints and at most
// leaseCheckpointRate/2 batches are submitted per call; the call ends early as
// soon as a batch comes back smaller than the maximum batch size.
func (le *lessor) checkpointScheduledLeases() {
	// rate limit
	for round := 0; round < leaseCheckpointRate/2; round++ {
		var batch []*pb.LeaseCheckpoint

		le.mu.Lock()
		if le.isPrimary() {
			batch = le.findDueScheduledCheckpoints(maxLeaseCheckpointBatchSize)
		}
		le.mu.Unlock()

		size := len(batch)
		if size > 0 {
			le.cp(context.Background(), &pb.LeaseCheckpointRequest{Checkpoints: batch})
		}
		if size < maxLeaseCheckpointBatchSize {
			// A short batch means nothing further is due right now.
			return
		}
	}
}