kubernetes apiserver源码: 指数回退

196 阅读3分钟

当请求发生错误时,可能服务处于不健康状态,或者是工作负载过高或者直接挂了。这个时候要进行请求重试,重试的策略使用指数回退算法的好处是:

  • 避免固定间隔策略导致所有客户端一起重试,导致请求尖峰
  • 逐步增大重试间隔,减少服务挂掉情况下的无意义尝试

Backoff

// Backoff holds the parameters of an exponential backoff sequence. The
// Step method consumes these parameters: each call returns the delay to
// wait for and mutates Duration and Steps for the following call.
type Backoff struct {
	// The initial duration. Step overwrites this field with the
	// (un-jittered) delay to use on the next call.
	Duration time.Duration
	// Duration is multiplied by factor each iteration, if factor is not zero
	// and the limits imposed by Steps and Cap have not been reached.
	// Should not be negative.
	// The jitter does not contribute to the updates to the duration parameter.
	Factor float64
	// The sleep at each iteration is the duration plus an additional
	// amount chosen uniformly at random from the interval between
	// zero and `jitter*duration`. Jitter is only applied by Step when
	// this value is greater than zero.
	Jitter float64
	// The remaining number of iterations in which the duration
	// parameter may change (but progress can be stopped earlier by
	// hitting the cap). If not positive, the duration is not
	// changed. Used for exponential backoff in combination with
	// Factor and Cap.
	Steps int
	// A limit on revised values of the duration parameter. If a
	// multiplication by the factor parameter would make the duration
	// exceed the cap then the duration is set to the cap and the
	// steps parameter is set to zero. The cap is only enforced when
	// it is greater than zero.
	Cap time.Duration
}
  • 设置一个初始间隔Duration
  • 每次重试,使用Duration * Factor来增加重试之间的间隔时间
  • 每次重试会递减Steps,当Steps=0时,重试间隔不再增加
  • 另外当重试间隔超过最大间隔Cap时,将其设置为Cap,并把Steps置为0(此后间隔不再变化)
  • 为了防止同时启动的客户端同时发起重试,引入随机抖动: 实际重试间隔 = Duration + rand[0, 1) * Jitter * Duration,即落在[Duration, Duration * (1 + Jitter))区间内

上面后4点通过Step方法(以及它调用的Jitter函数)实现:

// Step reports how long the caller should sleep before the next attempt
// and advances the backoff state for the attempt after that.
//
// The returned value is derived from the Duration held on entry (plus
// jitter when Jitter > 0). While Steps is at least 1, the call consumes
// one step and, when Factor is non-zero, scales Duration by Factor. If
// that scaled value exceeds a positive Cap, Duration is pinned to Cap and
// Steps is zeroed so later calls keep returning the capped delay.
func (b *Backoff) Step() time.Duration {
	// The delay handed back is always based on the pre-update duration.
	current := b.Duration

	if b.Steps >= 1 {
		b.Steps--

		// Grow the stored duration for the next call.
		if b.Factor != 0 {
			next := time.Duration(float64(current) * b.Factor)
			if b.Cap > 0 && next > b.Cap {
				// Cap reached: freeze the duration from here on.
				next = b.Cap
				b.Steps = 0
			}
			b.Duration = next
		}
	}

	// Jitter only affects the returned delay, never the stored duration.
	if b.Jitter > 0 {
		return Jitter(current, b.Jitter)
	}
	return current
}

// Jitter returns a time.Duration between duration and duration + maxFactor *
// duration, picked at random.
//
// Random jitter keeps clients that started together from retrying in
// lockstep. A maxFactor that is zero or negative is replaced by the
// default of 1.0.
func Jitter(duration time.Duration, maxFactor float64) time.Duration {
	scale := maxFactor
	if scale <= 0.0 {
		scale = 1.0
	}
	// Extra delay drawn from [0, scale*duration).
	extra := rand.Float64() * scale * float64(duration)
	return duration + time.Duration(extra)
}

指数回退管理器(exponentialBackoffManagerImpl)

// exponentialBackoffManagerImpl keeps the state needed to hand out
// exponentially growing backoff timers and to reset the backoff after a
// sufficiently long gap between calls.
type exponentialBackoffManagerImpl struct {
	backoff              *Backoff
	backoffTimer         clock.Timer    // timer handed to callers; created lazily and reused via Reset
	lastBackoffStart     time.Time      // time at which the most recent backoff was computed
	initialBackoff       time.Duration  // duration the backoff is reset to
	backoffResetDuration time.Duration  // gap since lastBackoffStart beyond which the backoff is reset
	clock                clock.Clock
}

// NewExponentialBackoffManager builds a BackoffManager that produces
// jittered, exponentially growing delays starting at initBackoff and never
// exceeding maxBackoff. When more than resetDuration passes between two
// backoff computations, the delay starts over from initBackoff.
// This backoff manager is used to reduce load during upstream unhealthiness.
func NewExponentialBackoffManager(initBackoff, maxBackoff, resetDuration time.Duration, backoffFactor, jitter float64, c clock.Clock) BackoffManager {
	params := &Backoff{
		Duration: initBackoff,
		Factor:   backoffFactor,
		Jitter:   jitter,
		// Backoff.Step stops growing the duration once Steps is used up,
		// which is not wanted here, so Steps is set to a value that is
		// assumed never to be exhausted; Cap alone bounds the delay.
		Steps: math.MaxInt32,
		Cap:   maxBackoff,
	}

	// backoffTimer is left at its zero value (nil); Backoff creates it on
	// first use.
	mgr := &exponentialBackoffManagerImpl{
		backoff:              params,
		initialBackoff:       initBackoff,
		backoffResetDuration: resetDuration,
		clock:                c,
	}
	mgr.lastBackoffStart = c.Now()
	return mgr
}

乐观重置阈值是什么?

lastBackoffStart记录着上一次计算回退间隔(即上一次准备重试)的时间戳,每次调用getNextBackoff()都会更新它;而在f()还在正常运行、没有返回的期间不会调用它,所以时间戳不会更新。当Now() - lastBackoffStart > backoffResetDuration时,说明距离上次重试已经过去了很长时间,我们乐观地认为服务已经恢复、此次重试很有可能成功,因此把重试间隔重置为初始值initialBackoff(较小的时间间隔),同时把Steps重置为math.MaxInt32。


// getNextBackoff returns the delay for the upcoming backoff and records
// the current time as the start of that backoff. If the previous backoff
// started more than backoffResetDuration ago, the backoff parameters are
// first restored to their initial values.
func (b *exponentialBackoffManagerImpl) getNextBackoff() time.Duration {
	sinceLast := b.clock.Now().Sub(b.lastBackoffStart)
	if sinceLast > b.backoffResetDuration {
		// Long gap since the last backoff: start over from the small
		// initial delay with a fresh step budget.
		b.backoff.Duration = b.initialBackoff
		b.backoff.Steps = math.MaxInt32
	}

	// Every computed backoff refreshes the timestamp.
	b.lastBackoffStart = b.clock.Now()

	// Step returns the delay to wait now and advances the stored
	// Duration/Steps for the following call.
	return b.backoff.Step()
}

// Backoff implements BackoffManager.Backoff. It arms a timer with the next
// exponential backoff delay and returns it so the caller can block on it.
// The same timer is reused across calls, so the returned timer must be
// drained before Backoff() is called again.
func (b *exponentialBackoffManagerImpl) Backoff() clock.Timer {
	delay := b.getNextBackoff()

	// Reuse the existing timer when there is one.
	if b.backoffTimer != nil {
		b.backoffTimer.Reset(delay)
		return b.backoffTimer
	}

	// First call: create the timer.
	b.backoffTimer = b.clock.NewTimer(delay)
	return b.backoffTimer
}

clock.Timer

clock.Timer是Kubernetes对Go标准库time.Timer的接口封装,用于定时任务(注意:它通过C()方法取得channel,而标准库time.Timer是C字段)。下面的demo用的是标准库的time.Timer。

贴几个简单的demo在这里辅助下理解:

  • 定时炸弹
package main

import (
	"fmt"
	"time"
)
// main demonstrates a one-shot timer: it blocks until the timer fires
// after 3 seconds, prints a message and exits.
func main() {
	timer := time.NewTimer(3 * time.Second) // start the timer; this yields a *time.Timer
	defer timer.Stop()                      // no longer needed once main returns

	// A plain receive is enough to wait on a single channel; select is
	// only needed when waiting on several.
	<-timer.C
	fmt.Println("3秒爆炸")
}
  • 固定周期执行
package main

import (
	"fmt"
	"time"
)
// main demonstrates reusing a single timer to run a task at a fixed
// period: wait for the timer, do the work, then re-arm it with Reset.
func main() {
	const period = 4 * time.Second

	// Create the timer with the period that is actually used; the first
	// task therefore runs after 4 seconds, the same as every later one.
	timer := time.NewTimer(period)
	for {
		<-timer.C
		fmt.Println("每隔4秒执行任务")
		// Re-arm only after the timer has fired and its channel has been
		// drained, which is the safe way to reuse a timer with Reset.
		timer.Reset(period)
	}
}

重试定时器

那么有了上面的例子,就比较好理解exponentialBackoffManagerImpl.backoffTimer了。它的作用是:客户端阻塞在它的channel上,一旦定时器到期、从channel读到值而解除阻塞,就开始(重新)发送请求。

// BackoffUntil loops until stop channel is closed, run f every duration given by BackoffManager.
//
// If sliding is true, the period is computed after f runs. If it is false then
// period includes the runtime for f.
//
// Each iteration checks stopCh, runs f, and then waits until either stopCh
// delivers or the timer obtained from backoff.Backoff() fires.
func BackoffUntil(f func(), backoff BackoffManager, sliding bool, stopCh <-chan struct{}) {
	var t clock.Timer
	for {
		// Non-blocking stop check before doing any work in this iteration.
		select {
		case <-stopCh:
			return
		default:
		}

		// Non-sliding: start the timer before f so f's runtime counts
		// toward the period.
		if !sliding {
			t = backoff.Backoff()
		}

		func() {
			// Deferred so it runs even if f panics.
			// NOTE(review): presumably HandleCrash recovers/logs the panic — confirm against its definition.
			defer runtime.HandleCrash()
			f() // invoke the caller's function (in this article's scenario: send the request)
		}()

		// Sliding: start the timer only after f has returned.
		if sliding {
			t = backoff.Backoff()
		}

		// NOTE: b/c there is no priority selection in golang
		// it is possible for this to race, meaning we could
		// trigger t.C and stopCh, and t.C select falls through.
		// In order to mitigate we re-check stopCh at the beginning
		// of every loop to prevent extra executions of f().
		select {
		case <-stopCh:
			return
		case <-t.C(): // timer fired: stop blocking and loop around to run f again
		}
	}
}