How flagger implements canary releases


This document covers how flagger implements canary releases. Along the way it also shows the client-go architecture and how CRD data flows through the controller, which is a big help for later CRD development.

Overall Architecture

The overall architecture follows the client-go pattern, extended with two timers and a sync.Map.

For background on client-go, see github.com/JaneLiuL/ku….

The timers' role is to periodically pick up the canary tasks and process them, as the toy sketch below illustrates.
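
To make that concrete, here is a toy sketch (not flagger's code) of the pattern: informer callbacks fill a sync.Map, and a coarse ticker walks it to schedule work.

// toy sketch of the sync.Map + ticker pattern described above
package main

import (
	"fmt"
	"sync"
	"time"
)

func main() {
	var canaries sync.Map
	// in flagger, syncHandler performs this Store, driven by the informer
	canaries.Store("podinfo.test", "canary spec placeholder")

	window := time.NewTicker(200 * time.Millisecond) // plays the role of flaggerWindow
	defer window.Stop()

	for i := 0; i < 3; i++ {
		<-window.C
		canaries.Range(func(key, value interface{}) bool {
			// in flagger, this is where the per-canary CanaryJobs get created
			fmt.Println("scheduling", key)
			return true
		})
	}
}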

Data flow diagram:

(Figure: flagger canary data flow)

Source Code Walkthrough

main.go

  • Initialize the Controller
    • Register the add/update/delete callbacks for the watched resources
  • Start it
func main() {
  ...
  // initialize the controller
  c := controller.NewController(
		kubeClient,
		flaggerClient,
		infos,
		controlLoopInterval,
		logger,
		notifierClient,
		canaryFactory,
		routerFactory,
		observerFactory,
		meshProvider,
		version.VERSION,
		fromEnv("EVENT_WEBHOOK_URL", eventWebhook),
	)

	// leader election context
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	// prevents new requests when leadership is lost
	cfg.Wrap(transport.ContextCanceller(ctx, fmt.Errorf("the leader is shutting down")))

	// cancel leader election context on shutdown signals
	go func() {
		<-stopCh
		cancel()
	}()

	// wrap controller run
	runController := func() {
    // start the control loops
		if err := c.Run(threadiness, stopCh); err != nil {
			logger.Fatalf("Error running controller: %v", err)
		}
	}
  ...
}
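
The elided remainder of main hands runController to Kubernetes leader election when it is enabled, so with multiple flagger replicas only the elected leader drives canaries; the ContextCanceller wrapper above then blocks new API requests once leadership is lost.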

controller.go

The NewController method
// fluxcd/flagger/pkg/controller/controller.go # NewController
func NewController(
	kubeClient kubernetes.Interface,
	flaggerClient clientset.Interface,
	flaggerInformers Informers,
	flaggerWindow time.Duration,
	logger *zap.SugaredLogger,
	notifier notifier.Interface,
	canaryFactory *canary.Factory,
	routerFactory *router.Factory,
	observerFactory *observers.Factory,
	meshProvider string,
	version string,
	eventWebhook string,
) *Controller {
...
// register event callbacks for Canary resources
  flaggerInformers.CanaryInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
    // on add (a newly applied Canary): enqueue it; the Run method below consumes this queue -------- point #1
		AddFunc: ctrl.enqueue,
    // when an existing resource is updated
		UpdateFunc: func(old, new interface{}) {
			...
		},
    // when a resource is deleted
		DeleteFunc: func(old interface{}) {
			r, ok := checkCustomResourceType(old, logger)
			if ok {
				ctrl.logger.Infof("Deleting %s.%s from cache", r.Name, r.Namespace)
        // on delete, the canary is removed from the sync.Map; without this, the entry would linger in the map and leak memory
				ctrl.canaries.Delete(fmt.Sprintf("%s.%s", r.Name, r.Namespace))
			}
		},
	})

}

The Run method
// fluxcd/flagger/pkg/controller/controller.go # Run
// Run starts the K8s workers and the canary scheduler
func (c *Controller) Run(threadiness int, stopCh <-chan struct{}) error {
	defer utilruntime.HandleCrash()
	defer c.workqueue.ShutDown()

	c.logger.Info("Starting operator")

	for i := 0; i < threadiness; i++ {
		go wait.Until(func() {
      // important: drains the workqueue filled by the add handler; point #1 above explains when data is written into the queue
			for c.processNextWorkItem() {
			}
		}, time.Second, stopCh)
	}

	c.logger.Info("Started operator workers")

  // flagger's own addition on top of the client-go pattern: a ticker that periodically processes the Canary resources stored in the sync.Map
	tickChan := time.NewTicker(c.flaggerWindow).C
	for {
		select {
		case <-tickChan:
      // important: scheduleCanaries creates the CanaryJobs, the per-canary timers ------ point #2
			c.scheduleCanaries()
		case <-stopCh:
			c.logger.Info("Shutting down operator workers")
			return nil
		}
	}
}
processNextWorkItem
// fluxcd/flagger/pkg/controller/controller.go # processNextWorkItem
func (c *Controller) processNextWorkItem() bool {
	// fetch the next key from the workqueue
	obj, shutdown := c.workqueue.Get()
	if shutdown {
		return false
	}

	// an anonymous func wraps the processing so workqueue.Done is always called
	err := func(obj interface{}) error {
		defer c.workqueue.Done(obj)
		...
	
		// important: syncHandler stores the dequeued canary into the sync.Map
		if err := c.syncHandler(key); err != nil {
			return fmt.Errorf("error syncing '%s': %w", key, err)
		}
		// Finally, if no error occurs we Forget this item so it does not
		// get queued again until another change happens.
		c.workqueue.Forget(obj)
		return nil
	}(obj)

...
}
syncHandler
  • Initializes the status of a new Canary CR
  • Stores the canary into the sync.Map
// fluxcd/flagger/pkg/controller/controller.go # syncHandler
func (c *Controller) syncHandler(key string) error {
...
	// set status condition for new canaries
	if cd.Status.Conditions == nil {
		// initialize the status of the Canary CR
		if err := c.setPhaseInitializing(cd); err != nil {
			c.logger.Errorf("%s unable to set initializing status: %v", key, err)
			return fmt.Errorf("%s initializing error: %w", key, err)
		}
	}

	// store the canary in the sync.Map keyed by <name>.<namespace>
	c.canaries.Store(fmt.Sprintf("%s.%s", cd.Name, cd.Namespace), cd)

	// If opt in for revertOnDeletion add finalizer if not present
	if cd.Spec.RevertOnDeletion && !hasFinalizer(cd) {
		if err := c.addFinalizer(cd); err != nil {
			return fmt.Errorf("unable to add finalizer to canary %s.%s: %w", cd.Name, cd.Namespace, err)
		}

	}
	c.logger.Infof("Synced %s", key)
...
	return nil
}
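
One detail worth noting when reading the code: the workqueue key (produced by client-go's usual cache.MetaNamespaceKeyFunc) has the form <namespace>/<name>, while the sync.Map, as the Store call above shows, is keyed as <name>.<namespace>.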

scheduler.go

fluxcd/flagger/pkg/controller/scheduler.go

scheduleCanaries

Iterates over the sync.Map and creates the timed jobs.

// fluxcd/flagger/pkg/controller/scheduler.go # scheduleCanaries
func (c *Controller) scheduleCanaries() {
	current := make(map[string]string)
	stats := make(map[string]int)
	c.logger.Infof("----95 canaries----")

  // iterate over the sync.Map
	c.canaries.Range(func(key interface{}, value interface{}) bool {
		cn := value.(*flaggerv1.Canary)

		// format: <name>.<namespace>
		name := key.(string)

		current[name] = fmt.Sprintf("%s.%s", cn.Spec.TargetRef.Name, cn.Namespace)

		job, exists := c.jobs[name]
		// create (or recreate) the CanaryJob when none exists or the analysis interval changed
		if (exists && job.GetCanaryAnalysisInterval() != cn.GetAnalysisInterval()) || !exists {
			if exists {
				job.Stop()
			}

			newJob := CanaryJob{
				Name:             cn.Name,
				Namespace:        cn.Namespace,
        // important: advanceCanary is the CanaryJob's callback and holds the core business logic (fluxcd/flagger/pkg/controller/scheduler.go # advanceCanary)
				function:         c.advanceCanary,
				done:             make(chan bool),
				ticker:           time.NewTicker(cn.GetAnalysisInterval()),
				analysisInterval: cn.GetAnalysisInterval(),
			}

			c.jobs[name] = newJob
      // Start runs a ticker goroutine (see the sketch after this block)
			newJob.Start()
		}

		...
}
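
For reference, Start is essentially a per-canary ticker loop. Below is a minimal sketch consistent with the fields shown above; flagger's real implementation lives in pkg/controller/job.go, and its callback signature may differ slightly.

package controller

import "time"

// CanaryJob drives one canary: Start fires the callback immediately and then
// on every analysisInterval tick, until Stop closes the done channel.
type CanaryJob struct {
	Name             string
	Namespace        string
	function         func(name string, namespace string)
	done             chan bool
	ticker           *time.Ticker
	analysisInterval time.Duration
}

func (j CanaryJob) Start() {
	go func() {
		// run the analysis once right away, then on every tick
		j.function(j.Name, j.Namespace)
		for {
			select {
			case <-j.ticker.C:
				j.function(j.Name, j.Namespace)
			case <-j.done:
				return
			}
		}
	}()
}

func (j CanaryJob) Stop() {
	close(j.done)
	j.ticker.Stop()
}
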
The advanceCanary method
  • Handles the canary according to its current phase
  • Resolves the concrete implementation from the provider and targetRef.Kind declared in the Canary resource; to add your own implementation, implement the corresponding interface (a skeleton follows the Provider interface below)
  • Creates the extra services. Three services are involved in total: if the main svc (the service the user would normally create for targetRef.name) does not exist, flagger creates it, along with the *-primary and *-canary services
  • Creates the *-canary ingress; if the Canary defines no ingressRef, the release fails, because this flow depends strongly on the ingress
  • Contains the concrete A/B and canary rollout logic

Flow diagram still to come…

Providers implement the following interface:

type Interface interface {
	Reconcile(canary *flaggerv1.Canary) error
	SetRoutes(canary *flaggerv1.Canary, primaryWeight int, canaryWeight int, mirrored bool) error
	GetRoutes(canary *flaggerv1.Canary) (primaryWeight int, canaryWeight int, mirrored bool, err error)
	Finalize(canary *flaggerv1.Canary) error
}
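
To illustrate the extension point mentioned under advanceCanary, a skeleton of a custom provider could look like the sketch below; myRouter is a made-up name, and a real router (see pkg/router/ingress.go) would manipulate its traffic resources inside these methods.

package router

import (
	flaggerv1 "github.com/fluxcd/flagger/pkg/apis/flagger/v1beta1"
)

// myRouter is a placeholder custom provider
type myRouter struct{}

// Reconcile creates or updates the routing objects for the canary
func (r *myRouter) Reconcile(canary *flaggerv1.Canary) error { return nil }

// SetRoutes shifts the traffic split between primary and canary
func (r *myRouter) SetRoutes(canary *flaggerv1.Canary, primaryWeight int, canaryWeight int, mirrored bool) error {
	return nil
}

// GetRoutes reports the current traffic split
func (r *myRouter) GetRoutes(canary *flaggerv1.Canary) (primaryWeight int, canaryWeight int, mirrored bool, err error) {
	return 100, 0, false, nil
}

// Finalize restores the original routing before the canary is deleted
func (r *myRouter) Finalize(canary *flaggerv1.Canary) error { return nil }

// compile-time check that myRouter satisfies the Interface above
var _ Interface = &myRouter{}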

Controllers implement the following interface:

type Controller interface {
	IsPrimaryReady(canary *flaggerv1.Canary) error
	IsCanaryReady(canary *flaggerv1.Canary) (bool, error)
	GetMetadata(canary *flaggerv1.Canary) (string, string, map[string]int32, error)
	SyncStatus(canary *flaggerv1.Canary, status flaggerv1.CanaryStatus) error
	SetStatusFailedChecks(canary *flaggerv1.Canary, val int) error
	SetStatusWeight(canary *flaggerv1.Canary, val int) error
	SetStatusIterations(canary *flaggerv1.Canary, val int) error
	SetStatusPhase(canary *flaggerv1.Canary, phase flaggerv1.CanaryPhase) error
	Initialize(canary *flaggerv1.Canary) error
	Promote(canary *flaggerv1.Canary) error
	HasTargetChanged(canary *flaggerv1.Canary) (bool, error)
	HaveDependenciesChanged(canary *flaggerv1.Canary) (bool, error)
	ScaleToZero(canary *flaggerv1.Canary) error
	ScaleFromZero(canary *flaggerv1.Canary) error
	Finalize(canary *flaggerv1.Canary) error
}

Canary Flow Walkthrough

Following flagger's official example, I deployed it and, combining that with the code logic, drew the diagram below, which makes the canary release easier to understand. The official YAML is attached at the end for reference.

(Figure: canary flow diagram)

Example

This example contains three YAML files; apply them in order. Before verifying, make sure flagger is already installed in your Kubernetes cluster.

podinfo.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: podinfo
  namespace: test
  labels:
    app: podinfo
spec:
  replicas: 2
  minReadySeconds: 5
  revisionHistoryLimit: 5
  progressDeadlineSeconds: 60
  strategy:
    rollingUpdate:
      maxUnavailable: 1
    type: RollingUpdate
  selector:
    matchLabels:
      app: podinfo
  template:
    metadata:
      annotations:
        prometheus.io/scrape: "true"
        prometheus.io/port: "9797"
      labels:
        app: podinfo
    spec:
      containers:
      - name: podinfod
        image: stefanprodan/podinfo:3.1.1
        imagePullPolicy: IfNotPresent
        ports:
        - name: http
          containerPort: 9898
          protocol: TCP
        - name: http-metrics
          containerPort: 9797
          protocol: TCP
        - name: grpc
          containerPort: 9999
          protocol: TCP
        command:
        - ./podinfo
        - --port=9898
        - --port-metrics=9797
        - --grpc-port=9999
        - --grpc-service-name=podinfo
        - --level=info
        - --random-delay=false
        - --random-error=false
        env:
        - name: PODINFO_UI_COLOR
          value: "#34577c"
        livenessProbe:
          exec:
            command:
            - podcli
            - check
            - http
            - localhost:9898/healthz
          initialDelaySeconds: 5
          timeoutSeconds: 5
        readinessProbe:
          exec:
            command:
            - podcli
            - check
            - http
            - localhost:9898/readyz
          initialDelaySeconds: 5
          timeoutSeconds: 5
        resources:
          limits:
            cpu: 2000m
            memory: 512Mi
          requests:
            cpu: 100m
            memory: 64Mi

ingress-podinfo.yaml
apiVersion: networking.k8s.io/v1beta1
kind: Ingress
metadata:
  name: podinfo
  namespace: test
  labels:
    app: podinfo
  annotations:
    kubernetes.io/ingress.class: "nginx"
spec:
  rules:
    - host: podinfo.test.jd.com
      http:
        paths:
          - backend:
              serviceName: podinfo
              servicePort: 80

canary.yaml

This file is trimmed down from the official example to make verifying the main flow easier.

apiVersion: flagger.app/v1beta1
kind: Canary
metadata:
  name: podinfo
  namespace: test
spec:
  provider: nginx
  # deployment reference
  targetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: podinfo
  # ingress reference
  ingressRef:
    apiVersion: networking.k8s.io/v1beta1
    kind: Ingress
    name: podinfo
  # HPA reference (optional)
  # autoscalerRef:
  #  apiVersion: autoscaling/v2beta2
  #  kind: HorizontalPodAutoscaler
  #  name: podinfo
  # the maximum time in seconds for the canary deployment
  # to make progress before it is rolled back (default 600s)
  progressDeadlineSeconds: 60
  service:
    # ClusterIP port number
    port: 80
    # container port number or name
    targetPort: 9898
  analysis:
    # schedule interval (default 60s)
    interval: 10s
    # max number of failed metric checks before rollback
    threshold: 10
    # max traffic percentage routed to canary
    # percentage (0-100)
    maxWeight: 100
    # canary increment step
    # percentage (0-100)
    stepWeight: 20
    # NGINX Prometheus checks
    # metrics:
    # - name: request-success-rate
    #   # minimum req success rate (non 5xx responses)
    #   # percentage (0-100)
    #   thresholdRange:
    #     min: 99
    #   interval: 1m
    # testing (optional)
    # webhooks:
    #   - name: acceptance-test
    #     type: pre-rollout
    #     url: http://flagger-loadtester.test/
    #     timeout: 30s
    #     metadata:
    #       type: bash
    #       cmd: "curl -sd 'test' http://podinfo-canary/token | grep token"
    #   - name: load-test
    #     url: http://flagger-loadtester.test/
    #     timeout: 5s
    #     metadata:
    #       cmd: "hey -z 1m -q 10 -c 2 http://podinfo.test.jd.com/"
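
With these analysis settings, a healthy rollout shifts traffic to the canary in stepWeight increments of 20% on every 10s interval, i.e. 20% → 40% → 60% → 80% → 100%, after which the canary is promoted; threshold: 10 means ten failed metric checks trigger a rollback instead.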

Key Logic

  • How a new revision of the user's deployment is detected

The newJob timer periodically invokes the callback in fluxcd/flagger/pkg/controller/scheduler.go (advanceCanary), which checks whether the user's deployment has changed; if it has, canary.status is moved from its terminal phase to CanaryPhaseProgressing.

  • Two important groups of interfaces

1. Every targetRef kind that canary supports must implement the interface in fluxcd/flagger/pkg/canary/controller.go

2. Every provider that canary supports must implement the interface in fluxcd/flagger/pkg/router/router.go

  • How the status gets initialized after applying canary.yaml

The initialization happens in fluxcd/flagger/pkg/controller/controller.go # syncHandler, which also puts the canary into the sync.Map so the timed jobs can fetch it later.

  • How client-go shows up in this project

The project does use client-go, but, much like tektoncd, it implements the interfaces directly with its own code instead of relying purely on generated scaffolding; the benefit is greater flexibility and tighter control.

  • The two fields LastAppliedSpec & LastPromotedSpec in the canary status

Both hold a hash of deployment.spec.template.

LastAppliedSpec is assigned during sync and records the deployment.spec.template currently in use; it is consulted when checking whether the deployment has changed (fluxcd/flagger/pkg/canary/status.go # syncCanaryStatus).

When the canary reaches a terminal phase (initialized / succeeded), the value of LastAppliedSpec is copied into LastPromotedSpec.
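
A minimal sketch of the idea (flagger's real hashing lives under pkg/canary; this is only an illustration):

package main

import (
	"fmt"
	"hash/fnv"
)

// templateHash illustrates the LastAppliedSpec / LastPromotedSpec mechanism:
// hash deployment.spec.template, keep the hash in canary.status, and on each
// tick compare the live template's hash against LastAppliedSpec to decide
// whether a new revision needs a canary analysis.
func templateHash(template interface{}) string {
	h := fnv.New32a()
	fmt.Fprintf(h, "%v", template)
	return fmt.Sprintf("%x", h.Sum32())
}

func main() {
	lastApplied := templateHash("podinfo:3.1.1")
	current := templateHash("podinfo:3.1.2") // the user bumped the image
	fmt.Println("deployment changed:", current != lastApplied)
}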

  • How the traffic split is achieved

First, the chosen provider has to support it.

For example, the nginx ingress supports steering traffic by setting annotations.

fluxcd/flagger/pkg/router/ingress.go # SetRoutes is what writes those annotation values on the ingress.
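
As a minimal sketch of that idea (not flagger's exact code), traffic weighting on the nginx provider boils down to writing the standard ingress-nginx canary annotations onto the generated *-canary ingress:

package main

import (
	"fmt"
	"strconv"
)

// makeCanaryAnnotations shows the annotations that steer traffic on the
// nginx provider; flagger's SetRoutes updates canary-weight on each step
func makeCanaryAnnotations(canaryWeight int) map[string]string {
	return map[string]string{
		"nginx.ingress.kubernetes.io/canary":        "true",
		"nginx.ingress.kubernetes.io/canary-weight": strconv.Itoa(canaryWeight),
	}
}

func main() {
	// e.g. the second analysis step with stepWeight: 20
	fmt.Println(makeCanaryAnnotations(40))
}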