This document covers the following: how Flagger implements canary releases, plus the architecture of client-go and how CRD data flows through it, which is very helpful for subsequent CRD development.
Overall Architecture
The overall architecture follows the client-go pattern, extended with two timers and a sync.Map.
For background on client-go, see github.com/JaneLiuL/ku….
The timers are responsible for periodically picking up the canary resources stored in the sync.Map and running the analysis logic on them.
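To make this concrete, here is a minimal, self-contained sketch of the scheduling idea (illustrative only, not Flagger's actual code): one control-loop ticker ranges over a sync.Map of canaries and starts at most one analysis job per canary, and each job then runs on its own ticker.
package main

import (
	"fmt"
	"sync"
	"time"
)

// jobs remembers which canaries already have a running analysis ticker,
// mirroring the role of the Controller's jobs map.
var jobs = map[string]chan struct{}{}

func main() {
	var canaries sync.Map // key: "<name>.<namespace>"
	canaries.Store("podinfo.test", struct{}{})

	// Timer 1: the control loop (comparable to flaggerWindow in the Controller).
	tick := time.NewTicker(2 * time.Second)
	defer tick.Stop()
	for i := 0; i < 5; i++ {
		<-tick.C
		// Range over the sync.Map and start a job for every canary that has none yet.
		canaries.Range(func(key, _ interface{}) bool {
			name := key.(string)
			if _, exists := jobs[name]; !exists {
				done := make(chan struct{})
				jobs[name] = done
				go runAnalysisJob(name, done)
			}
			return true
		})
	}
	for _, done := range jobs {
		close(done)
	}
}

// Timer 2: one ticker per canary job (comparable to a CanaryJob).
func runAnalysisJob(name string, done <-chan struct{}) {
	ticker := time.NewTicker(time.Second)
	defer ticker.Stop()
	for {
		select {
		case <-ticker.C:
			fmt.Println("advance canary", name) // stands in for the real analysis step
		case <-done:
			return
		}
	}
}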
Data flow diagram:
Source Code Walkthrough
main.go
- Initialize the Controller
- Register the add/update/delete callbacks for the resources
- Start the controller
func main() {
...
// initialize the controller
c := controller.NewController(
kubeClient,
flaggerClient,
infos,
controlLoopInterval,
logger,
notifierClient,
canaryFactory,
routerFactory,
observerFactory,
meshProvider,
version.VERSION,
fromEnv("EVENT_WEBHOOK_URL", eventWebhook),
)
// leader election context
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
// prevents new requests when leadership is lost
cfg.Wrap(transport.ContextCanceller(ctx, fmt.Errorf("the leader is shutting down")))
// cancel leader election context on shutdown signals
go func() {
<-stopCh
cancel()
}()
// wrap controller run
runController := func() {
// start the controller
if err := c.Run(threadiness, stopCh); err != nil {
logger.Fatalf("Error running controller: %v", err)
}
}
...
}
controller.go
The NewController method
// fluxcd/flagger/pkg/controller/controller.go # NewController
func NewController(
kubeClient kubernetes.Interface,
flaggerClient clientset.Interface,
flaggerInformers Informers,
flaggerWindow time.Duration,
logger *zap.SugaredLogger,
notifier notifier.Interface,
canaryFactory *canary.Factory,
routerFactory *router.Factory,
observerFactory *observers.Factory,
meshProvider string,
version string,
eventWebhook string,
) *Controller {
...
// register the callbacks for the Canary resource
flaggerInformers.CanaryInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
// on add (e.g. a newly applied Canary), put the resource into the workqueue; the Run workers later pull items off this queue for processing -------- point 1
AddFunc: ctrl.enqueue,
// when an existing resource is updated
UpdateFunc: func(old, new interface{}) {
...
},
// when a resource is deleted
DeleteFunc: func(old interface{}) {
r, ok := checkCustomResourceType(old, logger)
if ok {
ctrl.logger.Infof("Deleting %s.%s from cache", r.Name, r.Namespace)
// on delete, remove the resource from the sync.Map; if it were not removed it would stay in the map forever and keep consuming memory
ctrl.canaries.Delete(fmt.Sprintf("%s.%s", r.Name, r.Namespace))
}
},
})
}
The Run method
// fluxcd/flagger/pkg/controller/controller.go # Run
// Run starts the K8s workers and the canary scheduler
func (c *Controller) Run(threadiness int, stopCh <-chan struct{}) error {
defer utilruntime.HandleCrash()
defer c.workqueue.ShutDown()
c.logger.Info("Starting operator")
for i := 0; i < threadiness; i++ {
go wait.Until(func() {
// important: this fetches, from the queue, the resources that were enqueued when they were created (point 1 above explains when data is written into the queue) and processes them
for c.processNextWorkItem() {
}
}, time.Second, stopCh)
}
c.logger.Info("Started operator workers")
// this is Flagger's own logic added on top of the client-go pattern: a ticker that processes the canary resources stored in the sync.Map
tickChan := time.NewTicker(c.flaggerWindow).C
for {
select {
case <-tickChan:
// important: this is where the CanaryJobs are created, the tickers that periodically process canary resources ------ point 2
c.scheduleCanaries()
case <-stopCh:
c.logger.Info("Shutting down operator workers")
return nil
}
}
}
processNextWorkItem
// fluxcd/flagger/pkg/controller/controller.go # processNextWorkItem
func (c *Controller) processNextWorkItem() bool {
// get the next item from the workqueue
obj, shutdown := c.workqueue.Get()
...
// important: syncHandler stores the object taken from the queue into the sync.Map
if err := c.syncHandler(key); err != nil {
return fmt.Errorf("error syncing '%s': %w", key, err)
}
// Finally, if no error occurs we Forget this item so it does not
// get queued again until another change happens.
c.workqueue.Forget(obj)
return nil
}(obj)
...
}
syncHandler
- Initializes the status of the canary CR
- Stores the canary into the sync.Map
// fluxcd/flagger/pkg/controller/controller.go # syncHandler
func (c *Controller) syncHandler(key string) error {
...
// set status condition for new canaries
if cd.Status.Conditions == nil {
// initialize the status in the canary CR
if err := c.setPhaseInitializing(cd); err != nil {
c.logger.Errorf("%s unable to set initializing status: %v", key, err)
return fmt.Errorf("%s initializing error: %w", key, err)
}
}
// store the canary information in the sync.Map
c.canaries.Store(fmt.Sprintf("%s.%s", cd.Name, cd.Namespace), cd)
// If opt in for revertOnDeletion add finalizer if not present
if cd.Spec.RevertOnDeletion && !hasFinalizer(cd) {
if err := c.addFinalizer(cd); err != nil {
return fmt.Errorf("unable to add finalizer to canary %s.%s: %w", cd.Name, cd.Namespace, err)
}
}
c.logger.Infof("Synced %s", key)
...
return nil
}
scheduler.go
fluxcd/flagger/pkg/controller/scheduler.go
scheduleCanaries
Iterates over the sync.Map and creates the scheduled canary jobs.
// fluxcd/flagger/pkg/controller/scheduler.go # scheduleCanaries
func (c *Controller) scheduleCanaries() {
current := make(map[string]string)
stats := make(map[string]int)
c.logger.Infof("----95 canaries----")
// range over the sync.Map
c.canaries.Range(func(key interface{}, value interface{}) bool {
cn := value.(*flaggerv1.Canary)
// format: <name>.<namespace>
name := key.(string)
current[name] = fmt.Sprintf("%s.%s", cn.Spec.TargetRef.Name, cn.Namespace)
job, exists := c.jobs[name]
// create the CanaryJob
if (exists && job.GetCanaryAnalysisInterval() != cn.GetAnalysisInterval()) || !exists {
if exists {
job.Stop()
}
newJob := CanaryJob{
Name: cn.Name,
Namespace: cn.Namespace,
// important: this callback on the CanaryJob is where the core business logic lives, see fluxcd/flagger/pkg/controller/scheduler.go # advanceCanary
function: c.advanceCanary,
done: make(chan bool),
ticker: time.NewTicker(cn.GetAnalysisInterval()),
analysisInterval: cn.GetAnalysisInterval(),
}
c.jobs[name] = newJob
// Start runs a ticker loop, see the sketch after this code block
newJob.Start()
}
...
}
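The Start method itself is not shown above; conceptually it is a goroutine that fires the callback once right away and then again on every tick until the job is stopped. A simplified sketch of that pattern, built from the fields in the struct literal above (the callback signature is an assumption, and this is not the verbatim Flagger source):
package controller

import "time"

// A minimal stand-in for the CanaryJob built in scheduleCanaries above.
type CanaryJob struct {
	Name             string
	Namespace        string
	function         func(name string, namespace string)
	done             chan bool
	ticker           *time.Ticker
	analysisInterval time.Duration
}

// Start runs the callback immediately and then on every tick until Stop is called.
func (j CanaryJob) Start() {
	go func() {
		j.function(j.Name, j.Namespace)
		for {
			select {
			case <-j.ticker.C:
				j.function(j.Name, j.Namespace)
			case <-j.done:
				return
			}
		}
	}()
}

// Stop stops the ticker and ends the goroutine.
func (j CanaryJob) Stop() {
	j.ticker.Stop()
	close(j.done)
}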
The advanceCanary method
- Performs different logic depending on the canary's current phase (a minimal sketch of this dispatch follows below)
- Looks up the concrete implementation based on the provider and targetRef.Kind defined in the canary resource; if you want to add your own implementation, simply implement the corresponding interface
- Creates the additional services; three services are involved in total: if the MAIN svc (the service the user creates that targetRef.name points to) does not exist, Flagger creates it, and it also creates the *-primary and *-canary services
- Creates the *-canary ingress; if no ingressRef is defined in the canary, the release fails, because this canary setup depends strongly on the ingress
- This method contains the concrete implementation logic of A/B testing and canary analysis
A flow diagram is on the way...
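As a feel for the first bullet, the shape of the phase dispatch looks roughly like the snippet below. The phase names are the values that actually appear in canary.status.phase, but the function itself is illustrative only and is not the body of advanceCanary:
package main

import "fmt"

// advance dispatches on the canary phase; each branch only hints at what
// the real advanceCanary does in that phase.
func advance(phase string) {
	switch phase {
	case "Initializing":
		fmt.Println("create primary/canary services, the *-canary ingress, scale down the target")
	case "Progressing":
		fmt.Println("run metric checks and shift traffic by stepWeight up to maxWeight")
	case "Succeeded", "Failed":
		fmt.Println("terminal phase: wait for the next change of the target deployment")
	default:
		fmt.Println("remaining phases are handled similarly")
	}
}

func main() {
	advance("Progressing")
}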
A provider implements the following interface:
type Interface interface {
Reconcile(canary *flaggerv1.Canary) error
SetRoutes(canary *flaggerv1.Canary, primaryWeight int, canaryWeight int, mirrored bool) error
GetRoutes(canary *flaggerv1.Canary) (primaryWeight int, canaryWeight int, mirrored bool, err error)
Finalize(canary *flaggerv1.Canary) error
}
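In other words, supporting an additional mesh or ingress provider means satisfying these four methods. Below is a minimal no-op skeleton; MyProvider is a made-up name for illustration, a real implementation would talk to its mesh/ingress API, and the flaggerv1 import refers to the Flagger v1beta1 API package. Hooking such a type into Flagger's router factory is what lets it be chosen via spec.provider.
package router

import (
	flaggerv1 "github.com/fluxcd/flagger/pkg/apis/flagger/v1beta1"
)

// MyProvider is a hypothetical router implementation.
type MyProvider struct{}

// Reconcile creates or updates the provider-specific routing objects.
func (p *MyProvider) Reconcile(canary *flaggerv1.Canary) error { return nil }

// SetRoutes applies the desired primary/canary weights (and mirroring).
func (p *MyProvider) SetRoutes(canary *flaggerv1.Canary, primaryWeight int, canaryWeight int, mirrored bool) error {
	return nil
}

// GetRoutes reports the weights currently in effect.
func (p *MyProvider) GetRoutes(canary *flaggerv1.Canary) (primaryWeight int, canaryWeight int, mirrored bool, err error) {
	return 100, 0, false, nil
}

// Finalize restores the original routing when the canary is removed.
func (p *MyProvider) Finalize(canary *flaggerv1.Canary) error { return nil }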
A target controller (one per supported targetRef kind) implements the following interface:
type Controller interface {
IsPrimaryReady(canary *flaggerv1.Canary) error
IsCanaryReady(canary *flaggerv1.Canary) (bool, error)
GetMetadata(canary *flaggerv1.Canary) (string, string, map[string]int32, error)
SyncStatus(canary *flaggerv1.Canary, status flaggerv1.CanaryStatus) error
SetStatusFailedChecks(canary *flaggerv1.Canary, val int) error
SetStatusWeight(canary *flaggerv1.Canary, val int) error
SetStatusIterations(canary *flaggerv1.Canary, val int) error
SetStatusPhase(canary *flaggerv1.Canary, phase flaggerv1.CanaryPhase) error
Initialize(canary *flaggerv1.Canary) error
Promote(canary *flaggerv1.Canary) error
HasTargetChanged(canary *flaggerv1.Canary) (bool, error)
HaveDependenciesChanged(canary *flaggerv1.Canary) (bool, error)
ScaleToZero(canary *flaggerv1.Canary) error
ScaleFromZero(canary *flaggerv1.Canary) error
Finalize(canary *flaggerv1.Canary) error
}
Canary Flow Walkthrough
Following the official Flagger example, I deployed it and, combined with the code logic, put together the diagram below to better understand the canary release; the official YAML files are attached at the end for reference.
Example
This example contains three YAML files; when verifying, please apply them in order. Before verifying, make sure Flagger is already installed in your Kubernetes cluster.
podinfo.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: podinfo
namespace: test
labels:
app: podinfo
spec:
replicas: 2
minReadySeconds: 5
revisionHistoryLimit: 5
progressDeadlineSeconds: 60
strategy:
rollingUpdate:
maxUnavailable: 1
type: RollingUpdate
selector:
matchLabels:
app: podinfo
template:
metadata:
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "9797"
labels:
app: podinfo
spec:
containers:
- name: podinfod
image: stefanprodan/podinfo:3.1.1
imagePullPolicy: IfNotPresent
ports:
- name: http
containerPort: 9898
protocol: TCP
- name: http-metrics
containerPort: 9797
protocol: TCP
- name: grpc
containerPort: 9999
protocol: TCP
command:
- ./podinfo
- --port=9898
- --port-metrics=9797
- --grpc-port=9999
- --grpc-service-name=podinfo
- --level=info
- --random-delay=false
- --random-error=false
env:
- name: PODINFO_UI_COLOR
value: "#34577c"
livenessProbe:
exec:
command:
- podcli
- check
- http
- localhost:9898/healthz
initialDelaySeconds: 5
timeoutSeconds: 5
readinessProbe:
exec:
command:
- podcli
- check
- http
- localhost:9898/readyz
initialDelaySeconds: 5
timeoutSeconds: 5
resources:
limits:
cpu: 2000m
memory: 512Mi
requests:
cpu: 100m
memory: 64Mi
ingress-podinfo.yaml
apiVersion: networking.k8s.io/v1beta1
kind: Ingress
metadata:
name: podinfo
namespace: test
labels:
app: podinfo
annotations:
kubernetes.io/ingress.class: "nginx"
spec:
rules:
- host: podinfo.test.jd.com
http:
paths:
- backend:
serviceName: podinfo
servicePort: 80
canary.yaml
This file is trimmed down from the official example in order to verify the main flow more easily.
apiVersion: flagger.app/v1beta1
kind: Canary
metadata:
name: podinfo
namespace: test
spec:
provider: nginx
# deployment reference
targetRef:
apiVersion: apps/v1
kind: Deployment
name: podinfo
# ingress reference
ingressRef:
apiVersion: networking.k8s.io/v1beta1
kind: Ingress
name: podinfo
# HPA reference (optional)
# autoscalerRef:
# apiVersion: autoscaling/v2beta2
# kind: HorizontalPodAutoscaler
# name: podinfo
# the maximum time in seconds for the canary deployment
# to make progress before it is rolled back (default 600s)
progressDeadlineSeconds: 60
service:
# ClusterIP port number
port: 80
# container port number or name
targetPort: 9898
analysis:
# schedule interval (default 60s)
interval: 10s
# max number of failed metric checks before rollback
threshold: 10
# max traffic percentage routed to canary
# percentage (0-100)
maxWeight: 100
# canary increment step
# percentage (0-100)
stepWeight: 20
# NGINX Prometheus checks
# metrics:
# - name: request-success-rate
#   # minimum req success rate (non 5xx responses)
#   # percentage (0-100)
#   thresholdRange:
#     min: 99
#   interval: 1m
# testing (optional)
# webhooks:
# - name: acceptance-test
# type: pre-rollout
# url: http://flagger-loadtester.test/
# timeout: 30s
# metadata:
# type: bash
# cmd: "curl -sd 'test' http://podinfo-canary/token | grep token"
# - name: load-test
# url: http://flagger-loadtester.test/
# timeout: 5s
# metadata:
# cmd: "hey -z 1m -q 10 -c 2 http://podinfo.test.jd.com/"
Main Logic
- How does Flagger detect that the user's deployment has a new version?
The scheduled job created via newJob invokes the callback in fluxcd/flagger/pkg/controller/scheduler.go; it checks whether the user's deployment has changed, and if it has, it moves canary.status from its terminal phase to CanaryPhaseProgressing.
- There are two important sets of interfaces
1. Every targetRef kind supported by the canary must implement the interface in fluxcd/flagger/pkg/canary/controller.go
2. Every provider supported by the canary must implement the interface in fluxcd/flagger/pkg/router/router.go
- How is the status initialized after applying canary.yaml?
The initialization happens in fluxcd/flagger/pkg/controller/controller.go # syncHandler, which also puts the canary into the sync.Map so that the scheduled job can fetch it from there.
- How does client-go show up in this project?
This project also uses client-go, but, similarly to tektoncd, it implements the interfaces directly and does the handling itself; the benefit of this approach is more flexibility and tighter control.
- The two fields LastAppliedSpec & LastPromotedSpec in the canary CRD's status
Both store a hash of deployment.spec.template (the first sketch after this list illustrates the idea).
LastAppliedSpec is assigned during synchronization and records the deployment.spec.template currently in use; it is the value compared when deciding whether the deployment has changed, see fluxcd/flagger/pkg/canary/status.go # syncCanaryStatus.
LastPromotedSpec is set when the canary reaches a terminal phase (Initialized / Succeeded); at that point the value of LastAppliedSpec is copied into it.
- How is the traffic split implemented?
First, the provider in use has to support it.
For example, the NGINX ingress supports steering traffic through annotations.
fluxcd/flagger/pkg/router/ingress.go # SetRoutes sets the values of these annotations on the ingress (the second sketch after this list illustrates the idea).
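First sketch: one way such a template hash could be computed. This is only an illustration of the concept, not necessarily the exact hashing scheme Flagger uses:
package main

import (
	"encoding/json"
	"fmt"
	"hash/fnv"

	appsv1 "k8s.io/api/apps/v1"
	corev1 "k8s.io/api/core/v1"
)

// computeTemplateHash hashes deployment.spec.template; comparing the hash
// against the stored LastAppliedSpec tells whether the target has changed.
func computeTemplateHash(dep *appsv1.Deployment) (string, error) {
	b, err := json.Marshal(dep.Spec.Template)
	if err != nil {
		return "", err
	}
	h := fnv.New32a()
	h.Write(b)
	return fmt.Sprintf("%x", h.Sum32()), nil
}

func main() {
	dep := &appsv1.Deployment{}
	dep.Spec.Template.Spec.Containers = []corev1.Container{
		{Name: "podinfod", Image: "stefanprodan/podinfo:3.1.1"},
	}
	hash, _ := computeTemplateHash(dep)
	fmt.Println("template hash:", hash)
}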
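Second sketch: roughly what setting the weight through annotations looks like. The annotation keys are the standard NGINX ingress-controller canary annotations; the surrounding code is illustrative and is not Flagger's actual SetRoutes implementation:
package main

import (
	"context"
	"fmt"
	"strconv"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/clientcmd"
)

// setCanaryWeight updates the canary-weight annotation on the *-canary ingress;
// the NGINX ingress controller then routes that percentage of traffic to the canary.
func setCanaryWeight(client kubernetes.Interface, namespace, name string, weight int) error {
	// networking.k8s.io/v1beta1 matches the manifests above; on newer clusters use NetworkingV1().
	ing, err := client.NetworkingV1beta1().Ingresses(namespace).Get(context.TODO(), name, metav1.GetOptions{})
	if err != nil {
		return err
	}
	if ing.Annotations == nil {
		ing.Annotations = map[string]string{}
	}
	ing.Annotations["nginx.ingress.kubernetes.io/canary"] = "true"
	ing.Annotations["nginx.ingress.kubernetes.io/canary-weight"] = strconv.Itoa(weight)
	_, err = client.NetworkingV1beta1().Ingresses(namespace).Update(context.TODO(), ing, metav1.UpdateOptions{})
	return err
}

func main() {
	cfg, err := clientcmd.BuildConfigFromFlags("", clientcmd.RecommendedHomeFile)
	if err != nil {
		panic(err)
	}
	client := kubernetes.NewForConfigOrDie(cfg)
	// e.g. route 20% of the traffic to the "podinfo-canary" ingress in namespace "test"
	if err := setCanaryWeight(client, "test", "podinfo-canary", 20); err != nil {
		panic(err)
	}
	fmt.Println("canary weight updated")
}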