7-开发一个完整的operator程序

130 阅读7分钟

本文是阅读胡涛大佬所著《Kubernetes Operator 进阶开发》一书时的学习笔记,强烈推荐各位阅读原书。本文仅留作个人心得,如有侵权立即删除。


1 项目设计

为了介绍 Kubernetes Operator 的开发,作者自定义了一个 Application 资源类型,并编写了自定义控制器,由它来管理对应的 Deployment 和 Service。

首先利用 kubebuilder 来初始化项目:

 kubebuilder init --domain=sunstrider.cn --repo=gitee.com/langzijiangnan/sample-operator --owner sunstrider 

然后就可以开始添加新的 API,来创建 Application 类型以及对应的控制器:

kubebuilder create api --group apps --version v1 --kind Application

然后我们将注意力放在 ./api/v1/application_types.go 中,我们可以见到对应的定义:

// ApplicationSpec defines the desired state of Application.
//
// This is the placeholder version found in application_types.go before any
// custom fields are added; it only carries the example field Foo.
type ApplicationSpec struct {
	// INSERT ADDITIONAL SPEC FIELDS - desired state of cluster
	// Important: Run "make" to regenerate code after modifying this file

	// Foo is an example field of Application. Edit application_types.go to remove/update.
	// It is serialized under the JSON key "foo" and omitted when empty.
	Foo string `json:"foo,omitempty"`
}

// ApplicationStatus defines the observed state of Application.
//
// This placeholder version declares no fields yet.
type ApplicationStatus struct {
	// INSERT ADDITIONAL STATUS FIELD - define observed state of cluster
	// Important: Run "make" to regenerate code after modifying this file
}

2 自定义 Application

在 Kubernetes 中,控制器的工作就是不断地进行调谐,让 spec(期望状态)与 status(实际状态)保持一致。接下来,我们来定义自己的 API,改造 Application:

// ApplicationSpec defines the desired state of Application.
//
// It bundles the two templates the Application controller works from: one for
// the Deployment and one for the Service.
type ApplicationSpec struct {
	// Deployment is the Deployment spec, serialized under "deployment".
	Deployment DeploymentTemplate `json:"deployment,omitempty"`
	// Service is the Service spec, serialized under "service".
	Service    ServiceTemplate    `json:"service,omitempty"`
}

// DeploymentTemplate wraps appsv1.DeploymentSpec. The spec is embedded with
// the "inline" JSON option, so its fields (replicas, selector, template, ...)
// appear directly at the level of the enclosing field rather than nested
// under an extra key.
type DeploymentTemplate struct {
	appsv1.DeploymentSpec `json:",inline"`
}

// ServiceTemplate wraps corev1.ServiceSpec. The spec is embedded with the
// "inline" JSON option, so its fields (type, ports, ...) appear directly at
// the level of the enclosing field rather than nested under an extra key.
type ServiceTemplate struct {
	corev1.ServiceSpec `json:",inline"`
}

// ApplicationStatus defines the observed state of Application.
//
// Both fields reuse the upstream status types of the objects the Application
// is built from.
type ApplicationStatus struct {
	// Workflow holds a Deployment status, serialized under "workflow".
	// NOTE(review): the tag has no omitempty, so the key is always emitted.
	Workflow appsv1.DeploymentStatus `json:"workflow"`
	// Network holds a Service status, serialized under "network".
	// NOTE(review): the tag has no omitempty, so the key is always emitted.
	Network  corev1.ServiceStatus    `json:"network"`
}

3 实现 Application Controller

Application 资源定义好之后,我们就可以开始编写控制器的核心代码。在 Kubernetes 中,控制器最重要的工作就是让 spec 与 status 保持一致。

3.1 Controller 的骨架

我们可以在 internal/controllers/application_controller.go 中看见 Reconcile 的骨架代码:

// Reconcile is the scaffolded reconcile entry point. It does no work yet and
// always reports success with an empty result.
func (r *ApplicationReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
	// The logger is looked up but deliberately discarded until real logic
	// is filled in below.
	_ = log.FromContext(ctx)

	// TODO(user): your logic here

	var done ctrl.Result
	return done, nil
}

下面是作者所编写的关于 reconcile 的代码:

// Two package-level values used by the reconciler.

// CounterReconcileApplication is a running counter that starts at zero.
var CounterReconcileApplication int64

// GenericRequeueDuration is the generic requeue delay: one minute.
const GenericRequeueDuration = time.Minute

// Reconcile handles one reconcile request for an Application.
//
// It loads the Application named by req and then reconciles the Deployment
// and the Service derived from it, in that order. An Application that cannot
// be found is treated as already deleted and ends the request without error.
// Any other failure is logged and returned as an error.
func (r *ApplicationReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
	// Pause briefly before each pass. A one-shot timer that is always stopped
	// replaces the previous unstopped time.NewTicker, and the wait ends early
	// if ctx is cancelled.
	delay := time.NewTimer(100 * time.Millisecond)
	defer delay.Stop()
	select {
	case <-delay.C:
	case <-ctx.Done():
		return ctrl.Result{}, ctx.Err()
	}

	// Named logger so the log package is not shadowed.
	logger := log.FromContext(ctx)

	// NOTE(review): this package-level counter is not synchronized; confirm
	// the controller never runs more than one Reconcile at a time.
	CounterReconcileApplication++
	logger.Info("Reconciling Application", "CounterReconcilApplication", CounterReconcileApplication)

	app := &v1.Application{}
	if err := r.Get(ctx, req.NamespacedName, app); err != nil {
		if errors.IsNotFound(err) {
			logger.Info("Application resource not found. Ignoring since object must be deleted")
			return ctrl.Result{}, nil
		}
		logger.Error(err, "Failed to get Application")
		// Return an empty Result with the error: controller-runtime ignores
		// the Result whenever the error is non-nil and requeues with backoff.
		return ctrl.Result{}, err
	}

	if result, err := r.reconcileDeployment(ctx, app); err != nil {
		logger.Error(err, "Failed to reconcile Deployment")
		return result, err
	}

	if result, err := r.reconcileService(ctx, app); err != nil {
		logger.Error(err, "Failed to reconcile Service")
		return result, err
	}

	logger.Info("Reconcile Application successfully")
	return ctrl.Result{}, nil
}

其中的逻辑非常简单易懂:先查询是否存在对应的 Application 资源,如果存在,就开始让 spec 与 status 保持一致。

3.2 reconcileDeployment

下面是 reconcileDeployment 的逻辑。在这个项目中,我们唯一需要完成的工作,就是在 Application 被创建之后,根据 Application 的内容在集群中创建对应的 Deployment 和 Service。


// reconcileDeployment makes sure a Deployment with the same name and
// namespace as app exists and mirrors its status into app.Status.Workflow.
//
// If the Deployment already exists, its status is copied into the Application
// (and persisted through the status client) unless the two are already equal.
// If it does not exist, a new Deployment is built from app.Spec.Deployment,
// labelled with app's labels, given app as its controller owner and created.
//
// On failure the error is returned with an empty Result: controller-runtime
// ignores the Result whenever the error is non-nil and requeues with backoff.
func (r *ApplicationReconciler) reconcileDeployment(ctx context.Context, app *v1.Application) (ctrl.Result, error) {
	logger := log.FromContext(ctx)

	// Look up the Deployment that shares the Application's name and namespace.
	dp := &appsv1.Deployment{}
	err := r.Get(ctx, types.NamespacedName{
		Namespace: app.Namespace,
		Name:      app.Name,
	}, dp)

	// The Deployment exists: only the status needs to be synchronized.
	if err == nil {
		logger.Info("Deployment already exists")

		// Nothing to do when the recorded status already matches.
		if reflect.DeepEqual(dp.Status, app.Status.Workflow) {
			logger.Info("Deployment is in sync")
			return ctrl.Result{}, nil
		}

		app.Status.Workflow = dp.Status
		if err := r.Status().Update(ctx, app); err != nil {
			logger.Error(err, "Failed to update Application status")
			return ctrl.Result{}, err
		}
		logger.Info("Updated Application status")
		return ctrl.Result{}, nil
	}

	// Any error other than "not found" is a real lookup failure.
	if !errors.IsNotFound(err) {
		logger.Error(err, "Failed to get Deployment")
		return ctrl.Result{}, err
	}

	// The Deployment does not exist yet: build it from the Application.
	newDp := &appsv1.Deployment{}
	newDp.SetName(app.Name)
	newDp.SetNamespace(app.Namespace)
	newDp.SetLabels(app.Labels)
	// Deep-copy the spec so the new object does not share pointers, maps or
	// slices with the Application it was derived from.
	newDp.Spec = *app.Spec.Deployment.DeploymentSpec.DeepCopy()
	// The pod template labels are always taken from the Application's labels.
	newDp.Spec.Template.SetLabels(app.Labels)

	if err := ctrl.SetControllerReference(app, newDp, r.Scheme); err != nil {
		logger.Error(err, "Failed to set controller reference")
		return ctrl.Result{}, err
	}

	if err := r.Create(ctx, newDp); err != nil {
		logger.Error(err, "Failed to create Deployment")
		return ctrl.Result{}, err
	}

	logger.Info("Created Deployment")
	return ctrl.Result{}, nil
}

3.3 reconcileService

其主要逻辑与 reconcileDeployment 基本相同,因此不再赘述。

// reconcileService makes sure a Service with the same name and namespace as
// app exists and mirrors its status into app.Status.Network.
//
// If the Service already exists, its status is copied into the Application
// (and persisted through the status client) unless the two are already equal.
// If it does not exist, a new Service is built from app.Spec.Service, labelled
// with app's labels, given app as its controller owner and created.
//
// On failure the error is returned with an empty Result: controller-runtime
// ignores the Result whenever the error is non-nil and requeues with backoff.
func (r *ApplicationReconciler) reconcileService(ctx context.Context, app *v1.Application) (ctrl.Result, error) {
	logger := log.FromContext(ctx)

	// Look up the Service that shares the Application's name and namespace.
	svc := &corev1.Service{}
	err := r.Get(ctx, types.NamespacedName{
		Namespace: app.Namespace,
		Name:      app.Name,
	}, svc)

	// The Service exists: only the status needs to be synchronized.
	if err == nil {
		logger.Info("Service already exists")

		// Nothing to do when the recorded status already matches.
		if reflect.DeepEqual(svc.Status, app.Status.Network) {
			logger.Info("Service is in sync")
			return ctrl.Result{}, nil
		}

		app.Status.Network = svc.Status
		if err := r.Status().Update(ctx, app); err != nil {
			logger.Error(err, "Failed to update Application status")
			return ctrl.Result{}, err
		}
		logger.Info("Updated Application status")
		return ctrl.Result{}, nil
	}

	// Any error other than "not found" is a real lookup failure.
	if !errors.IsNotFound(err) {
		logger.Error(err, "Failed to get Service")
		return ctrl.Result{}, err
	}

	// The Service does not exist yet: build it from the Application.
	newSvc := &corev1.Service{}
	newSvc.SetName(app.Name)
	newSvc.SetNamespace(app.Namespace)
	newSvc.SetLabels(app.Labels)
	// Deep-copy the spec so the new object does not share pointers, maps or
	// slices with the Application it was derived from.
	newSvc.Spec = *app.Spec.Service.ServiceSpec.DeepCopy()
	// The selector is always taken from the Application's labels.
	newSvc.Spec.Selector = app.Labels

	if err := ctrl.SetControllerReference(app, newSvc, r.Scheme); err != nil {
		logger.Error(err, "Failed to set controller reference")
		return ctrl.Result{}, err
	}

	if err := r.Create(ctx, newSvc); err != nil {
		logger.Error(err, "Failed to create Service")
		return ctrl.Result{}, err
	}

	logger.Info("Created Service")

	return ctrl.Result{}, nil
}

3.4 设置过滤事件

接下来我们需要进一步设置事件过滤,让 operator 只在指定的事件发生时才对集群中的 Application 进行 reconcile。这部分代码在 SetupWithManager 中:

// SetupWithManager sets up the controller with the Manager.
//
// The controller reconciles Application objects and also watches the
// Deployments and Services it owns. Each watch is filtered with predicates so
// that only the events listed below reach Reconcile.
func (r *ApplicationReconciler) SetupWithManager(mgr ctrl.Manager) error {
	setupLog := ctrl.Log.WithName("setup")
	return ctrl.NewControllerManagedBy(mgr).
		For(&v1.Application{}, builder.WithPredicates(
			predicate.Funcs{
				// A newly created Application always needs a reconcile.
				CreateFunc: func(e event.CreateEvent) bool {
					return true
				},
				// A deleted Application is only logged and needs no reconcile:
				// the owned Deployment and Service are cleaned up by Kubernetes
				// through their owner references.
				DeleteFunc: func(deleteEvent event.DeleteEvent) bool {
					setupLog.Info("Delete Application", "Application", deleteEvent.Object)
					return false
				},
				// Updates are not compared in detail; every update is reconciled.
				UpdateFunc: func(updateEvent event.UpdateEvent) bool {
					return true
				},
			})).
		Owns(&appsv1.Deployment{}, builder.WithPredicates(
			predicate.Funcs{
				// Creation of a Deployment does not trigger a reconcile.
				CreateFunc: func(e event.CreateEvent) bool {
					return false
				},
				// A deleted Deployment may be an accidental removal by a user,
				// so it is logged and reconciled.
				DeleteFunc: func(deleteEvent event.DeleteEvent) bool {
					setupLog.Info("Delete Deployment", "Deployment", deleteEvent.Object)
					return true
				},
				// An updated Deployment may be an accidental change by a user,
				// so it is reconciled.
				UpdateFunc: func(updateEvent event.UpdateEvent) bool {
					return true
				},
				// Generic events are not handled. This must be an explicit
				// function returning false: predicate.Funcs treats a nil
				// GenericFunc as "accept the event".
				GenericFunc: func(genericEvent event.GenericEvent) bool {
					return false
				},
			})).
		Owns(&corev1.Service{}, builder.WithPredicates(
			predicate.Funcs{
				// Creation of a Service does not trigger a reconcile.
				CreateFunc: func(e event.CreateEvent) bool {
					return false
				},
				// A deleted Service is logged and reconciled.
				DeleteFunc: func(deleteEvent event.DeleteEvent) bool {
					setupLog.Info("Delete Service", "Service", deleteEvent.Object)
					return true
				},
				// Every Service update is reconciled.
				UpdateFunc: func(updateEvent event.UpdateEvent) bool {
					return true
				},
			})).
		Complete(r)
}

4 设置 RBAC 权限

如何正确地设置 RBAC 权限?我们可以看到 Reconcile() 处有几行注释:

//+kubebuilder:rbac:groups=apps.sunstrider.cn,resources=applications,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=apps.sunstrider.cn,resources=applications/status,verbs=get;update;patch
//+kubebuilder:rbac:groups=apps.sunstrider.cn,resources=applications/finalizers,verbs=update

也就是表明了,Reconcile 可以对 application 资源进行一系列的操作,但是我们也可以知道,reconcile 没有权限对 deployment 以及 service 进行操作。

因此我们需要添加对 apps 组下的 deployment 以及 core 组下的 service 的权限:

//+kubebuilder:rbac:groups=apps.sunstrider.cn,resources=applications,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=apps.sunstrider.cn,resources=applications/status,verbs=get;update;patch
//+kubebuilder:rbac:groups=apps.sunstrider.cn,resources=applications/finalizers,verbs=update
//+kubebuilder:rbac:groups=apps,resources=deployments,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=apps,resources=deployments/status,verbs=get
//+kubebuilder:rbac:groups=core,resources=services,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=core,resources=services/status,verbs=get

然后就可以在 config/rbac/role.yaml 中看见下面的内容:

rules:
- apiGroups:
  - apps
  resources:
  - deployments
  verbs:
  - create
  - delete
  - get
  - list
  - patch
  - update
  - watch

这也就表示该 role 对 Kubernetes 中的 deployment 拥有对应的权限。

6 正式部署

下面的过程展示了应该如何正确部署整个项目:

# 首先构建镜像,然后推送给kind 
make docker-build IMG=operator:v0.2
kind load docker-image operator:v0.2 --name dev

# 部署 CRD
make install 

# 部署 operator
make deploy IMG=operator:v0.2

然后就可以检查是否成功了:

kubectl get pod -A 

NAMESPACE            NAME                                             READY   STATUS    RESTARTS   AGE
chapter-08-system    chapter-08-controller-manager-6674bb57df-7cmnf   2/2     Running   0          2m46s
default              nginx-sample-85996f8dbd-grwcz                    1/1     Running   0          48s
kube-system          coredns-787d4945fb-hq9df                         1/1     Running   0          16m
kube-system          coredns-787d4945fb-l6kd5                         1/1     Running   0          16m
kube-system          etcd-dev-control-plane                           1/1     Running   0          17m
kube-system          kindnet-6t2rv                                    1/1     Running   0          16m
kube-system          kube-apiserver-dev-control-plane                 1/1     Running   0          17m
kube-system          kube-controller-manager-dev-control-plane        1/1     Running   0          17m
kube-system          kube-proxy-xcw8l                                 1/1     Running   0          16m
kube-system          kube-scheduler-dev-control-plane                 1/1     Running   0          17m
local-path-storage   local-path-provisioner-57c674d644-hb876          1/1     Running   0          16m

可以发现此时 chapter-08-system 命名空间下的 controller-manager 已经正常运行了。

我们再编写一个 sample.yaml

# Sample Application. The controller creates a Deployment and a Service with
# the same name and namespace as this object.
apiVersion: apps.sunstrider.cn/v1
kind: Application
metadata:
  name: nginx-sample
  namespace: default
  # The controller copies these labels onto the Deployment, its pod template
  # and the Service, and also uses them as the Service selector.
  labels:
    app: nginx
spec:
  # Fields of the Deployment spec, written inline under "deployment".
  deployment:
    replicas: 1
    selector:
      # Keep in sync with metadata.labels, since the controller stamps those
      # labels onto the pod template.
      matchLabels:
        app: nginx
    # No metadata.labels here: the controller sets the pod template labels
    # from metadata.labels of this Application.
    template:
      spec:
        containers:
          - name: nginx
            image: nginx:1.14.2
            ports:
              - containerPort: 80
  # Fields of the Service spec, written inline under "service". No selector is
  # given because the controller fills it in from metadata.labels.
  service:
    type: NodePort
    ports:
      - port: 80
        targetPort: 80
        nodePort: 30080

说起来,这里我一直遇到一个 bug:当我在 spec 中加上 deployment.template.metadata.labels 字段之后就会报错:Error from server (BadRequest): error when creating "config/samples/apps_v1_application.yaml": Application in version "v1" cannot be handled as a Application: strict decoding error: unknown field "spec.deployment.template.metadata.labels"。但是通过 kubectl describe crd <crd-name> 得到的文档显示这个字段是没有问题的;而我将其删除之后,一切运行正常。(一个可能的原因是:controller-gen 默认不会为内嵌的 ObjectMeta 生成完整的 schema,可以尝试在生成 CRD 时开启 crd:generateEmbeddedObjectMeta=true 选项,此处有待验证。)