使用 client-go RESTClient 操作 CRD 时遇到的 concurrent map read and map write 问题

337 阅读3分钟

背景: 在云原生 operator 开发中,会有大量操作 CRD 的需求。最近发现 operator pod 偶尔会随机重启,查看日志发现有 concurrent map read and map write 报错,这个 fatal error 是无法 recover 的,最后通过排查代码定位到了问题。以下是复现该问题的代码:


package main

import (
   "context"
   "fmt"
   "github.com/fsp1yjl/crd-demo/crd"   //这里的目录下放crd的结构定义
   metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
   "k8s.io/apimachinery/pkg/runtime"
   "k8s.io/apimachinery/pkg/runtime/schema"
   "k8s.io/apimachinery/pkg/runtime/serializer"
   "k8s.io/client-go/kubernetes/scheme"
   "k8s.io/client-go/rest"
   "k8s.io/client-go/tools/clientcmd"
   "sync"
   "time"
)

// GroupVersion identifies the API group and version of the demo CRD.
var GroupVersion = schema.GroupVersion{Group: "crd-demo.io", Version: "v1"}

// SchemeBuilder is the scheme builder for this group; it is seeded with
// addKnownTypes.
var SchemeBuilder = runtime.NewSchemeBuilder(addKnownTypes)

// AddToScheme registers this API group and version in the scheme it is given
// by running the functions held in SchemeBuilder.
var AddToScheme = SchemeBuilder.AddToScheme

// addKnownTypes registers the HardDisk and HardDiskList kinds under
// GroupVersion in the given scheme, then calls metav1.AddToGroupVersion for
// the same group/version. It always returns nil.
func addKnownTypes(scheme *runtime.Scheme) error {
	crdKinds := []runtime.Object{
		&crd.HardDisk{},
		&crd.HardDiskList{},
	}
	scheme.AddKnownTypes(GroupVersion, crdKinds...)
	metav1.AddToGroupVersion(scheme, GroupVersion)
	return nil
}


// GetK8sClient simulates the first scenario that triggers the map fatal error.
//
// Every call registers the CRD types into the shared global scheme.Scheme, so
// calling this function from several goroutines at once provokes the
// "concurrent map read and map write" runtime error on scheme.Scheme.
//
// The client config is loaded from a hard-coded kubeconfig path; if that
// fails, the in-cluster config is used instead.
func GetK8sClient() (*rest.RESTClient, error) {
	// A registration failure is only reported, not returned.
	if regErr := AddToScheme(scheme.Scheme); regErr != nil {
		fmt.Println("ericprint init [AddToScheme] error:", regErr)
	}

	// Alternative: clientcmd.BuildConfigFromFlags("", clientcmd.RecommendedHomeFile)
	const kubeconfigPath = "/Users/dddd/work/test/crd-demo/config"
	cfg, err := clientcmd.BuildConfigFromFlags("", kubeconfigPath)
	if err != nil {
		// Fall back to the in-cluster configuration.
		if cfg, err = rest.InClusterConfig(); err != nil {
			return nil, err
		}
	}

	contentGV := schema.GroupVersion{Group: GroupVersion.Group, Version: GroupVersion.Version}
	cfg.APIPath = "/apis/"
	cfg.NegotiatedSerializer = serializer.NewCodecFactory(scheme.Scheme)
	cfg.GroupVersion = &GroupVersion
	cfg.ContentConfig.GroupVersion = &contentGV

	restClient, err := rest.RESTClientFor(cfg)
	if err != nil {
		return nil, err
	}
	return restClient, nil
}


// doOnce guards the one-time registration of the CRD types into scheme.Scheme
// performed by GetK8sClientWithOnce.
var doOnce sync.Once

// GetK8sClientWithOnce is the first mitigation.
//
// Registration into scheme.Scheme is wrapped in a sync.Once, so even when many
// goroutines call this function, AddToScheme(scheme.Scheme) is not executed in
// parallel. A risk remains: scheme.Scheme is a global that is not safe for
// concurrent use, so if other code writes to it at the same time the fatal
// error is still possible.
//
// The client config is loaded from a hard-coded kubeconfig path; if that
// fails, the in-cluster config is used instead.
func GetK8sClientWithOnce() (*rest.RESTClient, error) {
	doOnce.Do(registerOnGlobalScheme)

	// Alternative: clientcmd.BuildConfigFromFlags("", clientcmd.RecommendedHomeFile)
	const kubeconfigPath = "/Users/eric.feng/work/test/crd-demo/config"
	cfg, err := clientcmd.BuildConfigFromFlags("", kubeconfigPath)
	if err != nil {
		// Fall back to the in-cluster configuration.
		if cfg, err = rest.InClusterConfig(); err != nil {
			return nil, err
		}
	}

	contentGV := schema.GroupVersion{Group: GroupVersion.Group, Version: GroupVersion.Version}
	cfg.APIPath = "/apis/"
	cfg.NegotiatedSerializer = serializer.NewCodecFactory(scheme.Scheme)
	cfg.GroupVersion = &GroupVersion
	cfg.ContentConfig.GroupVersion = &contentGV

	restClient, err := rest.RESTClientFor(cfg)
	if err != nil {
		return nil, err
	}
	return restClient, nil
}

// registerOnGlobalScheme adds the CRD types to scheme.Scheme and prints any
// registration error instead of returning it.
func registerOnGlobalScheme() {
	if regErr := AddToScheme(scheme.Scheme); regErr != nil {
		fmt.Println("ericprint init [AddToScheme] error:", regErr)
	}
}


// GetK8sClientWithNewScheme is the second mitigation.
//
// The analysis of the first mitigation shows that the root problem is the map
// fields inside the global scheme.Scheme, which are not safe for concurrent
// use. The scheme package declares it as `var Scheme = runtime.NewScheme()`,
// so a client can instead be built on its own freshly created scheme: the CRD
// types are registered into that private scheme and the serializer is built
// from it, so this function never reads or writes the shared global scheme.
//
// The client config is loaded from a hard-coded kubeconfig path; if that
// fails, the in-cluster config is used instead.
//
// It returns an error if the CRD types cannot be registered, if no client
// config can be loaded, or if the REST client cannot be constructed.
func GetK8sClientWithNewScheme() (*rest.RESTClient, error) {
	// TODO: a sync.Once could build this scheme a single time and share it
	// between calls instead of creating a new one on every call.
	crdScheme := runtime.NewScheme()
	if err := AddToScheme(crdScheme); err != nil {
		// Without the registered types the serializer below could not handle
		// the CRD, so fail instead of continuing with an unusable client.
		return nil, fmt.Errorf("registering CRD types in new scheme: %w", err)
	}

	// Alternative: clientcmd.BuildConfigFromFlags("", clientcmd.RecommendedHomeFile)
	kubeconfig := "/Users/eric.feng/work/test/crd-demo/config"
	config, err := clientcmd.BuildConfigFromFlags("", kubeconfig)
	if err != nil {
		// Fall back to the in-cluster configuration.
		config, err = rest.InClusterConfig()
		if err != nil {
			return nil, err
		}
	}

	config.APIPath = "/apis/"
	// Build the serializer from the private scheme that holds the CRD types,
	// not from the global scheme.Scheme.
	config.NegotiatedSerializer = serializer.NewCodecFactory(crdScheme)

	config.GroupVersion = &GroupVersion
	config.ContentConfig.GroupVersion = &schema.GroupVersion{Group: GroupVersion.Group, Version: GroupVersion.Version}

	client, err := rest.RESTClientFor(config)
	if err != nil {
		return nil, err
	}
	return client, nil
}



// ClientGetHarddiskWithOneScheme reads the HardDisk custom resource called
// name through the client returned by GetK8sClient, i.e. the client variant
// that carries the fatal-error risk.
//
// Errors are printed and then returned to the caller. A not-found error is not
// special-cased; an earlier draft returned an empty HardDisk in that case.
func ClientGetHarddiskWithOneScheme(name string) (*crd.HardDisk, error) {
	restClient, err := GetK8sClient()
	if err != nil {
		fmt.Println("ericprint  [GetK8sClient] error:", err)
		return nil, err
	}

	disk := new(crd.HardDisk)
	result := restClient.Get().Resource("harddisks").Name(name).Do(context.TODO())
	if err := result.Into(disk); err != nil {
		fmt.Println("ericprint  [Get Resource] error:", err)
		return nil, err
	}

	fmt.Println("client get disk res:", disk)
	return disk, nil
}

// ClientGetHarddiskWithNewScheme reads the HardDisk custom resource called
// name through the client returned by GetK8sClientWithNewScheme.
//
// Errors are printed and then returned to the caller. A not-found error is not
// special-cased; an earlier draft returned an empty HardDisk in that case.
func ClientGetHarddiskWithNewScheme(name string) (*crd.HardDisk, error) {
	restClient, err := GetK8sClientWithNewScheme()
	if err != nil {
		fmt.Println("ericprint  [GetK8sClient] error:", err)
		return nil, err
	}

	disk := new(crd.HardDisk)
	result := restClient.Get().Resource("harddisks").Name(name).Do(context.TODO())
	if err := result.Into(disk); err != nil {
		fmt.Println("ericprint  [Get Resource] error:", err)
		return nil, err
	}

	fmt.Println("client get disk res:", disk)
	return disk, nil
}


// mapConcurrentWriteTestSyncOnceScheme starts `concurrent` goroutines that
// each call GetK8sClientWithOnce, and returns without waiting for them.
func mapConcurrentWriteTestSyncOnceScheme(concurrent int) {
	for remaining := concurrent; remaining > 0; remaining-- {
		go GetK8sClientWithOnce()
	}
}
// mapConcurrentWriteTestOneScheme starts `concurrent` goroutines that each
// call GetK8sClient, and returns without waiting for them.
func mapConcurrentWriteTestOneScheme(concurrent int) {
	for remaining := concurrent; remaining > 0; remaining-- {
		go GetK8sClient()
	}
}

// mapConcurrentWriteTestNewScheme starts `concurrent` goroutines that each
// call GetK8sClientWithNewScheme, and returns without waiting for them.
func mapConcurrentWriteTestNewScheme(concurrent int) {
	for remaining := concurrent; remaining > 0; remaining-- {
		go GetK8sClientWithNewScheme()
	}
}

// ConcurrentGetWithOneScheme starts `concurrent` goroutines that each fetch
// the HardDisk named "disk1" via ClientGetHarddiskWithOneScheme, and returns
// without waiting for them.
func ConcurrentGetWithOneScheme(concurrent int) {
	for remaining := concurrent; remaining > 0; remaining-- {
		go ClientGetHarddiskWithOneScheme("disk1")
	}
}

// ConcurrentGetWithNewScheme starts `concurrent` goroutines that each fetch
// the HardDisk named "disk1" via ClientGetHarddiskWithNewScheme, and returns
// without waiting for them.
func ConcurrentGetWithNewScheme(concurrent int) {
	for remaining := concurrent; remaining > 0; remaining-- {
		go ClientGetHarddiskWithNewScheme("disk1")
	}
}


// main is the manual test harness. Every scenario call below is commented
// out, so as written it starts nothing and simply blocks; uncomment one call
// to run that scenario.
func main() {

   //mapConcurrentWriteTestNewScheme(10)   // reported: does not hit the fatal error
   //mapConcurrentWriteTestOneScheme(10)   // reported: does not hit the fatal error
   // NOTE(review): the line above contradicts the note on the same call
   // below; it presumably meant mapConcurrentWriteTestSyncOnceScheme — confirm.


   // mapConcurrentWriteTestOneScheme(10)   // reported: reproduces the fatal error
   //mapConcurrentWriteTestNewScheme(10)    // reported: does not hit the fatal error
   
   // Block forever; the scenario helpers return without waiting for the
   // goroutines they start.
   select{}
}

总结:

  1. k8s.io/apimachinery/pkg/runtime 包负责schemeBuilder对象创建,初始化,并提供crd 注册到新的scheme中的方法, 这个scheme是cr 对象管理时序列化/反序列化的关键
  2. config.NegotiatedSerializer 用于指定client的crd 反序列化方法信息,即关联scheme
  3. 对于一个runtime.Scheme的对象scheme1,执行scheme1.AddKnownTypes时需注意有线程安全问题,需要保证不会出现并发执行的情况

后续再写一个使用 controller-runtime 包的 client 去读写 CRD 的 demo。

参考: buraksekili.github.io/articles/co… zhuanlan.zhihu.com/p/525386905 blog.csdn.net/cbmljs/arti… blog.csdn.net/zhonglinzha… github.com/kubernetes-… www.zhaohuabing.com/post/2023-0…