背景: 在云原生 operator 开发中,会有大量操作 CRD 的需求。最近发现 operator pod 偶发重启,查看日志发现有 concurrent map read and map write 报错。这类 fatal error 是无法通过 recover 捕获的,最后通过排查代码定位到了问题。以下是复现问题的代码:
package main
import (
"context"
"fmt"
"github.com/fsp1yjl/crd-demo/crd" //这里的目录下放crd的结构定义
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/apimachinery/pkg/runtime/serializer"
"k8s.io/client-go/kubernetes/scheme"
"k8s.io/client-go/rest"
"k8s.io/client-go/tools/clientcmd"
"sync"
"time"
)
var (
// GroupVersion is the API group ("crd-demo.io") and version ("v1") that the
// custom resource types in this file are registered under.
GroupVersion = schema.GroupVersion{Group: "crd-demo.io", Version: "v1"}
// SchemeBuilder initializes a scheme builder
SchemeBuilder = runtime.NewSchemeBuilder(addKnownTypes)
// AddToScheme is a global function that registers this API group & version to a scheme
AddToScheme = SchemeBuilder.AddToScheme
)
// addKnownTypes adds the HardDisk and HardDiskList types to target under
// GroupVersion and then calls metav1.AddToGroupVersion for the same group
// version. It always returns nil.
func addKnownTypes(target *runtime.Scheme) error {
	known := []runtime.Object{
		&crd.HardDisk{},
		&crd.HardDiskList{},
	}
	target.AddKnownTypes(GroupVersion, known...)
	metav1.AddToGroupVersion(target, GroupVersion)
	return nil
}
// GetK8sClient builds a REST client for the GroupVersion API group.
//
// This simulates the first scenario that triggers the map fatal error: every
// call runs AddToScheme against the shared global scheme.Scheme, so when
// several goroutines call GetK8sClient concurrently the process hits the
// "concurrent map read and map write" runtime error on scheme.Scheme.
//
// The configuration is read from a hard-coded kubeconfig path; if that fails,
// the in-cluster configuration is used instead.
func GetK8sClient() (*rest.RESTClient, error) {
	// A registration failure is only printed; client construction continues.
	if regErr := AddToScheme(scheme.Scheme); regErr != nil {
		fmt.Println("ericprint init [AddToScheme] error:", regErr)
	}

	// Alternative: clientcmd.BuildConfigFromFlags("", clientcmd.RecommendedHomeFile)
	const kubeconfigPath = "/Users/dddd/work/test/crd-demo/config"
	cfg, err := clientcmd.BuildConfigFromFlags("", kubeconfigPath)
	if err != nil {
		// Fall back to the in-cluster configuration.
		if cfg, err = rest.InClusterConfig(); err != nil {
			return nil, err
		}
	}

	cfg.APIPath = "/apis/"
	cfg.NegotiatedSerializer = serializer.NewCodecFactory(scheme.Scheme)
	cfg.GroupVersion = &GroupVersion
	cfg.ContentConfig.GroupVersion = &schema.GroupVersion{
		Group:   GroupVersion.Group,
		Version: GroupVersion.Version,
	}

	restClient, err := rest.RESTClientFor(cfg)
	if err != nil {
		return nil, err
	}
	return restClient, nil
}
// doOnce guards the AddToScheme call made by GetK8sClientWithOnce.
var doOnce sync.Once

// GetK8sClientWithOnce is the first mitigation: it builds the same REST client
// as GetK8sClient, but wraps the AddToScheme(scheme.Scheme) call in a
// sync.Once, so concurrent callers never run that registration in parallel.
//
// A risk remains: scheme.Scheme is a global variable that is not safe for
// concurrent use, so if any other code writes to it at the same time the
// fatal error is still possible.
func GetK8sClientWithOnce() (*rest.RESTClient, error) {
	registerTypes := func() {
		// A registration failure is only printed; it is not reported to callers.
		if regErr := AddToScheme(scheme.Scheme); regErr != nil {
			fmt.Println("ericprint init [AddToScheme] error:", regErr)
		}
	}
	doOnce.Do(registerTypes)

	// Alternative: clientcmd.BuildConfigFromFlags("", clientcmd.RecommendedHomeFile)
	const kubeconfigPath = "/Users/eric.feng/work/test/crd-demo/config"
	restCfg, err := clientcmd.BuildConfigFromFlags("", kubeconfigPath)
	if err != nil {
		// Fall back to the in-cluster configuration.
		restCfg, err = rest.InClusterConfig()
	}
	if err != nil {
		return nil, err
	}

	restCfg.APIPath = "/apis/"
	restCfg.NegotiatedSerializer = serializer.NewCodecFactory(scheme.Scheme)
	restCfg.GroupVersion = &GroupVersion
	restCfg.ContentConfig.GroupVersion = &schema.GroupVersion{
		Group:   GroupVersion.Group,
		Version: GroupVersion.Version,
	}

	restClient, err := rest.RESTClientFor(restCfg)
	if err != nil {
		return nil, err
	}
	return restClient, nil
}
// GetK8sClientWithNewScheme is the second mitigation. The root problem of the
// first mitigation is that the maps inside the global scheme.Scheme (created in
// the scheme package as `var Scheme = runtime.NewScheme()`) are not safe for
// concurrent use. This function therefore creates a private runtime.Scheme,
// registers the CRD types in it, and builds the client's serializer from that
// private scheme, so the shared scheme.Scheme is never read or written here.
//
// The configuration is read from a hard-coded kubeconfig path; if that fails,
// the in-cluster configuration is used instead.
//
// It returns an error when type registration, configuration loading, or REST
// client construction fails.
func GetK8sClientWithNewScheme() (*rest.RESTClient, error) {
	// TODO: a sync.Once could build this scheme a single time and share it
	// between calls instead of creating a new one on every call.
	crdScheme := runtime.NewScheme()
	if err := AddToScheme(crdScheme); err != nil {
		return nil, fmt.Errorf("registering CRD types in new scheme: %w", err)
	}

	// Alternative: clientcmd.BuildConfigFromFlags("", clientcmd.RecommendedHomeFile)
	kubeconfig := "/Users/eric.feng/work/test/crd-demo/config"
	config, err := clientcmd.BuildConfigFromFlags("", kubeconfig)
	if err != nil {
		config, err = rest.InClusterConfig()
		if err != nil {
			return nil, err
		}
	}

	config.APIPath = "/apis/"
	// Use the private scheme that was just populated; building the codec
	// factory from scheme.Scheme would discard the registration above and
	// keep depending on the shared global.
	config.NegotiatedSerializer = serializer.NewCodecFactory(crdScheme)
	// rest.Config embeds ContentConfig, so this single assignment sets the
	// group version; a copy is used so the client never aliases the
	// package-level GroupVersion variable.
	config.ContentConfig.GroupVersion = &schema.GroupVersion{
		Group:   GroupVersion.Group,
		Version: GroupVersion.Version,
	}

	client, err := rest.RESTClientFor(config)
	if err != nil {
		return nil, err
	}
	return client, nil
}
// ClientGetHarddiskWithOneScheme reads the HardDisk custom resource called
// name through the client returned by GetK8sClient, i.e. the variant that
// carries the fatal-error risk.
//
// Errors from building the client or from the GET request are printed and
// returned unchanged; a not-found error is not treated specially.
func ClientGetHarddiskWithOneScheme(name string) (*crd.HardDisk, error) {
	client, err := GetK8sClient()
	if err != nil {
		fmt.Println("ericprint [GetK8sClient] error:", err)
		return nil, err
	}

	disk := new(crd.HardDisk)
	request := client.Get().Resource("harddisks").Name(name)
	if err := request.Do(context.TODO()).Into(disk); err != nil {
		// An earlier draft returned an empty HardDisk when the resource did
		// not exist (apierrors.IsNotFound); that handling is disabled.
		fmt.Println("ericprint [Get Resource] error:", err)
		return nil, err
	}

	fmt.Println("client get disk res:", disk)
	return disk, nil
}
// ClientGetHarddiskWithNewScheme reads the HardDisk custom resource called
// name through the client returned by GetK8sClientWithNewScheme, i.e. the
// variant intended to remove the fatal-error risk.
//
// Errors from building the client or from the GET request are printed and
// returned unchanged; a not-found error is not treated specially.
func ClientGetHarddiskWithNewScheme(name string) (*crd.HardDisk, error) {
	client, clientErr := GetK8sClientWithNewScheme()
	if clientErr != nil {
		fmt.Println("ericprint [GetK8sClient] error:", clientErr)
		return nil, clientErr
	}

	var (
		disk   = &crd.HardDisk{}
		result = client.Get().Resource("harddisks").Name(name).Do(context.TODO())
	)
	getErr := result.Into(disk)
	if getErr == nil {
		fmt.Println("client get disk res:", disk)
		return disk, nil
	}

	// An earlier draft returned an empty HardDisk when the resource did not
	// exist (apierrors.IsNotFound); that handling is disabled.
	fmt.Println("ericprint [Get Resource] error:", getErr)
	return nil, getErr
}
// mapConcurrentWriteTestSyncOnceScheme starts `concurrent` goroutines that
// each call GetK8sClientWithOnce. It does not wait for them and their results
// are discarded.
func mapConcurrentWriteTestSyncOnceScheme(concurrent int) {
	for remaining := concurrent; remaining > 0; remaining-- {
		go GetK8sClientWithOnce()
	}
}
// mapConcurrentWriteTestOneScheme starts `concurrent` goroutines that each
// call GetK8sClient. It does not wait for them and their results are
// discarded.
func mapConcurrentWriteTestOneScheme(concurrent int) {
	for remaining := concurrent; remaining > 0; remaining-- {
		go GetK8sClient()
	}
}
// mapConcurrentWriteTestNewScheme starts `concurrent` goroutines that each
// call GetK8sClientWithNewScheme. It does not wait for them and their results
// are discarded.
func mapConcurrentWriteTestNewScheme(concurrent int) {
	for remaining := concurrent; remaining > 0; remaining-- {
		go GetK8sClientWithNewScheme()
	}
}
// ConcurrentGetWithOneScheme starts `concurrent` goroutines that each call
// ClientGetHarddiskWithOneScheme for the resource "disk1". It does not wait
// for them and their results are discarded.
func ConcurrentGetWithOneScheme(concurrent int) {
	const diskName = "disk1"
	for remaining := concurrent; remaining > 0; remaining-- {
		go ClientGetHarddiskWithOneScheme(diskName)
	}
}
// ConcurrentGetWithNewScheme starts `concurrent` goroutines that each call
// ClientGetHarddiskWithNewScheme for the resource "disk1". It does not wait
// for them and their results are discarded.
func ConcurrentGetWithNewScheme(concurrent int) {
	const diskName = "disk1"
	for remaining := concurrent; remaining > 0; remaining-- {
		go ClientGetHarddiskWithNewScheme(diskName)
	}
}
// main is the entry point of the reproduction. All scenario calls are
// commented out; uncomment one of them to run that scenario.
func main() {
// mapConcurrentWriteTestNewScheme(10) does not trigger the fatal error.
// mapConcurrentWriteTestOneScheme(10) reproduces the fatal error.
// NOTE(review): the original notes also listed mapConcurrentWriteTestOneScheme(10)
// as "does not fatal", which contradicts the line above; presumably
// mapConcurrentWriteTestSyncOnceScheme(10) was meant — confirm.
// Block forever so that goroutines started by a scenario are not cut off by
// main returning.
select{}
}
总结:
- k8s.io/apimachinery/pkg/runtime 包负责 SchemeBuilder 对象的创建和初始化,并提供把 CRD 类型注册到 scheme 中的方法;这个 scheme 是序列化/反序列化 CR 对象的关键
- config.NegotiatedSerializer 用于指定client的crd 反序列化方法信息,即关联scheme
- 对于一个 runtime.Scheme 对象 scheme1,执行 scheme1.AddKnownTypes 时需要注意线程安全问题:必须保证注册操作不会与其他对 scheme1 的读写并发执行
后续再写一个使用controller-runtime 包的client 去读写crd的demo 参考: buraksekili.github.io/articles/co… zhuanlan.zhihu.com/p/525386905 blog.csdn.net/cbmljs/arti… blog.csdn.net/zhonglinzha… github.com/kubernetes-… www.zhaohuabing.com/post/2023-0…