etcd 5分钟掌握源码级BUG分析

1,267 阅读5分钟

引起错误的方式

可以提前说,这是一个由于官方代码逻辑错误引起的token校验失败 etcd集群带账号密码

  1. etcd集群主备切换
  2. 节点重启

错误打印来源

报错内容

{"level":"warn","ts":"2022-08-23T14:06:11.060+0800","logger":"etcd-client","caller":"v3/retry_interceptor.go:62","msg":"retrying of unary invoker failed","target":"etcd-endpoints://0xc00021f340/localhost:2379","attempt":0,"error":"rpc error: code = Unauthenticated desc = etcdserver: invalid auth token"}
{"level":"warn","ts":"2022-08-23T14:06:11.153+0800","logger":"etcd-client","caller":"v3/retry_interceptor.go:62","msg":"retrying of unary invoker failed","target":"etcd-endpoints://0xc00021f340/localhost:2379","attempt":1,"error":"rpc error: code = Unauthenticated desc = etcdserver: invalid auth token"}
{"level":"warn","ts":"2022-08-23T14:06:11.248+0800","logger":"etcd-client","caller":"v3/retry_interceptor.go:62","msg":"retrying of unary invoker failed","target":"etcd-endpoints://0xc00021f340/localhost:2379","attempt":2,"error":"rpc error: code = Unauthenticated desc = etcdserver: invalid auth token"}
{"level":"warn","ts":"2022-08-23T14:06:11.343+0800","logger":"etcd-client","caller":"v3/retry_interceptor.go:62","msg":"retrying of unary invoker failed","target":"etcd-endpoints://0xc00021f340/localhost:2379","attempt":3,"error":"rpc error: code = Unauthenticated desc = etcdserver: invalid auth token"}

错误来源定义

ErrGRPCInvalidAuthToken = status.New(codes.Unauthenticated, "etcdserver: invalid auth token").Err()

错误使用位置

go.etcd.io/etcd/client/v3/retry_interceptor.go

//unaryClientInterceptor返回一个新的重试一元客户端侦听器。拦截器的默认配置是完全不重试。在创建拦截器或调用时(通过grpc.CallOptions),可以通过选项(例如WithMax)更改此行为
func (c *Client) unaryClientInterceptor(optFuncs ...retryOption) grpc.UnaryClientInterceptor {
  ///...
  //line:58 
  lastErr = invoker(ctx, method, req, reply, cc, grpcOpts...)
  if lastErr == nil {
    return nil
  }
  c.GetLogger().Warn(
    "retrying of unary invoker failed",
    zap.String("target", cc.Target()),
    zap.Uint("attempt", attempt),
    zap.Error(lastErr),
  )
}

错误定位

go.etcd.io/etcd/client/v3/retry_interceptor.go

func (c *Client) unaryClientInterceptor(optFuncs ...retryOption) grpc.UnaryClientInterceptor {
  //...
  if callOpts.retryAuth && rpctypes.Error(lastErr) == rpctypes.ErrInvalidAuthToken {
    //在刷新身份验证令牌之前清除它。调用c.Auth。如果服务器未应用pr 12165修补程序(https://github.com/etcd-io/etcd/pull/12165)和rpctypes,则使用无效令牌进行身份验证将始终无法通过服务器端的身份验证检查。ErrInvalidAuthToken将递归调用c.getToken,直到系统耗尽资源。
    c.authTokenBundle.UpdateAuthToken("")

    gterr := c.getToken(ctx)
    if gterr != nil {
      c.GetLogger().Warn(
        "retrying of unary invoker failed to fetch new auth token",
        zap.String("target", cc.Target()),
        zap.Error(gterr),
      )
      return gterr // lastErr must be invalid auth token
    }
    continue
  }
  //...
}

怀疑是服务端异常导致无法校验客户端token,所以在本地装了个代密码的etcd服务端,但是该问题依然复现。

对刷新token的地方gterr := c.getToken(ctx)进行了断点调试,发现在出现token失效时,服务端正常返回了新的token

那么判断可能是客户端的在刷新token时候依然失效,这边尝试去查找跟生成token相关的逻辑。

go.etcd.io/etcd/client/v3/client.go

// dial configures and dials any grpc balancer target.
func (c *Client) dial(creds grpccredentials.TransportCredentials, dopts ...grpc.DialOption) (*grpc.ClientConn, error) {
	opts, err := c.dialSetupOpts(creds, dopts...)
	if err != nil {
		return nil, fmt.Errorf("failed to configure dialer: %v", err)
	}

	if c.Username != "" && c.Password != "" {
		c.authTokenBundle = credentials.NewBundle(credentials.Config{})
		opts = append(opts, grpc.WithPerRPCCredentials(c.authTokenBundle.PerRPCCredentials()))
	}

	opts = append(opts, c.cfg.DialOptions...)
	//...
	conn, err := grpc.DialContext(dctx, target, opts...)
	//...
}

找到了在建立连接时,如果我们使用了账号密码,那么client端将会添加一个中间件,它会将client端保存的tokenMetadata的形式注入到gRpc请求的头部。这个中间件会一直在连接的生命周期中生效,也就是说我们每次请求都会携带这个token。并且在过期时重新从服务端获取新的token

这里看到大部分的中间件是从外部传入,而获取toekn的中间件代码使用了一个全局变量来管理,由此猜测此代码片段有逻辑上的错误,

c.authTokenBundle = credentials.NewBundle(credentials.Config{})
opts = append(opts, grpc.WithPerRPCCredentials(c.authTokenBundle.PerRPCCredentials()))

尝试查看dial()函数的调用方,令人感到兴奋地是这里找到了两个调用方。 image-20220823144928804.png

  1. clientV3.New()

    go.etcd.io/etcd/client/v3/client.go

    
    func newClient(cfg *Config) (*Client, error) {
         //...
         conn, err := client.dialWithBalancer()
         if err != nil {
                 client.cancel()
                 client.resolver.Close()
                 return nil, err
         }
         client.conn = conn
    
         client.Cluster = NewCluster(client)
         client.KV = NewKV(client)
         client.Lease = NewLease(client)
         client.Watcher = NewWatcher(client)
         client.Auth = NewAuth(client)
         client.Maintenance = NewMaintenance(client)
    
         //...
         return client, nil
    }
    
    func (c *Client) dialWithBalancer(dopts ...grpc.DialOption) (*grpc.ClientConn, error) {
    	creds := c.credentialsForEndpoint(c.Endpoints()[0])
    	opts := append(dopts, grpc.WithResolvers(c.resolver))
    	return c.dial(creds, opts...)
    }
    
  2. client.Status()

    go.etcd.io/etcd/client/v3/maintenance.go

    func (m *maintenance) Status(ctx context.Context, endpoint string) (*StatusResponse, error) {
         remote, cancel, err := m.dial(endpoint)
         if err != nil {
                 return nil, toErr(ctx, err)
         }
         defer cancel()
         resp, err := remote.Status(ctx, &pb.StatusRequest{}, m.callOpts...)
         if err != nil {
                 return nil, toErr(ctx, err)
         }
         return (*StatusResponse)(resp), nil
     }
    

我们的代码在初始化的时候,主动检测了节点健康状况

    client, err := clientv3.New(clientv3.Config{
            Endpoints:   e.conf.Endpoints,
            DialTimeout: time.Duration(e.conf.Timeout) * time.Second,
            Username:    e.conf.Username,
            Password:    e.conf.Password,
    })
    if err != nil {
            fmt.Println(err.Error())
            log.Fatal(err)
    }
    ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
    defer cancel()
    for _, v := range e.conf.Endpoints {
            if _, err = client.Status(ctx, v); err != nil {
                    log.Fatalf("Must etcd check node %s err : %s", v, err.Error())
            }
    }

再次进行断点调试,确认以下函数被执行了两次,并且authTokenBundle的地址被替换了

c.authTokenBundle = credentials.NewBundle(credentials.Config{})
opts = append(opts, grpc.WithPerRPCCredentials(c.authTokenBundle.PerRPCCredentials()))

判断client端的内部变量authTokenBundle被替换导致的前后管理token的函数栈出现了变动。

image-20220823160418727.png

但是这里产生了一个疑问❓,如果函数栈被变更了,那么后面都应该使用新的,也就是Status()所创建的更新token的方式,理论上不影响使用。但事实上它产生校验的问题,关注到代码c.authTokenBundle.PerRPCCredentials(),认为它可能是将authTokenBundle的内部变量脱离的其本身管理,导致UpdateAuthTokenGetRequestMetadata并非在同一个实例的管理下调用。

结论

正如前面所说的,通过对UpdateAuthTokenGetRequestMetadata实例的地址打印,发现发送请求时使用了旧的GetRequestMetadata函数,像服务端传递了旧的token,导致校验失败。

go.etcd.io/etcd/client/v3/credentials/credentials.go

func (rc *perRPCCredential) GetRequestMetadata(ctx context.Context, s ...string) (map[string]string, error) {
    rc.authTokenMu.RLock()
    authToken := rc.authToken
    rc.authTokenMu.RUnlock()
    if authToken == "" {
            return nil, nil
    }
    fmt.Printf("[GetRequestMetadata]地址:%p,%s\n", rc, authToken)
    return map[string]string{rpctypes.TokenFieldNameGRPC: authToken}, nil
}

func (b *bundle) UpdateAuthToken(token string) {
    if b.rc == nil {
            return
    }
    b.rc.UpdateAuthToken(token)
}

func (rc *perRPCCredential) UpdateAuthToken(token string) {
    fmt.Printf("[UpdateAuthToken]地址:%p,%s\n", rc, token)
    rc.authTokenMu.Lock()
    rc.authToken = token
    rc.authTokenMu.Unlock()
}

现在真实的引用方式如下: image-20220823160950304.png 这是一个非常糟糕的代码片段

if c.Username != "" && c.Password != "" {
  c.authTokenBundle = credentials.NewBundle(credentials.Config{})
  opts = append(opts, grpc.WithPerRPCCredentials(c.authTokenBundle.PerRPCCredentials()))
}

它会重复将两个新credentials.Bundle实例赋值到client的内部变量authTokenBundle中,之后从authTokenBundle取出了用来管理token的函数,并转成中间件gprc.DialOption类型。

go.etcd.io/etcd/client/v3/client.go

func newClient(cfg *Config) (*Client, error) {
    //...
    conn, err := client.dialWithBalancer()
    if err != nil {
            client.cancel()
            client.resolver.Close()
            return nil, err
    }
    client.conn = conn

    client.Cluster = NewCluster(client)
    client.KV = NewKV(client)
    client.Lease = NewLease(client)
    client.Watcher = NewWatcher(client)
    client.Auth = NewAuth(client)
    client.Maintenance = NewMaintenance(client)
    //...
    return client, nil
}

这里在我们newClient之后,opts的中间件已经被挂载到client端了,后续并不会再被修改。于是产生了如图的调用关系: image-20220823160950304.png

这个逻辑同样会引起服务端的压力,在token校验失败是,会发起大量的重试请求来刷新token

修复方式

1.修改内部变量赋值过程

if c.Username != "" && c.Password != "" && c.authTokenBundle != nil {
    c.authTokenBundle = credentials.NewBundle(credentials.Config{})
    opts = append(opts, grpc.WithPerRPCCredentials(c.authTokenBundle.PerRPCCredentials()))
}
  1. 3.6版本将修复该BUG etcd-io/etcd tag 3.6.1-alpha.0

  2. 更细说明 Plans for v3.5.5 release #14138