kube-ovn nb|sb db transact timeout 作用机制
argOvnTimeout = pflag.Int("ovn-timeout", 60, "The seconds to wait ovn command timeout") # libovsdb client timeout
argOvsDbConTimeout = pflag.Int("ovsdb-con-timeout", 3, "The seconds to wait ovsdb connect timeout")
argOvsDbConnectMaxRetry = pflag.Int("ovsdb-con-maxretry", 60, "The maximum number of retries for connecting to ovsdb") # kubeovn 自己重试的
argOvsDbInactivityTimeout = pflag.Int("ovsdb-inactivity-timeout", 10, "The seconds to wait ovsdb inactivity check timeout")
func (c *ovsDbClient) Transact(method string, operations []ovsdb.Operation) error {
if len(operations) == 0 {
klog.V(6).Info("operations should not be empty")
return nil
}
ctx, cancel := context.WithTimeout(context.Background(), c.Timeout) # 这里的 timout 是 60s
defer cancel()
start := time.Now()
results, err := c.Client.Transact(ctx, operations...)
elapsed := float64((time.Since(start)) / time.Millisecond)
var dbType string
switch c.Schema().Name {
case "OVN_Northbound":
dbType = "ovn-nb"
case "OVN_Southbound":
dbType = "ovn-sb"
}
code := "0"
defer func() {
ovsClientRequestLatency.WithLabelValues(dbType, method, code).Observe(elapsed)
}()
if err != nil {
code = "1"
klog.Errorf("error occurred in transact with %s operations: %+v in %vms", dbType, operations, elapsed)
return err
}
if elapsed > 500 {
klog.Warningf("%s operations took too long: %+v in %vms", dbType, operations, elapsed)
}
errors, err := ovsdb.CheckOperationResults(results, operations)
if err != nil {
klog.Errorf("error occurred in transact with operations %+v with operation errors %+v: %v", operations, errors, err)
return err
}
return nil
}
// Transact sends the operations to the OVSDB server over the current RPC
// connection, holding rpcMutex's read lock for the duration of the call.
// If the client is disconnected and reconnect is enabled, it polls every
// 50ms (bounded only by ctx) until the connection is re-established.
func (o *ovsdbClient) Transact(ctx context.Context, operation ...ovsdb.Operation) ([]ovsdb.OperationResult, error) {
	logger := o.logFromContext(ctx)
	// Debug trace: the "try/get RLock" log pairs were added to localize a
	// suspected rpcMutex deadlock — a missing "get" after a "try" pinpoints
	// which RLock acquisition is blocked behind a writer.
	o.logger.Info("Transact rpc try RLock 1")
	o.rpcMutex.RLock()
	o.logger.Info("Transact rpc get RLock 1")
	if o.rpcClient == nil || !o.connected {
		// Not connected: release the read lock before blocking, otherwise a
		// writer (e.g. the reconnect path taking rpcMutex.Lock) could never
		// make progress and we would wait forever.
		o.logger.Info("Transact rpc if RUnlock 1")
		o.rpcMutex.RUnlock()
		if o.options.reconnect {
			logger.V(5).Info("blocking transaction until reconnected", "operations",
				fmt.Sprintf("%+v", operation))
			ticker := time.NewTicker(50 * time.Millisecond)
			defer ticker.Stop()
		ReconnectWaitLoop:
			for {
				select {
				case <-ctx.Done():
					// The caller's ctx is the only bound on how long we wait
					// for reconnection; without a deadline this blocks forever.
					return nil, fmt.Errorf("%w: while awaiting reconnection", ctx.Err())
				case <-ticker.C:
					o.logger.Info("Transact rpc ReconnectWaitLoop try RLock 2")
					o.rpcMutex.RLock()
					o.logger.Info("Transact rpc ReconnectWaitLoop get RLock 2")
					if o.rpcClient != nil && o.connected {
						// Reconnected: intentionally keep holding this RLock;
						// the deferred RUnlock below releases it after the
						// transaction completes.
						break ReconnectWaitLoop
					}
					o.logger.Info("Transact rpc ReconnectWaitLoop rpc RUnlock 2")
					o.rpcMutex.RUnlock()
				}
			}
		} else {
			return nil, ErrNotConnected
		}
	}
	// Invariant: every path reaching this point holds the rpcMutex read lock
	// (either from the initial RLock or from the reconnect loop above).
	defer func() {
		o.logger.Info("Transact rpc defer RUnlock 1")
		o.rpcMutex.RUnlock()
	}()
	// NOTE(review): o.transact only inherits ctx; whether the RPC itself is
	// bounded depends on the caller supplying a deadline (kube-ovn passes a
	// 60s timeout) — confirm there is no unbounded path into this call.
	return o.transact(ctx, o.primaryDBName, false, operation...)
}
所以如果 transact 锁住,要么就是 rpcMutex,要么就是 o.transact 中的锁 modelMutex
1. 简单分析排除 modelMutex 存在死锁问题
modelMutex 主要用于:
- 将 db model 读到缓存中
- 将 db schema 读到缓存中,o.transact 就是用这个,只是一个 rlock
- 将 newMonitorRequest 结构体构造到缓存中
由于 modelMutex 使用比 rpcMutex 简单得多,所以不再考虑该 modelMutex 存在死锁问题
2. 继续分析 rpcMutex 存在死锁问题
argOvsDbConTimeout = pflag.Int("ovsdb-con-timeout", 3, "The seconds to wait ovsdb connect timeout")
argOvsDbInactivityTimeout = pflag.Int("ovsdb-inactivity-timeout", 10, "The seconds to wait ovsdb inactivity check timeout")
结合上面两个 timeout 分析死锁分析,因为如果死锁出现,说明 timeout 就没生效:一般来说,timeout 会触发函数退出,函数退出前会触发解锁
没有利用到 timeout 会导致 函数没退出,锁没释放
2.1 先分析 o.timeout 上下文中 rpcMutex lock 是否存在死锁(锁没释放)的风险
o.timeout 使用:
- func (o *ovsdbClient) connect: o.tryEndpoint 和 o.monitor
- func (o *ovsdbClient) tryEndpoint:setTCPUserTimeout
如上仅在需要(重新)建立连接的时候,需要设置 OvsDbConTimeout,涉及 rpc 7 层和 tcp 4 层。
这里的函数感觉都用到了 o.timeout
2.2 继续分析 o.options.inactivityTimeout 上下文中 rpcMutex lock 是否存在死锁(锁没释放)的风险
没有利用到 timeout 会导致 函数没退出,锁没释放
感觉 o.options.inactivityTimeout 没怎么起作用
这里就大胆猜一下会发生阻塞: RPC echo 等
目前不知道 rpc 是否有 timeout,但这里就猜下没有
3. RPC send echo 健康检查机制是否可以去掉 rpc client rlock
出发点:当 rpc client lock 的时候,rlock 也是阻塞的,担心可能会影响
实际上, rpc client lock 的时间点是:
- func (o *ovsdbClient) connect(
- func (o *ovsdbClient) UpdateEndpoints(
- func (o *ovsdbClient) MonitorCancel(
- func (o *ovsdbClient) watchForLeaderChange(
- func (o *ovsdbClient) handleDisconnectNotification(
- func (o *ovsdbClient) Disconnect(
- func (o *ovsdbClient) Close(
可以看到这些方法中 lock 都是必要的,而且,如果这些方法没有完成,比如没有建立连接,那去做 RPC send echo 的意义在哪里呢?
但是仔细分析的话,sendEcho 只在 connect( 方法中的 go o.handleInactivityProbes() 中调用一次
argOvsDbConTimeout = pflag.Int("ovsdb-con-timeout", 3, "The seconds to wait ovsdb connect timeout")
argOvsDbInactivityTimeout = pflag.Int("ovsdb-inactivity-timeout", 10, "The seconds to wait ovsdb inactivity check timeout")
ovsdb-con-timeout 是 3s,而 ovsdb-inactivity-timeout 是 10s,也就是说真正调用 sendEcho 的时候,是在 10s 后, 10s 的时间: connect, UpdateEndpoints, 早就干完了, 如果发生watchForLeaderChange 那么就会触发重连,这种情况下,sendEcho 检测失败也是合理的。
小结: sendEcho 可以不加锁,同时补充 rpc client lock 时间点关键 log: 比如建立连接,重连,发生错误等