10. apiserver(七、停机流程)
10.1 信号流转
Run方法中,使用停机信号进行优雅停机:
10.2 前置知识
先介绍一下lifecycleSignal:
// lifecycleSignal describes a single lifecycle event: it can be triggered
// with Signal, observed through the channel returned by Signaled, and
// identified in logs through Name.
type lifecycleSignal interface {
// Signal signals the event, indicating that the event has occurred.
// Signal is idempotent, once signaled the event stays signaled and
// it immediately unblocks any goroutine waiting for this event.
Signal()
// Signaled returns a channel that is closed when the underlying event
// has been signaled. Successive calls to Signaled return the same value.
Signaled() <-chan struct{}
// Name returns the name of the signal, useful for logging.
Name() string
}
- Signal():触发信号,唤醒监听协程;
- Signaled():返回一个只读通道,该通道在信号被触发后关闭,监听方通过读取它来等待事件;多次调用返回同一个通道;
生命周期信号总共包含8种:
// lifecycleSignals groups the lifecycle events of the server, one
// lifecycleSignal per field. Per the field comments below, the first six
// fields describe stages of shutdown, while HasBeenReady and
// MuxAndDiscoveryComplete describe startup milestones.
type lifecycleSignals struct {
// ShutdownInitiated event is signaled when an apiserver shutdown has been initiated.
// It is signaled when the `stopCh` provided by the main goroutine
// receives a KILL signal and is closed as a consequence.
ShutdownInitiated lifecycleSignal
// AfterShutdownDelayDuration event is signaled as soon as ShutdownDelayDuration
// has elapsed since the ShutdownInitiated event.
// ShutdownDelayDuration allows the apiserver to delay shutdown for some time.
AfterShutdownDelayDuration lifecycleSignal
// PreShutdownHooksStopped event is signaled when all registered
// preshutdown hook(s) have finished running.
PreShutdownHooksStopped lifecycleSignal
// NotAcceptingNewRequest event is signaled when the server is no
// longer accepting any new request, from this point on any new
// request will receive an error.
NotAcceptingNewRequest lifecycleSignal
// InFlightRequestsDrained event is signaled when the existing requests
// in flight have completed. This is used as signal to shut down the audit backends.
InFlightRequestsDrained lifecycleSignal
// HTTPServerStoppedListening termination event is signaled when the
// HTTP Server has stopped listening to the underlying socket.
HTTPServerStoppedListening lifecycleSignal
// HasBeenReady is signaled when the readyz endpoint succeeds for the first time.
HasBeenReady lifecycleSignal
// MuxAndDiscoveryComplete is signaled when all known HTTP paths have been installed.
// It exists primarily to avoid returning a 404 response when a resource actually exists but we haven't installed the path to a handler.
// The actual logic is implemented by an APIServer using the generic server library.
MuxAndDiscoveryComplete lifecycleSignal
}
最后两种为启动信号,其他为关机信号。
Run方法的总体流程如下(这里省略了各个子协程以及if块的逻辑,便于更清晰地看到Run方法的逻辑):
// Run, shown here in abridged form: the bodies of the goroutines and of the
// if-blocks are elided as {...} so that the overall sequence stays visible.
// It first wires up the lifecycle signals and starts the helper goroutines,
// then blocks on stopCh and, in order, on drainedCh, listenerStoppedCh and
// stoppedCh before returning.
func (s preparedGenericAPIServer) Run(stopCh <-chan struct{}) error {
delayedStopCh := s.lifecycleSignals.AfterShutdownDelayDuration
shutdownInitiatedCh := s.lifecycleSignals.ShutdownInitiated
// Destroy runs when Run returns.
defer s.Destroy()
// Start the debug socket.
if s.UnprotectedDebugSocket != nil {...}
go func() {...}()
go func() {...}()
// Set shutdownTimeout.
shutdownTimeout := s.ShutdownTimeout
if s.ShutdownSendRetryAfter {...}
notAcceptingNewRequestCh := s.lifecycleSignals.NotAcceptingNewRequest
drainedCh := s.lifecycleSignals.InFlightRequestsDrained
stopHttpServerCh := make(chan struct{})
go func() {...}()
// Start the AuditBackend, which watches drainedCh.
if s.AuditBackend != nil {...}
stoppedCh, listenerStoppedCh, err := s.NonBlockingRun(stopHttpServerCh, shutdownTimeout)
httpServerStoppedListeningCh := s.lifecycleSignals.HTTPServerStoppedListening
go func() {...}()
preShutdownHooksHasStoppedCh := s.lifecycleSignals.PreShutdownHooksStopped
go func() {...}()
nonLongRunningRequestDrainedCh := make(chan struct{})
go func() {...}()
activeWatchesDrainedCh := make(chan struct{})
go func() {...}()
go func() {...}()
// IMPORTANT: everything above only starts goroutines; the actual handling
// logic of the main goroutine begins here.
klog.V(1).Info("[graceful-termination] waiting for shutdown to be initiated")
<-stopCh
func() {...}()
<-drainedCh.Signaled()
if s.AuditBackend != nil {...}
<-listenerStoppedCh
<-stoppedCh
klog.V(1).Info("[graceful-termination] apiserver is exiting")
return nil
}
1)首先看一下最开始的stopCh信号:
// NewAPIServerCommand (abridged, elided parts shown as ...) builds the cobra
// command for the apiserver. Its RunE callback calls Run with the completed
// options and the channel returned by genericapiserver.SetupSignalHandler().
func NewAPIServerCommand() *cobra.Command {
s := options.NewServerRunOptions()
cmd := &cobra.Command{
...
RunE: func(cmd *cobra.Command, args []string) error {
...
// The second argument is the stop channel handed down to Run.
return Run(completedOptions, genericapiserver.SetupSignalHandler())
},
...
}
...
}
执行Run方法的时候使用genericapiserver.SetupSignalHandler()传入stopCh信号:
// SetupSignalHandler returns the Done channel of the context created by
// SetupSignalContext.
func SetupSignalHandler() <-chan struct{} {
return SetupSignalContext().Done()
}
使用Done()方法返回cancelCtx的done字段存储的通道,即信号流转图里面的stopCh。
// SetupSignalContext registers for the signals listed in shutdownSignals and
// returns a context that is cancelled when the first of them arrives. A second
// signal makes the process exit with code 1. It must be called at most once:
// closing onlyOneSignalHandler a second time panics.
func SetupSignalContext() context.Context {
close(onlyOneSignalHandler) // panics when called twice
// NOTE(review): the buffer of 2 presumably keeps both signals handled
// below from being dropped — confirm against os/signal.Notify semantics.
shutdownHandler = make(chan os.Signal, 2)
ctx, cancel := context.WithCancel(context.Background())
// When a shutdown signal is delivered (Windows: SIGINT; Linux: SIGINT,
// SIGTERM), it is written to shutdownHandler.
signal.Notify(shutdownHandler, shutdownSignals...)
go func() {
<-shutdownHandler
cancel() // cancels ctx, closing its Done channel (the stopCh seen by callers)
<-shutdownHandler
os.Exit(1) // second signal. Exit directly.
}()
return ctx
}
小结:当程序接收到停机信号之后,触发close(stopCh)。
2)接收stopCh,触发delayedStopCh和preShutdownHooksHasStoppedCh
// Run (excerpt, elided parts shown as ...): how the closing of stopCh leads to
// the ShutdownInitiated, AfterShutdownDelayDuration and PreShutdownHooksStopped
// signals.
func (s preparedGenericAPIServer) Run(stopCh <-chan struct{}) error {
delayedStopCh := s.lifecycleSignals.AfterShutdownDelayDuration
shutdownInitiatedCh := s.lifecycleSignals.ShutdownInitiated
...
go func() {
// Deferred calls run in reverse order when this goroutine returns: the
// log line is emitted first, then delayedStopCh is signaled.
defer delayedStopCh.Signal()
defer klog.V(1).InfoS("[graceful-termination] shutdown event", "name", delayedStopCh.Name())
<-stopCh
// As soon as shutdown is initiated, /readyz should start returning failure.
// This gives the load balancer a window defined by ShutdownDelayDuration to detect that /readyz is red
// and stop sending traffic to this server.
shutdownInitiatedCh.Signal()
klog.V(1).InfoS("[graceful-termination] shutdown event", "name", shutdownInitiatedCh.Name())
// Hold off for ShutdownDelayDuration; the deferred Signal above fires
// once the sleep returns.
time.Sleep(s.ShutdownDelayDuration)
}()
...
klog.V(1).Info("[graceful-termination] waiting for shutdown to be initiated")
<-stopCh
// run shutdown hooks directly. This includes deregistering from
// the kubernetes endpoint in case of kube-apiserver.
func() {
// Signal PreShutdownHooksStopped once RunPreShutdownHooks has returned,
// whatever error it produced.
defer func() {
preShutdownHooksHasStoppedCh.Signal()
klog.V(1).InfoS("[graceful-termination] pre-shutdown hooks completed", "name", preShutdownHooksHasStoppedCh.Name())
}()
err = s.RunPreShutdownHooks()
}()
}
当接收到stopCh之后,立即触发:1)shutdownInitiatedCh(ShutdownInitiated),再等待ShutdownDelayDuration之后触发delayedStopCh(AfterShutdownDelayDuration);2)执行preShutdownHooks之后触发preShutdownHooksHasStoppedCh(PreShutdownHooksStopped),需要注意的是其中一个preShutdownHook是取消租约,以便让其他apiserver实例接手流量。
3)接收到delayedStopCh和preShutdownHooksHasStoppedCh,触发notAcceptingNewRequestCh
// Signals notAcceptingNewRequestCh only after both delayedStopCh and
// preShutdownHooksHasStoppedCh have been signaled.
go func() {
// Deferred calls run in reverse order when this goroutine returns:
// notAcceptingNewRequestCh is signaled first, then the log line is emitted.
defer klog.V(1).InfoS("[graceful-termination] shutdown event", "name", notAcceptingNewRequestCh.Name())
defer notAcceptingNewRequestCh.Signal()
// wait for the delayed stopCh before closing the handler chain
<-delayedStopCh.Signaled()
// Additionally wait for preshutdown hooks to also be finished, as some of them need
// to send API calls to clean up after themselves (e.g. lease reconcilers removing
// itself from the active servers).
<-preShutdownHooksHasStoppedCh.Signaled()
}()
4)接收到notAcceptingNewRequestCh,触发nonLongRunningRequestDrainedCh和activeWatchesDrainedCh:
// Run (excerpt, elided parts shown as ...): the goroutines that react to
// notAcceptingNewRequestCh and close stopHttpServerCh,
// nonLongRunningRequestDrainedCh and activeWatchesDrainedCh.
func (s preparedGenericAPIServer) Run(stopCh <-chan struct{}) error {
...
// 1. ShutdownSendRetryAfter decides whether stopHttpServerCh is triggered by
// notAcceptingNewRequestCh (default) or by drainedCh instead.
notAcceptingNewRequestCh := s.lifecycleSignals.NotAcceptingNewRequest
drainedCh := s.lifecycleSignals.InFlightRequestsDrained
stopHttpServerCh := make(chan struct{})
go func() {
defer close(stopHttpServerCh)
timeToStopHttpServerCh := notAcceptingNewRequestCh.Signaled()
if s.ShutdownSendRetryAfter {
timeToStopHttpServerCh = drainedCh.Signaled()
}
<-timeToStopHttpServerCh
}()
...
// 2. nonLongRunningRequestDrainedCh is closed after
// NonLongRunningRequestWaitGroup.Wait has returned.
// wait for all in-flight non-long running requests to finish
nonLongRunningRequestDrainedCh := make(chan struct{})
go func() {
defer close(nonLongRunningRequestDrainedCh)
defer klog.V(1).Info("[graceful-termination] in-flight non long-running request(s) have drained")
// wait for the delayed stopCh before closing the handler chain (it rejects everything after Wait has been called).
<-notAcceptingNewRequestCh.Signaled()
s.NonLongRunningRequestWaitGroup.Wait()
}()
// 3. activeWatchesDrainedCh is closed after WatchRequestWaitGroup.Wait has
// returned, or right away when no watch termination grace period is set.
// wait for all in-flight watches to finish
activeWatchesDrainedCh := make(chan struct{})
go func() {
defer close(activeWatchesDrainedCh)
<-notAcceptingNewRequestCh.Signaled()
// A non-positive grace period means active watches are not waited for.
if s.ShutdownWatchTerminationGracePeriod <= time.Duration(0) {
klog.V(1).InfoS("[graceful-termination] not going to wait for active watch request(s) to drain")
return
}
// Wait for all active watches to finish
grace := s.ShutdownWatchTerminationGracePeriod
activeBefore, activeAfter, err := s.WatchRequestWaitGroup.Wait(func(count int) (utilwaitgroup.RateLimiter, context.Context, context.CancelFunc) {
// The rate that would spread count watches evenly over the grace period.
qps := float64(count) / grace.Seconds()
// TODO: we don't want the QPS (max requests drained per second) to
// get below a certain floor value, since we want the server to
// drain the active watch requests as soon as possible.
// For now, it's hard coded to 200, and it is subject to change
// based on the result from the scale testing.
if qps < 200 {
qps = 200
}
// NOTE(review): ctx expires after the grace period; presumably Wait
// stops waiting once it does — confirm against WatchRequestWaitGroup.Wait.
ctx, cancel := context.WithTimeout(context.Background(), grace)
// We don't expect more than one token to be consumed
// in a single Wait call, so setting burst to 1.
return rate.NewLimiter(rate.Limit(qps), 1), ctx, cancel
})
klog.V(1).InfoS("[graceful-termination] active watch request(s) have drained",
"duration", grace, "activeWatchesBefore", activeBefore, "activeWatchesAfter", activeAfter, "error", err)
}()
...
}
5)nonLongRunningRequestDrainedCh和activeWatchesDrainedCh共同触发drainedCh,再根据配置触发stopHttpServerCh:
// Run (excerpt, elided parts shown as ...): how drainedCh is signaled once both
// nonLongRunningRequestDrainedCh and activeWatchesDrainedCh are closed, and
// what the main goroutine does after that.
func (s preparedGenericAPIServer) Run(stopCh <-chan struct{}) error {
...
notAcceptingNewRequestCh := s.lifecycleSignals.NotAcceptingNewRequest
drainedCh := s.lifecycleSignals.InFlightRequestsDrained
stopHttpServerCh := make(chan struct{})
go func() {
defer close(stopHttpServerCh)
// By default the HTTP server is stopped on notAcceptingNewRequestCh; with
// ShutdownSendRetryAfter set it is stopped on drainedCh instead.
timeToStopHttpServerCh := notAcceptingNewRequestCh.Signaled()
if s.ShutdownSendRetryAfter {
timeToStopHttpServerCh = drainedCh.Signaled()
}
<-timeToStopHttpServerCh
}()
// Start the audit backend before any request comes in. This means we must call Backend.Run
// before http server start serving. Otherwise the Backend.ProcessEvents call might block.
// AuditBackend.Run will stop as soon as all in-flight requests are drained.
if s.AuditBackend != nil {
if err := s.AuditBackend.Run(drainedCh.Signaled()); err != nil {
return fmt.Errorf("failed to run the audit backend: %v", err)
}
}
...
go func() {
// Deferred calls run in reverse order when this goroutine returns:
// drainedCh is signaled first, then the log line is emitted.
defer klog.V(1).InfoS("[graceful-termination] shutdown event", "name", drainedCh.Name())
defer drainedCh.Signal()
<-nonLongRunningRequestDrainedCh
<-activeWatchesDrainedCh
}()
...
// The audit backend is shut down only after drainedCh has been signaled.
<-drainedCh.Signaled()
if s.AuditBackend != nil {
s.AuditBackend.Shutdown()
klog.V(1).InfoS("[graceful-termination] audit backend shutdown completed")
}
// wait for stoppedCh that is closed when the graceful termination (server.Shutdown) is finished.
<-listenerStoppedCh
<-stoppedCh
klog.V(1).Info("[graceful-termination] apiserver is exiting")
return nil
}
drainedCh被触发之后,Run方法还会调用AuditBackend::Shutdown方法。
6)接收到stopHttpServerCh之后,HTTP server会触发优雅关闭(server.Shutdown),并通过listenerStoppedCh和stoppedCh通知关闭进度:
// Run (excerpt, elided parts shown as ...): stopHttpServerCh and
// shutdownTimeout are handed to NonBlockingRun, which returns the stoppedCh
// and listenerStoppedCh channels that Run waits on later.
func (s preparedGenericAPIServer) Run(stopCh <-chan struct{}) error {
...
stoppedCh, listenerStoppedCh, err := s.NonBlockingRun(stopHttpServerCh, shutdownTimeout)
...
}
这里不再具体阐述(secure_serving.go#RunServer)。
7)接收到listenerStoppedCh之后会触发HTTPServerStoppedListening信号,并在AuditBackend::Shutdown执行完之后由listenerStoppedCh、stoppedCh共同触发server的销毁。
// Run (excerpt, elided parts shown as ...): the tail of the shutdown sequence.
// The main goroutine waits for drainedCh, shuts down the audit backend if one
// is configured, waits for listenerStoppedCh and stoppedCh, and returns; the
// deferred Destroy then runs.
func (s preparedGenericAPIServer) Run(stopCh <-chan struct{}) error {
...
// Clean up resources on shutdown.
defer s.Destroy()
...
// Wait for all requests in flight to drain, bounded by the RequestTimeout variable.
<-drainedCh.Signaled()
if s.AuditBackend != nil {
s.AuditBackend.Shutdown()
klog.V(1).InfoS("[graceful-termination] audit backend shutdown completed")
}
// wait for stoppedCh that is closed when the graceful termination (server.Shutdown) is finished.
<-listenerStoppedCh
<-stoppedCh
klog.V(1).Info("[graceful-termination] apiserver is exiting")
return nil
}