代码解读
关键数据结构
PodLifecycleEvent
// PodLifecycleEvent is an event that reflects the change of the pod state.
type PodLifecycleEvent struct {
// The pod ID.
ID types.UID
// The type of the event.
Type PodLifeCycleEventType
// The accompanied data which varies based on the event type.
// - ContainerStarted/ContainerStopped: the container name (string).
// - All other event types: unused.
// NOTE(review): the evented PLEG code quoted later in this article fills
// this with event.ContainerId (the container ID, not the name), and the
// syncLoopIteration excerpt reads it as a container ID for ContainerDied —
// confirm against the upstream source tree.
Data interface{}
}
代表pod 状态改变这一事件
只包含三个属性
- ID types.UID - 代表pod的UID
- Type PodLifeCycleEventType - 事件类型。这里需要配合PodRecord结构体一起理解,
the new state of container表示podRecord.current中的state,the old state of container表示podRecord.old中的state。其中ContainerDied和ContainerRemoved的区别就比较好理解了,当第一次看到container exit的时候是ContainerDied,再次看到container exit的时候就是ContainerRemoved了。
// ContainerStarted - event type when the new state of container is running.
ContainerStarted PodLifeCycleEventType = "ContainerStarted"
// ContainerDied - event type when the new state of container is exited.
ContainerDied PodLifeCycleEventType = "ContainerDied"
// ContainerRemoved - event type when the old state of container is exited.
ContainerRemoved PodLifeCycleEventType = "ContainerRemoved"
// PodSync is used to trigger syncing of a pod when the observed change of
// the state of the pod cannot be captured by any single event above.
PodSync PodLifeCycleEventType = "PodSync"
// ContainerChanged - event type when the new state of container is unknown.
ContainerChanged PodLifeCycleEventType = "ContainerChanged"
// ConditionMet - event type triggered when any number of watch conditions are met.
ConditionMet PodLifeCycleEventType = "ConditionMet"
- Data interface{} - 事件附带的数据。从本文后面引用的代码看,ContainerStarted/ContainerDied/ContainerRemoved 事件会使用该字段:GenericPLEG 路径放的是 container name,EventedPLEG 路径放的是 container ID;其他事件类型不使用该字段。
podRecord和podRecords
// podRecord keeps the previous (old) and the latest (current) observed
// state of a single pod; diffing the two yields pod lifecycle events.
type podRecord struct {
// State observed in the previous relist.
old *kubecontainer.Pod
// State observed in the current relist; nil when the pod was not seen.
current *kubecontainer.Pod
}
// podRecords maps a pod UID to its record.
type podRecords map[types.UID]*podRecord
podRecord存储了pod的状态,包含了上一次和当前两个状态,通过对比old和current可以知道container的change是什么。
podRecords是一个map,存储pod的uid到podRecord的映射。其中需要注意的两个函数是 setCurrent 和 update。
// setCurrent clears every record's current state, then stores the freshly
// listed pods as the current state, creating records for unseen pods.
// A pod present in pr but missing from pods keeps current == nil.
func (pr podRecords) setCurrent(pods []*kubecontainer.Pod) {
	for uid := range pr {
		pr[uid].current = nil
	}
	for _, p := range pods {
		rec, exists := pr[p.ID]
		if !exists {
			pr[p.ID] = &podRecord{current: p}
			continue
		}
		rec.current = p
	}
}
setCurrent 会首先把map中的所有pods的current都设置成nil,然后把传参存到map中,如果map中已经存在对应的podRecord,那么只设置current属性,如果不存在,那么就创建一个新的podRecord。这里值得注意的是,如果传参pods中不存在某个pod,而podRecords map中存在,那么这个pod的podRecord的current就会是nil,而old则是上一次拿到的pod的state。
// update promotes the record for id from current to old state (or removes
// it when the pod no longer exists); a missing id is a no-op.
func (pr podRecords) update(id types.UID) {
	if rec, found := pr[id]; found {
		pr.updateInternal(id, rec)
	}
}
// updateInternal rotates the record: the current state becomes the old
// state and current is reset. A nil current means the pod disappeared from
// the runtime, so its entry is dropped entirely.
func (pr podRecords) updateInternal(id types.UID, r *podRecord) {
	if r.current == nil {
		// Pod no longer exists; delete the entry.
		delete(pr, id)
		return
	}
	r.old, r.current = r.current, nil
}
update 函数会调用 updateInternal 函数,其将pod 的current state转成old state。
cache
cache缓存pod的最新状态,并且实现subscribe机制,当pod有change的时候,通知subscriber。
// cache stores the latest observed status of each pod and implements a
// subscribe mechanism: subscribers are notified once a status newer than
// their requested time is available.
// NOTE(review): the mutex used as c.lock in subscribe() below is elided
// from this excerpt of the struct.
type cache struct {
// Map that stores the pod statuses.
pods map[types.UID]*data
// Global timestamp: every pod status held in the cache is guaranteed to
// be at least as fresh as this time.
timestamp *time.Time
// Map that stores the subscriber records.
subscribers map[types.UID][]*subRecord
}
// data is one cache entry: a pod status (or the error hit while inspecting
// the pod) plus the time the entry was last modified.
type data struct {
// Status of the pod.
status *PodStatus
// Error got when trying to inspect the pod.
err error
// Time when the data was last modified.
modified time.Time
}
// subRecord is one pending subscription created via GetNewerThan/subscribe.
type subRecord struct {
// The subscriber wants a pod status observed at or after this time.
time time.Time
// Channel through which the status is delivered to the subscriber.
ch chan *data
}
这里简单介绍一下subscribe机制是怎么实现的。首先,subscriber通过调用函数 GetNewerThan 发起订阅,其内部调用 subscribe 创建一个容量为1的channel,并阻塞等待从该channel中取数据:
// GetNewerThan blocks until a pod status observed at or after minTime is
// available for id, then returns that status and any inspection error.
func (c *cache) GetNewerThan(id types.UID, minTime time.Time) (*PodStatus, error) {
	result := <-c.subscribe(id, minTime)
	return result.status, result.err
}
// subscribe returns a buffered channel (capacity 1) that will receive the
// pod's status once a status newer than timestamp exists. If such a status
// is already cached it is delivered immediately; otherwise a subscription
// record is queued for notify() to fulfil later.
func (c *cache) subscribe(id types.UID, timestamp time.Time) chan *data {
	ch := make(chan *data, 1)
	c.lock.Lock()
	defer c.lock.Unlock()
	if d := c.getIfNewerThan(id, timestamp); d != nil {
		// Cache entry is already fresh enough; deliver without waiting.
		ch <- d
		return ch
	}
	// Not fresh enough yet: remember the subscription for later delivery.
	c.subscribers[id] = append(c.subscribers[id], &subRecord{time: timestamp, ch: ch})
	return ch
}
subscribe 函数尝试从cache中获取timestamp之后的pod的status,如果拿到了直接放入 ch 中,如果没拿到,就会往 subscribers 中存一个 subRecord 。然后就是从 ch 中拿pod status,由于 ch 是一个容量为1的channel,所以在没有数据的时候,会直接阻塞住。直到调用 notify 函数,往 ch 中放入pod status,这样函数 GetNewerThan 就会返回最新的pod status,进而通知到subscriber。
// notify delivers the current cached status of id to every subscriber whose
// requested time has been reached (i.e. is not after timestamp), closing
// their channels; subscribers still waiting for a newer status are kept.
func (c *cache) notify(id types.UID, timestamp time.Time) {
	subs, ok := c.subscribers[id]
	if !ok {
		// No one to notify.
		return
	}
	remaining := []*subRecord{}
	for _, rec := range subs {
		if timestamp.Before(rec.time) {
			// Requested time not reached yet; keep the record for later.
			remaining = append(remaining, rec)
			continue
		}
		// Requirement met: hand the cached pod status to the subscriber.
		rec.ch <- c.get(id)
		close(rec.ch)
	}
	if len(remaining) == 0 {
		delete(c.subscribers, id)
	} else {
		c.subscribers[id] = remaining
	}
}
最后 notify会在两种情况下被调用:
- 在GenericPLEG relist之后,遍历每一个pod,此时如果该pod的status改变了,会更新cache中该pod的status,此时notify会被调用
- 在GenericPLEG每次relist之后,会把cache的global timestamp,也就是cache.timestamp更新成本次relist开始时的时间戳,此时会遍历subscribers,调用notify函数,给那些expected time已经到了的subscriber放入最新的pod status。
GenericPLEG
// GenericPLEG periodically relists pods from the container runtime, diffs
// the result against podRecords, and emits PodLifecycleEvents on
// eventChannel. (Excerpt; remaining fields elided.)
//
// Fix: the original excerpt declared the field `relistTime atomic.Value`
// twice, which is invalid Go (duplicate struct field name); the second
// occurrence is removed. The truncated comment on podsToReinspect is also
// completed.
type GenericPLEG struct {
// The container runtime.
runtime kubecontainer.Runtime
// The channel from which the subscriber listens events.
eventChannel chan *PodLifecycleEvent
// The internal cache for pod/container information.
podRecords podRecords
// Time of the last relisting.
relistTime atomic.Value
// Cache for storing the runtime states required for syncing pods.
cache kubecontainer.Cache
// Pods that failed inspection during the previous relist; they will be
// retried during the next relisting.
podsToReinspect map[types.UID]*kubecontainer.Pod
// Indicates relisting related parameters
relistDuration *RelistDuration
...
}
GenericPLEG 周期地从container runtime获取pods列表,跟 podRecords 中存储的之前状态进行对比,生成pod lifecycle event,并放入 eventChannel 中。
GenericPLEG最关键的是 Relist() 函数,
// Relist queries the container runtime for list of pods/containers, compare
// with the internal pods/containers, and generates events accordingly.
func (g *GenericPLEG) Relist() {
...
// Timestamp of this relist; it is also used afterwards to advance the
// cache's global timestamp.
timestamp := g.clock.Now()
// Fetch all pods (the `true` argument includes non-running ones) from the
// container runtime.
podList, err := g.runtime.GetPods(ctx, true)
if err != nil {
g.logger.Error(err, "GenericPLEG: Unable to retrieve pods")
return
}
// Record the time of this relist right away.
g.updateRelistTime(timestamp)
pods := kubecontainer.Pods(podList)
// Refresh every record's current state: a pod present in podRecords but
// absent from pods ends up with podRecord.current == nil.
g.podRecords.setCurrent(pods)
// Pods whose status could not be fetched this round; they will be
// inspected again on the next relist.
needsReinspection := make(map[types.UID]*kubecontainer.Pod)
for pid := range g.podRecords {
// Compare the old and the current pods, and generate events.
oldPod := g.podRecords.getOld(pid)
pod := g.podRecords.getCurrent(pid)
// Get all containers in the old and the new pod.
allContainers := getContainersFromPods(oldPod, pod)
var events []*PodLifecycleEvent
for _, container := range allContainers {
// Compute the lifecycle events for this container and collect them.
containerEvents := computeEvents(g.logger, oldPod, pod, &container.ID)
events = append(events, containerEvents...)
}
_, reinspect := g.podsToReinspect[pid]
// NOTE(review): watchConditions is populated by code elided from this
// excerpt (per-pod watch conditions) — confirm against upstream source.
if len(events) == 0 && len(watchConditions) == 0 && !reinspect {
// Nothing else needed for this pod.
continue
}
// updateCache() will inspect the pod and update the cache.
status, updated, err := g.updateCache(ctx, pod, pid)
if err != nil {
// make sure we try to reinspect the pod during the next relisting
needsReinspection[pid] = pod
continue
} else if utilfeature.DefaultFeatureGate.Enabled(features.EventedPLEG) {
if !updated {
continue
}
}
// Update the internal storage
g.podRecords.update(pid)
}
// Update the cache global timestamp.
g.cache.UpdateTime(timestamp)
// make sure we retain the list of pods that need reinspecting the next time relist is called
g.podsToReinspect = needsReinspection
}
由上面的code可知, Relist 函数主要做两件事:
- 用container runtime获取pods的status,然后对比前后变化,生成pod lifecycle event
- 更新pod cache
EventedPLEG
// EventedPLEG consumes container lifecycle events streamed from the CRI
// runtime instead of relying solely on periodic relisting; it can fall
// back to the generic PLEG when the event stream fails. (Excerpt; some
// fields elided.)
type EventedPLEG struct {
// The container runtime.
runtime kubecontainer.Runtime
// The runtime service.
runtimeService internalapi.RuntimeService
// The channel from which the subscriber listens events.
eventChannel chan *PodLifecycleEvent
// Cache for storing the runtime states required for syncing pods.
cache kubecontainer.Cache
// GenericPLEG is used to force relist when required.
genericPleg podLifecycleEventGeneratorHandler
// The maximum number of retries when getting container events from the runtime.
eventedPlegMaxStreamRetries int
// Indicates relisting related parameters
relistDuration *RelistDuration
...
}
EventedPLEG与GenericPLEG最大的区别就是,EventedPLEG是基于event的,而GenericPLEG是周期性的主动list pods,当然EventedPLEG也还保留了周期性主动list pods的机制作为兜底。
从数据结构上看,EventedPLEG删掉了podRecords,因为其已经不再需要对比pod前后两次的状态了。
EventedPLEG会一直运行函数 watchEventsChannel
// watchEventsChannel streams container events from the CRI runtime into a
// local channel and processes them; after too many consecutive stream
// failures it falls back to the generic PLEG.
func (e *EventedPLEG) watchEventsChannel() {
containerEventsResponseCh := make(chan *runtimeapi.ContainerEventResponse, cap(e.eventChannel))
defer close(containerEventsResponseCh)
// Get the container events from the runtime.
go func() {
numAttempts := 0
for {
if numAttempts >= e.eventedPlegMaxStreamRetries {
// Too many failed attempts: stop the evented PLEG, restore the
// generic PLEG's relist duration, and restart the generic PLEG.
if isEventedPLEGInUse() {
e.Stop()
e.genericPleg.Stop()
e.Update(e.relistDuration)
e.genericPleg.Start()
break
}
}
// Stream container events from the runtime; this call blocks until
// the stream breaks or returns an error.
err := e.runtimeService.GetContainerEvents(context.Background(), containerEventsResponseCh, func(runtimeapi.RuntimeService_GetContainerEventsClient) {
metrics.EventedPLEGConn.Inc()
})
if err != nil {
numAttempts++
// The stream failed: force a relist so no state change is missed
// while the stream was down, then retry the stream.
e.Relist()
}
}
}()
if isEventedPLEGInUse() {
// Process the container events received on the channel above.
e.processCRIEvents(containerEventsResponseCh)
}
}
watchEventsChannel 函数会进一步调用 processCRIEvents 函数处理拿到的events
// processCRIEvents consumes container events from the channel, updates or
// deletes the pod cache entry accordingly, and forwards a pod lifecycle
// event when the event is a deletion or is newer than the cached state.
func (e *EventedPLEG) processCRIEvents(containerEventsResponseCh chan *runtimeapi.ContainerEventResponse) {
for event := range containerEventsResponseCh {
...
podID := types.UID(event.PodSandboxStatus.Metadata.Uid)
shouldSendPLEGEvent := false
status, err := e.runtime.GeneratePodStatus(event)
...
// On a delete event for the sandbox itself, drop the pod from the cache.
if event.ContainerEventType == runtimeapi.ContainerEventType_CONTAINER_DELETED_EVENT {
for _, sandbox := range status.SandboxStatuses {
if sandbox.Id == event.ContainerId {
e.cache.Delete(podID)
}
}
shouldSendPLEGEvent = true
} else {
// Cache the status; Set reports whether this event is newer than the
// cached entry, and only then is the event forwarded.
if e.cache.Set(podID, status, err, time.Unix(0, event.GetCreatedAt())) {
shouldSendPLEGEvent = true
}
}
if shouldSendPLEGEvent {
e.processCRIEvent(event)
}
}
}
processCRIEvents 函数遍历所有的event,只有当下面两种情况之一出现时,才发出event:
- 是container delete event
- event是新的,不是过去某个时间的event
processCRIEvents 调用 processCRIEvent 函数,将container event封装成 pod lifecycle event,并写到eventChannel中。
// processCRIEvent translates a single CRI container event into the
// corresponding PodLifecycleEvent(s) and publishes them on eventChannel.
func (e *EventedPLEG) processCRIEvent(event *runtimeapi.ContainerEventResponse) {
	uid := types.UID(event.PodSandboxStatus.Metadata.Uid)
	switch event.ContainerEventType {
	case runtimeapi.ContainerEventType_CONTAINER_STOPPED_EVENT:
		// A stopped container maps to ContainerDied.
		e.sendPodLifecycleEvent(&PodLifecycleEvent{ID: uid, Type: ContainerDied, Data: event.ContainerId})
	case runtimeapi.ContainerEventType_CONTAINER_CREATED_EVENT:
		// Creation alone does not change pod lifecycle state; ignore.
	case runtimeapi.ContainerEventType_CONTAINER_STARTED_EVENT:
		// A started container maps to ContainerStarted.
		e.sendPodLifecycleEvent(&PodLifecycleEvent{ID: uid, Type: ContainerStarted, Data: event.ContainerId})
	case runtimeapi.ContainerEventType_CONTAINER_DELETED_EVENT:
		// Deletion implies the container exited and was then removed, so
		// both ContainerDied and ContainerRemoved are emitted.
		e.sendPodLifecycleEvent(&PodLifecycleEvent{ID: uid, Type: ContainerDied, Data: event.ContainerId})
		e.sendPodLifecycleEvent(&PodLifecycleEvent{ID: uid, Type: ContainerRemoved, Data: event.ContainerId})
	}
}
主要作用分析
PLEG有GenericPLEG和EventedPLEG两个版本,由feature gate EventedPLEG 决定到底使用的是哪个。
在 NewMainKubelet 函数中,创建出需要的pleg
// NewMainKubelet instantiates the kubelet; only the PLEG-related wiring is
// shown in this excerpt.
func NewMainKubelet(kubeCfg *kubeletconfiginternal.KubeletConfiguration,...) (*Kubelet, error){
...
klet := &Kubelet{
hostname: hostname,
hostnameOverridden: hostnameOverridden,
nodeName: nodeName,
...
}
...
if utilfeature.DefaultFeatureGate.Enabled(features.EventedPLEG) {
// NOTE(review): the genericRelistDuration used just below is defined in
// code elided here (presumably with evented-specific period/threshold) —
// confirm against the upstream source.
...
klet.pleg = pleg.NewGenericPLEG(logger, klet.containerRuntime, eventChannel, genericRelistDuration, klet.podCache, clock.RealClock{})
eventedRelistDuration := &pleg.RelistDuration{
RelistPeriod: genericPlegRelistPeriod,
RelistThreshold: genericPlegRelistThreshold,
}
// EventedPLEG feature gate on: create the evented PLEG as well.
klet.eventedPleg, err = pleg.NewEventedPLEG(logger, klet.containerRuntime, klet.runtimeService, eventChannel,
klet.podCache, klet.pleg, eventedPlegMaxStreamRetries, eventedRelistDuration, clock.RealClock{})
} else {
genericRelistDuration := &pleg.RelistDuration{
RelistPeriod: genericPlegRelistPeriod,
RelistThreshold: genericPlegRelistThreshold,
}
// Feature gate off: create only the generic PLEG.
klet.pleg = pleg.NewGenericPLEG(logger, klet.containerRuntime, eventChannel, genericRelistDuration, klet.podCache, clock.RealClock{})
}
...
}
在 syncLoop 中,从pleg的event channel中取event,并处理
// syncLoop is the kubelet's main loop: it repeatedly runs one iteration of
// syncLoopIteration over the config, sync, housekeeping, and PLEG channels.
func (kl *Kubelet) syncLoop(ctx context.Context, updates <-chan kubetypes.PodUpdate, handler SyncHandler) {
...
// Obtain the channel on which the PLEG publishes pod lifecycle events.
plegCh := kl.pleg.Watch()
...
for {
...
if !kl.syncLoopIteration(ctx, updates, handler, syncTicker.C, housekeepingTicker.C, plegCh) {
break
}
...
}
}
syncLoopIteration 函数是一个select,其中一个case会从plegCh中取event处理。
// syncLoopIteration runs one select over the kubelet's input channels; this
// excerpt shows only the case that consumes PLEG events.
func (kl *Kubelet) syncLoopIteration(ctx context.Context, configCh <-chan kubetypes.PodUpdate, handler SyncHandler,
syncCh <-chan time.Time, housekeepingCh <-chan time.Time, plegCh <-chan *pleg.PodLifecycleEvent) bool {
select {
...
case e := <-plegCh:
if isSyncPodWorthy(e) {
// PLEG event for a pod; sync it.
if pod, ok := kl.podManager.GetPodByUID(e.ID); ok {
klog.V(2).InfoS("SyncLoop (PLEG): event for pod", "pod", klog.KObj(pod), "event", e)
handler.HandlePodSyncs([]*v1.Pod{pod})
}
}
// For ContainerDied, Data carries the container ID (string): clean up
// the dead container right away.
if e.Type == pleg.ContainerDied {
if containerID, ok := e.Data.(string); ok {
kl.cleanUpContainersInPod(e.ID, containerID)
}
}
...
}
}
// isSyncPodWorthy reports whether the event should trigger a pod sync.
// ContainerRemoved is filtered out because it doesn't affect pod state.
func isSyncPodWorthy(event *pleg.PodLifecycleEvent) bool {
	if event.Type == pleg.ContainerRemoved {
		return false
	}
	return true
}