PLEG - Pod Lifecycle Event Generator


Key Data Structures

PodLifecycleEvent

// PodLifecycleEvent is an event that reflects the change of the pod state.
type PodLifecycleEvent struct {
    // The pod ID.
    ID types.UID
    // The type of the event.
    Type PodLifeCycleEventType
    // The accompanied data which varies based on the event type.
    //   - ContainerStarted/ContainerStopped: the container name (string).
    //   - All other event types: unused.
    Data interface{}
}

Represents the event of a pod's state changing.

It contains only three fields:

  • ID types.UID - the pod's UID.
  • Type PodLifeCycleEventType - the event type. This is best understood together with the podRecord struct below: "the new state of container" in these comments means the state found in podRecord.current, and "the old state of container" means the state found in podRecord.old. With that, the difference between ContainerDied and ContainerRemoved becomes clear: the first time a container is observed as exited, the event is ContainerDied; when the exited container is observed again (now gone), the event is ContainerRemoved.
// ContainerStarted - event type when the new state of container is running.
ContainerStarted PodLifeCycleEventType = "ContainerStarted"
// ContainerDied - event type when the new state of container is exited.
ContainerDied PodLifeCycleEventType = "ContainerDied"
// ContainerRemoved - event type when the old state of container is exited.
ContainerRemoved PodLifeCycleEventType = "ContainerRemoved"
// PodSync is used to trigger syncing of a pod when the observed change of
// the state of the pod cannot be captured by any single event above.
PodSync PodLifeCycleEventType = "PodSync"
// ContainerChanged - event type when the new state of container is unknown.
ContainerChanged PodLifeCycleEventType = "ContainerChanged"
// ConditionMet - event type triggered when any number of watch conditions are met.
ConditionMet PodLifeCycleEventType = "ConditionMet"
  • Data interface{} - currently used only by the ContainerStarted/ContainerStopped events; the value is the container name.

podRecord and podRecords

type podRecord struct {
    old     *kubecontainer.Pod
    current *kubecontainer.Pod
}
type podRecords map[types.UID]*podRecord

podRecord stores a pod's state in two snapshots, the previous one (old) and the current one (current); comparing old against current reveals what changed in the pod's containers.

podRecords is a map from pod UID to podRecord. The two functions worth noting are setCurrent and update.

func (pr podRecords) setCurrent(pods []*kubecontainer.Pod) {
    for i := range pr {
        pr[i].current = nil
    }
    for _, pod := range pods {
        if r, ok := pr[pod.ID]; ok {
            r.current = pod
        } else {
            pr[pod.ID] = &podRecord{current: pod}
        }
    }
}

setCurrent first resets current to nil for every record in the map, then stores the pods passed in: if a podRecord already exists for a pod, only its current field is set; otherwise a new podRecord is created. Note that if a pod exists in the podRecords map but is absent from the pods argument, its podRecord.current stays nil while old still holds the state from the previous relist.

func (pr podRecords) update(id types.UID) {
    r, ok := pr[id]
    if !ok {
        return
    }
    pr.updateInternal(id, r)
}

func (pr podRecords) updateInternal(id types.UID, r *podRecord) {
    if r.current == nil {
        // Pod no longer exists; delete the entry.
        delete(pr, id)
        return
    }
    r.old = r.current
    r.current = nil
}

update calls updateInternal, which promotes the pod's current state to its old state; if current is nil (the pod no longer exists), the record is deleted from the map.
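The setCurrent/update cycle above can be sketched with a minimal, hypothetical re-implementation: Pod here is a stand-in for kubecontainer.Pod, and plain string IDs replace types.UID.

```go
package main

import "fmt"

// Pod is a stand-in for kubecontainer.Pod, holding only an ID and a state.
type Pod struct {
    ID    string
    State string
}

type podRecord struct {
    old     *Pod
    current *Pod
}

type podRecords map[string]*podRecord

// setCurrent clears every record's current pointer, then fills in the pods
// observed in this relist. A pod present in the map but absent from pods
// ends up with current == nil.
func (pr podRecords) setCurrent(pods []*Pod) {
    for id := range pr {
        pr[id].current = nil
    }
    for _, pod := range pods {
        if r, ok := pr[pod.ID]; ok {
            r.current = pod
        } else {
            pr[pod.ID] = &podRecord{current: pod}
        }
    }
}

// update promotes current to old; a nil current means the pod is gone, so
// the whole record is deleted.
func (pr podRecords) update(id string) {
    r, ok := pr[id]
    if !ok {
        return
    }
    if r.current == nil {
        delete(pr, id)
        return
    }
    r.old = r.current
    r.current = nil
}

func main() {
    pr := podRecords{}
    // First relist sees pod "a" running; update moves it into old.
    pr.setCurrent([]*Pod{{ID: "a", State: "running"}})
    pr.update("a")
    // Second relist no longer sees pod "a": current stays nil, old survives.
    pr.setCurrent(nil)
    fmt.Println(pr["a"].old.State, pr["a"].current == nil) // running true
    // update then drops the record entirely.
    pr.update("a")
    fmt.Println(len(pr)) // 0
}
```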

cache

The cache stores the latest status of each pod and implements a subscription mechanism: when a pod changes, its subscribers are notified.

type cache struct {
    // Map that stores the pod statuses.
    pods map[types.UID]*data
    // Global timestamp: every pod state in the cache was updated at or after this time.
    timestamp *time.Time
    // Map that stores the subscriber records.
    subscribers map[types.UID][]*subRecord
}
type data struct {
    // Status of the pod.
    status *PodStatus
    // Error got when trying to inspect the pod.
    err error
    // Time when the data was last modified.
    modified time.Time
}

type subRecord struct {
    // The subscriber wants the pod state observed after this time.
    time time.Time
    // The channel through which the data is delivered to the subscriber.
    ch   chan *data
}

Here is a brief look at how the subscription mechanism works. First, a subscriber calls GetNewerThan, which creates a subscription:

func (c *cache) GetNewerThan(id types.UID, minTime time.Time) (*PodStatus, error) {
    ch := c.subscribe(id, minTime)
    d := <-ch
    return d.status, d.err
}
func (c *cache) subscribe(id types.UID, timestamp time.Time) chan *data {
    ch := make(chan *data, 1)
    c.lock.Lock()
    defer c.lock.Unlock()
    d := c.getIfNewerThan(id, timestamp)
    if d != nil {
        // If the cache entry is ready, send the data and return immediately.
        ch <- d
        return ch
    }
    // Add the subscription record.
    c.subscribers[id] = append(c.subscribers[id], &subRecord{time: timestamp, ch: ch})
    return ch
}

subscribe tries to fetch from the cache a pod status newer than timestamp. If one is found, it is placed directly into ch; otherwise a subRecord is appended to subscribers. The caller then reads the pod status from ch; since ch is a channel with capacity 1, the read blocks while there is no data, until notify puts a pod status into the channel. At that point GetNewerThan returns the latest pod status, notifying the subscriber.

func (c *cache) notify(id types.UID, timestamp time.Time) {
    list, ok := c.subscribers[id]
    if !ok {
        // No one to notify.
        return
    }
    newList := []*subRecord{}
    for i, r := range list {
        // If the subscriber's required time has not been reached yet, keep
        // the record in subscribers and wait for a later notify.
        if timestamp.Before(r.time) {
            // Doesn't meet the time requirement; keep the record.
            newList = append(newList, list[i])
            continue
        }
        // The required time has been reached: deliver the pod status on ch.
        r.ch <- c.get(id)
        close(r.ch)
    }
        close(r.ch)
    }
    if len(newList) == 0 {
        delete(c.subscribers, id)
    } else {
        c.subscribers[id] = newList
    }
}

Finally, notify is called in two situations:

  • After a GenericPLEG relist, every pod is iterated over; if a pod's status changed, its status in the cache is updated and notify is called.
  • Also after each relist, GenericPLEG updates the cache's global timestamp (cache.timestamp) to the timestamp taken just before the relist; this iterates over subscribers and calls notify, delivering the latest pod status to those subscribers whose expected time has been reached.

GenericPLEG

type GenericPLEG struct {
    // The container runtime.
    runtime kubecontainer.Runtime
    // The channel from which the subscriber listens events.
    eventChannel chan *PodLifecycleEvent
    // The internal cache for pod/container information.
    podRecords podRecords
    // Time of the last relisting.
    relistTime atomic.Value
    // Cache for storing the runtime states required for syncing pods.
    cache kubecontainer.Cache
    // Pods whose status failed to be retrieved during the last relist; they are
    // retried during the next relisting.
    podsToReinspect map[types.UID]*kubecontainer.Pod
    // Indicates relisting related parameters
    relistDuration *RelistDuration
    ...
}

GenericPLEG periodically fetches the pod list from the container runtime, compares it with the previous state stored in podRecords, generates pod lifecycle events, and puts them into eventChannel.

The heart of GenericPLEG is the Relist() function:

// Relist queries the container runtime for list of pods/containers, compare
// with the internal pods/containers, and generates events accordingly.
func (g *GenericPLEG) Relist() {
  ...
    // The time of this relist; later also used as the cache's global timestamp.
    timestamp := g.clock.Now()

    // Fetch all pods from the container runtime.
    podList, err := g.runtime.GetPods(ctx, true)
    if err != nil {
        g.logger.Error(err, "GenericPLEG: Unable to retrieve pods")
        return
    }
    // Record the time of this relist immediately.
    g.updateRelistTime(timestamp)

    pods := kubecontainer.Pods(podList)
    // Update current for every pod in podRecords; a pod that exists in podRecords but not in pods gets podRecord.current == nil.
    g.podRecords.setCurrent(pods)

    // Pods whose status could not be fetched this time; they are inspected again on the next relist.
    needsReinspection := make(map[types.UID]*kubecontainer.Pod)

    for pid := range g.podRecords {
        // Compare the old and the current pods, and generate events.
        oldPod := g.podRecords.getOld(pid)
        pod := g.podRecords.getCurrent(pid)
        // Get all containers in the old and the new pod.
        allContainers := getContainersFromPods(oldPod, pod)
        var events []*PodLifecycleEvent
        for _, container := range allContainers {
            // Generate pod lifecycle events and collect them in events.
            containerEvents := computeEvents(g.logger, oldPod, pod, &container.ID)
            events = append(events, containerEvents...)
        }

        _, reinspect := g.podsToReinspect[pid]

        if len(events) == 0 && len(watchConditions) == 0 && !reinspect {
            // Nothing else needed for this pod.
            continue
        }

        // updateCache() will inspect the pod and update the cache. 
        status, updated, err := g.updateCache(ctx, pod, pid)
        if err != nil {
            // make sure we try to reinspect the pod during the next relisting
            needsReinspection[pid] = pod

            continue
        } else if utilfeature.DefaultFeatureGate.Enabled(features.EventedPLEG) {
            if !updated {
                continue
            }
        }

        // Update the internal storage
        g.podRecords.update(pid)
    }

    // Update the cache global timestamp.
    g.cache.UpdateTime(timestamp)

    // make sure we retain the list of pods that need reinspecting the next time relist is called
    g.podsToReinspect = needsReinspection
}

As the code above shows, Relist does two main things:

  • fetches pod status from the container runtime, compares it with the previous state, and generates pod lifecycle events
  • updates the pod cache
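The per-container comparison that produces these events can be sketched as a pure function over the old and new container state. This is a hypothetical simplification of the real computeEvents/generateEvents logic, with states written as plain strings:

```go
package main

import "fmt"

// containerEvent derives the pod lifecycle event type from a container's
// state in the old pod (podRecord.old) versus the current pod
// (podRecord.current). "non-existent" means the container is absent.
func containerEvent(oldState, newState string) string {
    if newState == oldState {
        return "" // no change, no event
    }
    switch newState {
    case "running":
        return "ContainerStarted" // new state of container is running
    case "exited":
        return "ContainerDied" // new state of container is exited
    case "unknown":
        return "ContainerChanged" // new state of container is unknown
    case "non-existent":
        if oldState == "exited" {
            return "ContainerRemoved" // old state of container is exited
        }
        return ""
    }
    return ""
}

func main() {
    fmt.Println(containerEvent("running", "exited"))      // ContainerDied
    fmt.Println(containerEvent("exited", "non-existent")) // ContainerRemoved
    fmt.Println(containerEvent("running", "running"))     // (no event)
}
```

This makes the ContainerDied/ContainerRemoved distinction explicit: exited is first reached (ContainerDied), then left behind when the container disappears (ContainerRemoved).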

EventedPLEG

type EventedPLEG struct {
    // The container runtime.
    runtime kubecontainer.Runtime
    // The runtime service.
    runtimeService internalapi.RuntimeService
    // The channel from which the subscriber listens events.
    eventChannel chan *PodLifecycleEvent
    // Cache for storing the runtime states required for syncing pods.
    cache kubecontainer.Cache
    // GenericPLEG is used to force relist when required.
    genericPleg podLifecycleEventGeneratorHandler
    // The maximum number of retries when getting container events from the runtime.
    eventedPlegMaxStreamRetries int
    // Indicates relisting related parameters
    relistDuration *RelistDuration
    ...
}

The biggest difference between EventedPLEG and GenericPLEG is that EventedPLEG is event-driven, whereas GenericPLEG periodically and actively lists pods; that said, EventedPLEG still performs periodic pod listing as well.

Structurally, EventedPLEG drops podRecords, since it no longer needs to compare a pod's previous and current state.

EventedPLEG keeps running the watchEventsChannel function:

func (e *EventedPLEG) watchEventsChannel() {
    containerEventsResponseCh := make(chan *runtimeapi.ContainerEventResponse, cap(e.eventChannel))
    defer close(containerEventsResponseCh)

    // Get the container events from the runtime.
    go func() {
        numAttempts := 0
        for {
            if numAttempts >= e.eventedPlegMaxStreamRetries {
                // Too many stream failures: stop the EventedPLEG and fall back
                // to the GenericPLEG with its default relist duration.
                if isEventedPLEGInUse() {
                    e.Stop()
                    e.genericPleg.Stop()
                    e.Update(e.relistDuration)
                    e.genericPleg.Start()
                    break
                }
            }
            // Stream container events from the container runtime.
            err := e.runtimeService.GetContainerEvents(context.Background(), containerEventsResponseCh, func(runtimeapi.RuntimeService_GetContainerEventsClient) {
                metrics.EventedPLEGConn.Inc()
            })
            if err != nil {
                numAttempts++
                // On failure, force a relist so no state change is missed while the stream is down.
                e.Relist() 
            }
        }
    }()

    if isEventedPLEGInUse() {
        // Process the received container events.
        e.processCRIEvents(containerEventsResponseCh)
    }
}

watchEventsChannel then calls processCRIEvents to handle the received events:

func (e *EventedPLEG) processCRIEvents(containerEventsResponseCh chan *runtimeapi.ContainerEventResponse) {
    for event := range containerEventsResponseCh {
        ...

        podID := types.UID(event.PodSandboxStatus.Metadata.Uid)
        shouldSendPLEGEvent := false

        status, err := e.runtime.GeneratePodStatus(event)
        ...

        // For a delete event, remove the pod from the cache.
        if event.ContainerEventType == runtimeapi.ContainerEventType_CONTAINER_DELETED_EVENT {
            for _, sandbox := range status.SandboxStatuses {
                if sandbox.Id == event.ContainerId {
                    e.cache.Delete(podID)
                }
            }
            shouldSendPLEGEvent = true
        } else {
            // Otherwise, send the event only if it is newer than the pod
            // status already in the cache.
            if e.cache.Set(podID, status, err, time.Unix(0, event.GetCreatedAt())) {
                shouldSendPLEGEvent = true
            }
        }

        if shouldSendPLEGEvent {
            e.processCRIEvent(event)
        }
    }
}

processCRIEvents iterates over all the events and emits one only when one of two conditions holds:

  • it is a container delete event
  • the event is newer than the cached pod status, i.e. not a stale event from the past

processCRIEvents calls processCRIEvent, which wraps the container event into a pod lifecycle event and writes it to eventChannel.

func (e *EventedPLEG) processCRIEvent(event *runtimeapi.ContainerEventResponse) {
    switch event.ContainerEventType {
    case runtimeapi.ContainerEventType_CONTAINER_STOPPED_EVENT:
      // container stopped -> pod ContainerDied
        e.sendPodLifecycleEvent(&PodLifecycleEvent{ID: types.UID(event.PodSandboxStatus.Metadata.Uid), Type: ContainerDied, Data: event.ContainerId}) 
    case runtimeapi.ContainerEventType_CONTAINER_CREATED_EVENT:
      // ignore
    case runtimeapi.ContainerEventType_CONTAINER_STARTED_EVENT:
      // container started -> pod ContainerStarted
        e.sendPodLifecycleEvent(&PodLifecycleEvent{ID: types.UID(event.PodSandboxStatus.Metadata.Uid), Type: ContainerStarted, Data: event.ContainerId})
    case runtimeapi.ContainerEventType_CONTAINER_DELETED_EVENT:
      // container deleted -> pod ContainerDied and ContainerRemoved
        e.sendPodLifecycleEvent(&PodLifecycleEvent{ID: types.UID(event.PodSandboxStatus.Metadata.Uid), Type: ContainerDied, Data: event.ContainerId})
        e.sendPodLifecycleEvent(&PodLifecycleEvent{ID: types.UID(event.PodSandboxStatus.Metadata.Uid), Type: ContainerRemoved, Data: event.ContainerId})
    }
}

How the PLEG Is Used

The PLEG has two implementations, GenericPLEG and EventedPLEG; the EventedPLEG feature gate decides which one is actually used.

In the NewMainKubelet function, the required PLEG is created:

func NewMainKubelet(kubeCfg *kubeletconfiginternal.KubeletConfiguration,...) (*Kubelet, error){
    ...
    klet := &Kubelet{
        hostname:                       hostname,
        hostnameOverridden:             hostnameOverridden,
        nodeName:                       nodeName,
        ...
    }
    ...
    if utilfeature.DefaultFeatureGate.Enabled(features.EventedPLEG) {
        ...
        klet.pleg = pleg.NewGenericPLEG(logger, klet.containerRuntime, eventChannel, genericRelistDuration, klet.podCache, clock.RealClock{})
        // Passed to the EventedPLEG so it can restore the default relist
        // duration when it has to fall back to the GenericPLEG.
        eventedRelistDuration := &pleg.RelistDuration{
            RelistPeriod:    genericPlegRelistPeriod,
            RelistThreshold: genericPlegRelistThreshold,
        }
        // With the feature gate enabled, also create the EventedPLEG.
        klet.eventedPleg, err = pleg.NewEventedPLEG(logger, klet.containerRuntime, klet.runtimeService, eventChannel,
            klet.podCache, klet.pleg, eventedPlegMaxStreamRetries, eventedRelistDuration, clock.RealClock{})
    } else {
        genericRelistDuration := &pleg.RelistDuration{
            RelistPeriod:    genericPlegRelistPeriod,
            RelistThreshold: genericPlegRelistThreshold,
        }
        // Otherwise create only the GenericPLEG.
        klet.pleg = pleg.NewGenericPLEG(logger, klet.containerRuntime, eventChannel, genericRelistDuration, klet.podCache, clock.RealClock{})
    }
    ...
}

In syncLoop, events are taken from the PLEG's event channel and handled:

func (kl *Kubelet) syncLoop(ctx context.Context, updates <-chan kubetypes.PodUpdate, handler SyncHandler) {
    ...
    plegCh := kl.pleg.Watch()
    ...
    for {
        ...
        if !kl.syncLoopIteration(ctx, updates, handler, syncTicker.C, housekeepingTicker.C, plegCh) {
            break
        }
        ...
    }
}

syncLoopIteration is a select statement; one of its cases takes events from plegCh and handles them:

func (kl *Kubelet) syncLoopIteration(ctx context.Context, configCh <-chan kubetypes.PodUpdate, handler SyncHandler,
    syncCh <-chan time.Time, housekeepingCh <-chan time.Time, plegCh <-chan *pleg.PodLifecycleEvent) bool {
    select {
    ...
    case e := <-plegCh:
        if isSyncPodWorthy(e) {
            // PLEG event for a pod; sync it.
            if pod, ok := kl.podManager.GetPodByUID(e.ID); ok {
                klog.V(2).InfoS("SyncLoop (PLEG): event for pod", "pod", klog.KObj(pod), "event", e)
                handler.HandlePodSyncs([]*v1.Pod{pod})
            } 
        }

        if e.Type == pleg.ContainerDied {
            if containerID, ok := e.Data.(string); ok {
                kl.cleanUpContainersInPod(e.ID, containerID)
            }
        }
    ...
    }
}
func isSyncPodWorthy(event *pleg.PodLifecycleEvent) bool {
    // ContainerRemoved doesn't affect pod state
    return event.Type != pleg.ContainerRemoved
}