kubelet DevicePlugin: 以nvidia-gpu为例

322 阅读2分钟

概述

DevicePlugin机制是k8s为了解决 cou/memory 以外的资源申请类型所设计的通用的设备注册和分配机制,为了保持通用,它的接口设计得比较简单,对于设备内部的复杂性就留给impl解决了。

details

nvidia-device-plugin

nvidia-device-plugin 是官方提供的device-plugin 样例,用户当然可以选择自己实现自己的device-plugin。考虑到nvidia的封闭性,nvidia-device-plugin 代码阅读起来并不友好,仅适合学习使用。

NvidiaDevicePlugin

NvidiaDevicePlugin 是管理GPU设备的主类,核心的调用都在它的method中。

type NvidiaDevicePlugin struct {
	rm                   rm.ResourceManager
	config               *spec.Config
	deviceListEnvvar     string
	deviceListStrategies spec.DeviceListStrategies
	socket               string

	cdiHandler          cdi.Interface
	cdiEnabled          bool
	cdiAnnotationPrefix string

	server *grpc.Server
	health chan *rm.Device
	stop   chan interface{}
}

Register

Register 是通过 k8s-DeviceManager 提供的注册机制,将设备的名称和device-plugin注册到k8s上,其中的 Endpoint 表明了 device-plugin server socket所在的位置,提供给k8s调用。

// Register registers the device plugin for the given resourceName with Kubelet.
func (plugin *NvidiaDevicePlugin) Register() error {
	conn, err := plugin.dial(pluginapi.KubeletSocket, 5*time.Second)
	if err != nil {
		return err
	}
	defer conn.Close()

	client := pluginapi.NewRegistrationClient(conn)
	reqt := &pluginapi.RegisterRequest{
		Version:      pluginapi.Version,
		Endpoint:     path.Base(plugin.socket),
		ResourceName: string(plugin.rm.Resource()),
		Options: &pluginapi.DevicePluginOptions{
			GetPreferredAllocationAvailable: true,
		},
	}

	_, err = client.Register(context.Background(), reqt)
	if err != nil {
		return err
	}
	return nil
}

ResourceManager

ResourceManager 是 nvidia 对自己的设备管理的接口抽象,与k8s本身的协议设计并无太大关系。

type ResourceManager interface {
	Resource() spec.ResourceName
	Devices() Devices
	GetDevicePaths([]string) []string
	GetPreferredAllocation(available, required []string, size int) ([]string, error)
	CheckHealth(stop <-chan interface{}, unhealthy chan<- *Device) error
}

health: chan-event模式

device-plugin 协议考虑到了 device 自身的复杂性,因此可以标识设备的是否健康。

func (plugin *NvidiaDevicePlugin) ListAndWatch(e *pluginapi.Empty, s pluginapi.DevicePlugin_ListAndWatchServer) error {
	if err := s.Send(&pluginapi.ListAndWatchResponse{Devices: plugin.apiDevices()}); err != nil {
		return err
	}

	for {
		select {
		case <-plugin.stop:
			return nil
		case d := <-plugin.health:
			// FIXME: there is no way to recover from the Unhealthy state.
			d.Health = pluginapi.Unhealthy
			klog.Infof("'%s' device marked unhealthy: %s", plugin.rm.Resource(), d.ID)
			if err := s.Send(&pluginapi.ListAndWatchResponse{Devices: plugin.apiDevices()}); err != nil {
				return nil
			}
		}
	}
}

kubelet(deivce-manager)

kubelet 本身拥有众多 manager,device-manager 还是其中相对简单好读的manager,它的核心思想就是通过 endpoint 接口 管理各个device的client,作为"后端存储"。

Register

func (m *ManagerImpl) Register(ctx context.Context, r *pluginapi.RegisterRequest) (*pluginapi.Empty, error) {
	go m.addEndpoint(r)
}

func (m *ManagerImpl) addEndpoint(r *pluginapi.RegisterRequest) {
	new, err := newEndpointImpl(filepath.Join(m.socketdir, r.Endpoint), r.ResourceName, m.callback)
	if err != nil {
		klog.ErrorS(err, "Failed to dial device plugin with request", "request", r)
		return
	}
	m.registerEndpoint(r.ResourceName, r.Options, new)
	go func() {
		m.runEndpoint(r.ResourceName, new)
	}()
}

DevicePluginClient

type DevicePluginClient interface {
	GetDevicePluginOptions(ctx context.Context, in *Empty, opts ...grpc.CallOption) (*DevicePluginOptions, error)
	ListAndWatch(ctx context.Context, in *Empty, opts ...grpc.CallOption) (DevicePlugin_ListAndWatchClient, error)
	GetPreferredAllocation(ctx context.Context, in *PreferredAllocationRequest, opts ...grpc.CallOption) (*PreferredAllocationResponse, error)
	Allocate(ctx context.Context, in *AllocateRequest, opts ...grpc.CallOption) (*AllocateResponse, error)
	PreStartContainer(ctx context.Context, in *PreStartContainerRequest, opts ...grpc.CallOption) (*PreStartContainerResponse, error)
}

endpoint

// endpoint maps to a single registered device plugin. It is responsible
// for managing gRPC communications with the device plugin and caching
// device states reported by the device plugin.
type endpoint interface {
	run()
	stop()
	getPreferredAllocation(available, mustInclude []string, size int) (*pluginapi.PreferredAllocationResponse, error)
	allocate(devs []string) (*pluginapi.AllocateResponse, error)
	preStartContainer(devs []string) (*pluginapi.PreStartContainerResponse, error)
	callback(resourceName string, devices []pluginapi.Device)
	isStopped() bool
	stopGracePeriodExpired() bool
}