概述
DevicePlugin 机制是 k8s 为了支持 cpu/memory 以外的资源申请类型而设计的通用设备注册和分配机制。为了保持通用,它的接口设计得比较简单,设备内部的复杂性则留给具体实现(impl)去解决。
details
nvidia-device-plugin
nvidia-device-plugin 是官方提供的 device-plugin 样例,用户当然也可以选择实现自己的 device-plugin。考虑到 nvidia 的封闭性,nvidia-device-plugin 的代码阅读起来并不友好,仅适合学习使用。
NvidiaDevicePlugin
NvidiaDevicePlugin 是管理GPU设备的主类,核心的调用都在它的method中。
// NvidiaDevicePlugin bundles the state that the plugin's methods operate on:
// the resource manager, configuration, CDI settings, the gRPC server and its
// socket, and the channels consumed by ListAndWatch.
type NvidiaDevicePlugin struct {
	// rm is the resource manager; Register advertises rm.Resource() as the
	// resource name.
	rm rm.ResourceManager
	// config is the plugin configuration.
	config *spec.Config
	// deviceListEnvvar is presumably the name of the environment variable
	// used to hand the device list to containers — TODO confirm.
	deviceListEnvvar string
	// deviceListStrategies presumably selects how the device list is passed
	// to containers — TODO confirm.
	deviceListStrategies spec.DeviceListStrategies
	// socket is the path of the plugin's socket; Register sends its base
	// name to the kubelet as Endpoint.
	socket string
	// cdiHandler, cdiEnabled and cdiAnnotationPrefix carry the CDI-related
	// settings; their use is not visible in this excerpt.
	cdiHandler cdi.Interface
	// cdiEnabled presumably toggles CDI support — TODO confirm.
	cdiEnabled bool
	// cdiAnnotationPrefix is presumably the prefix of CDI annotations —
	// TODO confirm.
	cdiAnnotationPrefix string
	// server is the plugin's gRPC server.
	server *grpc.Server
	// health delivers devices that ListAndWatch marks Unhealthy before
	// re-sending the device list.
	health chan *rm.Device
	// stop makes ListAndWatch return once a receive on it succeeds.
	stop chan interface{}
}
Register
Register 通过 k8s-DeviceManager 提供的注册机制,将资源名称(ResourceName)和 device-plugin 注册到 kubelet 上。其中的 Endpoint 是 device-plugin server socket 的文件名(path.Base),kubelet 据此找到该 socket 并调用 device-plugin。
// Register announces this plugin to the kubelet. It dials the kubelet socket,
// then sends a RegisterRequest carrying the plugin API version, the base name
// of the plugin's own socket and the resource name reported by the resource
// manager. The request also advertises that GetPreferredAllocation is
// available.
//
// Any error from dialing or from the Register RPC is returned unchanged.
func (plugin *NvidiaDevicePlugin) Register() error {
	// The 5*time.Second argument is presumably the dial timeout — confirm
	// against plugin.dial.
	kubeletConn, err := plugin.dial(pluginapi.KubeletSocket, 5*time.Second)
	if err != nil {
		return err
	}
	defer kubeletConn.Close()

	registration := pluginapi.NewRegistrationClient(kubeletConn)

	// Only the file name of the socket is sent, not its full path.
	request := &pluginapi.RegisterRequest{
		Version:      pluginapi.Version,
		Endpoint:     path.Base(plugin.socket),
		ResourceName: string(plugin.rm.Resource()),
		Options: &pluginapi.DevicePluginOptions{
			GetPreferredAllocationAvailable: true,
		},
	}

	_, err = registration.Register(context.Background(), request)
	return err
}
ResourceManager
ResourceManager 是 nvidia 对自己的设备管理的接口抽象,与k8s本身的协议设计并无太大关系。
// ResourceManager is the abstraction through which the plugin accesses the
// devices behind a resource name.
type ResourceManager interface {
	// Resource returns the resource name; Register sends it to the kubelet
	// as ResourceName.
	Resource() spec.ResourceName
	// Devices returns the Devices held by this manager.
	Devices() Devices
	// GetDevicePaths maps a list of strings to a list of strings; presumably
	// device IDs in and device node paths out — TODO confirm.
	GetDevicePaths([]string) []string
	// GetPreferredAllocation returns a list of strings chosen from available,
	// given required and size; presumably a preferred set of size device IDs
	// that includes required — TODO confirm.
	GetPreferredAllocation(available, required []string, size int) ([]string, error)
	// CheckHealth receives a stop channel it can only read from and an
	// unhealthy channel it can only send *Device values to; presumably it
	// reports failing devices until stop fires — TODO confirm.
	CheckHealth(stop <-chan interface{}, unhealthy chan<- *Device) error
}
health: chan-event模式
device-plugin 协议考虑到了 device 自身的复杂性,因此可以标识设备是否健康。nvidia-device-plugin 通过 health channel 接收不健康的设备,在 ListAndWatch 中将其标记为 Unhealthy 后重新上报设备列表。
// ListAndWatch streams the plugin's device list to the caller. It sends the
// current list once, then blocks until either the plugin is stopped or a
// device arrives on the health channel. Such a device is marked Unhealthy and
// the updated list is sent again.
//
// It returns nil once a receive on plugin.stop succeeds, and the Send error
// if the stream cannot be written to.
func (plugin *NvidiaDevicePlugin) ListAndWatch(e *pluginapi.Empty, s pluginapi.DevicePlugin_ListAndWatchServer) error {
	if err := s.Send(&pluginapi.ListAndWatchResponse{Devices: plugin.apiDevices()}); err != nil {
		return err
	}

	for {
		select {
		case <-plugin.stop:
			return nil
		case d := <-plugin.health:
			// FIXME: there is no way to recover from the Unhealthy state.
			d.Health = pluginapi.Unhealthy
			klog.Infof("'%s' device marked unhealthy: %s", plugin.rm.Resource(), d.ID)
			if err := s.Send(&pluginapi.ListAndWatchResponse{Devices: plugin.apiDevices()}); err != nil {
				// Report the failure, as the initial Send does, instead of
				// ending the stream as if it had completed normally.
				return err
			}
		}
	}
}
kubelet(device-manager)
kubelet 本身拥有众多 manager,device-manager 算是其中相对简单好读的一个。它的核心思想就是通过 endpoint 接口管理各个 device-plugin 的 client,作为"后端存储"。
Register
// Register handles a registration request from a device plugin. The endpoint
// is set up by addEndpoint in a separate goroutine, so the call returns an
// empty response without waiting for it; a failure inside addEndpoint is only
// logged there and is not reported to the caller.
//
// NOTE(review): ctx is unused and the request is not validated in this
// excerpt — confirm against the full kubelet source.
func (m *ManagerImpl) Register(ctx context.Context, r *pluginapi.RegisterRequest) (*pluginapi.Empty, error) {
	go m.addEndpoint(r)
	return &pluginapi.Empty{}, nil
}
// addEndpoint creates an endpoint for the plugin described by r, records it
// under the plugin's resource name and starts running it in a goroutine.
//
// The plugin socket is looked up as r.Endpoint inside m.socketdir. If the
// endpoint cannot be created the error is logged and nothing is registered.
func (m *ManagerImpl) addEndpoint(r *pluginapi.RegisterRequest) {
	socketPath := filepath.Join(m.socketdir, r.Endpoint)

	ep, err := newEndpointImpl(socketPath, r.ResourceName, m.callback)
	if err != nil {
		klog.ErrorS(err, "Failed to dial device plugin with request", "request", r)
		return
	}

	m.registerEndpoint(r.ResourceName, r.Options, ep)
	go m.runEndpoint(r.ResourceName, ep)
}
DevicePluginClient
// DevicePluginClient is the client-side view of the DevicePlugin gRPC API.
// Every method takes a context, a request message and optional
// grpc.CallOption values.
type DevicePluginClient interface {
	// GetDevicePluginOptions returns the plugin's DevicePluginOptions.
	GetDevicePluginOptions(ctx context.Context, in *Empty, opts ...grpc.CallOption) (*DevicePluginOptions, error)
	// ListAndWatch opens a DevicePlugin_ListAndWatchClient stream; presumably
	// the plugin reports its device list on it whenever the list changes —
	// TODO confirm against the service definition.
	ListAndWatch(ctx context.Context, in *Empty, opts ...grpc.CallOption) (DevicePlugin_ListAndWatchClient, error)
	// GetPreferredAllocation sends a PreferredAllocationRequest and returns
	// the plugin's PreferredAllocationResponse.
	GetPreferredAllocation(ctx context.Context, in *PreferredAllocationRequest, opts ...grpc.CallOption) (*PreferredAllocationResponse, error)
	// Allocate sends an AllocateRequest and returns the plugin's
	// AllocateResponse.
	Allocate(ctx context.Context, in *AllocateRequest, opts ...grpc.CallOption) (*AllocateResponse, error)
	// PreStartContainer sends a PreStartContainerRequest and returns the
	// plugin's PreStartContainerResponse; presumably invoked before a
	// container starts — TODO confirm.
	PreStartContainer(ctx context.Context, in *PreStartContainerRequest, opts ...grpc.CallOption) (*PreStartContainerResponse, error)
}
endpoint
// endpoint maps to a single registered device plugin. It is responsible
// for managing gRPC communications with the device plugin and caching
// device states reported by the device plugin.
type endpoint interface {
	// run presumably drives the communication with the plugin until the
	// endpoint stops — TODO confirm against the implementation.
	run()
	// stop presumably shuts the endpoint down — TODO confirm.
	stop()
	// getPreferredAllocation returns the plugin's
	// PreferredAllocationResponse for the given available and mustInclude
	// lists and the requested size.
	getPreferredAllocation(available, mustInclude []string, size int) (*pluginapi.PreferredAllocationResponse, error)
	// allocate returns the plugin's AllocateResponse for devs.
	allocate(devs []string) (*pluginapi.AllocateResponse, error)
	// preStartContainer returns the plugin's PreStartContainerResponse for
	// devs.
	preStartContainer(devs []string) (*pluginapi.PreStartContainerResponse, error)
	// callback receives a resource name together with a list of devices;
	// presumably invoked when the plugin reports its devices — TODO confirm.
	callback(resourceName string, devices []pluginapi.Device)
	// isStopped reports whether the endpoint is stopped.
	isStopped() bool
	// stopGracePeriodExpired presumably reports whether the grace period
	// following a stop has elapsed — TODO confirm.
	stopGracePeriodExpired() bool
}