kubernetes kube-scheduler source code: the plugins enabled by default


Plugins are invoked at the scheduling extension points to make scheduling decisions. So which scheduling plugins are enabled by default, and what does each one do?

What is the goal of scheduling Pods? To cut costs and improve efficiency, i.e., to lower the cost of resources and raise the utilization of the resources the cluster already has. Two families of algorithms derive from this goal: bin-packing and load balancing (spreading). For more, see: kubernetes调度--资源利用率算法

The list of plugins enabled by default

  • pkg/scheduler/scheduler.go

// createFromProvider creates a scheduler from the name of a registered algorithm provider.
func (c *Configurator) createFromProvider(providerName string) (*Scheduler, error) {
	klog.V(2).InfoS("Creating scheduler from algorithm provider", "algorithmProvider", providerName)
	r := algorithmprovider.NewRegistry()
	defaultPlugins, exist := r[providerName] // the default provider is the load-balancing (spreading) config
	if !exist {
		return nil, fmt.Errorf("algorithm provider %q is not registered", providerName)
	}
	// merge the default enabled plugins with the plugins configured in each profile
	for i := range c.profiles {
		prof := &c.profiles[i]
		plugins := &schedulerapi.Plugins{}
		plugins.Append(defaultPlugins)
		plugins.Apply(prof.Plugins)
		prof.Plugins = plugins
	}
	return c.create()
}
// NewRegistry returns an algorithm provider registry instance.
func NewRegistry() Registry {
	// load balancing (spreading)
	defaultConfig := getDefaultConfig()
	applyFeatureGates(defaultConfig)
        
	// bin-packing
	caConfig := getClusterAutoscalerConfig()
	applyFeatureGates(caConfig)

	return Registry{
		schedulerapi.SchedulerDefaultProviderName: defaultConfig, // load-balancing (spreading) config
		ClusterAutoscalerProvider:                 caConfig,      // bin-packing config
	}
}
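
To make the Append/Apply merge concrete, here is a toy sketch with stand-in types (not the real schedulerapi structs): a profile's Disabled entries remove defaults ("*" removes all of them), and its Enabled entries are appended afterwards.

package main

import "fmt"

// Stand-in types for illustration only; the real ones live in
// pkg/scheduler/apis/config (schedulerapi.Plugin / schedulerapi.PluginSet).
type plugin struct{ Name string }

type pluginSet struct {
	Enabled  []plugin
	Disabled []plugin
}

// merge mimics the Append-then-Apply semantics for one extension point:
// profile.Disabled removes defaults ("*" removes all of them), and
// profile.Enabled is appended afterwards.
func merge(defaults, profile pluginSet) pluginSet {
	disabled := map[string]bool{}
	for _, p := range profile.Disabled {
		disabled[p.Name] = true
	}
	var out pluginSet
	for _, p := range defaults.Enabled {
		if !disabled["*"] && !disabled[p.Name] {
			out.Enabled = append(out.Enabled, p)
		}
	}
	out.Enabled = append(out.Enabled, profile.Enabled...)
	return out
}

func main() {
	defaults := pluginSet{Enabled: []plugin{{"NodeResourcesFit"}, {"NodePorts"}}}
	profile := pluginSet{
		Disabled: []plugin{{"NodePorts"}},
		Enabled:  []plugin{{"MyCustomPlugin"}}, // hypothetical plugin name
	}
	fmt.Println(merge(defaults, profile))
	// Output: {[{NodeResourcesFit} {MyCustomPlugin}] []}
}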

The plugins enabled by default can be seen in getDefaultConfig(). For what each of these plugins does, see: Scheduling plugins

func getDefaultConfig() *schedulerapi.Plugins {
	plugins := &schedulerapi.Plugins{
		QueueSort: schedulerapi.PluginSet{
			Enabled: []schedulerapi.Plugin{
				{Name: queuesort.Name},
			},
		},
		PreFilter: schedulerapi.PluginSet{
			Enabled: []schedulerapi.Plugin{
				{Name: noderesources.FitName},
				{Name: nodeports.Name},
				{Name: podtopologyspread.Name},
				{Name: interpodaffinity.Name},
				{Name: volumebinding.Name},
				{Name: nodeaffinity.Name},
			},
		},

		// ...
            
	}
	if utilfeature.DefaultFeatureGate.Enabled(features.VolumeCapacityPriority) {
		plugins.Score.Enabled = append(plugins.Score.Enabled, schedulerapi.Plugin{Name: volumebinding.Name, Weight: 1})
	}
	return plugins
}

The bin-packing algorithm is the ClusterAutoscalerProvider entry:

func getClusterAutoscalerConfig() *schedulerapi.Plugins {
	caConfig := getDefaultConfig()
	// Replace least with most requested.
	for i := range caConfig.Score.Enabled {
		// Swap the LeastAllocated scorer at the Score extension point for MostAllocated,
		// so nodes that already use more resources score higher. The use case is cluster
		// autoscaling: pods are packed onto only as many nodes as necessary.
		if caConfig.Score.Enabled[i].Name == noderesources.LeastAllocatedName {
			caConfig.Score.Enabled[i].Name = noderesources.MostAllocatedName
		}
	}
	return caConfig
}
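
A single renamed scorer changes the whole strategy. Below is a minimal sketch of the two per-resource scoring formulas (simplified; the real scorers also apply per-resource weights and average across resources):

package main

import "fmt"

const maxNodeScore = 100

// leastAllocatedScore: the more capacity remains free after placing the
// pod, the higher the score (spreading).
func leastAllocatedScore(requested, capacity int64) int64 {
	if capacity == 0 || requested > capacity {
		return 0
	}
	return (capacity - requested) * maxNodeScore / capacity
}

// mostAllocatedScore: the fuller the node, the higher the score (bin-packing).
func mostAllocatedScore(requested, capacity int64) int64 {
	if capacity == 0 || requested > capacity {
		return 0
	}
	return requested * maxNodeScore / capacity
}

func main() {
	// A node with 4000m allocatable CPU that would have 3000m requested
	// once the pod lands:
	fmt.Println(leastAllocatedScore(3000, 4000)) // 25 -> spreading avoids this node
	fmt.Println(mostAllocatedScore(3000, 4000))  // 75 -> bin-packing prefers it
}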

The default plugins in detail

QueueSort

  • Compare the Pods' priorities.
  • When priorities are equal, compare the timestamps at which the Pods entered the scheduling queue (earlier wins).

// Less is the function used by the activeQ heap algorithm to sort pods.
// It sorts pods based on their priority. When priorities are equal, it uses
// PodQueueInfo.timestamp.
func (pl *PrioritySort) Less(pInfo1, pInfo2 *framework.QueuedPodInfo) bool {
	p1 := corev1helpers.PodPriority(pInfo1.Pod)
	p2 := corev1helpers.PodPriority(pInfo2.Pod)
	return (p1 > p2) || (p1 == p2 && pInfo1.Timestamp.Before(pInfo2.Timestamp))
}
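
PodPriority just dereferences pod.Spec.Priority and falls back to 0 when it is unset, so the ordering is easy to demo (a small sketch; the two pods are hypothetical):

package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	corev1helpers "k8s.io/component-helpers/scheduling/corev1"
)

func main() {
	prio := int32(1000)
	p1 := &v1.Pod{Spec: v1.PodSpec{Priority: &prio}}
	p2 := &v1.Pod{} // Priority is nil, treated as 0

	fmt.Println(corev1helpers.PodPriority(p1)) // 1000
	fmt.Println(corev1helpers.PodPriority(p2)) // 0
	// p1 therefore sorts ahead of p2 in activeQ, no matter how much
	// earlier p2 entered the queue.
}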

PreFilter

NodeResourcesFit

NodeResourcesFit checks whether a node has enough free resources for the Pod.

How the Pod's resource request is computed for the check:

  • Init containers run sequentially, so it suffices to satisfy the single largest request in each resource dimension.
  • Containers in the Containers field run concurrently, so their requests are summed.

As a PreFilter plugin, NodeResourcesFit computes the Pod's resource request and writes it into the CycleState. The CycleState is data shared between plugins within one scheduling cycle, stored as a map.

// CycleState provides a mechanism for plugins to store and retrieve arbitrary data.
// StateData stored by one plugin can be read, altered, or deleted by another plugin.
// CycleState does not provide any data protection, as all plugins are assumed to be
// trusted.
type CycleState struct {
	mx      sync.RWMutex
	storage map[StateKey]StateData // StateKey="PreFilterNodeResourcesFit"
	// if recordPluginMetrics is true, PluginExecutionDuration will be recorded for this cycle.
	recordPluginMetrics bool
}

// PreFilter invoked at the prefilter extension point.
func (f *Fit) PreFilter(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod) *framework.Status {
	cycleState.Write(preFilterStateKey, computePodResourceRequest(pod))
	return nil
}
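
The same key is used to read the state back later in the cycle, e.g. in this plugin's Filter stage. A condensed sketch of the read path (close to the plugin's getPreFilterState helper, with error handling trimmed):

// getPreFilterState reads back what PreFilter wrote under preFilterStateKey.
func getPreFilterState(cycleState *framework.CycleState) (*preFilterState, error) {
	c, err := cycleState.Read(preFilterStateKey)
	if err != nil {
		// The key is missing, e.g. PreFilter did not run for this pod.
		return nil, fmt.Errorf("reading %q from cycleState: %w", preFilterStateKey, err)
	}
	s, ok := c.(*preFilterState)
	if !ok {
		return nil, fmt.Errorf("%+v convert to NodeResourcesFit.preFilterState error", c)
	}
	return s, nil
}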


// computePodResourceRequest returns a framework.Resource that covers the largest
// width in each resource dimension. Because init-containers run sequentially, we collect
// the max in each dimension iteratively. In contrast, we sum the resource vectors for
// regular containers since they run simultaneously.
//
// If Pod Overhead is specified and the feature gate is set, the resources defined for Overhead
// are added to the calculated Resource request sum
//
// Example:
//
// Pod:
//   InitContainers
//     IC1:
//       CPU: 2
//       Memory: 1G
//     IC2:
//       CPU: 2
//       Memory: 3G
//   Containers
//     C1:
//       CPU: 2
//       Memory: 1G
//     C2:
//       CPU: 1
//       Memory: 1G
//
// Result: CPU: 3, Memory: 3G
func computePodResourceRequest(pod *v1.Pod) *preFilterState {
	result := &preFilterState{}
	for _, container := range pod.Spec.Containers {
		result.Add(container.Resources.Requests)
	}

	// take max_resource(sum_pod, any_init_container)
	for _, container := range pod.Spec.InitContainers {
		result.SetMaxResource(container.Resources.Requests)
	}

	// If Overhead is being utilized, add to the total requests for the pod
	if pod.Spec.Overhead != nil && utilfeature.DefaultFeatureGate.Enabled(features.PodOverhead) {
		result.Add(pod.Spec.Overhead)
	}

	return result
}
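
With this precomputed request in hand, Filter only has to compare it against what is still free on the node. A simplified sketch of that comparison (the real check also covers the allowed pod count, ephemeral storage, and scalar/extended resources):

// fitsCPUAndMemory is a trimmed-down version of the check Filter performs:
// the pod fits if its request does not exceed the node's allocatable
// resources minus what is already requested by pods on that node.
func fitsCPUAndMemory(podRequest *preFilterState, nodeInfo *framework.NodeInfo) bool {
	if podRequest.MilliCPU > nodeInfo.Allocatable.MilliCPU-nodeInfo.Requested.MilliCPU {
		return false
	}
	if podRequest.Memory > nodeInfo.Allocatable.Memory-nodeInfo.Requested.Memory {
		return false
	}
	return true
}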