kubernetes kube-scheduler 源码：预选和优选可用节点筛选调度器主循环会调用ScheduleAl

预选和优选

调度器主循环会调用ScheduleAlgorithm.Schedule选择一个可用节点，ScheduleAlgorithm的默认实现genericScheduler会去调用评分插件来选择最高分且最合适的节点。

// ScheduleAlgorithm is an interface implemented by things that know how to schedule pods
// onto machines.
type ScheduleAlgorithm interface {
	Schedule(context.Context, framework.Framework, *framework.CycleState, *v1.Pod) (scheduleResult ScheduleResult, err error)
	// Extenders returns a slice of extender config. This is exposed for
	// testing.
	Extenders() []framework.Extender
}

// 实现ScheduleAlgorithm
type genericScheduler struct {
	cache                    internalcache.Cache
	extenders                []framework.Extender
	nodeInfoSnapshot         *internalcache.Snapshot
	percentageOfNodesToScore int32
	nextStartNodeIndex       int
}

ScheduleAlgorithm.Schedule方法包含预选和优选(Filter & Score)逻辑筛选合适的节点列表，最后从节点列表中使用 水库抽样算法 选择一个最佳节点。

// Schedule tries to schedule the given pod to one of the nodes in the node list.
// If it succeeds, it will return the name of the node.
// If it fails, it will return a FitError error with reasons.
func (g *genericScheduler) Schedule(ctx context.Context, fwk framework.Framework, state *framework.CycleState, pod *v1.Pod) (result ScheduleResult, err error) {
	trace := utiltrace.New("Scheduling", utiltrace.Field{Key: "namespace", Value: pod.Namespace}, utiltrace.Field{Key: "name", Value: pod.Name})
	defer trace.LogIfLong(100 * time.Millisecond)

	if err := g.snapshot(); err != nil {
		return result, err
	}
	trace.Step("Snapshotting scheduler cache and node infos done")

	if g.nodeInfoSnapshot.NumNodes() == 0 {
		return result, ErrNoNodesAvailable
	}
        // Filters the nodes to find the ones that fit the pod based on the framework
	feasibleNodes, diagnosis, err := g.findNodesThatFitPod(ctx, fwk, state, pod)
	if err != nil {
		return result, err
	}
	trace.Step("Computing predicates done")
        // 可用节点为0, 返回错误
	if len(feasibleNodes) == 0 {
		return result, &framework.FitError{
			Pod:         pod,
			NumAllNodes: g.nodeInfoSnapshot.NumNodes(),
			Diagnosis:   diagnosis,
		}
	}
        // 可用节点为1, 不用考虑，选择此节点
	// When only one node after predicate, just use it.
	if len(feasibleNodes) == 1 {
		return ScheduleResult{
			SuggestedHost:  feasibleNodes[0].Name,
			EvaluatedNodes: 1 + len(diagnosis.NodeToStatusMap),
			FeasibleNodes:  1,
		}, nil
	}
        // 可用节点有多个, 进行优选环节
        // prioritizeNodes prioritizes the nodes by running the score plugins,
        // which return a score for each node from the call to RunScorePlugins().
	priorityList, err := g.prioritizeNodes(ctx, fwk, state, pod, feasibleNodes)
	if err != nil {
		return result, err
	}
        // 水库算法选择其中一个
	host, err := g.selectHost(priorityList)
	trace.Step("Prioritizing done")

	return ScheduleResult{
		SuggestedHost:  host,
		EvaluatedNodes: len(feasibleNodes) + len(diagnosis.NodeToStatusMap),
		FeasibleNodes:  len(feasibleNodes),
	}, err
}

feasibleNodes, diagnosis, err := g.findNodesThatFitPod(ctx, fwk, state, pod)

这个调用是进行预选

// Filters the nodes to find the ones that fit the pod based on the framework
// filter plugins and filter extenders.
func (g *genericScheduler) findNodesThatFitPod(ctx context.Context, fwk framework.Framework, state *framework.CycleState, pod *v1.Pod) ([]*v1.Node, framework.Diagnosis, error) {
	diagnosis := framework.Diagnosis{
		NodeToStatusMap:      make(framework.NodeToStatusMap),
		UnschedulablePlugins: sets.NewString(),
	}

        // 运行Prefilter,Filter etc.
	// Run "prefilter" plugins.
	s := fwk.RunPreFilterPlugins(ctx, state, pod)
	allNodes, err := g.nodeInfoSnapshot.NodeInfos().List()
        
        //  ... 

        //  Run "filter" plugins.
	feasibleNodes, err := g.findNodesThatPassFilters(ctx, fwk, state, pod, diagnosis, allNodes)
	if err != nil {
		return nil, diagnosis, err
	}

        // ...
	return feasibleNodes, diagnosis, nil
}

priorityList, err := g.prioritizeNodes(ctx, fwk, state, pod, feasibleNodes)

这一步是进行优选，对每个节点进行打分

// prioritizeNodes prioritizes the nodes by running the score plugins,
// which return a score for each node from the call to RunScorePlugins().
// The scores from each plugin are added together to make the score for that node, then
// any extenders are run as well.
// All scores are finally combined (added) to get the total weighted scores of all nodes
func (g *genericScheduler) prioritizeNodes(
	ctx context.Context,
	fwk framework.Framework,
	state *framework.CycleState,
	pod *v1.Pod,
	nodes []*v1.Node,
) (framework.NodeScoreList, error) {
	
	// Run PreScore plugins.
	preScoreStatus := fwk.RunPreScorePlugins(ctx, state, pod, nodes)
	if !preScoreStatus.IsSuccess() {
		return nil, preScoreStatus.AsError()
	}

	// Run the Score plugins.
        // scoresMap map[string]NodeScoreList存储着每个插件对每个节点的打分，key=pluginName
	scoresMap, scoreStatus := fwk.RunScorePlugins(ctx, state, pod, nodes)
	if !scoreStatus.IsSuccess() {
		return nil, scoreStatus.AsError()
	}

	// ...

	// Summarize all scores.
        // 前面是按照插件维度展开，这次以节点维度展开，即计算每个节点获得的每个插件打分的总和
	result := make(framework.NodeScoreList, 0, len(nodes))

	for i := range nodes {
		result = append(result, framework.NodeScore{Name: nodes[i].Name, Score: 0})
		for j := range scoresMap {
			result[i].Score += scoresMap[j][i].Score
		}
	}

        // ...


	return result, nil
}

scoresMap, scoreStatus := fwk.RunScorePlugins(ctx, state, pod, nodes)

其中scoresMap是PluginToNodeScores类型，以插件维度展开对每个节点的评分。

// PluginToNodeScores declares a map from plugin name to its NodeScoreList.
type PluginToNodeScores map[string]NodeScoreList


// NodeScoreList declares a list of nodes and their scores.
type NodeScoreList []NodeScore

// NodeScore is a struct with node name and score.
type NodeScore struct {
	Name  string  // 节点名
	Score int64   // 分数
}

评分都是最高且节点有多个中如何选择?

使用水库抽样算法，肯定是选分数最高的，那么如果分数一般高的，则1/cntOfMaxScore替换掉当前最高分节点。

// selectHost takes a prioritized list of nodes and then picks one
// in a reservoir sampling manner from the nodes that had the highest score.
func (g *genericScheduler) selectHost(nodeScoreList framework.NodeScoreList) (string, error) {
	if len(nodeScoreList) == 0 {
		return "", fmt.Errorf("empty priorityList")
	}
	maxScore := nodeScoreList[0].Score
	selected := nodeScoreList[0].Name
	cntOfMaxScore := 1
	for _, ns := range nodeScoreList[1:] {
                // 选择分数最高的
		if ns.Score > maxScore {
			maxScore = ns.Score
			selected = ns.Name
			cntOfMaxScore = 1
		} else if ns.Score == maxScore {
                        // 如果分数一样高，则 1/cntOfMaxScore几率替换掉当前节点
                        // cntOfMaxScore 代表最高分且分数相等的节点个数
			cntOfMaxScore++
			if rand.Intn(cntOfMaxScore) == 0 {
				// Replace the candidate with probability of 1/cntOfMaxScore
				selected = ns.Name
			}
		}
	}
	return selected, nil
}