• 【博客500】k8s调度器如何为pod计算最终得分


    k8s调度器如何为pod计算最终得分

    调度器汇聚node最终得分源码

    // k8s.io/kubernetes/pkg/scheduler/core/generic_scheduler.go
    //
    // prioritizeNodes is the scoring phase. It returns one NodeScore per entry of
    // nodes, in the same order as nodes.
    //
    // The total for a node is the sum of the weighted scores produced by the score
    // plugins (prof.RunScorePlugins) plus, when extenders are configured, the
    // weighted extender scores rescaled to the scheduler's score range.
    //
    // A non-success status from the score plugins aborts scoring and is returned
    // as an error. Errors from individual extenders are ignored so that the
    // plugins and the remaining extenders still decide the ranking.
    func (g *genericScheduler) prioritizeNodes(
    	ctx context.Context,
    	prof *profile.Profile,
    	state *framework.CycleState,
    	pod *v1.Pod,
    	nodes []*v1.Node,
    ) (framework.NodeScoreList, error) {
    	// With neither extenders nor built-in score plugins there is nothing to
    	// rank by, so every node gets the same score of 1.
    	if len(g.extenders) == 0 && !prof.HasScorePlugins() {
    		result := make(framework.NodeScoreList, 0, len(nodes))
    		for i := range nodes {
    			result = append(result, framework.NodeScore{
    				Name:  nodes[i].Name,
    				Score: 1,
    			})
    		}
    		return result, nil
    	}

    	// Every score plugin produces one score per node.
    	scoresMap, scoreStatus := prof.RunScorePlugins(ctx, state, pod, nodes)
    	if !scoreStatus.IsSuccess() {
    		return nil, scoreStatus.AsError()
    	}

    	// Sum the per-plugin scores of each node.
    	result := make(framework.NodeScoreList, 0, len(nodes))
    	for i := range nodes {
    		result = append(result, framework.NodeScore{Name: nodes[i].Name, Score: 0})
    		for j := range scoresMap {
    			result[i].Score += scoresMap[j][i].Score
    		}
    	}

    	if len(g.extenders) != 0 && nodes != nil {
    		// Each extender runs in its own goroutine and reports its weighted
    		// per-host scores on this channel. The buffer holds one message per
    		// extender, so no goroutine ever blocks on the send.
    		extenderScores := make(chan map[string]int64, len(g.extenders))
    		for i := range g.extenders {
    			go func(extIndex int) {
    				prioritizedList, weight, err := g.extenders[extIndex].Prioritize(pod, nodes)
    				if err != nil || prioritizedList == nil {
    					// Prioritization errors from an extender are ignored;
    					// a nil map contributes nothing when merged below.
    					extenderScores <- nil
    					return
    				}
    				weighted := make(map[string]int64, len(*prioritizedList))
    				for i := range *prioritizedList {
    					host, score := (*prioritizedList)[i].Host, (*prioritizedList)[i].Score
    					// Each extender's score counts in proportion to its weight.
    					weighted[host] += score * weight
    				}
    				extenderScores <- weighted
    			}(i)
    		}

    		// Receive exactly one message per extender. This both waits for all
    		// goroutines to finish and keeps every write to combinedScores on
    		// this goroutine, so the map needs no lock.
    		combinedScores := make(map[string]int64, len(nodes))
    		for range g.extenders {
    			for host, score := range <-extenderScores {
    				combinedScores[host] += score
    			}
    		}

    		for i := range result {
    			// MaxExtenderPriority may diverge from the max priority used in the scheduler and defined by MaxNodeScore,
    			// therefore we need to scale the score returned by extenders to the score range used by the scheduler
    			// (per the original note: plugin scores top out at 100, extender scores at 10, hence a factor of 10).
    			result[i].Score += combinedScores[result[i].Name] * (framework.MaxNodeScore / extenderv1.MaxExtenderPriority)
    		}
    	}
    	return result, nil
    }
    
    // k8s.io/kubernetes/pkg/scheduler/framework/v1alpha1/framework.go
    //
    // RunScorePlugins runs every score plugin for every node and returns, keyed by
    // plugin name, a NodeScoreList whose i-th entry belongs to nodes[i].
    //
    // It works in three parallel phases:
    //  1. Score: each plugin scores each node.
    //  2. NormalizeScore: each plugin's score extension post-processes its list.
    //  3. Weighting: each score is range-checked and multiplied by the plugin's weight.
    //
    // If any phase reports a non-success status, or a plugin leaves a score
    // outside [MinNodeScore, MaxNodeScore], the first such status is returned
    // and the score map is nil.
    func (f *framework) RunScorePlugins(ctx context.Context, state *CycleState, pod *v1.Pod, nodes []*v1.Node) (ps PluginToNodeScores, status *Status) {
    	pluginToNodeScores := make(PluginToNodeScores, len(f.scorePlugins))
    	for _, pl := range f.scorePlugins {
    		pluginToNodeScores[pl.Name()] = make(NodeScoreList, len(nodes))
    	}

    	// Workers run concurrently, so failures are funnelled through a
    	// one-slot channel: the first failure is kept, later ones are dropped.
    	// Cancelling the context lets the remaining work stop early.
    	ctx, cancel := context.WithCancel(ctx)
    	defer cancel()
    	failures := make(chan *Status, 1)
    	fail := func(st *Status) {
    		select {
    		case failures <- st:
    		default:
    		}
    		cancel()
    	}
    	firstFailure := func() *Status {
    		select {
    		case st := <-failures:
    			return st
    		default:
    			return nil
    		}
    	}

    	// Run Score method for each node in parallel.
    	// Each worker writes only to slot [index] of each plugin's list, and the
    	// map itself is only read here, so no locking is required.
    	workqueue.ParallelizeUntil(ctx, 16, len(nodes), func(index int) {
    		nodeName := nodes[index].Name
    		for _, pl := range f.scorePlugins {
    			s, st := f.runScorePlugin(ctx, pl, state, pod, nodeName)
    			if !st.IsSuccess() {
    				fail(st)
    				return
    			}
    			pluginToNodeScores[pl.Name()][index] = NodeScore{
    				Name:  nodeName,
    				Score: int64(s),
    			}
    		}
    	})
    	if st := firstFailure(); st != nil {
    		return nil, st
    	}

    	// Run NormalizeScore method for each ScorePlugin in parallel.
    	workqueue.ParallelizeUntil(ctx, 16, len(f.scorePlugins), func(index int) {
    		pl := f.scorePlugins[index]
    		nodeScoreList := pluginToNodeScores[pl.Name()]
    		if st := f.runScoreExtension(ctx, pl, state, pod, nodeScoreList); !st.IsSuccess() {
    			fail(st)
    		}
    	})
    	if st := firstFailure(); st != nil {
    		return nil, st
    	}

    	// Apply score defaultWeights for each ScorePlugin in parallel.
    	workqueue.ParallelizeUntil(ctx, 16, len(f.scorePlugins), func(index int) {
    		pl := f.scorePlugins[index]
    		weight := f.pluginNameToWeightMap[pl.Name()]
    		nodeScoreList := pluginToNodeScores[pl.Name()]
    		for i, nodeScore := range nodeScoreList {
    			// return error if score plugin returns invalid score.
    			if nodeScore.Score > int64(MaxNodeScore) || nodeScore.Score < int64(MinNodeScore) {
    				fail(NewStatus(Error, fmt.Sprintf(
    					"score plugin %q returns an invalid score %v, it should be in the range of [%v, %v] after normalizing",
    					pl.Name(), nodeScore.Score, MinNodeScore, MaxNodeScore)))
    				return
    			}
    			nodeScoreList[i].Score = nodeScore.Score * int64(weight)
    		}
    	})
    	if st := firstFailure(); st != nil {
    		return nil, st
    	}

    	return pluginToNodeScores, nil
    }
    
    每个node 计算出一个score之后,哪个node 的score 最高,pod 就调度到哪个node 上。
    // k8s.io/kubernetes/pkg/scheduler/core/generic_scheduler.go
    // Schedule (excerpt) scores the filtered nodes and picks the host for pod.
    // The "..." line stands for upstream code that this excerpt omits, including
    // whatever produces filteredNodes and filteredNodesStatuses.
    //
    // A scoring failure is returned immediately instead of being overwritten by
    // the result of selectHost.
    func (g *genericScheduler) Schedule(ctx context.Context, prof *profile.Profile, state *framework.CycleState, pod *v1.Pod) (result ScheduleResult, err error) {
    	...
    	priorityList, err := g.prioritizeNodes(ctx, prof, state, pod, filteredNodes)
    	if err != nil {
    		return result, err
    	}
    	host, err := g.selectHost(priorityList)
    	return ScheduleResult{
    		SuggestedHost:  host,
    		EvaluatedNodes: len(filteredNodes) + len(filteredNodesStatuses),
    		FeasibleNodes:  len(filteredNodes),
    	}, err
    }
    
    // selectHost returns the name of the highest-scoring node. When several nodes
    // share the top score, one of them is chosen at random.
    // It returns an error when nodeScoreList is empty.
    func (g *genericScheduler) selectHost(nodeScoreList framework.NodeScoreList) (string, error) {
    	if len(nodeScoreList) == 0 {
    		return "", fmt.Errorf("empty priorityList")
    	}

    	winner := 0 // index of the current pick
    	ties := 1   // how many nodes seen so far hold the current top score
    	for idx := 1; idx < len(nodeScoreList); idx++ {
    		switch top := nodeScoreList[winner].Score; {
    		case nodeScoreList[idx].Score > top:
    			// Strictly better: it becomes the only candidate.
    			winner, ties = idx, 1
    		case nodeScoreList[idx].Score == top:
    			// Random replacement among equals, in the manner of random
    			// sampling: the k-th tied node replaces the current pick
    			// with probability 1/k.
    			ties++
    			if rand.Intn(ties) == 0 {
    				winner = idx
    			}
    		}
    	}
    	return nodeScoreList[winner].Name, nil
    }
    

    各个score 插件的默认权重

    每个ScorePlugin 计算的score 限定在[0,100](即 MinNodeScore=0、MaxNodeScore=100),可以看到大部分插件默认权重为1,个别插件如nodepreferavoidpods 权重为10000(加权后的得分远大于其他插件得分之和,基本就是一票否决了)。

    // k8s.io/kubernetes/pkg/scheduler/algorithmprovider/registry.go
    //
    // getDefaultConfig (excerpt) returns the default plugin configuration. Only
    // the Score plugin set is shown; the "..." line marks parts omitted here.
    func getDefaultConfig() *schedulerapi.Plugins {
    	return &schedulerapi.Plugins{
    	    ...
    		Score: &schedulerapi.PluginSet{
    			// Every score plugin is enabled with weight 1, except
    			// nodepreferavoidpods, whose weight of 10000 is far larger
    			// than all the other weights combined.
    			Enabled: []schedulerapi.Plugin{
    				{Name: noderesources.BalancedAllocationName, Weight: 1},
    				{Name: imagelocality.Name, Weight: 1},
    				{Name: interpodaffinity.Name, Weight: 1},
    				{Name: noderesources.LeastAllocatedName, Weight: 1},
    				{Name: nodeaffinity.Name, Weight: 1},
    				{Name: nodepreferavoidpods.Name, Weight: 10000},
    				{Name: defaultpodtopologyspread.Name, Weight: 1},
    				{Name: tainttoleration.Name, Weight: 1},
    			},
    		},
    	}
    }
    

    如何查看调度器在调度一个pod的时候,为每个node打的分

    scheduler 在score结束会记录日志:

    // k8s.io/kubernetes/pkg/scheduler/core/generic_scheduler.go
    //
    // prioritizeNodes (excerpt): only the logging of the final scores is shown.
    // The "..." line marks the omitted scoring logic; the return statement is
    // omitted from this excerpt as well.
    func (g *genericScheduler) prioritizeNodes(
    	ctx context.Context,
    	prof *profile.Profile,
    	state *framework.CycleState,
    	pod *v1.Pod,
    	nodes []*v1.Node,
    ) (framework.NodeScoreList, error) {
        ...
        // Guarded by klog.V(10): one "Host <name> => Score <n>" line is
        // written per entry of result.
        if klog.V(10) {
    		for i := range result {
    			klog.Infof("Host %s => Score %d", result[i].Name, result[i].Score)
    		}
    	}
    }
    

    修改调度器的启动参数: --v=10

        spec:
          containers:
          - args:
            - /bin/kube-scheduler
            - --address=0.0.0.0
            - --leader-elect=false
            - --config=/etc/kubernetes/scheduler-config.yaml
            - --scheduler-name=scheduler-plugins-scheduler
            - --v=10
    

    调高日志打印等级即可看到:

    scheduler for nginx-deployment-77dfb65897-fh45h in node 10.231.2.20
    scheduler for nginx-deployment-77dfb65897-fh45h in node 10.231.2.46
    scheduler for nginx-deployment-77dfb65897-fh45h in node 10.231.2.21
    scheduler for nginx-deployment-77dfb65897-6vn85 in node 10.231.2.21
    scheduler for nginx-deployment-77dfb65897-6vn85 in node 10.231.2.46
    scheduler for nginx-deployment-77dfb65897-6vn85 in node 10.231.2.20
    I0920 09:03:16.856661       1 resource_allocation.go:78] nginx-deployment-77dfb65897-6vn85 -> 10.231.2.21: NodeResourcesBalancedAllocation, map of allocatable resources map[cpu:4000 memory:8095989760], map of requested resources map[cpu:500 memory:943718400] ,score 99,
    I0920 09:03:16.856697       1 resource_allocation.go:78] nginx-deployment-77dfb65897-6vn85 -> 10.231.2.21: NodeResourcesLeastAllocated, map of allocatable resources map[cpu:4000 memory:8095989760], map of requested resources map[cpu:500 memory:943718400] ,score 87,
    I0920 09:03:16.856693       1 resource_allocation.go:78] nginx-deployment-77dfb65897-6vn85 -> 10.231.2.46: NodeResourcesBalancedAllocation, map of allocatable resources map[cpu:4000 memory:8095997952], map of requested resources map[cpu:550 memory:1153433600] ,score 99,
    I0920 09:03:16.856714       1 resource_allocation.go:78] nginx-deployment-77dfb65897-6vn85 -> 10.231.2.46: NodeResourcesLeastAllocated, map of allocatable resources map[cpu:4000 memory:8095997952], map of requested resources map[cpu:550 memory:1153433600] ,score 85,
    I0920 09:03:16.856714       1 resource_allocation.go:78] nginx-deployment-77dfb65897-6vn85 -> 10.231.2.20: NodeResourcesBalancedAllocation, map of allocatable resources map[cpu:4000 memory:8095997952], map of requested resources map[cpu:600 memory:1153433600] ,score 99,
    I0920 09:03:16.856726       1 resource_allocation.go:78] nginx-deployment-77dfb65897-6vn85 -> 10.231.2.20: NodeResourcesLeastAllocated, map of allocatable resources map[cpu:4000 memory:8095997952], map of requested resources map[cpu:600 memory:1153433600] ,score 85,
    I0920 09:03:16.856801       1 generic_scheduler.go:504] Plugin NodeResourcesLeastAllocated scores on default/nginx-deployment-77dfb65897-6vn85 => [{10.231.2.21 87} {10.231.2.46 85} {10.231.2.20 85}]
    I0920 09:03:16.856813       1 generic_scheduler.go:504] Plugin NodeAffinity scores on default/nginx-deployment-77dfb65897-6vn85 => [{10.231.2.21 0} {10.231.2.46 0} {10.231.2.20 0}]
    I0920 09:03:16.856820       1 generic_scheduler.go:504] Plugin NodePreferAvoidPods scores on default/nginx-deployment-77dfb65897-6vn85 => [{10.231.2.21 1000000} {10.231.2.46 1000000} {10.231.2.20 1000000}]
    I0920 09:03:16.856827       1 generic_scheduler.go:504] Plugin PodTopologySpread scores on default/nginx-deployment-77dfb65897-6vn85 => [{10.231.2.21 0} {10.231.2.46 0} {10.231.2.20 0}]
    I0920 09:03:16.856835       1 generic_scheduler.go:504] Plugin TaintToleration scores on default/nginx-deployment-77dfb65897-6vn85 => [{10.231.2.21 100} {10.231.2.46 100} {10.231.2.20 100}]
    I0920 09:03:16.856846       1 generic_scheduler.go:504] Plugin NodeResourcesBalancedAllocation scores on default/nginx-deployment-77dfb65897-6vn85 => [{10.231.2.21 99} {10.231.2.46 99} {10.231.2.20 99}]
    I0920 09:03:16.856852       1 generic_scheduler.go:504] Plugin ImageLocality scores on default/nginx-deployment-77dfb65897-6vn85 => [{10.231.2.21 2} {10.231.2.46 0} {10.231.2.20 2}]
    I0920 09:03:16.856862       1 generic_scheduler.go:504] Plugin InterPodAffinity scores on default/nginx-deployment-77dfb65897-6vn85 => [{10.231.2.21 0} {10.231.2.46 0} {10.231.2.20 0}]
    I0920 09:03:16.856871       1 generic_scheduler.go:560] Host 10.231.2.21 => Score 1000288
    I0920 09:03:16.856879       1 generic_scheduler.go:560] Host 10.231.2.46 => Score 1000284
    I0920 09:03:16.856883       1 generic_scheduler.go:560] Host 10.231.2.20 => Score 1000286
    I0920 09:03:16.856938       1 scheduling_queue.go:833] About to try and schedule pod default/nginx-deployment-77dfb65897-vqkd5
    I0920 09:03:16.856949       1 scheduler.go:459] Attempting to schedule pod: default/nginx-deployment-77dfb65897-vqkd5
    

    如何设置插件的权重

    apiVersion: kubescheduler.config.k8s.io/v1
    kind: KubeSchedulerConfiguration
    profiles:
      - plugins:
          score:
            disabled:
            - name: PodTopologySpread
            enabled:
            - name: MyCustomPluginA
              weight: 2
            - name: MyCustomPluginB
              weight: 1
    

    调度器每个打分插件的打分范围

    每个插件的打分范围:

    // MaxNodeScore is the maximum score a Score plugin is expected to return.
    MaxNodeScore int64 = 100
    
    // MinNodeScore is the minimum score a Score plugin is expected to return.
    MinNodeScore int64 = 0
    

    最终得分的范围:

    // MaxTotalScore is the maximum total score.
    MaxTotalScore int64 = math.MaxInt64
    

    NormalizeScore阶段的作用:

    执行所有插件的 normalize scoring;每个插件对所有节点 score 进行 reduce,最终将分数限制在[MinNodeScore, MaxNodeScore]有效范围

    自己开发打分插件要注意的

    1、记得插件打分最终得分限制在:0-100
    2、如果插件打分不在0-100范围内,要实现自己的NormalizeScore阶段的插件来归一化得分

    总结

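    node的最终得分:每个插件归一化后的打分 * 插件的权重 之和;如果配置了 extender,还要再加上各 extender 打分 * extender 权重之和,并乘以 (MaxNodeScore / MaxExtenderPriority) 换算到调度器的分值范围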

    plugin-1-score * plugin-1-weight + plugin-2-score * plugin-2-weight + …

  • 相关阅读:
    删数字问题 贪心算法 1472
    ECharts多个数据视图进行自适应大小的解决方案
    【网络编程】第一章 网络基础(协议+OSI+TCPIP+网络传输的流程+IP地址+MAC地址)
    计算机毕业设计ssm+vue基本微信小程序的个人健康管理系统
    信息系统项目管理师---第十一章项目风险管理
    FPGA零基础学习:数字电路中的时序逻辑
    WinForm应用实战开发指南 - 教你如何实现表头的全选操作?
    1542_AURIX_TC275_CPU子系统_内核
    .Net Core中无处不在的Async/Await是如何提升性能的?
    关于DevExpress的设置记录
  • 原文地址:https://blog.csdn.net/qq_43684922/article/details/126961067