// k8s.io/kubernetes/pkg/scheduler/core/generic_scheduler.go
// prioritizeNodes is the scoring phase: it returns one NodeScore per entry in
// nodes, in the same order as nodes.
//
// A node's total is the sum of the per-plugin scores returned by
// prof.RunScorePlugins plus, when extenders are configured, the
// weight-multiplied extender scores rescaled to the scheduler's score range.
// A non-nil error is returned when the score plugins report a failure.
func (g *genericScheduler) prioritizeNodes(
	ctx context.Context,
	prof *profile.Profile,
	state *framework.CycleState,
	pod *v1.Pod,
	nodes []*v1.Node,
) (framework.NodeScoreList, error) {
	// With neither extenders nor score plugins configured, every node gets
	// the same score of 1.
	if len(g.extenders) == 0 && !prof.HasScorePlugins() {
		result := make(framework.NodeScoreList, 0, len(nodes))
		for i := range nodes {
			result = append(result, framework.NodeScore{
				Name:  nodes[i].Name,
				Score: 1,
			})
		}
		return result, nil
	}

	// Every score plugin produces one score per node.
	scoresMap, scoreStatus := prof.RunScorePlugins(ctx, state, pod, nodes)
	if !scoreStatus.IsSuccess() {
		return nil, scoreStatus.AsError()
	}

	// Sum the plugin scores for each node.
	result := make(framework.NodeScoreList, 0, len(nodes))
	for i := range nodes {
		result = append(result, framework.NodeScore{Name: nodes[i].Name, Score: 0})
		for j := range scoresMap {
			result[i].Score += scoresMap[j][i].Score
		}
	}

	if len(g.extenders) != 0 && nodes != nil {
		// Each extender runs in its own goroutine and sends exactly one
		// message (nil on failure) on partials. The channel is buffered for
		// every extender, so no goroutine can block or leak, and
		// combinedScores is only ever touched by this goroutine.
		partials := make(chan map[string]int64, len(g.extenders))
		for i := range g.extenders {
			go func(extIndex int) {
				prioritizedList, weight, err := g.extenders[extIndex].Prioritize(pod, nodes)
				if err != nil || prioritizedList == nil {
					// A failing extender contributes nothing; the plugins
					// and the remaining extenders still decide the result.
					partials <- nil
					return
				}
				partial := make(map[string]int64, len(*prioritizedList))
				for _, hostPriority := range *prioritizedList {
					// Accumulate this extender's score, scaled by its weight.
					partial[hostPriority.Host] += hostPriority.Score * weight
				}
				partials <- partial
			}(i)
		}

		// Wait for every extender before reading any extender score.
		combinedScores := make(map[string]int64, len(nodes))
		for range g.extenders {
			for host, score := range <-partials {
				combinedScores[host] += score
			}
		}

		for i := range result {
			// MaxExtenderPriority may diverge from the max priority used in the scheduler and defined by MaxNodeScore,
			// therefore we need to scale the score returned by extenders to the score range used by the scheduler.
			result[i].Score += combinedScores[result[i].Name] * (framework.MaxNodeScore / extenderv1.MaxExtenderPriority)
		}
	}
	return result, nil
}
// k8s.io/kubernetes/pkg/scheduler/framework/v1alpha1/framework.go
// RunScorePlugins runs every configured score plugin for every node and
// returns, per plugin name, the list of node scores after normalization and
// weighting.
//
// It works in three parallel stages: Score per node, the score extension
// (normalization) per plugin, then weight application per plugin. The first
// non-success status reported in any stage cancels the remaining work and is
// returned together with a nil score map. On success the status is nil.
func (f *framework) RunScorePlugins(ctx context.Context, state *CycleState, pod *v1.Pod, nodes []*v1.Node) (ps PluginToNodeScores, status *Status) {
	pluginToNodeScores := make(PluginToNodeScores, len(f.scorePlugins))
	for _, pl := range f.scorePlugins {
		pluginToNodeScores[pl.Name()] = make(NodeScoreList, len(nodes))
	}

	ctx, cancel := context.WithCancel(ctx)
	defer cancel()

	// failed keeps only the first failure; later ones are dropped by the
	// non-blocking send so workers never stall on reporting.
	failed := make(chan *Status, 1)
	fail := func(st *Status) {
		select {
		case failed <- st:
			cancel()
		default:
		}
	}
	firstFailure := func() *Status {
		select {
		case st := <-failed:
			return st
		default:
			return nil
		}
	}

	// Run Score method for each node in parallel.
	workqueue.ParallelizeUntil(ctx, 16, len(nodes), func(index int) {
		nodeName := nodes[index].Name
		for _, pl := range f.scorePlugins {
			s, st := f.runScorePlugin(ctx, pl, state, pod, nodeName)
			if !st.IsSuccess() {
				fail(st)
				return
			}
			// Each worker writes a distinct index, so the slices need no lock.
			pluginToNodeScores[pl.Name()][index] = NodeScore{
				Name:  nodeName,
				Score: int64(s),
			}
		}
	})
	if st := firstFailure(); st != nil {
		return nil, st
	}

	// Run NormalizeScore method for each ScorePlugin in parallel.
	workqueue.ParallelizeUntil(ctx, 16, len(f.scorePlugins), func(index int) {
		pl := f.scorePlugins[index]
		nodeScoreList := pluginToNodeScores[pl.Name()]
		if st := f.runScoreExtension(ctx, pl, state, pod, nodeScoreList); !st.IsSuccess() {
			fail(st)
		}
	})
	if st := firstFailure(); st != nil {
		return nil, st
	}

	// Apply score defaultWeights for each ScorePlugin in parallel.
	workqueue.ParallelizeUntil(ctx, 16, len(f.scorePlugins), func(index int) {
		pl := f.scorePlugins[index]
		weight := f.pluginNameToWeightMap[pl.Name()]
		nodeScoreList := pluginToNodeScores[pl.Name()]
		for i, nodeScore := range nodeScoreList {
			// Report an error if the plugin's score is still out of range
			// after normalization, instead of returning partial results.
			if nodeScore.Score > int64(MaxNodeScore) || nodeScore.Score < int64(MinNodeScore) {
				fail(NewStatus(Error, "score plugin "+pl.Name()+" returned a score outside the [MinNodeScore, MaxNodeScore] range after normalization"))
				return
			}
			nodeScoreList[i].Score = nodeScore.Score * int64(weight)
		}
	})
	if st := firstFailure(); st != nil {
		return nil, st
	}

	return pluginToNodeScores, nil
}
每个node 计算出一个score之后,哪个node 的score 最高,pod 就调度到哪个node 上。
// k8s.io/kubernetes/pkg/scheduler/core/generic_scheduler.go
// Schedule (excerpt) scores the filtered nodes and returns the selected host
// together with the evaluated and feasible node counts.
func (g *genericScheduler) Schedule(ctx context.Context, prof *profile.Profile, state *framework.CycleState, pod *v1.Pod) (result ScheduleResult, err error) {
	// The earlier steps that produce filteredNodes and filteredNodesStatuses
	// are elided in this excerpt.
	...
	priorityList, err := g.prioritizeNodes(ctx, prof, state, pod, filteredNodes)
	if err != nil {
		// Do not select a host from scores that failed to compute.
		return result, err
	}

	host, err := g.selectHost(priorityList)
	return ScheduleResult{
		SuggestedHost:  host,
		EvaluatedNodes: len(filteredNodes) + len(filteredNodesStatuses),
		FeasibleNodes:  len(filteredNodes),
	}, err
}
// selectHost returns the name of the highest-scoring node in nodeScoreList.
// When several nodes share the highest score, one of them is picked at
// random. An empty list yields an error.
func (g *genericScheduler) selectHost(nodeScoreList framework.NodeScoreList) (string, error) {
	if len(nodeScoreList) == 0 {
		return "", fmt.Errorf("empty priorityList")
	}

	best := nodeScoreList[0]
	ties := 1 // number of nodes seen so far that share the best score
	for idx := 1; idx < len(nodeScoreList); idx++ {
		candidate := nodeScoreList[idx]
		switch {
		case candidate.Score > best.Score:
			// Strictly better: it becomes the sole leader.
			best = candidate
			ties = 1
		case candidate.Score == best.Score:
			// Replace the current pick with probability 1/ties, so every
			// tied node ends up equally likely to be chosen.
			ties++
			if rand.Intn(ties) == 0 {
				best = candidate
			}
		}
	}
	return best.Name, nil
}
每个 ScorePlugin 计算的 score 限定在 [0,100](即 [MinNodeScore, MaxNodeScore]);从下面的默认配置可以看到,大部分插件的默认权重为 1,个别插件(nodepreferavoidpods)的权重为 10000(基本就是一票否决了)。
// k8s.io/kubernetes/pkg/scheduler/algorithmprovider/registry.go
// getDefaultConfig returns a schedulerapi.Plugins value. This excerpt shows
// only its Score plugin set; the other fields are elided ("...").
func getDefaultConfig() *schedulerapi.Plugins {
return &schedulerapi.Plugins{
...
Score: &schedulerapi.PluginSet{
// Every score plugin listed here has weight 1 except nodepreferavoidpods,
// whose weight is 10000.
Enabled: []schedulerapi.Plugin{
{Name: noderesources.BalancedAllocationName, Weight: 1},
{Name: imagelocality.Name, Weight: 1},
{Name: interpodaffinity.Name, Weight: 1},
{Name: noderesources.LeastAllocatedName, Weight: 1},
{Name: nodeaffinity.Name, Weight: 1},
{Name: nodepreferavoidpods.Name, Weight: 10000},
{Name: defaultpodtopologyspread.Name, Weight: 1},
{Name: tainttoleration.Name, Weight: 1},
},
},
}
}
scheduler 在score结束会记录日志:
// k8s.io/kubernetes/pkg/scheduler/core/generic_scheduler.go
// prioritizeNodes (excerpt): only the logging of the per-node scores is shown
// here; the rest of the body, including the return, is elided ("...").
func (g *genericScheduler) prioritizeNodes(
ctx context.Context,
prof *profile.Profile,
state *framework.CycleState,
pod *v1.Pod,
nodes []*v1.Node,
) (framework.NodeScoreList, error) {
...
// Guarded by klog verbosity level 10: log the name and score of every
// entry in result.
if klog.V(10) {
for i := range result {
klog.Infof("Host %s => Score %d", result[i].Name, result[i].Score)
}
}
}
修改调度器的启动参数: --v=10
spec:
  containers:
  - args:
    - /bin/kube-scheduler
    - --address=0.0.0.0
    - --leader-elect=false
    - --config=/etc/kubernetes/scheduler-config.yaml
    - --scheduler-name=scheduler-plugins-scheduler
    - --v=10
调高日志打印等级即可看到:
scheduler for nginx-deployment-77dfb65897-fh45h in node 10.231.2.20
scheduler for nginx-deployment-77dfb65897-fh45h in node 10.231.2.46
scheduler for nginx-deployment-77dfb65897-fh45h in node 10.231.2.21
scheduler for nginx-deployment-77dfb65897-6vn85 in node 10.231.2.21
scheduler for nginx-deployment-77dfb65897-6vn85 in node 10.231.2.46
scheduler for nginx-deployment-77dfb65897-6vn85 in node 10.231.2.20
I0920 09:03:16.856661 1 resource_allocation.go:78] nginx-deployment-77dfb65897-6vn85 -> 10.231.2.21: NodeResourcesBalancedAllocation, map of allocatable resources map[cpu:4000 memory:8095989760], map of requested resources map[cpu:500 memory:943718400] ,score 99,
I0920 09:03:16.856697 1 resource_allocation.go:78] nginx-deployment-77dfb65897-6vn85 -> 10.231.2.21: NodeResourcesLeastAllocated, map of allocatable resources map[cpu:4000 memory:8095989760], map of requested resources map[cpu:500 memory:943718400] ,score 87,
I0920 09:03:16.856693 1 resource_allocation.go:78] nginx-deployment-77dfb65897-6vn85 -> 10.231.2.46: NodeResourcesBalancedAllocation, map of allocatable resources map[cpu:4000 memory:8095997952], map of requested resources map[cpu:550 memory:1153433600] ,score 99,
I0920 09:03:16.856714 1 resource_allocation.go:78] nginx-deployment-77dfb65897-6vn85 -> 10.231.2.46: NodeResourcesLeastAllocated, map of allocatable resources map[cpu:4000 memory:8095997952], map of requested resources map[cpu:550 memory:1153433600] ,score 85,
I0920 09:03:16.856714 1 resource_allocation.go:78] nginx-deployment-77dfb65897-6vn85 -> 10.231.2.20: NodeResourcesBalancedAllocation, map of allocatable resources map[cpu:4000 memory:8095997952], map of requested resources map[cpu:600 memory:1153433600] ,score 99,
I0920 09:03:16.856726 1 resource_allocation.go:78] nginx-deployment-77dfb65897-6vn85 -> 10.231.2.20: NodeResourcesLeastAllocated, map of allocatable resources map[cpu:4000 memory:8095997952], map of requested resources map[cpu:600 memory:1153433600] ,score 85,
I0920 09:03:16.856801 1 generic_scheduler.go:504] Plugin NodeResourcesLeastAllocated scores on default/nginx-deployment-77dfb65897-6vn85 => [{10.231.2.21 87} {10.231.2.46 85} {10.231.2.20 85}]
I0920 09:03:16.856813 1 generic_scheduler.go:504] Plugin NodeAffinity scores on default/nginx-deployment-77dfb65897-6vn85 => [{10.231.2.21 0} {10.231.2.46 0} {10.231.2.20 0}]
I0920 09:03:16.856820 1 generic_scheduler.go:504] Plugin NodePreferAvoidPods scores on default/nginx-deployment-77dfb65897-6vn85 => [{10.231.2.21 1000000} {10.231.2.46 1000000} {10.231.2.20 1000000}]
I0920 09:03:16.856827 1 generic_scheduler.go:504] Plugin PodTopologySpread scores on default/nginx-deployment-77dfb65897-6vn85 => [{10.231.2.21 0} {10.231.2.46 0} {10.231.2.20 0}]
I0920 09:03:16.856835 1 generic_scheduler.go:504] Plugin TaintToleration scores on default/nginx-deployment-77dfb65897-6vn85 => [{10.231.2.21 100} {10.231.2.46 100} {10.231.2.20 100}]
I0920 09:03:16.856846 1 generic_scheduler.go:504] Plugin NodeResourcesBalancedAllocation scores on default/nginx-deployment-77dfb65897-6vn85 => [{10.231.2.21 99} {10.231.2.46 99} {10.231.2.20 99}]
I0920 09:03:16.856852 1 generic_scheduler.go:504] Plugin ImageLocality scores on default/nginx-deployment-77dfb65897-6vn85 => [{10.231.2.21 2} {10.231.2.46 0} {10.231.2.20 2}]
I0920 09:03:16.856862 1 generic_scheduler.go:504] Plugin InterPodAffinity scores on default/nginx-deployment-77dfb65897-6vn85 => [{10.231.2.21 0} {10.231.2.46 0} {10.231.2.20 0}]
I0920 09:03:16.856871 1 generic_scheduler.go:560] Host 10.231.2.21 => Score 1000288
I0920 09:03:16.856879 1 generic_scheduler.go:560] Host 10.231.2.46 => Score 1000284
I0920 09:03:16.856883 1 generic_scheduler.go:560] Host 10.231.2.20 => Score 1000286
I0920 09:03:16.856938 1 scheduling_queue.go:833] About to try and schedule pod default/nginx-deployment-77dfb65897-vqkd5
I0920 09:03:16.856949 1 scheduler.go:459] Attempting to schedule pod: default/nginx-deployment-77dfb65897-vqkd5
apiVersion: kubescheduler.config.k8s.io/v1
kind: KubeSchedulerConfiguration
profiles:
  - plugins:
      score:
        disabled:
          - name: PodTopologySpread
        enabled:
          - name: MyCustomPluginA
            weight: 2
          - name: MyCustomPluginB
            weight: 1
每个插件的打分范围:
// MaxNodeScore is the maximum score a Score plugin is expected to return.
MaxNodeScore int64 = 100
// MinNodeScore is the minimum score a Score plugin is expected to return.
MinNodeScore int64 = 0
最终得分的范围:
// MaxTotalScore is the maximum total score.
MaxTotalScore int64 = math.MaxInt64
执行所有插件的 normalize scoring;每个插件对所有节点 score 进行 reduce,最终将分数限制在[MinNodeScore, MaxNodeScore]有效范围
1、插件的最终打分必须落在 0-100 范围内([MinNodeScore, MaxNodeScore])
2、如果插件的原始打分不在 0-100 范围内,需要为该插件实现 NormalizeScore 扩展,把得分归一化到该范围
node的最终得分:每个插件的打分 * 插件的权重 之和
plugin-1-score * plugin-1-weight + plugin-2-score * plugin-2-weight + …