• Linux进程调度-中断/系统调用调度与进程的调度类型


    中断调度/系统调用调度时机

    本文主要以linux-3.10为源码分析学习。

    中断调度

    中断通用的入口,为前文中断概述中讲述的common_interrupt。

    common_interrupt:
    	XCPT_FRAME
    	ASM_CLAC
    	addq $-0x80,(%rsp)		/* Adjust vector to [-256,-1] range */
    	interrupt do_IRQ     # 调用中断处理函数进行处理
    	/* 0(%rsp): old_rsp-ARGOFFSET */
    ret_from_intr:                   # 中断处理返回
    	DISABLE_INTERRUPTS(CLBR_NONE)
    	TRACE_IRQS_OFF
    	decl PER_CPU_VAR(irq_count)
    
    	/* Restore saved previous stack */
    	popq %rsi
    	CFI_DEF_CFA rsi,SS+8-RBP	/* reg/off reset after def_cfa_expr */
    	leaq ARGOFFSET-RBP(%rsi), %rsp
    	CFI_DEF_CFA_REGISTER	rsp
    	CFI_ADJUST_CFA_OFFSET	RBP-ARGOFFSET
    
    exit_intr:
    	GET_THREAD_INFO(%rcx)
    	testl $3,CS-ARGOFFSET(%rsp)
    	je retint_kernel    # 是否返回内核状态
    	/* Interrupt came from user space */
    	/*
    	 * Has a correct top of stack, but a partial stack frame
    	 * %rcx: thread info. Interrupts off.
    	 */
    retint_with_reschedule:    # 检查是否需要进行调度
    	movl $_TIF_WORK_MASK,%edi
    retint_check:
    	LOCKDEP_SYS_EXIT_IRQ
    	movl TI_flags(%rcx),%edx
    	andl %edi,%edx
    	CFI_REMEMBER_STATE
    	jnz  retint_careful
    
    retint_swapgs:		/* return to user-space */
    	/*
    	 * The iretq could re-enable interrupts:
    	 */
    	DISABLE_INTERRUPTS(CLBR_ANY)
    	TRACE_IRQS_IRETQ
    	SWAPGS
    	jmp restore_args
    
    retint_restore_args:	/* return to kernel space */
    	DISABLE_INTERRUPTS(CLBR_ANY)
    	/*
    	 * The iretq could re-enable interrupts:
    	 */
    	TRACE_IRQS_IRETQ
    restore_args:
    	RESTORE_ARGS 1,8,1
    
    irq_return:
    	INTERRUPT_RETURN
    	_ASM_EXTABLE(irq_return, bad_iret)
    
    #ifdef CONFIG_PARAVIRT
    ENTRY(native_iret)
    	iretq
    	_ASM_EXTABLE(native_iret, bad_iret)
    #endif
    
    	.section .fixup,"ax"
    bad_iret:
    	/*
    	 * The iret traps when the %cs or %ss being restored is bogus.
    	 * We've lost the original trap vector and error code.
    	 * #GPF is the most likely one to get for an invalid selector.
    	 * So pretend we completed the iret and took the #GPF in user mode.
    	 *
    	 * We are now running with the kernel GS after exception recovery.
    	 * But error_entry expects us to have user GS to match the user %cs,
    	 * so swap back.
    	 */
    	pushq $0
    
    	SWAPGS
    	jmp general_protection
    
    	.previous
    
    	/* edi: workmask, edx: work */
    retint_careful:
    	CFI_RESTORE_STATE
    	bt    $TIF_NEED_RESCHED,%edx   # 检查调度标致位
    	jnc   retint_signal            # 处理之前先处理信号
    	TRACE_IRQS_ON
    	ENABLE_INTERRUPTS(CLBR_NONE)
    	pushq_cfi %rdi
    	SCHEDULE_USER                  # 调用主动调取
    	popq_cfi %rdi
    	GET_THREAD_INFO(%rcx)
    	DISABLE_INTERRUPTS(CLBR_NONE)
    	TRACE_IRQS_OFF
    	jmp retint_check
    
    retint_signal:
    	testl $_TIF_DO_NOTIFY_MASK,%edx
    	jz    retint_swapgs
    	TRACE_IRQS_ON
    	ENABLE_INTERRUPTS(CLBR_NONE)
    	SAVE_REST
    	movq $-1,ORIG_RAX(%rsp)
    	xorl %esi,%esi		# oldset
    	movq %rsp,%rdi		# &pt_regs
    	call do_notify_resume
    	RESTORE_REST
    	DISABLE_INTERRUPTS(CLBR_NONE)
    	TRACE_IRQS_OFF
    	GET_THREAD_INFO(%rcx)
    	jmp retint_with_reschedule   # 检查是否需要再调度
    
    #ifdef CONFIG_PREEMPT
    	/* Returning to kernel space. Check if we need preemption */
    	/* rcx:	 threadinfo. interrupts off. */
    ENTRY(retint_kernel)
    	cmpl $0,TI_preempt_count(%rcx)   # 检查是否加锁
    	jnz  retint_restore_args
    	bt  $TIF_NEED_RESCHED,TI_flags(%rcx)  # 判断调度标志位是否需要调度
    	jnc  retint_restore_args
    	bt   $9,EFLAGS-ARGOFFSET(%rsp)	/* interrupts off? */
    	jnc  retint_restore_args
    	call preempt_schedule_irq          # 主动调度
    	jmp exit_intr
    #endif
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19
    • 20
    • 21
    • 22
    • 23
    • 24
    • 25
    • 26
    • 27
    • 28
    • 29
    • 30
    • 31
    • 32
    • 33
    • 34
    • 35
    • 36
    • 37
    • 38
    • 39
    • 40
    • 41
    • 42
    • 43
    • 44
    • 45
    • 46
    • 47
    • 48
    • 49
    • 50
    • 51
    • 52
    • 53
    • 54
    • 55
    • 56
    • 57
    • 58
    • 59
    • 60
    • 61
    • 62
    • 63
    • 64
    • 65
    • 66
    • 67
    • 68
    • 69
    • 70
    • 71
    • 72
    • 73
    • 74
    • 75
    • 76
    • 77
    • 78
    • 79
    • 80
    • 81
    • 82
    • 83
    • 84
    • 85
    • 86
    • 87
    • 88
    • 89
    • 90
    • 91
    • 92
    • 93
    • 94
    • 95
    • 96
    • 97
    • 98
    • 99
    • 100
    • 101
    • 102
    • 103
    • 104
    • 105
    • 106
    • 107
    • 108
    • 109
    • 110
    • 111
    • 112
    • 113
    • 114
    • 115
    • 116
    • 117
    • 118
    • 119
    • 120
    • 121
    • 122
    • 123
    • 124
    • 125
    • 126
    • 127

    其中preempt_schedule_irq函数就是调用主动调度的函数,如下。

    asmlinkage void __sched preempt_schedule_irq(void)
    {
    	struct thread_info *ti = current_thread_info();
    	enum ctx_state prev_state;
    
    	/* Catch callers which need to be fixed */
    	BUG_ON(ti->preempt_count || !irqs_disabled());
    
    	prev_state = exception_enter();
    
    	do {
    		add_preempt_count(PREEMPT_ACTIVE);
    		local_irq_enable();
    		__schedule();   // 主动调度
    		local_irq_disable();
    		sub_preempt_count(PREEMPT_ACTIVE);
    
    		/*
    		 * Check again in case we missed a preemption opportunity
    		 * between schedule and now.
    		 */
    		barrier();
    	} while (need_resched());
    
    	exception_exit(prev_state);
    }
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19
    • 20
    • 21
    • 22
    • 23
    • 24
    • 25
    • 26
    系统调用

    在Linux系统中,系统调用的方式主要就是常见的三种形式。

    1. 通过int 80陷入内核进行系统调用。
    2. 通过Intel32位sysenter/sysexit快速系统调用。
    3. 通过64位模式下的syscall/sysret快速系统调用。

    其中方式1为Linux传统的系统调用方式,该方式需要通过软中断陷入内核,执行完成之后再返回用户空间,主要的问题就是软中断的效率较低,故方式2、3就是为了避开频繁的中断来提高系统调用的效率,其主要原理是给运行的可执行文件映射一个vdso中的__kernel_vsyscall函数,从而能够快速进行系统调用以提升效率。

    syscall初始化(64位)

    在trap_init->cpu_init->syscall_init的调用链中,初始化快速系统调用的处理函数。

    void syscall_init(void)
    {
    	/*
    	 * LSTAR and STAR live in a bit strange symbiosis.
    	 * They both write to the same internal register. STAR allows to
    	 * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip.
    	 */
    	wrmsrl(MSR_STAR,  ((u64)__USER32_CS)<<48  | ((u64)__KERNEL_CS)<<32);
    	wrmsrl(MSR_LSTAR, system_call);  //设置处理的回调函数
    	wrmsrl(MSR_CSTAR, ignore_sysret);
    
    #ifdef CONFIG_IA32_EMULATION
    	syscall32_cpu_init();
    #endif
    
    	/* Flags to clear on syscall */
    	wrmsrl(MSR_SYSCALL_MASK,
    	       X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|
    	       X86_EFLAGS_IOPL|X86_EFLAGS_AC);
    }
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19
    • 20

    MSR_LSTAR中存放的就是快速系统调用的入口处理函数即system_call函数。详细内容可自行查阅syscall的相关资料。

    int 80初始化

    在调用trap_init->set_system_trap_gate(SYSCALL_VECTOR, &system_call);

    此处就是将0x80号向量设置为对应的系统调用中断处理函数。

    # define SYSCALL_VECTOR			0x80
    
    • 1
    system_call流程
    ENTRY(system_call)
    	CFI_STARTPROC	simple
    	CFI_SIGNAL_FRAME
    	CFI_DEF_CFA	rsp,KERNEL_STACK_OFFSET
    	CFI_REGISTER	rip,rcx
    	/*CFI_REGISTER	rflags,r11*/
    	SWAPGS_UNSAFE_STACK
    	/*
    	 * A hypervisor implementation might want to use a label
    	 * after the swapgs, so that it can do the swapgs
    	 * for the guest and jump here on syscall.
    	 */
    GLOBAL(system_call_after_swapgs)
    
    	movq	%rsp,PER_CPU_VAR(old_rsp)
    	movq	PER_CPU_VAR(kernel_stack),%rsp
    	/*
    	 * No need to follow this irqs off/on section - it's straight
    	 * and short:
    	 */
    	ENABLE_INTERRUPTS(CLBR_NONE)
    	SAVE_ARGS 8,0
    	movq  %rax,ORIG_RAX-ARGOFFSET(%rsp)
    	movq  %rcx,RIP-ARGOFFSET(%rsp)
    	CFI_REL_OFFSET rip,RIP-ARGOFFSET
    	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
    	jnz tracesys
    system_call_fastpath:
    #if __SYSCALL_MASK == ~0
    	cmpq $__NR_syscall_max,%rax
    #else
    	andl $__SYSCALL_MASK,%eax
    	cmpl $__NR_syscall_max,%eax
    #endif
    	ja badsys
    	movq %r10,%rcx
    	call *sys_call_table(,%rax,8)  # XXX:	 rip relative   // 跳入系统调用的回调函数执行
    	movq %rax,RAX-ARGOFFSET(%rsp)
    /*
     * Syscall return path ending with SYSRET (fast path)
     * Has incomplete stack frame and undefined top of stack.
     */
    ret_from_sys_call:      # 返回检查系统调用
    	movl $_TIF_ALLWORK_MASK,%edi
    	/* edi:	flagmask */
    sysret_check:
    	LOCKDEP_SYS_EXIT
    	DISABLE_INTERRUPTS(CLBR_NONE)
    	TRACE_IRQS_OFF
    	movl TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET),%edx
    	andl %edi,%edx
    	jnz  sysret_careful
    	CFI_REMEMBER_STATE
    	/*
    	 * sysretq will re-enable interrupts:
    	 */
    	TRACE_IRQS_ON
    	movq RIP-ARGOFFSET(%rsp),%rcx
    	CFI_REGISTER	rip,rcx
    	RESTORE_ARGS 1,-ARG_SKIP,0
    	/*CFI_REGISTER	rflags,r11*/
    	movq	PER_CPU_VAR(old_rsp), %rsp
    	USERGS_SYSRET64
    
    	CFI_RESTORE_STATE
    	/* Handle reschedules */
    	/* edx:	work, edi: workmask */
    sysret_careful:
    	bt $TIF_NEED_RESCHED,%edx    # 检查是否需要调度,
    	jnc sysret_signal            # 检查是否需要处理信号,优先处理信号
    	TRACE_IRQS_ON
    	ENABLE_INTERRUPTS(CLBR_NONE)
    	pushq_cfi %rdi
    	SCHEDULE_USER                # 进行调度
    	popq_cfi %rdi
    	jmp sysret_check
    	
    	/* Handle a signal */
    sysret_signal:
    	TRACE_IRQS_ON
    	ENABLE_INTERRUPTS(CLBR_NONE)
    #ifdef CONFIG_AUDITSYSCALL
    	bt $TIF_SYSCALL_AUDIT,%edx
    	jc sysret_audit
    #endif
    	/*
    	 * We have a signal, or exit tracing or single-step.
    	 * These all wind up with the iret return path anyway,
    	 * so just join that path right now.
    	 */
    	FIXUP_TOP_OF_STACK %r11, -ARGOFFSET
    	jmp int_check_syscall_exit_work
    
    badsys:
    	movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
    	jmp ret_from_sys_call
    
    #ifdef CONFIG_AUDITSYSCALL
    	/*
    	 * Fast path for syscall audit without full syscall trace.
    	 * We just call __audit_syscall_entry() directly, and then
    	 * jump back to the normal fast path.
    	 */
    auditsys:
    	movq %r10,%r9			/* 6th arg: 4th syscall arg */
    	movq %rdx,%r8			/* 5th arg: 3rd syscall arg */
    	movq %rsi,%rcx			/* 4th arg: 2nd syscall arg */
    	movq %rdi,%rdx			/* 3rd arg: 1st syscall arg */
    	movq %rax,%rsi			/* 2nd arg: syscall number */
    	movl $AUDIT_ARCH_X86_64,%edi	/* 1st arg: audit arch */
    	call __audit_syscall_entry
    	LOAD_ARGS 0		/* reload call-clobbered registers */
    	jmp system_call_fastpath
    
    	/*
    	 * Return fast path for syscall audit.  Call __audit_syscall_exit()
    	 * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT
    	 * masked off.
    	 */
    sysret_audit:
    	movq RAX-ARGOFFSET(%rsp),%rsi	/* second arg, syscall return value */
    	cmpq $-MAX_ERRNO,%rsi	/* is it < -MAX_ERRNO? */
    	setbe %al		/* 1 if so, 0 if not */
    	movzbl %al,%edi		/* zero-extend that into %edi */
    	call __audit_syscall_exit
    	movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
    	jmp sysret_check
    #endif	/* CONFIG_AUDITSYSCALL */
    
    	/* Do syscall tracing */
    tracesys:
    #ifdef CONFIG_AUDITSYSCALL
    	testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
    	jz auditsys
    #endif
    	SAVE_REST
    	movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
    	FIXUP_TOP_OF_STACK %rdi
    	movq %rsp,%rdi
    	call syscall_trace_enter
    	/*
    	 * Reload arg registers from stack in case ptrace changed them.
    	 * We don't reload %rax because syscall_trace_enter() returned
    	 * the value it wants us to use in the table lookup.
    	 */
    	LOAD_ARGS ARGOFFSET, 1
    	RESTORE_REST
    #if __SYSCALL_MASK == ~0
    	cmpq $__NR_syscall_max,%rax
    #else
    	andl $__SYSCALL_MASK,%eax
    	cmpl $__NR_syscall_max,%eax
    #endif
    	ja   int_ret_from_sys_call	/* RAX(%rsp) set to -ENOSYS above */
    	movq %r10,%rcx	/* fixup for C */
    	call *sys_call_table(,%rax,8)
    	movq %rax,RAX-ARGOFFSET(%rsp)
    	/* Use IRET because user could have changed frame */
    
    /*
     * Syscall return path ending with IRET.
     * Has correct top of stack, but partial stack frame.
     */
    GLOBAL(int_ret_from_sys_call)
    	DISABLE_INTERRUPTS(CLBR_NONE)
    	TRACE_IRQS_OFF
    	movl $_TIF_ALLWORK_MASK,%edi
    	/* edi:	mask to check */
    GLOBAL(int_with_check)
    	LOCKDEP_SYS_EXIT_IRQ
    	GET_THREAD_INFO(%rcx)
    	movl TI_flags(%rcx),%edx
    	andl %edi,%edx
    	jnz   int_careful
    	andl    $~TS_COMPAT,TI_status(%rcx)
    	jmp   retint_swapgs
    
    	/* Either reschedule or signal or syscall exit tracking needed. */
    	/* First do a reschedule test. */
    	/* edx:	work, edi: workmask */
    int_careful:
    	bt $TIF_NEED_RESCHED,%edx   # 检查是否需要调度 
    	jnc  int_very_careful
    	TRACE_IRQS_ON
    	ENABLE_INTERRUPTS(CLBR_NONE)
    	pushq_cfi %rdi
    	SCHEDULE_USER         # 主动调度
    	popq_cfi %rdi
    	DISABLE_INTERRUPTS(CLBR_NONE)
    	TRACE_IRQS_OFF
    	jmp int_with_check
    
    	/* handle signals and tracing -- both require a full stack frame */
    int_very_careful:
    	TRACE_IRQS_ON
    	ENABLE_INTERRUPTS(CLBR_NONE)
    int_check_syscall_exit_work:
    	SAVE_REST
    	/* Check for syscall exit trace */
    	testl $_TIF_WORK_SYSCALL_EXIT,%edx
    	jz int_signal
    	pushq_cfi %rdi
    	leaq 8(%rsp),%rdi	# &ptregs -> arg1
    	call syscall_trace_leave
    	popq_cfi %rdi
    	andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi
    	jmp int_restore_rest
    
    int_signal:
    	testl $_TIF_DO_NOTIFY_MASK,%edx
    	jz 1f
    	movq %rsp,%rdi		# &ptregs -> arg1
    	xorl %esi,%esi		# oldset -> arg2
    	call do_notify_resume
    1:	movl $_TIF_WORK_MASK,%edi
    int_restore_rest:
    	RESTORE_REST
    	DISABLE_INTERRUPTS(CLBR_NONE)
    	TRACE_IRQS_OFF
    	jmp int_with_check
    	CFI_ENDPROC
    END(system_call)
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19
    • 20
    • 21
    • 22
    • 23
    • 24
    • 25
    • 26
    • 27
    • 28
    • 29
    • 30
    • 31
    • 32
    • 33
    • 34
    • 35
    • 36
    • 37
    • 38
    • 39
    • 40
    • 41
    • 42
    • 43
    • 44
    • 45
    • 46
    • 47
    • 48
    • 49
    • 50
    • 51
    • 52
    • 53
    • 54
    • 55
    • 56
    • 57
    • 58
    • 59
    • 60
    • 61
    • 62
    • 63
    • 64
    • 65
    • 66
    • 67
    • 68
    • 69
    • 70
    • 71
    • 72
    • 73
    • 74
    • 75
    • 76
    • 77
    • 78
    • 79
    • 80
    • 81
    • 82
    • 83
    • 84
    • 85
    • 86
    • 87
    • 88
    • 89
    • 90
    • 91
    • 92
    • 93
    • 94
    • 95
    • 96
    • 97
    • 98
    • 99
    • 100
    • 101
    • 102
    • 103
    • 104
    • 105
    • 106
    • 107
    • 108
    • 109
    • 110
    • 111
    • 112
    • 113
    • 114
    • 115
    • 116
    • 117
    • 118
    • 119
    • 120
    • 121
    • 122
    • 123
    • 124
    • 125
    • 126
    • 127
    • 128
    • 129
    • 130
    • 131
    • 132
    • 133
    • 134
    • 135
    • 136
    • 137
    • 138
    • 139
    • 140
    • 141
    • 142
    • 143
    • 144
    • 145
    • 146
    • 147
    • 148
    • 149
    • 150
    • 151
    • 152
    • 153
    • 154
    • 155
    • 156
    • 157
    • 158
    • 159
    • 160
    • 161
    • 162
    • 163
    • 164
    • 165
    • 166
    • 167
    • 168
    • 169
    • 170
    • 171
    • 172
    • 173
    • 174
    • 175
    • 176
    • 177
    • 178
    • 179
    • 180
    • 181
    • 182
    • 183
    • 184
    • 185
    • 186
    • 187
    • 188
    • 189
    • 190
    • 191
    • 192
    • 193
    • 194
    • 195
    • 196
    • 197
    • 198
    • 199
    • 200
    • 201
    • 202
    • 203
    • 204
    • 205
    • 206
    • 207
    • 208
    • 209
    • 210
    • 211
    • 212
    • 213
    • 214
    • 215
    • 216
    • 217
    • 218
    • 219
    • 220
    • 221
    • 222

    其中SCHEDULE_USER的定义如下

    #ifdef CONFIG_CONTEXT_TRACKING
    # define SCHEDULE_USER call schedule_user
    #else
    # define SCHEDULE_USER call schedule
    #endif
    
    • 1
    • 2
    • 3
    • 4
    • 5

    调用schedule相关函数。至此中断调度和系统调用的时机检查基本如上流程。

    进程的调度类型

    在进程进行schedule的时候会选择下一个需要调度的进程。

    static void __sched __schedule(void)
    {
    	struct task_struct *prev, *next;
    	unsigned long *switch_count;
    	struct rq *rq;
    	int cpu;
    
    need_resched:
    	preempt_disable();
    	cpu = smp_processor_id();   // 获取当前的cpu
    	rq = cpu_rq(cpu);           // 获取当前的调度队列
    	rcu_note_context_switch(cpu);
    	prev = rq->curr;            // 获取当前进程
    	...
    
    	pre_schedule(rq, prev);
    
    	if (unlikely(!rq->nr_running))
    		idle_balance(cpu, rq);
    
    	put_prev_task(rq, prev);  // 放置当前进程到队列中
    	next = pick_next_task(rq);    // 选择下一个待执行进程
    	clear_tsk_need_resched(prev);
    	rq->skip_clock_update = 0;
    	
      ...
    }
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19
    • 20
    • 21
    • 22
    • 23
    • 24
    • 25
    • 26
    • 27

    调度的主要流程就是选择下一个可执行的进程pick_next_task。

    static inline struct task_struct *
    pick_next_task(struct rq *rq)
    {
    	const struct sched_class *class;
    	struct task_struct *p;
    
    	/*
    	 * Optimization: we know that if all tasks are in
    	 * the fair class we can call that function directly:
    	 */
    	if (likely(rq->nr_running == rq->cfs.h_nr_running)) {  // 如果当前的都是cfs进程则直接进行调度
    		p = fair_sched_class.pick_next_task(rq);
    		if (likely(p))
    			return p;
    	}
    
    	for_each_class(class) {
    		p = class->pick_next_task(rq);  // 便利每一个class分别选用不用的策略来进行调度
    		if (p)
    			return p;
    	}
    
    	BUG(); /* the idle class will always have a runnable task */
    }
    
    ...
      
    #define sched_class_highest (&stop_sched_class)
    #define for_each_class(class) \
       for (class = sched_class_highest; class; class = class->next)
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19
    • 20
    • 21
    • 22
    • 23
    • 24
    • 25
    • 26
    • 27
    • 28
    • 29
    • 30

    此时就是从stop_sched_class开始,一路向最低优先级遍历选择。

    stop_sched_class->rt_sched_class->fair_sched_class->idle_sched_class
    
    • 1

    优先级如上所示,先是stop_sched_class,再是实时进程,然后是cfs公平调度算法,最后是空闲算法。每个class都通过pick_next_task函数来选择下一个运行的进程。

    stop_sched_class
    static struct task_struct *pick_next_task_stop(struct rq *rq)
    {
    	struct task_struct *stop = rq->stop;
    
    	if (stop && stop->on_rq) {
    		stop->se.exec_start = rq->clock_task;  // 检查是否有 有则选择stop的进程
    		return stop;
    	}
    
    	return NULL;  // 否则返回空
    }
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11

    该调度类用于内核的stop任务,例如在进行任务迁移等场景时采用。

    rt_sched_class
    static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
    						   struct rt_rq *rt_rq)
    {
    	struct rt_prio_array *array = &rt_rq->active;
    	struct sched_rt_entity *next = NULL;
    	struct list_head *queue;
    	int idx;
    
    	idx = sched_find_first_bit(array->bitmap);  // 比较进程的优先级 根据优先级选择进程
    	BUG_ON(idx >= MAX_RT_PRIO);
    
    	queue = array->queue + idx;
    	next = list_entry(queue->next, struct sched_rt_entity, run_list);
    
    	return next;
    }
    
    static struct task_struct *_pick_next_task_rt(struct rq *rq)
    {
    	struct sched_rt_entity *rt_se;
    	struct task_struct *p;
    	struct rt_rq *rt_rq;
    
    	rt_rq = &rq->rt;
    
    	if (!rt_rq->rt_nr_running)
    		return NULL;
    
    	if (rt_rq_throttled(rt_rq))
    		return NULL;
    
    	do {
    		rt_se = pick_next_rt_entity(rq, rt_rq);  // 获取下一个高优先级进程
    		BUG_ON(!rt_se);
    		rt_rq = group_rt_rq(rt_se);  // 选择下一个优先组队列
    	} while (rt_rq);
    
    	p = rt_task_of(rt_se);
    	p->se.exec_start = rq->clock_task;  // 设置
    
    	return p;
    }
    
    static struct task_struct *pick_next_task_rt(struct rq *rq)
    {
    	struct task_struct *p = _pick_next_task_rt(rq);  // 选择下一个进程
    
    	/* The running task is never eligible for pushing */
    	if (p)
    		dequeue_pushable_task(rq, p);
    
    #ifdef CONFIG_SMP
    	/*
    	 * We detect this state here so that we can avoid taking the RQ
    	 * lock again later if there is no need to push
    	 */
    	rq->post_schedule = has_pushable_tasks(rq);
    #endif
    
    	return p;
    }
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19
    • 20
    • 21
    • 22
    • 23
    • 24
    • 25
    • 26
    • 27
    • 28
    • 29
    • 30
    • 31
    • 32
    • 33
    • 34
    • 35
    • 36
    • 37
    • 38
    • 39
    • 40
    • 41
    • 42
    • 43
    • 44
    • 45
    • 46
    • 47
    • 48
    • 49
    • 50
    • 51
    • 52
    • 53
    • 54
    • 55
    • 56
    • 57
    • 58
    • 59
    • 60
    • 61

    主要是通过优先级来进行选择下一个待运行的进程,该调度策略为实时调度策略,在进程响应要求较高的时候优势比较明显。

    fair_sched_class

    该策略为公平调度策略,是当前应用较为广泛的一种调度策略,将进程分为不同的调度组,根据组的运行的整体时间来进行调度。

    static struct task_struct *pick_next_task_fair(struct rq *rq)
    {
    	struct task_struct *p;
    	struct cfs_rq *cfs_rq = &rq->cfs;
    	struct sched_entity *se;
    
    	if (!cfs_rq->nr_running)
    		return NULL;
    
    	do {
    		se = pick_next_entity(cfs_rq);  // 选择下一个调度实体
    		set_next_entity(cfs_rq, se);
    		cfs_rq = group_cfs_rq(se);
    	} while (cfs_rq);
    
    	p = task_of(se);
    	if (hrtick_enabled(rq))
    		hrtick_start_fair(rq, p);    // 选择调度实体中可调度的进程
    
    	return p;
    }
    
    /*
     * Pick the next process, keeping these things in mind, in this order:
     * 1) keep things fair between processes/task groups
     * 2) pick the "next" process, since someone really wants that to run
     * 3) pick the "last" process, for cache locality
     * 4) do not run the "skip" process, if something else is available
     */
    static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
    {
    	struct sched_entity *se = __pick_first_entity(cfs_rq);  // 获取下一个运行的节点
    	struct sched_entity *left = se;
    
    	/*
    	 * Avoid running the skip buddy, if running something else can
    	 * be done without getting too unfair.
    	 */
    	if (cfs_rq->skip == se) {   // 是否skip如果跳过则选择下一个
    		struct sched_entity *second = __pick_next_entity(se);  // 
    		if (second && wakeup_preempt_entity(second, left) < 1)
    			se = second;
    	}
    
    	/*
    	 * Prefer last buddy, try to return the CPU to a preempted task.
    	 */
    	if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
    		se = cfs_rq->last;  
    
    	/*
    	 * Someone really wants this to run. If it's not unfair, run it.
    	 */
    	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
    		se = cfs_rq->next;
    
    	clear_buddies(cfs_rq, se);
    
    	return se;
    }
    
    
    static struct sched_entity *__pick_next_entity(struct sched_entity *se)
    {
    	struct rb_node *next = rb_next(&se->run_node);  // 获取树中最小的节点运行
    
    	if (!next)
    		return NULL;
    
    	return rb_entry(next, struct sched_entity, run_node); // 获取节点
    }
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19
    • 20
    • 21
    • 22
    • 23
    • 24
    • 25
    • 26
    • 27
    • 28
    • 29
    • 30
    • 31
    • 32
    • 33
    • 34
    • 35
    • 36
    • 37
    • 38
    • 39
    • 40
    • 41
    • 42
    • 43
    • 44
    • 45
    • 46
    • 47
    • 48
    • 49
    • 50
    • 51
    • 52
    • 53
    • 54
    • 55
    • 56
    • 57
    • 58
    • 59
    • 60
    • 61
    • 62
    • 63
    • 64
    • 65
    • 66
    • 67
    • 68
    • 69
    • 70
    • 71

    因为cfs是基于红黑树来组织就绪进程的,故在获取待调度进程的时候可以直接选择红黑树中虚拟运行时间最少的节点对应的进程,其中有关进程运行时间的统计与计算会跟随scheduler_tick来对进程进行标记(有机会后续分析)。

    idle_sched_class
    static struct task_struct *pick_next_task_idle(struct rq *rq)
    {
    	schedstat_inc(rq, sched_goidle);
    #ifdef CONFIG_SMP
    	/* Trigger the post schedule to do an idle_enter for CFS */
    	rq->post_schedule = 1;
    #endif
    	return rq->idle;
    }
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9

    这个空闲的进程是在start_kernel之后,给每个cpu都复制了一个进程。

    kernel_init->kernel_init_freeable->smp_init->idle_threads_init->idle_init->fork_idle

    struct task_struct * __cpuinit fork_idle(int cpu)
    {
    	struct task_struct *task;
    	task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0); // 拷贝0号进场当做idle进场
    	if (!IS_ERR(task)) {
    		init_idle_pids(task->pids);
    		init_idle(task, cpu);  // 设置task相关信息
    	}
    
    	return task;
    }
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11

    此时init_struct_pid就是初始化的进程,即拷贝的进程运行如下逻辑。

    start_kernel->rest_init->cpu_startup_entry流程,

    // idle.c
    void cpu_startup_entry(enum cpuhp_state state)
    {
    	/*
    	 * This #ifdef needs to die, but it's too late in the cycle to
    	 * make this generic (arm and sh have never invoked the canary
    	 * init for the non boot cpus!). Will be fixed in 3.11
    	 */
    #ifdef CONFIG_X86
    	/*
    	 * If we're the non-boot CPU, nothing set the stack canary up
    	 * for us. The boot CPU already has it initialized but no harm
    	 * in doing it again. This is a good place for updating it, as
    	 * we wont ever return from this function (so the invalid
    	 * canaries already on the stack wont ever trigger).
    	 */
    	boot_init_stack_canary();
    #endif
    	current_set_polling();
    	arch_cpu_idle_prepare();
    	cpu_idle_loop();
    }
    
    /*
     * Generic idle loop implementation
     */
    static void cpu_idle_loop(void)
    {
    	while (1) {
    		tick_nohz_idle_enter();
    
    		while (!need_resched()) {  // 检查是否需要调度
    			check_pgt_cache();
    			rmb();
    
    			if (cpu_is_offline(smp_processor_id()))
    				arch_cpu_idle_dead();
    
    			local_irq_disable();
    			arch_cpu_idle_enter();
    
    			/*
    			 * In poll mode we reenable interrupts and spin.
    			 *
    			 * Also if we detected in the wakeup from idle
    			 * path that the tick broadcast device expired
    			 * for us, we don't want to go deep idle as we
    			 * know that the IPI is going to arrive right
    			 * away
    			 */
    			if (cpu_idle_force_poll || tick_check_broadcast_expired()) {
    				cpu_idle_poll();
    			} else {
    				current_clr_polling();
    				if (!need_resched()) {
    					stop_critical_timings();
    					rcu_idle_enter();
    					arch_cpu_idle();
    					WARN_ON_ONCE(irqs_disabled());
    					rcu_idle_exit();
    					start_critical_timings();
    				} else {
    					local_irq_enable();
    				}
    				current_set_polling();
    			}
    			arch_cpu_idle_exit();
    		}
    		tick_nohz_idle_exit();
    		schedule_preempt_disabled();  // 检查是否需要调度其他进程
    	}
    }
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19
    • 20
    • 21
    • 22
    • 23
    • 24
    • 25
    • 26
    • 27
    • 28
    • 29
    • 30
    • 31
    • 32
    • 33
    • 34
    • 35
    • 36
    • 37
    • 38
    • 39
    • 40
    • 41
    • 42
    • 43
    • 44
    • 45
    • 46
    • 47
    • 48
    • 49
    • 50
    • 51
    • 52
    • 53
    • 54
    • 55
    • 56
    • 57
    • 58
    • 59
    • 60
    • 61
    • 62
    • 63
    • 64
    • 65
    • 66
    • 67
    • 68
    • 69
    • 70
    • 71
    • 72

    如上就是每个cpu上idle进程的主体逻辑:主要就是检查是否有需要调度的进程,若有则进行调度。

    总结

    本文主要就是了解了一下中断调度和系统调用场景下的调度时机,有关中断和系统调用的逻辑流程本文没有深入学习(可查阅相关资料)。在了解了调度时机之后,又了解了进程调度的类型:在Linux3.10中有四种调度类型,本文并没有深入分析每个调度类的具体逻辑细节,只是简单概述了rt的优先级调度和cfs公平调度基于红黑树选择下一个进程的内容,后续有机会再深入学习。由于本人才疏学浅,如有错误请批评指正。

    https://zhuanlan.zhihu.com/p/79236207

    https://www.cnblogs.com/hellokitty2/p/16703115.html

  • 相关阅读:
    即时通讯或者推送消息的守护进程?开启几个进程?
    嵌入式开发十八:USART串口通信实验
    what‘s the meaning of csu in glibc
    甲醇燃料电池(DMFC) 系统
    AI大模型之路 第二篇: Word2Vec介绍
    Go数据库操作插件-xorm
    ETL可视化工具 DataX -- 安装部署 ( 二)
    如何保护您的数据免受.360勒索病毒的感染
    SpringBoot Security 单点登出清除所有业务系统的 token
    ASP.NET预约洗车小程序源码(前台+后台)
  • 原文地址:https://blog.csdn.net/qq_33339479/article/details/127969811