1. 运行队列:runqueue
Linux内核使用运行队列来管理每个CPU上运行的任务,要运行的任务抽象成“调度实体”,调度实体可以是进程、线程或任务组。
对于 `cfs`、`rt`、`dl` 的任务,调度实体分别由 `struct sched_entity`、`struct sched_rt_entity`、`struct sched_dl_entity` 表示。这些结构体直接由 `struct task_struct` 包含。
每个CPU的运行队列是一个 `struct rq`,由per-CPU变量 `runqueues` 定义,并通过下面的宏访问。
1/// kernel/sched/sched.h
2DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
3
4#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
5#define this_rq() this_cpu_ptr(&runqueues)
6#define task_rq(p) cpu_rq(task_cpu(p))
7#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
8#define raw_rq() raw_cpu_ptr(&runqueues)
`sched_init` 会对每个CPU的 `runqueues` 的成员进行初始化,其中包括了 `cfs`、`rt`、`dl` 运行队列的初始化。
1/// kernel/sched/core.c
2 for_each_possible_cpu(i) {
3 struct rq *rq;
4
5 rq = cpu_rq(i);
6 raw_spin_lock_init(&rq->__lock);
7 rq->nr_running = 0;
8 rq->calc_load_active = 0;
9 rq->calc_load_update = jiffies + LOAD_FREQ;
10 /// 初始化cfs/rt/dl
11 init_cfs_rq(&rq->cfs);
12 init_rt_rq(&rq->rt);
13 init_dl_rq(&rq->dl);
14#ifdef CONFIG_FAIR_GROUP_SCHED
15 /// ... ...
16#endif /* CONFIG_FAIR_GROUP_SCHED */
17
18 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
19#ifdef CONFIG_RT_GROUP_SCHED
20 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
21#endif
22#ifdef CONFIG_SMP
23 /// sd和rd的初始化见build_sched_domains
24 rq->sd = NULL;
25 rq->rd = NULL;
26 rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE;
27 rq->balance_callback = &balance_push_callback;
28 rq->active_balance = 0;
29 rq->next_balance = jiffies;
30 rq->push_cpu = 0;
31 rq->cpu = i;
32 rq->online = 0;
33 rq->idle_stamp = 0;
34 rq->avg_idle = 2*sysctl_sched_migration_cost;
35 rq->wake_stamp = jiffies;
36 rq->wake_avg_idle = rq->avg_idle;
37 rq->max_idle_balance_cost = sysctl_sched_migration_cost;
38
39 INIT_LIST_HEAD(&rq->cfs_tasks);
40
41 rq_attach_root(rq, &def_root_domain);
42 /// ... ...
43#endif /* CONFIG_SMP */
44 hrtick_rq_init(rq);
45 atomic_set(&rq->nr_iowait, 0);
46
47#ifdef CONFIG_SCHED_CORE
48 /// ... ...
49#endif
50 zalloc_cpumask_var_node(&rq->scratch_mask, GFP_KERNEL, cpu_to_node(i));
51 }
`init_idle` 会在 `sched_init` 和 `fork_idle` 中调用,它将idle进程同时设置为运行队列的 `rq->idle` 和 `rq->curr`。类似地,`sched_set_stop_task` 负责设置 `rq->stop`。
1/// kernel/sched/core.c
2void __init init_idle(struct task_struct *idle, int cpu)
3{
4 /// ... ...
5 rq->idle = idle;
6 rcu_assign_pointer(rq->curr, idle);
7 /// ... ...
8}
1/// kernel/sched/core.c
2void sched_set_stop_task(int cpu, struct task_struct *stop)
3{
4 /// ... ...
5 cpu_rq(cpu)->stop = stop;
6 /// ... ...
7}
从 `init_idle` 中可以看出,运行队列最初的当前进程(`rq->curr`)为0号进程(即idle进程)。
在 `rest_init` 的最后,0号进程调用 `schedule_preempt_disabled` 主动让出CPU后,调度器会从1号进程(`kernel_init`)和2号进程(`kthreadd`)中选择一个进程运行。
在 `__schedule` 中选择好下一个进程后,会使用 `RCU_INIT_POINTER(rq->curr, next)` 更新 `rq->curr`。
2. 调度
调度(schedule)的主要流程就是挑选下一个要运行的进程,然后切换进程上下文。
2.1. 选择下一个进程:pick_next_task
先看一下 `schedule_preempt_disabled` 的调用路径:`schedule_preempt_disabled -> schedule -> __schedule -> pick_next_task`。
`pick_next_task` 负责选择下一个要运行的进程。这里只分析主要流程,为了方便,以下配置选项在分析过程中均视为关闭:
1CONFIG_SCHED_CORE
2CONFIG_CGROUP_SCHED
3CONFIG_FAIR_GROUP_SCHED
在未定义 `CONFIG_SCHED_CORE` 的情况下,`pick_next_task` 直接调用 `__pick_next_task`。
1/// kernel/sched/core.c
2/*
3 * Pick up the highest-prio task:
4 */
5static inline struct task_struct *
6__pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
7{
8 const struct sched_class *class;
9 struct task_struct *p;
10
11 /*
12 * Optimization: we know that if all tasks are in the fair class we can
13 * call that function directly, but only if the @prev task wasn't of a
14 * higher scheduling class, because otherwise those lose the
15 * opportunity to pull in more work from other CPUs.
16 */
17 if (likely(!sched_class_above(prev->sched_class, &fair_sched_class) &&
18 rq->nr_running == rq->cfs.h_nr_running)) {
19
20 p = pick_next_task_fair(rq, prev, rf);
21 if (unlikely(p == RETRY_TASK))
22 goto restart;
23
24 /* Assume the next prioritized class is idle_sched_class */
25 if (!p) {
26 put_prev_task(rq, prev);
27 p = pick_next_task_idle(rq);
28 }
29
30 return p;
31 }
32
33restart:
34 put_prev_task_balance(rq, prev, rf);
35
36 for_each_class(class) {
37 p = class->pick_next_task(rq);
38 if (p)
39 return p;
40 }
41
42 BUG(); /* The idle class should always have a runnable task. */
43}
这里以 `fair` 为例分析(即 `pick_next_task_fair`),其主要流程就是通过 `pick_next_entity` 找到下一个要运行的 `struct sched_entity`,然后通过 `struct sched_entity` 找到对应的 `struct task_struct` 并返回。
对于 `rt` 和 `dl` 来说,其流程也是类似的;而对于 `idle` 和 `stop` 来说,则是直接返回 `rq->idle` 或 `rq->stop`(`stop` 类仅在stop任务可运行时才返回 `rq->stop`,否则返回 `NULL`)。
2.2. 进程上下文切换:context_switch
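`context_switch` 在完成地址空间的切换后,调用 `switch_to` 切换寄存器和内核栈。`switch_to` 是一个体系结构相关的宏,在ARM64上最终交由 `cpu_switch_to` 来将当前进程的上下文(被调用者保存的寄存器 x19~x28、x29、sp 和 lr)保存到 `prev->thread.cpu_context`,然后将 `next->thread.cpu_context` 的内容恢复到寄存器中。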
ARM64寄存器的详细说明见《DEN0024A_v8_architecture_PG.pdf》第9章。
1/// arch/arm64/kernel/entry.S
2/*
3 * Register switch for AArch64. The callee-saved registers need to be saved
4 * and restored. On entry:
5 * x0 = previous task_struct (must be preserved across the switch)
6 * x1 = next task_struct
7 * Previous and next are guaranteed not to be the same.
8 *
9 */
10SYM_FUNC_START(cpu_switch_to)
11 mov x10, #THREAD_CPU_CONTEXT /// offsetof(struct task_struct, thread.cpu_context)
12 add x8, x0, x10 /// x8 = &prev->thread.cpu_context
13 mov x9, sp
14 // store callee-saved registers
15 stp x19, x20, [x8], #16
16 stp x21, x22, [x8], #16
17 stp x23, x24, [x8], #16
18 stp x25, x26, [x8], #16
19 stp x27, x28, [x8], #16
20 stp x29, x9, [x8], #16 /// x9 = sp
21 str lr, [x8]
22 add x8, x1, x10 /// x8 = &next->thread.cpu_context
23 // restore callee-saved registers
24 ldp x19, x20, [x8], #16
25 ldp x21, x22, [x8], #16
26 ldp x23, x24, [x8], #16
27 ldp x25, x26, [x8], #16
28 ldp x27, x28, [x8], #16
29 ldp x29, x9, [x8], #16
30 ldr lr, [x8]
31 mov sp, x9 /// sp = x9
32 msr sp_el0, x1
33 ptrauth_keys_install_kernel x1, x8, x9, x10
34 scs_save x0
35 scs_load_current
36 ret
37SYM_FUNC_END(cpu_switch_to)
38NOKPROBE(cpu_switch_to)
参考资料
Documentation/translations/zh_CN/scheduler/sched-arch.rst