1. 运行队列:runqueue

Linux内核使用运行队列来管理每个CPU上运行的任务,要运行的任务抽象成“调度实体”,调度实体可以是进程、线程或任务组。

对于cfs、rt、dl的任务,调度实体分别由struct sched_entity、struct sched_rt_entity、struct sched_dl_entity表示。这些结构体直接内嵌在struct task_struct中。

1/// kernel/sched/sched.h
2DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
3
4#define cpu_rq(cpu)		(&per_cpu(runqueues, (cpu)))
5#define this_rq()		this_cpu_ptr(&runqueues)
6#define task_rq(p)		cpu_rq(task_cpu(p))
7#define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
8#define raw_rq()		raw_cpu_ptr(&runqueues)

sched_init会对runqueues的成员进行初始化,其中包括了cfs、rt、dl运行队列的初始化。

 1/// kernel/sched/core.c
 2    for_each_possible_cpu(i) {
 3        struct rq *rq;
 4
 5        rq = cpu_rq(i);
 6        raw_spin_lock_init(&rq->__lock);
 7        rq->nr_running = 0;
 8        rq->calc_load_active = 0;
 9        rq->calc_load_update = jiffies + LOAD_FREQ;
10        /// 初始化cfs/rt/dl
11        init_cfs_rq(&rq->cfs);
12        init_rt_rq(&rq->rt);
13        init_dl_rq(&rq->dl);
14#ifdef CONFIG_FAIR_GROUP_SCHED
15        /// ... ...
16#endif /* CONFIG_FAIR_GROUP_SCHED */
17
18        rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
19#ifdef CONFIG_RT_GROUP_SCHED
20        init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
21#endif
22#ifdef CONFIG_SMP
23        /// sd和rd的初始化见build_sched_domains
24        rq->sd = NULL;
25        rq->rd = NULL;
26        rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE;
27        rq->balance_callback = &balance_push_callback;
28        rq->active_balance = 0;
29        rq->next_balance = jiffies;
30        rq->push_cpu = 0;
31        rq->cpu = i;
32        rq->online = 0;
33        rq->idle_stamp = 0;
34        rq->avg_idle = 2*sysctl_sched_migration_cost;
35        rq->wake_stamp = jiffies;
36        rq->wake_avg_idle = rq->avg_idle;
37        rq->max_idle_balance_cost = sysctl_sched_migration_cost;
38
39        INIT_LIST_HEAD(&rq->cfs_tasks);
40
41        rq_attach_root(rq, &def_root_domain);
42        /// ... ...
43#endif /* CONFIG_SMP */
44        hrtick_rq_init(rq);
45        atomic_set(&rq->nr_iowait, 0);
46
47#ifdef CONFIG_SCHED_CORE
48        /// ... ...
49#endif
50        zalloc_cpumask_var_node(&rq->scratch_mask, GFP_KERNEL, cpu_to_node(i));
51    }

init_idle会在sched_init和fork_idle中被调用。

1/// kernel/sched/core.c
2void __init init_idle(struct task_struct *idle, int cpu)
3{
4    /// ... ...
5    rq->idle = idle;
6    rcu_assign_pointer(rq->curr, idle);
7    /// ... ...
8}
1/// kernel/sched/core.c
2void sched_set_stop_task(int cpu, struct task_struct *stop)
3{
4    /// ... ...
5    cpu_rq(cpu)->stop = stop;
6    /// ... ...
7}

从init_idle中可以看出,运行队列最初的当前进程(rq->curr)就是idle进程,即0号进程。

在rest_init的最后,0号进程调用schedule_preempt_disabled主动让出CPU,之后调度器会从1号和2号进程中选择一个进程运行。

在__schedule中选择好下一个进程后,会使用RCU_INIT_POINTER(rq->curr, next)更新rq->curr。

2. 调度

调度(schedule)的主要流程就是挑选下一个要运行的进程,然后切换进程上下文。

2.1. 选择下一个进程:pick_next_task

先看一下schedule_preempt_disabled的调用路径:schedule_preempt_disabled -> schedule -> __schedule -> pick_next_task。

pick_next_task负责选择下一个要运行的进程。这里只分析流程,为了方便,以下配置在分析过程中视为关闭。

CONFIG_SCHED_CORE
CONFIG_CGROUP_SCHED
CONFIG_FAIR_GROUP_SCHED

在未定义CONFIG_SCHED_CORE的情况下,pick_next_task直接调用__pick_next_task。

 1/// kernel/sched/core.c
 2/*
 3 * Pick up the highest-prio task:
 4 */
 5static inline struct task_struct *
 6__pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 7{
 8    const struct sched_class *class;
 9    struct task_struct *p;
10
11    /*
12     * Optimization: we know that if all tasks are in the fair class we can
13     * call that function directly, but only if the @prev task wasn't of a
14     * higher scheduling class, because otherwise those lose the
15     * opportunity to pull in more work from other CPUs.
16     */
17    if (likely(!sched_class_above(prev->sched_class, &fair_sched_class) &&
18           rq->nr_running == rq->cfs.h_nr_running)) {
19
20        p = pick_next_task_fair(rq, prev, rf);
21        if (unlikely(p == RETRY_TASK))
22            goto restart;
23
24        /* Assume the next prioritized class is idle_sched_class */
25        if (!p) {
26            put_prev_task(rq, prev);
27            p = pick_next_task_idle(rq);
28        }
29
30        return p;
31    }
32
33restart:
34    put_prev_task_balance(rq, prev, rf);
35
36    for_each_class(class) {
37        p = class->pick_next_task(rq);
38        if (p)
39            return p;
40    }
41
42    BUG(); /* The idle class should always have a runnable task. */
43}

这里以fair为例分析pick_next_task_fair,其主要流程就是通过pick_next_entity找到下一个要运行的struct sched_entity,然后通过struct sched_entity找到对应的struct task_struct并返回。

对于rt和dl来说,其流程也是类似的;而对于idle和stop来说,则是分别直接返回rq->idle和rq->stop。

2.2. 进程上下文切换:context_switch

switch_to是一个体系结构相关的宏,在ARM64上最终交由cpu_switch_to来将当前进程的上下文(callee-saved寄存器、sp和lr)保存到prev->thread.cpu_context,然后将next->thread.cpu_context的内容恢复到寄存器中。

ARM64寄存器的详细使用约定(包括哪些寄存器属于callee-saved)见《DEN0024A_v8_architecture_PG.pdf》第9章。

 1/// arch/arm64/kernel/entry.S
 2/*
 3 * Register switch for AArch64. The callee-saved registers need to be saved
 4 * and restored. On entry:
 5 *   x0 = previous task_struct (must be preserved across the switch)
 6 *   x1 = next task_struct
 7 * Previous and next are guaranteed not to be the same.
 8 *
 9 */
10SYM_FUNC_START(cpu_switch_to)
11    mov	x10, #THREAD_CPU_CONTEXT    /// offsetof(struct task_struct, thread.cpu_context)
12    add	x8, x0, x10                 /// x8 = &prev->thread.cpu_context
13    mov	x9, sp
14    // store callee-saved registers
15    stp	x19, x20, [x8], #16
16    stp	x21, x22, [x8], #16
17    stp	x23, x24, [x8], #16
18    stp	x25, x26, [x8], #16
19    stp	x27, x28, [x8], #16
20    stp	x29, x9, [x8], #16          /// x9 = sp
21    str	lr, [x8]
22    add	x8, x1, x10                 /// x8 = &next->thread.cpu_context
23    // restore callee-saved registers
24    ldp	x19, x20, [x8], #16
25    ldp	x21, x22, [x8], #16
26    ldp	x23, x24, [x8], #16
27    ldp	x25, x26, [x8], #16
28    ldp	x27, x28, [x8], #16
29    ldp	x29, x9, [x8], #16
30    ldr	lr, [x8]
31    mov	sp, x9                      /// sp = x9
32    msr	sp_el0, x1
33    ptrauth_keys_install_kernel x1, x8, x9, x10
34    scs_save x0
35    scs_load_current
36    ret
37SYM_FUNC_END(cpu_switch_to)
38NOKPROBE(cpu_switch_to)

参考资料

Documentation/translations/zh_CN/scheduler/sched-arch.rst