聊聊 Linux Scheduler

Linux Scheduler介绍

Table of Contents

Linux Scheduler介绍
结合代码谈谈调度器
参考

Linux 的 CPU 调度器并不是从设计之初就是像今天这样复杂的，在很长的一段时间里（v0.01 ~ v2.4），Linux 的进程调度都由几十行的简单函数负责，我们先了解一下不同版本调度器的历史：

Init调度器 · v0.01 ~ v2.3

由几十行代码实现，功能非常简陋；
同时最多处理 64 个任务；

𝑂(𝑛) 调度器 · v2.4 ~ v2.5

调度时需要遍历全部任务；
当待执行的任务较多时，同一个任务两次执行的间隔很长，会有比较严重的饥饿问题；

𝑂(1) 调度器 · v2.6.0 ~ v2.6.22

通过引入运行队列和优先数组实现 𝑂(1) 的时间复杂度;
使用本地运行队列替代全局运行队列增强在对称多处理器的扩展性；
引入工作窃取保证多个运行队列中任务的平衡；

CFS 完全公平调度器 · v2.6.23 ~ 至今

引入红黑树和运行时间保证调度的公平性；
引入调度类实现不同任务类型的不同调度策略；

这里会详细介绍从最初的调度器到今天复杂的完全公平调度器（Completely Fair Scheduler，CFS）的演变过程

结合代码谈谈调度器

Init Scheduler

// sched.h
#define NR_TASKS  64
#define TASK_RUNNING		0
#define TASK_INTERRUPTIBLE	1
#define TASK_UNINTERRUPTIBLE	2
#define TASK_ZOMBIE		3
#define TASK_STOPPED		4
extern struct task_struct *task[NR_TASKS];
struct task_struct {
/* these are hardcoded - don't touch */
	long state;	/* -1 unrunnable, 0 runnable, >0 stopped */
	long counter;
	long priority;
	long signal;
	fn_ptr sig_restorer;
	fn_ptr sig_fn[32];
/* various fields */
	int exit_code;
	unsigned long end_code,end_data,brk,start_stack;
	long pid,father,pgrp,session,leader;
	unsigned short uid,euid,suid;
	unsigned short gid,egid,sgid;
	long alarm;
	long utime,stime,cutime,cstime,start_time;
	unsigned short used_math;
/* file system info */
	int tty;		/* -1 if no tty, so it must be signed */
	unsigned short umask;
	struct m_inode * pwd;
	struct m_inode * root;
	unsigned long close_on_exec;
	struct file * filp[NR_OPEN];
/* ldt for this task 0 - zero 1 - cs 2 - ds&ss */
	struct desc_struct ldt[3];
/* tss for this task */
	struct tss_struct tss;
};
// sched.c

void schedule(void)
{
	int i,next,c;
	struct task_struct ** p;
......

/* this is the scheduler proper: */

	while (1) {
		c = -1;
		next = 0;
		i = NR_TASKS;
		p = &task[NR_TASKS];
		while (--i) {
			if (!*--p)
				continue;
			if ((*p)->state == TASK_RUNNING && (*p)->counter > c)
				c = (*p)->counter, next = i;
		}
		if (c) break;
		for(p = &LAST_TASK ; p > &FIRST_TASK ; --p)
			if (*p)
				(*p)->counter = ((*p)->counter >> 1) +
						(*p)->priority;
	}
	switch_to(next);
}

counter 是时间片，决定该进程还能运行多久
找到当前时间片最大的“可运行进程”的编号 next，即含有最大counter的 TASK
如果找到了有效的时间片，就退出循环
如果没有任何任务有时间片，就“重新计算”每个任务的时间片
动态调整时间片：之前执行得多（counter 大）就“折半衰减”；同时加上“静态优先级”值（priority）
切换上下文到我们挑选的下一个进程
所有的调度进程都存储在上限仅为 64 的数组中，调度器能够处理的进程上限也只有 64 个
Linux 操作系统的计时器会每隔 10ms 触发一次 do_timer 将当前正在运行进程的 counter 减一，当前进程的计数器归零时就会重新触发调度

O(n) Scheduler

𝑂(𝑛) 调度器是 Linux 在 v2.4 ~ v2.6 版本使用的调度器，由于该调取器在最坏的情况下会遍历所有的任务，所以它调度任务的时间复杂度就是 𝑂(𝑛) Linux 调度算法将 CPU 时间分割成了不同的时期（Epoch），也就是每个任务能够使用的时间切片。

struct task_struct {
	/*
	 * offsets of these are hardcoded elsewhere - touch with care
	 */
	volatile long state;	/* -1 unrunnable, 0 runnable, >0 stopped */
	unsigned long flags;	/* per process flags, defined below */
	int sigpending;
	mm_segment_t addr_limit;	/* thread address space:
					 	0-0xBFFFFFFF for user-thead
						0-0xFFFFFFFF for kernel-thread
					 */
	struct exec_domain *exec_domain;
	volatile long need_resched;
	unsigned long ptrace;

	int lock_depth;		/* Lock depth */

/*
 * offset 32 begins here on 32-bit platforms. We keep
 * all fields in a single cacheline that are needed for
 * the goodness() loop in schedule().
 */
	long counter;
	long nice;
	unsigned long policy;
	struct mm_struct *mm;
	int has_cpu, processor;
	unsigned long cpus_allowed;
	/*
	 * (only the 'next' pointer fits into the cacheline, but
	 * that's just fine.)
	 */
	struct list_head run_list;
	unsigned long sleep_time;

	struct task_struct *next_task, *prev_task;
	struct mm_struct *active_mm;

/* task state */
	struct linux_binfmt *binfmt;
	int exit_code, exit_signal;
	int pdeath_signal;  /*  The signal sent when the parent dies  */
	/* ??? */
	unsigned long personality;
	int dumpable:1;
	int did_exec:1;
	pid_t pid;
	pid_t pgrp;
	pid_t tty_old_pgrp;
	pid_t session;
	pid_t tgid;
	/* boolean value for session group leader */
	int leader;
	/*
	 * pointers to (original) parent process, youngest child, younger sibling,
	 * older sibling, respectively.  (p->father can be replaced with
	 * p->p_pptr->pid)
	 */
	struct task_struct *p_opptr, *p_pptr, *p_cptr, *p_ysptr, *p_osptr;
	struct list_head thread_group;

	/* PID hash table linkage. */
	struct task_struct *pidhash_next;
	struct task_struct **pidhash_pprev;

	wait_queue_head_t wait_chldexit;	/* for wait4() */
	struct semaphore *vfork_sem;		/* for vfork() */
	unsigned long rt_priority;
	unsigned long it_real_value, it_prof_value, it_virt_value;
	unsigned long it_real_incr, it_prof_incr, it_virt_incr;
	struct timer_list real_timer;
	struct tms times;
	unsigned long start_time;
	long per_cpu_utime[NR_CPUS], per_cpu_stime[NR_CPUS];
/* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
	unsigned long min_flt, maj_flt, nswap, cmin_flt, cmaj_flt, cnswap;
	int swappable:1;
/* process credentials */
	uid_t uid,euid,suid,fsuid;
	gid_t gid,egid,sgid,fsgid;
	int ngroups;
	gid_t	groups[NGROUPS];
	kernel_cap_t   cap_effective, cap_inheritable, cap_permitted;
	int keep_capabilities:1;
	struct user_struct *user;
/* limits */
	struct rlimit rlim[RLIM_NLIMITS];
	unsigned short used_math;
	char comm[16];
/* file system info */
	int link_count;
	struct tty_struct *tty; /* NULL if no tty */
	unsigned int locks; /* How many file locks are being held */
/* ipc stuff */
	struct sem_undo *semundo;
	struct sem_queue *semsleeping;
/* CPU-specific state of this task */
	struct thread_struct thread;
/* filesystem information */
	struct fs_struct *fs;
/* open file information */
	struct files_struct *files;
/* signal handlers */
	spinlock_t sigmask_lock;	/* Protects signal and blocked */
	struct signal_struct *sig;

	sigset_t blocked;
	struct sigpending pending;

	unsigned long sas_ss_sp;
	size_t sas_ss_size;
	int (*notifier)(void *priv);
	void *notifier_data;
	sigset_t *notifier_mask;

/* Thread group tracking */
   	u32 parent_exec_id;
   	u32 self_exec_id;
/* Protection of (de-)allocation: mm, files, fs, tty */
	spinlock_t alloc_lock;
};

Linux 的调度器（如 schedule() 函数）依赖于每个进程的 task_struct 结构来进行调度决策。这个结构记录了进程的所有核心状态信息，比如状态、优先级、时间片等

与调度器相关的核心字段

字段	说明	作用
`volatile long state`	当前进程状态（如运行、就绪、睡眠）	决定是否能被调度（必须是 `TASK_RUNNING`）
`long counter`	剩余时间片（动态）	调度器用来挑选谁运行
`long nice`	用户设置的优先级，越大越“nice”，优先级越低	影响时间片和优先级计算
`unsigned long policy`	调度策略（如 `SCHED_FIFO`、`SCHED_RR`、`SCHED_OTHER`）	不同策略对应不同调度行为
`unsigned long sleep_time`	睡眠时长	某些调度策略考虑它，防止饥饿
`unsigned long priority`	静态优先级	动态调整时间片时使用
`volatile long need_resched`	设置为 1 表示需要被调度出去	在中断或系统调用中可触发调度

进程身份

字段	说明
`pid`, `tgid`	进程 ID、线程组 ID
`p_pptr`	父进程
`p_cptr`, `p_ysptr`, `p_osptr`	子进程和兄弟链表
`thread_group`	所在线程组的链接

信号相关字段

字段	说明
`sigpending`, `blocked`, `pending`	正在处理/阻塞的信号
`sig`	信号处理结构体
`wait_chldexit`	`wait()` 系列调用的等待队列

线内核线程状态

字段	说明
`struct thread_struct thread`	CPU 寄存器等低层信息（上下文切换时使用）
`addr_limit`	用户态 vs 内核态地址边界（防止访问越界）

task_struct 是 Linux 中每个进程的“身份+状态+资源”合集，调度器 schedule() 通过它来判断谁应该获得 CPU，并基于时间片和优先级动态调整策略

与上一个版本的调度器相比，𝑂(𝑛) 调度器的实现复杂了很多，该调度器会在 schedule 函数中遍历运行队列中的所有任务并调用 goodness 函数分别计算它们的权重获得下一个运行的进程


/*
 * Scheduling policies
 */
#define SCHED_OTHER		0
#define SCHED_FIFO		1
#define SCHED_RR		2
/*
 * This is an additional bit set when we want to
 * yield the CPU for one re-schedule..
 */
#define SCHED_YIELD		0x10

static inline int goodness(struct task_struct * p, int this_cpu, struct mm_struct *this_mm)
{
	int weight;

	/*
	 * select the current process after every other
	 * runnable process, but before the idle thread.
	 * Also, dont trigger a counter recalculation.
	 */
	weight = -1;
	/*进程可以设置调度策略为SCHED_YIELD即“礼让”策略，这时候它的权值为-1,权值相当低*/
	if (p->policy & SCHED_YIELD)
		goto out;

	/*
	 * Non-RT process - normal case first.
	 */

	/* 对于调度策略为SCHED_OTHER的进程，没有实时性要求，它的权值仅仅取决于
	 * 时间片的剩余和它的nice值，nice（谦让）是一个负向的权值，也就是说
	 * 数值上nice越小，则优先级越高，总的权值 = 时间片剩余 + (20 - nice)
	 * */
	if (p->policy == SCHED_OTHER) {
		/*
		 * Give the process a first-approximation goodness value
		 * according to the number of clock-ticks it has left.
		 *
		 * Don't do any other calculations if the time slice is
		 * over..
		 */
		weight = p->counter;
		if (!weight)
			goto out;

#ifdef CONFIG_SMP
		/* Give a largish advantage to the same processor...   */
		/* (this is equivalent to penalizing other processors) */
		if (p->processor == this_cpu)
			weight += PROC_CHANGE_PENALTY;
#endif

		/* .. and a slight advantage to the current MM */
		if (p->mm == this_mm || !p->mm)
			weight += 1;
		weight += 20 - p->nice;
		goto out;
	}

	/*
	 * Realtime process, select the first one on the
	 * runqueue (taking priorities within processes
	 * into account).
	 */
	/*
	 *对于实时进程，也就是SCHED_FIFO或者SCHED_RR调度策略，
	 *具有一个实时优先级，总的权值仅仅取决于该实时优先级，
	 *总的权值 = 1000 + 实时优先级。可见实时进程会获得很
	 *高的加分
	 * */
	weight = 1000 + p->rt_priority;
out:
	return weight;

	/*从上述代码可以看出，对于实时进程来说，nice值是对其调度权值没有影响的
	 * 也就是说，调度进程运行的时候，nice值不会影响该实时进程是否获得CPU使用
	 * 权。但是nice值会影响SCHED_RR策略的时间片配额
	 *nice值对SCHED_FIFO策略完全没有任何影响
	 * */
}

asmlinkage void schedule(void)
{
	struct schedule_data * sched_data;
	struct task_struct *prev, *next, *p;
	struct list_head *tmp;
	int this_cpu, c;

	/*进程在运行时active_mm一定不是0*/
	if (!current->active_mm) BUG();
    ......
    repeat_schedule:
	/*
	 * Default process to select..
	 */
	/*按照策略选择等待调度的进程中的最佳候选进程*/
	next = idle_task(this_cpu);
	c = -1000;
	if (prev->state == TASK_RUNNING)
		goto still_running;

still_running_back:
	/*遍历所有处于就绪状态的进程*/
	list_for_each(tmp, &runqueue_head) {
		p = list_entry(tmp, struct task_struct, run_list);
		if (can_schedule(p, this_cpu)) {
			/*遍历到的每个进程都计算其权值*/
			int weight = goodness(p, this_cpu, prev->active_mm);
			if (weight > c)
				/*比较取得最大权值的那个进程*/
				c = weight, next = p;
		}
	}

	/* Do we need to re-calculate counters? */
	/*如果全部就绪进程的权值都已经降到0了，那就开始重新计算每个进程的权值
	 * */
	if (!c)
		goto recalculate;
	/*
	 * from this point on nothing can prevent us from
	 * switching to the next task, save this fact in
	 * sched_data.
	 */
	sched_data->curr = next;
    ......

在每个时期开始时，上述代码都会为所有的任务计算时间切片，因为需要执行 n 次，所以调度器被称作 𝑂(𝑛) 调度器。在默认情况下，每个任务在一个周期都会分配到 200ms 左右的时间切片，然而这种调度和分配方式是 𝑂(𝑛) 调度器的最大问题：

每轮调度完成之后就会陷入没有任务需要调度的情况，需要提升交互性能的场景会受到严重影响，例如：在桌面拖动鼠标会感觉到明显的卡顿；
每次查找权重最高的任务都需要遍历数组中的全部任务；
调度器分配的平均时间片大小为 210ms，当程序中包含 100 个进程时，同一个进程被运行两次的间隔是 21s，这严重影响了操作系统的可用性；

正是因为 𝑂(𝑛) 调度器存在了上述的问题，所以 Linux 内核在两个版本后使用新的 𝑂(1) 调度器替换该实现

O(1) Scheduler

在 𝑂(𝑛) 调度器的基础上进行了如下的改进:

调度器支持了 𝑂(1) 时间复杂度的调度
调度器支持了对称多处理（SMP Symmetric multiprocessing）的扩展性
调度器优化了对称多处理的亲和性

数据结构

struct runqueue {
	...
	prio_array_t *active, *expired, arrays[2];
	...
}
struct prio_array {
	unsigned int nr_active;
	unsigned long bitmap[BITMAP_SIZE];
	struct list_head queue[MAX_PRIO];
};

nr_active;
- 表示当前就绪队列（runqueue）中有多少个处于运行状态的进程。
- 减少调度器遍历所有队列的负担。
- 优化点：可以快速判断是否有任务需要调度
bitmap[BITMAP_SIZE];
- 位图用于表示哪些优先级上有进程。
- BITMAP_SIZE = MAX_PRIO / (8 * sizeof(unsigned long))，能覆盖所有优先级。
- 每个 bit 对应一个优先级（0 ~ MAX_PRIO-1），如果该优先级队列非空，则该 bit 置 1

bitmap = 0b00010010
          ↑   ↑
         prio6 prio1 有任务

这意味着 queue[1] 和 queue[6] 队列中有正在运行的任务

list_head queue[MAX_PRIO];
- 一个数组，每个元素是一个链表头。
- 每个链表存放具有相同静态优先级的就绪任务。
- MAX_PRIO 通常为 140，优先级分为两个段：
  - 0~99：实时优先级（real-time）
  - 100~139：普通优先级（normal）

prio_array
├── nr_active = 4
├── bitmap = [ 0b00001010... ]
└── queue[0] -> empty
    queue[1] -> T3 -> T4
    queue[2] -> empty
    queue[3] -> T1
    queue[4] -> empty
    ...

计算优先级

static int effective_prio(task_t *p)
{
	int bonus, prio;

	if (rt_task(p))
		return p->prio;

	bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;

	prio = p->static_prio - bonus;
	if (prio < MAX_RT_PRIO)
		prio = MAX_RT_PRIO;
	if (prio > MAX_PRIO-1)
		prio = MAX_PRIO-1;
	return prio;
}

如果任务是实时任务（SCHED_FIFO, SCHED_RR），直接返回 p->prio，它是实时调度类独立维护的优先级，不参与 bonus 计算
CURRENT_BONUS 是什么？
- 它基于任务的 sleep_avg（平均睡眠时间）计算得出，用来衡量一个任务是否**“交互型”（睡得多、跑得少）还是“CPU密集型”**（一直在跑）
取值范围通常是 [0, MAX_BONUS]，这里将其中心对齐到 0，即范围变成 [-5, +5]
- +5 是最“交互型”的进程（经常 sleep，响应用户）
- -5 是最“CPU 密集”的任务（很少 sleep）
static_prio 与 bonus 合成调度优先级
- p->static_prio 是由 nice 值计算得出的基础优先级（通常在 100~139 范围内）
- bonus 越大（偏交互型），prio 就越小（更高优先级）
- static_prio 由 nice 值决定，默认 nice 0 → prio 120

总结如下：

调度策略	策略值	范围	优先级字段	特点
实时任务	`SCHED_FIFO`, `SCHED_RR`	0 ~ 99 (`MAX_RT_PRIO - 1`)	`p->prio`	固定优先级，调度器不会干涉
普通任务	`SCHED_NORMAL` 或 `SCHED_OTHER`	100 ~ 139 (`MAX_PRIO - 1`)	`p->prio`, `p->static_prio`, `p->normal_prio`	会根据交互性动态调整优先级

实时任务：优先级由 rt_priority 决定，越小越高
普通任务：优先级 = static_prio - bonus，结合交互性表现动态调整

调度器如何选出最高优先级任务

过程：

1.先通过 bitmap 找出最靠前的 set bit（即最高优先级）,调用 sched_find_first_bit

1.直接访问对应的 queue[i], 在该队列中挑出队头任务调度执行(list_head)；

1.若该任务运行完，或时间片耗尽、主动让出 CPU，再次查找。

➡️ 整个过程是 O(1) 时间复杂度！

调度器是 O(n) 的，遍历整个 task 列表, O(1) 调度器中，使用 prio_array + bitmap 来快速选出最高优先级任务

再来看下关键代码：（need_resched 标志被设置后，表示当前任务需要让出 CPU，由调度器选择新的任务运行）

need_resched:
	preempt_disable();
	prev = current;
	rq = this_rq();

	release_kernel_lock(prev);
	now = sched_clock();
	if (likely(now - prev->timestamp < NS_MAX_SLEEP_AVG))
		run_time = now - prev->timestamp;
	else
		run_time = NS_MAX_SLEEP_AVG;

	/*
	 * Tasks with interactive credits get charged less run_time
	 * at high sleep_avg to delay them losing their interactive
	 * status
	 */
	if (HIGH_CREDIT(prev))
		run_time /= (CURRENT_BONUS(prev) ? : 1);

	spin_lock_irq(&rq->lock);

	/*
	 * if entering off of a kernel preemption go straight
	 * to picking the next task.
	 */
	switch_count = &prev->nivcsw;
	if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
		switch_count = &prev->nvcsw;
		if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
				unlikely(signal_pending(prev))))
			prev->state = TASK_RUNNING;
		else
			deactivate_task(prev, rq);
	}

	cpu = smp_processor_id();
	if (unlikely(!rq->nr_running)) {
		idle_balance(cpu, rq);
		if (!rq->nr_running) {
			next = rq->idle;
			rq->expired_timestamp = 0;
			wake_sleeping_dependent(cpu, rq);
			goto switch_tasks;
		}
	}

	array = rq->active;
	if (unlikely(!array->nr_active)) {
		/*
		 * Switch the active and expired arrays.
		 */
		rq->active = rq->expired;
		rq->expired = array;
		array = rq->active;
		rq->expired_timestamp = 0;
		rq->best_expired_prio = MAX_PRIO;
	}

	idx = sched_find_first_bit(array->bitmap);
	queue = array->queue + idx;
	next = list_entry(queue->next, task_t, run_list);

	if (dependent_sleeper(cpu, rq, next)) {
		next = rq->idle;
		goto switch_tasks;
	}

	if (!rt_task(next) && next->activated > 0) {
		unsigned long long delta = now - next->timestamp;

		if (next->activated == 1)
			delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;

		array = next->array;
		dequeue_task(next, array);
		recalc_task_prio(next, next->timestamp + delta);
		enqueue_task(next, array);
	}
	next->activated = 0;

准备阶段
- preempt_disable()：关闭抢占，防止调度过程中被抢占
- rq = this_rq()：获取当前 CPU 的 runqueue（运行队列）结构
记录运行时间
- sched_clock()：获取当前时间戳
- 计算当前任务运行了多久：now - prev->timestamp
- 如果时间太长了就截断为 NS_MAX_SLEEP_AVG，防止任务因运行时间过长被 "过分惩罚"
交互任务调度优惠
- 交互型任务（如 shell）会得到调度"优惠"
- 如果任务的 bonus 越高（说明其等待时间长，倾向于交互），它被"扣的分"就越少（除法放缓权重消耗）
- 在linux2.6中，一个普通进程的优先级和平均睡眠时间的关系为：平均睡眠时间越长，其bonus越大，从而得到更高的优先级
- dynamic priority = max (100, min ( static priority - bonus +5, 139)) 普通进程
判断当前任务是否继续运行
- 根据任务状态判断是否真的切换（如果它还可运行，可能会重新被选中）
- 如果是可中断睡眠状态（TASK_INTERRUPTIBLE）且有信号，将其转为 TASK_RUNNING
- 否则将任务从 runqueue 中移除deactivate_task()
寻找新任务
- 如果当前没有可运行任务，尝试从其他 CPU steal 任务
- 如果 steal 也没成功，就切到 idle task
- 并尝试唤醒任何依赖当前 CPU 的任务
调度队列切换
- 如果 active 队列为空，则与 expired 队列交换
- 两个队列交替使用，避免频繁重建队列
- 重置 expired 的时间戳、最佳优先级
选择新任务
- 从 bitmap 中快速查找最高优先级非空队列 list_head
- 取得该优先级对应队列头部的第一个任务
- 记录被选中的任务 next
新任务激活后的优先级补偿
- 如果 next 是一个非实时任务，且刚被唤醒（activated > 0）：
- 根据等待时间 delta，给予优先级补偿
- 尤其是 activated == 1（首次唤醒）时给予更多补偿
- 将任务从队列中移除 → 重新计算优先级 → 加回队列，等待下轮调度

ps 为什么要做补仓？

如果一个任务被唤醒后长时间未获得运行机会，调度器会根据它的等待时间适度提高它的优先级
避免饿死：尤其是在系统繁忙、队列拥挤时，让新唤醒的任务有"加速通道"
提升响应性：典型用于 I/O 密集型任务，比如 shell 或 nginx，被唤醒后能尽快响应用户输入或连接请求

全局的运行队列是 𝑂(𝑛) 调度器难以在对称多处理器架构上扩展的主要原因。为了保证运行队列的一致性，调度器在调度时需要获取运行队列的全局锁，随着处理器数量的增加，多个处理器在调度时会导致更多的锁竞争，严重影响调度性能。𝑂(1) 调度器通过引入本地运行队列解决这个问题，不同的 CPU 可以通过 this_rq 获取绑定在当前 CPU 上的运行队列，降低了锁的粒度和冲突的可能性

优先级和时间切片调度器中包含两种不同的优先级计算方式，一种是静态任务优先级，另一种是动态任务优先级。在默认情况下，任务的静态任务优先级都是 0，不过我们可以通过系统调用 nice 改变任务的优先级；𝑂(1) 调度器会奖励 I/O 密集型任务并惩罚 CPU 密集型任务，它会通过改变任务的静态优先级来完成优先级的动态调整，因为与用户交互的进程时 I/O 密集型的进程，这些进程由于调度器的动态策略会提高自身的优先级，从而提升用户体验

CFS

通过 CFS 的名字我们就能发现，该调度器的能为不同的进程提供完全公平性。一旦某些进程受到了不公平的待遇，调度器就会运行这些进程，从而维持所有进程运行时间的公平性

数据结构

/* CFS-related fields in a runqueue */
struct cfs_rq {
	struct load_weight load;
	unsigned long nr_running;

	s64 fair_clock;
	u64 exec_clock;
	s64 wait_runtime;
	u64 sleeper_bonus;
	unsigned long wait_runtime_overruns, wait_runtime_underruns;

	struct rb_root tasks_timeline;
	struct rb_node *rb_leftmost;
	struct rb_node *rb_load_balance_curr;
#ifdef CONFIG_FAIR_GROUP_SCHED
	/* 'curr' points to currently running entity on this cfs_rq.
	 * It is set to NULL otherwise (i.e when none are currently running).
	 */
	struct sched_entity *curr;
	struct rq *rq;	/* cpu runqueue to which this cfs_rq is attached */

	/* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
	 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
	 * (like users, containers etc.)
	 *
	 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
	 * list is used during load balance.
	 */
	struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */
#endif
};

红黑树（Red-black tree）是平衡的二叉搜索树，红黑树的增删改查操作的最坏时间复杂度为 𝑂(log𝑛) ，也就是树的高度，树中最左侧的节点 rb_leftmost 运行的时间最短，也是下一个待运行的进程

调度过程

关闭当前 CPU 的抢占功能；
如果当前 CPU 的运行队列中不存在任务，调用 idle_balance 从其他 CPU 的运行队列中取一部分执行；
调用 pick_next_task 选择红黑树中优先级最高的任务；
调用 context_switch 切换运行的上下文，包括寄存器的状态和堆栈；
重新开启当前 CPU 的抢占功能；

CFS 的调度过程与 𝑂(1) 调度器十分类似，当前调度器与前者的区别只是增加了可选的工作窃取机制并改变了底层的数据结构代码如下：

/*
 * idle_balance is called by schedule() if this_cpu is about to become
 * idle. Attempts to pull tasks from other CPUs.
 */
static void idle_balance(int this_cpu, struct rq *this_rq)
{
	struct sched_domain *sd;
	int pulled_task = -1;
	unsigned long next_balance = jiffies + HZ;

	for_each_domain(this_cpu, sd) {
		unsigned long interval;

		if (!(sd->flags & SD_LOAD_BALANCE))
			continue;

		if (sd->flags & SD_BALANCE_NEWIDLE)
			/* If we've pulled tasks over stop searching: */
			pulled_task = load_balance_newidle(this_cpu,
								this_rq, sd);

		interval = msecs_to_jiffies(sd->balance_interval);
		if (time_after(next_balance, sd->last_balance + interval))
			next_balance = sd->last_balance + interval;
		if (pulled_task)
			break;
	}
	if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
		/*
		 * We are going idle. next_balance may be set based on
		 * a busy processor. So reset next_balance.
		 */
		this_rq->next_balance = next_balance;
	}
}

主要逻辑在load_balance_newidle,不进行展开

大致的逻辑如下：

load_balance_newidle()
│
├─► 判断是否开启省电策略
│
├─► 查找最繁忙 group
│ └─ 没找到 → 返回 0
│
├─► 查找 group 内最繁忙 CPU（busiest）
│ └─ 没找到 → 返回 0
│
├─► 加锁 busiest 和当前 CPU
│
├─► move_tasks() 尝试迁移任务
│ ├─ 成功 → 返回迁移数
│ └─ 失败且任务都 pinned → 去掉这个 CPU 再尝试（redo）
│
└─► 返回迁移数 or -1 表示无法平衡

参考

linux kernel source code
Ebook: linux Kernel Development