【深入Linux内核架构笔记】第二章进程调度(5) SMP系统调度

2.8 调度器增强

2.8.1 SMP多处理器系统调度

多处理器系统的额外考虑
- 负载均衡：CPU负荷共享，不能一个处理器处理3个并发应用程序，另一个只处理空闲进程
- 设置亲和性（affinity）：支持计算密集型绑核到前3个CPU上，交互式进程在第4个CPU运行
- 进程支持从一个CPU迁移到另一个（谨慎使用）

1. 数据结构的扩展

调度类
- load_balance：允许从最忙就绪队列分配多个进程到当前CPU，但移动的负荷不能超过max_load_move
- move_one_task：从最忙就绪队列移出一个进程，迁移到当前CPU的就绪队列

/*
 * SMP-only hooks of the scheduling class (excerpt; the remaining members
 * are elided with "......").
 */
struct sched_class {
......
#ifdef CONFIG_SMP
	/*
	 * Pull several tasks from the busiest run queue over to this CPU.
	 * The load that is moved must not exceed max_load_move.
	 */
	unsigned long (*load_balance) (struct rq *this_rq, int this_cpu,
			struct rq *busiest, unsigned long max_load_move,
			struct sched_domain *sd, enum cpu_idle_type idle,
			int *all_pinned, int *this_best_prio);

	/*
	 * Take exactly one task off the busiest run queue and move it to
	 * the run queue of this CPU.
	 */
	int (*move_one_task) (struct rq *this_rq, int this_cpu,
			      struct rq *busiest, struct sched_domain *sd,
			      enum cpu_idle_type idle);
#endif
......
};

就绪队列
- cpu：每个就绪队列是特定于CPU的。rq->cpu表示该就绪队列所属的处理器
- migration_queue：迁移请求链表。内核为每个就绪队列提供一个迁移线程，发给该线程的迁移请求都挂在migration_queue上；这些请求通常由调度器自身周期性地发起
- active_balance：如果周期性均衡对某个就绪队列效果不佳（均衡失败），则设置该标志，对该队列主动发起均衡
调度域：所有就绪队列组织为调度域。在“普通”的SMP系统上，所有处理器都包含在一个调度域中，进程可以在同一调度域内的CPU之间迁移

/*
 * SMP-only members of the per-CPU run queue (excerpt; the remaining
 * members are elided with "......").
 */
struct rq {
......
#ifdef CONFIG_SMP
	/* Scheduling domain used when balancing this run queue */
	struct sched_domain *sd;

	/* Used for active balancing */
	int active_balance;
	int push_cpu;
	/* CPU this run queue belongs to */
	int cpu;

	/* Migration thread of this run queue and the list of requests posted to it */
	struct task_struct *migration_thread;
	struct list_head migration_queue;
#endif
......
};

负载均衡的发起
- 周期性调度器函数scheduler_tick完成周期性调度后，最后会调用trigger_load_balance()
- 引发SCHED_SOFTIRQ软中断，该软中断在适当时机会执行run_rebalance_domains
- 最终对当前CPU调用rebalance_domains，实现负载均衡

/*
 * Periodic scheduler (excerpt). On SMP builds the last step is to call
 * trigger_load_balance() for the current run queue.
 */
void scheduler_tick(void) {
	// Update the run-queue clock and run the scheduling class's task_tick method for periodic scheduling
......
#ifdef CONFIG_SMP
	rq->idle_at_tick = idle_cpu(cpu);
    // Trigger load balancing
	trigger_load_balance(rq, cpu);
#endif
}

/*
 * Raise SCHED_SOFTIRQ once jiffies has reached rq->next_balance
 * (excerpt; earlier parts of the function are elided).
 */
static inline void trigger_load_balance(struct rq *rq, int cpu)
{
......
	if (time_after_eq(jiffies, rq->next_balance))
        // Raise the softirq; its handler does the actual balancing
		raise_softirq(SCHED_SOFTIRQ);
}

// The softirq handler is registered in sched_init (excerpt):
// on SMP builds run_rebalance_domains is installed as the SCHED_SOFTIRQ handler.
void __init sched_init(void) {
......
#ifdef CONFIG_SMP
	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL);
#endif
......
}

/*
 * SCHED_SOFTIRQ handler (excerpt): forwards to rebalance_domains() for
 * the current CPU.
 */
static void run_rebalance_domains(struct softirq_action *h)
{
......
	// Run rebalance_domains to perform the load balancing
	rebalance_domains(this_cpu, idle);
......
}

/*
 * Perform load balancing for the given CPU (excerpt; the surrounding
 * code is elided).
 */
static void rebalance_domains(int cpu, enum cpu_idle_type idle)
{
......
    // Only balance if enough time (interval) has passed since the domain's
    // last balancing run; if so, start a load_balance operation
    if (time_after_eq(jiffies, sd->last_balance + interval)) {
        if (load_balance(cpu, rq, sd, idle, &balance)) {
......
        }
        // Remember when this domain was balanced last
        sd->last_balance = jiffies;
    }
......
}

负载均衡过程load_balance()
- 找到最忙的就绪队列（负荷值最大的队列）
- 使用move_tasks将该队列中适当数目的进程迁移到当前队列
- move_tasks内部再调用各调度器类的load_balance方法完成实际迁移

/*
 * Balance load towards this_cpu (excerpt): find the busiest run queue
 * and try to move tasks from it to this_rq.
 */
static int load_balance(int this_cpu, struct rq *this_rq,
			struct sched_domain *sd, enum cpu_idle_type idle,
			int *balance)
{
......
    // Locate the busiest group, then the busiest run queue inside it
    group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
				   &cpus, balance);
	busiest = find_busiest_queue(group, idle, imbalance, &cpus);
......

    // Records whether anything was migrated
	ld_moved = 0;
	if (busiest->nr_running > 1) {
		/*
		 * Attempt to move tasks. If find_busiest_group has found
		 * an imbalance but busiest->nr_running <= 1, the group is
		 * still unbalanced. ld_moved simply stays zero, so it is
		 * correctly treated as an imbalance.
		 */
        // Move a suitable number of tasks over to the current run queue
		ld_moved = move_tasks(this_rq, this_cpu, busiest,
				      imbalance, sd, idle, &all_pinned);
    }
......
}

进程迁移原则
- 待迁移的进程当前不能正在运行，也不能是刚刚结束运行（否则该进程的数据仍在原CPU的高速缓存中，迁移会使缓存带来的好处丧失）
- 根据进程的CPU亲和性，待迁移的进程必须允许在当前就绪队列所关联的处理器上运行
- 进程均衡操作失败，则唤醒最忙就绪队列的迁移线程进行主动负载均衡：load_balance设置最忙就绪队列的active_balance标志，并将发起请求的CPU记录到该队列的push_cpu中

2. 迁移线程

迁移线程：是一个执行migration_thread的内核线程。用于完成发自调度器的迁移请求、用于实现主动负载均衡

// Entry point 1: when do_execve starts a program it calls sched_exec(),
// which may move the process to a more suitable CPU (excerpt)
int do_execve(char * filename,
	char __user *__user *argv,
	char __user *__user *envp,
	struct pt_regs * regs)
{
......
    file = open_exec(filename);
	sched_exec();
......
}
/*
 * Called on exec: ask sched_balance_self() for a target CPU and, if it
 * differs from the current one, migrate the current task there.
 */
void sched_exec(void)
{
	int new_cpu, this_cpu = get_cpu();
    // Pick the CPU to run on (the notes describe it as the least-loaded CPU)
	new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC);
	put_cpu();
	// Only migrate when a different CPU was chosen
	if (new_cpu != this_cpu)
		sched_migrate_task(current, new_cpu);
}
/*
 * Move task p to dest_cpu (excerpt). When migrate_task() returns non-zero,
 * the run queue's migration thread is woken and the caller waits until the
 * request has completed.
 */
static void sched_migrate_task(struct task_struct *p, int dest_cpu)
{
......
	/* force the process onto the specified CPU */
	if (migrate_task(p, dest_cpu, &req)) {
		/* Need to wait for migration thread (might exit: take ref). */
		struct task_struct *mt = rq->migration_thread;

		get_task_struct(mt);
		task_rq_unlock(rq, &flags);
        // Wake up the migration thread
		wake_up_process(mt);
		put_task_struct(mt);
		// Block until the migration thread signals completion of the request
		wait_for_completion(&req.done);

		return;
	}
......
}

// Entry point 2: when load_balance finds active_balance set, it wakes the
// migration thread of the busiest run queue (excerpt)
static int load_balance(int this_cpu, struct rq *this_rq,
			struct sched_domain *sd, enum cpu_idle_type idle,
			int *balance)
{
......
			if (active_balance)
				wake_up_process(busiest->migration_thread);
......
}

/*
 * migration_thread - this is a highprio system thread that performs
 * thread migration by bumping thread off CPU then 'pushing' onto
 * another runqueue.
 *
 * Excerpt: each loop iteration first handles a pending active-balance
 * request, then serves one entry of rq->migration_queue, if any.
 */
static int migration_thread(void *data)
{
	while (!kthread_should_stop()) {
......
		// active_balance is set: active load balancing was requested
		if (rq->active_balance) {
            // Per the notes this ends up in the scheduling class's
            // move_one_task to migrate a task (not visible in this excerpt)
			active_load_balance(rq, cpu);
			rq->active_balance = 0;
		}

		head = &rq->migration_queue;
		// Empty migration queue means no pending request: call schedule()
		// and wait until migration_thread gets to run again
		if (list_empty(head)) {
			spin_unlock_irq(&rq->lock);
			schedule();
			set_current_state(TASK_INTERRUPTIBLE);
			continue;
		}
		// Take the first request off the queue
		req = list_entry(head->next, struct migration_req, list);
		list_del_init(head->next);
		spin_unlock(&rq->lock);
		// A request is pending: call __migrate_task to move the requested task
		__migrate_task(req->task, cpu, req->dest_cpu);
		local_irq_enable();
......
	}
}

3. 与单处理器系统相比的区别

exec系统调用启动新进程，会挑选当前负荷最小的CPU，将进程迁移到该CPU上执行
CFS调度器的三个内核参数调整：CPU数目越多，调度延迟增加
- sysctl_sched_min_granularity和sysctl_sched_latency乘以因子 $1+\log_2(\mathrm{nr\_cpus})$ 。注意它们不能超出200ms。
- sysctl_sched_wakeup_granularity也乘以因子 $1+\log_2(\mathrm{nr\_cpus})$ ，但没有上限

2.8.2 调度域和控制组

调度实体：调度器不直接处理进程，而是处理调度实体。好处是可以实现组调度

进程位于不同的组中，在组间保持公平；然后对组内的进程保持公平（例：对每个用户有相同的CPU时间份额，每个用户再细分自己的CPU份额）
内核还提供控制组（control group），通过cgroups创建任意进程集合，甚至可以分为多个层次（如容器）

进程加入队列时，会遍历sched_entity的parent成员定义的调度层次结构，每个实体都加入就绪队列

/* Scheduling entity (excerpt showing only the group-scheduling member). */
struct sched_entity {
#ifdef CONFIG_FAIR_GROUP_SCHED
	/* Parent entity; the parent links define the scheduling hierarchy */
	struct sched_entity	*parent;
#endif
};

#ifdef CONFIG_FAIR_GROUP_SCHED
/*
 * Walk up scheduling entities hierarchy.
 * The macro advances 'se' itself via se->parent until it becomes NULL,
 * so the caller's variable is modified by the loop.
 */
#define for_each_sched_entity(se)  for (; se; se = se->parent)
#endif

/*
 * Enqueue the scheduling entity of task p and, following the parent
 * links, each ancestor entity that is not yet on a run queue.
 */
static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
{
	struct cfs_rq *cfs_rq;
	struct sched_entity *se = &p->se;

	for_each_sched_entity(se) {
		/* Stop at the first entity that is already queued */
		if (se->on_rq)
			break;
		cfs_rq = cfs_rq_of(se);
		enqueue_entity(cfs_rq, se, wakeup);
		/* Every entity after the first is enqueued with wakeup set to 1 */
		wakeup = 1;
	}
}

2.8.3 内核抢占和低延迟

1. 内核抢占

【背景】早期内核只有在进程主动让出CPU调用schedule()，或中断返回用户空间触发进程的调度。有以下问题：

中断发生在内核中，需要等系统调用或中断返回才能触发调度
优先级反转：低优先级进程触发中断，可能导致高优先级进程无法及时响应

内核抢占：内核可以启用内核抢占特性，从而更快速地用紧急进程替代当前进程

如果高优先级进程有事情需要完成，在启用内核抢占的情况下，用户空间应用程序可以被中断，内核也可以被中断
注意内核不能在任意点抢占，有些临界区需要加锁保护，只允许一个CPU访问
为了跟踪能否抢占，设置有一个抢占计数器：thread_info->preempt_count，0表示可抢占；>0表示不可抢占
- 通过inc_preempt_count和dec_preempt_count辅助增减preempt_count
- +1表示普通抢占，+PREEMPT_ACTIVE表示内核抢占

抢占处理：

preempt_disable：preempt_count+1，停用抢占。还会做内存屏障处理
preempt_enable：preempt_count-1，启用抢占。启用后使用preempt_check_resched检测是否有必要重调度
preempt_check_resched：检测是否有必要重调度。如果设置有TIF_NEED_RESCHED标志，说明有进程在等待CPU时间，内核需要发起重调度
- 周期性调度处理时，调用resched_task()设置标志（2.6.5节）
- 唤醒抢占进程时，调用resched_task()设置标志（2.6.6节）
- 新进程加入队列时，调用resched_task()设置标志（2.6.7节）
preempt_enable_no_resched：启用抢占，但不进行重调度

/*
 * Disable preemption: increment the preemption counter, then issue a
 * compiler barrier.
 */
#define preempt_disable() \
do { \
	inc_preempt_count(); \
	barrier(); \
} while (0)

/*
 * Re-enable preemption without checking for a pending reschedule:
 * compiler barrier first, then decrement the preemption counter.
 */
#define preempt_enable_no_resched() \
do { \
	barrier(); \
	dec_preempt_count(); \
} while (0)

/*
 * Check whether a reschedule is pending: if the TIF_NEED_RESCHED flag is
 * set (i.e. some task is waiting for CPU time), call preempt_schedule().
 * This macro only tests the flag; it does not set it.
 */
#define preempt_check_resched() \
do { \
	if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) \
		preempt_schedule(); \
} while (0)

/*
 * Re-enable preemption, then check whether a reschedule is pending and
 * perform it if so.
 */
#define preempt_enable() \
do { \
	preempt_enable_no_resched(); \
	barrier(); \
	preempt_check_resched(); \
} while (0)

其中preempt_schedule()是内核抢占处理流程。可以看出来，当开启内核抢占机制后，高优先级进程可以直接执行，比传统方式更快替代当前进程

/* Large value added to preempt_count to mark a kernel preemption in progress */
#define PREEMPT_ACTIVE		0x10000000

/* Carry out a kernel preemption (excerpt) */
asmlinkage void __sched preempt_schedule(void)
{
......
    // Preemption is not allowed here (non-zero preempt_count or IRQs
    // disabled): return immediately
	if (likely(ti->preempt_count || irqs_disabled()))
		return;
	// Preemption is allowed: add the large PREEMPT_ACTIVE value to the
	// preemption counter so schedule() can tell this is a kernel preemption
	do {
		add_preempt_count(PREEMPT_ACTIVE);
		schedule();
		sub_preempt_count(PREEMPT_ACTIVE);
......
		barrier();
        // Check again so that a reschedule request raised between
        // schedule() and this point is not missed
	} while (unlikely(test_thread_flag(TIF_NEED_RESCHED)));
}

/*
 * Main scheduler function (excerpt showing how PREEMPT_ACTIVE affects the
 * handling of the previous task).
 */
asmlinkage void __sched schedule(void)
{
......
    // The previous task is only considered for removal from the run queue
    // when its state is non-zero AND this call is not a kernel preemption.
    // With PREEMPT_ACTIVE set, the whole block is skipped, so a preempted
    // task is never deactivated here.
	if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
		// An interruptible task with a pending signal is set back to
		// TASK_RUNNING; any other task is deactivated
		if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
				unlikely(signal_pending(prev)))) {
			prev->state = TASK_RUNNING;
		} else {
			deactivate_task(rq, prev, 1);
		}
	}
......
}

2. 低延迟

【背景】内核中耗时长的操作不应该完全占据整个系统，内核应不时地检测是否有另一个进程可运行并调度运行

此机制不依赖于内核抢占
内核在耗时比较长的函数（比如大量读取操作的循环）加入cond_resched()，主动让出CPU，降低系统响应延迟

// Illustrative pattern: a long-running loop calls cond_resched() on every iteration
for(;;) {
    // Read data
    ......
    cond_resched();
}

/*
 * Conditional reschedule point for long-running kernel code.
 * Returns 1 if a reschedule was performed, 0 otherwise.
 */
int __sched cond_resched(void) {
    // Reschedule only if a reschedule is requested (need_resched()), no
    // kernel preemption is in progress (PREEMPT_ACTIVE not set) and the
    // system state is SYSTEM_RUNNING
	if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) &&
					system_state == SYSTEM_RUNNING) {
		__cond_resched();
		return 1;
	}
	return 0;
}

/*
 * Helper for cond_resched() (excerpt): call schedule() with PREEMPT_ACTIVE
 * added to the preemption counter, repeating while a reschedule is still
 * requested.
 */
static void __cond_resched(void) {
......
	do {
		add_preempt_count(PREEMPT_ACTIVE);
		schedule();
		sub_preempt_count(PREEMPT_ACTIVE);
	} while (need_resched());
}

【深入Linux内核架构笔记】第二章 进程调度(5)

2.8 调度器增强

2.8.1 SMP多处理器系统调度

2.8.2 调度域和控制组

2.8.3 内核抢占和低延迟

【深入Linux内核架构笔记】第二章进程调度(5)