论文里的模型

EEVDF algorithm: A new quantum is allocated to the client that has the eligible request with the earliest virtual deadline.

$w_i$ denotes the weight associated to client i, and $A(t)$ is the set of all clients active at time t.

The share of client i at time t, denoted $f_i(t)$ , is defined as:

f_i(t)=\frac{w_i}{\sum_{j\in A(t)}{w_j}}

$t_0$ 到 $t_1$ 的 service time 可以表示为

S_i(t_0, t_1)=\int_{t_0}^{t_1}f_i(\tau)d\tau

将 $f_i(t)$ 代入上面公式：

S_i(t_1,t_2)=w_i\int_{t_1}^{t_2}\frac{1}{\sum_{j\in A(\tau)}w_j}d\tau

lag 是一个 client 应该接收的 service time 减去实际接收到的 service time:

lag_i(t)=S_i(t_0^i,t)-s_i(t_0^i,t)

这里 $s_i(t_0^i,t)$ 是 client 在 $[t_0^i,t]$ 实际的 service time

virtual time 是

V(t)=\int_{0}^{t}\frac{1}{\sum_{j\in A(\tau)}w_j}d\tau

所以 S 也可以表示为

S_i(t_1,t_2)=w_i(V(t_2)-V(t_1))

由上面的公式可以推出 V(e) 和 V(d)

The virtual eligible time V(e) is

V(e)=V(t_0^i)+\frac{s_i(t_0^i,t)}{w_i}

The virtual deadline is

V(d)=V(e)+\frac{r}{w_i}

$r=S_i(e,d)$

If the client uses each time the entire service time it has requested:

\begin{align} ve^{(1)} &= V(t_0^i) \\ vd^{(k)} &= ve^{(k)} + \frac{r^{(k)}}{w_i} \\ ve^{(k+1)} &= vd^{(k)} \end{align}

Linux 中的实现

Fair schedulers 需要确保 $\sum{lag_i}=0$ where $lag_i=S-s_i=w_i*(V-v_i)$

S 是 ideal service time，V 是 virtual time

可以推出：

\sum{(w_i*V-w_i*v_i)}=0

V=\frac{\sum v_i*w_i}{\sum w_i}=\frac{\sum v_i*w_i}{W}

从这个公式可以看出，V 是所有 se 的 virtual time 按权重计算的加权平均值（实际实现中，每当有 se 加入或离开时，V 会相应地增减： $V \pm= \frac{lag_i}{W}$ ）

Linux 中为了解决 $v_i$ 的溢出问题，使用了下面的公式来表示 $v_i$

v_i=(v_i-v_0)+v_0

所以V就是

V=\frac{\sum{(v_i-v_0+v_0)*w_i}}{\sum{w_i}}=\frac{\sum{(v_i-v_0)*w_i}}{W}+v_0

所以：

cfs_rq→min_vruntime= $v_0$

cfs_rq→avg_vruntime= $\sum{(v_i-v_0)*w_i}$

cfs_rq→avg_load= $\sum{w_i}$

如何挑选下一个任务执行

挑选方法：挑选 eligible 且 virtual deadline 最小的：

/*
 * pick_eevdf - select the next entity to run from @cfs_rq.
 *
 * Applies the EEVDF rule: among the eligible entities, pick the one with the
 * earliest virtual deadline. The candidates are the entities reachable from
 * cfs_rq->tasks_timeline plus cfs_rq->curr, which is handled separately.
 *
 * May return NULL when no candidate qualifies.
 */
static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
{
	struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node;
	struct sched_entity *se = __pick_first_entity(cfs_rq); // 1
	struct sched_entity *curr = cfs_rq->curr;
	struct sched_entity *best = NULL;

	/*
	 * We can safely skip eligibility check if there is only one entity
	 * in this cfs_rq, saving some cycles.
	 */
	if (cfs_rq->nr_running == 1)
		return curr && curr->on_rq ? curr : se;

	/* Drop curr as a candidate when it is off the runqueue or not eligible. */
	if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr))) // 2
		curr = NULL;

	/*
	 * Once selected, run a task until it either becomes non-eligible or
	 * until it gets a new slice. See the HACK in set_next_entity().
	 *
	 * NOTE(review): vlag is compared against deadline here, so vlag is
	 * presumably reused to remember the deadline curr was picked with;
	 * set_next_entity() is not shown, confirm there.
	 */
	if (sched_feat(RUN_TO_PARITY) && curr && curr->vlag == curr->deadline)
		return curr;

	/* Pick the leftmost entity if it's eligible */
	if (se && entity_eligible(cfs_rq, se)) { // 3
		best = se;
		goto found;
	}

	/* Heap search for the EEVD entity */
	while (node) { // 4
		struct rb_node *left = node->rb_left;

		/*
		 * Eligible entities in left subtree are always better
		 * choices, since they have earlier deadlines.
		 *
		 * The subtree's min_vruntime is tested, so one check tells
		 * whether anything below @left can be eligible.
		 */
		if (left && vruntime_eligible(cfs_rq,
					__node_2_se(left)->min_vruntime)) {
			node = left;
			continue;
		}

		se = __node_2_se(node);

		/*
		 * The left subtree either is empty or has no eligible
		 * entity, so check the current node since it is the one
		 * with earliest deadline that might be eligible.
		 */
		if (entity_eligible(cfs_rq, se)) {
			best = se;
			break;
		}

		/* Neither the left subtree nor this node qualifies: go right. */
		node = node->rb_right;
	}
found:
	/*
	 * Fall back to curr when the tree produced nothing, or prefer curr
	 * when its deadline is earlier than the tree's best. curr may be NULL
	 * here, in which case NULL is returned if best is also NULL.
	 */
	if (!best || (curr && entity_before(curr, best)))
		best = curr;

	return best;
}

1. 获取红黑树最左节点的 se
2. 如果 curr 不在队列里或者不是 eligible，则将 curr 置为 NULL
3. 如果最左的节点是 eligible，则直接选中它
4. 否则遍历红黑树找到符合条件的节点；树节点的 min_vruntime 是以该节点为根的子树中 vruntime 的最小值

判断是否 eligible：lag ≥ 0 表示理想的 service time 大于等于实际的 service time，所以是 eligible 的：

/*
 * vruntime_eligible - test whether @vruntime is eligible on @cfs_rq.
 *
 * With V = avg / load + min_vruntime being the weighted average vruntime,
 * a vruntime is eligible when it is not ahead of V (lag >= 0). The test is
 * evaluated in its multiplied-out form
 *
 *	avg >= (vruntime - min_vruntime) * load
 *
 * so no division is needed.
 *
 * Returns non-zero when eligible, 0 otherwise.
 */
static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime)
{
	struct sched_entity *curr = cfs_rq->curr;
	s64 avg = cfs_rq->avg_vruntime;
	long load = cfs_rq->avg_load;

	/*
	 * Add curr's weighted key and weight by hand when it is on the
	 * runqueue.
	 * NOTE(review): this implies cfs_rq->avg_vruntime / avg_load do not
	 * already account for curr; confirm against avg_vruntime_add().
	 */
	if (curr && curr->on_rq) {
		unsigned long weight = scale_load_down(curr->load.weight);

		avg += entity_key(cfs_rq, curr) * weight;
		load += weight;
	}

	/* The (s64) cast keeps the offset from min_vruntime signed. */
	return avg >= (s64)(vruntime - cfs_rq->min_vruntime) * load;
}

更新正在运行实体的 deadline 和 vruntime

/*
 * update_curr - account the runtime consumed by the running entity.
 *
 * Advances cfs_rq->curr's vruntime by the weighted execution delta, then
 * refreshes its deadline and the runqueue's min_vruntime. Does nothing when
 * there is no current entity or no positive delta to account.
 */
static void update_curr(struct cfs_rq *cfs_rq)
{
	struct sched_entity *curr = cfs_rq->curr;
	s64 delta_exec;

	if (unlikely(!curr))
		return;

	/*
	 * NOTE(review): update_curr_se() is not shown; it presumably returns
	 * the execution time since the previous update.
	 */
	delta_exec = update_curr_se(rq_of(cfs_rq), curr); // 1
	if (unlikely(delta_exec <= 0))
		return;

	/* Convert the execution delta into virtual time and accumulate it. */
	curr->vruntime += calc_delta_fair(delta_exec, curr); // 2
	update_deadline(cfs_rq, curr); // 3
	update_min_vruntime(cfs_rq); // 4

	/* Task-level accounting applies only when the entity is a task. */
	if (entity_is_task(curr))
		update_curr_task(task_of(curr), delta_exec);

	account_cfs_rq_runtime(cfs_rq, delta_exec);
}

1. 计算当前与上一次执行时间的差值
2. 将 wall time 差值转为 vruntime
3. 更新 deadline
4. 更新 min_vruntime

/*
 * update_deadline - give @se a new request once its deadline is reached.
 *
 * While se->vruntime is still before se->deadline nothing changes. Once
 * vruntime reaches the deadline, the slice is reset, a new virtual deadline
 * is computed from the current vruntime, and a reschedule is requested if
 * more than one entity is runnable on @cfs_rq.
 */
static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	/* Signed difference, so the comparison survives counter wrap-around. */
	if ((s64)(se->vruntime - se->deadline) < 0)
		return;

	/*
	 * For EEVDF the virtual time slope is determined by w_i (iow.
	 * nice) while the request time r_i is determined by
	 * sysctl_sched_base_slice.
	 */
	se->slice = sysctl_sched_base_slice; // 1

	/*
	 * EEVDF: vd_i = ve_i + r_i / w_i
	 */
	se->deadline = se->vruntime + calc_delta_fair(se->slice, se); // 2

	/*
	 * The task has consumed its request, reschedule.
	 * Skipped when this is the only runnable entity.
	 */
	if (cfs_rq->nr_running > 1) {
		resched_curr(rq_of(cfs_rq));
		clear_buddies(cfs_rq, se);
	}
}

1. 确定 $r_i$ ：使用 sysctl_sched_base_slice
2. 计算 vd，calc_delta_fair 会返回 $r_i \times \frac{NICE\_0\_LOAD}{w_i}$

place_entity

place_entity 函数属于 CFS 调度器算法内部使用的一个函数，其作用是调整进程调度实体的虚拟运行时间。传入的第三个参数 flags 带有 ENQUEUE_INITIAL 标志时表示是新创建的进程，否则是被唤醒的进程。

如果 sched_feat(PLACE_LAG) 和 sched_feat(PLACE_DEADLINE_INITIAL) 没有设置的话 vruntime 就是 avg_vruntime

sched_feat(PLACE_DEADLINE_INITIAL) && (flags & ENQUEUE_INITIAL) 会将 vslice 除以 2，这样获得的 deadline 会更小，也就更容易被执行到（se→deadline = se→vruntime + vslice）

启用 sched_feat(PLACE_LAG) 时：新加入一个 entity 后 V 会随之移动，该 entity 的 lag 会变小（变为原来的 $\frac{W}{W+w_i}$ ），所以为了让加入后的 lag 保持不变，需要先将 lag 放大为原来的 $\frac{W+w_i}{W}$ 倍

/*
 * place_entity - set the vruntime and deadline of an entity joining @cfs_rq.
 * @cfs_rq: runqueue the entity is being placed on
 * @se:     entity to place; its slice, vruntime and deadline are written
 * @flags:  enqueue flags; ENQUEUE_INITIAL selects the half-slice deadline
 *
 * The entity is placed at V - lag, where V is avg_vruntime(cfs_rq) and lag
 * is zero unless PLACE_LAG is enabled and the runqueue is non-empty.
 */
static void
place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
	u64 vslice, vruntime = avg_vruntime(cfs_rq);
	s64 lag = 0;

	se->slice = sysctl_sched_base_slice;
	vslice = calc_delta_fair(se->slice, se);

	/*
	 * Due to how V is constructed as the weighted average of entities,
	 * adding tasks with positive lag, or removing tasks with negative lag
	 * will move 'time' backwards, this can screw around with the lag of
	 * other tasks.
	 *
	 * EEVDF: placement strategy #1 / #2
	 */
	if (sched_feat(PLACE_LAG) && cfs_rq->nr_running) {
		struct sched_entity *curr = cfs_rq->curr;
		unsigned long load;

		lag = se->vlag;

		/* Total weight W of the queue, adding curr when it is on_rq. */
		load = cfs_rq->avg_load;
		if (curr && curr->on_rq)
			load += scale_load_down(curr->load.weight);

		/*
		 * Inflate the lag by (W + w_i) / W so that it equals
		 * se->vlag again after this entity has been added to V.
		 * A zero load is replaced by 1 to avoid dividing by zero.
		 */
		lag *= load + scale_load_down(se->load.weight);
		if (WARN_ON_ONCE(!load))
			load = 1;
		lag = div_s64(lag, load);
	}

	se->vruntime = vruntime - lag;

	/*
	 * When joining the competition; the existing tasks will be,
	 * on average, halfway through their slice, as such start tasks
	 * off with half a slice to ease into the competition.
	 */
	if (sched_feat(PLACE_DEADLINE_INITIAL) && (flags & ENQUEUE_INITIAL))
		vslice /= 2;

	/*
	 * EEVDF: vd_i = ve_i + r_i/w_i
	 */
	se->deadline = se->vruntime + vslice;
}

节点插入红黑树

/*
 * __enqueue_entity - insert @se into the tasks_timeline tree of @cfs_rq.
 *
 * Adds the entity to the runqueue's average-vruntime bookkeeping, seeds its
 * subtree minimum with its own vruntime, and inserts it into the augmented,
 * cached rbtree ordered by __entity_less().
 */
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	avg_vruntime_add(cfs_rq, se);
	/* Initial subtree minimum is just this entity's own vruntime. */
	se->min_vruntime = se->vruntime;
	rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
				__entity_less, &min_vruntime_cb);
}

插入位置判断：

static inline bool entity_before(const struct sched_entity *a,
				 const struct sched_entity *b)
{
	/*
	 * Tiebreak on vruntime seems unnecessary since it can
	 * hardly happen.
	 */
	return (s64)(a->deadline - b->deadline) < 0;
}

/*
 * __entity_less - rbtree ordering callback for the tasks_timeline tree.
 *
 * Maps both rb_nodes back to their sched_entity and orders them with
 * entity_before(), i.e. by virtual deadline.
 */
static inline bool __entity_less(struct rb_node *a, const struct rb_node *b)
{
	const struct sched_entity *lhs = __node_2_se(a);
	const struct sched_entity *rhs = __node_2_se(b);

	return entity_before(lhs, rhs);
}

节点从红黑树删除

插入的逆过程这里不分析

reweight

这里使用了两个推论：

1. 如果 reweight 发生在 lag 不为 0 的时刻，就需要重新计算该 entity 的 vruntime
2. reweight 不会影响所有 entity 的 weighted average vruntime

/*
 * reweight_eevdf - rescale @se's vruntime and deadline for a weight change.
 * @cfs_rq: runqueue whose average vruntime V is the reference point
 * @se:     entity being re-weighted; se->load.weight is read as the old weight
 * @weight: the new weight, used as divisor
 *
 * Both the virtual lag (V - v) and the relative deadline (d - V) are scaled
 * by old_weight / weight around V. This function does not write
 * se->load.weight itself.
 *
 * NOTE(review): @weight is assumed to be non-zero; no check is visible here.
 */
static void reweight_eevdf(struct cfs_rq *cfs_rq, struct sched_entity *se,
			   unsigned long weight)
{
	unsigned long old_weight = se->load.weight;
	u64 avruntime = avg_vruntime(cfs_rq);
	s64 vlag, vslice;

	/*
	 * VRUNTIME
	 * ========
	 *
	 * COROLLARY #1: The virtual runtime of the entity needs to be
	 * adjusted if re-weight at !0-lag point.
	 *
	 * Proof: For contradiction assume this is not true, so we can
	 * re-weight without changing vruntime at !0-lag point.
	 *
	 *             Weight	VRuntime   Avg-VRuntime
	 *     before    w          v            V
	 *      after    w'         v'           V'
	 *
	 * Since lag needs to be preserved through re-weight:
	 *
	 *	lag = (V - v)*w = (V'- v')*w', where v = v'
	 *	==>	V' = (V - v)*w/w' + v		(1)
	 *
	 * Let W be the total weight of the entities before reweight,
	 * since V' is the new weighted average of entities:
	 *
	 *	V' = (WV + w'v - wv) / (W + w' - w)	(2)
	 *
	 * by using (1) & (2) we obtain:
	 *
	 *	(WV + w'v - wv) / (W + w' - w) = (V - v)*w/w' + v
	 *	==> (WV-Wv+Wv+w'v-wv)/(W+w'-w) = (V - v)*w/w' + v
	 *	==> (WV - Wv)/(W + w' - w) + v = (V - v)*w/w' + v
	 *	==>	(V - v)*W/(W + w' - w) = (V - v)*w/w' (3)
	 *
	 * Since we are doing at !0-lag point which means V != v, we
	 * can simplify (3):
	 *
	 *	==>	W / (W + w' - w) = w / w'
	 *	==>	Ww' = Ww + ww' - ww
	 *	==>	W * (w' - w) = w * (w' - w)
	 *	==>	W = w	(re-weight indicates w' != w)
	 *
	 * So the cfs_rq contains only one entity, hence vruntime of
	 * the entity @v should always equal to the cfs_rq's weighted
	 * average vruntime @V, which means we will always re-weight
	 * at 0-lag point, thus breach assumption. Proof completed.
	 *
	 *
	 * COROLLARY #2: Re-weight does NOT affect weighted average
	 * vruntime of all the entities.
	 *
	 * Proof: According to corollary #1, Eq. (1) should be:
	 *
	 *	(V - v)*w = (V' - v')*w'
	 *	==>    v' = V' - (V - v)*w/w'		(4)
	 *
	 * According to the weighted average formula, we have:
	 *
	 *	V' = (WV - wv + w'v') / (W - w + w')
	 *	   = (WV - wv + w'(V' - (V - v)w/w')) / (W - w + w')
	 *	   = (WV - wv + w'V' - Vw + wv) / (W - w + w')
	 *	   = (WV + w'V' - Vw) / (W - w + w')
	 *
	 *	==>  V'*(W - w + w') = WV + w'V' - Vw
	 *	==>	V' * (W - w) = (W - w) * V	(5)
	 *
	 * If the entity is the only one in the cfs_rq, then reweight
	 * always occurs at 0-lag point, so V won't change. Or else
	 * there are other entities, hence W != w, then Eq. (5) turns
	 * into V' = V. So V won't change in either case, proof done.
	 *
	 *
	 * So according to corollary #1 & #2, the effect of re-weight
	 * on vruntime should be:
	 *
	 *	v' = V' - (V - v) * w / w'		(4)
	 *	   = V  - (V - v) * w / w'
	 *	   = V  - vl * w / w'
	 *	   = V  - vl'
	 */
	/* At the 0-lag point (v == V) the vruntime needs no adjustment. */
	if (avruntime != se->vruntime) {
		vlag = (s64)(avruntime - se->vruntime);
		vlag = div_s64(vlag * old_weight, weight);
		se->vruntime = avruntime - vlag;
	}

	/*
	 * DEADLINE
	 * ========
	 *
	 * When the weight changes, the virtual time slope changes and
	 * we should adjust the relative virtual deadline accordingly.
	 *
	 *	d' = v' + (d - v)*w/w'
	 *	   = V' - (V - v)*w/w' + (d - v)*w/w'
	 *	   = V  - (V - v)*w/w' + (d - v)*w/w'
	 *	   = V  + (d - V)*w/w'
	 */
	/* vslice here is the deadline's distance from V, not from v. */
	vslice = (s64)(se->deadline - avruntime);
	vslice = div_s64(vslice * old_weight, weight);
	se->deadline = avruntime + vslice;
}

个人博客：uran0sh.github.io/blog/

Linux EEVDF CPU Scheduler 分析