论文里的模型
EEVDF algorithm: A new quantum is allocated to the client that has the eligible request with the earliest virtual deadline.
denotes the weight associated to client i, and is the set of all clients active at time t.
The share of client i at time t, denoted , is defined as:
到的 service time 可以表示为
将代入上面公式:
lag 是一个 client 应该接收的 service time 减去实际接收到的 service time:
这里 是 client 在 实际的 service time
virtual time 是
所以 S 也可以表示为
由上面的公式可以退出 V(e) 和 V(d)
The virtual eligible time V(e) is
The virtual deadline is
If the client uses each time the entire service time it has requested:
Linux 中的实现
Fair schedulers 需要确保 where
S 是 ideal service time,V 是 virtual time
可以推出:
从这个公式可以看出这是所有 se 的 virtual time 的加权平均 virtual time (实际情况:)
Linux 中为了解决 的溢出问题,使用了下面的公式来表示
所以V就是
所以:
cfs_rq→min_vruntime=
cfs_rq→avg_vruntime=
cfs_rq→avg_load=
如何挑选下一个任务执行
挑选方法:挑选 eligible 且 virtual deadline 最小的:
static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
{
struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node;
struct sched_entity *se = __pick_first_entity(cfs_rq); // 1
struct sched_entity *curr = cfs_rq->curr;
struct sched_entity *best = NULL;
/*
* We can safely skip eligibility check if there is only one entity
* in this cfs_rq, saving some cycles.
*/
if (cfs_rq->nr_running == 1)
return curr && curr->on_rq ? curr : se;
if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr))) // 2
curr = NULL;
/*
* Once selected, run a task until it either becomes non-eligible or
* until it gets a new slice. See the HACK in set_next_entity().
*/
if (sched_feat(RUN_TO_PARITY) && curr && curr->vlag == curr->deadline)
return curr;
/* Pick the leftmost entity if it's eligible */
if (se && entity_eligible(cfs_rq, se)) { // 3
best = se;
goto found;
}
/* Heap search for the EEVD entity */
while (node) { // 4
struct rb_node *left = node->rb_left;
/*
* Eligible entities in left subtree are always better
* choices, since they have earlier deadlines.
*/
if (left && vruntime_eligible(cfs_rq,
__node_2_se(left)->min_vruntime)) {
node = left;
continue;
}
se = __node_2_se(node);
/*
* The left subtree either is empty or has no eligible
* entity, so check the current node since it is the one
* with earliest deadline that might be eligible.
*/
if (entity_eligible(cfs_rq, se)) {
best = se;
break;
}
node = node->rb_right;
}
found:
if (!best || (curr && entity_before(curr, best)))
best = curr;
return best;
}
- 获取红黑树最左节点的 se
- curr 不在队列里或者不是 eligible,则 curr 置为 NULL
- 最左的节点是 eligible, 则找到
- 遍历红黑树找到符合的节点,树节点的 min_vruntime 是子树中 vruntime 的最小值
判断是否 eligible,lag ≥ 0 表示理想的 service time 大于实际的 service time 所以是 eligible 的:
static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime)
{
struct sched_entity *curr = cfs_rq->curr;
s64 avg = cfs_rq->avg_vruntime;
long load = cfs_rq->avg_load;
if (curr && curr->on_rq) {
unsigned long weight = scale_load_down(curr->load.weight);
avg += entity_key(cfs_rq, curr) * weight;
load += weight;
}
return avg >= (s64)(vruntime - cfs_rq->min_vruntime) * load;
}
更新正在运行实体的 deadline 和 vruntime
static void update_curr(struct cfs_rq *cfs_rq)
{
struct sched_entity *curr = cfs_rq->curr;
s64 delta_exec;
if (unlikely(!curr))
return;
delta_exec = update_curr_se(rq_of(cfs_rq), curr); // 1
if (unlikely(delta_exec <= 0))
return;
curr->vruntime += calc_delta_fair(delta_exec, curr); // 2
update_deadline(cfs_rq, curr); // 3
update_min_vruntime(cfs_rq); // 4
if (entity_is_task(curr))
update_curr_task(task_of(curr), delta_exec);
account_cfs_rq_runtime(cfs_rq, delta_exec);
}
- 计算当前与上一次执行时间的差值
- 将 wall time 差值转为 vruntime
- 更新 deadline
- 更新 min_vruntime
static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
if ((s64)(se->vruntime - se->deadline) < 0)
return;
/*
* For EEVDF the virtual time slope is determined by w_i (iow.
* nice) while the request time r_i is determined by
* sysctl_sched_base_slice.
*/
se->slice = sysctl_sched_base_slice; // 1
/*
* EEVDF: vd_i = ve_i + r_i / w_i
*/
se->deadline = se->vruntime + calc_delta_fair(se->slice, se); // 2
/*
* The task has consumed its request, reschedule.
*/
if (cfs_rq->nr_running > 1) {
resched_curr(rq_of(cfs_rq));
clear_buddies(cfs_rq, se);
}
}
- 的确定
- 计算 vd, calc_delta_fair 会返回
place_entity
place_entity 函数属于 CFS 调度器算法内部使用的一个函数,其作用是调整进程调度实体的虚拟运行时间,传入的第三个参数 initial 为 1 的情况下表示是新创建的进程,否则是被唤醒的进程。
如果 sched_feat(PLACE_LAG) 和 sched_feat(PLACE_DEADLINE_INITIAL) 没有设置的话 vruntime 就是 avg_vruntime
sched_feat(PLACE_DEADLINE_INITIAL) && (flag & ENQUEUE_INITIAL) 会将 vslice 除 2,这样获得的 deadline 会更小,也就更容易被执行到(se→deadline = se→vruntime + vslice)
sched_feat(PLACE_LAG) 在新添加一个 entity 后 lag 会变小,所以为了保持 lag 不变,需要先将 lag 的值变大
static void
place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
u64 vslice, vruntime = avg_vruntime(cfs_rq);
s64 lag = 0;
se->slice = sysctl_sched_base_slice;
vslice = calc_delta_fair(se->slice, se);
/*
* Due to how V is constructed as the weighted average of entities,
* adding tasks with positive lag, or removing tasks with negative lag
* will move 'time' backwards, this can screw around with the lag of
* other tasks.
*
* EEVDF: placement strategy #1 / #2
*/
if (sched_feat(PLACE_LAG) && cfs_rq->nr_running) {
struct sched_entity *curr = cfs_rq->curr;
unsigned long load;
lag = se->vlag;
load = cfs_rq->avg_load;
if (curr && curr->on_rq)
load += scale_load_down(curr->load.weight);
lag *= load + scale_load_down(se->load.weight);
if (WARN_ON_ONCE(!load))
load = 1;
lag = div_s64(lag, load);
}
se->vruntime = vruntime - lag;
/*
* When joining the competition; the exisiting tasks will be,
* on average, halfway through their slice, as such start tasks
* off with half a slice to ease into the competition.
*/
if (sched_feat(PLACE_DEADLINE_INITIAL) && (flags & ENQUEUE_INITIAL))
vslice /= 2;
/*
* EEVDF: vd_i = ve_i + r_i/w_i
*/
se->deadline = se->vruntime + vslice;
}
节点插入红黑树
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
avg_vruntime_add(cfs_rq, se);
se->min_vruntime = se->vruntime;
rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
__entity_less, &min_vruntime_cb);
}
插入位置判断:
static inline bool entity_before(const struct sched_entity *a,
const struct sched_entity *b)
{
/*
* Tiebreak on vruntime seems unnecessary since it can
* hardly happen.
*/
return (s64)(a->deadline - b->deadline) < 0;
}
static inline bool __entity_less(struct rb_node *a, const struct rb_node *b)
{
return entity_before(__node_2_se(a), __node_2_se(b));
}
节点从红黑树删除
插入的逆过程这里不分析
reweight
这里使用了两个推论:
- 在每次 reweight 的时候我们需要重新计算 vruntime
- reweight 不会影响 weighted average vruntime
static void reweight_eevdf(struct cfs_rq *cfs_rq, struct sched_entity *se,
unsigned long weight)
{
unsigned long old_weight = se->load.weight;
u64 avruntime = avg_vruntime(cfs_rq);
s64 vlag, vslice;
/*
* VRUNTIME
* ========
*
* COROLLARY #1: The virtual runtime of the entity needs to be
* adjusted if re-weight at !0-lag point.
*
* Proof: For contradiction assume this is not true, so we can
* re-weight without changing vruntime at !0-lag point.
*
* Weight VRuntime Avg-VRuntime
* before w v V
* after w' v' V'
*
* Since lag needs to be preserved through re-weight:
*
* lag = (V - v)*w = (V'- v')*w', where v = v'
* ==> V' = (V - v)*w/w' + v (1)
*
* Let W be the total weight of the entities before reweight,
* since V' is the new weighted average of entities:
*
* V' = (WV + w'v - wv) / (W + w' - w) (2)
*
* by using (1) & (2) we obtain:
*
* (WV + w'v - wv) / (W + w' - w) = (V - v)*w/w' + v
* ==> (WV-Wv+Wv+w'v-wv)/(W+w'-w) = (V - v)*w/w' + v
* ==> (WV - Wv)/(W + w' - w) + v = (V - v)*w/w' + v
* ==> (V - v)*W/(W + w' - w) = (V - v)*w/w' (3)
*
* Since we are doing at !0-lag point which means V != v, we
* can simplify (3):
*
* ==> W / (W + w' - w) = w / w'
* ==> Ww' = Ww + ww' - ww
* ==> W * (w' - w) = w * (w' - w)
* ==> W = w (re-weight indicates w' != w)
*
* So the cfs_rq contains only one entity, hence vruntime of
* the entity @v should always equal to the cfs_rq's weighted
* average vruntime @V, which means we will always re-weight
* at 0-lag point, thus breach assumption. Proof completed.
*
*
* COROLLARY #2: Re-weight does NOT affect weighted average
* vruntime of all the entities.
*
* Proof: According to corollary #1, Eq. (1) should be:
*
* (V - v)*w = (V' - v')*w'
* ==> v' = V' - (V - v)*w/w' (4)
*
* According to the weighted average formula, we have:
*
* V' = (WV - wv + w'v') / (W - w + w')
* = (WV - wv + w'(V' - (V - v)w/w')) / (W - w + w')
* = (WV - wv + w'V' - Vw + wv) / (W - w + w')
* = (WV + w'V' - Vw) / (W - w + w')
*
* ==> V'*(W - w + w') = WV + w'V' - Vw
* ==> V' * (W - w) = (W - w) * V (5)
*
* If the entity is the only one in the cfs_rq, then reweight
* always occurs at 0-lag point, so V won't change. Or else
* there are other entities, hence W != w, then Eq. (5) turns
* into V' = V. So V won't change in either case, proof done.
*
*
* So according to corollary #1 & #2, the effect of re-weight
* on vruntime should be:
*
* v' = V' - (V - v) * w / w' (4)
* = V - (V - v) * w / w'
* = V - vl * w / w'
* = V - vl'
*/
if (avruntime != se->vruntime) {
vlag = (s64)(avruntime - se->vruntime);
vlag = div_s64(vlag * old_weight, weight);
se->vruntime = avruntime - vlag;
}
/*
* DEADLINE
* ========
*
* When the weight changes, the virtual time slope changes and
* we should adjust the relative virtual deadline accordingly.
*
* d' = v' + (d - v)*w/w'
* = V' - (V - v)*w/w' + (d - v)*w/w'
* = V - (V - v)*w/w' + (d - v)*w/w'
* = V + (d - V)*w/w'
*/
vslice = (s64)(se->deadline - avruntime);
vslice = div_s64(vslice * old_weight, weight);
se->deadline = avruntime + vslice;
}