WALT（Window Assisted Load Tracking）是高通研发的任务负载追踪算法，它以固定大小的时间窗口为单位，通过统计任务运行时间在窗口中的占用量来跟踪负载；相比于PELT算法，在功耗和负载准确性方面更具优势。本文的目的包括：

移植Pixel5内核代码中的WALT算法到基于RK3566的泰山派中。
对WALT算法进行分析。

由于pixel5和泰山派提供的android_sdk都采用了4.19内核，因此，内核接口上基本没有差异，直接开干。

首先看一下源码存放的位置：如下代码所示，Makefile中通过CONFIG_SCHED_WALT引入了三个源码文件。

// path: kernel/sched/Makefile
obj-$(CONFIG_SCHED_WALT) += walt.o boost.o sched_avg.o

我们知道负载追踪算法的主要逻辑在于walt中，剩下的boost和sched_avg是干啥的，后面继续分析，所以我们首先对walt进行移植。

walt移植

step1 拷贝代码进行编译

在泰山派android_sdk的SDK_PATH/kernel/kernel/sched/Makefile文件中，先把walt的编译选项加进去。

# 先使用-y，后续再进行宏配置，再将剩余两个文件进行注释，
obj-y += walt.o # boost.o sched_avg.o

将pixel5源码下的walt.c和walt.h拷贝到sched这个目录下，使其参与编译，然后直接进行一波编译，果然报错了，看看编译日志是存在很多未定义的宏，以及很多结构体中变量没有进行定义，如同下图所示，因此需要进行修改的地方还很多，不过没关系，我们进行移植的过程也是对流程的一个学习过程。

编译报错解决：

1. 根据报错可知，有很多宏定义没有实现。观察walt.h文件，发现这些宏都包含在CONFIG_SCHED_WALT条件编译之中，而我们是直接使用obj-y编译文件、并没有打开该配置，因此有些宏定义与函数是不存在的。
2. linux/sched/core_ctl.h头文件不存在。通过git log查看该文件，它也是walt算法的一个配套文件，先搞过来，同时解决一下CONFIG_SCHED_CORE_CTL宏未定义的问题。

3. 移植task_struct结构体中walt算法的内容，以及ravg结构体（关键结构体）。

/* Excerpt: WALT-related members added to struct task_struct (other members elided). */
struct task_struct {
...
//#ifdef CONFIG_SCHED_WALT
struct ravg ravg; // window-related load-tracking parameters
u32 init_load_pct; // initial load given to this task's child threads
u64 last_wake_ts; // timestamp of the most recent wakeup
u64 last_enqueued_ts; // timestamp of the most recent enqueue
struct related_thread_group *grp; // pointer to the related thread group
struct list_head grp_list; // list node for the related-group list
u64 cpu_cycles; // CPU cycle count (NOTE(review): the article glosses this as "instructions executed" - confirm)
bool misfit; // "misfit" flag (TODO: semantics not analysed yet)
/* TODO: the following members have not been analysed yet */
u32 unfilter;
bool low_latency;
bool rtg_high_prio;
//#endif
...
};

/* Capacity of ravg.sum_history[]. */
#define RAVG_HIST_SIZE_MAX 5
/* Number of entries in ravg.busy_buckets[]. */
#define NUM_BUSY_BUCKETS 10

/* Per-task window-based load-tracking state used by WALT. */
struct ravg {
    /*
     * Marks the start of an event inside a window (task wakeup, task
     * starting to run, task being preempted).
     */
    u64 mark_start;
    /*
     * sum:    how runnable the task was in the current window; covers both
     *         running and waiting time, and is normalized.
     * demand: the maximum sum seen over the previous
     *         sysctl_sched_ravg_hist_size windows; demand can drive the
     *         task's frequency requirement.
     */
    u32 sum, demand;
    u32 coloc_demand;
    /*
     * History of 'sum' seen over the previous RAVG_HIST_SIZE windows.
     * Windows in which the task slept the whole time are ignored.
     */
    u32 sum_history[RAVG_HIST_SIZE_MAX];
    /*
     * curr_window_cpu: the task's contribution to CPU busy time on each
     *                  CPU in the current window.
     * prev_window_cpu: the task's contribution to CPU busy time on each
     *                  CPU in the previous window.
     */
    u32 *curr_window_cpu, *prev_window_cpu;
    /*
     * curr_window: sum of all entries of curr_window_cpu.
     * prev_window: sum of all entries of prev_window_cpu.
     */
    u32 curr_window, prev_window;
    /* Predicted CPU busy time. */
    u32 pred_demand;
    /* Historical busy time grouped into buckets used for prediction. */
    u8 busy_buckets[NUM_BUSY_BUCKETS];
    /* Task demand normalized to 1024. */
    u16 demand_scaled;
    /* Predicted task demand normalized to 1024. */
    u16 pred_demand_scaled;
    u64 active_time;
    u64 last_win_size;
};

/*
 * Aggregate WALT statistics. Per the interface list in this article,
 * inc_nr_big_task()/dec_nr_big_task() take a pointer to this structure.
 */
struct walt_sched_stats {
    int nr_big_tasks;                    /* presumably the number of "big" tasks - TODO confirm */
    u64 cumulative_runnable_avg_scaled;  /* presumably the summed scaled demand of runnable tasks - TODO confirm */
    u64 pred_demands_sum_scaled;         /* presumably the summed scaled predicted demand - TODO confirm */
    unsigned int nr_rtg_high_prio_task;  /* presumably the count of high-priority related-thread-group tasks - TODO confirm */
};

step2 在原生逻辑中进行侵入式修改

首先排查一下walt中有哪些对外暴露的接口，这个我们通过观察walt.h文件就可以知道。

* bool prefer_spread_on_idle(int cpu, bool new_ilb)
* void walt_rotate_work_init(void)
* void walt_rotation_checkpoint(int nr_big)
* void walt_update_last_enqueue(struct task_struct *p)
* void update_task_ravg(struct task_struct *p, struct rq *rq, int event, u64 wallclock, u64 irqtime)
* void walt_inc_cumulative_runnable_avg(struct rq *rq, struct task_struct *p)
* unsigned int walt_big_tasks(int cpu)
* void walt_adjust_nr_big_tasks(struct rq *rq, int delta, bool inc)
* void inc_nr_big_task(struct walt_sched_stats *stats, struct task_struct *p)
* void dec_nr_big_task(struct walt_sched_stats *stats, struct task_struct *p)
* void fixup_busy_time(struct task_struct *p, int new_cpu)
* void init_new_task_load(struct task_struct *p)
* void mark_task_starting(struct task_struct *p)
* void set_window_start(struct rq *rq)
* int sched_cpu_high_irqload(int cpu)
* void sched_account_irqstart(int cpu, struct task_struct *curr, u64 wallclock)
* void update_cluster_topology(void)
* void init_clusters(void)
* void sched_account_irqtime(int cpu, struct task_struct *curr, u64 delta, u64 wallclock)
* int same_cluster(int src_cpu, int dst_cpu)
* bool do_pl_notif(struct rq *rq)
* void inc_rq_walt_stats(struct rq *rq, struct task_struct *p)
* void dec_rq_walt_stats(struct rq *rq, struct task_struct *p)
* void fixup_walt_sched_stats_common(struct rq *rq, struct task_struct *p, u16 updated_demand_scaled, u16 updated_pred_demand_scaled)
* u64 sched_irqload(int cpu)
* bool walt_should_kick_upmigrate(struct task_struct *p, int cpu)
* u64 get_rtgb_active_time(void)
* unsigned int walt_nr_rtg_high_prio(int cpu)

/*
 * update_history() - push new window sample(s) into a task's demand history
 * and recompute its demand.
 * @rq:      runqueue passed on to the stats fixup hooks and the tracepoint
 * @p:       task whose p->ravg state is updated
 * @runtime: value recorded for each new sample
 * @samples: number of new samples (windows) to record
 * @event:   triggering event, only forwarded to the tracepoint
 *
 * Nothing but the tracepoint runs when @runtime or @samples is zero, or when
 * @p is the idle task or an exiting task.
 */
static void update_history(struct rq *rq, struct task_struct *p, u32 runtime,
			   int samples, int event)
{
	u32 *history = &p->ravg.sum_history[0];
	u32 peak = 0;
	u64 total = 0;
	u32 mean, new_demand, new_pred;
	u16 new_demand_scaled, new_pred_scaled;
	int src, dst;

	/* Ignore windows where task had no activity */
	if (!runtime || is_idle_task(p) || exiting_task(p) || !samples)
		goto done;

	/*
	 * Make room for the new samples: move the surviving entries
	 * 'samples' slots towards the end of the array, working backwards
	 * from the last slot. With a history of 5 entries (the default
	 * according to the article), samples == 1 moves four entries,
	 * samples == 4 moves one, and samples >= 5 moves none.
	 * 'total' and 'peak' accumulate over the surviving entries.
	 */
	dst = sched_ravg_hist_size - 1;
	src = dst - samples;
	while (src >= 0) {
		history[dst] = history[src];
		total += history[dst];
		if (history[dst] > peak)
			peak = history[dst];
		--dst;
		--src;
	}

	/*
	 * Store 'runtime' in the first min(samples, history size) slots,
	 * e.g. slots 0..3 when samples == 4.
	 */
	for (dst = 0; dst < samples && dst < sched_ravg_hist_size; dst++) {
		history[dst] = runtime;
		total += history[dst];
		if (history[dst] > peak)
			peak = history[dst];
	}

	/* The running sum of the window has been consumed; start afresh. */
	p->ravg.sum = 0;

	/* Choose the demand according to the configured window policy. */
	if (sysctl_sched_window_stats_policy == WINDOW_STATS_RECENT) {
		new_demand = runtime;
	} else if (sysctl_sched_window_stats_policy == WINDOW_STATS_MAX) {
		new_demand = peak;
	} else {
		mean = div64_u64(total, sched_ravg_hist_size);
		new_demand = (sysctl_sched_window_stats_policy ==
			      WINDOW_STATS_AVG) ? mean : max(mean, runtime);
	}

	new_pred = predict_and_update_buckets(p, runtime);
	new_demand_scaled = scale_demand(new_demand);
	new_pred_scaled = scale_demand(new_pred);

	/*
	 * A throttled deadline sched class task gets dequeued without
	 * changing p->on_rq. Since the dequeue decrements walt stats
	 * avoid decrementing it here again.
	 *
	 * When window is rolled over, the cumulative window demand
	 * is reset to the cumulative runnable average (contribution from
	 * the tasks on the runqueue). If the current task is dequeued
	 * already, it's demand is not included in the cumulative runnable
	 * average. So add the task demand separately to cumulative window
	 * demand.
	 */
	if (!(task_has_dl_policy(p) && p->dl.dl_throttled)) {
		if (task_on_rq_queued(p) &&
		    p->sched_class->fixup_walt_sched_stats)
			p->sched_class->fixup_walt_sched_stats(
				rq, p, new_demand_scaled, new_pred_scaled);
		else if (rq->curr == p)
			walt_fixup_cum_window_demand(rq, new_demand_scaled);
	}

	/* Publish the recomputed values in the task's ravg state. */
	p->ravg.demand = new_demand;
	p->ravg.demand_scaled = new_demand_scaled;
	p->ravg.coloc_demand = div64_u64(total, sched_ravg_hist_size);
	p->ravg.pred_demand = new_pred;
	p->ravg.pred_demand_scaled = new_pred_scaled;

	/*
	 * Re-arm the unfilter period while the scaled demand is above the
	 * colocation threshold; otherwise count it down by the last window
	 * size, clamped at zero.
	 */
	if (new_demand_scaled > sysctl_sched_min_task_util_for_colocation)
		p->unfilter = sysctl_sched_task_unfilter_period;
	else if (p->unfilter)
		p->unfilter =
			max_t(int, 0, p->unfilter - p->ravg.last_win_size);
done:
	trace_sched_update_history(rq, p, runtime, samples, event);
}

代码分析

update_task_demand函数分析

p是指向task_struct结构体的指针，代表一个任务；
rq是指向rq（runqueue）结构体的指针，代表一个CPU的运行队列；
event是一个整数，表示触发更新的事件类型；
wallclock是一个64位无符号整数，表示当前的墙上时间（wall time）。 */ static u64 update_task_demand(struct task_struct *p, struct rq rq, int event, u64 wallclock) { /
- 从任务结构体p中获取一个指向walt_task_struct结构体的指针wts，
- 这个结构体包含了与WALT算法相关的特定于任务的数据。 */ struct walt_task_struct *wts = (struct walt_task_struct ) p->android_vendor_data1; u64 mark_start = wts->mark_start; /
- 从运行队列结构体rq中获取一个指向walt_rq结构体的指针wrq，
- 这个结构体包含了与WALT算法相关的特定于运行队列的数据。 */ struct walt_rq *wrq = (struct walt_rq ) rq->android_vendor_data1; /
- 定义变量delta和window_start，分别表示从mark_start到window_start的时间差，
- 以及运行队列的窗口开始时间。 / u64 delta, window_start = wrq->window_start; /
- 定义变量new_window，用于判断是否开始了一个新的窗口周期。 */ int new_window, nr_full_windows; u32 window_size = sched_ravg_window; u64 runtime;
new_window = mark_start < window_start; /*
- 如果当前时间没有被计为忙碌时间，并且开始了一个新的窗口周期，
- 则调用update_history函数来更新任务的历史需求，并返回0。 / if (!account_busy_for_task_demand(rq, p, event)) { if (new_window) / * If the time accounted isn't being accounted as * busy time, and a new window started, only the * previous window need be closed out with the * pre-existing demand. Multiple windows may have * elapsed, but since empty windows are dropped, * it is not necessary to account those. */ update_history(rq, p, wts->sum, 1, event); return 0; }
/*
- 如果当前时间在现有窗口内，
- 则直接调用add_to_task_demand函数来更新任务的需求，
- 并返回更新后的需求值。 / if (!new_window) { /
  - The simple case - busy time contained within the existing
  - window. */ return add_to_task_demand(rq, p, wallclock - mark_start); }
/*
- Busy time spans at least two windows. Temporarily rewind
- window_start to first window boundary after mark_start. / /
- 如果忙碌时间跨越了至少两个窗口，则计算从mark_start到第一个窗口边界的时间差delta，
- 以及完整窗口的数量nr_full_windows。然后，将window_start回退到mark_start之后的首个窗口边界。 */ delta = window_start - mark_start; nr_full_windows = div64_u64(delta, window_size); window_start -= (u64)nr_full_windows * (u64)window_size;
/* Process (window_start - mark_start) first / /
- 首先处理从window_start到mark_start的时间差，并更新任务的需求。 */ runtime = add_to_task_demand(rq, p, window_start - mark_start);
/* Push new sample(s) into task's demand history / /
- 调用update_history函数，将新的样本推送到任务的需求历史中。 / update_history(rq, p, wts->sum, 1, event); /
- 如果存在完整的窗口，
- 则计算每个窗口的缩放执行时间，
- 并更新任务的需求历史和总运行时间。 */ if (nr_full_windows) { u64 scaled_window = scale_exec_time(window_size, rq);
  
  update_history(rq, p, scaled_window, nr_full_windows, event); runtime += nr_full_windows * scaled_window; }
/*
- Roll window_start back to current to process any remainder
- in current window. / /
- 将window_start回退到当前，以便处理当前窗口中的剩余时间。 */ window_start += (u64)nr_full_windows * (u64)window_size;
/* Process (wallclock - window_start) next / /
- 处理从window_start到wallclock的时间差，并更新总运行时间。 */ mark_start = window_start; runtime += add_to_task_demand(rq, p, wallclock - mark_start);
/*
- 返回计算出的总运行时间。 */ return runtime; }

update_history函数分析

定义了一个名为update_history的静态函数，
它接收四个参数：rq是指向rq（runqueue）结构体的指针，代表一个CPU的运行队列；
p是指向task_struct结构体的指针，代表一个任务；
runtime是任务在当前窗口期间的运行时间；
samples是要记录的样本数量；
event是触发更新的事件类型。
eg：在update_task_demand函数中处理多个窗口时，
调用了update_history(rq, p, scaled_window, nr_full_windows, event);
rq，p和event自然不用多说，scaled_window是归一化的窗口时间，nr_full_windows是跨越的窗口个数。 */ static void update_history(struct rq *rq, struct task_struct p, u32 runtime, int samples, int event) { /
- 从任务结构体p中获取一个指向walt_task_struct结构体的指针wts，
- 这个结构体包含了与WALT算法相关的特定于任务的数据。 */ struct walt_task_struct *wts = (struct walt_task_struct ) p->android_vendor_data1; /
- 获取指向任务需求历史记录数组的指针hist。 */ u32 hist = &wts->sum_history[0]; /
- 定义变量ridx和widx，分别用于历史记录数组的读索引和写索引。 / int ridx, widx; /
- 定义变量max、avg、demand和pred_demand，
- 分别用于存储历史记录中的最大值、平均值、需求和预测需求。 / u32 max = 0, avg, demand, pred_demand; /
- 定义变量sum，用于存储历史记录的累积和。 / u64 sum = 0; /
- 定义变量demand_scaled和pred_demand_scaled，
- 用于存储缩放后的需求和预测需求。 / u16 demand_scaled, pred_demand_scaled; /
- 从运行队列结构体rq中获取一个指向walt_rq结构体的指针wrq，
- 这个结构体包含了与WALT算法相关的特定于运行队列的数据。 */ struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1;
/* Ignore windows where task had no activity / / 如果任务在当前窗口期间没有活动（runtime为0）、
- 任务是空闲任务（is_idle_task(p)返回真）或没有样本（samples为0），
- 则跳转到done标签。 */ if (!runtime || is_idle_task(p) || !samples) goto done;
/* Push new 'runtime' value onto stack / /
- 将新的runtime值推入历史记录堆栈。
- 这是通过将现有记录向后移动，并在前面插入新的runtime值来实现的。 */ widx = sched_ravg_hist_size - 1; ridx = widx - samples; for (; ridx >= 0; --widx, --ridx) { hist[widx] = hist[ridx]; sum += hist[widx]; if (hist[widx] > max) max = hist[widx]; }
/*
- 遍历历史记录数组，将新的runtime值插入到数组的前面，并更新sum和max。 */ for (widx = 0; widx < samples && widx < sched_ravg_hist_size; widx++) { hist[widx] = runtime; sum += hist[widx]; if (hist[widx] > max) max = hist[widx]; }
/* 将wts->sum设置为0，这表示在更新历史记录后，需求的累积和被重置。 */ wts->sum = 0;

/*
- 根据系统控制策略（sysctl_sched_window_stats_policy），
- 计算需求（demand）。这可以是最近的运行时间、历史记录中的最大值、
- 平均值或最大值和最近运行时间的最大值。 / if (sysctl_sched_window_stats_policy == WINDOW_STATS_RECENT) { demand = runtime; } else if (sysctl_sched_window_stats_policy == WINDOW_STATS_MAX) { demand = max; } else { avg = div64_u64(sum, sched_ravg_hist_size); if (sysctl_sched_window_stats_policy == WINDOW_STATS_AVG) demand = avg; else demand = max(avg, runtime); } /
- 调用predict_and_update_buckets函数预测并更新任务的需求桶。 / pred_demand = predict_and_update_buckets(p, runtime); /
- 调用scale_demand函数缩放需求和预测需求。 */ demand_scaled = scale_demand(demand); pred_demand_scaled = scale_demand(pred_demand);
/*
- A throttled deadline sched class task gets dequeued without
- changing p->on_rq. Since the dequeue decrements walt stats
- avoid decrementing it here again.
- When window is rolled over, the cumulative window demand
- is reset to the cumulative runnable average (contribution from
- the tasks on the runqueue). If the current task is dequeued
- already, it's demand is not included in the cumulative runnable
- average. So add the task demand separately to cumulative window
- demand. / /
- 如果任务没有使用截止时间调度策略或没有被限制，
- 并且任务已经在运行队列上排队，
- 则调用fixup_walt_sched_stats_common函数来修正调度统计数据。 */ if (!task_has_dl_policy(p) || !p->dl.dl_throttled) { if (task_on_rq_queued(p)) fixup_walt_sched_stats_common(rq, p, demand_scaled, pred_demand_scaled); }
/*
- 更新wts结构体中的demand、demand_scaled、coloc_demand、
- pred_demand和pred_demand_scaled字段。 */ wts->demand = demand; wts->demand_scaled = demand_scaled; wts->coloc_demand = div64_u64(sum, sched_ravg_hist_size); wts->pred_demand = pred_demand; wts->pred_demand_scaled = pred_demand_scaled;
/*
- 如果缩放后的需求大于系统控制的最小任务利用率，
- 则设置wts->unfilter为系统控制的任务未过滤周期；
- 否则，如果wts->unfilter不为0，则减少它。 */ if (demand_scaled > sysctl_sched_min_task_util_for_colocation) wts->unfilter = sysctl_sched_task_unfilter_period; else if (wts->unfilter) wts->unfilter = max_t(int, 0, wts->unfilter - wrq->prev_window_size);
/*
- 调用trace_sched_update_history函数记录更新历史事件。 */ done: trace_sched_update_history(rq, p, runtime, samples, event, wrq, wts); }

add_to_task_demand函数分析

/*
 * add_to_task_demand() - add a scaled execution-time delta to a task's
 * running window sum.
 * @rq:    runqueue handed to scale_exec_time()
 * @p:     task whose WALT sum is updated
 * @delta: raw execution time to account
 *
 * Return: the scaled delta that was added.
 */
static u64 add_to_task_demand(struct rq *rq, struct task_struct *p, u64 delta)
{
	/* WALT per-task data lives in the task's vendor data area. */
	struct walt_task_struct *wts =
		(struct walt_task_struct *) p->android_vendor_data1;

	/* Scale the execution time via scale_exec_time() for this runqueue. */
	delta = scale_exec_time(delta, rq);

	/* Accumulate the scaled delta into the task's running sum. */
	wts->sum += delta;

	/*
	 * Clamp the running sum to the window size. unlikely() marks the
	 * overflow as the rare path.
	 */
	if (unlikely(wts->sum > sched_ravg_window))
		wts->sum = sched_ravg_window;

	/* Report the time delta that was added to the task's demand. */
	return delta;
}

TODO: predict_and_update_buckets函数分析

这是一个内联函数，用于预测任务的CPU需求并更新任务的忙桶。函数接收两个参数：
一个指向task_struct结构体的指针p，表示任务；
一个runtime值，表示任务的忙时间。 */ static inline u32 predict_and_update_buckets( struct task_struct *p, u32 runtime) { int bidx; u32 pred_demand; struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1;

bidx = busy_to_bucket(runtime); pred_demand = get_pred_busy(p, bidx, runtime); bucket_increase(wts->busy_buckets, bidx);

return pred_demand; }

移植高通WALT负载追踪算法到泰山派