背景
基于kernel 6.6
最近在看time子系统时,发现kernel的一套非常有趣的代码调用设计。
基于CONFIG的动态编译链接设计,优点包括:
- 实现了框架和功能解耦
- 降低了编译后可执行代码的大小
time子系统初始化
初始化过程中的调用路径:start_kernel --> time_init --> timer_probe
// drivers/clocksource/timer-probe.c
#include <linux/acpi.h>
#include <linux/init.h>
#include <linux/of.h>
#include <linux/clocksource.h>
/* Start of the timer match table; no C file in this excerpt defines this symbol. */
extern struct of_device_id __timer_of_table[];
/*
 * Entry with no initializer, placed in the "__timer_of_table_end" section.
 * NOTE(review): presumably serves as the all-zero terminator that ends the
 * table walk in timer_probe() - confirm against the linker script.
 */
static const struct of_device_id __timer_of_table_sentinel
__used __section("__timer_of_table_end");
void __init timer_probe(void)
{
struct device_node *np;
const struct of_device_id *match;
of_init_fn_1_ret init_func_ret;
unsigned timers = 0;
int ret;
for_each_matching_node_and_match(np, __timer_of_table, &match) {
if (!of_device_is_available(np))
continue;
init_func_ret = match->data;
ret = init_func_ret(np);
if (ret) {
if (ret != -EPROBE_DEFER)
pr_err("Failed to initialize '%pOF': %d\n", np,
ret);
continue;
}
timers++;
}
...
}
timer_probe通过for_each_matching_node_and_match遍历__timer_of_table,并逐个执行match->data中保存的初始化函数init_func_ret。那么init_func_ret具体指向哪个函数?__timer_of_table又定义在何处?用常规的符号名搜索是找不到答案的。
编译过程
链接脚本文件预留段
vmlinux.lds.S为kernel链接脚本文件,其中提前预留了timer的device id表段
// arch/arm64/kernel/vmlinux.lds.S
SECTIONS
{
...
.init.data : {
INIT_DATA
INIT_SETUP(16)
INIT_CALLS
CON_INITCALL
INIT_RAM_FS
*(.init.altinstructions .init.bss) /* from the EFI stub */
}
.exit.data : {
EXIT_DATA
}
...
INIT_DATA中就包含了timer相关的device id表段:
- 以符号__timer_of_table标记表的起始地址
- 以__timer_of_table_end段(其中存放哨兵项__timer_of_table_sentinel)结尾
// include/asm-generic/vmlinux.lds.h
#define ___OF_TABLE(cfg, name) _OF_TABLE_##cfg(name)
#define __OF_TABLE(cfg, name) ___OF_TABLE(cfg, name)
#define OF_TABLE(cfg, name) __OF_TABLE(IS_ENABLED(cfg), name)
#define _OF_TABLE_0(name)
#define _OF_TABLE_1(name) \
. = ALIGN(8); \
__##name##_of_table = .; \
KEEP(*(__##name##_of_table)) \
KEEP(*(__##name##_of_table_end))
#define TIMER_OF_TABLES() OF_TABLE(CONFIG_TIMER_OF, timer)
/* init and exit section handling */
#define INIT_DATA \
KEEP(*(SORT(___kentry+*))) \
*(.init.data .init.data.*) \
KERNEL_CTORS() \
MCOUNT_REC() \
*(.init.rodata .init.rodata.*) \
...
TIMER_OF_TABLES() \
...
写入timer的init函数表
arm_arch_timer.c在编译阶段,提前把armv7和armv8的device id写入预留的section中
// drivers/clocksource/arm_arch_timer.c
TIMER_OF_DECLARE(armv7_arch_timer, "arm,armv7-timer", arch_timer_of_init);
TIMER_OF_DECLARE(armv8_arch_timer, "arm,armv8-timer", arch_timer_of_init);
// include/linux/clocksource.h
/*
 * Declare an entry for the "timer" table: compat is the compatible string
 * and fn the init function, forwarded to OF_DECLARE_1_RET.
 */
#define TIMER_OF_DECLARE(name, compat, fn) \
OF_DECLARE_1_RET(timer, name, compat, fn)
// include/linux/of.h
/*
 * Stub variant: defines the of_device_id entry __of_table_<name> without a
 * section attribute and marks it unused.
 * Both arms of the conditional in .data yield fn.
 * NOTE(review): the comparison against (fn_type)NULL presumably exists only
 * so the compiler checks fn's type - confirm.
 */
#define _OF_DECLARE_STUB(table, name, compat, fn, fn_type) \
static const struct of_device_id __of_table_##name \
__attribute__((unused)) \
= { .compatible = compat, \
.data = (fn == (fn_type)NULL) ? fn : fn }
// Builds the of_device_id entry; its .data field holds the address of the init function
/*
 * With CONFIG_OF defined and MODULE not defined, the entry is marked __used
 * and placed in the section "__<table>_of_table", aligned like
 * struct of_device_id. Otherwise _OF_DECLARE falls back to the stub above.
 */
#if defined(CONFIG_OF) && !defined(MODULE)
#define _OF_DECLARE(table, name, compat, fn, fn_type) \
static const struct of_device_id __of_table_##name \
__used __section("__" #table "_of_table") \
__aligned(__alignof__(struct of_device_id)) \
= { .compatible = compat, \
.data = (fn == (fn_type)NULL) ? fn : fn }
#else
#define _OF_DECLARE(table, name, compat, fn, fn_type) \
_OF_DECLARE_STUB(table, name, compat, fn, fn_type)
#endif
/* Init function taking two device nodes and returning int. */
typedef int (*of_init_fn_2)(struct device_node *, struct device_node *);
/* Init function taking one device node and returning int; the type timer_probe() calls through. */
typedef int (*of_init_fn_1_ret)(struct device_node *);
/* Init function taking one device node and returning nothing. */
typedef void (*of_init_fn_1)(struct device_node *);
/*
 * Table-registration wrappers, one per init-function signature. The last
 * argument names the function-pointer type that fn is compared against in
 * the entry's initializer.
 * TIMER_OF_DECLARE expands to OF_DECLARE_1_RET, which matches the
 * of_init_fn_1_ret pointer that timer_probe() reads back from match->data.
 */
#define OF_DECLARE_1(table, name, compat, fn) \
_OF_DECLARE(table, name, compat, fn, of_init_fn_1)
#define OF_DECLARE_1_RET(table, name, compat, fn) \
_OF_DECLARE(table, name, compat, fn, of_init_fn_1_ret)
init段内存回收
.init.*相关的段所占用的内存,在启动结束后会被回收利用,一点也不浪费。
SECTIONS
{
...
. = ALIGN(SEGMENT_ALIGN);
__init_begin = .;
__inittext_begin = .;
INIT_TEXT_SECTION(8)
__exittext_begin = .;
.exit.text : {
EXIT_TEXT
}
__exittext_end = .;
. = ALIGN(4);
.altinstructions : {
__alt_instructions = .;
*(.altinstructions)
__alt_instructions_end = .;
}
UNWIND_DATA_SECTIONS
. = ALIGN(SEGMENT_ALIGN);
__inittext_end = .;
__initdata_begin = .;
init_idmap_pg_dir = .;
. += INIT_IDMAP_DIR_SIZE;
init_idmap_pg_end = .;
.init.data : {
INIT_DATA
INIT_SETUP(16)
INIT_CALLS
CON_INITCALL
INIT_RAM_FS
*(.init.altinstructions .init.bss) /* from the EFI stub */
}
.exit.data : {
EXIT_DATA
}
PERCPU_SECTION(L1_CACHE_BYTES)
HYPERVISOR_PERCPU_SECTION
HYPERVISOR_RELOC_SECTION
.rela.dyn : ALIGN(8) {
__rela_start = .;
*(.rela .rela*)
__rela_end = .;
}
.relr.dyn : ALIGN(8) {
__relr_start = .;
*(.relr.dyn)
__relr_end = .;
}
. = ALIGN(SEGMENT_ALIGN);
__initdata_end = .;
__init_end = .;
...
}
kernel_init函数的最后,调用free_initmem回收init相关段所占用的内存。
// init/main.c
static int __ref kernel_init(void *unused)
{
int ret;
/*
* Wait until kthreadd is all set-up.
*/
wait_for_completion(&kthreadd_done);
kernel_init_freeable();
/* need to finish all async __init code before freeing the memory */
async_synchronize_full();
system_state = SYSTEM_FREEING_INITMEM;
kprobe_free_init_mem();
ftrace_free_init_mem();
kgdb_free_init_mem();
exit_boot_config();
free_initmem();
...
}
// arch/arm64/mm/init.c
/*
 * Release the memory between __init_begin and __init_end.
 * The range is first handed to free_reserved_area() through its lm_alias()
 * addresses, with POISON_FREE_INITMEM as the poison argument and
 * "unused kernel" as the label; the __init_begin..__init_end range itself
 * is then passed to vunmap_range().
 */
void free_initmem(void)
{
free_reserved_area(lm_alias(__init_begin),
lm_alias(__init_end),
POISON_FREE_INITMEM, "unused kernel");
/*
* Unmap the __init region but leave the VM area in place. This
* prevents the region from being reused for kernel modules, which
* is not supported by kallsyms.
*/
vunmap_range((u64)__init_begin, (u64)__init_end);
}
demo代码实现类似机制
首先编写C代码:
- 将函数地址放在自定义的段中
- 主函数中执行各个函数指针,打印函数指针地址
#include <stdio.h>
/* Signature shared by every callback registered in the .myown section. */
typedef void (*myown_call)(void);
/* Boundary symbols of the .myown section; defined by the linker script. */
extern myown_call _myown_start;
extern myown_call _myown_end;
/*
 * func_init(func) defines a function-pointer variable fn_<func>, initialised
 * to func, and places that variable (not the function's code) in .myown.
 * "used" makes the compiler emit the variable even though no C code
 * references it by name; "unused" would only silence the warning.
 */
#define _init __attribute__((used, section(".myown")))
#define func_init(func) myown_call fn_##func _init = func
/* Demo callback 1: writes "mspec1!" followed by a newline to stdout. */
static void mspec1(void)
{
    fputs("mspec1!\n", stdout);
}
/* Demo callback 2: writes "mspec2!" followed by a newline to stdout. */
static void mspec2(void)
{
    fputs("mspec2!\n", stdout);
}
/* Demo callback 3: writes "mspec3!" followed by a newline to stdout. */
static void mspec3(void)
{
    fputs("mspec3!\n", stdout);
}
/*
 * Register the three callbacks: each line defines a pointer variable
 * fn_<name> that is placed in the .myown section.
 */
func_init(mspec1);
func_init(mspec2);
func_init(mspec3);
/*
 * Walk the .myown section and invoke every registered callback in order.
 *
 * _myown_start and _myown_end are the section boundary symbols; the function
 * pointers placed there by func_init() lie between them. The address of each
 * slot is printed to stderr before the callback stored in it is called.
 */
void do_initcalls(void) {
    /* Test the bound before the first call so an empty section is a no-op. */
    for (myown_call *call_ptr = &_myown_start; call_ptr < &_myown_end; ++call_ptr) {
        /* %p requires a void pointer argument. */
        fprintf(stderr, "call_ptr: %p\n", (void *)call_ptr);
        (*call_ptr)();
    }
}
/* Program entry point: runs do_initcalls() and then returns 0. */
int main(void) {
do_initcalls();
return 0;
}
然后修改链接脚本文件
ld --verbose > test.lds
获取到默认链接脚本文件后,在__bss_start之前加入自定义的段信息
...
. = .;
/*
 * Align before taking the start address: the output section is aligned to
 * its entries (8 bytes for pointers on 64-bit targets), so an unaligned
 * location counter would leave padding between _myown_start and the first
 * entry.
 */
. = ALIGN(8);
_myown_start = .;
/* KEEP stops --gc-sections from discarding entries that nothing references. */
.myown : { KEEP(*(.myown)) }
_myown_end = .;
__bss_start = .;
.bss :
...
注意:ld --verbose的输出首尾包含版本信息和分隔线,test.lds中只保留两条"====="分隔线之间的链接脚本内容
GNU ld (GNU Binutils for Ubuntu) 2.38
Supported emulations:
elf_x86_64
elf32_x86_64
elf_i386
elf_iamcu
elf_l1om
elf_k1om
i386pep
i386pe
using internal linker script:
==================================================
// 仅保留中间部分内容,并修改增加自定义段
==================================================
最后编译出可执行文件
gcc -c main.c -o main.o
gcc -T test.lds main.o -o test
我们可以通过下面命令,查看自定义的段信息
readelf -S test
或者
readelf -S main.o