kernel基于CONFIG的动态编译链接机制

29 阅读4分钟

背景

基于kernel 6.6
最近在看time子系统时,发现kernel的一套非常有趣的代码调用设计。

基于CONFIG的动态编译链接设计,优点包括:

  • 实现了框架和功能解耦
  • 降低了编译后可执行代码的大小

time子系统初始化

初始化过程中的调用路径:start_kernel --> time_init --> timer_probe

// drivers/clocksource/timer-probe.c
#include <linux/acpi.h>
#include <linux/init.h>
#include <linux/of.h>
#include <linux/clocksource.h>

extern struct of_device_id __timer_of_table[];

static const struct of_device_id __timer_of_table_sentinel
	__used __section("__timer_of_table_end");

/*
 * Walk every node that matches an entry in __timer_of_table and run the
 * init function stored in that entry's .data field. Nodes whose init
 * function returns 0 are counted in `timers`.
 */
void __init timer_probe(void)
{
	struct device_node *np;
	const struct of_device_id *match;
	of_init_fn_1_ret init_func_ret;
	unsigned timers = 0;
	int ret;

	for_each_matching_node_and_match(np, __timer_of_table, &match) {
		/* Skip nodes that of_device_is_available() rejects. */
		if (!of_device_is_available(np))
			continue;

		/* The matched table entry carries the init callback in .data. */
		init_func_ret = match->data;

		ret = init_func_ret(np);
		if (ret) {
			/* -EPROBE_DEFER is skipped silently; other errors are logged. */
			if (ret != -EPROBE_DEFER)
				pr_err("Failed to initialize '%pOF': %d\n", np,
				       ret);
			continue;
		}

		timers++;
	}
	...
}

timer_probe中通过for_each_matching_node_and_match遍历匹配的节点,并执行匹配表项中保存的init_func_ret。那么init_func_ret具体指向哪个函数?__timer_of_table又指向何处?按常规的符号名搜索是找不到其定义的,因为它并不是在某个C文件中定义的数组,而是由链接脚本定义的符号。

编译过程

链接脚本文件预留段

vmlinux.lds.S为kernel链接脚本文件,其中提前预留了timer的device id表段

// arch/arm64/kernel/vmlinux.lds.S
SECTIONS
{
    ...
	.init.data : {
		INIT_DATA
		INIT_SETUP(16)
		INIT_CALLS
		CON_INITCALL
		INIT_RAM_FS
		*(.init.altinstructions .init.bss)	/* from the EFI stub */
	}
	.exit.data : {
		EXIT_DATA
	}
    ...

INIT_DATA中就包含了timer相关的device id表段(仅当CONFIG_TIMER_OF使能时,OF_TABLE才会展开为_OF_TABLE_1并生成该段;否则展开为空的_OF_TABLE_0,链接脚本中不会出现这张表):

  • 以__timer_of_table开头
  • 以__timer_of_table_end结尾
// include/asm-generic/vmlinux.lds.h
/*
 * OF_TABLE(cfg, name) first expands IS_ENABLED(cfg), then pastes the result
 * onto _OF_TABLE_, which selects either _OF_TABLE_1 or _OF_TABLE_0. The
 * extra __OF_TABLE level exists so that the argument is fully expanded
 * before the ## paste in ___OF_TABLE.
 */
#define ___OF_TABLE(cfg, name)	_OF_TABLE_##cfg(name)
#define __OF_TABLE(cfg, name)	___OF_TABLE(cfg, name)
#define OF_TABLE(cfg, name)	__OF_TABLE(IS_ENABLED(cfg), name)
/* Selected for 0: expands to nothing, so no table is emitted. */
#define _OF_TABLE_0(name)
/*
 * Selected for 1: define __<name>_of_table at the current (8-byte aligned)
 * location, then keep all __<name>_of_table input sections followed by the
 * __<name>_of_table_end input section.
 */
#define _OF_TABLE_1(name)						\
	. = ALIGN(8);							\
	__##name##_of_table = .;					\
	KEEP(*(__##name##_of_table))					\
	KEEP(*(__##name##_of_table_end))

/* The timer table is gated by CONFIG_TIMER_OF. */
#define TIMER_OF_TABLES()	OF_TABLE(CONFIG_TIMER_OF, timer)

/* init and exit section handling */
#define INIT_DATA							\
	KEEP(*(SORT(___kentry+*)))					\
	*(.init.data .init.data.*)					\
	KERNEL_CTORS()							\
	MCOUNT_REC()							\
	*(.init.rodata .init.rodata.*)					\
	...
	TIMER_OF_TABLES()						\
	...

写入timer的init函数表

arm_arch_timer.c在编译阶段,提前把armv7和armv8的device id写入预留的section中

// drivers/clocksource/arm_arch_timer.c
TIMER_OF_DECLARE(armv7_arch_timer, "arm,armv7-timer", arch_timer_of_init);
TIMER_OF_DECLARE(armv8_arch_timer, "arm,armv8-timer", arch_timer_of_init);


// include/linux/clocksource.h
#define TIMER_OF_DECLARE(name, compat, fn) \
	OF_DECLARE_1_RET(timer, name, compat, fn)


// include/linux/of.h
/*
 * Fallback form: builds the same of_device_id entry but marks it unused and
 * does not place it in any named section. The (fn == (fn_type)NULL)
 * comparison selects fn on both branches, so it only serves to compare fn
 * against fn_type.
 */
#define _OF_DECLARE_STUB(table, name, compat, fn, fn_type)		\
	static const struct of_device_id __of_table_##name		\
		__attribute__((unused))					\
		 = { .compatible = compat,				\
		     .data = (fn == (fn_type)NULL) ? fn : fn }

/*
 * Build the of_device_id entry; its .data field holds the address of the
 * init function. When CONFIG_OF is defined and MODULE is not, the entry is
 * placed in the "__<table>_of_table" section; otherwise the stub form is
 * used instead.
 */
#if defined(CONFIG_OF) && !defined(MODULE)
#define _OF_DECLARE(table, name, compat, fn, fn_type)			\
	static const struct of_device_id __of_table_##name		\
		__used __section("__" #table "_of_table")		\
		__aligned(__alignof__(struct of_device_id))		\
		 = { .compatible = compat,				\
		     .data = (fn == (fn_type)NULL) ? fn : fn  }
#else
#define _OF_DECLARE(table, name, compat, fn, fn_type)			\
	_OF_DECLARE_STUB(table, name, compat, fn, fn_type)
#endif

typedef int (*of_init_fn_2)(struct device_node *, struct device_node *);
typedef int (*of_init_fn_1_ret)(struct device_node *);
typedef void (*of_init_fn_1)(struct device_node *);

/*
 * Per-signature wrappers around _OF_DECLARE: each one fixes the function
 * pointer type that fn is compared against.
 */
#define OF_DECLARE_1(table, name, compat, fn) \
		_OF_DECLARE(table, name, compat, fn, of_init_fn_1)
/* The variant TIMER_OF_DECLARE expands to: fn must match of_init_fn_1_ret. */
#define OF_DECLARE_1_RET(table, name, compat, fn) \
		_OF_DECLARE(table, name, compat, fn, of_init_fn_1_ret)
#define OF_DECLARE_2(table, name, compat, fn) \
		_OF_DECLARE(table, name, compat, fn, of_init_fn_2)

init段内存回收

.init.*相关的段只在启动阶段使用,其占用的内存在启动结束后会被回收利用,一点也不浪费。

SECTIONS
{
	...

	. = ALIGN(SEGMENT_ALIGN);
	__init_begin = .;
	__inittext_begin = .;

	INIT_TEXT_SECTION(8)

	__exittext_begin = .;
	.exit.text : {
		EXIT_TEXT
	}
	__exittext_end = .;

	. = ALIGN(4);
	.altinstructions : {
		__alt_instructions = .;
		*(.altinstructions)
		__alt_instructions_end = .;
	}

	UNWIND_DATA_SECTIONS

	. = ALIGN(SEGMENT_ALIGN);
	__inittext_end = .;
	__initdata_begin = .;

	init_idmap_pg_dir = .;
	. += INIT_IDMAP_DIR_SIZE;
	init_idmap_pg_end = .;

	.init.data : {
		INIT_DATA
		INIT_SETUP(16)
		INIT_CALLS
		CON_INITCALL
		INIT_RAM_FS
		*(.init.altinstructions .init.bss)	/* from the EFI stub */
	}
	.exit.data : {
		EXIT_DATA
	}

	PERCPU_SECTION(L1_CACHE_BYTES)
	HYPERVISOR_PERCPU_SECTION

	HYPERVISOR_RELOC_SECTION

	.rela.dyn : ALIGN(8) {
		__rela_start = .;
		*(.rela .rela*)
		__rela_end = .;
	}

	.relr.dyn : ALIGN(8) {
		__relr_start = .;
		*(.relr.dyn)
		__relr_end = .;
	}

	. = ALIGN(SEGMENT_ALIGN);
	__initdata_end = .;
	__init_end = .;

    ...
}

kernel_init函数在内核初始化的收尾阶段(启动用户态init进程之前),调用free_initmem回收init相关段所占用的内存。

// init/main.c
/*
 * Excerpt of kernel_init(): once kernel_init_freeable() and the async
 * barrier have completed, it switches system_state to
 * SYSTEM_FREEING_INITMEM and releases init memory via free_initmem().
 */
static int __ref kernel_init(void *unused)
{
	int ret;

	/*
	 * Wait until kthreadd is all set-up.
	 */
	wait_for_completion(&kthreadd_done);

	kernel_init_freeable();
	/* need to finish all async __init code before freeing the memory */
	async_synchronize_full();

	system_state = SYSTEM_FREEING_INITMEM;
	/* Subsystem-specific init-memory teardown runs before the generic free. */
	kprobe_free_init_mem();
	ftrace_free_init_mem();
	kgdb_free_init_mem();
	exit_boot_config();
	free_initmem();
    ...
}

// arch/arm64/mm/init.c
/*
 * Free everything between __init_begin and __init_end: the range is handed
 * to free_reserved_area() and then unmapped with vunmap_range().
 */
void free_initmem(void)
{
	/* NOTE(review): lm_alias() presumably yields the linear-map alias of each symbol address — confirm. */
	free_reserved_area(lm_alias(__init_begin),
			   lm_alias(__init_end),
			   POISON_FREE_INITMEM, "unused kernel");
	/*
	 * Unmap the __init region but leave the VM area in place. This
	 * prevents the region from being reused for kernel modules, which
	 * is not supported by kallsyms.
	 */
	vunmap_range((u64)__init_begin, (u64)__init_end);
}

demo代码实现类似机制

首先编写C代码:

  • 将函数地址放在自定义的段中
  • 主函数中执行各个函数指针,打印函数指针地址
#include <stdio.h>

/* Type of the callbacks collected in the .myown section. */
typedef void (*myown_call)(void);

/*
 * Bounds of the .myown section. Both symbols are defined by the linker
 * script, not by any C file. Declaring them as arrays lets the code use
 * their addresses directly, without taking & of a fake scalar object.
 */
extern myown_call _myown_start[];
extern myown_call _myown_end[];

/*
 * Place a pointer to func (not the function body itself) in .myown.
 * "used" forces the compiler to emit the pointer even though no C code
 * references it; "unused" would only silence a warning.
 */
#define MYOWN_ENTRY __attribute__((used, section(".myown")))
#define func_init(func) static myown_call fn_##func MYOWN_ENTRY = func

static void mspec1(void) {
    printf("mspec1!\n");
}

static void mspec2(void) {
    printf("mspec2!\n");
}

static void mspec3(void) {
    printf("mspec3!\n");
}

func_init(mspec1);
func_init(mspec2);
func_init(mspec3);

/*
 * Call every function pointer stored between _myown_start and _myown_end,
 * printing the address of each slot to stderr first. The bound is checked
 * before the first call, so an empty section results in no calls at all.
 */
void do_initcalls(void) {
    for (myown_call *call_ptr = _myown_start; call_ptr < _myown_end; ++call_ptr) {
        /* %p requires a void * argument. */
        fprintf(stderr, "call_ptr: %p\n", (void *)call_ptr);
        (*call_ptr)();
    }
}

int main(void) {
    do_initcalls();
    return 0;
}

然后修改链接脚本文件

ld --verbose > test.lds

获取到默认链接脚本文件后,在__bss_start之前加入自定义的段信息

  ...
  . = .;
  /* Define the bounds inside the output section so that they follow the
     section's alignment; KEEP protects the entries from --gc-sections. */
  .myown          :
  {
    _myown_start = .;
    KEEP (*(.myown))
    _myown_end = .;
  }
  __bss_start = .;
  .bss            :
  ...

注意:ld --verbose的输出除了链接脚本本身,还带有首尾的说明信息(如下所示)。test.lds中只能保留两行"====="分隔线之间的链接脚本内容,其余行都需要删除:

GNU ld (GNU Binutils for Ubuntu) 2.38
  Supported emulations:
   elf_x86_64
   elf32_x86_64
   elf_i386
   elf_iamcu
   elf_l1om
   elf_k1om
   i386pep
   i386pe
using internal linker script:
==================================================
// 仅保留中间部分内容,并修改增加自定义段
==================================================

最后编译出可执行文件

gcc -c main.c -o main.o
gcc -T test.lds main.o -o test

image.png

我们可以通过下面命令,查看自定义的段信息

readelf -S test
或者
readelf -S main.o

image.png