Linux ARM64平台上Hook系统调用(以openat为例)

3,860 阅读4分钟

 我之前已经有几篇关于linux下hook系统调用的文章1,2,3,都是基于x86_64平台的,本文将会先介绍下如何在arm64平台下hook系统调用,最后会手撸个简单的例子。

本文实验环境是在银河麒麟服务器V10系统上:

[root@localhost ~]# cat /etc/os-release 
NAME="Kylin Linux Advanced Server"
VERSION="V10 (Tercel)"
ID="kylin"
VERSION_ID="V10"
PRETTY_NAME="Kylin Linux Advanced Server V10 (Tercel)"
ANSI_COLOR="0;31"

内核版本为 4.19.90-17.ky10.aarch64,基于aarch64平台:

[root@localhost ~]# cat /proc/version 
Linux version 4.19.90-17.ky10.aarch64 (YHKYLIN-OS@localhost.localdomain) (gcc version 7.3.0 (GCC)) #1 SMP Sun Jun 28 14:27:40 CST 2020
[root@localhost ~]# uname -a
Linux localhost.localdomain 4.19.90-17.ky10.aarch64 #1 SMP Sun Jun 28 14:27:40 CST 2020 aarch64 aarch64 aarch64 GNU/Linux
[root@localhost ~]# 

硬件环境为:

[root@localhost ~]# lscpu 
架构:                           aarch64
CPU 运行模式:                   64-bit
字节序:                         Little Endian
CPU:                             4
在线 CPU 列表:                  0-3
每个核的线程数:                 1
每个座的核数:                   4
座:                             1
NUMA 节点:                      1
厂商 ID:                        HiSilicon
型号:                           0
型号名称:                       Kunpeng-920
步进:                           0x1
CPU 最大 MHz:                   2600.0000
CPU 最小 MHz:                   200.0000
BogoMIPS:                       200.00
L1d 缓存:                       256 KiB
L1i 缓存:                       256 KiB
L2 缓存:                        2 MiB
L3 缓存:                        32 MiB
NUMA 节点0 CPU:                 0-3
...

其实同x86_64下hook系统调用方法大致一样,可以看这篇文章

1、获取系统调用表首地址;

2、将待hook的系统调用在系统调用表中的地址替换为自定义的函数;

3、实现自定义的函数。

难点跟不同点就在第二点上,因为系统调用表位于内核的只读内存区域,无法直接修改。

在x86下,可以通过清除cr0寄存器中的WP(写保护)位来暂时关闭写保护(有些内核版本还提供了读写cr0寄存器的相关函数),这样就可以修改该只读区域;在替换完成后,再将WP位恢复即可。

但是在arm64下,并没有cr0寄存器,如何来修改该只读区域的值呢?

一、确定系统调用表所在内存区域

1、整个内核只读内存区域

因为系统调用表是在内核的只读区域,我们可以通过修改整个只读区域的读写属性来完成。

即为内核的__start_rodata跟__end_rodata之间的区域。

[root@localhost ~]# grep -nr rodata /boot/System.map-4.19.90-17.ky10.aarch64 
...
39608:ffff000008a30000 D __start_rodata
...
65911:ffff000008f30000 R __end_rodata
...
[root@localhost ~]# 

2、指定系统调用地址所在内存区域

只修改具体系统调用表中对应系统调用的指针区域读写属性。

比如只修改sys_call_table_ptr+__NR_openat所在的页面的读写属性。

二、修改该内存区域读写属性

1、尝试使用set_memory_ro/rw函数修改只读属性

我们从内核源码实现入手,看下代码具体做了什么,如下:

/arch/arm64/mm/pageattr.c

/*
 * Make numpages pages starting at addr read-only by asking
 * change_memory_common() to set PTE_RDONLY and clear PTE_WRITE.
 * Returns whatever change_memory_common() returns.
 */
int set_memory_ro(unsigned long addr, int numpages)
{
	return change_memory_common(addr, numpages,
					__pgprot(PTE_RDONLY),
					__pgprot(PTE_WRITE));
}

/*
 * Make numpages pages starting at addr writable: the mirror image of
 * set_memory_ro(), setting PTE_WRITE and clearing PTE_RDONLY.
 * Returns whatever change_memory_common() returns.
 */
int set_memory_rw(unsigned long addr, int numpages)
{
	return change_memory_common(addr, numpages,
					__pgprot(PTE_WRITE),
					__pgprot(PTE_RDONLY));
}

可以看出set_memory_ro/rw函数都是调用了change_memory_common函数,在同一源文件下,change_memory_common的代码实现如下:

/*
 * Apply set_mask/clear_mask to every page table entry covering
 * [start, start + size) in init_mm, then flush the kernel TLB for that
 * range. Returns the result of apply_to_page_range().
 *
 * This function assumes that the range is mapped with PAGE_SIZE pages.
 */
static int __change_memory_common(unsigned long start, unsigned long size,
				pgprot_t set_mask, pgprot_t clear_mask)
{
	struct page_change_data data;
	int ret;

	/* Bundle both masks so the per-page callback receives them together. */
	data.set_mask = set_mask;
	data.clear_mask = clear_mask;

	ret = apply_to_page_range(&init_mm, start, size, change_page_range,
					&data);

	/* The flush is issued regardless of the value of ret. */
	flush_tlb_kernel_range(start, start + size);
	return ret;
}

/*
 * Common worker behind set_memory_ro()/set_memory_rw(): validates that
 * [addr, addr + numpages * PAGE_SIZE) lies inside a single VM_ALLOC area
 * and then applies set_mask/clear_mask to it.
 *
 * Returns -EINVAL when the range is not covered by such an area, 0 when
 * numpages is 0, otherwise the result of __change_memory_common().
 */
static int change_memory_common(unsigned long addr, int numpages,
				pgprot_t set_mask, pgprot_t clear_mask)
{
	unsigned long start = addr;
	unsigned long size = PAGE_SIZE * numpages;
	unsigned long end = start + size;
	struct vm_struct *area;
	int i;

	/* An unaligned addr is rounded down to its page and warned about once. */
	if (!PAGE_ALIGNED(addr)) {
		start &= PAGE_MASK;
		end = start + size;
		WARN_ON_ONCE(1);
	}

	/*
	 * Kernel VA mappings are always live, and splitting live section
	 * mappings into page mappings may cause TLB conflicts. This means
	 * we have to ensure that changing the permission bits of the range
	 * we are operating on does not result in such splitting.
	 *
	 * Let's restrict ourselves to mappings created by vmalloc (or vmap).
	 * Those are guaranteed to consist entirely of page mappings, and
	 * splitting is never needed.
	 *
	 * So check whether the [addr, addr + size) interval is entirely
	 * covered by precisely one VM area that has the VM_ALLOC flag set.
	 */
	area = find_vm_area((void *)addr);
	if (!area ||
	    end > (unsigned long)area->addr + area->size ||
	    !(area->flags & VM_ALLOC))
		return -EINVAL;

	/* Note: the area check above runs even for a zero-page request. */
	if (!numpages)
		return 0;

	/*
	 * If we are manipulating read-only permissions, apply the same
	 * change to the linear mapping of the pages that back this VM area.
	 */
	if (rodata_full && (pgprot_val(set_mask) == PTE_RDONLY ||
			    pgprot_val(clear_mask) == PTE_RDONLY)) {
		for (i = 0; i < area->nr_pages; i++) {
			__change_memory_common((u64)page_address(area->pages[i]),
					       PAGE_SIZE, set_mask, clear_mask);
		}
	}

	/*
	 * Get rid of potentially aliasing lazily unmapped vm areas that may
	 * have permissions set that deviate from the ones we are setting here.
	 */
	vm_unmap_aliases();

	return __change_memory_common(start, size, set_mask, clear_mask);
}

我们看到了如下注释:

    /*
* Kernel VA mappings are always live, and splitting live section
* mappings into page mappings may cause TLB conflicts. This means
* we have to ensure that changing the permission bits of the range
* we are operating on does not result in such splitting.
*
* Let's restrict ourselves to mappings created by vmalloc (or vmap).
* Those are guaranteed to consist entirely of page mappings, and
* splitting is never needed.
*
* So check whether the [addr, addr + size) interval is entirely
* covered by precisely one VM area that has the VM_ALLOC flag set.
*/

还有这段代码:

	if (!area ||
	    end > (unsigned long)area->addr + area->size ||
	    !(area->flags & VM_ALLOC))
		return -EINVAL;

从这段注释及下面的代码中可以看出,set_memory_ro/rw只适用于由vmalloc或者vmap创建的虚拟内存映射区域,但是系统调用表位于内核只读区域,并非由vmalloc或vmap创建的虚拟内存区域,所以set_memory_ro/rw函数无法使用(会直接返回-EINVAL)。

2、尝试使用update_mapping_prot修改

/arch/arm64/mm/mmu.c

/*
 * Re-create the init_mm mapping of [virt, virt + size) -> phys with the
 * protection bits in prot, then flush the kernel TLB for that range.
 * Addresses in [PAGE_END, VMALLOC_START) are rejected with a warning and
 * left untouched.
 */
static void update_mapping_prot(phys_addr_t phys, unsigned long virt,
				phys_addr_t size, pgprot_t prot)
{
	if ((virt >= PAGE_END) && (virt < VMALLOC_START)) {
		pr_warn("BUG: not updating mapping for %pa at 0x%016lx - outside kernel range\n",
			&phys, virt);
		return;
	}

	/* NULL is passed as the page-table allocator argument. */
	__create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, NULL,
			     NO_CONT_MAPPINGS);

	/* flush the TLBs after updating live kernel mappings */
	flush_tlb_kernel_range(virt, virt + size);
}

经过实验测试,update_mapping_prot可以成功修改系统调用表所在内存区域的读写属性。

三、代码示例

1、定义相关变量

/* Pointer through which the module calls the kernel's update_mapping_prot(). */
void (*update_mapping_prot)(phys_addr_t phys, unsigned long virt,
                            phys_addr_t size, pgprot_t prot);

/* Start and end addresses of the region whose protection is changed. */
unsigned long start_rodata;
unsigned long end_rodata;

/* Length in bytes of [start_rodata, end_rodata). */
#define section_size  (end_rodata - start_rodata)

2、通过kallsyms_lookup_name获取

update_mapping_prot = (void *)kallsyms_lookup_name("update_mapping_prot");
start_rodata = (unsigned long)kallsyms_lookup_name("__start_rodata");
end_rodata= (unsigned long)kallsyms_lookup_name("__end_rodata");
printk("%s. update_mapping_prot:%lx, start_rodata:%lx, end_rodata:%lx.\n", update_mapping_prot, start_rodata, end_rodata);

注:如果用kallsyms_lookup_name获取不到__start_rodata、__end_rodata,可以从System.map文件(如/boot/System.map-4.19.90-17.ky10.aarch64)中读取,具体方法参见这篇文章中第二部分

3、通过update_mapping_prot实现系统调用表区域只读属性改写

/*
 * Remap section_size bytes starting at start_rodata with PAGE_KERNEL
 * so that the syscall table inside that range can be written.
 */
static void disable_wirte_protection(void)
{
    unsigned long virt = (unsigned long)start_rodata;

    update_mapping_prot(__pa_symbol(start_rodata), virt, section_size, PAGE_KERNEL);
}

/*
 * Remap section_size bytes starting at start_rodata with PAGE_KERNEL_RO,
 * restoring write protection after the syscall table has been patched.
 *
 * Takes no arguments: the former "int val" parameter was never used and
 * every call site invokes this function as enable_wirte_protection().
 */
static void enable_wirte_protection(void)
{
    update_mapping_prot(__pa_symbol(start_rodata), (unsigned long)start_rodata, section_size, PAGE_KERNEL_RO);
}

4、保存系统原有的openat系统调用入口地址

/* Remember the current openat table entry so it can be called and later restored. */
old_openat_func = (openat_t)sys_call_table_ptr[__NR_openat];

5、替换系统调用入口为自定义函数

/* Keep preemption off for the short window in which the region is writable. */
preempt_disable();
disable_wirte_protection();

/* Point the openat slot of the syscall table at the replacement handler. */
sys_call_table_ptr[__NR_openat] = (openat_t)my_stub_openat;

/* Put the read-only mapping back before re-enabling preemption. */
enable_wirte_protection();
preempt_enable();

6、自定义系统调用实现

asmlinkage long my_stub_openat(const struct pt_regs *pt_regs)
{
        atomic_inc(&ref_count);
        long value = -1;
        char kfilename[80] = {0};

        int dfd = (int)p_regs->regs[0];
        char __user *filename = (char*)p_regs->regs[1];
        int flags = (int)p_regs->regs[2];
        int mode = (int)p_regs->regs[3];

        value = old_openat_func(pt_regs);

        copy_from_user(kfilename, filename, 80);
        printk("%s. process:[%d:%s] open file:%s.\n\t-----> open flags:0x%0x, open %s, fd:%d.\n", __FUNCTION__,
           current->tgid, current->group_leader->comm, kfilename, flags, value>=0?"sucess":"fail", value);

openat_return:
        atomic_dec(&ref_count);
        return value;
}

这里,我们hook成功后,只是打印执行openat的进程及其要打开的文件等相关信息。

7、在模块卸载的时候,替换回原来的系统调用地址,不然系统会崩溃

/*
 * Undo the hook: with preemption disabled and the region temporarily
 * writable, put the saved original handler back into the openat slot,
 * but only if that slot still points at my_stub_openat.
 *
 * NOTE(review): this does not wait for ref_count to drop to zero;
 * presumably the caller handles that before the module text goes away.
 */
static void patch_cleanup(void)
{
        preempt_disable();
        disable_wirte_protection();

        if (sys_call_table_ptr[__NR_openat] == my_stub_openat) {
                sys_call_table_ptr[__NR_openat] = old_openat_func;
        }

        enable_wirte_protection();
        preempt_enable();
}

8、编译后,加载模块

运行结果截图:

9、后记

最近在uos20 desktop for arm64 专业版系统上测试时发现,禁用写保护时失败,内核报错如下:

[ 5 18 14:54:08 2021] kernel BUG at arch/arm64/mm/mmu.c:152!
[ 5 18 14:54:08 2021] Internal error: Oops - BUG: 0 [#1] SMP
[ 5 18 14:54:08 2021] Modules linked in: lkm4arm64(O+) bluetooth ecdh_generic fuse st cfg80211 rfkill firmware_class nls_iso8859_1 nls_cp437 aes_ce_blk crypto_simd cryptd aes_ce_cipher crc32_ce crct10dif_ce ghash_ce aes_arm64 sha2_ce sha256_arm64 sha1_ce virtio_balloon qemu_fw_cfg binder_linux(O) ashmem_linux(O) efivarfs virtio_rng ip_tables x_tables btrfs xor raid6_pq hid_generic usbkbd usbmouse usbhid rtc_efi virtio_blk virtio_scsi virtio_net net_failover failover button virtio_mmio [last unloaded: lkm4arm64]
[ 5 18 14:54:08 2021] Process insmod (pid: 1535, stack limit = 0x000000005ac3dde5)
[ 5 18 14:54:08 2021] CPU: 1 PID: 1535 Comm: insmod Tainted: G           O      4.19.0-arm64-desktop #3100
[ 5 18 14:54:08 2021] Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015
[ 5 18 14:54:08 2021] pstate: 60400005 (nZCv daif +PAN -UAO)
[ 5 18 14:54:08 2021] pc : alloc_init_pud+0x518/0x550
[ 5 18 14:54:08 2021] lr : alloc_init_pud+0x494/0x550
[ 5 18 14:54:08 2021] sp : ffffad7ce3fefa80
[ 5 18 14:54:08 2021] x29: ffffad7ce3fefa80 x28: ffff7dfffe637fa8 
[ 5 18 14:54:08 2021] x27: ffff3d7d5b5c1fff x26: 0000000087a00000 
[ 5 18 14:54:08 2021] x25: ffff3d7d5b000000 x24: 0068000000000f13 
[ 5 18 14:54:08 2021] x23: ffff3d7d5b400000 x22: ffff3d7d5b5c2000 
[ 5 18 14:54:08 2021] x21: ffff7dfffe6386d0 x20: ffff3d7d5b5c2000 
[ 5 18 14:54:08 2021] x19: 0060000087a00f91 x18: ffff3d7d5ba24000 
[ 5 18 14:54:08 2021] x17: 0000000000000000 x16: 0000000000000000 
[ 5 18 14:54:08 2021] x15: 00000000fffffff0 x14: ffff3d7d5bd47648 
[ 5 18 14:54:08 2021] x13: 0000000000000000 x12: ffff3d7d5bd46000 
[ 5 18 14:54:08 2021] x11: ffff3d7d5ba24000 x10: ffff3d7d5bd46ca0 
[ 5 18 14:54:08 2021] x9 : 0000000000000000 x8 : 0000000000000004 
[ 5 18 14:54:08 2021] x7 : ffff3d7fffffffff x6 : ffff7dfffe6386c8 
[ 5 18 14:54:08 2021] x5 : 0000000087800000 x4 : 0068000087800f11 
[ 5 18 14:54:08 2021] x3 : 0000000000000000 x2 : 0000000000000001 
[ 5 18 14:54:08 2021] x1 : 0000000000000001 x0 : 0060000087a00f91 
[ 5 18 14:54:08 2021] Call trace:
[ 5 18 14:54:08 2021]  alloc_init_pud+0x518/0x550
[ 5 18 14:54:08 2021]  __create_pgd_mapping+0x98/0xe8
[ 5 18 14:54:08 2021]  update_mapping_prot+0x48/0xd0
[ 5 18 14:54:08 2021]  disable_wirte_protection+0x54/0x88 [lkm4arm64]
[ 5 18 14:54:08 2021]  test_replace+0xe0/0x104 [lkm4arm64]
[ 5 18 14:54:08 2021]  lkm_init+0x20/0xd5c [lkm4arm64]
[ 5 18 14:54:08 2021]  do_one_initcall+0x30/0x19c
[ 5 18 14:54:08 2021]  do_init_module+0x58/0x1c8
[ 5 18 14:54:08 2021]  load_module+0x128c/0x1490
[ 5 18 14:54:08 2021]  __se_sys_finit_module+0x84/0xc8
[ 5 18 14:54:08 2021]  __arm64_sys_finit_module+0x18/0x20
[ 5 18 14:54:08 2021]  el0_svc_common+0x90/0x160
[ 5 18 14:54:08 2021]  el0_svc_handler+0x9c/0xa8
[ 5 18 14:54:08 2021]  el0_svc+0x8/0xc
[ 5 18 14:54:08 2021] Code: a9025bf5 a90363f7 d4210000 d4210000 (d4210000) 
[ 5 18 14:54:08 2021] ---[ end trace a452da0642349ffd ]---

解决方案

再探内核源码,在mmu.c文件里,mark_rodata_ro函数正是调用update_mapping_prot把内核rodata区域设置为只读的:

/*
 * Remap the range from __start_rodata up to __init_begin as
 * PAGE_KERNEL_RO via update_mapping_prot(), then run debug_checkwx().
 */
void mark_rodata_ro(void)
{
	unsigned long section_size;

	/*
	 * mark .rodata as read only. Use __init_begin rather than __end_rodata
	 * to cover NOTES and EXCEPTION_TABLE.
	 */
	section_size = (unsigned long)__init_begin - (unsigned long)__start_rodata;
	update_mapping_prot(__pa_symbol(__start_rodata), (unsigned long)__start_rodata,
			    section_size, PAGE_KERNEL_RO);

	debug_checkwx();
}

这里被设置为只读的区域是__start_rodata到__init_begin,我们可以把测试程序中的__end_rodata也改成__init_begin:

/* Pointer through which the module calls the kernel's update_mapping_prot(). */
void (*update_mapping_prot)(phys_addr_t phys, unsigned long virt,
                            phys_addr_t size, pgprot_t prot);

/* Start of the region and the __init_begin address that bounds it. */
unsigned long start_rodata;
unsigned long init_begin;

/* Length in bytes of [start_rodata, init_begin). */
#define section_size  (init_begin - start_rodata)

update_mapping_prot = (void *)kallsyms_lookup_name("update_mapping_prot");
start_rodata = (unsigned long)kallsyms_lookup_name("__start_rodata");
init_begin= (unsigned long)kallsyms_lookup_name("__init_begin");
printk("%s. update_mapping_prot:%lx, start_rodata:%lx, init_begin:%lx.\n", update_mapping_prot, start_rodata, init_begin);

改完后,重新编译,加载内核模块正常了。

我们再进一步看下内核映射具体划分就知道了:

/*
 * Create fine-grained mappings for the kernel.
 *
 * Maps the text, rodata, inittext, initdata and data segments into pgdp
 * one by one, then wires up the fixmap entry and copies the KASAN shadow.
 */
static void __init map_kernel(pgd_t *pgdp)
{
	static struct vm_struct vmlinux_text, vmlinux_rodata, vmlinux_inittext,
				vmlinux_initdata, vmlinux_data;

	/*
	 * External debuggers may need to write directly to the text
	 * mapping to install SW breakpoints. Allow this (only) when
	 * explicitly requested with rodata=off.
	 */
	pgprot_t text_prot = rodata_enabled ? PAGE_KERNEL_ROX : PAGE_KERNEL_EXEC;

	/*
	 * Only rodata will be remapped with different permissions later on,
	 * all other segments are allowed to use contiguous mappings.
	 */
	map_kernel_segment(pgdp, _text, _etext, text_prot, &vmlinux_text, 0,
			   VM_NO_GUARD);
	/*
	 * The rodata segment spans __start_rodata up to __inittext_begin and
	 * is the only one mapped with NO_CONT_MAPPINGS; it starts out as
	 * PAGE_KERNEL here.
	 */
	map_kernel_segment(pgdp, __start_rodata, __inittext_begin, PAGE_KERNEL,
			   &vmlinux_rodata, NO_CONT_MAPPINGS, VM_NO_GUARD);
	map_kernel_segment(pgdp, __inittext_begin, __inittext_end, text_prot,
			   &vmlinux_inittext, 0, VM_NO_GUARD);
	map_kernel_segment(pgdp, __initdata_begin, __initdata_end, PAGE_KERNEL,
			   &vmlinux_initdata, 0, VM_NO_GUARD);
	map_kernel_segment(pgdp, _data, _end, PAGE_KERNEL, &vmlinux_data, 0, 0);

	if (!READ_ONCE(pgd_val(*pgd_offset_raw(pgdp, FIXADDR_START)))) {
		/*
		 * The fixmap falls in a separate pgd to the kernel, and doesn't
		 * live in the carveout for the swapper_pg_dir. We can simply
		 * re-use the existing dir for the fixmap.
		 */
		set_pgd(pgd_offset_raw(pgdp, FIXADDR_START),
			READ_ONCE(*pgd_offset_k(FIXADDR_START)));
	} else if (CONFIG_PGTABLE_LEVELS > 3) {
		/*
		 * The fixmap shares its top level pgd entry with the kernel
		 * mapping. This can really only occur when we are running
		 * with 16k/4 levels, so we can simply reuse the pud level
		 * entry instead.
		 */
		BUG_ON(!IS_ENABLED(CONFIG_ARM64_16K_PAGES));
		pud_populate(&init_mm,
			     pud_set_fixmap_offset(pgdp, FIXADDR_START),
			     lm_alias(bm_pmd));
		pud_clear_fixmap();
	} else {
		BUG();
	}

	kasan_copy_shadow(pgdp);
}

可以看到:
map_kernel_segment(pgdp, __start_rodata, __inittext_begin, PAGE_KERNEL,
&vmlinux_rodata, NO_CONT_MAPPINGS, VM_NO_GUARD);

内核只读区域实际上就是__start_rodata到 __inittext_begin,我们模块为了安全起见,还是按照mark_rodata_ro函数的方法来好了。

其实,我们编写内核模块时,很多解决方案或者函数用法在内核源码中都有,我们最安全的姿势就是直接移植(照抄)过来就行了。

本文完。

感兴趣的话可以关注我的微信公众号【大胖聊编程】,我的公众号中有更多文章分享,也可以在公众号中联系到我,一起交流学习。