从用户态到内核态逐步解析chroot沙箱机制（下～eBPF追踪和内核态动作）

开启掘金成长之旅！这是我参与「掘金日新计划 · 12 月更文挑战」的第1天，点击查看活动详情

在上节解析了chroot在用户态时的动作以及系统调用的相关解析，这节不急着去追看内核代码，先尝试一下借助内核追踪手段来探查一下chroot在内核中的动作。

内核eBPF追踪

内核追踪的手段有很多，现在最火的也是以eBPF为技术核心的一些手段。这里就简单探查一下，我就采用的是较为简洁易用的bpftrace方式。我采用的内核为5.15.60，eBPF也容易施展。

首先看下与chroot有关的可追踪的函数。

sudo bpftrace -l '*'|grep chroot
————
kfunc:__ia32_sys_chroot  
kfunc:__x64_sys_chroot  
kfunc:bpf_lsm_path_chroot  
kfunc:chroot_fs_refs  
kfunc:current_chrooted  
kfunc:security_path_chroot  
kfunc:tomoyo_path_chroot  
kprobe:__ia32_sys_chroot  
kprobe:__x64_sys_chroot  
kprobe:bpf_lsm_path_chroot  
kprobe:chroot_fs_refs  
kprobe:current_chrooted  
kprobe:security_path_chroot  
kprobe:tomoyo_path_chroot  
tracepoint:syscalls:sys_enter_chroot  
tracepoint:syscalls:sys_exit_chroot

通过tracepoint查看触发chroot系统调用的进程信息

先执行下面这条命令，然后新开一个shell，chroot到之前的alpine的rootfs。

sudo bpftrace -e 't:*chroot* {printf("%s %d\n",comm,pid); }'
————
Attaching 2 probes...  
chroot 51827  
chroot 51827

对比一下可以发现，该pid与我们所执行的chroot命令的pid一致。

查看一下我们怀疑的内核函数调用栈。

先执行下面这条命令，然后新开一个shell，chroot到之前的alpine的rootfs。

sudo bpftrace -e 'k:*chroot* {@[kstack] = count(); }'
————
@[  
   __x64_sys_chroot+1  
   do_syscall_64+88  
   entry_SYSCALL_64_after_hwframe+97  
]: 1  
@[  
   security_path_chroot+1  
   __x64_sys_chroot+222  
   do_syscall_64+88  
   entry_SYSCALL_64_after_hwframe+97  
]: 1

这里我们命中了两个函数，从调用栈可以看到security_path_chroot所处的调用层次更深（它是被__x64_sys_chroot调用的）。有了这些关键函数名称，在源码中也更容易掌握其调用过程。

chroot内核源码实现

用户态通过syscall进行系统调用，内核部分需要进行相应的处理。我们根据上述的函数调用栈逐步寻找。

entry_SYSCALL_64_after_hwframe

arch/x86/entry/entry_64.S

/*
 * 64-bit SYSCALL instruction entry. Up to 6 arguments in registers.
 *
 * This is the only entry point used for 64-bit system calls.  The
 * hardware interface is reasonably well designed and the register to
 * argument mapping Linux uses fits well with the registers that are
 * available when SYSCALL is used.
 *
 * SYSCALL instructions can be found inlined in libc implementations as
 * well as some other programs and libraries.  There are also a handful
 * of SYSCALL instructions in the vDSO used, for example, as a
 * clock_gettimeofday fallback.
 *
 * 64-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
 * then loads new ss, cs, and rip from previously programmed MSRs.
 * rflags gets masked by a value from another MSR (so CLD and CLAC
 * are not needed). SYSCALL does not save anything on the stack
 * and does not change rsp.
 *
 * Registers on entry:
 * rax  system call number
 * rcx  return address
 * r11  saved rflags (note: r11 is callee-clobbered register in C ABI)
 * rdi  arg0
 * rsi  arg1
 * rdx  arg2
 * r10  arg3 (needs to be moved to rcx to conform to C ABI)
 * r8   arg4
 * r9   arg5
 * (note: r12-r15, rbp, rbx are callee-preserved in C ABI)
 *
 * Only called from user space.
 *
 * When user can change pt_regs->foo always force IRET. That is because
 * it deals with uncanonical addresses better. SYSRET has trouble
 * with them due to bugs in both AMD and Intel CPUs.
 */

......

SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL)
	pushq	%rax					/* pt_regs->orig_ax */

	PUSH_AND_CLEAR_REGS rax=$-ENOSYS

	/* IRQs are off. */
	movq	%rsp, %rdi
	/* Sign extend the lower 32bit as syscall numbers are treated as int */
	movslq	%eax, %rsi

	/* clobbers %rax, make sure it is after saving the syscall nr */
	IBRS_ENTER
	UNTRAIN_RET

	call	do_syscall_64		/* returns with IRQs disabled */

	/*
	 * Try to use SYSRET instead of IRET if we're returning to
	 * a completely clean 64-bit userspace context.  If we're not,
	 * go to the slow exit path.
	 * In the Xen PV case we must use iret anyway.
	 */

	ALTERNATIVE "", "jmp	swapgs_restore_regs_and_return_to_usermode", \
		X86_FEATURE_XENPV

	movq	RCX(%rsp), %rcx
	movq	RIP(%rsp), %r11

	cmpq	%rcx, %r11	/* SYSRET requires RCX == RIP */
	jne	swapgs_restore_regs_and_return_to_usermode

这里代码贴的较多，主要根据注释和关键词，基本能够断定这就是最开始的入口。

这里又调用下一个函数call do_syscall_64，与之前的函数调用栈里面的相一致。

do_syscall_64和do_syscall_x64

这里的调用主要逻辑还是在于do_syscall_x64

arch/x86/entry/common.c

.....

/*
 * Dispatch a 64-bit system call by number.
 *
 * @regs: saved register frame; it is handed unchanged to the handler, and
 *        its ->ax field receives the handler's return value.
 * @nr:   system call number as a signed int.
 *
 * Returns true when @nr selected a slot in sys_call_table and the handler
 * was invoked, false when @nr was out of range (no handler is called and
 * this function does not write regs->ax).
 */
static __always_inline bool do_syscall_x64(struct pt_regs *regs, int nr)
{
	/*
	 * Convert negative numbers to very high and thus out of range
	 * numbers for comparisons.
	 */
	unsigned int unr = nr;

	if (likely(unr < NR_syscalls)) {
		/*
		 * Re-derive the index through array_index_nospec() before it is
		 * used as a table subscript. Presumably this keeps the index in
		 * bounds under speculative execution too -- the helper is defined
		 * outside this excerpt, so confirm there.
		 */
		unr = array_index_nospec(unr, NR_syscalls);
		/* Invoke the handler and publish its result through regs->ax. */
		regs->ax = sys_call_table[unr](regs);
		return true;
	}
	return false;
}

sys_call_table[unr](regs);这里是系统调用表，系统调用表从系统调用号查找到真正的执行函数。系统调用表的生成我这里就不再细讲了……

__x64_sys_chroot和security_path_chroot

这里就是chroot动作真正执行的地方了。__x64_sys_chroot这个函数搜索不到，估计又是宏生成的，那就先找调用security_path_chroot的地方。

/*
 * chroot system call body: make @filename the root directory recorded in
 * current->fs.
 *
 * @filename: user-space pointer to the path of the new root.
 *
 * Returns 0 on success. On failure it returns the error produced by the
 * step that failed: path lookup, the permission check, the capability
 * check (-EPERM) or the security hook.
 */
SYSCALL_DEFINE1(chroot, const char __user *, filename)
{
	struct path path;
	int error;
	/* Follow a trailing symlink and require the result to be a directory. */
	unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
retry:
	/* Resolve the user-supplied path relative to the current directory. */
	error = user_path_at(AT_FDCWD, filename, lookup_flags, &path);
	if (error)
		goto out;	/* nothing was looked up, so there is nothing to put */

	/* The caller must be allowed to search / chdir into the target. */
	error = path_permission(&path, MAY_EXEC | MAY_CHDIR);
	if (error)
		goto dput_and_out;

	/* CAP_SYS_CHROOT is required in the caller's user namespace. */
	error = -EPERM;
	if (!ns_capable(current_user_ns(), CAP_SYS_CHROOT))
		goto dput_and_out;
	/* Let the security hook veto the operation. */
	error = security_path_chroot(&path);
	if (error)
		goto dput_and_out;

	/* All checks passed: install the new root in current->fs. */
	set_fs_root(current->fs, &path);
	error = 0;
dput_and_out:
	/* Drop the reference obtained by the lookup above. */
	path_put(&path);
	/* If retry_estale() asks for it, redo the lookup once more with LOOKUP_REVAL set. */
	if (retry_estale(error, lookup_flags)) {
		lookup_flags |= LOOKUP_REVAL;
		goto retry;
	}
out:
	return error;
}

果然又是宏定义，继续追SYSCALL_DEFINE1，它位于include/linux/syscalls.h：

/*
 * One-argument front end: prefixes the syscall name with '_' and forwards
 * it, together with the (type, name) argument pair, to SYSCALL_DEFINEx
 * with an argument count of 1. SYSCALL_DEFINEx itself is not part of this
 * excerpt.
 */
#define SYSCALL_DEFINE1(name, ...) SYSCALL_DEFINEx(1, _##name, __VA_ARGS__)

/*
 * Emits three symbols for a syscall whose pasted name is `name`
 * (e.g. `_chroot`):
 *   sys##name      - declared as an alias of __se_sys##name;
 *   __se_sys##name - takes the arguments via __SC_LONG, passes them through
 *                    __SC_CAST and calls __do_sys##name, then returns its
 *                    result;
 *   __do_sys##name - static inline; the macro expansion ends on this
 *                    function's signature, so the brace-enclosed body written
 *                    after the SYSCALL_DEFINEn(...) invocation becomes the
 *                    body of __do_sys##name.
 * The __diag_push/__diag_ignore/__diag_pop trio silences the
 * -Wattribute-alias diagnostic for the alias declaration.
 */
#define __SYSCALL_DEFINEx(x, name, ...)					\
	__diag_push();							\
	__diag_ignore(GCC, 8, "-Wattribute-alias",			\
		      "Type aliasing is used to sanitize syscall arguments");\
	asmlinkage long sys##name(__MAP(x,__SC_DECL,__VA_ARGS__))	\
		__attribute__((alias(__stringify(__se_sys##name))));	\
	ALLOW_ERROR_INJECTION(sys##name, ERRNO);			\
	static inline long __do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__));\
	asmlinkage long __se_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__));	\
	asmlinkage long __se_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__))	\
	{								\
		long ret = __do_sys##name(__MAP(x,__SC_CAST,__VA_ARGS__));\
		__MAP(x,__SC_TEST,__VA_ARGS__);				\
		__PROTECT(x, ret,__MAP(x,__SC_ARGS,__VA_ARGS__));	\
		return ret;						\
	}								\
	__diag_pop();							\
	static inline long __do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__))

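但有个问题，这里还原出来是__do_sys_chroot，并不是__x64_sys_chroot，可能是__se_sys那别名调用？其实上面贴的是include/linux/syscalls.h中的通用定义；x86_64启用了CONFIG_ARCH_HAS_SYSCALL_WRAPPER，会改用arch/x86/include/asm/syscall_wrapper.h中重新定义的__SYSCALL_DEFINEx，由它额外生成以struct pt_regs *为参数的__x64_sys_chroot，该函数再调用__se_sys_chroot，最终进入__do_sys_chroot。细节这里暂时先不展开了。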

set_fs_root

fs/fs_struct.c 最关键的部分就在于这个函数。

/*
 * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values.
 * It can block.
 *
 * @fs:   fs_struct whose root is replaced.
 * @path: new root; this function takes its own reference with path_get(),
 *        so the caller keeps (and must still release) the one it holds.
 */
void set_fs_root(struct fs_struct *fs, const struct path *path)
{
	struct path old_root;

	/* Pin the new root before it is published in fs->root. */
	path_get(path);
	spin_lock(&fs->lock);
	/* The swap is done inside a seqcount write section on fs->seq. */
	write_seqcount_begin(&fs->seq);
	old_root = fs->root;
	fs->root = *path;
	write_seqcount_end(&fs->seq);
	spin_unlock(&fs->lock);
	/*
	 * Release the previous root only after the spinlock is dropped
	 * (presumably because path_put() is the part that "can block"),
	 * and only if there was a previous root at all.
	 */
	if (old_root.dentry)
		path_put(&old_root);
}

我们同时回顾调用的地方set_fs_root(current->fs, &path); 该函数就是将当前任务的fs_struct（current->fs）中的root路径进行替换。注意作用范围是这个fs_struct！chroot操作对root的替换只影响共享该fs_struct的任务（例如同一进程内以CLONE_FS方式创建的线程），不会影响系统中其他不共享它的进程。而其子进程由于会继承这份root，自然而然就形成了一个沙箱。

至此。chroot的从用户态到内核态的机制就已经追完。