开启掘金成长之旅!这是我参与「掘金日新计划 · 12 月更文挑战」的第1天,点击查看活动详情
在上节解析了chroot在用户态时的动作以及系统调用的相关解析,这节不急着去追看内核代码,先尝试一下借助内核追踪手段来探查一下chroot在内核中的动作。
内核eBPF追踪
内核追踪的手段有很多,其中当下最热门的是以eBPF为技术核心的一系列工具。这里只做简单探查,我采用的是较为简洁易用的bpftrace。我使用的内核版本为5.15.60,使用eBPF也比较方便。
- 首先看下与chroot有关的可追踪的函数。
sudo bpftrace -l '*'|grep chroot
————
kfunc:__ia32_sys_chroot
kfunc:__x64_sys_chroot
kfunc:bpf_lsm_path_chroot
kfunc:chroot_fs_refs
kfunc:current_chrooted
kfunc:security_path_chroot
kfunc:tomoyo_path_chroot
kprobe:__ia32_sys_chroot
kprobe:__x64_sys_chroot
kprobe:bpf_lsm_path_chroot
kprobe:chroot_fs_refs
kprobe:current_chrooted
kprobe:security_path_chroot
kprobe:tomoyo_path_chroot
tracepoint:syscalls:sys_enter_chroot
tracepoint:syscalls:sys_exit_chroot
- 通过tracepoint查看触发chroot系统调用的进程信息
先执行下面这条命令,然后新开一个shell,chroot到之前的alpine的rootfs
sudo bpftrace -e 't:*chroot* {printf("%s %d\n",comm,pid); }'
————
Attaching 2 probes...
chroot 51827
chroot 51827
对比可知,输出中的pid与我们执行的chroot命令的进程号一致;两行输出分别对应sys_enter_chroot和sys_exit_chroot这两个tracepoint。
- 查看一下我们怀疑的内核函数调用栈。
先执行下面这条命令,然后新开一个shell,chroot到之前的alpine的rootfs
sudo bpftrace -e 'k:*chroot* {@[kstack] = count(); }'
————
@[
__x64_sys_chroot+1
do_syscall_64+88
entry_SYSCALL_64_after_hwframe+97
]: 1
@[
security_path_chroot+1
__x64_sys_chroot+222
do_syscall_64+88
entry_SYSCALL_64_after_hwframe+97
]: 1
这里我们命中了两个函数:__x64_sys_chroot和security_path_chroot,其中security_path_chroot位于调用栈更深处(由__x64_sys_chroot调用)。有了这些关键函数名称,在源码中也更容易掌握其调用过程。
chroot内核源码实现
用户态通过syscall指令发起系统调用,内核部分则需要接收并处理这次调用。我们根据上述的函数调用栈逐步寻找。
- entry_SYSCALL_64_after_hwframe
/*
* 64-bit SYSCALL instruction entry. Up to 6 arguments in registers.
*
* This is the only entry point used for 64-bit system calls. The
* hardware interface is reasonably well designed and the register to
* argument mapping Linux uses fits well with the registers that are
* available when SYSCALL is used.
*
* SYSCALL instructions can be found inlined in libc implementations as
* well as some other programs and libraries. There are also a handful
* of SYSCALL instructions in the vDSO used, for example, as a
* clock_gettimeofday fallback.
*
* 64-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
* then loads new ss, cs, and rip from previously programmed MSRs.
* rflags gets masked by a value from another MSR (so CLD and CLAC
* are not needed). SYSCALL does not save anything on the stack
* and does not change rsp.
*
* Registers on entry:
* rax system call number
* rcx return address
* r11 saved rflags (note: r11 is callee-clobbered register in C ABI)
* rdi arg0
* rsi arg1
* rdx arg2
* r10 arg3 (needs to be moved to rcx to conform to C ABI)
* r8 arg4
* r9 arg5
* (note: r12-r15, rbp, rbx are callee-preserved in C ABI)
*
* Only called from user space.
*
* When user can change pt_regs->foo always force IRET. That is because
* it deals with uncanonical addresses better. SYSRET has trouble
* with them due to bugs in both AMD and Intel CPUs.
*/
......
SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL)
pushq %rax /* pt_regs->orig_ax */
PUSH_AND_CLEAR_REGS rax=$-ENOSYS
/* IRQs are off. */
movq %rsp, %rdi
/* Sign extend the lower 32bit as syscall numbers are treated as int */
movslq %eax, %rsi
/* clobbers %rax, make sure it is after saving the syscall nr */
IBRS_ENTER
UNTRAIN_RET
call do_syscall_64 /* returns with IRQs disabled */
/*
* Try to use SYSRET instead of IRET if we're returning to
* a completely clean 64-bit userspace context. If we're not,
* go to the slow exit path.
* In the Xen PV case we must use iret anyway.
*/
ALTERNATIVE "", "jmp swapgs_restore_regs_and_return_to_usermode", \
X86_FEATURE_XENPV
movq RCX(%rsp), %rcx
movq RIP(%rsp), %r11
cmpq %rcx, %r11 /* SYSRET requires RCX == RIP */
jne swapgs_restore_regs_and_return_to_usermode
这里代码贴的较多,主要根据注释和关键词,基本能够断定这就是最开始的入口。
这里又调用下一个函数call do_syscall_64,与之前的函数调用栈里面的相一致。
- do_syscall_64和do_syscall_x64
这里的调用主要逻辑还是在于do_syscall_x64
.....
/*
 * Dispatch one 64-bit system call through sys_call_table.
 *
 * regs: saved register frame; it is passed to the handler and its ax
 *       field receives the handler's return value.
 * nr:   system call number.
 *
 * Returns true when nr is below NR_syscalls and a handler was called,
 * false otherwise (regs->ax is not written in that case).
 */
static __always_inline bool do_syscall_x64(struct pt_regs *regs, int nr)
{
/*
* Convert negative numbers to very high and thus out of range
* numbers for comparisons.
*/
unsigned int unr = nr;
if (likely(unr < NR_syscalls)) {
/*
 * NOTE(review): array_index_nospec() presumably clamps the index so it
 * cannot be used out of bounds under speculation; confirm against its
 * definition. It runs only after the bounds check above.
 */
unr = array_index_nospec(unr, NR_syscalls);
/* Look up the handler by number, call it, store its result in ax. */
regs->ax = sys_call_table[unr](regs);
return true;
}
return false;
}
sys_call_table[unr](regs);这里是系统调用表,系统调用表从系统调用号查找到真正的执行函数。 系统调用表的生成我这里就不再细讲了……
- __x64_sys_chroot和security_path_chroot
这里就是chroot动作真正执行的地方了。__x64_sys_chroot这个函数搜索不到,估计又是宏生成的,那就先找调用security_path_chroot的地方。
/*
 * chroot system call body: make the directory named by filename the root
 * recorded in current->fs.
 *
 * Order of operations:
 *   1. resolve the user-supplied path relative to AT_FDCWD;
 *   2. check MAY_EXEC | MAY_CHDIR permission on it;
 *   3. require CAP_SYS_CHROOT in the caller's user namespace (-EPERM if not);
 *   4. call the security_path_chroot() hook;
 *   5. install the new root with set_fs_root().
 *
 * Returns 0 on success, otherwise the error of the first step that failed.
 */
SYSCALL_DEFINE1(chroot, const char __user *, filename)
{
struct path path;
int error;
/* Judging by the flag names: follow symlinks, and require a directory. */
unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
retry:
/* A failed lookup skips path_put(): jump straight to out. */
error = user_path_at(AT_FDCWD, filename, lookup_flags, &path);
if (error)
goto out;
error = path_permission(&path, MAY_EXEC | MAY_CHDIR);
if (error)
goto dput_and_out;
/* Preset the error so the capability check can bail out directly. */
error = -EPERM;
if (!ns_capable(current_user_ns(), CAP_SYS_CHROOT))
goto dput_and_out;
error = security_path_chroot(&path);
if (error)
goto dput_and_out;
/* All checks passed: this call is what actually switches the root. */
set_fs_root(current->fs, &path);
error = 0;
dput_and_out:
/*
 * Reached on success as well as on failure after a successful lookup;
 * the local path is released in both cases.
 */
path_put(&path);
/* When retry_estale() asks for it, redo everything with LOOKUP_REVAL set. */
if (retry_estale(error, lookup_flags)) {
lookup_flags |= LOOKUP_REVAL;
goto retry;
}
out:
return error;
}
果然又是宏定义。继续追SYSCALL_DEFINE1,它定义在include/linux/syscalls.h中:
/*
 * SYSCALL_DEFINE1 forwards to SYSCALL_DEFINEx (not shown in this excerpt)
 * with an argument count of 1 and the name prefixed by '_'.
 *
 * __SYSCALL_DEFINEx below is the generic expansion. For name = _chroot it
 * emits three symbols:
 *   sys_chroot       - declared as an alias of __se_sys_chroot;
 *   __se_sys_chroot  - takes the arguments mapped through __SC_LONG, casts
 *                      them with __SC_CAST and calls __do_sys_chroot;
 *   __do_sys_chroot  - static inline; its body is the brace block that
 *                      follows the SYSCALL_DEFINE1(...) invocation.
 *
 * NOTE(review): none of these is named __x64_sys_chroot. That symbol
 * presumably comes from an architecture-specific override of
 * __SYSCALL_DEFINEx (arch/x86/include/asm/syscall_wrapper.h); confirm
 * against the kernel tree.
 */
#define SYSCALL_DEFINE1(name, ...) SYSCALL_DEFINEx(1, _##name, __VA_ARGS__)
#define __SYSCALL_DEFINEx(x, name, ...) \
__diag_push(); \
__diag_ignore(GCC, 8, "-Wattribute-alias", \
"Type aliasing is used to sanitize syscall arguments");\
asmlinkage long sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) \
__attribute__((alias(__stringify(__se_sys##name)))); \
ALLOW_ERROR_INJECTION(sys##name, ERRNO); \
static inline long __do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__));\
asmlinkage long __se_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \
asmlinkage long __se_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \
{ \
long ret = __do_sys##name(__MAP(x,__SC_CAST,__VA_ARGS__));\
__MAP(x,__SC_TEST,__VA_ARGS__); \
__PROTECT(x, ret,__MAP(x,__SC_ARGS,__VA_ARGS__)); \
return ret; \
} \
__diag_pop(); \
static inline long __do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__))
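但有个问题,这里还原出来是__do_sys_chroot,并不是__x64_sys_chroot。原因在于上面贴出的是include/linux/syscalls.h中的通用定义;在x86-64上内核开启了CONFIG_ARCH_HAS_SYSCALL_WRAPPER,__SYSCALL_DEFINEx会被arch/x86/include/asm/syscall_wrapper.h中的版本覆盖,由它生成以struct pt_regs *为参数的__x64_sys_chroot,再依次调用__se_sys_chroot和__do_sys_chroot。这也正好与前面sys_call_table[unr](regs)的调用方式对应。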
- set_fs_root
fs/fs_struct.c 最关键的部分就在于这个函数。
/*
 * Replace fs->root with *path and release the previous root.
 *
 * fs:   the fs_struct whose root is being switched.
 * path: the new root; this function takes its own reference on it, so
 *       the caller keeps (and must drop) the reference it already holds.
 *
 * It can block.
 */
void set_fs_root(struct fs_struct *fs, const struct path *path)
{
struct path old_root;
/* Take the reference that fs->root will hold, before locking. */
path_get(path);
/*
 * The swap happens under fs->lock, bracketed by the fs->seq write
 * section. NOTE(review): presumably this lets lockless readers of
 * fs->seq detect a concurrent update; confirm against the readers.
 */
spin_lock(&fs->lock);
write_seqcount_begin(&fs->seq);
old_root = fs->root;
fs->root = *path;
write_seqcount_end(&fs->seq);
spin_unlock(&fs->lock);
/* The old root is released only after the spinlock has been dropped. */
if (old_root.dentry)
path_put(&old_root);
}
我们同时回顾调用地方set_fs_root(current->fs, &path);
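该函数就是将current->fs(当前任务的fs_struct)中记录的root路径进行替换。需要注意的是,fs_struct并不一定是某个线程独占的:以CLONE_FS方式创建的线程(例如同一进程内的pthread线程)共享同一个fs_struct,chroot会同时影响这些线程;而通过fork创建的子进程会拿到一份fs_struct的拷贝,之后父子双方再调用chroot互不影响。chroot之后创建的子进程会继承新的root,自然而然就形成了一个沙箱。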
至此,chroot从用户态到内核态的机制就已经追完。