深入Linux内核启动:从kernel_init到第一个用户进程的完整旅程

2 阅读31分钟

引言:操作系统的“第一声啼哭”

当计算机电源按下,BIOS/UEFI引导加载程序将Linux内核映像载入内存,内核完成一系列复杂的初始化工作后,最终要启动第一个用户态进程——init。这个过程宛如一个新生命的诞生:内核是“母亲”,init是“第一个孩子”。而kernel_init函数正是这场分娩的主刀医生。本文将沿着Linux内核源码,从kernel_init函数出发,一路剖析内核如何加载并执行init进程,覆盖内核执行用户程序的完整路径:execve系统调用、二进制格式处理、直至ELF加载器的精细操作。

阅读本文前,建议读者对Linux内核进程、虚拟内存、可执行文件格式有基本了解。本文将基于Linux 5.x/6.x内核源码,关键函数和数据结构均取自真实代码(略有简化以突出主线)。


第一章:一切从kernel_init开始

1.1 内核初始化的最后冲刺

Linux内核的启动入口是start_kernel,完成大部分子系统的初始化后,会创建init内核线程(PID=1),该线程执行函数kernel_init。源码如下:

c

/*
 * Body of the init thread: finishes boot-time initialisation, frees
 * init-only memory, then tries a series of candidate init programs.
 *
 * Returns 0 as soon as one candidate has been started successfully.
 * Panics if execute_command is set but cannot be run, or if every
 * candidate fails.
 */
static int __ref kernel_init(void *unused)
{
    int ret;

    /* Wait until the kthreadd thread is fully set up */
    wait_for_completion(&kthreadd_done);

    kernel_init_freeable();
    /* Wait for all async __init code to finish so init memory can be freed */
    async_synchronize_full();

    system_state = SYSTEM_FREEING_INITMEM;
    kprobe_free_init_mem();
    ftrace_free_init_mem();
    kgdb_free_init_mem();
    exit_boot_config();
    free_initmem();          /* free the memory of the __init sections */
    mark_readonly();         /* make kernel text read-only */

    /* Kernel mappings are now final; update the user page tables to finalize PTI */
    pti_finalize();

    system_state = SYSTEM_RUNNING;
    numa_default_policy();
    rcu_end_inkernel_boot();
    do_sysctl_args();

    /* Try the init command from the ramdisk; a failure here is only logged */
    if (ramdisk_execute_command) {
        ret = run_init_process(ramdisk_execute_command);
        if (!ret)
            return 0;
        pr_err("Failed to execute %s (error %d)\n",
               ramdisk_execute_command, ret);
    }

    /* Next try execute_command, then the default init path, and so on */
    if (execute_command) {
        ret = run_init_process(execute_command);
        if (!ret)
            return 0;
        /* Unlike the ramdisk case above, a failure here is fatal */
        panic("Requested init %s failed (error %d).",
              execute_command, ret);
    }

    /* Skipped when CONFIG_DEFAULT_INIT is the empty string */
    if (CONFIG_DEFAULT_INIT[0] != '\0') {
        ret = run_init_process(CONFIG_DEFAULT_INIT);
        if (ret)
            pr_err("Default init %s failed (error %d)\n",
                   CONFIG_DEFAULT_INIT, ret);
        else
            return 0;
    }

    /*
     * Last resort: well-known locations. The || chain short-circuits, so
     * probing stops at the first candidate that returns 0.
     */
    if (!try_to_run_init_process("/sbin/init") ||
        !try_to_run_init_process("/etc/init") ||
        !try_to_run_init_process("/bin/init") ||
        !try_to_run_init_process("/bin/sh"))
        return 0;

    panic("No working init found.  Try passing init= option to kernel. "
          "See Linux Documentation/admin-guide/init.rst for guidance.");
}

逐步解读:

  • wait_for_completion(&kthreadd_done):确保内核线程管理器kthreadd已就绪,因为后续可能创建内核线程。

  • kernel_init_freeable():完成可推迟(freeable)的初始化,例如启动SMP、挂载rootfs等。注意此函数会调用do_basic_setup(),后者会执行do_initcalls(),启动所有设备驱动和内置模块。

  • async_synchronize_full():等待所有异步初始化(用async_schedule调度)完成,这样在释放.init内存前不会丢失代码。

  • 然后系统状态变为SYSTEM_FREEING_INITMEM,释放__init节的内存(包括大量初始化函数和数据),调用mark_readonly()使内核代码段只读,增强安全。

  • pti_finalize():完成页表隔离(Page Table Isolation),用于防御Meltdown漏洞。

  • 系统状态变为SYSTEM_RUNNING,设置NUMA缺省策略,结束RCU内核启动阶段,并应用内核命令行中以sysctl.开头的参数(do_sysctl_args)。

  • 接下来进入核心:尝试执行init进程。搜索顺序为:

    1. ramdisk_execute_command(由rdinit=内核参数指定,未指定时默认为/init,即initramfs中的init)。执行失败只打印错误,继续往下尝试。
    2. execute_command(由init=内核参数指定)。一旦指定却执行失败,内核直接panic,不再尝试后续路径。
    3. 编译时默认的init路径(CONFIG_DEFAULT_INIT)。
    4. 传统位置:/sbin/init、/etc/init、/bin/init、/bin/sh。
  • 若全部失败,内核panic。

其中,run_init_process和try_to_run_init_process封装了实际执行程序的动作。

1.2 run_init_process:第一次execve

c

/*
 * Log the init candidate together with its argument and environment
 * vectors, then hand it to kernel_execve().
 *
 * Returns whatever kernel_execve() returns.
 */
static int run_init_process(const char *init_filename)
{
    const char *const *arg;
    const char *const *env;

    /* The candidate path becomes the first entry of the argument vector */
    argv_init[0] = init_filename;
    pr_info("Run %s as init process\n", init_filename);

    pr_debug("  with arguments:\n");
    arg = argv_init;
    while (*arg) {
        pr_debug("    %s\n", *arg);
        arg++;
    }

    pr_debug("  with environment:\n");
    env = envp_init;
    while (*env) {
        pr_debug("    %s\n", *env);
        env++;
    }

    return kernel_execve(init_filename, argv_init, envp_init);
}

argv_init和envp_init是内核预先定义的参数和环境变量,例如argv_init = { "init", NULL },envp_init包含了HOME=/、TERM=linux等基本环境。最终调用kernel_execve。

try_to_run_init_process只是对run_init_process的包装,在返回-ENOENT(文件不存在)时不打印错误,而其他错误(如权限不足、格式错误)会报警。


第二章:内核中的execve——kernel_execve剖析

用户态通过execve系统调用陷入内核,而kernel_execve是内核内部启动用户态程序的接口,用于启动init以及用户态辅助程序(usermode helper,如call_usermodehelper)等场景。其实现如下:

c

/*
 * Execute a program from inside the kernel, with the path, argument vector
 * and environment vector all given as kernel pointers.
 *
 * @kernel_filename: path of the program to execute
 * @argv:            NULL-terminated argument vector (must not be empty)
 * @envp:            NULL-terminated environment vector
 *
 * Returns the result of bprm_execve() once all setup steps succeed,
 * otherwise the negative error of the step that failed. The bprm and the
 * filename object are released on every path, including success.
 */
int kernel_execve(const char *kernel_filename,
                  const char *const *argv, const char *const *envp)
{
    struct filename *filename;
    struct linux_binprm *bprm;
    int fd = AT_FDCWD;
    int retval;

    /* It makes no sense for kernel threads to call execve */
    if (WARN_ON_ONCE(current->flags & PF_KTHREAD))
        return -EINVAL;

    filename = getname_kernel(kernel_filename);
    if (IS_ERR(filename))
        return PTR_ERR(filename);

    bprm = alloc_bprm(fd, filename, 0);
    if (IS_ERR(bprm)) {
        retval = PTR_ERR(bprm);
        goto out_ret;
    }

    /* An empty argv is treated as a caller bug: warn once and fail */
    retval = count_strings_kernel(argv);
    if (WARN_ON_ONCE(retval == 0))
        retval = -EINVAL;
    if (retval < 0)
        goto out_free;
    bprm->argc = retval;

    retval = count_strings_kernel(envp);
    if (retval < 0)
        goto out_free;
    bprm->envc = retval;

    retval = bprm_stack_limits(bprm);
    if (retval < 0)
        goto out_free;

    /* Copy order is: filename first, then environment, then arguments */
    retval = copy_string_kernel(bprm->filename, bprm);
    if (retval < 0)
        goto out_free;
    /* Record bprm->p right after the filename copy (presumably its location) */
    bprm->exec = bprm->p;

    retval = copy_strings_kernel(bprm->envc, envp, bprm);
    if (retval < 0)
        goto out_free;

    retval = copy_strings_kernel(bprm->argc, argv, bprm);
    if (retval < 0)
        goto out_free;

    retval = bprm_execve(bprm);
    /* Falls through: the labels below also run on the success path */
out_free:
    free_bprm(bprm);
out_ret:
    putname(filename);
    return retval;
}

2.1 关键数据结构:linux_binprm

struct linux_binprm是内核中表示“待执行的二进制程序”的核心结构体,它承载了执行一个程序所需的所有信息:参数列表、环境变量、文件描述符、内存布局等。分配通过alloc_bprm完成。

kernel_execve的主要动作:

  1. 获取文件名:getname_kernel复制内核空间路径名到可访问的结构。
  2. 分配bprm:初始化bprm,包括分配mm_struct、设置栈起始地址等。
  3. 统计参数/环境变量个数:count_strings_kernel遍历指针数组,计算字符串数量并检查是否超出限制(MAX_ARG_STRINGS、MAX_ARG_STRLEN)。
  4. 检查栈空间限制:bprm_stack_limits确保参数+环境不会超过栈空间大小(通常限制为栈大小的1/4)。
  5. 复制参数和环境到栈:copy_string_kernel和copy_strings_kernel将文件名、环境变量、参数字符串逐字节复制到新进程的用户态栈中(同时也复制到内核空间缓存,但最终用户态栈会保留这些字符串)。注意这里使用的是bprm->p指针,它从用户栈顶向下移动。
  6. 调用核心函数:bprm_execve执行实际的二进制加载。

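一个细节:文件名已经保存在bprm->filename中,为什么还要用copy_string_kernel(bprm->filename, bprm)把它再拷贝到栈上?注意这份拷贝并不是argv[0]——argv[0]来自调用者传入的argv数组,由随后的copy_strings_kernel(bprm->argc, argv, bprm)复制。这里拷贝的文件名由紧接着的bprm->exec = bprm->p记录其在栈上的位置,之后ELF加载器在create_elf_tables中通过辅助向量AT_EXECFN把这个地址传给用户态。而bprm->filename本身是内核空间的字符串,用户态无法直接访问,所以必须复制到用户栈。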

2.2 bprm_execve:通往新程序的大门

c

/*
 * Run the exec described by @bprm: prepare credentials, mark the task as
 * being in execve, call the security hook and then exec_binprm().
 *
 * Returns the non-negative result of exec_binprm() on success, or the
 * error of the step that failed. On failure after bprm->point_of_no_return
 * has been set, a fatal SIGSEGV is forced unless a fatal signal is already
 * pending.
 */
static int bprm_execve(struct linux_binprm *bprm)
{
    int retval;

    /* Nothing to undo yet, so a failure here returns directly */
    retval = prepare_bprm_creds(bprm);
    if (retval)
        return retval;

    check_unsafe_exec(bprm);
    current->in_execve = 1;
    sched_mm_cid_before_execve(current);

    sched_exec();

    retval = security_bprm_creds_for_exec(bprm);
    if (retval)
        goto out;

    retval = exec_binprm(bprm);
    if (retval < 0)
        goto out;

    sched_mm_cid_after_execve(current);
    /* execve succeeded */
    current->fs->in_exec = 0;
    current->in_execve = 0;
    rseq_execve(current);
    user_events_execve(current);
    acct_update_integrals(current);
    task_numa_free(current, false);
    return retval;

out:
    /* Past the point of no return the old program must not resume */
    if (bprm->point_of_no_return && !fatal_signal_pending(current))
        force_fatal_sig(SIGSEGV);

    /* Clear the same in-exec markers the success path clears */
    sched_mm_cid_after_execve(current);
    current->fs->in_exec = 0;
    current->in_execve = 0;

    return retval;
}

该函数的核心是exec_binprm,之前还做了几件重要的事:

  • prepare_bprm_creds:准备新的凭证(credentials),即将执行新程序时切换用户ID、组ID等。这一步会克隆当前进程的凭证,以备后续根据setuid位等修改。
  • check_unsafe_exec:检查当前进程是否处于不安全状态(例如正在被ptrace跟踪,或有多个线程),会导致某些安全策略拒绝执行。
  • sched_exec:调度器钩子,为新执行的程序做NUMA亲和性调整。
  • security_bprm_creds_for_exec:LSM(Linux安全模块)钩子,例如SELinux可以在此时检查权限。

bprm->point_of_no_return标志在begin_new_exec(稍后看到)中设置,一旦越过此点,新程序的部分资源已经替换,无法回滚,如果后续失败只能强制发送信号杀死进程。

2.3 exec_binprm:二进制格式识别与解释器循环

c

/*
 * Find a binary handler for @bprm, following interpreter substitutions.
 *
 * Each time a handler sets bprm->interpreter, that file replaces bprm->file
 * and the search is repeated. search_binary_handler() is called at most
 * six times (depth 0..5); after that -ELOOP is returned.
 *
 * Returns 0 on success, -ELOOP, -ENOEXEC, or the negative error returned
 * by search_binary_handler().
 */
static int exec_binprm(struct linux_binprm *bprm)
{
    pid_t old_pid, old_vpid;
    int ret, depth;

    /* Sample both pids up front; they are reported after the load below */
    old_pid = current->pid;
    rcu_read_lock();
    old_vpid = task_pid_nr_ns(current, task_active_pid_ns(current->parent));
    rcu_read_unlock();

    for (depth = 0;; depth++) {
        struct file *exec;
        if (depth > 5)
            return -ELOOP;

        ret = search_binary_handler(bprm);
        if (ret < 0)
            return ret;
        /* No interpreter requested: the image itself was loaded */
        if (!bprm->interpreter)
            break;

        /* Make the interpreter the file to load on the next iteration */
        exec = bprm->file;
        bprm->file = bprm->interpreter;
        bprm->interpreter = NULL;

        allow_write_access(exec);
        if (unlikely(bprm->have_execfd)) {
            /* Only one original file can be kept in bprm->executable */
            if (bprm->executable) {
                fput(exec);
                return -ENOEXEC;
            }
            bprm->executable = exec;
        } else
            fput(exec);
    }

    /* Report the completed exec to audit, tracing, ptrace and proc connector */
    audit_bprm(bprm);
    trace_sched_process_exec(current, old_pid, bprm);
    ptrace_event(PTRACE_EVENT_EXEC, old_vpid);
    proc_exec_connector(current);
    return 0;
}

这段代码体现了Linux支持多种可执行文件格式(如ELF、a.out、脚本)及解释器(如shebang)的灵活架构。关键点:

  • search_binary_handler:遍历已注册的二进制格式处理程序链表,尝试识别并加载二进制文件。
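  • 如果bprm->interpreter被设置(如脚本的#!解释器或binfmt_misc注册的解释器;ELF的动态链接器不走这条路径,而是由load_elf_binary自行打开并加载),则循环处理:将当前文件替换为解释器文件,重新调用search_binary_handler。循环对depth设有上限(depth > 5时返回-ELOOP),防止解释器链无限嵌套。
  • bprm->have_execfd用于通过辅助向量AT_EXECFD把原始可执行文件的描述符传递给解释器(binfmt_misc的相关选项会用到),这里不详细展开。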
  • 成功加载后,记录审计、tracepoint、ptrace事件等。

2.3.1 search_binary_handler:格式匹配的艺术

c

/*
 * Offer @bprm to each registered binary format until one accepts it.
 *
 * Returns the result of the first handler that either passed the point of
 * no return or returned anything other than -ENOEXEC. If no handler takes
 * the image, returns the last handler result (-ENOENT if none was tried).
 * When CONFIG_MODULES is enabled, one retry is made after requesting a
 * "binfmt-XXXX" module.
 */
static int search_binary_handler(struct linux_binprm *bprm)
{
    bool need_retry = IS_ENABLED(CONFIG_MODULES);
    struct linux_binfmt *fmt;
    int retval;

    retval = prepare_binprm(bprm);
    if (retval < 0)
        return retval;

    retval = security_bprm_check(bprm);
    if (retval)
        return retval;

    retval = -ENOENT;
retry:
    read_lock(&binfmt_lock);
    list_for_each_entry(fmt, &formats, lh) {
        /* Skip formats whose module reference cannot be taken */
        if (!try_module_get(fmt->module))
            continue;
        /* The lock is dropped while the handler runs, re-taken afterwards */
        read_unlock(&binfmt_lock);

        retval = fmt->load_binary(bprm);

        read_lock(&binfmt_lock);
        put_binfmt(fmt);
        /* -ENOEXEC before the point of no return means "try the next format" */
        if (bprm->point_of_no_return || (retval != -ENOEXEC)) {
            read_unlock(&binfmt_lock);
            return retval;
        }
    }
    read_unlock(&binfmt_lock);

    if (need_retry) {
        /* Four printable leading bytes: give up without requesting a module */
        if (printable(bprm->buf[0]) && printable(bprm->buf[1]) &&
            printable(bprm->buf[2]) && printable(bprm->buf[3]))
            return retval;
        /* Module name is built from the 16-bit value at buf offset 2 */
        if (request_module("binfmt-%04x", *(ushort *)(bprm->buf + 2)) < 0)
            return retval;
        /* Retry the whole list exactly once */
        need_retry = false;
        goto retry;
    }

    return retval;
}

逻辑:

  1. prepare_binprm:读取文件头部(前BINPRM_BUF_SIZE字节,较新内核中为256字节)到bprm->buf。早期内核还会在这里根据文件权限设置bprm->cred的euid/egid(处理setuid位);在较新的内核中,这一步移到了begin_new_exec调用的bprm_creds_from_file里。

  2. security_bprm_check:再次安全检查。

  3. 遍历formats链表,每个元素对应一种二进制格式(ELF、a.out、脚本等)。每个格式提供load_binary函数。

    • try_module_get防止模块在加载过程中被卸载。
    • 调用fmt->load_binary(bprm)尝试解析。若返回-ENOEXEC(不是当前格式),继续尝试下一个;否则返回(无论成功或失败)。
  4. 若所有格式都不识别,且need_retry为真(内核启用了模块支持,可能只是缺少对应的binfmt模块),并且文件头前4个字节不全是可打印字符(即不像文本文件),则通过request_module动态加载binfmt-xxxx模块(xxxx是bprm->buf第2、3字节组成的16位值的十六进制表示),然后再重试一次。

其中formats链表在内核初始化时注册了elf_format、script_format等。例如ELF格式定义为:

c

/*
 * Binary-format descriptor for ELF: names the callbacks used to load an
 * executable, load a shared library and write a core dump.
 */
static struct linux_binfmt elf_format = {
    .module     = THIS_MODULE,
    .load_binary = load_elf_binary,
    .load_shlib = load_elf_library,
    .core_dump  = elf_core_dump,
    .min_coredump = ELF_EXEC_PAGESIZE,
};

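脚本格式(#!解释器)的load_binary会解析第一行,提取解释器路径,打开解释器文件并设置bprm->interpreter,然后返回0。外层exec_binprm的循环发现bprm->interpreter非空,便把bprm->file换成解释器文件并再次调用search_binary_handler。(注意不能返回-ENOEXEC:那样search_binary_handler只会继续尝试下一种格式,而exec_binprm遇到负返回值会直接失败返回。)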


第三章:ELF加载器深度解析(load_elf_binary)

ELF(Executable and Linkable Format)是Linux上最常用的可执行文件格式。load_elf_binary是Linux内核中最复杂、最关键的代码之一,负责将ELF文件映射到进程地址空间,并跳转到入口点。

由于源码过长,我们将它拆解成若干步骤,并提取核心部分讲解。

3.1 一致性检查与头部解析

c

static int load_elf_binary(struct linux_binprm *bprm)
{
    struct file *interpreter = NULL;
    unsigned long load_bias = 0, phdr_addr = 0;
    int first_pt_load = 1;
    unsigned long error;
    struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL;
    struct elf_phdr *elf_property_phdata = NULL;
    unsigned long elf_brk;
    int retval, i;
    unsigned long elf_entry;
    unsigned long e_entry;
    unsigned long interp_load_addr = 0;
    ...  // 省略变量声明
    struct elfhdr *elf_ex = (struct elfhdr *)bprm->buf;  // 头部已在prepare_binprm中读入

    retval = -ENOEXEC;
    if (memcmp(elf_ex->e_ident, ELFMAG, SELFMAG) != 0)
        goto out;
    if (elf_ex->e_type != ET_EXEC && elf_ex->e_type != ET_DYN)
        goto out;
    if (!elf_check_arch(elf_ex))
        goto out;
    if (elf_check_fdpic(elf_ex))
        goto out;
    if (!bprm->file->f_op->mmap)
        goto out;
  • 首先验证ELF魔数\177ELF、文件类型(可执行或共享对象,ET_EXEC为静态位置可执行文件,ET_DYN为位置无关可执行PIE或共享库)、体系架构匹配、非FDPIC(另一种嵌入式ABI)、文件系统支持内存映射。
  • 加载程序头表(Program Header Table):load_elf_phdrs读取所有程序头。

3.2 处理PT_INTERP:动态链接器的识别

c

    elf_phdata = load_elf_phdrs(elf_ex, bprm->file);
    if (!elf_phdata)
        goto out;

    elf_ppnt = elf_phdata;
    for (i = 0; i < elf_ex->e_phnum; i++, elf_ppnt++) {
        char *elf_interpreter;

        if (elf_ppnt->p_type == PT_GNU_PROPERTY) {
            elf_property_phdata = elf_ppnt;
            continue;
        }

        if (elf_ppnt->p_type != PT_INTERP)
            continue;

        // 读取解释器路径(如 /lib64/ld-linux-x86-64.so.2)
        retval = -ENOEXEC;
        if (elf_ppnt->p_filesz > PATH_MAX || elf_ppnt->p_filesz < 2)
            goto out_free_ph;

        elf_interpreter = kmalloc(elf_ppnt->p_filesz, GFP_KERNEL);
        if (!elf_interpreter)
            goto out_free_ph;

        retval = elf_read(bprm->file, elf_interpreter, elf_ppnt->p_filesz,
                          elf_ppnt->p_offset);
        if (retval < 0)
            goto out_free_interp;
        if (elf_interpreter[elf_ppnt->p_filesz - 1] != '\0')
            goto out_free_interp;

        interpreter = open_exec(elf_interpreter);
        kfree(elf_interpreter);
        retval = PTR_ERR(interpreter);
        if (IS_ERR(interpreter))
            goto out_free_ph;

        would_dump(bprm, interpreter);

        interp_elf_ex = kmalloc(sizeof(*interp_elf_ex), GFP_KERNEL);
        if (!interp_elf_ex) {
            retval = -ENOMEM;
            goto out_free_file;
        }

        retval = elf_read(interpreter, interp_elf_ex,
                          sizeof(*interp_elf_ex), 0);
        if (retval < 0)
            goto out_free_dentry;
        break;
    }
  • 遍历程序头,找到PT_INTERP段,读取其中的动态链接器路径(例如/lib64/ld-linux-x86-64.so.2)。然后通过open_exec打开该文件,并将它的ELF头读入interp_elf_ex
  • 后续会加载解释器本身的程序头,并可能在地址空间布局时区分PIE程序和动态链接器。

3.3 处理GNU_STACK和属性段

c

    elf_ppnt = elf_phdata;
    for (i = 0; i < elf_ex->e_phnum; i++, elf_ppnt++)
        switch (elf_ppnt->p_type) {
        case PT_GNU_STACK:
            if (elf_ppnt->p_flags & PF_X)
                executable_stack = EXSTACK_ENABLE_X;
            else
                executable_stack = EXSTACK_DISABLE_X;
            break;
        case PT_LOPROC ... PT_HIPROC:
            retval = arch_elf_pt_proc(elf_ex, elf_ppnt,
                                      bprm->file, false,
                                      &arch_state);
            if (retval)
                goto out_free_dentry;
            break;
        }
  • PT_GNU_STACK控制栈是否可执行(用于NX保护)。默认情况下,现代Linux要求栈不可执行,但某些古老二进制可能要求可执行栈,内核会据此设置executable_stack
  • PT_LOPROC到PT_HIPROC是处理器特定的段,由体系架构代码处理(如ARM的.ARM.attributes)。

3.4 动态链接器的一致性检查

如果存在解释器(即动态链接情况),需要检查解释器的ELF头合法性、体系结构,并加载其程序头表。同样也会处理PT_GNU_PROPERTY(用于Intel CET等硬件特性)和PT_LOPROC

3.5 关键转换:begin_new_exec

c

    retval = begin_new_exec(bprm);
    if (retval)
        goto out_free_dentry;

begin_new_exec是execve的核心转折点,它负责:

  • 清空当前进程的内存映射(mm),但保留内核栈等。
  • 重置信号处理、线程信息、文件系统等。
  • 设置bprm->point_of_no_return = 1(从此无法回头)。
  • 复制新的凭证(根据setuid等)。
  • 更新进程的comm字段为文件名。

此函数调用后,原进程的用户空间上下文彻底消失。

3.6 设置新进程的内存布局

c

    SET_PERSONALITY2(*elf_ex, &arch_state);
    if (elf_read_implies_exec(*elf_ex, executable_stack))
        current->personality |= READ_IMPLIES_EXEC;

    if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
        current->flags |= PF_RANDOMIZE;

    setup_new_exec(bprm);

    retval = setup_arg_pages(bprm, randomize_stack_top(STACK_TOP),
                             executable_stack);
    if (retval < 0)
        goto out_free_dentry;
  • SET_PERSONALITY2设置进程的personality(如PER_LINUX、PER_LINUX32),可能影响内存布局、系统调用行为。对于32位兼容程序尤其重要。
  • elf_read_implies_exec:如果ELF标志指示整个地址空间可执行(老旧二进制),或栈要求可执行,则设置READ_IMPLIES_EXEC
  • PF_RANDOMIZE标志决定是否启用ASLR(地址空间布局随机化)。
  • setup_new_exec:进一步初始化新的执行环境(如清除FPU状态)。
  • setup_arg_pages:在用户栈顶部建立参数和环境变量区域,随机化栈顶位置。

3.7 映射ELF的LOAD段

这是最复杂的步骤。遍历所有PT_LOAD段,对于每个段,计算内存保护属性(PROT_READ, PROT_WRITE, PROT_EXEC)和映射标志(MAP_PRIVATE),然后调用elf_load(实际上是vm_mmap的封装)将文件内容映射到指定地址。

对于ET_EXEC(固定地址可执行文件),地址由p_vaddr直接指定;对于ET_DYN(PIE或动态库),需要计算随机化偏移load_bias

PIE处理的核心逻辑:

c

if (elf_ex->e_type == ET_DYN) {
    if (interpreter) {
        load_bias = ELF_ET_DYN_BASE;
        if (current->flags & PF_RANDOMIZE)
            load_bias += arch_mmap_rnd();
        alignment = maximum_alignment(elf_phdata, elf_ex->e_phnum);
        if (alignment)
            load_bias &= ~(alignment - 1);
        elf_flags |= MAP_FIXED_NOREPLACE;
    } else
        load_bias = 0;

    load_bias = ELF_PAGESTART(load_bias - vaddr);

    total_size = total_mapping_size(elf_phdata, elf_ex->e_phnum);
    // 然后映射整个总大小,使用MAP_FIXED_NOREPLACE,避免覆盖已有映射
}
  • 如果是动态链接的可执行程序(有PT_INTERP),load_bias设为ELF_ET_DYN_BASE(通常为TASK_SIZE / 3 * 2,即地址空间下半部靠上位置),并加入随机偏移(若ASLR开启)。然后调整load_bias,使其与vaddr对齐,保证映射后虚拟地址为load_bias + vaddr
  • 对于动态链接器本身(没有PT_INTERP),load_bias = 0,意味着它将被映射到由内核mmap随机选择的位置(不固定),这可以防止动态链接器与程序冲突。
  • MAP_FIXED_NOREPLACE标志要求映射不能覆盖已有的VMA,增强了安全性。

对于第一个PT_LOAD段,还会记录reloc_func_desc(即load_bias),供体系架构代码调整函数描述符(如PowerPC64)。

映射每个段后,更新start_code、end_code、start_data、end_data,以及elf_brk(程序的堆起始地址,由最后一个LOAD段的p_memsz后边界确定)。

3.8 加载解释器(若存在)

如果有动态链接器,调用load_elf_interp加载它。该函数与加载主程序类似,但行为更简单:它将解释器的LOAD段映射到进程空间,并返回加载的基址(interp_load_addr)。然后解释器入口点为interp_load_addr + interp_elf_ex->e_entry。否则,主程序入口点为e_entry + load_bias

3.9 设置进程的辅助向量(auxv)

create_elf_tables在用户栈上构建辅助向量(AT_PHDR、AT_ENTRY、AT_RANDOM等),这些是动态链接器和libc初始化所需的额外信息。例如AT_ENTRY告诉动态链接器主程序入口点。

3.10 最终调整和启动

c

    mm = current->mm;
    mm->end_code = end_code;
    mm->start_code = start_code;
    mm->start_data = start_data;
    mm->end_data = end_data;
    mm->start_stack = bprm->p;

    if ((current->flags & PF_RANDOMIZE) && (randomize_va_space > 1)) {
        if (IS_ENABLED(CONFIG_ARCH_HAS_ELF_RANDOMIZE) &&
            elf_ex->e_type == ET_DYN && !interpreter) {
            mm->brk = mm->start_brk = ELF_ET_DYN_BASE;
        }
        mm->brk = mm->start_brk = arch_randomize_brk(mm);
    }

    // 可选:映射零页(遗留的SVr4行为)
    if (current->personality & MMAP_PAGE_ZERO) {
        error = vm_mmap(NULL, 0, PAGE_SIZE, PROT_READ | PROT_EXEC,
                        MAP_FIXED | MAP_PRIVATE, 0);
    }

    regs = current_pt_regs();
#ifdef ELF_PLAT_INIT
    ELF_PLAT_INIT(regs, reloc_func_desc);
#endif

    finalize_exec(bprm);
    START_THREAD(elf_ex, regs, elf_entry, bprm->p);
    retval = 0;
out:
    return retval;
  • 设置mm结构中代码段、数据段、栈顶的位置。
  • 随机化堆的起始地址(brk),除非是静态链接的PIE且没有解释器的特殊情况(此时堆放在ELF_ET_DYN_BASE附近,避免与栈冲突)。
  • 通过ELF_PLAT_INIT宏设置平台相关的寄存器初始值(例如x86上会把通用寄存器清零;该宏只在定义了它的体系架构上生效)。
  • finalize_exec执行最后的清理(如向LSM通知状态变更)。
  • START_THREAD实际上是一系列操作的宏:设置用户态栈指针sp = bprm->p,设置程序计数器pc = elf_entry,然后通过start_thread切换上下文,最终通过ret_from_fork返回到用户空间执行新程序。

至此,init进程或任何后续execve的程序成功启动。


第四章:完整流程图与关键数据结构

我们可以用一幅图概括整个流程:

text

kernel_init()
    │
    ├── run_init_process(init_path)
    │       │
    │       └── kernel_execve()
    │               │
    │               ├── alloc_bprm()
    │               ├── copy_strings_to_stack()
    │               └── bprm_execve()
    │                       │
    │                       ├── prepare_bprm_creds()
    │                       ├── exec_binprm()
    │                       │       │
    │                       │       └── search_binary_handler()
    │                       │               │
    │                       │               ├── prepare_binprm()
    │                       │               ├── list_for_each_entry(fmt)
    │                       │               │       └── fmt->load_binary()
    │                       │               │               └── load_elf_binary()
    │                       │               │                       ├── 解析ELF和解释器
    │                       │               │                       ├── begin_new_exec()
    │                       │               │                       ├── setup_arg_pages()
    │                       │               │                       ├── 映射LOAD段
    │                       │               │                       ├── 加载解释器
    │                       │               │                       ├── create_elf_tables()
    │                       │               │                       └── START_THREAD()
    │                       │               └── request_module() [若需]
    │                       └── 返回用户空间

关键数据结构:

  • struct linux_binprm:存储执行程序的所有参数、环境、文件指针、内存准备状态。
  • struct linux_binfmt:二进制格式驱动,提供load_binary方法。
  • struct elfhdr / struct elf_phdr:ELF头和程序头。
  • struct mm_struct:进程内存描述符,exec时被重置。

第五章:常见问题与调试技巧

5.1 为什么init必须是PID=1?

kernel_init函数以内核线程运行,PID=1。执行execve后,它变成用户进程,但PID保持不变。PID=1是系统第一个用户进程,负责收养孤儿进程,处理SIGCHLD等,具有特殊的权限和角色。

5.2 如果init崩溃,系统会怎样?

如果init进程退出(主动调用exit或被致命信号杀死),内核在do_exit中检测到当前任务是全局init(is_global_init),会直接触发panic,打印“Attempted to kill init!”。因为PID=1负责收养孤儿进程,是系统正常运行的前提,一旦它退出,内核无法继续正常工作。这就是为什么“init进程不能死”。

5.3 如何调试内核execve过程?

  • 使用printk动态打印:在kernel_init、load_elf_binary等函数中添加pr_info,但需重新编译内核。
  • 使用ftrace跟踪函数调用:echo function > /sys/kernel/debug/tracing/current_tracer,然后通过set_ftrace_filter过滤exec*、load_elf_binary等函数。
  • 使用kgdb设置断点,或用kprobe做动态探测。
  • 用户态工具如strace可以跟踪execve系统调用,但无法查看内核内部细节。

5.4 为什么ELF加载中要使用MAP_FIXED_NOREPLACE?

传统的MAP_FIXED会无条件覆盖目标地址范围内已有的VMA,可能悄悄破坏已经建立的用户态映射(例如栈),造成难以察觉的内存破坏,甚至被恶意构造的二进制利用。MAP_FIXED_NOREPLACE在地址冲突时返回错误(-EEXIST)而不是覆盖,增加安全性。对于ET_EXEC固定地址程序,地址是硬编码的,可能与其他映射冲突(例如vdso),内核需要仔细处理,这也是现代内核改进的一环。

5.5 脚本文件如何执行?

脚本格式处理器的load_binary会读取第一行#! /path/to/interpreter,修改参数使解释器的参数中包含原脚本名,打开解释器文件并设置bprm->interpreter,然后返回0。外层exec_binprm循环看到bprm->interpreter非空,就把bprm->file替换为解释器文件并重新搜索二进制格式,此时解释器是真正的二进制(如/bin/sh),再次进入load_elf_binary或其它格式加载。整个过程对用户透明。


第六章:总结与现代内核的演进

通过以上源码分析,我们完整地走通了从内核启动到用户态init进程的第一行指令的全过程。这不仅是一次代码漫步,更是理解Linux内核进程管理、内存管理、文件系统交互的绝佳案例。

从kernel_init到load_elf_binary,我们看到了:

  • 内核如何逐步释放初始化资源,将自己化为只读并进入运行状态。
  • execve机制如何精心准备用户栈、参数、环境,并安全地切换执行上下文。
  • 二进制格式的插件化设计如何优雅地支持多样化和嵌套解释器。
  • ELF加载器的复杂性,尤其是PIE、ASLR、动态链接器加载等现代操作系统安全特性的实现细节。

Linux内核发展的数十年间,这个流程不断演进:增加了MAP_FIXED_NOREPLACE、PT_GNU_PROPERTY、CET支持、随机化的brk等等,但核心架构保持稳定。理解这一过程,对于从事系统开发、性能优化、安全研究的人来说,都是不可或缺的基础。

最后,不妨在自己的Linux机器上尝试strace /bin/init(如果系统允许)或者用gdb跟踪一个简单程序的execve,观察前述的数据结构和调用顺序。理论结合实践,才能深入掌握这个操作系统的“起搏器”。

#源码

{
	int ret;

	/*
	 * Wait until kthreadd is all set-up.
	 */
	wait_for_completion(&kthreadd_done);

	kernel_init_freeable();
	/* need to finish all async __init code before freeing the memory */
	async_synchronize_full();

	system_state = SYSTEM_FREEING_INITMEM;
	kprobe_free_init_mem();
	ftrace_free_init_mem();
	kgdb_free_init_mem();
	exit_boot_config();
	free_initmem();
	mark_readonly();

	/*
	 * Kernel mappings are now finalized - update the userspace page-table
	 * to finalize PTI.
	 */
	pti_finalize();

	system_state = SYSTEM_RUNNING;
	numa_default_policy();

	rcu_end_inkernel_boot();

	do_sysctl_args();

	if (ramdisk_execute_command) {
		ret = run_init_process(ramdisk_execute_command);
		if (!ret)
			return 0;
		pr_err("Failed to execute %s (error %d)\n",
		       ramdisk_execute_command, ret);
	}

	/*
	 * We try each of these until one succeeds.
	 *
	 * The Bourne shell can be used instead of init if we are
	 * trying to recover a really broken machine.
	 */
	if (execute_command) {
		ret = run_init_process(execute_command);
		if (!ret)
			return 0;
		panic("Requested init %s failed (error %d).",
		      execute_command, ret);
	}

	if (CONFIG_DEFAULT_INIT[0] != '\0') {
		ret = run_init_process(CONFIG_DEFAULT_INIT);
		if (ret)
			pr_err("Default init %s failed (error %d)\n",
			       CONFIG_DEFAULT_INIT, ret);
		else
			return 0;
	}

	if (!try_to_run_init_process("/sbin/init") ||
	    !try_to_run_init_process("/etc/init") ||
	    !try_to_run_init_process("/bin/init") ||
	    !try_to_run_init_process("/bin/sh"))
		return 0;

	panic("No working init found.  Try passing init= option to kernel. "
	      "See Linux Documentation/admin-guide/init.rst for guidance.");
}

/*
 * Log the init candidate together with its argument and environment
 * vectors, then hand it to kernel_execve().
 *
 * Returns whatever kernel_execve() returns.
 */
static int run_init_process(const char *init_filename)
{
	const char *const *arg;
	const char *const *env;

	/* The candidate path becomes the first entry of the argument vector */
	argv_init[0] = init_filename;
	pr_info("Run %s as init process\n", init_filename);

	pr_debug("  with arguments:\n");
	arg = argv_init;
	while (*arg) {
		pr_debug("    %s\n", *arg);
		arg++;
	}

	pr_debug("  with environment:\n");
	env = envp_init;
	while (*env) {
		pr_debug("    %s\n", *env);
		env++;
	}

	return kernel_execve(init_filename, argv_init, envp_init);
}

/*
 * Probe one init candidate through run_init_process().
 *
 * A result of -ENOENT is passed back silently; any other non-zero result
 * is logged before being returned. Returns the run_init_process() result
 * unchanged.
 */
static int try_to_run_init_process(const char *init_filename)
{
	int err = run_init_process(init_filename);

	/* Success and "no such file" need no diagnostic */
	if (err == 0 || err == -ENOENT)
		return err;

	pr_err("Starting init: %s exists but couldn't execute it (error %d)\n",
	       init_filename, err);
	return err;
}


/*
 * Execute a program from inside the kernel, with the path, argument vector
 * and environment vector all given as kernel pointers.
 *
 * @kernel_filename: path of the program to execute
 * @argv:            NULL-terminated argument vector (must not be empty)
 * @envp:            NULL-terminated environment vector
 *
 * Returns the result of bprm_execve() once all setup steps succeed,
 * otherwise the negative error of the step that failed. The bprm and the
 * filename object are released on every path, including success.
 */
int kernel_execve(const char *kernel_filename,
		  const char *const *argv, const char *const *envp)
{
	struct filename *filename;
	struct linux_binprm *bprm;
	int fd = AT_FDCWD;
	int retval;

	/* It is non-sense for kernel threads to call execve */
	if (WARN_ON_ONCE(current->flags & PF_KTHREAD))
		return -EINVAL;

	filename = getname_kernel(kernel_filename);
	if (IS_ERR(filename))
		return PTR_ERR(filename);

	bprm = alloc_bprm(fd, filename, 0);
	if (IS_ERR(bprm)) {
		retval = PTR_ERR(bprm);
		goto out_ret;
	}

	/* An empty argv is treated as a caller bug: warn once and fail */
	retval = count_strings_kernel(argv);
	if (WARN_ON_ONCE(retval == 0))
		retval = -EINVAL;
	if (retval < 0)
		goto out_free;
	bprm->argc = retval;

	retval = count_strings_kernel(envp);
	if (retval < 0)
		goto out_free;
	bprm->envc = retval;

	retval = bprm_stack_limits(bprm);
	if (retval < 0)
		goto out_free;

	/* Copy order is: filename first, then environment, then arguments */
	retval = copy_string_kernel(bprm->filename, bprm);
	if (retval < 0)
		goto out_free;
	/* Record bprm->p right after the filename copy (presumably its location) */
	bprm->exec = bprm->p;

	retval = copy_strings_kernel(bprm->envc, envp, bprm);
	if (retval < 0)
		goto out_free;

	retval = copy_strings_kernel(bprm->argc, argv, bprm);
	if (retval < 0)
		goto out_free;

	retval = bprm_execve(bprm);
	/* Falls through: the labels below also run on the success path */
out_free:
	free_bprm(bprm);
out_ret:
	putname(filename);
	return retval;
}

/*
 * Run the exec described by @bprm: prepare credentials, mark the task as
 * being in execve, call the security hook and then exec_binprm().
 *
 * Returns the non-negative result of exec_binprm() on success, or the
 * error of the step that failed. On failure after bprm->point_of_no_return
 * has been set, a fatal SIGSEGV is forced unless a fatal signal is already
 * pending.
 */
static int bprm_execve(struct linux_binprm *bprm)
{
	int retval;

	/* Nothing to undo yet, so a failure here returns directly */
	retval = prepare_bprm_creds(bprm);
	if (retval)
		return retval;

	/*
	 * Check for unsafe execution states before exec_binprm(), which
	 * will call back into begin_new_exec(), into bprm_creds_from_file(),
	 * where setuid-ness is evaluated.
	 */
	check_unsafe_exec(bprm);
	current->in_execve = 1;
	sched_mm_cid_before_execve(current);

	sched_exec();

	/* Set the unchanging part of bprm->cred */
	retval = security_bprm_creds_for_exec(bprm);
	if (retval)
		goto out;

	retval = exec_binprm(bprm);
	if (retval < 0)
		goto out;

	sched_mm_cid_after_execve(current);
	/* execve succeeded */
	current->fs->in_exec = 0;
	current->in_execve = 0;
	rseq_execve(current);
	user_events_execve(current);
	acct_update_integrals(current);
	task_numa_free(current, false);
	return retval;

out:
	/*
	 * If past the point of no return ensure the code never
	 * returns to the userspace process.  Use an existing fatal
	 * signal if present otherwise terminate the process with
	 * SIGSEGV.
	 */
	if (bprm->point_of_no_return && !fatal_signal_pending(current))
		force_fatal_sig(SIGSEGV);

	/* Clear the same in-exec markers the success path clears */
	sched_mm_cid_after_execve(current);
	current->fs->in_exec = 0;
	current->in_execve = 0;

	return retval;
}


/*
 * Find a binary handler for @bprm, following interpreter substitutions.
 * binfmt handlers will call back into begin_new_exec() on success.
 *
 * Each time a handler sets bprm->interpreter, that file replaces bprm->file
 * and the search is repeated. Returns 0 on success, -ELOOP when the depth
 * limit is exceeded, -ENOEXEC, or the negative error returned by
 * search_binary_handler().
 */
static int exec_binprm(struct linux_binprm *bprm)
{
	pid_t old_pid, old_vpid;
	int ret, depth;

	/* Need to fetch pid before load_binary changes it */
	old_pid = current->pid;
	rcu_read_lock();
	old_vpid = task_pid_nr_ns(current, task_active_pid_ns(current->parent));
	rcu_read_unlock();

	/*
	 * This allows 4 levels of binfmt rewrites before failing hard.
	 * NOTE(review): the check below lets the body run for depth 0..5,
	 * i.e. up to six search_binary_handler() calls; the "4 levels"
	 * figure does not obviously match - confirm.
	 */
	for (depth = 0;; depth++) {
		struct file *exec;
		if (depth > 5)
			return -ELOOP;

		ret = search_binary_handler(bprm);
		if (ret < 0)
			return ret;
		/* No interpreter requested: the image itself was loaded */
		if (!bprm->interpreter)
			break;

		/* Make the interpreter the file to load on the next iteration */
		exec = bprm->file;
		bprm->file = bprm->interpreter;
		bprm->interpreter = NULL;

		allow_write_access(exec);
		if (unlikely(bprm->have_execfd)) {
			/* Only one original file can be kept in bprm->executable */
			if (bprm->executable) {
				fput(exec);
				return -ENOEXEC;
			}
			bprm->executable = exec;
		} else
			fput(exec);
	}

	/* Report the completed exec to audit, tracing, ptrace and proc connector */
	audit_bprm(bprm);
	trace_sched_process_exec(current, old_pid, bprm);
	ptrace_event(PTRACE_EVENT_EXEC, old_vpid);
	proc_exec_connector(current);
	return 0;
}

/*
 * cycle the list of binary formats handler, until one recognizes the image
 *
 * Returns the result of the first handler that either passed the point of
 * no return or returned anything other than -ENOEXEC. If no handler takes
 * the image, returns the last handler result (-ENOENT if none was tried).
 * When CONFIG_MODULES is enabled, one retry is made after requesting a
 * "binfmt-XXXX" module.
 */
static int search_binary_handler(struct linux_binprm *bprm)
{
	bool need_retry = IS_ENABLED(CONFIG_MODULES);
	struct linux_binfmt *fmt;
	int retval;

	retval = prepare_binprm(bprm);
	if (retval < 0)
		return retval;

	retval = security_bprm_check(bprm);
	if (retval)
		return retval;

	retval = -ENOENT;
 retry:
	read_lock(&binfmt_lock);
	list_for_each_entry(fmt, &formats, lh) {
		/* Skip formats whose module reference cannot be taken */
		if (!try_module_get(fmt->module))
			continue;
		/* The lock is dropped while the handler runs, re-taken afterwards */
		read_unlock(&binfmt_lock);

		retval = fmt->load_binary(bprm);

		read_lock(&binfmt_lock);
		put_binfmt(fmt);
		/* -ENOEXEC before the point of no return means "try the next format" */
		if (bprm->point_of_no_return || (retval != -ENOEXEC)) {
			read_unlock(&binfmt_lock);
			return retval;
		}
	}
	read_unlock(&binfmt_lock);

	if (need_retry) {
		/* Four printable leading bytes: give up without requesting a module */
		if (printable(bprm->buf[0]) && printable(bprm->buf[1]) &&
		    printable(bprm->buf[2]) && printable(bprm->buf[3]))
			return retval;
		/* Module name is built from the 16-bit value at buf offset 2 */
		if (request_module("binfmt-%04x", *(ushort *)(bprm->buf + 2)) < 0)
			return retval;
		/* Retry the whole list exactly once */
		need_retry = false;
		goto retry;
	}

	return retval;
}


/*
 * Binary-format descriptor for ELF: names the callbacks used to load an
 * executable and a shared library. The core-dump callback and minimum
 * core-dump size are only present when CONFIG_COREDUMP is defined.
 */
static struct linux_binfmt elf_format = {
	.module		= THIS_MODULE,
	.load_binary	= load_elf_binary,
	.load_shlib	= load_elf_library,
#ifdef CONFIG_COREDUMP
	.core_dump	= elf_core_dump,
	.min_coredump	= ELF_EXEC_PAGESIZE,
#endif
};


/*
 * load_elf_binary - load_binary handler for ELF images.
 * @bprm: exec state; bprm->buf holds the start of the file and is
 *        interpreted here as the ELF header, bprm->file is the image.
 *
 * Steps, in order:
 *  1. sanity-check the ELF header and read the program headers;
 *  2. if a PT_INTERP header exists, open the interpreter it names and
 *     read and validate its ELF header and program headers;
 *  3. let arch code inspect processor-specific headers and veto the image;
 *  4. call begin_new_exec(), set personality and set up the argument pages;
 *  5. map every PT_LOAD segment and derive code/data/brk boundaries;
 *  6. map the interpreter (if any), build the ELF tables, record the
 *     layout in current->mm and call START_THREAD().
 *
 * Returns 0 on success or a negative errno. -ENOEXEC is returned when the
 * image fails the initial header checks.
 */
static int load_elf_binary(struct linux_binprm *bprm)
{
	struct file *interpreter = NULL; /* stays NULL when no PT_INTERP is found */
	unsigned long load_bias = 0, phdr_addr = 0;
	int first_pt_load = 1;	/* cleared after the first PT_LOAD is mapped */
	unsigned long error;
	struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL;
	struct elf_phdr *elf_property_phdata = NULL;
	unsigned long elf_brk;
	int retval, i;
	unsigned long elf_entry;	/* address handed to START_THREAD() */
	unsigned long e_entry;		/* main image entry, passed to create_elf_tables() */
	unsigned long interp_load_addr = 0;
	unsigned long start_code, end_code, start_data, end_data;
	unsigned long reloc_func_desc __maybe_unused = 0;
	int executable_stack = EXSTACK_DEFAULT;
	struct elfhdr *elf_ex = (struct elfhdr *)bprm->buf;
	struct elfhdr *interp_elf_ex = NULL;
	struct arch_elf_state arch_state = INIT_ARCH_ELF_STATE;
	struct mm_struct *mm;
	struct pt_regs *regs;

	retval = -ENOEXEC;
	/* First of all, some simple consistency checks */
	if (memcmp(elf_ex->e_ident, ELFMAG, SELFMAG) != 0)
		goto out;

	/* Only ET_EXEC and ET_DYN images are accepted. */
	if (elf_ex->e_type != ET_EXEC && elf_ex->e_type != ET_DYN)
		goto out;
	if (!elf_check_arch(elf_ex))
		goto out;
	if (elf_check_fdpic(elf_ex))
		goto out;
	/* The segments are mmap()ed below, so the file must support mmap. */
	if (!bprm->file->f_op->mmap)
		goto out;

	/* A NULL result is reported as -ENOEXEC (retval is still set from above). */
	elf_phdata = load_elf_phdrs(elf_ex, bprm->file);
	if (!elf_phdata)
		goto out;

	/*
	 * First pass over the program headers: remember PT_GNU_PROPERTY and
	 * open the interpreter named by the first PT_INTERP, if any.
	 */
	elf_ppnt = elf_phdata;
	for (i = 0; i < elf_ex->e_phnum; i++, elf_ppnt++) {
		char *elf_interpreter;

		if (elf_ppnt->p_type == PT_GNU_PROPERTY) {
			elf_property_phdata = elf_ppnt;
			continue;
		}

		if (elf_ppnt->p_type != PT_INTERP)
			continue;

		/*
		 * This is the program interpreter named by PT_INTERP. Its
		 * path is read from the file below; p_filesz must not exceed
		 * PATH_MAX and must hold at least one character plus the
		 * terminating NUL.
		 */
		retval = -ENOEXEC;
		if (elf_ppnt->p_filesz > PATH_MAX || elf_ppnt->p_filesz < 2)
			goto out_free_ph;

		retval = -ENOMEM;
		elf_interpreter = kmalloc(elf_ppnt->p_filesz, GFP_KERNEL);
		if (!elf_interpreter)
			goto out_free_ph;

		retval = elf_read(bprm->file, elf_interpreter, elf_ppnt->p_filesz,
				  elf_ppnt->p_offset);
		if (retval < 0)
			goto out_free_interp;
		/* make sure path is NUL terminated */
		retval = -ENOEXEC;
		if (elf_interpreter[elf_ppnt->p_filesz - 1] != '\0')
			goto out_free_interp;

		/* The path buffer is no longer needed once the file is opened. */
		interpreter = open_exec(elf_interpreter);
		kfree(elf_interpreter);
		retval = PTR_ERR(interpreter);
		if (IS_ERR(interpreter))
			goto out_free_ph;

		/*
		 * If the binary is not readable then enforce mm->dumpable = 0
		 * regardless of the interpreter's permissions.
		 */
		would_dump(bprm, interpreter);

		interp_elf_ex = kmalloc(sizeof(*interp_elf_ex), GFP_KERNEL);
		if (!interp_elf_ex) {
			retval = -ENOMEM;
			goto out_free_file;
		}

		/* Get the exec headers */
		retval = elf_read(interpreter, interp_elf_ex,
				  sizeof(*interp_elf_ex), 0);
		if (retval < 0)
			goto out_free_dentry;

		/* Only the first PT_INTERP header is honoured. */
		break;

		/* Error exit used only while the path buffer is still allocated. */
out_free_interp:
		kfree(elf_interpreter);
		goto out_free_ph;
	}

	/*
	 * Second pass: pick up the stack executability request and hand
	 * processor-specific headers of the main image to arch code.
	 */
	elf_ppnt = elf_phdata;
	for (i = 0; i < elf_ex->e_phnum; i++, elf_ppnt++)
		switch (elf_ppnt->p_type) {
		case PT_GNU_STACK:
			if (elf_ppnt->p_flags & PF_X)
				executable_stack = EXSTACK_ENABLE_X;
			else
				executable_stack = EXSTACK_DISABLE_X;
			break;

		case PT_LOPROC ... PT_HIPROC:
			retval = arch_elf_pt_proc(elf_ex, elf_ppnt,
						  bprm->file, false,
						  &arch_state);
			if (retval)
				goto out_free_dentry;
			break;
		}

	/* Some simple consistency checks for the interpreter */
	if (interpreter) {
		/* Every failure in this block is reported as -ELIBBAD unless overwritten. */
		retval = -ELIBBAD;
		/* Not an ELF interpreter */
		if (memcmp(interp_elf_ex->e_ident, ELFMAG, SELFMAG) != 0)
			goto out_free_dentry;
		/* Verify the interpreter has a valid arch */
		if (!elf_check_arch(interp_elf_ex) ||
		    elf_check_fdpic(interp_elf_ex))
			goto out_free_dentry;

		/* Load the interpreter program headers */
		interp_elf_phdata = load_elf_phdrs(interp_elf_ex,
						   interpreter);
		if (!interp_elf_phdata)
			goto out_free_dentry;

		/*
		 * Pass PT_LOPROC..PT_HIPROC headers to arch code.
		 * Any PT_GNU_PROPERTY found in the main image is forgotten
		 * here: when an interpreter exists, only the interpreter's
		 * property header is passed to parse_elf_properties() below.
		 */
		elf_property_phdata = NULL;
		elf_ppnt = interp_elf_phdata;
		for (i = 0; i < interp_elf_ex->e_phnum; i++, elf_ppnt++)
			switch (elf_ppnt->p_type) {
			case PT_GNU_PROPERTY:
				elf_property_phdata = elf_ppnt;
				break;

			case PT_LOPROC ... PT_HIPROC:
				retval = arch_elf_pt_proc(interp_elf_ex,
							  elf_ppnt, interpreter,
							  true, &arch_state);
				if (retval)
					goto out_free_dentry;
				break;
			}
	}

	/* The file argument matches whichever image supplied elf_property_phdata. */
	retval = parse_elf_properties(interpreter ?: bprm->file,
				      elf_property_phdata, &arch_state);
	if (retval)
		goto out_free_dentry;

	/*
	 * Allow arch code to reject the ELF at this point, whilst it's
	 * still possible to return an error to the code that invoked
	 * the exec syscall.
	 */
	retval = arch_check_elf(elf_ex,
				!!interpreter, interp_elf_ex,
				&arch_state);
	if (retval)
		goto out_free_dentry;

	/* Flush all traces of the currently running executable */
	retval = begin_new_exec(bprm);
	if (retval)
		goto out_free_dentry;

	/* Do this immediately, since STACK_TOP as used in setup_arg_pages
	   may depend on the personality.  */
	SET_PERSONALITY2(*elf_ex, &arch_state);
	if (elf_read_implies_exec(*elf_ex, executable_stack))
		current->personality |= READ_IMPLIES_EXEC;

	/* Randomize unless the personality or the global knob forbids it. */
	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
		current->flags |= PF_RANDOMIZE;

	setup_new_exec(bprm);

	/* Do this so that we can load the interpreter, if need be.  We will
	   change some of these later */
	retval = setup_arg_pages(bprm, randomize_stack_top(STACK_TOP),
				 executable_stack);
	if (retval < 0)
		goto out_free_dentry;

	elf_brk = 0;

	/*
	 * Running extremes over all PT_LOAD segments, filled in by the loop
	 * below using unbiased p_vaddr values; load_bias is added afterwards.
	 */
	start_code = ~0UL;
	end_code = 0;
	start_data = 0;
	end_data = 0;

	/* Now we do a little grungy work by mmapping the ELF image into
	   the correct location in memory. */
	for(i = 0, elf_ppnt = elf_phdata;
	    i < elf_ex->e_phnum; i++, elf_ppnt++) {
		int elf_prot, elf_flags;
		unsigned long k, vaddr;
		unsigned long total_size = 0;	/* non-zero only for the first ET_DYN PT_LOAD */
		unsigned long alignment;

		if (elf_ppnt->p_type != PT_LOAD)
			continue;

		elf_prot = make_prot(elf_ppnt->p_flags, &arch_state,
				     !!interpreter, false);

		elf_flags = MAP_PRIVATE;

		vaddr = elf_ppnt->p_vaddr;
		/*
		 * The first time through the loop, first_pt_load is true:
		 * layout will be calculated. Once set, use MAP_FIXED since
		 * we know we've already safely mapped the entire region with
		 * MAP_FIXED_NOREPLACE in the once-per-binary logic following.
		 */
		if (!first_pt_load) {
			elf_flags |= MAP_FIXED;
		} else if (elf_ex->e_type == ET_EXEC) {
			/*
			 * This logic is run once for the first LOAD Program
			 * Header for ET_EXEC binaries. No special handling
			 * is needed.
			 */
			elf_flags |= MAP_FIXED_NOREPLACE;
		} else if (elf_ex->e_type == ET_DYN) {
			/*
			 * This logic is run once for the first LOAD Program
			 * Header for ET_DYN binaries to calculate the
			 * randomization (load_bias) for all the LOAD
			 * Program Headers.
			 *
			 * There are effectively two types of ET_DYN
			 * binaries: programs (i.e. PIE: ET_DYN with INTERP)
			 * and loaders (ET_DYN without INTERP, since they
			 * _are_ the ELF interpreter). The loaders must
			 * be loaded away from programs since the program
			 * may otherwise collide with the loader (especially
			 * for ET_EXEC which does not have a randomized
			 * position). For example to handle invocations of
			 * "./ld.so someprog" to test out a new version of
			 * the loader, the subsequent program that the
			 * loader loads must avoid the loader itself, so
			 * they cannot share the same load range. Sufficient
			 * room for the brk must be allocated with the
			 * loader as well, since brk must be available with
			 * the loader.
			 *
			 * Therefore, programs are loaded offset from
			 * ELF_ET_DYN_BASE and loaders are loaded into the
			 * independently randomized mmap region (0 load_bias
			 * without MAP_FIXED nor MAP_FIXED_NOREPLACE).
			 */
			if (interpreter) {
				load_bias = ELF_ET_DYN_BASE;
				if (current->flags & PF_RANDOMIZE)
					load_bias += arch_mmap_rnd();
				/* Round the bias down to the alignment reported for the image. */
				alignment = maximum_alignment(elf_phdata, elf_ex->e_phnum);
				if (alignment)
					load_bias &= ~(alignment - 1);
				elf_flags |= MAP_FIXED_NOREPLACE;
			} else
				load_bias = 0;

			/*
			 * Since load_bias is used for all subsequent loading
			 * calculations, we must lower it by the first vaddr
			 * so that the remaining calculations based on the
			 * ELF vaddrs will be correctly offset. The result
			 * is then page aligned.
			 */
			load_bias = ELF_PAGESTART(load_bias - vaddr);

			/*
			 * Calculate the entire size of the ELF mapping
			 * (total_size), used for the initial mapping only:
			 * first_pt_load is cleared below once that initial
			 * mapping is performed, so later segments pass 0.
			 *
			 * Note that this is only sensible when the LOAD
			 * segments are contiguous (or overlapping). If
			 * used for LOADs that are far apart, this would
			 * cause the holes between LOADs to be mapped,
			 * running the risk of having the mapping fail,
			 * as it would be larger than the ELF file itself.
			 *
			 * As a result, only ET_DYN does this, since
			 * some ET_EXEC (e.g. ia64) may have large virtual
			 * memory holes between LOADs.
			 *
			 */
			total_size = total_mapping_size(elf_phdata,
							elf_ex->e_phnum);
			if (!total_size) {
				retval = -EINVAL;
				goto out_free_dentry;
			}
		}

		/* On success, 'error' holds the address returned by elf_load(). */
		error = elf_load(bprm->file, load_bias + vaddr, elf_ppnt,
				elf_prot, elf_flags, total_size);
		if (BAD_ADDR(error)) {
			retval = IS_ERR_VALUE(error) ?
				PTR_ERR((void*)error) : -EINVAL;
			goto out_free_dentry;
		}

		if (first_pt_load) {
			first_pt_load = 0;
			if (elf_ex->e_type == ET_DYN) {
				/*
				 * Correct load_bias by the difference between
				 * the address actually returned and the one
				 * that was requested.
				 */
				load_bias += error -
				             ELF_PAGESTART(load_bias + vaddr);
				reloc_func_desc = load_bias;
			}
		}

		/*
		 * Figure out which segment in the file contains the Program
		 * Header table, and map to the associated memory address.
		 */
		if (elf_ppnt->p_offset <= elf_ex->e_phoff &&
		    elf_ex->e_phoff < elf_ppnt->p_offset + elf_ppnt->p_filesz) {
			phdr_addr = elf_ex->e_phoff - elf_ppnt->p_offset +
				    elf_ppnt->p_vaddr;
		}

		/*
		 * start_code: lowest p_vaddr of an executable segment.
		 * start_data: highest p_vaddr of any segment.
		 */
		k = elf_ppnt->p_vaddr;
		if ((elf_ppnt->p_flags & PF_X) && k < start_code)
			start_code = k;
		if (start_data < k)
			start_data = k;

		/*
		 * Check to see if the section's size will overflow the
		 * allowed task size. A segment with p_filesz > p_memsz is
		 * rejected as well, so only p_memsz needs the range check.
		 */
		if (BAD_ADDR(k) || elf_ppnt->p_filesz > elf_ppnt->p_memsz ||
		    elf_ppnt->p_memsz > TASK_SIZE ||
		    TASK_SIZE - elf_ppnt->p_memsz < k) {
			/* set_brk can never work. Avoid overflows. */
			retval = -EINVAL;
			goto out_free_dentry;
		}

		/* End of the file-backed part of this segment. */
		k = elf_ppnt->p_vaddr + elf_ppnt->p_filesz;

		if ((elf_ppnt->p_flags & PF_X) && end_code < k)
			end_code = k;
		if (end_data < k)
			end_data = k;
		/* End of the in-memory segment; the highest one becomes elf_brk. */
		k = elf_ppnt->p_vaddr + elf_ppnt->p_memsz;
		if (k > elf_brk)
			elf_brk = k;
	}

	/* Convert everything gathered above from ELF vaddrs to load addresses. */
	e_entry = elf_ex->e_entry + load_bias;
	phdr_addr += load_bias;
	elf_brk += load_bias;
	start_code += load_bias;
	end_code += load_bias;
	start_data += load_bias;
	end_data += load_bias;

	current->mm->start_brk = current->mm->brk = ELF_PAGEALIGN(elf_brk);

	if (interpreter) {
		elf_entry = load_elf_interp(interp_elf_ex,
					    interpreter,
					    load_bias, interp_elf_phdata,
					    &arch_state);
		if (!IS_ERR_VALUE(elf_entry)) {
			/*
			 * load_elf_interp() returns relocation
			 * adjustment
			 */
			interp_load_addr = elf_entry;
			elf_entry += interp_elf_ex->e_entry;
		}
		if (BAD_ADDR(elf_entry)) {
			retval = IS_ERR_VALUE(elf_entry) ?
					(int)elf_entry : -EINVAL;
			goto out_free_dentry;
		}
		reloc_func_desc = interp_load_addr;

		/*
		 * The interpreter file and its headers are released here.
		 * Later failures therefore jump to 'out' rather than to the
		 * cleanup labels, which would release them a second time.
		 */
		allow_write_access(interpreter);
		fput(interpreter);

		kfree(interp_elf_ex);
		kfree(interp_elf_phdata);
	} else {
		/* No interpreter: execution starts at the image's own entry point. */
		elf_entry = e_entry;
		if (BAD_ADDR(elf_entry)) {
			retval = -EINVAL;
			goto out_free_dentry;
		}
	}

	/* Program headers are not referenced past this point. */
	kfree(elf_phdata);

	set_binfmt(&elf_format);

#ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES
	/* 'interpreter' is only tested for non-NULL here; it is not dereferenced. */
	retval = ARCH_SETUP_ADDITIONAL_PAGES(bprm, elf_ex, !!interpreter);
	if (retval < 0)
		goto out;
#endif /* ARCH_HAS_SETUP_ADDITIONAL_PAGES */

	/* Receives the main image's entry (e_entry), not elf_entry. */
	retval = create_elf_tables(bprm, elf_ex, interp_load_addr,
				   e_entry, phdr_addr);
	if (retval < 0)
		goto out;

	/* Publish the layout computed by the PT_LOAD loop. */
	mm = current->mm;
	mm->end_code = end_code;
	mm->start_code = start_code;
	mm->start_data = start_data;
	mm->end_data = end_data;
	mm->start_stack = bprm->p;

	if ((current->flags & PF_RANDOMIZE) && (randomize_va_space > 1)) {
		/*
		 * For architectures with ELF randomization, when executing
		 * a loader directly (i.e. no interpreter listed in ELF
		 * headers), move the brk area out of the mmap region
		 * (since it grows up, and may collide early with the stack
		 * growing down), and into the unused ELF_ET_DYN_BASE region.
		 */
		if (IS_ENABLED(CONFIG_ARCH_HAS_ELF_RANDOMIZE) &&
		    elf_ex->e_type == ET_DYN && !interpreter) {
			mm->brk = mm->start_brk = ELF_ET_DYN_BASE;
		}

		/* Applied in both cases, on top of whatever brk was chosen above. */
		mm->brk = mm->start_brk = arch_randomize_brk(mm);
#ifdef compat_brk_randomized
		current->brk_randomized = 1;
#endif
	}

	if (current->personality & MMAP_PAGE_ZERO) {
		/* Why this, you ask???  Well SVr4 maps page 0 as read-only,
		   and some applications "depend" upon this behavior.
		   Since we do not have the power to recompile these, we
		   emulate the SVr4 behavior. Sigh. */
		/* The result is stored but never checked: failure is not reported. */
		error = vm_mmap(NULL, 0, PAGE_SIZE, PROT_READ | PROT_EXEC,
				MAP_FIXED | MAP_PRIVATE, 0);
	}

	regs = current_pt_regs();
#ifdef ELF_PLAT_INIT
	/*
	 * The ABI may specify that certain registers be set up in special
	 * ways (on i386 %edx is the address of a DT_FINI function, for
	 * example.  In addition, it may also specify (eg, PowerPC64 ELF)
	 * that the e_entry field is the address of the function descriptor
	 * for the startup routine, rather than the address of the startup
	 * routine itself.  This macro performs whatever initialization to
	 * the regs structure is required as well as any relocations to the
	 * function descriptor entries when executing dynamically links apps.
	 */
	ELF_PLAT_INIT(regs, reloc_func_desc);
#endif

	finalize_exec(bprm);
	/* elf_entry is the interpreter's entry when one was loaded, else e_entry. */
	START_THREAD(elf_ex, regs, elf_entry, bprm->p);
	retval = 0;
out:
	return retval;

	/*
	 * error cleanup: the labels fall through from the most to the least
	 * state acquired, and every path ends at 'out' with retval preset.
	 */
out_free_dentry:
	kfree(interp_elf_ex);
	kfree(interp_elf_phdata);
out_free_file:
	/* 'interpreter' may still be NULL here, hence the check before fput(). */
	allow_write_access(interpreter);
	if (interpreter)
		fput(interpreter);
out_free_ph:
	kfree(elf_phdata);
	goto out;
}