Android10 Framework—Init进程-16.init进程对子进程异常监控

对于init启动的service，如果没有指定oneshot（一次性）的option，则默认为常驻service。在此情况下，当service意外退出，init将监控到并重启service。

监控服务异常退出

// Excerpt of init's second-stage entry point. Only the call that installs the
// signalfd-based signal handler on the epoll instance is shown; the rest is elided.
int SecondStageMain(int argc, char** argv) {
    ...省略代码 

    // Route child-exit signals through epoll (implemented by InstallSignalFdHandler).
    InstallSignalFdHandler(&epoll);

    ...省略代码 
}

init二阶段中通过调用InstallSignalFdHandler监听服务异常退出。

// Sets up signal handling for init through a signalfd watched by |epoll|.
// SIGCHLD (plus SIGTERM when init is not reboot capable) is blocked for normal
// delivery, unblocked again in forked children, and dispatched to HandleSignalFd
// whenever the signalfd becomes readable. Any setup failure is fatal.
static void InstallSignalFdHandler(Epoll* epoll) {
    // Keep the default SIGCHLD disposition but set SA_NOCLDSTOP, which prevents the
    // signalfd from receiving SIGCHLD when a child merely stops or continues
    // (b/77867680#comment9).
    const struct sigaction sigchld_action { .sa_handler = SIG_DFL, .sa_flags = SA_NOCLDSTOP };
    sigaction(SIGCHLD, &sigchld_action, nullptr);

    sigset_t blocked_signals;
    sigemptyset(&blocked_signals);
    sigaddset(&blocked_signals, SIGCHLD);

    // Without the CAP_SYS_BOOT capability init is running in a container; in that
    // case receiving SIGTERM will cause the system to shut down, so watch it too.
    const bool can_reboot = IsRebootCapable();
    if (!can_reboot) {
        sigaddset(&blocked_signals, SIGTERM);
    }

    const int block_status = sigprocmask(SIG_BLOCK, &blocked_signals, nullptr);
    if (block_status == -1) {
        PLOG(FATAL) << "failed to block signals";
    }

    // Child processes get the signals unblocked again via the atfork child hook.
    if (const int atfork_status = pthread_atfork(nullptr, nullptr, &UnblockSignals);
        atfork_status != 0) {
        LOG(FATAL) << "Failed to register a fork handler: " << strerror(atfork_status);
    }

    signal_fd = signalfd(-1, &blocked_signals, SFD_CLOEXEC);
    if (signal_fd == -1) {
        PLOG(FATAL) << "failed to create signalfd";
    }

    // Have epoll invoke HandleSignalFd when the signalfd is ready.
    auto registration = epoll->RegisterHandler(signal_fd, HandleSignalFd);
    if (!registration) {
        LOG(FATAL) << registration.error();
    }
}

通过epoll监听信号SIGCHLD，当有此信号发生时表示有子进程（服务）退出，会回调HandleSignalFd函数；HandleSignalFd函数对于SIGCHLD信号的处理是调用ReapAnyOutstandingChildren函数

//system/core/init/sigchld_handler.cpp

// Keeps calling ReapOneProcess() until it returns false.
void ReapAnyOutstandingChildren() {
    bool reaped_one = true;
    while (reaped_one) {
        reaped_one = ReapOneProcess();
    }
}

// Excerpt (elided parts are marked "..."): handles one exited child identified by |pid|.
// The pid is first offered to PropertyChildReap, then to SubcontextChildReap; otherwise
// it is looked up in the ServiceList. The matching service is then reaped.
// Returns true on the path shown here.
static bool ReapOneProcess() {
    ...

    if (PropertyChildReap(pid)) {
        name = "Async property child";
    } else if (SubcontextChildReap(pid)) {
        name = "Subcontext";
    } else {
        // Find the service whose pid matches the exited child.
        service = ServiceList::GetInstance().FindService(pid, &Service::pid);

        ...
    }

    ...

    // NOTE(review): a null check on |service| is presumably part of the elided code above.
    service->Reap(siginfo);

    // A temporary service is removed from the service list once it has been reaped.
    if (service->flags() & SVC_TEMPORARY) {
        ServiceList::GetInstance().RemoveService(*service);
    }

    return true;
}

ReapOneProcess中通过服务的pid找到该Service对象，然后调用其Reap方法

// Handles the exit of this service's process, described by |siginfo|.
// It cleans up after the process (process group, descriptors, reap callbacks), resets the
// running state, and then either reports the service as "stopped" (disabled/reset services)
// or flags it SVC_RESTARTING, runs its onrestart commands and reports "restarting".
void Service::Reap(const siginfo_t& siginfo) {
    // Applies when the service is not oneshot, or when it is flagged for restart.
    if (!(flags_ & SVC_ONESHOT) || (flags_ & SVC_RESTART)) {
        // The service died: kill every process remaining in its process group.
        KillProcessGroup(SIGKILL);
    }

    // Remove any descriptor resources we may have created.
    std::for_each(descriptors_.begin(), descriptors_.end(),
                  std::bind(&DescriptorInfo::Clean, std::placeholders::_1));

    // Let every registered reap callback see the exit information.
    for (const auto& f : reap_callbacks_) {
        f(siginfo);
    }

    // For exec services, UnSetExec() presumably resets the is_exec_service_running_ flag.
    if (flags_ & SVC_EXEC) UnSetExec();
    // Temporary oneshot services need no further bookkeeping; return here.
    if (flags_ & SVC_TEMPORARY) return;

    // The process is gone: clear the pid, the running flag and the start order.
    pid_ = 0;
    flags_ &= (~SVC_RUNNING);
    start_order_ = 0;

    // Oneshot processes go into the disabled state on exit,
    // except when manually restarted.
    // A service flagged SVC_RESTART is one that must be started again. Per the original
    // note, StopOrReset (not shown here) kills the process first and then sets
    // SVC_RESTART, so that the restart happens after the process has been reaped.
    if ((flags_ & SVC_ONESHOT) && !(flags_ & SVC_RESTART) && !(flags_ & SVC_RESET)) {
        flags_ |= SVC_DISABLED;
    }

    // Disabled and reset processes do not get restarted automatically.
    if (flags_ & (SVC_DISABLED | SVC_RESET))  {
        NotifyStateChange("stopped");
        return;
    }

    // If we crash > 4 times in 4 minutes or before boot_completed,
    // reboot into bootloader or set crashing property
    boot_clock::time_point now = boot_clock::now();
    if (((flags_ & SVC_CRITICAL) || !pre_apexd_) && !(flags_ & SVC_RESTART)) {
        bool boot_completed = android::base::GetBoolProperty("sys.boot_completed", false);
        if (now < time_crashed_ + 4min || !boot_completed) {
            if (++crash_count_ > 4) {
                if (flags_ & SVC_CRITICAL) {
                    // Aborts into bootloader
                    LOG(FATAL) << "critical process '" << name_ << "' exited 4 times "
                               << (boot_completed ? "in 4 minutes" : "before boot completed");
                } else {
                    LOG(ERROR) << "updatable process '" << name_ << "' exited 4 times "
                               << (boot_completed ? "in 4 minutes" : "before boot completed");
                    // Notifies update_verifier and apexd
                    property_set("ro.init.updatable_crashing_process_name", name_);
                    property_set("ro.init.updatable_crashing", "1");
                }
            }
        } else {
            // Outside the crash window after boot completed: start a new window.
            time_crashed_ = now;
            crash_count_ = 1;
        }
    }

    flags_ &= (~SVC_RESTART);
    // Note this flag: SVC_RESTARTING is the key to the service being restarted.
    flags_ |= SVC_RESTARTING;

    // Execute all onrestart commands for this service (the ones configured in the rc file).
    onrestart_.ExecuteAllCommands();

    NotifyStateChange("restarting");
    return;
}

Reap方法完成 2 个主要功能

在Service重启前，做一些清理工作
- kill进程组所有进程
- 清理所有描述符资源文件
- 回调reap_callbacks_，比如之前设置的启动失败回调

通知开始重启服务

设置Service状态为SVC_RESTARTING

执行Service下所有onrestart命令，这个主要作用是rc文件中可能定义了该服务重启时需要一并执行的命令（例如重启其它服务），如下所示

# Example service definition: the zygote service and the commands tied to its restart.
service zygote /system/bin/app_process64 -Xzygote /system/bin --zygote --start-system-server
    class main
    priority -20
    user root
    group root readproc reserved_disk
    socket zygote stream 660 root system
    socket usap_pool_primary stream 660 root system
    # onrestart commands: executed by init when this service is being restarted.
    onrestart write /sys/android_power/request_state wake
    onrestart write /sys/power/state on
    onrestart restart audioserver
    onrestart restart cameraserver
    onrestart restart media
    onrestart restart netd
    onrestart restart wificond
    writepid /dev/cpuset/foreground/tasks

通知状态改变，NotifyStateChange在前面也讲过，它最后就是调用property_set改变属性，进而在init主循环（while循环）中重启自身服务

// Excerpt of init's second-stage main loop. Only the part that drives pending process
// actions (such as service restarts) and computes the epoll timeout is shown.
int SecondStageMain(int argc, char** argv) {
    ...省略代码
    
    while (true) {
        // Skipped while waiting on a property or while an exec service is running.
        if (!(waiting_for_prop || Service::is_exec_service_running())) {
            if (!shutting_down) {
                auto next_process_action_time = HandleProcessActions();

                // If there's a process that needs restarting, wake up in time for that.
                if (next_process_action_time) {
                    epoll_timeout = std::chrono::ceil<std::chrono::milliseconds>(
                            *next_process_action_time - boot_clock::now());
                    // A deadline already in the past means: do not block at all.
                    if (*epoll_timeout < 0ms) epoll_timeout = 0ms;
                }
            }

            // If there's more work to do, wake up again immediately.
            if (am.HasMoreCommands()) epoll_timeout = 0ms;
        }
    }
    
    ...省略代码 
}

此时Service的状态为SVC_RESTARTING，因此is_exec_service_running返回false；shutting_down也为false；所以会调用HandleProcessActions方法

// Excerpt (part of the loop body is elided): walks all services and handles the ones
// flagged SVC_RESTARTING. A service whose restart time (time_started + restart_period)
// has passed is started now; otherwise its restart time is a candidate for the result.
// Returns the earliest pending restart time, or an empty optional if none was recorded.
static std::optional<boot_clock::time_point> HandleProcessActions() {
    std::optional<boot_clock::time_point> next_process_action_time;
    for (const auto& s : ServiceList::GetInstance()) {
        ...省略代码

        // Services that are not in the SVC_RESTARTING state are skipped.
        if (!(s->flags() & SVC_RESTARTING)) continue;

        auto restart_time = s->time_started() + s->restart_period();
        if (boot_clock::now() > restart_time) {
            // The restart time has passed: start the service again and log any failure.
            if (auto result = s->Start(); !result) {
                LOG(ERROR) << "Could not restart process '" << s->name() << "': " << result.error();
            }
        } else {
            // Not due yet: keep the earliest restart time seen so far.
            if (!next_process_action_time || restart_time < *next_process_action_time) {
                next_process_action_time = restart_time;
            }
        }
    }
    return next_process_action_time;
}

for遍历所有服务，但是服务状态不为SVC_RESTARTING的被过滤

这里通过time_started和restart_period计算服务重启时间

time_started方法返回的是服务上次启动时间

// Abridged excerpt: only the statement that records the start timestamp is shown.
// NOTE(review): the rest of the body, including the return value, is presumably omitted.
Result<Success> Service::Start() {
    // Record when the service was started.
    time_started_ = boot_clock::now();
}

restart_period方法返回的是服务重启周期，在rc文件中可以配置，默认值是 5s，其定义如下

restart_period <seconds>
> If a non-oneshot service exits, it will be restarted at its start time plus
  this period. It defaults to 5s to rate limit crashing services.
  This can be increased for services that are meant to run periodically. For
  example, it may be set to 3600 to indicate that the service should run every hour
  or 86400 to indicate that the service should run every day.

如果当前时间大于restart_time表示这个服务需要重新启动一下，于是调用Start对服务进行重启