Android Watchdog

3,137 阅读8分钟

watchdog(看门狗)的作用是:当系统发生异常(如死锁、死循环)时,重置/重启系统,使其恢复正常。看门狗分为硬件看门狗和软件看门狗,本文介绍Android系统中软件看门狗的实现逻辑。

watchdog启动

frameworks/base/services/java/com/android/server/SystemServer.java

/**
 * Initializes the watchdog singleton and arranges for its thread to be started
 * once the activity manager service reports that the system is ready.
 */
private void startOtherServices() {
    final Watchdog watchdog = Watchdog.getInstance();
    watchdog.init(context, mActivityManagerService);

    // Callback handed to systemReady(); it starts the watchdog thread.
    final Runnable startWatchdog = new Runnable() {
        @Override
        public void run() {
            Watchdog.getInstance().start();
        }
    };
    mActivityManagerService.systemReady(startWatchdog);
}

Watchdog

Watchdog类采用单例模式设计,继承Thread

frameworks/base/services/core/java/com/android/server/Watchdog.java

static Watchdog sWatchdog;

/**
 * Returns the process-wide Watchdog, creating it on first use.
 *
 * <p>No synchronization is performed here, so the first call is not protected
 * against concurrent callers.
 *
 * @return the shared Watchdog instance
 */
public static Watchdog getInstance() {
    if (sWatchdog != null) {
        return sWatchdog;
    }

    // First request: create and remember the singleton.
    sWatchdog = new Watchdog();
    return sWatchdog;
}

private Watchdog() {
    // 创建mMonitorChecker,用于监测前台线程,用于添加需要监测的任务
    mMonitorChecker = new HandlerChecker(FgThread.getHandler(),
            "foreground thread", DEFAULT_TIMEOUT);
    mHandlerCheckers.add(mMonitorChecker);
    // 添加一些固定任务,用于监测主线程、UI线程、I/O线程和显示线程
    mHandlerCheckers.add(new HandlerChecker(new Handler(Looper.getMainLooper()),
            "main thread", DEFAULT_TIMEOUT));
    mHandlerCheckers.add(new HandlerChecker(UiThread.getHandler(),
            "ui thread", DEFAULT_TIMEOUT));
    mHandlerCheckers.add(new HandlerChecker(IoThread.getHandler(),
            "i/o thread", DEFAULT_TIMEOUT));
    mHandlerCheckers.add(new HandlerChecker(DisplayThread.getHandler(),
            "display thread", DEFAULT_TIMEOUT));
    // 添加到MonitorChecker,用于检查binder线程
    addMonitor(new BinderThreadMonitor());
}

启动Watchdog线程

Watchdog线程是在SystemServer中启动,当SystemServer调用start()方法时,会执行Watchdog的run()方法

/**
 * Main loop of the watchdog thread.
 *
 * <p>Each iteration schedules a check on every {@code HandlerChecker}, waits until
 * {@code CHECK_INTERVAL} of uptime has elapsed, and then evaluates the combined
 * completion state. While the state is {@code COMPLETED}, {@code WAITING} or
 * {@code WAITED_HALF} the loop starts over (dumping stack traces once when the
 * halfway state is first seen). Any other state is handled as a hang: stack traces
 * and kernel SysRq output are collected, the error is written to the dropbox, the
 * activity controller (if set) is consulted, and finally the process kills itself
 * unless a debugger is connected or restarts are not allowed.
 */
public void run() {
    // True once the halfway stack dump has been taken for the current hang.
    boolean waitedHalf = false;
    while (true) {
        final ArrayList<HandlerChecker> blockedCheckers;
        final String subject;
        final boolean allowRestart;
        // NOTE(review): this local is re-initialized to 0 on every loop iteration, so
        // the decrement below never sees a positive value and the counter can only be
        // 0 or 2 by the time it is tested at the end of the iteration.
        int debuggerWasConnected = 0;
        synchronized (this) {
            long timeout = CHECK_INTERVAL;
            // Kick off a check on every handler checker
            for (int i=0; i<mHandlerCheckers.size(); i++) {
                HandlerChecker hc = mHandlerCheckers.get(i);
                hc.scheduleCheckLocked();
            }

            if (debuggerWasConnected > 0) {
                debuggerWasConnected--;
            }

            // Wait until CHECK_INTERVAL has elapsed (the original note says 60 seconds --
            // TODO confirm against the CHECK_INTERVAL definition). wait() may return
            // early, so the remaining time is recomputed from uptime and the loop repeats.
            long start = SystemClock.uptimeMillis();
            while (timeout > 0) {
                if (Debug.isDebuggerConnected()) {
                    debuggerWasConnected = 2;
                }
                try {
                    wait(timeout);
                } catch (InterruptedException e) {
                    Log.wtf(TAG, e);
                }
                if (Debug.isDebuggerConnected()) {
                    debuggerWasConnected = 2;
                }
                timeout = CHECK_INTERVAL - (SystemClock.uptimeMillis() - start);
            }

            // Evaluate the result of the checks
            final int waitState = evaluateCheckerCompletionLocked();
            // COMPLETED / WAITING / WAITED_HALF all mean nothing is treated as blocked yet
            if (waitState == COMPLETED) {
                // The monitors have returned; reset
                waitedHalf = false;
                continue;
            } else if (waitState == WAITING) {
                // still waiting but within their configured intervals; back off and recheck
                continue;
            } else if (waitState == WAITED_HALF) {
                if (!waitedHalf) {
                    // We've waited half the deadlock-detection interval.  Pull a stack
                    // trace and wait another half.
                    ArrayList<Integer> pids = new ArrayList<Integer>();
                    pids.add(Process.myPid());
                    ActivityManagerService.dumpStackTraces(true, pids, null, null,
                            NATIVE_STACKS_OF_INTEREST);
                    waitedHalf = true;
                }
                continue;
            }

            // Reaching this point means the state is OVERDUE: something is blocked.
            // Capture what is needed while still holding the lock.
            blockedCheckers = getBlockedCheckersLocked();
            subject = describeCheckersLocked(blockedCheckers);
            allowRestart = mAllowRestart;
        }

        // Dump stacks for this process and, when known, the phone process.
        ArrayList<Integer> pids = new ArrayList<Integer>();
        pids.add(Process.myPid());
        if (mPhonePid > 0) pids.add(mPhonePid);
        // Pass !waitedHalf so that just in case we somehow wind up here without having
        // dumped the halfway stacks, we properly re-initialize the trace file.
        final File stack = ActivityManagerService.dumpStackTraces(
                !waitedHalf, pids, null, null, NATIVE_STACKS_OF_INTEREST);

        // Give the stack trace dump time to be obtained before continuing
        SystemClock.sleep(2000);

        // Use the kernel's SysRq facility to print blocked-task state ('w')
        // and backtraces of the active CPUs ('l')
        doSysRq('w');
        doSysRq('l');

        // Try to add the error to the dropbox, but assuming that the ActivityManager
        // itself may be deadlocked.  (which has happened, causing this statement to
        // deadlock and the watchdog as a whole to be ineffective)
        Thread dropboxThread = new Thread("watchdogWriteToDropbox") {
                public void run() {
                    mActivity.addErrorToDropBox(
                            "watchdog", null, "system_server", null, null,
                            subject, null, stack, null);
                }
            };
        dropboxThread.start();
        try {
            dropboxThread.join(2000);  // wait up to 2 seconds for it to return.
        } catch (InterruptedException ignored) {}

        // Read the controller under the lock, but call it without holding the lock.
        IActivityController controller;
        synchronized (this) {
            controller = mController;
        }
        if (controller != null) {
            Slog.i(TAG, "Reporting stuck state to activity controller");
            try {
                Binder.setDumpDisabled("Service dumps disabled due to hung system process.");
                // 1 = keep waiting, -1 = kill system
                int res = controller.systemNotResponding(subject);
                if (res >= 0) {
                    Slog.i(TAG, "Activity controller requested to coninue to wait");
                    waitedHalf = false;
                    continue;
                }
            } catch (RemoteException e) {
                // NOTE(review): the exception is silently dropped; execution falls
                // through to the kill decision below.
            }
        }

        // Only kill the process if the debugger is not attached.
        if (Debug.isDebuggerConnected()) {
            debuggerWasConnected = 2;
        }
        if (debuggerWasConnected >= 2) {
            Slog.w(TAG, "Debugger connected: Watchdog is *not* killing the system process");
        } else if (debuggerWasConnected > 0) {
            Slog.w(TAG, "Debugger was connected: Watchdog is *not* killing the system process");
        } else if (!allowRestart) {
            Slog.w(TAG, "Restart not allowed: Watchdog is *not* killing the system process");
        } else {
            Slog.w(TAG, "*** WATCHDOG KILLING SYSTEM PROCESS: " + subject);
            // Log the current Java stack of every thread whose checker is blocked.
            for (int i=0; i<blockedCheckers.size(); i++) {
                Slog.w(TAG, blockedCheckers.get(i).getName() + " stack trace:");
                StackTraceElement[] stackTrace
                        = blockedCheckers.get(i).getThread().getStackTrace();
                for (StackTraceElement element: stackTrace) {
                    Slog.w(TAG, "    at " + element);
                }
            }
            Slog.w(TAG, "*** GOODBYE!");
            // Kill our own process, i.e. the system_server process
            Process.killProcess(Process.myPid());
            System.exit(10);
        }

        // The process was not killed: start a fresh detection cycle.
        waitedHalf = false;
    }
}

检查是否发生binder thread阻塞的机制

// Completion states reported by getCompletionStateLocked().
// These are temporally ordered: larger values as lateness increases
// The check has finished (mCompleted is set).
static final int COMPLETED = 0;
// The check is still in flight and less than half of mWaitMax has elapsed.
static final int WAITING = 1;
// The check is still in flight; at least half, but less than all, of mWaitMax has elapsed.
static final int WAITED_HALF = 2;
// The check is still in flight after mWaitMax or more has elapsed.
static final int OVERDUE = 3;

watchdog每次检查会得到以下四种状态之一:COMPLETED表示检查已完成,未发生阻塞;WAITING表示检查尚未完成,仍在等待中;WAITED_HALF表示检查尚未完成,且等待时间已超过超时时间的一半;OVERDUE表示等待已达到超时时间而检查仍未返回,即发生了阻塞。

发起check
/**
 * Starts a new check on this checker's handler unless one is unnecessary or
 * already in flight. The "Locked" suffix suggests the caller holds the Watchdog
 * lock; the visible caller invokes it inside {@code synchronized (this)}.
 */
public void scheduleCheckLocked() {
    final boolean noMonitors = mMonitors.size() == 0;
    if (noMonitors && mHandler.getLooper().getQueue().isPolling()) {
        // Nothing to run and the looper's queue reports it is polling:
        // treat the check as completed without posting anything.
        mCompleted = true;
    } else if (mCompleted) {
        // No check is in flight, so begin a new one and record when it started.
        mCompleted = false;
        mCurrentMonitor = null;
        mStartTime = SystemClock.uptimeMillis();
        // Post this HandlerChecker at the front of the handler's queue so that
        // its run() executes ahead of already queued messages.
        mHandler.postAtFrontOfQueue(this);
    }
    // Otherwise a check is already in flight; nothing to do.
}

/**
 * Runs on the checked handler's thread: invokes every registered monitor in
 * order and then marks the check as completed.
 */
public void run() {
    // The monitor count is read once, before any monitor is invoked.
    final int monitorCount = mMonitors.size();
    int index = 0;
    while (index < monitorCount) {
        // Publish the monitor about to run while holding the Watchdog lock.
        synchronized (Watchdog.this) {
            mCurrentMonitor = mMonitors.get(index);
        }
        // Invoked outside the lock; this call may block.
        mCurrentMonitor.monitor();
        index++;
    }

    // Every monitor returned: record completion under the Watchdog lock.
    synchronized (Watchdog.this) {
        mCompleted = true;
        mCurrentMonitor = null;
    }
}

/**
 * Monitor whose check consists solely of calling
 * {@code Binder.blockUntilThreadAvailable()}; it returns only when that call returns.
 */
private static final class BinderThreadMonitor implements Watchdog.Monitor {
    @Override
    public void monitor() {
        // Delegate to the Binder framework
        Binder.blockUntilThreadAvailable();
    }
}

frameworks/base/core/java/android/os/Binder.java

/**
 * Call blocks until the number of executing binder threads is less
 * than the maximum number of binder threads allowed for this process.
 *
 * <p>Declared {@code native}, so there is no Java body.
 * NOTE(review): presumably bound to android_os_Binder_blockUntilThreadAvailable
 * in android_util_Binder.cpp -- confirm against the JNI registration table.
 * @hide
 */
public static final native void blockUntilThreadAvailable();

frameworks/base/core/jni/android_util_Binder.cpp

// JNI-side function for Binder.blockUntilThreadAvailable(): forwards to the
// calling thread's IPCThreadState. Neither env nor clazz is used.
static void android_os_Binder_blockUntilThreadAvailable(JNIEnv* env, jobject clazz)
{
    IPCThreadState::self()->blockUntilThreadAvailable();
}

frameworks/native/libs/binder/IPCThreadState.cpp

// Blocks the calling thread until the process's count of executing binder
// threads is below its configured maximum.
void IPCThreadState::blockUntilThreadAvailable()
{
    pthread_mutex_lock(&mProcess->mThreadCountLock);
    // While the number of executing binder threads has reached the maximum (note the
    // >= comparison), wait here. This is the point at which a caller such as
    // HandlerChecker.run(), via BinderThreadMonitor, stays blocked.
    // NOTE(review): the original note states that the preset maximum is 15 binder
    // threads and the waiting time is 60 seconds; neither value is visible in this
    // function, and pthread_cond_wait itself takes no timeout -- confirm elsewhere.
    while (mProcess->mExecutingThreadsCount >= mProcess->mMaxThreads) {
        ALOGW("Waiting for thread to be free. mExecutingThreadsCount=%lu mMaxThreads=%lu\n",
                static_cast<unsigned long>(mProcess->mExecutingThreadsCount),
                static_cast<unsigned long>(mProcess->mMaxThreads));
        // Implemented with a pthread condition variable: the mutex is released while
        // waiting and re-acquired before the loop condition is re-checked.
        pthread_cond_wait(&mProcess->mThreadCountDecrement, &mProcess->mThreadCountLock);
    }
    pthread_mutex_unlock(&mProcess->mThreadCountLock);
}
检查check结果
/**
 * Combines the completion states of all handler checkers.
 *
 * @return the largest (i.e. latest) state reported by any checker, or
 *         {@code COMPLETED} when there are no checkers
 */
private int evaluateCheckerCompletionLocked() {
    int worstState = COMPLETED;
    for (int index = 0; index < mHandlerCheckers.size(); index++) {
        final int checkerState = mHandlerCheckers.get(index).getCompletionStateLocked();
        // States are ordered by lateness, so keep the highest one seen.
        if (checkerState > worstState) {
            worstState = checkerState;
        }
    }
    return worstState;
}

/**
 * Classifies the current check by its completion flag and by how long it has
 * been running relative to {@code mWaitMax}.
 *
 * @return {@code COMPLETED} if the check finished; otherwise {@code WAITING},
 *         {@code WAITED_HALF} or {@code OVERDUE} depending on elapsed uptime
 */
public int getCompletionStateLocked() {
    if (mCompleted) {
        return COMPLETED;
    }

    // Uptime elapsed since the in-flight check was scheduled.
    final long elapsed = SystemClock.uptimeMillis() - mStartTime;
    if (elapsed < mWaitMax/2) {
        return WAITING;
    }
    if (elapsed < mWaitMax) {
        return WAITED_HALF;
    }
    return OVERDUE;
}

通过检查mCompleted变量的值和check任务执行的时间来得到check结果。

如何使用watchdog

/**
 * Callback implemented by components that want to be checked by the watchdog.
 */
public interface Monitor {
    /**
     * Invoked by a HandlerChecker on the checked thread. The checker is marked
     * completed only after every monitor's call has returned, so a call that
     * blocks keeps the check from completing.
     */
    void monitor();
}

Watchdog类定义了Monitor接口。需要被监测的模块实现该接口,并调用Watchdog.addMonitor()完成注册;Watchdog线程本身则由SystemServer调用start()启动。具体用法可参考system_server进程中对watchdog的使用。

知识点总结

watchdog检测到阻塞后,在杀死system_server进程之前,会通过SysRq打印阻塞任务(Blocked State)信息和各CPU的backtrace信息,输出示例如下:

# echo w > /proc/sysrq-trigger                                        
[  228.239812] SysRq : Show Blocked State
[  228.243627]   task                PC stack   pid father
[  228.248889] PANEL_DHDResetM D a0a9e39c     0   884      2 0x00000000
[  228.255252] [<a0a9e39c>] (__schedule) from [<a0a9e6b0>] (schedule+0xa0/0xa4)
[  228.262290] [<a0a9e6b0>] (schedule) from [<a0aa1114>] (schedule_timeout+0x184/0x1b8)
[  228.270010] [<a0aa1114>] (schedule_timeout) from [<a0aa11e0>] (schedule_timeout_uninterruptible+0x30/0x34)
[  228.279628] [<a0aa11e0>] (schedule_timeout_uninterruptible) from [<a006df84>] (msleep+0x28/0x30)
[  228.288388] [<a006df84>] (msleep) from [<a0651338>] (WriteRegThread+0xfc/0x2b4)
[  228.295675] [<a0651338>] (WriteRegThread) from [<a003c734>] (kthread+0xdc/0xf0)
[  228.302960] [<a003c734>] (kthread) from [<a000e440>] (ret_from_fork+0x14/0x20)
[  228.310163] HI_VPSS_Process D a0a9e39c     0   954      2 0x00000000
[  228.316514] [<a0a9e39c>] (__schedule) from [<a0a9e6b0>] (schedule+0xa0/0xa4)
[  228.323536] [<a0a9e6b0>] (schedule) from [<a0aa1114>] (schedule_timeout+0x184/0x1b8)
[  228.331254] [<a0aa1114>] (schedule_timeout) from [<a07f9650>] (VPSS_OSAL_WaitEvent+0xd0/0x110)
[  228.339839] [<a07f9650>] (VPSS_OSAL_WaitEvent) from [<a080acb8>] (VPSS_CTRL_ThreadProc+0x2fc/0x3a0)
[  228.348857] [<a080acb8>] (VPSS_CTRL_ThreadProc) from [<a003c734>] (kthread+0xdc/0xf0)
[  228.356660] [<a003c734>] (kthread) from [<a000e440>] (ret_from_fork+0x14/0x20)
# 
# 
# echo l > /proc/sysrq-trigger                                        
[  232.206813] SysRq : Show backtrace of all active CPUs
[  232.211879] Backtrace for cpu 0 (current):
[  232.215962] CPU: 0 PID: 4008 Comm: sh Tainted: P           O   3.18.16_s5 #1
[  232.223002] [<a0015e74>] (unwind_backtrace) from [<a00123d4>] (show_stack+0x20/0x24)
[  232.230720] [<a00123d4>] (show_stack) from [<a0a9b664>] (dump_stack+0x78/0x94)
[  232.237917] [<a0a9b664>] (dump_stack) from [<a00148cc>] (smp_send_all_cpu_backtrace+0x70/0xd8)
[  232.246519] [<a00148cc>] (smp_send_all_cpu_backtrace) from [<a000f0c4>] (arch_trigger_all_cpu_backtrace+0x18/0x1c)
[  232.256843] [<a000f0c4>] (arch_trigger_all_cpu_backtrace) from [<a0349ee4>] (sysrq_handle_showallcpus+0x18/0x1c)
[  232.266997] [<a0349ee4>] (sysrq_handle_showallcpus) from [<a034a620>] (__handle_sysrq+0x94/0x12c)
[  232.275839] [<a034a620>] (__handle_sysrq) from [<a034aa5c>] (write_sysrq_trigger+0x4c/0x5c)
[  232.284161] [<a034aa5c>] (write_sysrq_trigger) from [<a0165098>] (proc_reg_write+0x80/0x94)
[  232.292483] [<a0165098>] (proc_reg_write) from [<a01113e8>] (vfs_write+0xcc/0x184)
[  232.300026] [<a01113e8>] (vfs_write) from [<a0111890>] (SyS_write+0x58/0x98)
[  232.307053] [<a0111890>] (SyS_write) from [<a000e3a0>] (ret_fast_syscall+0x0/0x38)
[  232.314589] 
[  232.314589] sending IPI to all other CPUs:
[  232.320138] IPI backtrace for cpu 1
[  232.323612] CPU: 1 PID: 3362 Comm: InputReader Tainted: P           O   3.18.16_s5 #1
[  232.331402] task: b9ae5c80 ti: bd72a000 task.ti: bd72a000
[  232.336772] PC is at 0x7a1b897e
[  232.339890] LR is at 0x7a1c0004
[  232.343011] pc : [<7a1b897e>]    lr : [<7a1c0004>]    psr: 600f0030
[  232.343011] sp : 6eb80900  ip : 00000000  fp : 00000058
[  232.354431] r10: 7a1ba4e9  r9 : 8761e868  r8 : 8761e878
[  232.359625] r7 : 00000078  r6 : 6eb80920  r5 : 893a4edd  r4 : 8761e868
[  232.366116] r3 : 7a1c000c  r2 : 8759ebe0  r1 : 8761e868  r0 : 7a1c000c
[  232.372608] Flags: nZCv  IRQs on  FIQs on  Mode USER_32  ISA Thumb  Segment user
[  232.379964] Control: 10c0383d  Table: 215b006a  DAC: 00000015
[  232.385679] CPU: 1 PID: 3362 Comm: InputReader Tainted: P           O   3.18.16_s5 #1
[  232.393476] [<a0015e74>] (unwind_backtrace) from [<a00123d4>] (show_stack+0x20/0x24)
[  232.401184] [<a00123d4>] (show_stack) from [<a0a9b664>] (dump_stack+0x78/0x94)
[  232.408372] [<a0a9b664>] (dump_stack) from [<a000f670>] (show_regs+0x1c/0x20)
[  232.415473] [<a000f670>] (show_regs) from [<a0014afc>] (handle_IPI+0x1c8/0x284)
[  232.422745] [<a0014afc>] (handle_IPI) from [<a000869c>] (gic_handle_irq+0x64/0x6c)
[  232.430276] [<a000869c>] (gic_handle_irq) from [<a00131c8>] (__irq_usr+0x48/0x60)
[  232.437718] Exception stack(0xbd72bfb0 to 0xbd72bff8)
[  232.442740] bfa0:                                     7a1c000c 8761e868 8759ebe0 7a1c000c
[  232.450876] bfc0: 8761e868 893a4edd 6eb80920 00000078 8761e878 8761e868 7a1ba4e9 00000058
[  232.459011] bfe0: 00000000 6eb80900 7a1c0004 7a1b897e 600f0030 ffffffff
#