watchdog(看门狗)的作用是:当系统发生异常(如死锁、死循环)时,重置/重启系统,使系统恢复正常。看门狗分为硬件看门狗和软件看门狗,本文介绍Android系统软件看门狗的实现逻辑。
watchdog启动
frameworks/base/services/java/com/android/server/SystemServer.java
/**
 * Excerpt of SystemServer start-up showing how the watchdog is wired in:
 * the singleton is initialised here, and its thread is started from the
 * callback handed to ActivityManagerService.systemReady().
 */
private void startOtherServices() {
// Obtain the singleton and hand it the context and the activity manager service.
final Watchdog watchdog = Watchdog.getInstance();
watchdog.init(context, mActivityManagerService);
mActivityManagerService.systemReady(new Runnable() {
@Override
public void run() {
// Start the watchdog; the same singleton is looked up again here.
Watchdog.getInstance().start();
}
});
}
Watchdog
Watchdog类采用单例模式设计,继承Thread
frameworks/base/services/core/java/com/android/server/Watchdog.java
/** The single Watchdog instance; stays null until getInstance() is first called. */
static Watchdog sWatchdog;

/**
 * Returns the shared Watchdog, constructing it on the first call.
 * No locking is performed here.
 */
public static Watchdog getInstance() {
    if (sWatchdog != null) {
        return sWatchdog;
    }
    sWatchdog = new Watchdog();
    return sWatchdog;
}
private Watchdog() {
// 创建mMonitorChecker,用于监测前台线程,用于添加需要监测的任务
mMonitorChecker = new HandlerChecker(FgThread.getHandler(),
"foreground thread", DEFAULT_TIMEOUT);
mHandlerCheckers.add(mMonitorChecker);
// 添加一些固定任务,用于监测主线程、UI线程、I/O线程和显示线程
mHandlerCheckers.add(new HandlerChecker(new Handler(Looper.getMainLooper()),
"main thread", DEFAULT_TIMEOUT));
mHandlerCheckers.add(new HandlerChecker(UiThread.getHandler(),
"ui thread", DEFAULT_TIMEOUT));
mHandlerCheckers.add(new HandlerChecker(IoThread.getHandler(),
"i/o thread", DEFAULT_TIMEOUT));
mHandlerCheckers.add(new HandlerChecker(DisplayThread.getHandler(),
"display thread", DEFAULT_TIMEOUT));
// 添加到MonitorChecker,用于检查binder线程
addMonitor(new BinderThreadMonitor());
}
启动Watchdog线程
Watchdog线程是在SystemServer中启动的:SystemServer调用Watchdog.getInstance().start()之后,新启动的线程会执行Watchdog的run()方法。
/**
 * Main loop of the watchdog thread. Each pass schedules a check on every
 * HandlerChecker, waits for CHECK_INTERVAL, then evaluates the combined state.
 * When the state is OVERDUE it gathers diagnostics (stack traces, SysRq output,
 * a dropbox entry), asks the activity controller (if one is set) whether to keep
 * waiting, and otherwise kills the current process unless a debugger is attached
 * or restarts are not allowed.
 */
public void run() {
// True once the half-way stack dump has been taken for the current stall.
boolean waitedHalf = false;
while (true) {
final ArrayList<HandlerChecker> blockedCheckers;
final String subject;
final boolean allowRestart;
// NOTE(review): declared inside the loop, so it is reset to 0 on every pass and the
// decrement below can never observe a positive value from an earlier pass.
int debuggerWasConnected = 0;
synchronized (this) {
long timeout = CHECK_INTERVAL;
// Kick off a check on every handler checker.
for (int i=0; i<mHandlerCheckers.size(); i++) {
HandlerChecker hc = mHandlerCheckers.get(i);
hc.scheduleCheckLocked();
}
if (debuggerWasConnected > 0) {
debuggerWasConnected--;
}
// Wait until CHECK_INTERVAL has elapsed, measured with uptimeMillis(); if wait()
// returns early the loop waits again for the remaining time.
// NOTE(review): the original note said 60 seconds; CHECK_INTERVAL is defined
// outside this excerpt - confirm the actual value.
long start = SystemClock.uptimeMillis();
while (timeout > 0) {
if (Debug.isDebuggerConnected()) {
debuggerWasConnected = 2;
}
try {
wait(timeout);
} catch (InterruptedException e) {
Log.wtf(TAG, e);
}
if (Debug.isDebuggerConnected()) {
debuggerWasConnected = 2;
}
timeout = CHECK_INTERVAL - (SystemClock.uptimeMillis() - start);
}
// Evaluate the outcome of the checks.
final int waitState = evaluateCheckerCompletionLocked();
// COMPLETED, WAITING and WAITED_HALF all mean that no checker is overdue yet.
if (waitState == COMPLETED) {
// The monitors have returned; reset
waitedHalf = false;
continue;
} else if (waitState == WAITING) {
// still waiting but within their configured intervals; back off and recheck
continue;
} else if (waitState == WAITED_HALF) {
if (!waitedHalf) {
// We've waited half the deadlock-detection interval. Pull a stack
// trace and wait another half.
ArrayList<Integer> pids = new ArrayList<Integer>();
pids.add(Process.myPid());
ActivityManagerService.dumpStackTraces(true, pids, null, null,
NATIVE_STACKS_OF_INTEREST);
waitedHalf = true;
}
continue;
}
// Reaching this point means the state is OVERDUE: at least one checker is blocked.
blockedCheckers = getBlockedCheckersLocked();
subject = describeCheckersLocked(blockedCheckers);
allowRestart = mAllowRestart;
}
// From here on the watchdog lock is no longer held.
ArrayList<Integer> pids = new ArrayList<Integer>();
pids.add(Process.myPid());
if (mPhonePid > 0) pids.add(mPhonePid);
// Pass !waitedHalf so that just in case we somehow wind up here without having
// dumped the halfway stacks, we properly re-initialize the trace file.
final File stack = ActivityManagerService.dumpStackTraces(
!waitedHalf, pids, null, null, NATIVE_STACKS_OF_INTEREST);
// Pause for 2 seconds so the stack traces can be collected (intent per the original note).
SystemClock.sleep(2000);
// Use the kernel SysRq facility to print blocked-task state ('w') and
// backtraces of the active CPUs ('l').
doSysRq('w');
doSysRq('l');
// Try to add the error to the dropbox, but assuming that the ActivityManager
// itself may be deadlocked. (which has happened, causing this statement to
// deadlock and the watchdog as a whole to be ineffective)
Thread dropboxThread = new Thread("watchdogWriteToDropbox") {
public void run() {
mActivity.addErrorToDropBox(
"watchdog", null, "system_server", null, null,
subject, null, stack, null);
}
};
dropboxThread.start();
try {
dropboxThread.join(2000); // wait up to 2 seconds for it to return.
} catch (InterruptedException ignored) {}
// Read mController under the lock, then use the local copy outside it.
IActivityController controller;
synchronized (this) {
controller = mController;
}
if (controller != null) {
Slog.i(TAG, "Reporting stuck state to activity controller");
try {
Binder.setDumpDisabled("Service dumps disabled due to hung system process.");
// 1 = keep waiting, -1 = kill system
int res = controller.systemNotResponding(subject);
if (res >= 0) {
// NOTE(review): the log text below contains the typo "coninue"; it is runtime
// output and is deliberately left unchanged here.
Slog.i(TAG, "Activity controller requested to coninue to wait");
waitedHalf = false;
continue;
}
} catch (RemoteException e) {
// NOTE(review): the exception is ignored; execution falls through to the
// kill decision below.
}
}
// Only kill the process if the debugger is not attached.
if (Debug.isDebuggerConnected()) {
debuggerWasConnected = 2;
}
if (debuggerWasConnected >= 2) {
Slog.w(TAG, "Debugger connected: Watchdog is *not* killing the system process");
} else if (debuggerWasConnected > 0) {
Slog.w(TAG, "Debugger was connected: Watchdog is *not* killing the system process");
} else if (!allowRestart) {
Slog.w(TAG, "Restart not allowed: Watchdog is *not* killing the system process");
} else {
Slog.w(TAG, "*** WATCHDOG KILLING SYSTEM PROCESS: " + subject);
// Log the current Java stack of every blocked checker's thread before dying.
for (int i=0; i<blockedCheckers.size(); i++) {
Slog.w(TAG, blockedCheckers.get(i).getName() + " stack trace:");
StackTraceElement[] stackTrace
= blockedCheckers.get(i).getThread().getStackTrace();
for (StackTraceElement element: stackTrace) {
Slog.w(TAG, " at " + element);
}
}
Slog.w(TAG, "*** GOODBYE!");
// Kill our own process, i.e. the system_server process.
Process.killProcess(Process.myPid());
System.exit(10);
}
// The process was not killed: start a fresh detection cycle.
waitedHalf = false;
}
}
检查是否发生binder thread阻塞的机制
// These are temporally ordered: larger values as lateness increases
// COMPLETED: the checker's mCompleted flag is set.
static final int COMPLETED = 0;
// WAITING: not completed, and less than half of mWaitMax has elapsed.
static final int WAITING = 1;
// WAITED_HALF: not completed, at least half of mWaitMax but less than mWaitMax has elapsed.
static final int WAITED_HALF = 2;
// OVERDUE: not completed after mWaitMax or longer; run() treats this as a blocked checker.
static final int OVERDUE = 3;
watchdog对各个HandlerChecker进行检查后,会得到以下四种状态之一:
COMPLETED:检查已完成,没有阻塞;
WAITING:检查尚未完成,仍在等待中(已等待时间不足超时时间的一半);
WAITED_HALF:检查尚未完成,仍在等待中,已等待时间超过了超时时间的一半;
OVERDUE:已超过超时时间,检查仍未完成(未返回),说明发生了阻塞。
发起check
/**
 * Starts a new check on this checker's handler unless one is already pending.
 * When there are no monitors and the handler's queue is polling, the checker
 * is marked completed straight away and nothing is posted.
 */
public void scheduleCheckLocked() {
    final boolean idleWithoutMonitors =
            mMonitors.size() == 0 && mHandler.getLooper().getQueue().isPolling();
    if (idleWithoutMonitors) {
        mCompleted = true;
        return;
    }

    // A check that has not completed yet is still in flight; only start a new
    // one when the previous check has finished.
    if (mCompleted) {
        mCompleted = false;
        mCurrentMonitor = null;
        mStartTime = SystemClock.uptimeMillis();
        // Put this checker's run() at the front of the handler's queue.
        mHandler.postAtFrontOfQueue(this);
    }
}
/**
 * Invokes every registered monitor in order, then marks this check as completed.
 * The monitor count is sampled once before the loop starts.
 */
public void run() {
    final int monitorCount = mMonitors.size();
    int index = 0;
    while (index < monitorCount) {
        // Publish the monitor about to be called while holding the watchdog lock.
        synchronized (Watchdog.this) {
            mCurrentMonitor = mMonitors.get(index);
        }
        // monitor() is called outside the lock and may block here.
        mCurrentMonitor.monitor();
        index++;
    }

    synchronized (Watchdog.this) {
        mCompleted = true;
        mCurrentMonitor = null;
    }
}
/** Monitor whose monitor() call delegates to Binder.blockUntilThreadAvailable(). */
private static final class BinderThreadMonitor implements Watchdog.Monitor {
@Override
public void monitor() {
// Delegate to the Binder framework; this call may block.
Binder.blockUntilThreadAvailable();
}
}
frameworks/base/core/java/android/os/Binder.java
/**
 * Call blocks until the number of executing binder threads is less
 * than the maximum number of binder threads allowed for this process.
 * <p>Declared native: there is no Java body for this method.
 * @hide
 */
public static final native void blockUntilThreadAvailable();
frameworks/base/core/jni/android_util_Binder.cpp
// JNI entry point: forwards to blockUntilThreadAvailable() on the calling
// thread's IPCThreadState. The env and clazz arguments are not used.
static void android_os_Binder_blockUntilThreadAvailable(JNIEnv* env, jobject clazz)
{
    IPCThreadState* const threadState = IPCThreadState::self();
    threadState->blockUntilThreadAvailable();
}
frameworks/native/libs/binder/IPCThreadState.cpp
// Blocks the caller while the number of executing binder threads is at or above
// the process maximum. Waiting is done on the mThreadCountDecrement condition
// variable with mThreadCountLock held; the wait itself has no timeout.
// NOTE(review): the original note stated a maximum of 15 binder threads and a
// 60 second wait; neither value is visible in this function - confirm.
void IPCThreadState::blockUntilThreadAvailable()
{
    pthread_mutex_lock(&mProcess->mThreadCountLock);
    for (;;) {
        if (mProcess->mExecutingThreadsCount < mProcess->mMaxThreads) {
            break;
        }
        // All allowed threads are busy: log it and wait for the condition
        // variable to be signalled, then re-check the counts.
        ALOGW("Waiting for thread to be free. mExecutingThreadsCount=%lu mMaxThreads=%lu\n",
                static_cast<unsigned long>(mProcess->mExecutingThreadsCount),
                static_cast<unsigned long>(mProcess->mMaxThreads));
        pthread_cond_wait(&mProcess->mThreadCountDecrement, &mProcess->mThreadCountLock);
    }
    pthread_mutex_unlock(&mProcess->mThreadCountLock);
}
检查check结果
/**
 * Returns the largest completion state reported by any handler checker,
 * or COMPLETED when none reports a larger one.
 */
private int evaluateCheckerCompletionLocked() {
    int worst = COMPLETED;
    for (int idx = 0; idx < mHandlerCheckers.size(); idx++) {
        final int current = mHandlerCheckers.get(idx).getCompletionStateLocked();
        if (current > worst) {
            worst = current;
        }
    }
    return worst;
}
/**
 * Classifies this checker from its mCompleted flag and the time elapsed
 * since mStartTime, compared against mWaitMax.
 *
 * @return COMPLETED, WAITING, WAITED_HALF or OVERDUE
 */
public int getCompletionStateLocked() {
    if (mCompleted) {
        return COMPLETED;
    }
    final long elapsed = SystemClock.uptimeMillis() - mStartTime;
    if (elapsed < mWaitMax/2) {
        return WAITING;
    }
    if (elapsed < mWaitMax) {
        return WAITED_HALF;
    }
    return OVERDUE;
}
通过检查mCompleted变量的值和check任务执行的时间来得到check结果。
如何使用watchdog
/**
 * Callback implemented by components that want to be checked by the watchdog.
 */
public interface Monitor {
/** Called from HandlerChecker.run(); a call that does not return keeps that check from completing. */
void monitor();
}
Watchdog类定义了Monitor接口,其它地方需要实现这个接口,然后调用Watchdog.addMonitor(),最后调用start()来启动watchdog。具体的使用可以查看system_server进程中watchdog的使用。
知识点总结
watchdog在杀死system_server进程之前,会通过SysRq打印阻塞(Blocked)任务的状态和各个活动CPU的backtrace信息,示例如下:
# echo w > /proc/sysrq-trigger
[ 228.239812] SysRq : Show Blocked State
[ 228.243627] task PC stack pid father
[ 228.248889] PANEL_DHDResetM D a0a9e39c 0 884 2 0x00000000
[ 228.255252] [<a0a9e39c>] (__schedule) from [<a0a9e6b0>] (schedule+0xa0/0xa4)
[ 228.262290] [<a0a9e6b0>] (schedule) from [<a0aa1114>] (schedule_timeout+0x184/0x1b8)
[ 228.270010] [<a0aa1114>] (schedule_timeout) from [<a0aa11e0>] (schedule_timeout_uninterruptible+0x30/0x34)
[ 228.279628] [<a0aa11e0>] (schedule_timeout_uninterruptible) from [<a006df84>] (msleep+0x28/0x30)
[ 228.288388] [<a006df84>] (msleep) from [<a0651338>] (WriteRegThread+0xfc/0x2b4)
[ 228.295675] [<a0651338>] (WriteRegThread) from [<a003c734>] (kthread+0xdc/0xf0)
[ 228.302960] [<a003c734>] (kthread) from [<a000e440>] (ret_from_fork+0x14/0x20)
[ 228.310163] HI_VPSS_Process D a0a9e39c 0 954 2 0x00000000
[ 228.316514] [<a0a9e39c>] (__schedule) from [<a0a9e6b0>] (schedule+0xa0/0xa4)
[ 228.323536] [<a0a9e6b0>] (schedule) from [<a0aa1114>] (schedule_timeout+0x184/0x1b8)
[ 228.331254] [<a0aa1114>] (schedule_timeout) from [<a07f9650>] (VPSS_OSAL_WaitEvent+0xd0/0x110)
[ 228.339839] [<a07f9650>] (VPSS_OSAL_WaitEvent) from [<a080acb8>] (VPSS_CTRL_ThreadProc+0x2fc/0x3a0)
[ 228.348857] [<a080acb8>] (VPSS_CTRL_ThreadProc) from [<a003c734>] (kthread+0xdc/0xf0)
[ 228.356660] [<a003c734>] (kthread) from [<a000e440>] (ret_from_fork+0x14/0x20)
#
#
# echo l > /proc/sysrq-trigger
[ 232.206813] SysRq : Show backtrace of all active CPUs
[ 232.211879] Backtrace for cpu 0 (current):
[ 232.215962] CPU: 0 PID: 4008 Comm: sh Tainted: P O 3.18.16_s5 #1
[ 232.223002] [<a0015e74>] (unwind_backtrace) from [<a00123d4>] (show_stack+0x20/0x24)
[ 232.230720] [<a00123d4>] (show_stack) from [<a0a9b664>] (dump_stack+0x78/0x94)
[ 232.237917] [<a0a9b664>] (dump_stack) from [<a00148cc>] (smp_send_all_cpu_backtrace+0x70/0xd8)
[ 232.246519] [<a00148cc>] (smp_send_all_cpu_backtrace) from [<a000f0c4>] (arch_trigger_all_cpu_backtrace+0x18/0x1c)
[ 232.256843] [<a000f0c4>] (arch_trigger_all_cpu_backtrace) from [<a0349ee4>] (sysrq_handle_showallcpus+0x18/0x1c)
[ 232.266997] [<a0349ee4>] (sysrq_handle_showallcpus) from [<a034a620>] (__handle_sysrq+0x94/0x12c)
[ 232.275839] [<a034a620>] (__handle_sysrq) from [<a034aa5c>] (write_sysrq_trigger+0x4c/0x5c)
[ 232.284161] [<a034aa5c>] (write_sysrq_trigger) from [<a0165098>] (proc_reg_write+0x80/0x94)
[ 232.292483] [<a0165098>] (proc_reg_write) from [<a01113e8>] (vfs_write+0xcc/0x184)
[ 232.300026] [<a01113e8>] (vfs_write) from [<a0111890>] (SyS_write+0x58/0x98)
[ 232.307053] [<a0111890>] (SyS_write) from [<a000e3a0>] (ret_fast_syscall+0x0/0x38)
[ 232.314589]
[ 232.314589] sending IPI to all other CPUs:
[ 232.320138] IPI backtrace for cpu 1
[ 232.323612] CPU: 1 PID: 3362 Comm: InputReader Tainted: P O 3.18.16_s5 #1
[ 232.331402] task: b9ae5c80 ti: bd72a000 task.ti: bd72a000
[ 232.336772] PC is at 0x7a1b897e
[ 232.339890] LR is at 0x7a1c0004
[ 232.343011] pc : [<7a1b897e>] lr : [<7a1c0004>] psr: 600f0030
[ 232.343011] sp : 6eb80900 ip : 00000000 fp : 00000058
[ 232.354431] r10: 7a1ba4e9 r9 : 8761e868 r8 : 8761e878
[ 232.359625] r7 : 00000078 r6 : 6eb80920 r5 : 893a4edd r4 : 8761e868
[ 232.366116] r3 : 7a1c000c r2 : 8759ebe0 r1 : 8761e868 r0 : 7a1c000c
[ 232.372608] Flags: nZCv IRQs on FIQs on Mode USER_32 ISA Thumb Segment user
[ 232.379964] Control: 10c0383d Table: 215b006a DAC: 00000015
[ 232.385679] CPU: 1 PID: 3362 Comm: InputReader Tainted: P O 3.18.16_s5 #1
[ 232.393476] [<a0015e74>] (unwind_backtrace) from [<a00123d4>] (show_stack+0x20/0x24)
[ 232.401184] [<a00123d4>] (show_stack) from [<a0a9b664>] (dump_stack+0x78/0x94)
[ 232.408372] [<a0a9b664>] (dump_stack) from [<a000f670>] (show_regs+0x1c/0x20)
[ 232.415473] [<a000f670>] (show_regs) from [<a0014afc>] (handle_IPI+0x1c8/0x284)
[ 232.422745] [<a0014afc>] (handle_IPI) from [<a000869c>] (gic_handle_irq+0x64/0x6c)
[ 232.430276] [<a000869c>] (gic_handle_irq) from [<a00131c8>] (__irq_usr+0x48/0x60)
[ 232.437718] Exception stack(0xbd72bfb0 to 0xbd72bff8)
[ 232.442740] bfa0: 7a1c000c 8761e868 8759ebe0 7a1c000c
[ 232.450876] bfc0: 8761e868 893a4edd 6eb80920 00000078 8761e878 8761e868 7a1ba4e9 00000058
[ 232.459011] bfe0: 00000000 6eb80900 7a1c0004 7a1b897e 600f0030 ffffffff
#