SystemService.PHASE_ACTIVITY_MANAGER_READY); ... // watchdog启动【见小节3.1】 Watchdog.getInstance().start(); mSystemServiceManager.startBootPhase( SystemService.PHASE_THIRD_PARTY_APPS_CAN_START); }
} }
system_server进程启动的过程中初始化WatchDog,主要有:
- 创建watchdog对象,该对象本身继承于Thread;
- 注册reboot广播;
- 调用start()开始工作。
2.2 getInstance
[-> Watchdog.java]
public static Watchdog getInstance() { if (sWatchdog == null) { //单例模式,创建实例对象【见小节2.3 】 sWatchdog = new Watchdog(); } return sWatchdog; }
2.3 创建Watchdog
[-> Watchdog.java]
public class Watchdog extends Thread { //所有的HandlerChecker对象组成的列表,HandlerChecker对象类型【见小节2.3.1】 final ArrayList mHandlerCheckers = new ArrayList<>(); ...
private Watchdog() { super("watchdog"); //将前台线程加入队列 mMonitorChecker = new HandlerChecker(FgThread.getHandler(), "foreground thread", DEFAULT_TIMEOUT); mHandlerCheckers.add(mMonitorChecker); //将主线程加入队列 mHandlerCheckers.add(new HandlerChecker(new Handler(Looper.getMainLooper()), "main thread", DEFAULT_TIMEOUT)); //将ui线程加入队列 mHandlerCheckers.add(new HandlerChecker(UiThread.getHandler(), "ui thread", DEFAULT_TIMEOUT)); //将i/o线程加入队列 mHandlerCheckers.add(new HandlerChecker(IoThread.getHandler(), "i/o thread", DEFAULT_TIMEOUT)); //将display线程加入队列 mHandlerCheckers.add(new HandlerChecker(DisplayThread.getHandler(), "display thread", DEFAULT_TIMEOUT)); //【见小节2.3.2】 addMonitor(new BinderThreadMonitor()); }
}
Watchdog继承于Thread,创建的线程名为”watchdog”。mHandlerCheckers队列包括、 主线程,fg, ui, io, display线程的HandlerChecker对象。
2.3.1 HandlerChecker
[-> Watchdog.java]
public final class HandlerChecker implements Runnable { private final Handler mHandler; //Handler对象 private final String mName; //线程描述名 private final long mWaitMax; //最长等待时间 //记录着监控的服务 private final ArrayList mMonitors = new ArrayList(); private boolean mCompleted; //开始检查时先设置成false private Monitor mCurrentMonitor; private long mStartTime; //开始准备检查的时间点
HandlerChecker(Handler handler, String name, long waitMaxMillis) { mHandler = handler; mName = name; mWaitMax = waitMaxMillis; mCompleted = true; } }
2.3.2 addMonitor
public class Watchdog extends Thread { public void addMonitor(Monitor monitor) { synchronized (this) { ... //此处mMonitorChecker数据类型为HandlerChecker mMonitorChecker.addMonitor(monitor); } }
public final class HandlerChecker implements Runnable { private final ArrayList mMonitors = new ArrayList();
public void addMonitor(Monitor monitor) { //将上面的BinderThreadMonitor添加到mMonitors队列 mMonitors.add(monitor); } ... } }
监控Binder线程, 将monitor添加到HandlerChecker的成员变量mMonitors列表中。 在这里是将BinderThreadMonitor对象加入该线程。
private static final class BinderThreadMonitor implements Watchdog.Monitor { public void monitor() { Binder.blockUntilThreadAvailable(); } }
blockUntilThreadAvailable最终调用的是IPCThreadState,等待有空闲的binder线程
void IPCThreadState::blockUntilThreadAvailable() { pthread_mutex_lock(&mProcess->mThreadCountLock); while (mProcess->mExecutingThreadsCount >= mProcess->mMaxThreads) { //等待正在执行的binder线程小于进程最大binder线程上限(16个) pthread_cond_wait(&mProcess->mThreadCountDecrement, &mProcess->mThreadCountLock); } pthread_mutex_unlock(&mProcess->mThreadCountLock); }
可见addMonitor(new BinderThreadMonitor())是将Binder线程添加到android.fg线程的handler(mMonitorChecker)来检查是否工作正常。
2.3 init
[-> Watchdog.java]
public void init(Context context, ActivityManagerService activity) { mResolver = context.getContentResolver(); mActivity = activity; //注册reboot广播接收者【见小节2.3.1】 context.registerReceiver(new RebootRequestReceiver(), new IntentFilter(Intent.ACTION_REBOOT), android.Manifest.permission.REBOOT, null); }
2.3.1 RebootRequestReceiver
[-> Watchdog.java]
final class RebootRequestReceiver extends BroadcastReceiver { @Override public void onReceive(Context c, Intent intent) { if (intent.getIntExtra("nowait", 0) != 0) { //【见小节2.3.2】 rebootSystem("Received ACTION_REBOOT broadcast"); return; } Slog.w(TAG, "Unsupported ACTION_REBOOT broadcast: " + intent); } }
2.3.2 rebootSystem
[-> Watchdog.java]
void rebootSystem(String reason) { Slog.i(TAG, "Rebooting system because: " + reason); IPowerManager pms = (IPowerManager)ServiceManager.getService(Context.POWER_SERVICE); try { //通过PowerManager执行reboot操作 pms.reboot(false, reason, false); } catch (RemoteException ex) { } }
最终是通过PowerManagerService来完成重启操作,具体的重启流程后续会单独讲述。
三、Watchdog检测机制
当调用Watchdog.getInstance().start()时,则进入线程“watchdog”的run()方法, 该方法分成两部分:
- 前半部 [小节3.1] 用于监测是否触发超时;
- 后半部 [小节4.1], 当触发超时则输出各种信息。
3.1 run
[-> Watchdog.java]
public void run() { boolean waitedHalf = false; while (true) { final ArrayList blockedCheckers; final String subject; final boolean allowRestart; int debuggerWasConnected = 0; synchronized (this) { long timeout = CHECK_INTERVAL; //CHECK_INTERVAL=30s for (int i=0; i<mHandlerCheckers.size(); i++) { HandlerChecker hc = mHandlerCheckers.get(i); //执行所有的Checker的监控方法, 每个Checker记录当前的mStartTime[见小节3.2] hc.scheduleCheckLocked(); }
if (debuggerWasConnected > 0) { debuggerWasConnected--; }
long start = SystemClock.uptimeMillis(); //通过循环,保证执行30s才会继续往下执行 while (timeout > 0) { if (Debug.isDebuggerConnected()) { debuggerWasConnected = 2; } try { wait(timeout); //触发中断,直接捕获异常,继续等待. } catch (InterruptedException e) { Log.wtf(TAG, e); } if (Debug.isDebuggerConnected()) { debuggerWasConnected = 2; } timeout = CHECK_INTERVAL - (SystemClock.uptimeMillis() - start); }
//评估Checker状态【见小节3.3】 final int waitState = evaluateCheckerCompletionLocked(); if (waitState == COMPLETED) { waitedHalf = false; continue; } else if (waitState == WAITING) { continue; } else if (waitState == WAITED_HALF) { if (!waitedHalf) { //首次进入等待时间过半的状态 ArrayList pids = new ArrayList(); pids.add(Process.myPid()); //输出system_server和3个native进程的traces【见小节4.2】 ActivityManagerService.dumpStackTraces(true, pids, null, null, NATIVE_STACKS_OF_INTEREST); waitedHalf = true; } continue; } ... //进入这里,意味着Watchdog已超时【见小节4.1】 } ... } }
public static final String[] NATIVE_STACKS_OF_INTEREST = new String[] { "/system/bin/mediaserver", "/system/bin/sdcard", "/system/bin/surfaceflinger" };
该方法主要功能:
- 执行所有的Checker的监控方法scheduleCheckLocked()
- 当mMonitor个数为0(除了android.fg线程之外都为0)且处于poll状态,则设置mCompleted = true;
- 当上次check还没有完成, 则直接返回.
- 等待30s后, 再调用evaluateCheckerCompletionLocked来评估Checker状态;
- 根据waitState状态来执行不同的操作:
- 当COMPLETED或WAITING,则相安无事;
- 当WAITED_HALF(超过30s)且为首次, 则输出system_server和3个Native进程的traces;
- 当OVERDUE, 则输出更多信息.
由此,可见当触发一次Watchdog, 则必然会调用两次AMS.dumpStackTraces, 也就是说system_server和3个Native进程的traces 的traces信息会输出两遍,且时间间隔超过30s.
3.2 scheduleCheckLocked
public final class HandlerChecker implements Runnable { ... public void scheduleCheckLocked() { if (mMonitors.size() == 0 && mHandler.getLooper().getQueue().isPolling()) { mCompleted = true; //当目标looper正在轮询状态则返回。 return; }
if (!mCompleted) { return; //有一个check正在处理中,则无需重复发送 } mCompleted = false;
mCurrentMonitor = null; // 记录当下的时间 mStartTime = SystemClock.uptimeMillis(); //发送消息,插入消息队列最开头, 见下方的run()方法 mHandler.postAtFrontOfQueue(this); }
public void run() { final int size = mMonitors.size(); for (int i = 0 ; i < size ; i++) { synchronized (Watchdog.this) { mCurrentMonitor = mMonitors.get(i); } //回调具体服务的monitor方法 mCurrentMonitor.monitor(); }
synchronized (Watchdog.this) { mCompleted = true; mCurrentMonitor = null; } } }
该方法主要功能: 向Watchdog的监控线程的Looper池的最头部执行该HandlerChecker.run()方法, 在该方法中调用monitor(),执行完成后会设置mCompleted = true. 那么当handler消息池当前的消息, 导致迟迟没有机会执行monitor()方法, 则会触发watchdog.
其中postAtFrontOfQueue(this),该方法输入参数为Runnable对象,根据消息机制, 最终会回调HandlerChecker中的run方法,该方法会循环遍历所有的Monitor接口,具体的服务实现该接口的monitor()方法。
可能的问题,如果有其他消息不断地调用postAtFrontOfQueue()也可能导致watchdog没有机会执行;或者是每个monitor消耗一些时间,雷加起来超过1分钟造成的watchdog. 这些都是非常规的Watchdog.
3.3 evaluateCheckerCompletionLocked
private int evaluateCheckerCompletionLocked() { int state = COMPLETED; for (int i=0; i<mHandlerCheckers.size(); i++) { HandlerChecker hc = mHandlerCheckers.get(i); //【见小节3.4】 state = Math.max(state, hc.getCompletionStateLocked()); } return state; }
获取mHandlerCheckers列表中等待状态值最大的state.
3.4 getCompletionStateLocked
public int getCompletionStateLocked() { if (mCompleted) { return COMPLETED; } else { long latency = SystemClock.uptimeMillis() - mStartTime; // mWaitMax默认是60s if (latency < mWaitMax/2) { return WAITING; } else if (latency < mWaitMax) { return WAITED_HALF; } } return OVERDUE; }
- COMPLETED = 0:等待完成;
- WAITING = 1:等待时间小于DEFAULT_TIMEOUT的一半,即30s;
- WAITED_HALF = 2:等待时间处于30s~60s之间;
- OVERDUE = 3:等待时间大于或等于60s。
四. Watchdog处理流程
4.1 run
[-> Watchdog.java]
public void run() { while (true) { synchronized (this) { ... //获取被阻塞的checkers 【见小节4.1.1】 blockedCheckers = getBlockedCheckersLocked(); // 获取描述信息 【见小节4.1.2】 subject = describeCheckersLocked(blockedCheckers); allowRestart = mAllowRestart; }
EventLog.writeEvent(EventLogTags.WATCHDOG, subject);
ArrayList pids = new ArrayList(); pids.add(Process.myPid()); if (mPhonePid > 0) pids.add(mPhonePid); //第二次以追加的方式,输出system_server和3个native进程的栈信息【见小节4.2】 final File stack = ActivityManagerService.dumpStackTraces( !waitedHalf, pids, null, null, NATIVE_STACKS_OF_INTEREST);
//系统已被阻塞1分钟,也不在乎多等待2s,来确保stack trace信息输出 SystemClock.sleep(2000);
if (RECORD_KERNEL_THREADS) { //输出kernel栈信息【见小节4.3】 dumpKernelStackTraces(); }
//触发kernel来dump所有阻塞线程【见小节4.4】 doSysRq('l');
//输出dropbox信息【见小节4.5】 Thread dropboxThread = new Thread("watchdogWriteToDropbox") { public void run() { mActivity.addErrorToDropBox( "watchdog", null, "system_server", null, null, subject, null, stack, null); } }; dropboxThread.start();
try { dropboxThread.join(2000); //等待dropbox线程工作2s } catch (InterruptedException ignored) { }
IActivityController controller; synchronized (this) { controller = mController; } if (controller != null) { //将阻塞状态报告给activity controller, try { Binder.setDumpDisabled("Service dumps disabled due to hung system process."); //返回值为1表示继续等待,-1表示杀死系统 int res = controller.systemNotResponding(subject); if (res >= 0) { waitedHalf = false; continue; //设置ActivityController的某些情况下,可以让发生Watchdog时继续等待 } } catch (RemoteException e) { } }
//当debugger没有attach时,才杀死进程 if (Debug.isDebuggerConnected()) { debuggerWasConnected = 2; } if (debuggerWasConnected >= 2) { Slog.w(TAG, "Debugger connected: Watchdog is not killing the system process"); } else if (debuggerWasConnected > 0) { Slog.w(TAG, "Debugger was connected: Watchdog is not killing the system process"); } else if (!allowRestart) { Slog.w(TAG, "Restart not allowed: Watchdog is not killing the system process"); } else { Slog.w(TAG, "*** WATCHDOG KILLING SYSTEM PROCESS: " + subject); //遍历输出阻塞线程的栈信息 for (int i=0; i<blockedCheckers.size(); i++) { Slog.w(TAG, blockedCheckers.get(i).getName() + " stack trace:"); StackTraceElement[] stackTrace = blockedCheckers.get(i).getThread().getStackTrace(); for (StackTraceElement element: stackTrace) { Slog.w(TAG, " at " + element); } } Slog.w(TAG, "*** GOODBYE!"); //杀死进程system_server【见小节4.6】 Process.killProcess(Process.myPid()); System.exit(10); } waitedHalf = false; } }
Watchdog检测到异常的信息收集工作:
- AMS.dumpStackTraces:输出Java和Native进程的栈信息;
- WD.dumpKernelStackTraces:输出Kernel栈信息;
- doSysRq
- dropBox
收集完信息后便会杀死system_server进程。此处allowRestart默认值为true, 当执行am hang操作则设置不允许重启(allowRestart =false), 则不会杀死system_server进程.
4.1.1 getBlockedCheckersLocked
private ArrayList getBlockedCheckersLocked() { ArrayList checkers = new ArrayList(); //遍历所有的Checker for (int i=0; i<mHandlerCheckers.size(); i++) { HandlerChecker hc = mHandlerCheckers.get(i); //将所有没有完成,且超时的checker加入队列 if (hc.isOverdueLocked()) { checkers.add(hc); } } return checkers; }
4.1.2 describeCheckersLocked
private String describeCheckersLocked(ArrayList checkers) { StringBuilder builder = new StringBuilder(128); for (int i=0; i<checkers.size(); i++) { if (builder.length() > 0) { builder.append(", "); } // 输出所有的checker信息 builder.append(checkers.get(i).describeBlockedStateLocked()); } return builder.toString(); }
public String describeBlockedStateLocked() { //非前台线程进入该分支 if (mCurrentMonitor == null) { return "Blocked in handler on " + mName + " (" + getThread().getName() + ")"; //前台线程进入该分支 } else { return "Blocked in monitor " + mCurrentMonitor.getClass().getName()
- " on " + mName + " (" + getThread().getName() + ")"; } }
将所有执行时间超过1分钟的handler线程或者monitor都记录下来.
- 当输出的信息是
Blocked in handler,意味着相应的线程处理当前消息时间超过1分钟; - 当输出的信息是
Blocked in monitor,意味着相应的线程处理当前消息时间超过1分钟,或者monitor迟迟拿不到锁;
4.2 AMS.dumpStackTraces
public static File dumpStackTraces(boolean clearTraces, ArrayList firstPids, ProcessCpuTracker processCpuTracker, SparseArray lastPids, String[] nativeProcs) { //默认为 data/anr/traces.txt String tracesPath = SystemProperties.get("dalvik.vm.stack-trace-file", null); if (tracesPath == null || tracesPath.length() == 0) { return null; }
File tracesFile = new File(tracesPath); try {
网上学习资料一大堆,但如果学到的知识不成体系,遇到问题时只是浅尝辄止,不再深入研究,那么很难做到真正的技术提升。
一个人可以走的很快,但一群人才能走的更远!不论你是正从事IT行业的老鸟或是对IT行业感兴趣的新人,都欢迎加入我们的的圈子(技术交流、学习资源、职场吐槽、大厂内推、面试辅导),让我们一起学习成长!