ANR系列文章
IdleHandler你会用吗?记一次IdleHandler使用误区,导致ANR
1. 现场收集哪些证据
- 经过前文触发机制源码分析我们了解到发生ANR的原理,但是想要解决问题了解原理只是基础,更重要的现场证据。下面我们先从代码角度看看会有哪些现场信息。
源码分析
appNotResponding()方法
//com.android.server.am.AppErrors.java
/* app: 当前发生ANR的进程
* activity: 发生ANR的界面
* parent: 发生ANR的界面的上一级界面
* aboveSystem:
* annotation: 发生ANR的原因
*/
final void appNotResponding(ProcessRecord app, ActivityRecord activity,
ActivityRecord parent, boolean aboveSystem, final String annotation) {
// 构建firstPids和lastPids数组
// firstPids: 用于保存ANR进程及其父进程,system_server进程和persistent的进程
// lastPids: 用于保存除firstPids外的其他进程
ArrayList<Integer> firstPids = new ArrayList<Integer>(5);
SparseArray<Boolean> lastPids = new SparseArray<Boolean>(20);
if (mService.mController != null) {
try {
// 0 == continue, -1 = kill process immediately
int res = mService.mController.appEarlyNotResponding(
app.processName, app.pid, annotation);
if (res < 0 && app.pid != MY_PID) {
app.kill("anr", true);
}
} catch (RemoteException e) {
mService.mController = null;
Watchdog.getInstance().setActivityController(null);
}
}
long anrTime = SystemClock.uptimeMillis();
if (ActivityManagerService.MONITOR_CPU_USAGE) {
// 更新CPU使用信息
mService.updateCpuStatsNow();
}
// Unless configured otherwise, swallow ANRs in background processes & kill the process.
// 如果配置了ANR_SHOW_BACKGROUND(anr_show_background)值为非空,则会弹出一个对话框,否则静默kill
boolean showBackground = Settings.Secure.getInt(mContext.getContentResolver(),
Settings.Secure.ANR_SHOW_BACKGROUND, 0) != 0;
boolean isSilentANR;
// 跳过下面这些场景下的ANR
synchronized (mService) {
// PowerManager.reboot() can block for a long time, so ignore ANRs while shutting down.
//正在关机的
if (mService.mShuttingDown) {
Slog.i(TAG, "During shutdown skipping ANR: " + app + " " + annotation);
return;
//已经有一个ANR弹出框时
} else if (app.notResponding) {
Slog.i(TAG, "Skipping duplicate ANR: " + app + " " + annotation);
return;
//正在处理crash的
} else if (app.crashing) {
Slog.i(TAG, "Crashing app skipping ANR: " + app + " " + annotation);
return;
//被AMS kill的
} else if (app.killedByAm) {
Slog.i(TAG, "App already killed by AM skipping ANR: " + app + " " + annotation);
return;
// 进程已经被kill
} else if (app.killed) {
Slog.i(TAG, "Skipping died app ANR: " + app + " " + annotation);
return;
}
// In case we come through here for the same app before completing
// this one, mark as anring now so we will bail out.
//防止同一个进程重复dump信息
app.notResponding = true;
//1.Log the ANR to the event log. 事件记录到eventlog中
EventLog.writeEvent(EventLogTags.AM_ANR, app.userId, app.pid,
app.processName, app.info.flags, annotation);
// Dump thread traces as quickly as we can, starting with "interesting" processes.
// 将当前进程添加到firstPids
firstPids.add(app.pid);
// 2.Don't dump other PIDs if it's a background ANR
//showBackground为false(不显示后台ANR的dialog)
isSilentANR = !showBackground && !isInterestingForBackgroundTraces(app);
if (!isSilentANR) {
int parentPid = app.pid;
if (parent != null && parent.app != null && parent.app.pid > 0) {
parentPid = parent.app.pid;
}
if (parentPid != app.pid) firstPids.add(parentPid);
// MY_PID为system_server的PID,将system_server进程添加到firstPids
if (MY_PID != app.pid && MY_PID != parentPid) firstPids.add(MY_PID);
for (int i = mService.mLruProcesses.size() - 1; i >= 0; i--) {
ProcessRecord r = mService.mLruProcesses.get(i);
if (r != null && r.thread != null) {
int pid = r.pid;
if (pid > 0 && pid != app.pid && pid != parentPid && pid != MY_PID) {
// 将persistent进程添加到firstPids
if (r.persistent) {
firstPids.add(pid);
if (DEBUG_ANR) Slog.i(TAG, "Adding persistent proc: " + r);
} else if (r.treatLikeActivity) {
firstPids.add(pid);
if (DEBUG_ANR) Slog.i(TAG, "Adding likely IME: " + r);
} else {
//其他进程添加到lastPids
lastPids.put(pid, Boolean.TRUE);
if (DEBUG_ANR) Slog.i(TAG, "Adding ANR proc: " + r);
}
}
}
}
}
}
// Log the ANR to the main log.
// 把ANR信息输出到main log
StringBuilder info = new StringBuilder();
info.setLength(0);
info.append("ANR in ").append(app.processName);
if (activity != null && activity.shortComponentName != null) {
info.append(" (").append(activity.shortComponentName).append(")");
}
info.append("\n");
info.append("PID: ").append(app.pid).append("\n");
if (annotation != null) {
info.append("Reason: ").append(annotation).append("\n");
}
if (parent != null && parent != activity) {
info.append("Parent: ").append(parent.shortComponentName).append("\n");
}
ProcessCpuTracker processCpuTracker = new ProcessCpuTracker(true);
// don't dump native PIDs for background ANRs unless it is the process of interest
String[] nativeProcs = null;
if (isSilentANR) {
for (int i = 0; i < NATIVE_STACKS_OF_INTEREST.length; i++) {
if (NATIVE_STACKS_OF_INTEREST[i].equals(app.processName)) {
nativeProcs = new String[] { app.processName };
break;
}
}
} else {
nativeProcs = NATIVE_STACKS_OF_INTEREST;
}
// pids为NATIVE_STACKS_OF_INTEREST中定义的几个进程
int[] pids = nativeProcs == null ? null : Process.getPidsForCommands(nativeProcs);
ArrayList<Integer> nativePids = null;
if (pids != null) {
nativePids = new ArrayList<Integer>(pids.length);
for (int i : pids) {
nativePids.add(i);
}
}
// For background ANRs, don't pass the ProcessCpuTracker to
// avoid spending 1/2 second collecting stats to rank lastPids.
// 输出traces信息,后面细说
File tracesFile = ActivityManagerService.dumpStackTraces(
true, firstPids,
(isSilentANR) ? null : processCpuTracker,
(isSilentANR) ? null : lastPids,
nativePids);
String cpuInfo = null;
// cpu信息统计,各个进程的cpu使用情况
if (ActivityManagerService.MONITOR_CPU_USAGE) {
mService.updateCpuStatsNow();
synchronized (mService.mProcessCpuTracker) {
// 记录ANR之前的cpu使用情况(CPU usage from 38980ms to 0ms ago)
cpuInfo = mService.mProcessCpuTracker.printCurrentState(anrTime);
}
info.append(processCpuTracker.printCurrentLoad());
info.append(cpuInfo);
}
// 记录从anr时间开始的cpu使用情况(CPU usage from 72ms to 465ms later)
info.append(processCpuTracker.printCurrentState(anrTime));
// 输出info信息,包含ANR的Reason、CPU负载信息以及使用率
Slog.e(TAG, info.toString());
if (tracesFile == null) {
// There is no trace file, so dump (only) the alleged culprit's threads to the log
// 如果trace为空,则发送singal 3到发送ANR的进程,相当于adb shell kill -3 pid
Process.sendSignal(app.pid, Process.SIGNAL_QUIT);
}
// 将traces文件和CPU使用率信息保存到dropbox,即data/system/dropbox
StatsLog.write(StatsLog.ANR_OCCURRED, app.uid, app.processName,
activity == null ? "unknown": activity.shortComponentName, annotation,
(app.info != null) ? (app.info.isInstantApp()
? StatsLog.ANROCCURRED__IS_INSTANT_APP__TRUE
: StatsLog.ANROCCURRED__IS_INSTANT_APP__FALSE)
: StatsLog.ANROCCURRED__IS_INSTANT_APP__UNAVAILABLE,
app != null ? (app.isInterestingToUserLocked()
? StatsLog.ANROCCURRED__FOREGROUND_STATE__FOREGROUND
: StatsLog.ANROCCURRED__FOREGROUND_STATE__BACKGROUND)
: StatsLog.ANROCCURRED__FOREGROUND_STATE__UNKNOWN);
mService.addErrorToDropBox("anr", app, app.processName, activity, parent, annotation,
cpuInfo, tracesFile, null);
if (mService.mController != null) {
try {
// 0 == show dialog, 1 = keep waiting, -1 = kill process immediately
int res = mService.mController.appNotResponding(
app.processName, app.pid, info.toString());
if (res != 0) {
if (res < 0 && app.pid != MY_PID) {
app.kill("anr", true);
} else {
synchronized (mService) {
mService.mServices.scheduleServiceTimeoutLocked(app);
}
}
return;
}
} catch (RemoteException e) {
mService.mController = null;
Watchdog.getInstance().setActivityController(null);
}
}
synchronized (mService) {
mService.mBatteryStatsService.noteProcessAnr(app.processName, app.uid);
// 后台进程ANR,则直接kill
if (isSilentANR) {
app.kill("bg anr", true);
return;
}
// Set the app's notResponding state, and look up the errorReportReceiver
makeAppNotRespondingLocked(app,
activity != null ? activity.shortComponentName : null,
annotation != null ? "ANR " + annotation : "ANR",
info.toString());
// Bring up the infamous App Not Responding dialog
Message msg = Message.obtain();
msg.what = ActivityManagerService.SHOW_NOT_RESPONDING_UI_MSG;
msg.obj = new AppNotRespondingDialog.Data(app, activity, aboveSystem);
mService.mUiHandler.sendMessage(msg);
}
}
总结下
-
1、输出ANR Reason信息到Events log中,关键字是am_anr,一般都是用这个信息来确定Anr的发生时间
-
2、如果是后台的anr,即isSilentANR值为true,那么不显示ANR对话框,直接kill掉ANR所在进程
-
3、输出ANR 信息到main log中,关键字是ANR in,含有Cpu负载信息,一般都是用这个看ANR发生时候的其他进程的资源占用情况
-
4、调用dumpStackTraces去生成trace文件,这个trace文件有哪些进程呢?
a.firstPids队列:第一个是ANR进程,第二个是system_server,剩余是所有persistent进程;
b.Native队列:是指/system/bin/目录的mediaserver,sdcard 以及surfaceflinger进程;
c.lastPids队列: 是指mLruProcesses中的不属于firstPids的所有进程。 -
5、将traces文件和 CPU使用情况信息保存到dropbox,即data/system/dropbox目录
-
6、弹窗告知用户
另外提下额外信息
- 1、am_anr关键字比ANR in要先打印,所以更接近ANR的实际发生时间
下面来继续来看
dumpStackTraces()方法
//com.android.server.am.ActivityManagerService.java
public static File dumpStackTraces(boolean clearTraces, ArrayList<Integer> firstPids,
ProcessCpuTracker processCpuTracker, SparseArray<Boolean> lastPids,
ArrayList<Integer> nativePids) {
ArrayList<Integer> extraPids = null;
// Measure CPU usage as soon as we're called in order to get a realistic sampling
// of the top users at the time of the request.
// 一旦调用此方法就马上计算CPU使用率以在请求的时候获取top用户的实际采样
if (processCpuTracker != null) {
processCpuTracker.init();
try {
Thread.sleep(200);
} catch (InterruptedException ignored) {
}
// 测量CPU使用情况
processCpuTracker.update();
// We'll take the stack crawls of just the top apps using CPU.
final int N = processCpuTracker.countWorkingStats();
extraPids = new ArrayList<>();
// 从lastPids中选取CPU使用率top 5的进程,晚些时机输出这些进程的stacks
for (int i = 0; i < N && extraPids.size() < 5; i++) {
ProcessCpuTracker.Stats stats = processCpuTracker.getWorkingStats(i);
if (lastPids.indexOfKey(stats.pid) >= 0) {
if (DEBUG_ANR) Slog.d(TAG, "Collecting stacks for extra pid " + stats.pid);
extraPids.add(stats.pid);
} else if (DEBUG_ANR) {
Slog.d(TAG, "Skipping next CPU consuming process, not a java proc: "
+ stats.pid);
}
}
}
boolean useTombstonedForJavaTraces = false;
File tracesFile;
//如果dalvik.vm.stack-trace-dir没有配置,trace就写到全局文件data/anr/traces.txt中
final String tracesDirProp = SystemProperties.get("dalvik.vm.stack-trace-dir", "");
if (tracesDirProp.isEmpty()) {
// When dalvik.vm.stack-trace-dir is not set, we are using the "old" trace
// dumping scheme. All traces are written to a global trace file (usually
// "/data/anr/traces.txt") so the code below must take care to unlink and recreate
// the file if requested.
//
// This mode of operation will be removed in the near future.
String globalTracesPath = SystemProperties.get("dalvik.vm.stack-trace-file", null);
if (globalTracesPath.isEmpty()) {
Slog.w(TAG, "dumpStackTraces: no trace path configured");
return null;
}
tracesFile = new File(globalTracesPath);
try {
//删除旧的,创建新的
if (clearTraces && tracesFile.exists()) {
tracesFile.delete();
}
tracesFile.createNewFile();
//注意这里的权限设置 8.0以前的版本是没有的
FileUtils.setPermissions(globalTracesPath, 0666, -1, -1); // -rw-rw-rw-
} catch (IOException e) {
Slog.w(TAG, "Unable to prepare ANR traces file: " + tracesFile, e);
return null;
}
} else {
File tracesDir = new File(tracesDirProp);
// When dalvik.vm.stack-trace-dir is set, we use the "new" trace dumping scheme.
// Each set of ANR traces is written to a separate file and dumpstate will process
// all such files and add them to a captured bug report if they're recent enough.
maybePruneOldTraces(tracesDir);
// NOTE: We should consider creating the file in native code atomically once we've
// gotten rid of the old scheme of dumping and lot of the code that deals with paths
// can be removed.
//以当前时间命名的trace文件 yyyy-MM-dd-HH-mm-ss-SSS
tracesFile = createAnrDumpFile(tracesDir);
if (tracesFile == null) {
return null;
}
useTombstonedForJavaTraces = true;
}
dumpStackTraces(tracesFile.getAbsolutePath(), firstPids, nativePids, extraPids,
useTombstonedForJavaTraces);
return tracesFile;
}
总结一下上面代码,这段代码并没有真正的执行dump操作,只是确定了trace文件路径,如果”dalvik.vm.stack-trace-dir”这个属性没有配置,就使用旧的dump策略,trace信息写入到全局trace文件/data/anr/traces.txt中,删除旧的traces文件,创建新的traces文件;如果”dalvik.vm.stack-trace-dir”属性被配置了,就创建格式为anr_yyyy-MM-dd-HH-mm-ss-SSS的文件。
dumpStackTraces()方法
private static void dumpStackTraces(String tracesFile, ArrayList<Integer> firstPids,
ArrayList<Integer> nativePids, ArrayList<Integer> extraPids,
boolean useTombstonedForJavaTraces) {
// We don't need any sort of inotify based monitoring when we're dumping traces via
// tombstoned. Data is piped to an "intercept" FD installed in tombstoned so we're in full
// control of all writes to the file in question.
final DumpStackFileObserver observer;
if (useTombstonedForJavaTraces) {
observer = null;
} else {
// Use a FileObserver to detect when traces finish writing.
// The order of traces is considered important to maintain for legibility.
observer = new DumpStackFileObserver(tracesFile);
}
// We must complete all stack dumps within 20 seconds.
// 所有的dump操作需要在20秒之内完成
long remainingTime = 20 * 1000;
try {
if (observer != null) {
observer.startWatching();
}
// 第一步收集firstPids中进程的trace,超时会自动放弃
// First collect all of the stacks of the most important pids.
if (firstPids != null) {
int num = firstPids.size();
for (int i = 0; i < num; i++) {
if (DEBUG_ANR) Slog.d(TAG, "Collecting stacks for pid "
+ firstPids.get(i));
final long timeTaken;
if (useTombstonedForJavaTraces) {
timeTaken = dumpJavaTracesTombstoned(firstPids.get(i), tracesFile, remainingTime);
} else {
timeTaken = observer.dumpWithTimeout(firstPids.get(i), remainingTime);
}
remainingTime -= timeTaken;
if (remainingTime <= 0) {
Slog.e(TAG, "Aborting stack trace dump (current firstPid=" + firstPids.get(i) +
"); deadline exceeded.");
return;
}
if (DEBUG_ANR) {
Slog.d(TAG, "Done with pid " + firstPids.get(i) + " in " + timeTaken + "ms");
}
}
}
// 第二步收集nativePids中进程的trace,超时会自动放弃
// Next collect the stacks of the native pids
if (nativePids != null) {
for (int pid : nativePids) {
if (DEBUG_ANR) Slog.d(TAG, "Collecting stacks for native pid " + pid);
final long nativeDumpTimeoutMs = Math.min(NATIVE_DUMP_TIMEOUT_MS, remainingTime);
final long start = SystemClock.elapsedRealtime();
Debug.dumpNativeBacktraceToFileTimeout(
pid, tracesFile, (int) (nativeDumpTimeoutMs / 1000));
final long timeTaken = SystemClock.elapsedRealtime() - start;
remainingTime -= timeTaken;
if (remainingTime <= 0) {
Slog.e(TAG, "Aborting stack trace dump (current native pid=" + pid +
"); deadline exceeded.");
return;
}
if (DEBUG_ANR) {
Slog.d(TAG, "Done with native pid " + pid + " in " + timeTaken + "ms");
}
}
}
//第三步收集extraPids中进程的trace,超时会自动放弃,这个代码和第一步一样的
// Lastly, dump stacks for all extra PIDs from the CPU tracker.
if (extraPids != null) {
for (int pid : extraPids) {
if (DEBUG_ANR) Slog.d(TAG, "Collecting stacks for extra pid " + pid);
final long timeTaken;
if (useTombstonedForJavaTraces) {
timeTaken = dumpJavaTracesTombstoned(pid, tracesFile, remainingTime);
} else {
timeTaken = observer.dumpWithTimeout(pid, remainingTime);
}
remainingTime -= timeTaken;
if (remainingTime <= 0) {
Slog.e(TAG, "Aborting stack trace dump (current extra pid=" + pid +
"); deadline exceeded.");
return;
}
if (DEBUG_ANR) {
Slog.d(TAG, "Done with extra pid " + pid + " in " + timeTaken + "ms");
}
}
}
} finally {
if (observer != null) {
observer.stopWatching();
}
}
}
总结下以上逻辑,firstPids和extraPids中的进程dump trace有两种方式,看useTombstonedForJavaTraces的取值;
- 如果useTombstonedForJavaTraces为真,采用dumpJavaTracesTombstoned获取trace
- 如果useTombstonedForJavaTraces为假,采用DumpStackFileObserver#dumpWithTimeout获取trace
对于nativePids中的进程,只能通过 Debug.dumpNativeBacktraceToFileTimeout来获取trace。
- 额外看下常见的几个信号量的差异
//Process.java:
public static final int SIGNAL_QUIT = 3; //用于输出线程trace
public static final int SIGNAL_KILL = 9; //用于杀进程/线程
public static final int SIGNAL_USR1 = 10; //用于强制执行GC
- 官方说明:
信号SIGKILL的处理过程,这是因为SIGKILL是不能被忽略同时也不能被捕获,故不会由目标线程的signalCatcher线程来处理,而是由内核直接处理,到此便完成。 但对于信号3和10,则是交由目标进程(art虚拟机)的SignalCatcher线程来捕获完成相应操作的,接下来进入目标线程来处理相应的信号。其中SIGNAL_QUIT会dump当前进程全部现场的stack,来帮助我们定位ANR原因。
问题思考
-
问题1: 利用am_anr来确定ANR的发生时间真的准确吗? 不一定准确,可能由于系统资源比较紧张造成一定程度的滞后,十多秒都可能,此时需要结合eventlog进一步分析。
-
问题2:ANR中经常出现没有trace,或者无效trace是怎么回事? 1.从appNotResponding方法中,我们看到有不少return的地方,在dumpStackTraces方法,也会有return出现,比如这么多进程的trace需要在20秒之内完成,在系统资源紧张的时候,能不能完成,是非常没有保障的;
2.有一些ANR问题需要binder对端的trace,通过上面的分析看,大概率是没有对端的trace的,除非binder call到system_server或者比较重要的几个native进程里面;
3.发生ANR的时候,线程并没有暂停掉,有可能等到我们去抓这个进程的trace的时候,当前主线程耗时的方法都已经完成,恢复正常了,比如经常出现下面nativePollOnce状态的堆栈。
"main" prio=5 tid=1 Native
| group="main" sCount=1 dsCount=0 obj=0x7685ba50 self=0x7fb0840a00
| sysTid=1460 nice=-10 cgrp=default sched=0/0 handle=0x7fb4824aa0
| state=S schedstat=( 213445473008 99617457476 724519 ) utm=14518 stm=6826 core=0 HZ=100
| stack=0x7fe82c6000-0x7fe82c8000 stackSize=8MB
| held mutexes=
kernel: __switch_to+0x70/0x7c
kernel: SyS_epoll_wait+0x2d4/0x394
kernel: SyS_epoll_pwait+0xc4/0x150
kernel: el0_svc_naked+0x24/0x28
native: #00 pc 000000000001bf2c /system/lib64/libc.so (syscall+28)
native: #01 pc 00000000000e813c /system/lib64/libart.so (_ZN3art17ConditionVariable16WaitHoldingLocksEPNS_6ThreadE+160)
native: #02 pc 000000000054b6f8 /system/lib64/libart.so (_ZN3artL12GoToRunnableEPNS_6ThreadE+328)
native: #03 pc 000000000054b56c /system/lib64/libart.so (_ZN3art12JniMethodEndEjPNS_6ThreadE+28)
native: #04 pc 00000000008ccc50 /system/framework/arm64/boot-framework.oat (Java_android_os_MessageQueue_nativePollOnce__JI+156)
at android.os.MessageQueue.nativePollOnce(Native method)
at android.os.MessageQueue.next(MessageQueue.java:323)
at android.os.Looper.loop(Looper.java:142)
at com.android.server.SystemServer.run(SystemServer.java:387)
at com.android.server.SystemServer.main(SystemServer.java:245)
at java.lang.reflect.Method.invoke!(Native method)
at com.android.internal.os.ZygoteInit$MethodAndArgsCaller.run(ZygoteInit.java:904)
at com.android.internal.os.ZygoteInit.main(ZygoteInit.java:794)
另外我们需要通过源码可以发现,当发生的是后台进程ANR,会少收集很多的信息,比如CPU使用率,CPU占比较高的5个应用,NATIVE进程情况。当然同时更不会有弹窗出现,而是直接kill掉进程。但是这些情况也是我们必须要关心的,毕竟进程活着对于应用本身来说才有更多的可能。
后面的篇章我们会通过一些实例来分析并尝试解决问题。
参考链接