Crash排查系列第九篇|后台过度使用cpu被杀

952 阅读3分钟

背景

应用中因后台过度使用资源(本文主要指 CPU)而被系统杀死导致的退出,占比在 2% 左右。

源码搜索(基于Android 13)

观察日志大部分是抛出以下格式日志。

"excessive cpu 8020 during 300091 dur=1235468 limit=2"

搜索源码在ActivityManagerService的updateAppProcessCpuTimeLPr方法中

/**
 * Measures how much CPU time {@code app} consumed since the previous check and, if
 * {@code checkExcessivePowerUsageLPr} reports the usage as excessive, schedules the process
 * to be killed with an "excessive cpu ..." reason.
 *
 * @param uptimeSince uptime elapsed since the previous power check; forwarded to the excess
 *                    check and printed after "during" in the kill reason
 * @param doCpuKills  forwarded to the excess check, which never reports excess when false
 * @param checkDur    only used here for the "dur=" part of the kill reason
 * @param cpuLimit    forwarded to the excess check and printed as "limit=" in the kill reason
 * @param app         the process record being inspected
 */
private void updateAppProcessCpuTimeLPr(final long uptimeSince, final boolean doCpuKills,
        final long checkDur, final int cpuLimit, final ProcessRecord app) {
    synchronized (mAppProfiler.mProfilerLock) {
        final ProcessProfileRecord profile = app.mProfile;
        final long curCpuTime = profile.mCurCpuTime.get();
        final long lastCpuTime = profile.mLastCpuTime.get();
        // Without a positive baseline there is nothing to diff against, so no check is made.
        if (lastCpuTime > 0) {
            final long cpuTimeUsed = curCpuTime - lastCpuTime;
            if (checkExcessivePowerUsageLPr(uptimeSince, doCpuKills, cpuTimeUsed,
                        app.processName, app.toShortString(), cpuLimit, app)) {
                // The kill itself is not performed here: it is posted to mHandler and runs
                // later while holding the ActivityManagerService lock.
                mHandler.post(() -> {
                    synchronized (ActivityManagerService.this) {
                        app.killLocked("excessive cpu " + cpuTimeUsed + " during "
                                + uptimeSince + " dur=" + checkDur + " limit=" + cpuLimit,
                                ApplicationExitInfo.REASON_EXCESSIVE_RESOURCE_USAGE,
                                ApplicationExitInfo.SUBREASON_EXCESSIVE_CPU,
                                true);
                    }
                });
                profile.reportExcessiveCpu();
            }
        }

        // The current reading becomes the baseline for the next check, whether or not a
        // kill was scheduled above.
        profile.mLastCpuTime.set(curCpuTime);
    }
}

堆栈调试

触发检测堆栈。 默认5分钟检测一次。

    at com.android.server.am.ActivityManagerService.updateAppProcessCpuTimeLPr(ActivityManagerService.java:15240)
        at com.android.server.am.ActivityManagerService.lambda$checkExcessivePowerUsage$20(ActivityManagerService.java:15224)
        at com.android.server.am.ActivityManagerService.$r8$lambda$vSwcjZLInwE40j-EAbD7kDO2Uwo(Unknown Source:0)
        at com.android.server.am.ActivityManagerService$$ExternalSyntheticLambda12.accept(Unknown Source:13)
        at com.android.server.am.ProcessList.forEachLruProcessesLOSP(ProcessList.java:3809)
        at com.android.server.am.ActivityManagerService.checkExcessivePowerUsage(ActivityManagerService.java:15206)
        at com.android.server.am.ActivityManagerService.-$$Nest$mcheckExcessivePowerUsage(Unknown Source:0)
        at com.android.server.am.ActivityManagerService$MainHandler.handleMessage(ActivityManagerService.java:1767)
        at android.os.Handler.dispatchMessage(Handler.java:106)
        at android.os.Looper.loopOnce(Looper.java:201)
        at android.os.Looper.loop(Looper.java:288)
        at android.os.HandlerThread.run(HandlerThread.java:67)
        at com.android.server.ServiceThread.run(ServiceThread.java:44)

关键检测逻辑分析

按照堆栈主要分析以下两个方法。

/**
 * Refreshes CPU statistics, then walks the LRU process list and, for every attached process
 * whose set process state is {@code PROCESS_STATE_HOME} or numerically higher, selects a CPU
 * limit and hands the process to {@code updateAppProcessCpuTimeLPr} (and optionally to
 * {@code updatePhantomProcessCpuTimeLPr}).
 */
private void checkExcessivePowerUsage() {
    updateCpuStatsNow();

    final boolean monitorPhantomProcs = mSystemReady && FeatureFlagUtils.isEnabled(mContext,
            SETTINGS_ENABLE_MONITOR_PHANTOM_PROCS);
    synchronized (mProcLock) {
        // Kills are disabled while mLastPowerCheckUptime is still 0
        // (presumably the very first pass, when no previous check exists -- confirm).
        final boolean doCpuKills = mLastPowerCheckUptime != 0;
        final long curUptime = SystemClock.uptimeMillis();
        // Milliseconds of uptime between the previous check and this one.
        final long uptimeSince = curUptime - mLastPowerCheckUptime;
        mLastPowerCheckUptime = curUptime;
        mProcessList.forEachLruProcessesLOSP(false, app -> {
            // Entries without an attached thread are skipped.
            if (app.getThread() == null) {
                return;
            }
            if (app.mState.getSetProcState() >= ActivityManager.PROCESS_STATE_HOME) {
                int cpuLimit;
                // Uptime elapsed since the process's "when unimportant" timestamp.
                long checkDur = curUptime - app.mState.getWhenUnimportant();
                // The limit is chosen from checkDur in steps of POWER_CHECK_INTERVAL.
                // Because of the enclosing ">= HOME" test, the "<= HOME" clause below only
                // matches a process that is exactly at HOME, which therefore never falls
                // through to MAX_CPU_3 or MAX_CPU_4.
                if (checkDur <= mConstants.POWER_CHECK_INTERVAL) {
                    cpuLimit = mConstants.POWER_CHECK_MAX_CPU_1;
                } else if (checkDur <= (mConstants.POWER_CHECK_INTERVAL * 2)
                        || app.mState.getSetProcState() <= ActivityManager.PROCESS_STATE_HOME) {
                    cpuLimit = mConstants.POWER_CHECK_MAX_CPU_2;
                } else if (checkDur <= (mConstants.POWER_CHECK_INTERVAL * 3)) {
                    cpuLimit = mConstants.POWER_CHECK_MAX_CPU_3;
                } else {
                    cpuLimit = mConstants.POWER_CHECK_MAX_CPU_4;
                }

                updateAppProcessCpuTimeLPr(uptimeSince, doCpuKills, checkDur, cpuLimit, app);

                if (monitorPhantomProcs) {
                    // Also check the phantom processes if there is any
                    updatePhantomProcessCpuTimeLPr(
                            uptimeSince, doCpuKills, checkDur, cpuLimit, app);
                }
            }
        });
    }
}
/**
 * Decides whether a process used too much CPU over the last check window and, if so,
 * reports it to battery stats and to {@code FrameworkStatsLog}.
 *
 * @param uptimeSince length of the check window; nothing is evaluated unless it is positive
 * @param doCpuKills  when false the method always returns false
 * @param cputimeUsed CPU time consumed in the window
 *                    (presumably the same unit as {@code uptimeSince} -- confirm)
 * @param processName process name written to the stats log
 * @param description text used only in the DEBUG_POWER log line
 * @param cpuLimit    threshold compared against the integer percentage
 *                    {@code cputimeUsed * 100 / uptimeSince}
 * @param app         process record supplying uid, process name and package list
 * @return true when kills are enabled, the window is positive and the percentage is at or
 *         above {@code cpuLimit}; false otherwise
 */
private boolean checkExcessivePowerUsageLPr(final long uptimeSince, boolean doCpuKills,
        final long cputimeUsed, final String processName, final String description,
        final int cpuLimit, final ProcessRecord app) {
    if (DEBUG_POWER && (uptimeSince > 0)) {
        StringBuilder sb = new StringBuilder(128);
        sb.append("CPU for ");
        sb.append(description);
        sb.append(": over ");
        TimeUtils.formatDuration(uptimeSince, sb);
        sb.append(" used ");
        TimeUtils.formatDuration(cputimeUsed, sb);
        sb.append(" (");
        // Floating-point percentage, used for the log line only.
        sb.append((cputimeUsed * 100.0) / uptimeSince);
        sb.append("%)");
        Slog.i(TAG_POWER, sb.toString());
    }
    // If the process has used too much CPU over the last duration, the
    // user probably doesn't want this, so kill!
    if (doCpuKills && uptimeSince > 0) {
        // Both operands are long, so the percentage is truncated to a whole number before
        // the comparison (e.g. 2.67% is compared as 2).
        if (((cputimeUsed * 100) / uptimeSince) >= cpuLimit) {
            mBatteryStatsService.reportExcessiveCpu(app.info.uid, app.processName,
                    uptimeSince, cputimeUsed);
            // One stats event is written per entry in the process's package list.
            app.getPkgList().forEachPackageProcessStats(holder -> {
                final ProcessState state = holder.state;
                FrameworkStatsLog.write(
                        FrameworkStatsLog.EXCESSIVE_CPU_USAGE_REPORTED,
                        app.info.uid,
                        processName,
                        state != null ? state.getPackage() : app.info.packageName,
                        holder.appVersion);
            });
            return true;
        }
    }
    return false;
}

有几个关键的字段

((cputimeUsed * 100) / uptimeSince) >= cpuLimit 这个条件成立后会触发杀进程。

cputimeUsed: cpu使用时长

uptimeSince:距离上次的检查时间。

// Excerpt from checkExcessivePowerUsage(): uptimeSince is the uptime in milliseconds between
// the previous power check and this one; the stored timestamp is then advanced to "now".
final long curUptime = SystemClock.uptimeMillis();
final long uptimeSince = curUptime - mLastPowerCheckUptime;
mLastPowerCheckUptime = curUptime;

cpuLimit

  1. 会检测 app 被设置 mWhenUnimportant 的时间,这里的 important 和之前文章讲的进程优先级相关。当进程状态从"重要"(ProcState < PROCESS_STATE_SERVICE)变为"不重要"(ProcState >= PROCESS_STATE_SERVICE)时,mWhenUnimportant 会被设置为当时的时间。
  2. 当前时间减去mWhenUnimportant 得到checkDur 然后按照checkDur 计算cpuLimit

查看代码,计算规则是这样的:POWER_CHECK_INTERVAL 默认是 5 分钟。checkDur 不超过 5 分钟时 cpuLimit 为 25;不超过 10 分钟时 cpuLimit 为 25(进程状态恰好为 PROCESS_STATE_HOME 时,无论 checkDur 多大也是 25);不超过 15 分钟时 cpuLimit 为 10;否则就是 2。

总结:cputimeUsed 越小越不容易触发,uptimeSince 越大越不容易触发,cpuLimit 越大越不容易触发。也就是说,CPU 使用时长要小,距离上次的检查时间要长,进程处于不重要状态的持续时间要短。

// Presumably the default values behind mConstants.POWER_CHECK_MAX_CPU_1..4 (inferred from the
// matching names -- confirm in ActivityManagerConstants). Each limit is compared against the
// integer percentage cputimeUsed * 100 / uptimeSince.
private static final int DEFAULT_POWER_CHECK_MAX_CPU_1 = 25;
private static final int DEFAULT_POWER_CHECK_MAX_CPU_2 = 25;
private static final int DEFAULT_POWER_CHECK_MAX_CPU_3 = 10;
private static final int DEFAULT_POWER_CHECK_MAX_CPU_4 = 2;

// Excerpt from checkExcessivePowerUsage(): selects the CPU limit from how long ago the
// process's "when unimportant" timestamp was set, in steps of POWER_CHECK_INTERVAL.
int cpuLimit;
long checkDur = curUptime - app.mState.getWhenUnimportant();
if (checkDur <= mConstants.POWER_CHECK_INTERVAL) {
    cpuLimit = mConstants.POWER_CHECK_MAX_CPU_1;
} else if (checkDur <= (mConstants.POWER_CHECK_INTERVAL * 2)
        || app.mState.getSetProcState() <= ActivityManager.PROCESS_STATE_HOME) {
    // Also taken for any process whose set proc state is HOME or numerically lower,
    // regardless of checkDur.
    cpuLimit = mConstants.POWER_CHECK_MAX_CPU_2;
} else if (checkDur <= (mConstants.POWER_CHECK_INTERVAL * 3)) {
    cpuLimit = mConstants.POWER_CHECK_MAX_CPU_3;
} else {
    // No upper bound: applies to every checkDur beyond three intervals.
    cpuLimit = mConstants.POWER_CHECK_MAX_CPU_4;
}

cputimeUsed 是如何计算的

// Excerpt from updateAppProcessCpuTimeLPr(): the CPU time used in the window is the current
// reading minus the reading stored at the previous check.
final long curCpuTime = profile.mCurCpuTime.get();
final long lastCpuTime = profile.mLastCpuTime.get();
final long cpuTimeUsed = curCpuTime - lastCpuTime;

在 ProcessProfileRecord 类中,mCurCpuTime 和 mLastCpuTime 的差值可以用来计算进程在两次记录之间消耗的 CPU 时间,这个时间差值可以用于计算进程的 CPU 利用率等性能指标。另外,通过比较 mCurCpuTime 和 mLastCpuTime 的值,可以检测进程是否正在占用 CPU 资源。

mCurCpuTime 和mLastCpuTime 这两个值是如何计算的呢。

checkExcessivePowerUsage调用后 首先会调用updateCpuStatsNow 方法其中会用ProcessCpuTracker类去更新cpu使用信息。ProcessCpuTracker在anr分析文章中已经是老常客了

简单来说就是读取 /proc/[pid]/stat 文件([pid] 为进程号),解析该文件中的各个字段,来更新进程的 CPU 时间信息,包括 mCurCpuTime 和 mLastCpuTime 等成员变量。

**再回过头看一下 excessive cpu 8020 during 300091 dur=1235468 limit=2 的意思。**

**距离上次 CPU 检测 300091 ms;进程被设置为不重要已经过去 1235468 ms(约 20.6 分钟),超过了 POWER_CHECK_INTERVAL * 3(默认 15 分钟),所以 cpuLimit 为 2。**

**这段时间内 CPU 消耗了 8020 ms。**

**(8020*100)/300091 按整数除法结果为 2,2>=2 成立,触发了杀进程。**

如何避免

  1. 避免后台过多使用cpu,按照aosp 13逻辑 当进程为非重要进程时 。

    1. 不重要持续时间在 10 分钟内:要满足 (a*100)/300000<25,也就是 5 分钟内 CPU 使用少于 75s
    2. 不重要持续时间在 10~15 分钟:要满足 (a*100)/300000<10,也就是 5 分钟内 CPU 使用少于 30s
    3. 不重要持续时间超过 15 分钟(没有上限):要满足 (a*100)/300000<2,也就是 5 分钟内 CPU 使用少于 6s
  2. 提高进程优先级 至于如何提升进程优先级又是另外一个话题了。

    1. 理论上 app.mState.getSetProcState() < ActivityManager.PROCESS_STATE_HOME 就不会去进行杀进程。
    2. enum ProcessStateEnum {
          /** @hide Not a real process state. */
          UNKNOWN = -1,
      
          /** @hide Process is a persistent system process. */
          PERSISTENT = 0,
      
          /** @hide Process is a persistent system process and is doing UI. */
          PERSISTENT_UI = 1,
      
          /** @hide Process is hosting the current top activities.  Note that this covers
           * all activities that are visible to the user. */
          TOP = 2,
      
          /** @hide Process is bound to a TOP app. */
          BOUND_TOP = 3,
      
          /** @hide Process is hosting a foreground service. */
          FOREGROUND_SERVICE = 4,
      
          /** @hide Process is hosting a foreground service due to a system binding. */
          BOUND_FOREGROUND_SERVICE = 5,
      
          /** @hide Process is important to the user, and something they are aware of. */
          IMPORTANT_FOREGROUND = 6,
      
          /** @hide Process is important to the user, but not something they are aware of. */
          IMPORTANT_BACKGROUND = 7,
      
          /** @hide Process is in the background transient so we will try to keep running. */
          TRANSIENT_BACKGROUND = 8,
      
          /** @hide Process is in the background running a backup/restore operation. */
          BACKUP = 9,
      
          /** @hide Process is in the background running a service.  Unlike oom_adj, this level
           * is used for both the normal running in background state and the executing
           * operations state. */
          SERVICE = 10,
      
          /** @hide Process is in the background running a receiver.   Note that from the
           * perspective of oom_adj, receivers run at a higher foreground level, but for our
           * prioritization here that is not necessary and putting them below services means
           * many fewer changes in some process states as they receive broadcasts. */
          RECEIVER = 11,
      
          /** @hide Same as {@link #PROCESS_STATE_TOP} but while device is sleeping. */
          TOP_SLEEPING = 12,
      
          /** @hide Process is in the background, but it can't restore its state so we want
           * to try to avoid killing it. */
          HEAVY_WEIGHT = 13,
      
          /** @hide Process is in the background but hosts the home activity. */
          HOME = 14,
      
          /** @hide Process is in the background but hosts the last shown activity. */
          LAST_ACTIVITY = 15,