微信Android客户端的ANR监控方案

1,917 阅读12分钟

mp.weixin.qq.com/s?__biz=MzA…

微信公众号,WeMobileDev 2021年7月19日发布的 微信Android客户端的ANR监控方案

该方案的所有代码已经在Matrix(github.com/Tencent/mat…

1.SignalAnrTracer onAlive方法里调用nativeInitSignalAnrDetective方法监听SIGQUIT信号

/**
 * Tracer that reacts to SIGQUIT notifications coming from the native
 * "trace-canary" library and decides whether they correspond to a real ANR.
 *
 * <p>Flow, as implemented below: {@link #onAlive()} installs the native SIGQUIT
 * listener; native code then calls back into {@link #onANRDumped()}, which
 * either reports immediately (main-thread head message is overdue) or polls
 * the process error state for up to {@link #ANR_DUMP_MAX_TIME} ms looking for
 * the NOT_RESPONDING flag.
 */
public class SignalAnrTracer extends Tracer {
    //region fields
    private static final String TAG = "SignalAnrTracer";
    // Name of the thread that polls for the NOT_RESPONDING flag after a SIGQUIT.
    private static final String CHECK_ANR_STATE_THREAD_NAME = "Check-ANR-State-Thread";
    // Interval between two NOT_RESPONDING checks, in milliseconds.
    private static final int CHECK_ERROR_STATE_INTERVAL = 500;
    // Total polling window in milliseconds (the article states 20s is the ANR dump timeout).
    private static final int ANR_DUMP_MAX_TIME = 20000;
    // Number of polling iterations that fit into the window.
    private static final int CHECK_ERROR_STATE_COUNT =
            ANR_DUMP_MAX_TIME / CHECK_ERROR_STATE_INTERVAL;
    // Foreground: the head message counts as blocked once it is more than 2s overdue.
    private static final long FOREGROUND_MSG_THRESHOLD = -2000;
    // Background: the head message counts as blocked once it is more than 10s overdue.
    private static final long BACKGROUND_MSG_THRESHOLD = -10000;
    // Set to true by every constructor; checked by printTrace().
    public static boolean hasInstance = false;
    // Foreground state sampled when the last SIGQUIT callback arrived.
    private static boolean currentForeground = false;
    // Output path for the trace captured on a (suspected) ANR. Never null.
    private static String sAnrTraceFilePath = "";
    // Output path for a trace requested through printTrace(). Never null.
    // The same hook can be used at any time by sending SIGQUIT to ourselves,
    // which is useful for investigating deadlocks, thread anomalies, etc.
    private static String sPrintTraceFilePath = "";
    // Optional listener; when set it receives the detection instead of the plugin.
    private static SignalAnrDetectedListener sSignalAnrDetectedListener;
    // Application used to obtain the ActivityManager; falls back to Matrix's application.
    private static Application sApplication;
    // True once the native detector has been installed by onAlive().
    private static boolean hasInit = false;
    // (when - uptimeMillis) of the head message at detection time; negative when overdue.
    private static long anrMessageWhen = 0L;
    // toString() of the main-thread head message at detection time.
    private static String anrMessageString = "";
    //endregion

    static {
        // Load the native library that implements the SIGQUIT handling.
        System.loadLibrary("trace-canary");
    }

    //region constructors
    public SignalAnrTracer(TraceConfig traceConfig) {
        hasInstance = true;
        sAnrTraceFilePath = emptyIfNull(traceConfig.anrTraceFilePath);
        sPrintTraceFilePath = emptyIfNull(traceConfig.printTraceFilePath);
    }

    public SignalAnrTracer(Application application) {
        hasInstance = true;
        sApplication = application;
    }

    public SignalAnrTracer(Application application, String anrTraceFilePath, String printTraceFilePath) {
        hasInstance = true;
        sAnrTraceFilePath = emptyIfNull(anrTraceFilePath);
        sPrintTraceFilePath = emptyIfNull(printTraceFilePath);
        sApplication = application;
    }
    //endregion

    /**
     * Normalizes a possibly-null path to the empty string. The path fields are
     * later dereferenced in {@link #printTrace()} and handed to native code as
     * jstrings, so null must never be stored in them.
     */
    private static String emptyIfNull(String path) {
        return path == null ? "" : path;
    }

    //region callbacks invoked from native code

    /**
     * Called from native code (handleSignal in AnrDumper.cc) when a SIGQUIT
     * sent by another process has been received.
     */
    @RequiresApi(api = Build.VERSION_CODES.M)
    @Keep
    private static void onANRDumped() {
        currentForeground = AppForegroundUtil.isInterestingToUser();
        // True when the main thread's head message is overdue beyond the threshold.
        boolean needReport = isMainThreadBlocked();

        if (needReport) {
            report(false);
        } else {
            // Otherwise poll on a separate thread for the NOT_RESPONDING flag;
            // as soon as the flag shows up the ANR is considered confirmed.
            new Thread(new Runnable() {
                @Override
                public void run() {
                    checkErrorStateCycle();
                }
            }, CHECK_ANR_STATE_THREAD_NAME).start();
        }
    }

    /** Called from native code after the ANR trace file has been written; prints it to the log. */
    @Keep
    private static void onANRDumpTrace() {
        try {
            MatrixUtil.printFileByLine(TAG, sAnrTraceFilePath);
        } catch (Throwable t) {
            MatrixLog.e(TAG, "onANRDumpTrace error: %s", t.getMessage());
        }
    }

    /** Called from native code after a self-requested trace file has been written; prints it to the log. */
    @Keep
    private static void onPrintTrace() {
        try {
            MatrixUtil.printFileByLine(TAG, sPrintTraceFilePath);
        } catch (Throwable t) {
            MatrixLog.e(TAG, "onPrintTrace error: %s", t.getMessage());
        }
    }
    //endregion

    /**
     * Delivers a detected ANR either to the registered listener or, when no
     * listener is set, to the TracePlugin as an Issue.
     *
     * @param fromProcessErrorState false when detection came from the blocked
     *                              main-thread message check, true when it came
     *                              from the NOT_RESPONDING polling
     */
    private static void report(boolean fromProcessErrorState) {
        try {
            String stackTrace = Utils.getMainThreadJavaStackTrace();
            if (sSignalAnrDetectedListener != null) {
                sSignalAnrDetectedListener.onAnrDetected(stackTrace, anrMessageString, anrMessageWhen, fromProcessErrorState);
                return;
            }

            TracePlugin plugin = Matrix.with().getPluginByClass(TracePlugin.class);
            if (null == plugin) {
                return;
            }

            String scene = AppMethodBeat.getVisibleScene();

            JSONObject jsonObject = new JSONObject();
            jsonObject = DeviceUtil.getDeviceInfo(jsonObject, Matrix.with().getApplication());
            jsonObject.put(SharePluginInfo.ISSUE_STACK_TYPE, Constants.Type.SIGNAL_ANR);
            jsonObject.put(SharePluginInfo.ISSUE_SCENE, scene);
            jsonObject.put(SharePluginInfo.ISSUE_THREAD_STACK, stackTrace);
            jsonObject.put(SharePluginInfo.ISSUE_PROCESS_FOREGROUND, currentForeground);

            Issue issue = new Issue();
            issue.setTag(SharePluginInfo.TAG_PLUGIN_EVIL_METHOD);
            issue.setContent(jsonObject);
            plugin.onDetectIssue(issue);
            MatrixLog.e(TAG, "happens real ANR : %s ", jsonObject.toString());

        } catch (JSONException e) {
            MatrixLog.e(TAG, "[JSONException error: %s", e);
        }
    }

    /**
     * Reads the head of the main looper's MessageQueue (the private
     * {@code mMessages} field, via reflection) and compares how overdue it is
     * against the foreground/background threshold.
     *
     * @return true when the head message is overdue beyond the threshold;
     *         false when there is no message, its {@code when} is 0, or
     *         reflection fails
     */
    @RequiresApi(api = Build.VERSION_CODES.M)
    private static boolean isMainThreadBlocked() {
        try {
            MessageQueue mainQueue = Looper.getMainLooper().getQueue();
            Field field = mainQueue.getClass().getDeclaredField("mMessages");
            field.setAccessible(true);
            final Message mMessage = (Message) field.get(mainQueue);
            if (mMessage != null) {
                anrMessageString = mMessage.toString();
                long when = mMessage.getWhen();
                if (when == 0) {
                    return false;
                }
                // Negative value == the message should already have been handled.
                long time = when - SystemClock.uptimeMillis();
                anrMessageWhen = time;
                long timeThreshold = BACKGROUND_MSG_THRESHOLD;
                if (currentForeground) {
                    timeThreshold = FOREGROUND_MSG_THRESHOLD;
                }
                return time < timeThreshold;
            }
        } catch (Exception e) {
            return false;
        }
        return false;
    }

    /**
     * Polls {@link #checkErrorState()} every {@link #CHECK_ERROR_STATE_INTERVAL}
     * ms, at most {@link #CHECK_ERROR_STATE_COUNT} times, and reports on the
     * first positive result. Any throwable ends the loop.
     */
    private static void checkErrorStateCycle() {
        int checkErrorStateCount = 0;
        while (checkErrorStateCount < CHECK_ERROR_STATE_COUNT) {
            try {
                checkErrorStateCount++;
                boolean myAnr = checkErrorState();
                if (myAnr) {
                    report(true);
                    break;
                }

                Thread.sleep(CHECK_ERROR_STATE_INTERVAL);
            } catch (Throwable t) {
                MatrixLog.e(TAG, "checkErrorStateCycle error, e : " + t.getMessage());
                break;
            }
        }
    }

    /**
     * Asks the ActivityManager for processes in an error state.
     *
     * @return true only when an entry for this very process (same pid) has
     *         condition NOT_RESPONDING
     */
    private static boolean checkErrorState() {
        try {
            Application application =
                    sApplication == null ? Matrix.with().getApplication() : sApplication;
            ActivityManager am = (ActivityManager) application
                    .getSystemService(Context.ACTIVITY_SERVICE);
            List<ActivityManager.ProcessErrorStateInfo> procs = am.getProcessesInErrorState();
            if (procs == null) return false;

            for (ActivityManager.ProcessErrorStateInfo proc : procs) {
                MatrixLog.i(TAG, "[checkErrorState] found Error State proccessName = %s, proc.condition = %d", proc.processName, proc.condition);

                // Diagnostic only: a different uid is not responding.
                if (proc.uid != android.os.Process.myUid()
                        && proc.condition == ActivityManager.ProcessErrorStateInfo.NOT_RESPONDING) {
                    MatrixLog.i(TAG, "maybe received other apps ANR signal");
                }

                if (proc.pid != android.os.Process.myPid()) continue;

                if (proc.condition != ActivityManager.ProcessErrorStateInfo.NOT_RESPONDING) {
                    continue;
                }
                return true;
            }
            return false;
        } catch (Throwable t) {
            MatrixLog.e(TAG, "[checkErrorState] error : %s", t.getMessage());
        }
        return false;
    }

    /**
     * Requests a trace dump of the current process through native code.
     * Does nothing (besides logging) when no tracer instance exists or no
     * print-trace path was configured.
     */
    public static void printTrace() {
        if (!hasInstance) {
            MatrixLog.e(TAG, "SignalAnrTracer has not been initialize");
            return;
        }
        // Null-safe comparison: the path fields are normalized, but stay defensive.
        if ("".equals(emptyIfNull(sPrintTraceFilePath))) {
            MatrixLog.e(TAG, "PrintTraceFilePath has not been set");
            return;
        }
        nativePrintTrace();
    }

    private static native void nativeInitSignalAnrDetective(String anrPrintTraceFilePath, String printTraceFilePath);

    private static native void nativeFreeSignalAnrDetective();

    private static native void nativePrintTrace();

    @Override
    protected void onAlive() {
        super.onAlive();
        if (!hasInit) {
            // Install the native SIGQUIT listener.
            nativeInitSignalAnrDetective(sAnrTraceFilePath, sPrintTraceFilePath);
            // Needed later to decide between foreground and background thresholds.
            AppForegroundUtil.INSTANCE.init();
            hasInit = true;
        }
    }

    @Override
    protected void onDead() {
        super.onDead();
        // Tear down the native detector.
        // NOTE(review): hasInit is not reset here, so a later onAlive() will not
        // re-install the detector — confirm whether that is intended.
        nativeFreeSignalAnrDetective();
    }

    public void setSignalAnrDetectedListener(SignalAnrDetectedListener listener) {
        sSignalAnrDetectedListener = listener;
    }

    /** Receiver for detected ANRs; see {@link #report(boolean)} for the argument meaning. */
    public interface SignalAnrDetectedListener {
        void onAnrDetected(String stackTrace, String mMessageString, long mMessageWhen, boolean fromProcessErrorState);
    }
}

2.MatrixTracer.cc

2.1 JNI_OnLoad初始化,双向绑定函数

2.2 nativeInitSignalAnrDetective,开启检测,真正检测的地方在AnrDumper.cc

2.3 AnrDumper.cc 里handleSignal里调用MatrixTracer anrDumpCallback ,表示anr可能发生了,通知SignalAnrTracer检测ui线程是否block或者状态为NOT_RESPONDING。并调用hookAnrTraceWrite方法,开启hook,为了找到write trace的点

2.4 my_connect(API 27及以上,连接tombstoned的socket)和my_open(低版本,打开/data/anr/traces.txt)用来标记Signal Catcher线程即将开始写trace,主要是为了定位紧随其后的write调用

2.5 my_write是我们的write方法

#define PROP_VALUE_MAX  92                      // buffer size used by getApiLevel()
#define PROP_SDK_NAME "ro.build.version.sdk"    // system property read by getApiLevel()
#define HOOK_CONNECT_PATH "/dev/socket/tombstoned_java_trace"   // socket address matched in my_connect()
#define HOOK_OPEN_PATH "/data/anr/traces.txt"                   // file path matched in my_open() (a regular file, not a socket)
 
using namespace MatrixTracer;
 
static std::optional<AnrDumper> sAnrDumper; // the AnrDumper (a custom SignalHandler); created/destroyed by the native init/free methods
static bool isTraceWrite = false;           // set to true by my_connect/my_open, reset to false by my_write
static bool fromMyPrintTrace = false;       // true while a dump requested via nativePrintTrace() is pending
static bool isHooking = false;              // true between hookAnrTraceWrite() and unHookAnrTraceWrite()
static std::string anrTracePathstring;      // destination file for traces triggered by an external SIGQUIT
static std::string printTracePathstring;    // destination file for traces we requested ourselves
static int signalCatcherTid;                // tid of the thread that called connect/open on the trace target
// NOTE(review): these flags are plain (non-atomic) variables although the hooks
// may run on different threads — confirm whether synchronization is needed.
 
// Cached JNI handles (classes and static method IDs) of the Java side.
// All members are filled in by JNI_OnLoad.
static struct StacktraceJNI {
    jclass AnrDetective;                    // global ref to SignalAnrTracer
    jclass ThreadPriorityDetective;         // global ref to ThreadPriorityTracer
    jmethodID AnrDetector_onANRDumped;      // SignalAnrTracer.onANRDumped()
    jmethodID AnrDetector_onANRDumpTrace;   // SignalAnrTracer.onANRDumpTrace()
    jmethodID AnrDetector_onPrintTrace;     // SignalAnrTracer.onPrintTrace()
 
    jmethodID ThreadPriorityDetective_onMainThreadPriorityModified;    // signature (I)V
    jmethodID ThreadPriorityDetective_onMainThreadTimerSlackModified;  // signature (J)V
} gJ;
 
//region MainThreadPriorityModified相关的东西
// Address of the real setpriority(); filled in by xhook when the hook is registered.
int (*original_setpriority)(int __which, id_t __who, int __priority);

/**
 * Replacement for setpriority(). Always forwards to the original function;
 * additionally notifies ThreadPriorityTracer.onMainThreadPriorityModified when
 * a positive priority value is requested and either
 *   - __who is 0 and the caller is the main thread (getpid() == gettid()), or
 *   - __who equals the process id.
 * Non-positive values are forwarded without notification.
 *
 * The notification is skipped when no JNIEnv is available, matching the
 * null check done by the other JNI callbacks in this file.
 */
int my_setpriority(int __which, id_t __who, int __priority) {
    if (__priority > 0) {
        const bool callerIsMainThread = (__who == 0 && getpid() == gettid());
        if (callerIsMainThread || __who == getpid()) {
            JNIEnv *env = JniInvocation::getEnv();
            if (env != nullptr) {
                env->CallStaticVoidMethod(gJ.ThreadPriorityDetective,
                                          gJ.ThreadPriorityDetective_onMainThreadPriorityModified,
                                          __priority);
            }
        }
    }

    return original_setpriority(__which, __who, __priority);
}
 
int (*original_prctl)(int option, unsigned long arg2, unsigned long arg3,
                      unsigned long arg4, unsigned long arg5);
 
int my_prctl(int option, unsigned long arg2, unsigned long arg3,
             unsigned long arg4, unsigned long arg5) {
 
    if (option == PR_SET_TIMERSLACK) {
        if (gettid() == getpid() && arg2 > 50000) {
            JNIEnv *env = JniInvocation::getEnv();
            env->CallStaticVoidMethod(gJ.ThreadPriorityDetective,
                                      gJ.ThreadPriorityDetective_onMainThreadTimerSlackModified,
                                      arg2);
 
        }
    }
 
    return original_prctl(option, arg2, arg3, arg4, arg5);
}
//endregion
 
/**
 * Removes the trace hooks and writes the captured trace to a file.
 *
 * The hooks are removed first (unHookAnrTraceWrite()), then the file is
 * opened with std::ofstream's default mode and the content is streamed into
 * it. The stream is closed when it goes out of scope. Open or write failures
 * are not reported.
 *
 * @param content  trace text to store
 * @param filePath destination file
 */
void writeAnr(const std::string &content, const std::string &filePath) {
    unHookAnrTraceWrite();
    std::ofstream outfile(filePath);
    outfile << content;
}
 
//region my_connect  original_connect
// Address of the real connect(); filled in when the hook is installed.
int (*original_connect)(int __fd, const struct sockaddr *__addr, socklen_t __addr_length);

/**
 * Replacement for connect(). When the target is the UNIX-domain socket
 * HOOK_CONNECT_PATH, remembers the calling thread's tid and arms
 * isTraceWrite so that the next write() from that thread is treated as the
 * trace output. Always forwards to the original connect().
 */
int my_connect(int __fd, const struct sockaddr *__addr, socklen_t __addr_length) {
    // Only a UNIX-domain address carries a path in the bytes following
    // sa_family; for any other family those bytes are not a string and must
    // not be handed to strcmp().
    if (__addr != nullptr && __addr->sa_family == AF_UNIX
        && strcmp(__addr->sa_data, HOOK_CONNECT_PATH) == 0) {
        signalCatcherTid = gettid();
        isTraceWrite = true;
    }
    return original_connect(__fd, __addr, __addr_length);
}
//endregion
 
//region my_open original_open
// Address of the real open(); filled in when the hook is installed.
int (*original_open)(const char *pathname, int flags, mode_t mode);

/**
 * Replacement for open(). When the path being opened is HOOK_OPEN_PATH,
 * remembers the calling thread's tid and arms isTraceWrite so that the next
 * write() from that thread is treated as the trace output. Always forwards
 * to the original open().
 */
int my_open(const char *pathname, int flags, mode_t mode) {
    const bool opensTraceFile =
            pathname != nullptr && strcmp(pathname, HOOK_OPEN_PATH) == 0;
    if (opensTraceFile) {
        signalCatcherTid = gettid();
        isTraceWrite = true;
    }
    return original_open(pathname, flags, mode);
}
//endregion
 
//region original_write my_write
// Address of the real write(); filled in when the hook is installed.
ssize_t (*original_write)(int fd, const void *const __pass_object_size0 buf, size_t count);

/**
 * Replacement for write(). The first write() issued by the thread recorded in
 * signalCatcherTid after my_connect/my_open armed isTraceWrite is treated as
 * the trace output: its bytes are copied to the configured file and the
 * matching Java callback is invoked. Every call is forwarded to the original
 * write() unchanged.
 */
ssize_t my_write(int fd, const void *const buf, size_t count) {
    if (isTraceWrite && gettid() == signalCatcherTid) {
        // Disarm first so that only one write is captured.
        isTraceWrite = false;
        signalCatcherTid = 0;
        if (buf != nullptr) {
            std::string targetFilePath;
            if (fromMyPrintTrace) {
                targetFilePath = printTracePathstring;
            } else {
                targetFilePath = anrTracePathstring;
            }
            if (!targetFilePath.empty()) {
                // write() buffers carry an explicit length and are not
                // guaranteed to be NUL-terminated, so copy exactly `count`
                // bytes instead of scanning for a terminator.
                const std::string content(static_cast<const char *>(buf), count);
                writeAnr(content, targetFilePath);
                if (!fromMyPrintTrace) {
                    anrDumpTraceCallback();
                } else {
                    printTraceCallback();
                }
                fromMyPrintTrace = false;
            }
        }
    }
    return original_write(fd, buf, count);
}
//endregion
 
//调用java的onANRDumped,AnrDumper.cc 里handleSignal里调用anrCallback然后调用这个anrDumpCallback回调
bool anrDumpCallback() {
    JNIEnv *env = JniInvocation::getEnv();
    if (!env) return false;
    env->CallStaticVoidMethod(gJ.AnrDetective, gJ.AnrDetector_onANRDumped);
    return true;
}
 
//调用java的onANRDumpTrace,my_write里调用
bool anrDumpTraceCallback() {
    JNIEnv *env = JniInvocation::getEnv();
    if (!env) return false;
    env->CallStaticVoidMethod(gJ.AnrDetective, gJ.AnrDetector_onANRDumpTrace);
    return true;
}
 
//调用java的onPrintTrace,my_write里调用
bool printTraceCallback() {
    JNIEnv *env = JniInvocation::getEnv();
    if (!env) return false;
    env->CallStaticVoidMethod(gJ.AnrDetective, gJ.AnrDetector_onPrintTrace);
    return true;
}
 
//ok
int getApiLevel() {
    char buf[PROP_VALUE_MAX];
    int len = __system_property_get(PROP_SDK_NAME, buf);
    if (len <= 0)
        return 0;
 
    return atoi(buf);
}
 
/**
 * @param isSiUser true为自己的进程
 * AnrDumper.cc 里handleSignal里调用anrCallback方法,或者调用siUserCallback,然后调用这个hookAnrTraceWrite回调
 */
void hookAnrTraceWrite(bool isSiUser) {
    int apiLevel = getApiLevel();
    if (apiLevel < 19) {
        return;
    }
 
    //isSiUser为true,表示自己进程发的时候是通过kill发的,此处不符合逻辑,返回
    if (!fromMyPrintTrace && isSiUser) {
        return;
    }
 
    if (isHooking) {
        return;
    }
 
    isHooking = true;
 
    if (apiLevel >= 27) {
        void *libcutils_info = xhook_elf_open("/system/lib64/libcutils.so");
        if (!libcutils_info) {
            libcutils_info = xhook_elf_open("/system/lib/libcutils.so");
        }
        xhook_hook_symbol(libcutils_info, "connect", (void *) my_connect,
                          (void **) (&original_connect));
    } else {
        void *libart_info = xhook_elf_open("libart.so");
        xhook_hook_symbol(libart_info, "open", (void *) my_open, (void **) (&original_open));
    }
 
    if (apiLevel >= 30 || apiLevel == 25 || apiLevel == 24) {
        void *libc_info = xhook_elf_open("libc.so");
        xhook_hook_symbol(libc_info, "write", (void *) my_write, (void **) (&original_write));
    } else if (apiLevel == 29) {
        void *libbase_info = xhook_elf_open("/system/lib64/libbase.so");
        if (!libbase_info) {
            libbase_info = xhook_elf_open("/system/lib/libbase.so");
        }
        xhook_hook_symbol(libbase_info, "write", (void *) my_write, (void **) (&original_write));
        xhook_elf_close(libbase_info);
    } else {
        void *libart_info = xhook_elf_open("libart.so");
        xhook_hook_symbol(libart_info, "write", (void *) my_write, (void **) (&original_write));
    }
}
 
//unhook
void unHookAnrTraceWrite() {
    int apiLevel = getApiLevel();
    if (apiLevel >= 27) {
        void *libcutils_info = xhook_elf_open("/system/lib64/libcutils.so");
        xhook_hook_symbol(libcutils_info, "connect", (void *) original_connect, nullptr);
    } else {
        void *libart_info = xhook_elf_open("libart.so");
        xhook_hook_symbol(libart_info, "open", (void *) original_connect, nullptr);
    }
 
    if (apiLevel >= 30 || apiLevel == 25 || apiLevel == 24) {
        void *libc_info = xhook_elf_open("libc.so");
        xhook_hook_symbol(libc_info, "write", (void *) original_write, nullptr);
    } else if (apiLevel == 29) {
        void *libbase_info = xhook_elf_open("/system/lib64/libbase.so");
        xhook_hook_symbol(libbase_info, "write", (void *) original_write, nullptr);
    } else {
        void *libart_info = xhook_elf_open("libart.so");
        xhook_hook_symbol(libart_info, "write", (void *) original_write, nullptr);
    }
    isHooking = false;
}
 
//初始化,开启检测Signalanr检测,真正检测的地方在AnrDumper.cc
/**
 * JNI entry point: stores both trace paths and creates the AnrDumper, which
 * performs the actual SIGQUIT handling (see AnrDumper.cc).
 *
 * The paths are copied into the file-static std::string objects and the JNI
 * character buffers are released again. AnrDumper receives pointers into
 * those static strings, which stay valid until the next call of this
 * function.
 *
 * @param anrTracePath   file that receives the trace on a suspected ANR
 * @param printTracePath file that receives a trace requested via nativePrintTrace()
 */
static void
nativeInitSignalAnrDetective(JNIEnv *env, jclass, jstring anrTracePath, jstring printTracePath) {
    // Destroy any previous dumper before its path pointers are invalidated
    // by the string assignments below.
    sAnrDumper.reset();

    const char *anrTracePathChar =
            anrTracePath != nullptr ? env->GetStringUTFChars(anrTracePath, nullptr) : nullptr;
    const char *printTracePathChar =
            printTracePath != nullptr ? env->GetStringUTFChars(printTracePath, nullptr) : nullptr;

    // A null jstring or a failed GetStringUTFChars is treated as "no path".
    anrTracePathstring = anrTracePathChar != nullptr ? anrTracePathChar : "";
    printTracePathstring = printTracePathChar != nullptr ? printTracePathChar : "";

    if (anrTracePathChar != nullptr) {
        env->ReleaseStringUTFChars(anrTracePath, anrTracePathChar);
    }
    if (printTracePathChar != nullptr) {
        env->ReleaseStringUTFChars(printTracePath, printTracePathChar);
    }

    sAnrDumper.emplace(anrTracePathstring.c_str(), printTracePathstring.c_str(),
                       anrDumpCallback);
}
 
//Free Signal Anr Detective 重置,释放
// JNI entry point: destroys the AnrDumper held in sAnrDumper, if any.
static void nativeFreeSignalAnrDetective(JNIEnv *env, jclass) {
    sAnrDumper = std::nullopt;
}
 
//region MainThreadPriority相关 ,先不看
// JNI entry point: registers the setpriority()/prctl() hooks for every
// library matching ".*\.so$" and then asks xhook to apply them.
static void nativeInitMainThreadPriorityDetective(JNIEnv *env, jclass) {
    static const char *const kEveryLibrary = ".*\\.so$";

    xhook_register(kEveryLibrary, "setpriority",
                   (void *) my_setpriority, (void **) (&original_setpriority));
    xhook_register(kEveryLibrary, "prctl",
                   (void *) my_prctl, (void **) (&original_prctl));

    xhook_refresh(true);
}
//endregion
 
//自己打印trace,发送自己的进程发送SIGQUIT
static void nativePrintTrace() {
    fromMyPrintTrace = true;
    kill(getpid(), SIGQUIT);
}
 
// Compile-time element count of a built-in array, deduced from its type.
template<typename Elem, std::size_t Count>
static inline constexpr std::size_t NELEM(const Elem (&)[Count]) {
    return Count;
}
 
//JNINativeMethod 数组 anr相关的
// Native methods registered on SignalAnrTracer in JNI_OnLoad:
// Java method name, JNI signature, native implementation.
static const JNINativeMethod ANR_METHODS[] = {
        {"nativeInitSignalAnrDetective", "(Ljava/lang/String;Ljava/lang/String;)V", (void *) nativeInitSignalAnrDetective},
        {"nativeFreeSignalAnrDetective", "()V",                                     (void *) nativeFreeSignalAnrDetective},
        {"nativePrintTrace",             "()V",                                     (void *) nativePrintTrace},
};
 
//MainThreadPriority相关的,先不看
// Native methods registered on ThreadPriorityTracer in JNI_OnLoad.
static const JNINativeMethod THREAD_PRIORITY_METHODS[] = {
        {"nativeInitMainThreadPriorityDetective", "()V", (void *) nativeInitMainThreadPriorityDetective},
};
 
//JNI_OnLoad 初始化jni环境
/**
 * Library entry point. Initializes JniInvocation, caches the Java classes and
 * static method IDs in gJ, and registers the native methods of
 * SignalAnrTracer and ThreadPriorityTracer.
 *
 * @return JNI_VERSION_1_6 on success, -1 when the JNIEnv, a class, a method
 *         ID or a registration cannot be obtained. Each lookup is checked
 *         right away because further JNI calls must not be made while a
 *         lookup failure (pending exception) is outstanding.
 */
JNIEXPORT jint JNICALL JNI_OnLoad(JavaVM *vm, void *) {
    JniInvocation::init(vm);

    JNIEnv *env = nullptr;
    if (vm->GetEnv(reinterpret_cast<void **>(&env), JNI_VERSION_1_6) != JNI_OK)
        return -1;

    // --- SignalAnrTracer: Java callbacks + native methods ---
    jclass anrDetectiveCls = env->FindClass("com/tencent/matrix/trace/tracer/SignalAnrTracer");
    if (!anrDetectiveCls)
        return -1;
    gJ.AnrDetective = static_cast<jclass>(env->NewGlobalRef(anrDetectiveCls));
    if (!gJ.AnrDetective)
        return -1;

    gJ.AnrDetector_onANRDumped =
            env->GetStaticMethodID(anrDetectiveCls, "onANRDumped", "()V");
    if (!gJ.AnrDetector_onANRDumped)
        return -1;
    gJ.AnrDetector_onANRDumpTrace =
            env->GetStaticMethodID(anrDetectiveCls, "onANRDumpTrace", "()V");
    if (!gJ.AnrDetector_onANRDumpTrace)
        return -1;
    gJ.AnrDetector_onPrintTrace =
            env->GetStaticMethodID(anrDetectiveCls, "onPrintTrace", "()V");
    if (!gJ.AnrDetector_onPrintTrace)
        return -1;

    if (env->RegisterNatives(
            anrDetectiveCls, ANR_METHODS, static_cast<jint>(NELEM(ANR_METHODS))) != 0)
        return -1;

    env->DeleteLocalRef(anrDetectiveCls);

    // --- ThreadPriorityTracer: Java callbacks + native methods ---
    jclass threadPriorityDetectiveCls = env->FindClass(
            "com/tencent/matrix/trace/tracer/ThreadPriorityTracer");
    if (!threadPriorityDetectiveCls)
        return -1;
    gJ.ThreadPriorityDetective = static_cast<jclass>(env->NewGlobalRef(threadPriorityDetectiveCls));
    if (!gJ.ThreadPriorityDetective)
        return -1;

    gJ.ThreadPriorityDetective_onMainThreadPriorityModified =
            env->GetStaticMethodID(threadPriorityDetectiveCls, "onMainThreadPriorityModified",
                                   "(I)V");
    if (!gJ.ThreadPriorityDetective_onMainThreadPriorityModified)
        return -1;
    gJ.ThreadPriorityDetective_onMainThreadTimerSlackModified =
            env->GetStaticMethodID(threadPriorityDetectiveCls, "onMainThreadTimerSlackModified",
                                   "(J)V");
    if (!gJ.ThreadPriorityDetective_onMainThreadTimerSlackModified)
        return -1;

    if (env->RegisterNatives(
            threadPriorityDetectiveCls, THREAD_PRIORITY_METHODS,
            static_cast<jint>(NELEM(THREAD_PRIORITY_METHODS))) != 0)
        return -1;

    env->DeleteLocalRef(threadPriorityDetectiveCls);

    return JNI_VERSION_1_6;
}

3.AnrDumper.h 定义AnrDumper,继承SignalHandler

namespace MatrixTracer {
 
// SignalHandler specialization that reacts to SIGQUIT (see handleSignal).
class AnrDumper : public SignalHandler {
 public:
    // Callback type: takes no arguments and returns bool.
    using DumpCallbackFunction = std::function<bool()>;
 
    // anrTraceFile / printTraceFile: destination paths for the two kinds of
    // trace; callback is taken by rvalue reference and stored in mCallback.
    AnrDumper(const char* anrTraceFile, const char* printTraceFile, DumpCallbackFunction&& callback);
    virtual ~AnrDumper();
 
 private:
    // Signal entry point inherited from SignalHandler.
    Result handleSignal(int sig, const siginfo_t *info, void *uc) final;
    const DumpCallbackFunction mCallback;
};
}   // namespace MatrixTracer
 
#endif  // LAGDETECTOR_LAG_DETECTOR_MAIN_CPP_ANRDUMPER_H_

4.AnrDumper.cc handleSignal方法监听 SIGQUIT信号,并根据其他进程还是自己进程来调用anrCallback 或者siUserCallback,

4.1 anr是system_server进程发来的SIGQUIT,anrCallback代表可能发生了anr,之后会调用anrDumpCallback,让SignalAnrTracer检测ui线程是否block或者状态为NOT_RESPONDING

// Thread name prefix looked for in /proc/<pid>/task/<tid>/comm.
#define SIGNAL_CATCHER_THREAD_NAME "Signal Catcher"
// Expected "SigBlk" value (from the thread's status file) of the thread that
// should receive the forwarded SIGQUIT; see getSignalCatcherThreadId().
// NOTE(review): presumably the blocked-signal mask of the catcher thread
// while it waits in sigwait — confirm.
#define SIGNAL_CATCHER_THREAD_SIGBLK 0x1000
// NOTE(review): these redefine open(2) flag names with literal values —
// confirm they cannot clash with <fcntl.h>.
#define O_WRONLY 00000001
#define O_CREAT 00000100
#define O_TRUNC 00001000

namespace MatrixTracer {
   // Signal mask that was active before the constructor unblocked SIGQUIT;
   // restored by the destructor.
   static sigset_t old_sigSet;
   // Trace destination paths handed to the constructor. Only the pointers are
   // stored, so the caller must keep the strings alive.
   const char *mAnrTraceFile;
   const char *mPrintTraceFile;

//建立了Signal Handler之后,我们发现在同时有sigwait和signal handler的情况下,
// 信号没有走到我们的signal handler而是依然被系统的Signal Catcher线程捕获到了,这是什么原因呢?
//
//原来是Android默认把SIGQUIT设置成了BLOCKED,所以只会响应sigwait而不会进入到我们设置的handler方法中。
// 我们通过pthread_sigmask或者sigprocmask把SIGQUIT设置为UNBLOCK,那么再次收到SIGQUIT时,就一定会进入到我们的handler方法中。需要这样设置:
   // Remembers the two trace paths, then removes SIGQUIT from the calling
   // thread's blocked set. Without unblocking, the signal would keep being
   // consumed by the runtime's sigwait-based catcher thread and never reach
   // our handler. The previous mask is saved in old_sigSet.
   AnrDumper::AnrDumper(const char *anrTraceFile, const char *printTraceFile,
                        AnrDumper::DumpCallbackFunction &&callback) : mCallback(callback) {
       mAnrTraceFile = anrTraceFile;
       mPrintTraceFile = printTraceFile;

       sigset_t quitOnly;
       sigemptyset(&quitOnly);
       sigaddset(&quitOnly, SIGQUIT);
       pthread_sigmask(SIG_UNBLOCK, &quitOnly, &old_sigSet);
   }

   //得到SignalCatcherThreadId,todo 没看明白
   /**
    * Finds the tid of the runtime's "Signal Catcher" thread.
    *
    * Walks /proc/<pid>/task, keeps the entries whose comm starts with
    * SIGNAL_CATCHER_THREAD_NAME and returns the first one whose status file
    * reports SigBlk == SIGNAL_CATCHER_THREAD_SIGBLK. If none matches the mask,
    * the first thread with the right name is returned instead.
    *
    * @return the tid, or -1 when the task directory cannot be opened or no
    *         thread with that name exists
    */
   static int getSignalCatcherThreadId() {
       char taskDirPath[128];
       DIR *taskDir;
       // Scanned with SCNx64, so it has to be a 64-bit unsigned object.
       uint64_t sigblk;
       int signalCatcherTid = -1;
       int firstSignalCatcherTid = -1;

       snprintf(taskDirPath, sizeof(taskDirPath), "/proc/%d/task", getpid());
       if ((taskDir = opendir(taskDirPath)) == nullptr) {
           return -1;
       }
       struct dirent *dent;
       pid_t tid;
       while ((dent = readdir(taskDir)) != nullptr) {
           // Non-numeric entries ("." and "..") convert to 0 and are skipped.
           tid = atoi(dent->d_name);
           if (tid <= 0) {
               continue;
           }

           // Zero-initialized so that a failed read compares as an empty name
           // instead of whatever happened to be on the stack.
           char threadName[1024] = {};
           char commFilePath[1024];
           snprintf(commFilePath, sizeof(commFilePath), "/proc/%d/task/%d/comm", getpid(), tid);

           Support::readFileAsString(commFilePath, threadName, sizeof(threadName));

           // Prefix compare: the comm file content may carry a trailing newline.
           if (strncmp(SIGNAL_CATCHER_THREAD_NAME, threadName,
                       sizeof(SIGNAL_CATCHER_THREAD_NAME) - 1) != 0) {
               continue;
           }

           if (firstSignalCatcherTid == -1) {
               firstSignalCatcherTid = tid;
           }

           sigblk = 0;
           char taskPath[128];
           snprintf(taskPath, sizeof(taskPath), "/proc/%d/status", tid);

           ScopedFileDescriptor fd(open(taskPath, O_RDONLY, 0));
           LineReader lr(fd.get());
           const char *line;
           size_t len;
           while (lr.getNextLine(&line, &len)) {
               if (1 == sscanf(line, "SigBlk: %" SCNx64, &sigblk)) {
                   break;
               }
               lr.popLine(len);
           }
           if (SIGNAL_CATCHER_THREAD_SIGBLK != sigblk) {
               continue;
           }
           signalCatcherTid = tid;
           break;
       }
       closedir(taskDir);

       if (signalCatcherTid == -1) {
           signalCatcherTid = firstSignalCatcherTid;
       }
       return signalCatcherTid;
   }

//我们通过Signal Handler抢到了SIGQUIT后,原本的Signal Catcher线程中的sigwait就不再能收到SIGQUIT了,
// 原本的dump堆栈的逻辑就无法完成了,我们为了ANR的整个逻辑和流程跟原来完全一致,需要在Signal Handler里面重新向Signal Catcher线程发送一个SIGQUIT:
   // Re-sends SIGQUIT to the "Signal Catcher" thread so that the runtime's own
   // dump logic still runs after our handler intercepted the signal.
   // Nothing is sent when the thread could not be located.
   static void sendSigToSignalCatcher() {
       const int tid = getSignalCatcherThreadId();
       if (tid <= 0) {
           // No valid target; tgkill would only fail on an invalid tid.
           return;
       }
       syscall(SYS_tgkill, getpid(), tid, SIGQUIT);
   }

   //SIGQUIT发生了,其他进程发来的,anr是system_server进程发来的消息,不是自己进程发来的
   // Thread body used when SIGQUIT came from another process (the case that
   // may be an ANR): notifies the Java side, installs the trace hooks if an
   // ANR trace path is configured, then forwards the signal.
   static void *anrCallback(void *arg) {
       anrDumpCallback();

       const bool hasAnrTracePath = strlen(mAnrTraceFile) > 0;
       if (hasAnrTracePath) {
           hookAnrTraceWrite(false);
       }

       sendSigToSignalCatcher();
       return nullptr;
   }

   //SIGQUIT发生了,自己进程发来的,不是anr
   // Thread body used when SIGQUIT was sent by this process itself (not an
   // ANR, so the Java side is not notified): installs the trace hooks if a
   // print-trace path is configured, then forwards the signal.
   static void *siUserCallback(void *arg) {
       const bool hasPrintTracePath = strlen(mPrintTraceFile) > 0;
       if (hasPrintTracePath) {
           hookAnrTraceWrite(true);
       }

       sendSigToSignalCatcher();
       return nullptr;
   }

//另外,Signal Handler回调的第二个参数siginfo_t,也包含了一些有用的信息,该结构体的第三个字段si_code表示该信号被
// 发送的方法,SI_USER表示信号是通过kill发送的,SI_QUEUE表示信号是通过sigqueue发送的。但在Android的ANR流程中,
// 高版本使用的是sigqueue发送的信号,某些低版本使用的是kill发送的信号,并不统一。
//
//而第五个字段(极少数机型上是第四个字段)si_pid表示的是发送该信号的进程的pid,这里适用几乎所有Android版本和机型的
// 一个条件是:如果发送信号的进程是自己的进程,那么一定不是一个ANR。可以通过这个条件排除自己发送SIGQUIT,
// 而导致误报的情况。
   /**
    * Handles SIGQUIT. The sender pid is read from two raw siginfo slots
    * (_si_pad[3] and _si_pad[4]); if neither equals our own pid the signal is
    * treated as coming from another process (possible ANR) and anrCallback is
    * run, otherwise siUserCallback is run. The work happens on a newly
    * created, detached thread.
    *
    * @return NOT_HANDLED for any signal other than SIGQUIT,
    *         HANDLED_NO_RETRIGGER otherwise
    */
   SignalHandler::Result AnrDumper::handleSignal(int sig, const siginfo_t *info, void *uc) {
       // Only process SIGQUIT, which indicates an ANR.
       if (sig != SIGQUIT) return NOT_HANDLED;

       const int fromPid1 = info->_si_pad[3];
       const int fromPid2 = info->_si_pad[4];
       const int myPid = getpid();
       // A signal we sent to ourselves can never be an ANR.
       const bool sentByOtherProcess = fromPid1 != myPid && fromPid2 != myPid;

       pthread_t thd;
       const int createResult = pthread_create(
               &thd, nullptr, sentByOtherProcess ? anrCallback : siUserCallback, nullptr);
       // thd is only a valid handle when creation succeeded.
       if (createResult == 0) {
           pthread_detach(thd);
       }

       return HANDLED_NO_RETRIGGER;
   }

   //没用到
   // pthread-style wrapper around anrDumpTraceCallback(); always returns nullptr.
   // Not referenced anywhere in the visible source.
   static void *anr_trace_callback(void *args) {
       anrDumpTraceCallback();
       return nullptr;
   }

   //没用到
   // pthread-style wrapper around printTraceCallback(); always returns nullptr.
   // Not referenced anywhere in the visible source.
   static void *print_trace_callback(void *args) {
       printTraceCallback();
       return nullptr;
   }


   // Restores the signal mask that the constructor saved in old_sigSet.
   AnrDumper::~AnrDumper() {
       pthread_sigmask(SIG_SETMASK, &old_sigSet, nullptr);
   }

}   // namespace MatrixTracer

5.我们的SignalHandler类

5.1 signalHandler方法主要是收到了信号

5.2 handleSignal处理信号

namespace MatrixTracer {
 
    // Abstract base class for objects that want to receive signals.
    // Subclasses implement handleSignal(); the class is neither copyable nor
    // assignable.
    class SignalHandler {
    public:
        SignalHandler();
 
        // Virtual so that subclasses are destroyed correctly through a base pointer.
        virtual ~SignalHandler();
 
    protected:
        // Outcome reported by handleSignal().
        enum Result {
            NOT_HANDLED = 0, HANDLED, HANDLED_NO_RETRIGGER
        };
        // Called with the arguments of the installed sigaction handler.
        virtual Result handleSignal(int sig, const siginfo_t *info, void *uc) = 0;
 
    private:
        // Process-wide handler with the sa_sigaction signature.
        static void signalHandler(int sig, siginfo_t *info, void *uc);
 
        static bool installHandlersLocked();
 
        SignalHandler(const SignalHandler &) = delete;             // no copy construction
        SignalHandler &operator=(const SignalHandler &) = delete;  // no copy assignment
    };
 
}   // namespace MatrixTracer
 
#endif  // LAGDETECTOR_LAG_DETECTOR_MAIN_CPP_SIGNALHANDLER_H_
6.SignalHandler.cc
6.1 installHandlersLocked 通过sigaction方法建立一个Signal Handler,并把sa_sigaction设置为我们的signalHandler方法的地址
6.2 signalHandler 信号处理的地方,转发给各SignalHandler的handleSignal
// Name of the runtime thread that normally receives SIGQUIT.
#define SIGNAL_CATCHER_THREAD_NAME "Signal Catcher"
// Blocked-signal mask value associated with that thread.
// NOTE(review): neither macro is used in the visible part of this file — confirm.
#define SIGNAL_CATCHER_THREAD_SIGBLK 0x1000
 
namespace MatrixTracer {
// The one signal this module handles.
    const int TARGET_SIG = SIGQUIT;//3
// sigaction that was installed before ours; saved by installHandlersLocked().
    struct sigaction sOldHandlers;
    bool sHandlerInstalled = false;
 
// The global signal handler stack. This is needed because there may exist
// multiple SignalHandler instances in a process. Each will have itself
// registered in this stack.
    static std::vector<SignalHandler *> *sHandlerStack = nullptr;
// Mutex associated with sHandlerStack.
    static std::mutex sHandlerStackMutex;
    static bool sStackInstalled = false;
// InstallAlternateStackLocked will store the newly installed stack in new_stack
// and (if it exists) the previously installed stack in old_stack.
    static stack_t sOldStack;
    static stack_t sNewStack;
 
    // Makes sure an alternate signal stack of at least kSigStackSize bytes is
    // registered for the calling thread.
    //
    // The previously registered stack (if any) is recorded in sOldStack. A new
    // stack is allocated and registered only when querying the current one
    // fails, when none is set, or when the existing one is too small; the new
    // stack is recorded in sNewStack. On allocation or registration failure the
    // function returns without marking the stack as installed.
    //
    // The "Locked" suffix reflects that the constructor calls this while holding
    // sHandlerStackMutex.
    static void installAlternateStackLocked() {
        if (sStackInstalled)
            return;

        // Start from zeroed descriptors so stale values never leak into the
        // comparison done later by restoreAlternateStackLocked().
        memset(&sOldStack, 0, sizeof(sOldStack));
        memset(&sNewStack, 0, sizeof(sNewStack));

        // SIGSTKSZ may be small; never go below 16 KiB.
        static constexpr unsigned kSigStackSize = std::max(16384, SIGSTKSZ);

        // Fetch the currently registered alternate stack into sOldStack.
        if (sigaltstack(nullptr, &sOldStack) == -1 || !sOldStack.ss_sp ||
            sOldStack.ss_size < kSigStackSize) {
            void *stackMemory = calloc(1, kSigStackSize);
            if (stackMemory == nullptr) {
                // Out of memory: do not register a null stack with the kernel.
                return;
            }
            sNewStack.ss_sp = stackMemory;
            sNewStack.ss_size = kSigStackSize;

            // Register the freshly allocated stack.
            if (sigaltstack(&sNewStack, nullptr) == -1) {
                free(stackMemory);
                // Do not keep a dangling pointer to the released buffer.
                sNewStack.ss_sp = nullptr;
                sNewStack.ss_size = 0;
                return;
            }
        }

        sStackInstalled = true;
        ALOGV("Alternative stack installed.");
    }
 
// Runs before crashing: normal context.
//    我们可以通过sigaction方法,建立一个Signal Handler:ok
    // Installs signalHandler() as the process-wide handler for TARGET_SIG.
    //
    // The previously installed action is saved in sOldHandlers so that
    // restoreHandlersLocked() can put it back later.
    //
    // Returns false when a handler is already installed or when the previous
    // action cannot be read; returns true otherwise. A failure of the actual
    // installation is only logged (see the comment below) and still yields true.
    //
    // The "Locked" suffix reflects that the constructor calls this while holding
    // sHandlerStackMutex.
    bool SignalHandler::installHandlersLocked() {
        if (sHandlerInstalled) {
            return false;
        }
        // Fail if unable to store all the old handlers.
        if (sigaction(TARGET_SIG, nullptr, &sOldHandlers) == -1) {
            return false;
        }

        struct sigaction sa{};
        // POSIX requires a sigset_t to be initialised with sigemptyset() or
        // sigfillset() before use; installDefaultHandler() does the same.
        sigemptyset(&sa.sa_mask);
        sa.sa_sigaction = signalHandler;  // three-argument handler entry point
        // SA_SIGINFO selects sa_sigaction, SA_ONSTACK runs the handler on the
        // alternate stack, SA_RESTART restarts interrupted system calls.
        sa.sa_flags = SA_ONSTACK | SA_SIGINFO | SA_RESTART;
        if (sigaction(TARGET_SIG, &sa, nullptr) == -1) {
            ALOGV("Signal handler cannot be installed");

            // At this point it is impractical to back out changes, and so failure to
            // install a signal is intentionally ignored.
        }

        sHandlerInstalled = true;
        ALOGV("Signal handler installed.");
        return true;
    }
 
    //todo
    // Resets the disposition of `sig` to the default action (SIG_DFL) with an
    // empty signal mask and SA_RESTART set. The result of sigaction() is not
    // checked.
    static void installDefaultHandler(int sig) {
        // Background kept from the original note: Android L+ expose signal and
        // sigaction symbols that override the system ones, and a request to set
        // the handler to SIG_DFL may be ignored there, which would make the
        // signal loop back into the custom handler forever.
        // NOTE(review): the note speaks of calling the system's sigaction
        // directly, but this code goes through the regular sigaction() — confirm
        // that this is sufficient on the targeted platforms.
        struct sigaction defaultAction{};
        defaultAction.sa_flags = SA_RESTART;
        defaultAction.sa_handler = SIG_DFL;
        sigemptyset(&defaultAction.sa_mask);

        sigaction(sig, &defaultAction, nullptr);
    }
 
// This function runs in a compromised context: see the top of the file.
// Runs on the crashing thread.
    // Puts back the signal action that was active before
    // installHandlersLocked() ran. If the saved action cannot be re-installed,
    // the default action is installed instead. Does nothing when no handler is
    // currently installed.
    static void restoreHandlersLocked() {
        if (sHandlerInstalled) {
            // Re-register the saved sOldHandlers for the monitored signal.
            const bool restored = sigaction(TARGET_SIG, &sOldHandlers, nullptr) != -1;
            if (!restored) {
                // Fall back to the default disposition.
                installDefaultHandler(TARGET_SIG);
            }

            sHandlerInstalled = false;
            ALOGV("Signal handler restored.");
        }
    }
    
    // Undoes installAlternateStackLocked(): if the alternate stack currently
    // registered is the one recorded in sNewStack, the previous stack
    // (sOldStack) is re-registered, or the alternate stack is disabled when
    // there was no previous one. The buffer recorded in sNewStack is then
    // released.
    //
    // If any sigaltstack() call fails the function returns early and leaves
    // sStackInstalled set, so nothing is freed while the kernel may still
    // reference the buffer.
    static void restoreAlternateStackLocked() {
        if (!sStackInstalled)
            return;

        stack_t current_stack{};
        if (sigaltstack(nullptr, &current_stack) == -1)
            return;
        // Only restore the old_stack if the current alternative stack is the one
        // installed by the call to InstallAlternateStackLocked.
        if (current_stack.ss_sp == sNewStack.ss_sp) {
            if (sOldStack.ss_sp) {
                if (sigaltstack(&sOldStack, nullptr) == -1)
                    return;
            } else {
                // Zero-initialise so ss_sp and ss_size are not passed to the
                // system call with indeterminate values.
                stack_t disable_stack{};
                disable_stack.ss_flags = SS_DISABLE;
                if (sigaltstack(&disable_stack, nullptr) == -1)
                    return;
            }
        }

        free(sNewStack.ss_sp);
        // Drop the now-dangling pointer so it can never be compared or freed again.
        sNewStack.ss_sp = nullptr;
        sNewStack.ss_size = 0;
        sStackInstalled = false;
    }
 
// This function runs in a compromised context: see the top of the file.
// Runs on the crashing thread.
// 发生信号处理的地方,转发给各sHandlerStack的handleSignal ok
    // Process-wide entry point registered through sigaction(). Forwards the
    // signal to every registered SignalHandler, most recently registered first,
    // by calling its handleSignal(). The values returned by handleSignal() are
    // not inspected.
    //
    // sig:  the signal number delivered.
    // info: the siginfo_t supplied by the system.
    // uc:   the opaque context pointer supplied by the system.
    //
    // NOTE(review): taking a std::mutex inside a signal handler is not
    // async-signal-safe in general; this mirrors the original design — confirm
    // that it is acceptable for the way this signal is delivered.
    void SignalHandler::signalHandler(int sig, siginfo_t *info, void *uc) {
        ALOGV("Entered signal handler.");

        // Released automatically when the function returns.
        std::lock_guard<std::mutex> lock(sHandlerStackMutex);

        // The destructor deletes and nulls the stack under this same mutex
        // before the previous handler is restored, so a handler that was waiting
        // for the mutex can find the stack already gone.
        if (sHandlerStack == nullptr) {
            return;
        }

        for (auto it = sHandlerStack->rbegin(); it != sHandlerStack->rend(); ++it) {
            (*it)->handleSignal(sig, info, uc);
        }
    }
 
 
    // Registers this instance in the global handler stack. The stack itself is
    // created on first use; the alternate signal stack and the process-wide
    // signal handler are set up before the instance is added.
    SignalHandler::SignalHandler() {
        // Hold the stack mutex for the whole registration.
        std::lock_guard<std::mutex> guard(sHandlerStackMutex);

        // Create the global handler stack the first time it is needed.
        if (sHandlerStack == nullptr) {
            sHandlerStack = new std::vector<SignalHandler *>;
        }

        // Both calls return immediately when their work was already done.
        installAlternateStackLocked();
        installHandlersLocked();

        // Finally add this instance to the stack.
        sHandlerStack->push_back(this);
    }
 
    // Removes this instance from the global handler stack. When the last
    // instance goes away, the stack is destroyed and the alternate signal stack
    // and the previous signal action are restored.
    SignalHandler::~SignalHandler() {
        std::lock_guard<std::mutex> lock(sHandlerStackMutex);

        // Nothing to unregister from if the stack does not exist.
        if (sHandlerStack == nullptr) {
            return;
        }

        // erase(end()) is undefined behaviour, so only erase when this instance
        // is actually registered.
        auto it = std::find(sHandlerStack->begin(), sHandlerStack->end(), this);
        if (it != sHandlerStack->end()) {
            sHandlerStack->erase(it);
        }

        if (sHandlerStack->empty()) {
            delete sHandlerStack;
            sHandlerStack = nullptr;
            restoreAlternateStackLocked();
            restoreHandlersLocked();
        }
    }