1、前言
KOOM 的内存监控,分为三大块,分别为 Java 、Native 和 Thread,此篇主要是对 Java 层的内存监控模块的探索。
- KOOM 源码解读 - java 监控
- KOOM 源码解读 - native 监控
- KOOM 源码解读 - thread 监控
2、源码分析
1、启动监控
从 demo 中的入口开始
case R.id.btn_make_java_leak:
/*
* Init OOMMonitor
*/
// 1.1 Initialize the monitor configuration
OOMMonitorInitTask.INSTANCE.init(JavaLeakTestActivity.this.getApplication());
// 1.2 Start the OOMMonitor loop
OOMMonitor.INSTANCE.startLoop(true, false,5_000L);
/*
* Make some leaks for test!
*/
// 1.3 Create memory leaks on purpose
LeakMaker.makeLeak(this);
break;
case R.id.btn_hprof_dump:
//Pull the hprof from the devices.
//adb shell "run-as com.kwai.koom.demo cat 'files/test.hprof'" > ~/temp/test.hprof
// Dump a heap snapshot (hprof) into the app's files directory
ForkStripHeapDumper.getInstance().dump(
this.getFilesDir().getAbsolutePath() + File.separator + "test.hprof");
break;
1.1 初始化配置
/**
 * Demo init task: builds an [OOMMonitorConfig] with deliberately low test thresholds
 * and registers it through [MonitorManager.addMonitorConfig].
 */
object OOMMonitorInitTask : InitTask {
    override fun init(application: Application) {
        // Placeholder hprof uploader: only logs the file name.
        val hprofUploader = object : OOMHprofUploader {
            override fun upload(file: File, type: OOMHprofUploader.HprofType) {
                MonitorLog.e("OOMMonitor", "todo, upload hprof ${file.name} if necessary")
            }
        }

        // Placeholder report uploader: logs the report content and the file name.
        val reportUploader = object : OOMReportUploader {
            override fun upload(file: File, content: String) {
                MonitorLog.i("OOMMonitor", content)
                MonitorLog.e("OOMMonitor", "todo, upload report ${file.name} if necessary")
            }
        }

        val monitorConfig = OOMMonitorConfig.Builder()
            .setThreadThreshold(50) //50 only for test! Please use default value!
            .setFdThreshold(300) // 300 only for test! Please use default value!
            .setHeapThreshold(0.9f) // 0.9f for test! Please use default value!
            .setVssSizeThreshold(1_000_000) // 1_000_000 for test! Please use default value!
            .setMaxOverThresholdCount(1) // 1 for test! Please use default value!
            .setAnalysisMaxTimesPerVersion(3) // Consider use default value!
            .setAnalysisPeriodPerVersion(15 * 24 * 60 * 60 * 1000) // Consider use default value!
            .setLoopInterval(5_000) // 5_000 for test! Please use default value!
            .setEnableHprofDumpAnalysis(true)
            .setHprofUploader(hprofUploader)
            .setReportUploader(reportUploader)
            .build()

        MonitorManager.addMonitorConfig(monitorConfig)
    }
}
1.2 启动 OOMMonitor
// Trackers polled on every pass; each one watches a different metric
private val mOOMTrackers = mutableListOf(
HeapOOMTracker(), ThreadOOMTracker(), FdOOMTracker(),
PhysicalMemoryOOMTracker(), FastHugeMemoryOOMTracker()
)
// Starts the monitoring loop; bails out early when the monitor has not been initialized.
override fun startLoop(clearQueue: Boolean, postAtFront: Boolean, delayMillis: Long) {
throwIfNotInitialized { return }
...
// Delegate to the parent class's startLoop, which then goes on to execute call()
super.startLoop(clearQueue, postAtFront, delayMillis)
}
// Loop callback: one invocation performs one tracking pass and returns its LoopState.
override fun call(): LoopState {
...
// Run trackOOM() and hand its result back to the loop
return trackOOM()
}
/**
 * Runs one monitoring pass.
 *
 * Refreshes [SystemInfo], collects the reason of every tracker whose `track()` returns
 * true, and, when at least one reason exists and hprof analysis is enabled, either logs
 * that the analysis quota is exhausted or schedules [dumpAndAnalysis] asynchronously.
 *
 * @return [LoopState.Terminate] once a trigger was handled, [LoopState.Continue] otherwise.
 */
private fun trackOOM(): LoopState {
    // Refresh the system information snapshot that the trackers read
    SystemInfo.refresh()

    mTrackReasons.clear()
    mOOMTrackers.forEach { tracker ->
        // 1.2.1 Ask each tracker whether its metric crossed the configured limit
        if (tracker.track()) {
            mTrackReasons.add(tracker.reason())
        }
    }

    val triggered = mTrackReasons.isNotEmpty() && monitorConfig.enableHprofDumpAnalysis
    if (!triggered) {
        return LoopState.Continue
    }

    val quotaExhausted = isExceedAnalysisPeriod() || isExceedAnalysisTimes()
    if (quotaExhausted) {
        MonitorLog.e(TAG, "Triggered, but exceed analysis times or period!")
        return LoopState.Terminate
    }

    async {
        MonitorLog.i(TAG, "mTrackReasons:${mTrackReasons}")
        // At least one metric is over its limit: dump the hprof file and analyze it
        dumpAndAnalysis()
    }
    return LoopState.Terminate
}
总结下这里的流程:
- 调用 OOMMonitor.startLoop 开启监控,再到父类的 LoopMonitor.startLoop,然后最终会不断调用 OOMMonitor.call() 方法,实现每隔一段时间就检测一次。具体逻辑可以查看 KOOM 源码解读 - 开篇
- call() 方法调用 trackOOM() 方法
- 更新堆栈和内存cpu等信息
- 通过 oomTracker.track() 判断 mOOMTrackers 中哪个 Tracker 达到设定的指标
- 如果有满足条件的,执行 dumpAndAnalysis() ,dump hprof文件并且分析
1.2.1 mOOMTrackers 中的 5 种 Tracker
1、HeapOOMTracker 堆内存的监控指标判断
companion object {
    // Largest drop in heap ratio between two samples that still counts as "staying high"
    private const val HEAP_RATIO_THRESHOLD_GAP = 0.05f
}

private var mLastHeapRatio = 0.0f
private var mOverThresholdCount = 0

/**
 * Samples the Java heap usage ratio and counts consecutive over-threshold samples.
 *
 * A sample counts when the ratio is above the configured heap threshold and has not
 * dropped by more than [HEAP_RATIO_THRESHOLD_GAP] since the previous sample; any other
 * sample calls reset().
 *
 * @return true once the consecutive count reaches the configured maximum.
 */
override fun track(): Boolean {
    val currentRatio = SystemInfo.javaHeap.rate
    val aboveThreshold = currentRatio > monitorConfig.heapThreshold
    val notDropping = currentRatio >= mLastHeapRatio - HEAP_RATIO_THRESHOLD_GAP

    if (aboveThreshold && notDropping) {
        mOverThresholdCount++
    } else {
        reset()
    }

    mLastHeapRatio = currentRatio
    return mOverThresholdCount >= monitorConfig.maxOverThresholdCount
}
2、ThreadOOMTracker 线程监控指标判断
companion object {
    // Largest drop in thread count between two samples that still counts as "staying high"
    private const val THREAD_COUNT_THRESHOLD_GAP = 50
}

private var mLastThreadCount = 0
private var mOverThresholdCount = 0

/**
 * Samples the process thread count and counts consecutive over-threshold samples.
 *
 * A sample counts when the thread count is above the configured threshold and has not
 * dropped by more than [THREAD_COUNT_THRESHOLD_GAP] since the previous sample; in that
 * case the thread names are also dumped. Any other sample calls reset().
 *
 * @return true once the consecutive count reaches the configured maximum.
 */
override fun track(): Boolean {
    // Thread count of the current process
    val currentCount = SystemInfo.procStatus.thread
    val aboveThreshold = currentCount > monitorConfig.threadThreshold
    val notDropping = currentCount >= mLastThreadCount - THREAD_COUNT_THRESHOLD_GAP

    if (aboveThreshold && notDropping) {
        mOverThresholdCount++
        // Record the thread names of the current process
        dumpThreadIfNeed()
    } else {
        reset()
    }

    mLastThreadCount = currentCount
    return mOverThresholdCount >= monitorConfig.maxOverThresholdCount
}
/**
 * Writes the names of the current process's threads, comma-separated, to the thread
 * dump file.
 *
 * Does nothing once the over-threshold count has gone past the configured maximum.
 * Names are read from `/proc/self/task/<entry>/comm`; an unreadable entry is replaced
 * by a "failed to read" marker, and write failures are ignored.
 */
private fun dumpThreadIfNeed() {
    if (mOverThresholdCount > monitorConfig.maxOverThresholdCount) return

    val taskDirs = runCatching { File("/proc/self/task").listFiles() }
        .getOrElse {
            MonitorLog.i(TAG, "/proc/self/task child files is empty")
            emptyArray()
        }

    val threadNames = taskDirs
        ?.map { taskDir ->
            runCatching { File(taskDir, "comm").readText() }
                .getOrElse { "failed to read $taskDir/comm" }
                // Strip the single trailing newline, if there is one
                .removeSuffix("\n")
        }
        ?: emptyList()

    val dumpFile = OOMFileManager.createDumpFile(OOMFileManager.threadDumpDir)
    runCatching { dumpFile.writeText(threadNames.joinToString(separator = ",")) }
}
3、FdOOMTracker 文件打开数监控指标判断
companion object {
    // Largest drop in fd count between two samples that still counts as "staying high"
    private const val FD_COUNT_THRESHOLD_GAP = 50
}

private var mLastFdCount = 0
private var mOverThresholdCount = 0

/**
 * Samples the number of open file descriptors and counts consecutive over-threshold
 * samples.
 *
 * A sample counts when the fd count is above the configured threshold and has not
 * dropped by more than [FD_COUNT_THRESHOLD_GAP] since the previous sample; in that case
 * the fd targets are also dumped. Any other sample calls reset().
 *
 * @return true once the consecutive count reaches the configured maximum.
 */
override fun track(): Boolean {
    // Number of file descriptors currently held by this process
    val currentCount = getFdCount()
    val aboveThreshold = currentCount > monitorConfig.fdThreshold
    val notDropping = currentCount >= mLastFdCount - FD_COUNT_THRESHOLD_GAP

    if (aboveThreshold && notDropping) {
        mOverThresholdCount++
        // Record what the descriptors point at
        dumpFdIfNeed()
    } else {
        reset()
    }

    mLastFdCount = currentCount
    return mOverThresholdCount >= monitorConfig.maxOverThresholdCount
}
/** Counts the entries of `/proc/self/fd`; returns 0 when the directory cannot be listed. */
private fun getFdCount(): Int = File("/proc/self/fd").listFiles()?.size ?: 0
/**
 * Writes the sorted, comma-separated link targets of the current process's file
 * descriptors to the fd dump file.
 *
 * Does nothing once the over-threshold count has gone past the configured maximum, or
 * on SDK levels below LOLLIPOP. An unreadable link is replaced by a "failed to read
 * link" marker, and write failures are ignored.
 */
private fun dumpFdIfNeed() {
    if (mOverThresholdCount > monitorConfig.maxOverThresholdCount) return
    if (Build.VERSION.SDK_INT < Build.VERSION_CODES.LOLLIPOP) return

    val fdEntries = runCatching { File("/proc/self/fd").listFiles() }
        .getOrElse { emptyArray() }

    val fdNames = fdEntries
        ?.map { fdEntry ->
            runCatching { Os.readlink(fdEntry.path) }
                .getOrElse { "failed to read link ${fdEntry.path}" }
        }
        ?: emptyList()

    val dumpFile = OOMFileManager.createDumpFile(OOMFileManager.fdDumpDir)
    runCatching { dumpFile.writeText(fdNames.sorted().joinToString(separator = ",")) }
}
4、PhysicalMemoryOOMTracker 设备物理内存监控指标判断
/**
 * Logs how low the device memory rate has fallen; this tracker never triggers a dump.
 *
 * Only the first matching bucket is logged: below the configured device memory
 * threshold (error level), then below 10%, 15%, 20% and 30% (info level).
 *
 * @return always false.
 */
override fun track(): Boolean {
    val memInfo = SystemInfo.memInfo

    if (memInfo.rate < monitorConfig.deviceMemoryThreshold) {
        MonitorLog.e(TAG, "oom meminfo.rate < " +
            "${monitorConfig.deviceMemoryThreshold * 100}%")
        //return true // report only for now, do not actually trigger a dump
    } else if (memInfo.rate < 0.10f) {
        MonitorLog.i(TAG, "oom meminfo.rate < 10.0%")
    } else if (memInfo.rate < 0.15f) {
        MonitorLog.i(TAG, "oom meminfo.rate < 15.0%")
    } else if (memInfo.rate < 0.20f) {
        MonitorLog.i(TAG, "oom meminfo.rate < 20.0%")
    } else if (memInfo.rate < 0.30f) {
        MonitorLog.i(TAG, "oom meminfo.rate < 30.0%")
    }

    return false
}
5、FastHugeMemoryOOMTracker 大内存对象创建和高危阈值监控指标判断
/**
 * Triggers immediately, without counting consecutive samples, in two situations:
 * the Java heap rate is above the force-dump threshold, or the used heap grew by more
 * than the configured delta since the previous sample.
 *
 * @return true when either condition holds; mDumpReason records which one.
 */
override fun track(): Boolean {
    val currentHeap = SystemInfo.javaHeap

    // High watermark: trigger dump and analysis right away
    if (currentHeap.rate > monitorConfig.forceDumpJavaHeapMaxThreshold) {
        mDumpReason = REASON_HIGH_WATERMARK
        return true
    }

    // Huge delta: the check is skipped while the previous sample's max is still 0
    val previousHeap = SystemInfo.lastJavaHeap
    val grewTooFast = previousHeap.max != 0L &&
        currentHeap.used - previousHeap.used >
        SizeUnit.KB.toByte(monitorConfig.forceDumpJavaHeapDeltaThreshold)
    if (!grewTooFast) {
        return false
    }

    mDumpReason = REASON_HUGE_DELTA
    return true
}
1.3 手动制造内存泄露
/**
 * Test helper that deliberately leaks memory and threads.
 *
 * Appends one leak maker of each kind to leakMakerList, starts every maker in the
 * list, then starts 700 threads that each sleep for 200 seconds.
 */
public static void makeLeak(Context context) {
    leakMakerList.add(new ActivityLeakMaker());
    leakMakerList.add(new BitmapLeakMaker());
    leakMakerList.add(new ByteArrayLeakMaker());
    leakMakerList.add(new FragmentLeakMaker());
    leakMakerList.add(new StringLeakMaker());

    for (LeakMaker maker : leakMakerList) {
        maker.startLeak(context);
    }

    // Each thread simply parks itself for a long time
    final Runnable longSleep = () -> {
        try {
            Thread.sleep(200000);
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
    };
    for (int remaining = 700; remaining > 0; remaining--) {
        new Thread(longSleep).start();
    }
}
2、dump 内存快照
// Dumps an hprof file through ForkJvmHeapDumper, then starts the analysis service on it.
private fun dumpAndAnalysis() {
...
// 2.1 Save the hprof file through ForkJvmHeapDumper.dump
ForkJvmHeapDumper.getInstance().run {
dump(hprofFile.absolutePath)
}
// Wait one second so the file can reach the disk
Thread.sleep(1000) // make sure file synced to disk.
// 3. Start HeapAnalysisService to analyze the hprof file
startAnalysisService(hprofFile, jsonFile, mTrackReasons.joinToString())
}
2.1 ForkJvmHeapDumper dump 方法
/**
 * Dumps the heap to {@code path} from a forked child process.
 *
 * @param path destination of the hprof file
 * @return the result of resumeAndWait(pid) in the parent; false when the fork fails or
 *         an IOException is caught
 */
@Override
public synchronized boolean dump(String path) {
    try {
        // 2.2 Suspend the VM and fork (native method)
        final int forkResult = suspendAndFork();
        if (forkResult > 0) {
            // Parent process: forkResult is the child's pid. Resume the VM and wait
            // for the child to finish before returning.
            return resumeAndWait(forkResult);
        }
        if (forkResult == 0) {
            // 2.3 Child process (fork returns 0 inside the child): write the hprof
            // file, then exit the child.
            Debug.dumpHprofData(path);
            exitProcess();
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    return false;
}
2.2 挂起 JVM 并且 fork() 子进程
suspendAndFork() 方法为 native 方法
// JNI entry point for ForkJvmHeapDumper.suspendAndFork(); forwards to
// HprofDump::SuspendAndFork() and returns its result.
JNIEXPORT jint JNICALL
Java_com_kwai_koom_fastdump_ForkJvmHeapDumper_suspendAndFork(
JNIEnv *env ATTRIBUTE_UNUSED, jobject jobject ATTRIBUTE_UNUSED) {
// Implemented in hprof_dump.cpp
return HprofDump::GetInstance().SuspendAndFork();
}
HprofDump::SuspendAndFork()
// Suspends the VM (the mechanism depends on the Android API level) and then forks.
// Returns the result of fork(); in the child, a 60-second alarm is set and the process
// is renamed before returning.
pid_t HprofDump::SuspendAndFork() {
KCHECKI(init_done_)
if (android_api_ < __ANDROID_API_R__) {
// 2.2.1 Suspend the VM
suspend_vm_fnc_();
} else if (android_api_ <= __ANDROID_API_S__) {
void *self = __get_tls()[TLS_SLOT_ART_THREAD_SELF];
sgc_constructor_fnc_((void *)sgc_instance_.get(), self, kGcCauseHprof,
kCollectorTypeHprof);
ssa_constructor_fnc_((void *)ssa_instance_.get(), LOG_TAG, true);
// avoid deadlock with child process
exclusive_unlock_fnc_(*mutator_lock_ptr_, self);
sgc_destructor_fnc_((void *)sgc_instance_.get());
}
// 2.2.2 Fork the child process
pid_t pid = fork();
if (pid == 0) {
// Set timeout for child process
alarm(60);
prctl(PR_SET_NAME, "forked-dump-process");
}
return pid;
}
2.2.1 挂起 JVM
为什么 fork 子进程前,需要挂起 JVM?
其实 fork 进程前,是不需要挂起 JVM 的,这里面之所以要挂起 JVM,是因为 Dump 内存快照前,需要挂起子进程的所有线程。
为什么要先挂起 JVM,再 fork() 子进程?不能在子进程中再挂起 JVM 吗?
这里就涉及到多线程与 fork() ,简单来说,就是当前进程,如果存在多线程运行时,进行 fork(),会存在死锁、全局变量状态不一致或者内存泄露等问题。具体的可以参考下这篇文章:www.techug.com/post/linux-…
这里说下 fork 函数的特性。fork 函数被调用一次,却能够返回两次,它会有三种不同的返回值:
- 在父进程中,fork 返回新创建的子进程的进程id;
- 在子进程中,fork 返回0,表示子进程创建成功;
- 在父进程中,fork 返回负数,表示 fork 出现错误;
2.2.2 如何挂起 JVM
// Resolves the libart.so entry points used to suspend and resume the VM.
// Does nothing when already initialized or when the API level is below L.
void HprofDump::Initialize() {
if (init_done_ || android_api_ < __ANDROID_API_L__) {
return;
}
void *handle = kwai::linker::DlFcn::dlopen("libart.so", RTLD_NOW);
KCHECKV(handle)
if (android_api_ < __ANDROID_API_R__) {
// Symbol used to suspend the VM (art::Dbg::SuspendVM)
suspend_vm_fnc_ =
(void (*)())DlFcn::dlsym(handle, "_ZN3art3Dbg9SuspendVMEv");
KFINISHV_FNC(suspend_vm_fnc_, DlFcn::dlclose, handle)
// Symbol used to resume the VM (art::Dbg::ResumeVM)
resume_vm_fnc_ = (void (*)())kwai::linker::DlFcn::dlsym(
handle, "_ZN3art3Dbg8ResumeVMEv");
KFINISHV_FNC(resume_vm_fnc_, DlFcn::dlclose, handle)
} else if (android_api_ <= __ANDROID_API_S__) {
...
}
DlFcn::dlclose(handle);
init_done_ = true;
}
这里类似于 java 中的反射:先通过 dlsym 查找符号 "_ZN3art3Dbg9SuspendVMEv" 得到函数指针,之后调用该函数指针,就相当于调用了 Dbg::SuspendVM 方法。这里的 _ZN3art3Dbg9SuspendVMEv ,是 art::Dbg::SuspendVM 经过 C++ 名称修饰(name mangling)后生成的符号名,可以通过 arm-linux-androideabi-nm -D libart.so > test.txt 命令查看 libart.so 下的所有符号名。
2.3 resume JVM
// Resumes the VM in the parent process, then blocks until the forked child `pid` ends.
//
// Returns true only when the child exited normally (WIFEXITED). Returns false when the
// child was terminated abnormally or when waitpid() fails for a reason other than EINTR.
// NOTE(review): KCHECKB presumably returns false when init_done_ is not set - confirm
// against the macro definition.
bool HprofDump::ResumeAndWait(pid_t pid) {
  KCHECKB(init_done_)
  if (android_api_ < __ANDROID_API_R__) {
    // Resume the VM
    resume_vm_fnc_();
  } else if (android_api_ <= __ANDROID_API_S__) {
    ...
  }

  int status = 0;
  for (;;) {
    // Blocks the calling thread until the child process changes state
    if (waitpid(pid, &status, 0) != -1) {
      if (!WIFEXITED(status)) {
        ALOGE("Child process %d exited with status %d, terminated by signal %d",
              pid, WEXITSTATUS(status), WTERMSIG(status));
        return false;
      }
      return true;
    }
    // waitpid() was interrupted by a signal before the child changed state: retry
    // instead of reporting a failed dump.
    if (errno == EINTR) {
      continue;
    }
    // Any other waitpid() failure: `status` was not filled in, so do not inspect it.
    return false;
  }
}
跟 SuspendVM 一样,这里也是通过 dlsym 调用 "_ZN3art3Dbg8ResumeVMEv" ,相当于调用 Dbg::ResumeVM 方法。
2.4 dump 内存快照文件
@Override
public synchronized boolean dump(String path) {
...
int pid = suspendAndFork();
if (pid == 0) {
// Child process: write the hprof file, then exit
Debug.dumpHprofData(path);
exitProcess();
}
...
}
调用 Debug.dumpHprofData(path) 方法 dump hprof ,完成后 退出子进程。
这里还涉及到一个点,就是 针对 hprof 的文件过大问题,KOOM 对 hprof 进行了裁剪,那是通过什么方式实现 hprof 的裁剪?
其实,这里是通过 hook write 方法,在写入文件过程中裁剪。具体的代码,在 hprof_strip.cpp 文件中:
// Registers HookWrite as the replacement for `write` in the libraries listed below.
void HprofStrip::HookInit() {
xhook_enable_debug(0);
/**
*
* Android 7.x: the write call is in libc.so
* Android 8-9: the write call is in libart.so
* Android 10: the write call is in libartbase.so
* libbase.so is registered as a safety net in case the hook misses in the others
*/
...
xhook_register("libc.so", "write", (void *)HookWrite, nullptr);
xhook_register("libart.so", "write", (void *)HookWrite, nullptr);
xhook_register("libbase.so", "write", (void *)HookWrite, nullptr);
xhook_register("libartbase.so", "write", (void *)HookWrite, nullptr);
xhook_refresh(0);
xhook_clear();
}
// Replacement for write() installed by HookInit; forwards to the HprofStrip singleton.
static ssize_t HookWrite(int fd, const void *buf, size_t count) {
return HprofStrip::GetInstance().HookWriteInternal(fd, buf, count);
}
// Body of the hooked write(): strips heap-dump data while it is written to the hprof fd.
//
// Writes to any other fd are passed straight to write(). For the hprof fd, heap-dump
// records are handed to ProcessHeap(), which fills strip_index_list_pair_ with
// [start, end) byte ranges to drop; the record length field is patched to account for
// the stripped bytes and only the bytes outside those ranges are written out.
//
// Always returns `count` for the hprof fd, so the caller sees the whole buffer as
// consumed even though fewer bytes reached the file.
ssize_t HprofStrip::HookWriteInternal(int fd, const void *buf, ssize_t count) {
  if (fd != hprof_fd_) {
    return write(fd, buf, count);
  }

  // Reset the per-call strip state
  reset();

  const unsigned char tag = ((unsigned char *)buf)[0];
  // Only heap-related record tags are inspected; everything else is skipped for speed
  switch (tag) {
    case HPROF_TAG_HEAP_DUMP:
    case HPROF_TAG_HEAP_DUMP_SEGMENT: {
      ProcessHeap(
          buf,
          HEAP_TAG_BYTE_SIZE + RECORD_TIME_BYTE_SIZE + RECORD_LENGTH_BYTE_SIZE,
          count, heap_serial_num_, 0);
      heap_serial_num_++;
    } break;
    default:
      break;
  }

  // Patch the big-endian record length to reflect the stripped zygote/image space bytes
  if (tag == HPROF_TAG_HEAP_DUMP || tag == HPROF_TAG_HEAP_DUMP_SEGMENT) {
    int record_length = GetIntFromBytes(
        (unsigned char *)buf, HEAP_TAG_BYTE_SIZE + RECORD_TIME_BYTE_SIZE);
    record_length -= strip_bytes_sum_;
    int index = HEAP_TAG_BYTE_SIZE + RECORD_TIME_BYTE_SIZE;
    ((unsigned char *)buf)[index] =
        (unsigned char)(((unsigned int)record_length & 0xff000000u) >> 24u);
    ((unsigned char *)buf)[index + 1] =
        (unsigned char)(((unsigned int)record_length & 0x00ff0000u) >> 16u);
    ((unsigned char *)buf)[index + 2] =
        (unsigned char)(((unsigned int)record_length & 0x0000ff00u) >> 8u);
    ((unsigned char *)buf)[index + 3] =
        (unsigned char)((unsigned int)record_length & 0x000000ffu);
  }

  size_t total_write = 0;
  int start_index = 0;
  for (int i = 0; i < strip_index_; i++) {
    // Write the bytes that sit before the next stripped range
    void *write_buf = (void *)((unsigned char *)buf + start_index);
    // Keep the length signed: casting to size_t would turn a negative length into a
    // huge positive one and make the error branch below unreachable.
    const ssize_t write_len =
        (ssize_t)strip_index_list_pair_[i * 2] - (ssize_t)start_index;
    if (write_len > 0) {
      total_write += FullyWrite(fd, write_buf, (size_t)write_len);
    } else if (write_len < 0) {
      __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
                          "HookWrite array i:%d writeLen<0:%zd", i, write_len);
    }
    start_index = strip_index_list_pair_[i * 2 + 1];
  }

  // Write whatever remains after the last stripped range
  const ssize_t tail_len = count - (ssize_t)start_index;
  if (tail_len > 0) {
    void *write_buf = (void *)((unsigned char *)buf + start_index);
    total_write += FullyWrite(fd, write_buf, (size_t)tail_len);
  }

  hook_write_serial_num_++;

  if (VERBOSE_LOG && total_write != (size_t)count) {
    __android_log_print(ANDROID_LOG_INFO, LOG_TAG,
                        "hook write, hprof strip happens");
  }

  return count;
}
具体的细节,这里就不展开说了,感兴趣的可以研究下。
3、分析 hprof
这里调用 HeapAnalysisService.startAnalysisService 启动 HeapAnalysisService 服务,对 hprof 进行分析。
// Service entry point: runs the analysis steps in order, reports the result to the
// ResultReceiver carried by the intent, then exits the process.
override fun onHandleIntent(intent: Intent?) {
val resultReceiver = intent?.getParcelableExtra<ResultReceiver>(Info.RESULT_RECEIVER)
val hprofFile = intent?.getStringExtra(Info.HPROF_FILE)
val jsonFile = intent?.getStringExtra(Info.JSON_FILE)
val rootPath = intent?.getStringExtra(Info.ROOT_PATH)
OOMFileManager.init(rootPath)
kotlin.runCatching {
// 3.1 Build the HeapGraph from the hprof file
buildIndex(hprofFile)
}.onFailure {
...
}
// 3.2 Initialize the HeapReport data
buildJson(intent)
kotlin.runCatching {
// 3.3 Walk the heap image and collect objects that may be leaking
filterLeakingObjects()
}.onFailure {
...
}
kotlin.runCatching {
// 3.4 Find GC-root paths for the objects collected in 3.3 and add them to the HeapReport
findPathsToGcRoot()
}.onFailure {
...
}
// 3.5 Serialize the HeapReport to JSON and write it to jsonFile
fillJsonFile(jsonFile)
// Tell OOMMonitor that the analysis has finished
resultReceiver?.send(AnalysisReceiver.RESULT_CODE_OK, null)
// Terminate the current service process
System.exit(0);
}
3.1 通过 hprof 构建 HeapGraph 对象
// Opens the hprof file as a HeapGraph (stored in mHeapGraph) and logs how long it took.
private fun buildIndex(hprofFile: String?) {
...
measureTimeMillis {
// Create the HeapGraph, passing the set of GC root record tags listed below
mHeapGraph = File(hprofFile).openHeapGraph(null,
setOf(HprofRecordTag.ROOT_JNI_GLOBAL,
HprofRecordTag.ROOT_JNI_LOCAL,
HprofRecordTag.ROOT_NATIVE_STACK,
HprofRecordTag.ROOT_STICKY_CLASS,
HprofRecordTag.ROOT_THREAD_BLOCK,
HprofRecordTag.ROOT_THREAD_OBJECT));
}.also {
MonitorLog.i(TAG, "build index cost time: $it")
}
}
3.2 初始化 HeapReport 数据
/**
 * Fills `mLeakModel.runningInfo` from the string extras of [intent] and from the fd and
 * thread dump files, then deletes those dump files.
 *
 * Every extra is read null-safely, so a null intent yields a RunningInfo whose
 * intent-backed fields are all null.
 */
private fun buildJson(intent: Intent?) {
    // Null-tolerant lookup of a string extra
    fun extra(key: String): String? = intent?.getStringExtra(key)

    val info = HeapReport.RunningInfo()

    info.jvmMax = extra(Info.JAVA_MAX_MEM)
    info.jvmUsed = extra(Info.JAVA_USED_MEM)
    info.threadCount = extra(Info.THREAD)
    info.fdCount = extra(Info.FD)

    info.vss = extra(Info.VSS)
    info.pss = extra(Info.PSS)
    info.rss = extra(Info.RSS)

    info.sdkInt = extra(Info.SDK)
    info.manufacture = extra(Info.MANUFACTURE)
    info.buildModel = extra(Info.MODEL)

    info.usageSeconds = extra(Info.USAGE_TIME)
    info.currentPage = extra(Info.CURRENT_PAGE)
    info.nowTime = extra(Info.TIME)

    info.deviceMemTotal = extra(Info.DEVICE_MAX_MEM)
    info.deviceMemAvaliable = extra(Info.DEVICE_AVA_MEM)

    info.dumpReason = extra(Info.REASON)

    MonitorLog.i(TAG, "handle Intent, fdCount:${info.fdCount} pss:${info.pss} rss:${info.rss} vss:${info.vss} " +
        "threadCount:${info.threadCount}")

    // Pull in the dumps written by the trackers, when present
    info.fdList = createDumpFile(fdDumpDir).takeIf { it.exists() }?.readLines()
    info.threadList = createDumpFile(threadDumpDir).takeIf { it.exists() }?.readLines()

    // The dump files have been consumed; remove them
    createDumpFile(fdDumpDir).delete()
    createDumpFile(threadDumpDir).delete()

    mLeakModel.runningInfo = info
}
3.3 遍历快照所有class,找出可能引起泄露的对象
/**
* Walks the heap image and collects suspicious objects.
*
* GC paths are computed for:
* 1. Activities that are already destroyed or finished
* 2. Fragments whose fragment manager is null
* 3. Windows that are already destroyed
* 4. Bitmaps larger than the threshold
* 5. Primitive arrays larger than the threshold
* 6. Any class whose object count exceeds the threshold
*
*
* Key classes recorded:
* object counts
* 1. Primitive arrays
* 2. Bitmap
* 3. NativeAllocationRegistry
* 4. Any class whose objects exceed the threshold
*
*
* Large objects recorded:
* object sizes
* 1. Bitmap
* 2. Primitive arrays
*
* NOTE(review): in the code below, windows and native allocation classes are only
* counted, and big object arrays are also collected; the list above may be out of date.
*/
private fun filterLeakingObjects() {
val startTime = System.currentTimeMillis()
MonitorLog.i(TAG, "filterLeakingObjects " + Thread.currentThread())
val activityHeapClass = mHeapGraph.findClassByName(ACTIVITY_CLASS_NAME)
val fragmentHeapClass = mHeapGraph.findClassByName(ANDROIDX_FRAGMENT_CLASS_NAME)
?: mHeapGraph.findClassByName(NATIVE_FRAGMENT_CLASS_NAME)
?: mHeapGraph.findClassByName(SUPPORT_FRAGMENT_CLASS_NAME)
val bitmapHeapClass = mHeapGraph.findClassByName(BITMAP_CLASS_NAME)
val nativeAllocationHeapClass = mHeapGraph.findClassByName(NATIVE_ALLOCATION_CLASS_NAME)
val nativeAllocationThunkHeapClass = mHeapGraph.findClassByName(NATIVE_ALLOCATION_CLEANER_THUNK_CLASS_NAME)
val windowClass = mHeapGraph.findClassByName(WINDOW_CLASS_NAME)
// Cache of the two hierarchy ids per instance class id, so the hierarchy is walked once per class
val classHierarchyMap = mutableMapOf<Long, Pair<Long, Long>>()
// Object counters per class id
val classObjectCounterMap = mutableMapOf<Long, ObjectCounter>()
// Walk every instance in the heap image
for (instance in mHeapGraph.instances) {
if (instance.isPrimitiveWrapper) {
continue
}
// The HashMap cache plus looking at only two hierarchy positions speeds up the class membership test
// superId1: id of the hierarchy entry at index size-2, 0L when absent (presumably the class right below Object - confirm)
// superId4: id of the hierarchy entry at index size-5, 0L when absent
// Hierarchy depths follow AOSP; vendor changes (e.g. OPPO's Bitmap) are ignored for now
val instanceClassId = instance.instanceClassId
val (superId1, superId4) = if (classHierarchyMap[instanceClassId] != null) {
classHierarchyMap[instanceClassId]!!
} else {
val classHierarchyList = instance.instanceClass.classHierarchy.toList()
val first = classHierarchyList.getOrNull(classHierarchyList.size - 2)?.objectId ?: 0L
val second = classHierarchyList.getOrNull(classHierarchyList.size - 5)?.objectId ?: 0L
Pair(first, second).also { classHierarchyMap[instanceClassId] = it }
}
//Activity
if (activityHeapClass?.objectId == superId4) {
val destroyField = instance[ACTIVITY_CLASS_NAME, DESTROYED_FIELD_NAME]!!
val finishedField = instance[ACTIVITY_CLASS_NAME, FINISHED_FIELD_NAME]!!
if (destroyField.value.asBoolean!! || finishedField.value.asBoolean!!) {
val objectCounter = updateClassObjectCounterMap(classObjectCounterMap, instanceClassId, true)
MonitorLog.i(TAG, "activity name : " + instance.instanceClassName
+ " mDestroyed:" + destroyField.value.asBoolean
+ " mFinished:" + finishedField.value.asBoolean
+ " objectId:" + (instance.objectId and 0xffffffffL))
if (objectCounter.leakCnt <= SAME_CLASS_LEAK_OBJECT_PATH_THRESHOLD) {
mLeakingObjectIds.add(instance.objectId)
mLeakReasonTable[instance.objectId] = "Activity Leak"
MonitorLog.i(OOM_ANALYSIS_TAG,
instance.instanceClassName + " objectId:" + instance.objectId)
}
}
continue
}
//Fragment
if (fragmentHeapClass?.objectId == superId1) {
val fragmentManager = instance[fragmentHeapClass.name, FRAGMENT_MANAGER_FIELD_NAME]
if (fragmentManager != null && fragmentManager.value.asObject == null) {
val mCalledField = instance[fragmentHeapClass.name, FRAGMENT_MCALLED_FIELD_NAME]
// A fragment counts as destroyed when mCalled is true and its fragment manager is null
val isLeak = mCalledField != null && mCalledField.value.asBoolean!!
val objectCounter = updateClassObjectCounterMap(classObjectCounterMap, instanceClassId, isLeak)
MonitorLog.i(TAG, "fragment name:" + instance.instanceClassName + " isLeak:" + isLeak)
if (objectCounter.leakCnt <= SAME_CLASS_LEAK_OBJECT_PATH_THRESHOLD && isLeak) {
mLeakingObjectIds.add(instance.objectId)
mLeakReasonTable[instance.objectId] = "Fragment Leak"
MonitorLog.i(OOM_ANALYSIS_TAG,
instance.instanceClassName + " objectId:" + instance.objectId)
}
}
continue
}
//Bitmap
if (bitmapHeapClass?.objectId == superId1) {
val fieldWidth = instance[BITMAP_CLASS_NAME, "mWidth"]
val fieldHeight = instance[BITMAP_CLASS_NAME, "mHeight"]
val width = fieldWidth!!.value.asInt!!
val height = fieldHeight!!.value.asInt!!
if (width * height >= DEFAULT_BIG_BITMAP) {
val objectCounter = updateClassObjectCounterMap(classObjectCounterMap, instanceClassId, true)
MonitorLog.e(TAG, "suspect leak! bitmap name: ${instance.instanceClassName}" +
" width: ${width} height:${height}")
if (objectCounter.leakCnt <= SAME_CLASS_LEAK_OBJECT_PATH_THRESHOLD) {
mLeakingObjectIds.add(instance.objectId)
mLeakReasonTable[instance.objectId] = "Bitmap Size Over Threshold, ${width}x${height}"
MonitorLog.i(OOM_ANALYSIS_TAG,
instance.instanceClassName + " objectId:" + instance.objectId)
// Add it to the large-object list of the JSON report
val leakObject = HeapReport.LeakObject().apply {
className = instance.instanceClassName
size = (width * height).toString()
extDetail = "$width x $height"
objectId = (instance.objectId and 0xffffffffL).toString()
}
mLeakModel.leakObjects.add(leakObject)
}
}
continue
}
//nativeallocation/NativeAllocationThunk/window
if (nativeAllocationHeapClass?.objectId == superId1
|| nativeAllocationThunkHeapClass?.objectId == superId1
|| windowClass?.objectId == superId1) {
updateClassObjectCounterMap(classObjectCounterMap, instanceClassId, false)
}
}
// Add the tracked classes and their instance counts to the JSON report
for ((instanceId, objectCounter) in classObjectCounterMap) {
val leakClass = HeapReport.ClassInfo().apply {
val heapClass = mHeapGraph.findObjectById(instanceId).asClass
className = heapClass?.name
instanceCount = objectCounter.allCnt.toString()
MonitorLog.i(OOM_ANALYSIS_TAG, "leakClass.className: $className leakClass.objectCount: $instanceCount")
}
mLeakModel.classInfos.add(leakClass)
}
// Look for primitive arrays at or above the size threshold
val primitiveArrayIterator = mHeapGraph.primitiveArrays.iterator()
while (primitiveArrayIterator.hasNext()) {
val primitiveArray = primitiveArrayIterator.next()
val arraySize = primitiveArray.recordSize
if (arraySize >= DEFAULT_BIG_PRIMITIVE_ARRAY) {
val arrayName = primitiveArray.arrayClassName
val typeName = primitiveArray.primitiveType.toString()
MonitorLog.e(OOM_ANALYSIS_TAG,
"uspect leak! primitive arrayName:" + arrayName
+ " size:" + arraySize + " typeName:" + typeName
+ ", objectId:" + (primitiveArray.objectId and 0xffffffffL)
+ ", toString:" + primitiveArray.toString())
mLeakingObjectIds.add(primitiveArray.objectId)
mLeakReasonTable[primitiveArray.objectId] = "Primitive Array Size Over Threshold, ${arraySize}"
val leakObject = HeapReport.LeakObject().apply {
className = arrayName
size = arraySize.toString()
objectId = (primitiveArray.objectId and 0xffffffffL).toString()
}
mLeakModel.leakObjects.add(leakObject)
}
}
// Look for object arrays at or above the size threshold
val objectArrayIterator = mHeapGraph.objectArrays.iterator()
while (objectArrayIterator.hasNext()) {
val objectArray = objectArrayIterator.next()
val arraySize = objectArray.recordSize
if (arraySize >= DEFAULT_BIG_OBJECT_ARRAY) {
val arrayName = objectArray.arrayClassName
MonitorLog.i(OOM_ANALYSIS_TAG,
"object arrayName:" + arrayName + " objectId:" + objectArray.objectId)
mLeakingObjectIds.add(objectArray.objectId)
val leakObject = HeapReport.LeakObject().apply {
className = arrayName
size = arraySize.toString()
objectId = (objectArray.objectId and 0xffffffffL).toString()
}
mLeakModel.leakObjects.add(leakObject)
}
}
val endTime = System.currentTimeMillis()
mLeakModel.runningInfo?.filterInstanceTime = ((endTime - startTime).toFloat() / 1000).toString()
MonitorLog.i(OOM_ANALYSIS_TAG, "filterLeakingObjects time:" + 1.0f * (endTime - startTime) / 1000)
}
3.4 对 3.3 中的class遍历,找出泄露的对象,添加到 HeapReport
/**
 * Finds GC-root paths for every id in `mLeakingObjectIds` and records them in
 * `mLeakModel.gcPaths`.
 *
 * Every application leak is recorded. For library leaks only the first one is recorded,
 * which is what the original loop did through its unconditional trailing `break`.
 * Each recorded path consists of one item per reference on the trace, followed by an
 * item for the leaking object itself. The elapsed time in seconds is stored in
 * `runningInfo.findGCPathTime`.
 */
private fun findPathsToGcRoot() {
    val startTime = System.currentTimeMillis()

    val heapAnalyzer = HeapAnalyzer(
        OnAnalysisProgressListener { step: OnAnalysisProgressListener.Step ->
            MonitorLog.i(TAG, "step:" + step.name + ", leaking obj size:" + mLeakingObjectIds.size)
        }
    )
    val findLeakInput = FindLeakInput(mHeapGraph, AndroidReferenceMatchers.appDefaults,
        false, mutableListOf())
    val (applicationLeaks, libraryLeaks) = with(heapAnalyzer) {
        findLeakInput.findLeaks(mLeakingObjectIds)
    }

    MonitorLog.i(OOM_ANALYSIS_TAG,
        "---------------------------Application Leak---------------------------------------")

    // Record application leaks
    MonitorLog.i(OOM_ANALYSIS_TAG, "ApplicationLeak size:" + applicationLeaks.size)
    for (applicationLeak in applicationLeaks) {
        // Only the first trace of each leak group is reported
        val (gcRootType, referencePath, leakTraceObject) = applicationLeak.leakTraces[0]
        // NOTE(review): ids without an entry in mLeakReasonTable produce the literal
        // string "null" here - confirm that is what report consumers expect.
        leakTraceObject.leakingStatusReason = mLeakReasonTable[leakTraceObject.objectId].toString()

        val gcPath = HeapReport.GCPath().apply {
            this.instanceCount = applicationLeak.leakTraces.size
            this.leakReason = leakTraceObject.leakingStatusReason
            this.gcRoot = gcRootType.description
            this.signature = applicationLeak.signature
        }
        mLeakModel.gcPaths.add(gcPath)

        // One path item per reference on the trace
        for (traceStep in referencePath) {
            val originClassName = traceStep.originObject.className
            val displayName = traceStep.referenceDisplayName
            val stepType = traceStep.referenceType.toString()
            val owningClassName = traceStep.owningClassName
            gcPath.path.add(HeapReport.GCPath.PathItem().apply {
                // Array element references ("[...]") are reported by class name only
                this.reference = if (displayName.startsWith("[")) {
                    originClassName
                } else {
                    "$originClassName.$displayName"
                }
                this.referenceType = stepType
                this.declaredClass = owningClassName
            })
        }

        // Finally the leaking object itself
        gcPath.path.add(HeapReport.GCPath.PathItem().apply {
            reference = leakTraceObject.className
            referenceType = leakTraceObject.typeName
        })
    }

    // Record library leaks
    MonitorLog.i(OOM_ANALYSIS_TAG, "LibraryLeak size:" + libraryLeaks.size)
    // NOTE(review): only the first library leak is recorded, preserving the original
    // loop's trailing `break`; confirm whether that limit is intentional.
    libraryLeaks.firstOrNull()?.let { libraryLeak ->
        val (gcRootType, referencePath, leakTraceObject) = libraryLeak.leakTraces[0]
        leakTraceObject.leakingStatusReason = mLeakReasonTable[leakTraceObject.objectId].toString()

        val gcPath = HeapReport.GCPath().apply {
            this.instanceCount = libraryLeak.leakTraces.size
            this.leakReason = leakTraceObject.leakingStatusReason
            this.signature = libraryLeak.signature
            this.gcRoot = gcRootType.description
        }
        mLeakModel.gcPaths.add(gcPath)

        // One path item per reference on the trace
        for (traceStep in referencePath) {
            val originClassName = traceStep.originObject.className
            val displayName = traceStep.referenceDisplayName
            val stepType = traceStep.referenceType.toString()
            val owningClassName = traceStep.owningClassName
            gcPath.path.add(HeapReport.GCPath.PathItem().apply {
                // Array element references ("[...]") are reported by class name only
                this.reference = if (displayName.startsWith("[")) {
                    originClassName
                } else {
                    "$originClassName.$displayName"
                }
                this.referenceType = stepType
                this.declaredClass = owningClassName
            })
        }

        // Finally the leaking object itself
        gcPath.path.add(HeapReport.GCPath.PathItem().apply {
            reference = leakTraceObject.className
            referenceType = leakTraceObject.typeName
        })
    }

    val endTime = System.currentTimeMillis()
    // Null-safe, matching how filterLeakingObjects stores filterInstanceTime
    mLeakModel.runningInfo?.findGCPathTime = ((endTime - startTime).toFloat() / 1000).toString()
}
3.5 把 HeapReport 转化为 json 写入到 jsonFile
/**
 * Serializes `mLeakModel` to JSON and writes it to [jsonFile].
 *
 * When [jsonFile] is null nothing is written and that is logged explicitly, instead of
 * reporting a successful write. An [IOException] during the write is logged and
 * swallowed.
 *
 * @param jsonFile path of the report file, or null when no report path was provided.
 */
private fun fillJsonFile(jsonFile: String?) {
    if (jsonFile == null) {
        MonitorLog.e(OOM_ANALYSIS_TAG, "JSON write skipped: jsonFile is null")
        return
    }

    val json = Gson().toJson(mLeakModel)
    try {
        File(jsonFile).writeText(json)
        MonitorLog.i(OOM_ANALYSIS_TAG, "JSON write success: $json")
    } catch (e: IOException) {
        e.printStackTrace()
        MonitorLog.i(OOM_ANALYSIS_TAG, "JSON write exception: $json", true)
    }
}
总结:这里主要使用了 shark 对 hprof 的解析及分析。主要的流程:
- 通过 hprof 构建 HeapGraph 对象;
- 初始化 HeapReport 的实例 mLeakModel 对象的数据;
- 遍历镜像所有class,找出可能引起泄露的对象。这里主要有 Activity、Fragment、Bitmap、nativeallocation 、基本类型数组 以及 对象数组;
- 对 3.3 中筛选出来的class遍历,找出泄露的对象,然后把数据添加到 mLeakModel ;
- 把 mLeakModel 转化为 json 写入到 jsonFile;
- 通知 OOMMonitor,分析结束,可以处理后续逻辑;
- 结束当前的服务进程;
3、总结
整个流程,还是挺清晰的,对于一些代码细节,感兴趣的可以细读,例如多线程与fork()、从 hprof 中查找对象、找出对象泄露等,都有很多细节在,涉及到的知识点也比较多。
而在 KOOM 中,使用了 xhook 实现 hook ,使用 shark 实现 hprof 分析。想深入探索的可以研究下它们的源码。
文中如果有讲述错误或者不当的地方,希望佬们帮忙指正,非常感谢!