问题背景
近期收到某大厂反馈,他们的应用程序在我们的机型中频繁出现如下崩溃信息。这个问题起初由其他同事处理,后来同事来咨询如何跟踪某个管道 FD 被关闭的堆栈,看了他们的文档介绍后,发现这东西咋那么眼熟。
pid: 21992, tid: 28371, name: Signal Catcher >>> a.b.c <<<
uid: 10283
tagged_addr_ctrl: 0000000000000001 (PR_TAGGED_ADDR_ENABLE)
pac_enabled_keys: 000000000000000f (PR_PAC_APIAKEY, PR_PAC_APIBKEY, PR_PAC_APDAKEY, PR_PAC_APDBKEY)
esr: 0000000092000006 (Data Abort Exception 0x24)
signal 6 (SIGABRT), code -1 (SI_QUEUE), fault addr --------
Abort message: 'Check failed: niceness != -1 || errno == 0 No such process tid:28373'
x0 0000000000000000 x1 0000000000006ed3 x2 0000000000000006 x3 000000774e92bd40
x4 00000000000efe20 x5 00000000000efe20 x6 00000000000efe20 x7 0000000000000001
x8 00000000000000f0 x9 000001ff00000020 x10 000000ff00000020 x11 000000000012800c
x12 000000000000cc32 x13 000000005fff4000 x14 0000000000000001 x15 00000000000000f0
x16 0000007a1e58a170 x17 0000007a1e570840 x18 00000076dde34000 x19 00000000000055e8
x20 0000000000006ed3 x21 00000000ffffffff x22 0000007761e146c8 x23 0000007855603010
x24 0000007761e13000 x25 0000000000fffff9 x26 000000005a000000 x27 0000000000000000
x28 b4000077e5616d10 x29 000000774e92bdc0
lr 0000007a1e510c3c sp 000000774e92bd40 pc 0000007a1e510c60 pst 0000000000001000
esr 0000000092000006
26 total frames
backtrace:
#00 pc 0000000000075c60 /apex/com.android.runtime/lib64/bionic/libc.so (abort+160) (BuildId: c56349c5b531d74a1e2fa0bafba78e5c)
#01 pc 000000000091ee30 /apex/com.android.art/lib64/libart.so (art::Runtime::Abort(char const*)+1008) (BuildId: 358ba270e15145fce121992683a577bc)
#02 pc 0000000000016134 /apex/com.android.art/lib64/libbase.so (android::base::SetAborter(std::__1::function<void (char const*)>&&)::$_0::__invoke(char const*)+68) (BuildId: e3a4bdc6221ada5240a090255fdc022d)
#03 pc 0000000000015690 /apex/com.android.art/lib64/libbase.so (android::base::LogMessage::~LogMessage()+540) (BuildId: e3a4bdc6221ada5240a090255fdc022d)
#04 pc 00000000004b0264 /apex/com.android.art/lib64/libart.so (art::Thread::GetNativeNiceness() const+412) (BuildId: 358ba270e15145fce121992683a577bc)
#05 pc 0000000000456f24 /apex/com.android.art/lib64/libart.so (art::Thread::DumpState(std::__1::basic_ostream<char, std::__1::char_traits<char>>&, art::Thread const*, int)+3768) (BuildId: 358ba270e15145fce121992683a577bc)
#06 pc 0000000000455a04 /apex/com.android.art/lib64/libart.so (art::Thread::Dump(std::__1::basic_ostream<char, std::__1::char_traits<char>>&, unwindstack::AndroidLocalUnwinder&, bool, bool) const+56) (BuildId: 358ba270e15145fce121992683a577bc)
#07 pc 00000000004558f4 /apex/com.android.art/lib64/libart.so (art::DumpCheckpoint::Run(art::Thread*)+116) (BuildId: 358ba270e15145fce121992683a577bc)
#08 pc 00000000002c1a14 /apex/com.android.art/lib64/libart.so (art::ThreadList::RunCheckpoint(art::Closure*, art::Closure*, bool, bool)+1140) (BuildId: 358ba270e15145fce121992683a577bc)
#09 pc 0000000000459fc8 /apex/com.android.art/lib64/libart.so (art::ThreadList::Dump(std::__1::basic_ostream<char, std::__1::char_traits<char>>&, bool)+216) (BuildId: 358ba270e15145fce121992683a577bc)
#10 pc 0000000000922b50 /apex/com.android.art/lib64/libart.so (art::AbortState::Dump(std::__1::basic_ostream<char, std::__1::char_traits<char>>&) const+208) (BuildId: 358ba270e15145fce121992683a577bc)
#11 pc 000000000091ee78 /apex/com.android.art/lib64/libart.so (art::Runtime::Abort(char const*)+1080) (BuildId: 358ba270e15145fce121992683a577bc)
#12 pc 0000000000016134 /apex/com.android.art/lib64/libbase.so (android::base::SetAborter(std::__1::function<void (char const*)>&&)::$_0::__invoke(char const*)+68) (BuildId: e3a4bdc6221ada5240a090255fdc022d)
#13 pc 0000000000015690 /apex/com.android.art/lib64/libbase.so (android::base::LogMessage::~LogMessage()+540) (BuildId: e3a4bdc6221ada5240a090255fdc022d)
#14 pc 00000000004b0264 /apex/com.android.art/lib64/libart.so (art::Thread::GetNativeNiceness() const+412) (BuildId: 358ba270e15145fce121992683a577bc)
#15 pc 0000000000456b3c /apex/com.android.art/lib64/libart.so (art::Thread::DumpState(std::__1::basic_ostream<char, std::__1::char_traits<char>>&, art::Thread const*, int)+2768) (BuildId: 358ba270e15145fce121992683a577bc)
#16 pc 0000000000455a04 /apex/com.android.art/lib64/libart.so (art::Thread::Dump(std::__1::basic_ostream<char, std::__1::char_traits<char>>&, unwindstack::AndroidLocalUnwinder&, bool, bool) const+56) (BuildId: 358ba270e15145fce121992683a577bc)
#17 pc 00000000004558f4 /apex/com.android.art/lib64/libart.so (art::DumpCheckpoint::Run(art::Thread*)+116) (BuildId: 358ba270e15145fce121992683a577bc)
#18 pc 00000000002c1a14 /apex/com.android.art/lib64/libart.so (art::ThreadList::RunCheckpoint(art::Closure*, art::Closure*, bool, bool)+1140) (BuildId: 358ba270e15145fce121992683a577bc)
#19 pc 0000000000459fc8 /apex/com.android.art/lib64/libart.so (art::ThreadList::Dump(std::__1::basic_ostream<char, std::__1::char_traits<char>>&, bool)+216) (BuildId: 358ba270e15145fce121992683a577bc)
#20 pc 0000000000459a58 /apex/com.android.art/lib64/libart.so (art::ThreadList::DumpForSigQuit(std::__1::basic_ostream<char, std::__1::char_traits<char>>&)+376) (BuildId: 358ba270e15145fce121992683a577bc)
#21 pc 0000000000458cd0 /apex/com.android.art/lib64/libart.so (art::Runtime::DumpForSigQuit(std::__1::basic_ostream<char, std::__1::char_traits<char>>&)+48) (BuildId: 358ba270e15145fce121992683a577bc)
#22 pc 00000000005097e0 /apex/com.android.art/lib64/libart.so (art::SignalCatcher::HandleSigQuit()+632) (BuildId: 358ba270e15145fce121992683a577bc)
#23 pc 00000000004713a4 /apex/com.android.art/lib64/libart.so (art::SignalCatcher::Run(void*)+1388) (BuildId: 358ba270e15145fce121992683a577bc)
#24 pc 0000000000086f60 /apex/com.android.runtime/lib64/bionic/libc.so (__pthread_start(void*) (.__uniq.67847048707805468364044055584648682506)+184) (BuildId: c56349c5b531d74a1e2fa0bafba78e5c)
#25 pc 0000000000079690 /apex/com.android.runtime/lib64/bionic/libc.so (__start_thread+68) (BuildId: c56349c5b531d74a1e2fa0bafba78e5c)
其 abort 原因是函数 GetNativeNiceness 发现该 art::Thread 对应的线程已经不存在。Signal Catcher 的线程号为 28371,出错的线程号为 28373,两者相近,基本可以判断后者是虚拟机创建的线程,诸如“Metrics Background Reporting Thread”,“perfetto_hprof_listener”,“ADB-JDWP Connection Control Thread” 等。
该函数 art::Thread::GetNativeNiceness() 替代了以前的 art::Thread::GetNativePriority()。
问题分析
安装反馈方提供的应用程序并启动后抓取现场,可以看到线程 perfetto_hprof_listener 已经不存在了,但是它的 art::Thread 对象依旧挂在 thread_list_ 列表中,意味着该线程退出前没有调用 Unregister() 将自己从列表中移除,于是调用 GetNativeNiceness 函数时触发 abort 中断程序。
core-parser> t -a
ID TID STATUS ADDRESS NAME
*1 11589 Native 0xb400007472617010 "main"
2 12468 WaitingInMainSignalCatcherLoop 0x747263b820 "Signal Catcher"
3 12469 Native 0x747261fb20 "perfetto_hprof_listener" (NOT EXIST THREAD)
4 12470 WaitingInMainDebuggerLoop 0x747261c380 "ADB-JDWP Connection Control Thread"
5 12472 Native 0x747261a7b0 "Jit thread pool worker thread 0"
6 12474 Waiting 0x7472628630 "ReferenceQueueDaemon"
7 12473 WaitingForTaskProcessor 0x74726216f0 "HeapTaskDaemon"
8 12475 Waiting 0x74726232c0 "FinalizerDaemon"
9 12476 Sleeping 0x7472639c50 "FinalizerWatchdogDaemon"
可以看到线程 perfetto_hprof_listener 的代码仅调用了 AttachCurrentThread 函数将其添加到虚拟机管理,没有在异常退出处调用 DetachCurrentThread 将其从 thread_list_ 列表中反注册移除。
从代码的角度,不难发现只有在写管道 g_signal_pipe_fds[1] 被关闭的情况下,才满足条件退出循环。看到这个变量符号那一刻,想起《关于 Android15 GKI2407R40 导致梆梆加固软件崩溃》,这不就是当时那个问题里看到的代码片段吗……
调试取证
1 年前已经存在这样的逻辑,我想现在发生类似的事情,这部分代码应该还在,于是抓个现场证据看看。已知条件是:对方通过读取 g_signal_pipe_fds 符号地址获取写管道 fd 的值,再调用 close 进行关闭操作。
注入 libopencore.so
# setenforce 0
./data/core-parser -p `pidof zygote64`
core-parser> remote hook --inject -l /data/libopencore.so
arm64: hook inject "/data/libopencore.so"
arm64: hook found "dlopen" address: 0x733a131020
arm64: target process current sp: 0x7ffe9f02a0
arm64: call dlopen(0x7ffe9f0280 "/data/libopencore.so", 0x2)
arm64: return 0xa4eeb80a85c4950f
修改 zygote64 内存
我们可以对 close 函数进行改造,如果 fd == g_signal_pipe_fds[1],则触发 segv 生成 core。
core-parser> disas _ZN14perfetto_hprof17g_signal_pipe_fdsE
LIB: /apex/com.android.art/lib64/libperfetto_hprof.so
SYMBOL: _ZN14perfetto_hprof17g_signal_pipe_fdsE
* perfetto_hprof::g_signal_pipe_fds: 0x706ed634b8
core-parser> rd 0x706ed634b8
706ed634b8: 0000000000000000 ........
core-parser>
新增的汇编代码逻辑如下,将改造后的 close 函数汇编代码写入 zygote64 内存中。
mov x2, 0x34b8
movk x2, 0x6ed6, lsl #16
movk x2, 0x0070, lsl #32
ldr w3, [x2, #4]
cmp w0, w3
b.ne 0x8
ldr x1, [x1] // segv dead
core-parser> rd 7343fc6100 -e 7343fc6164 -i
0x7343fc6100: a9be7bfd | stp x29, x30, [sp, #-0x20]!
0x7343fc6104: f9000bf3 | str x19, [sp, #0x10]
0x7343fc6108: 910003fd | mov x29, sp
0x7343fc610c: aa1f03e1 | mov x1, xzr
0x7343fc6110: d2869702 | mov x2, #0x34b8
0x7343fc6114: f2addac2 | movk x2, #0x6ed6, lsl #16
0x7343fc6118: f2c00e02 | movk x2, #0x70, lsl #32
0x7343fc611c: b9400443 | ldr w3, [x2, #4]
0x7343fc6120: 6b03001f | cmp w0, w3
0x7343fc6124: 54000041 | b.ne 0x7343fc612c
0x7343fc6128: f9400021 | ldr x1, [x1]
0x7343fc612c: 97fffa35 | bl 0x7343fc4a00
0x7343fc6130: 3100041f | cmn w0, #1
0x7343fc6134: 54000121 | b.ne 0x7343fc6158
0x7343fc6138: 2a0003f3 | mov w19, w0
0x7343fc613c: 97fff7fd | bl 0x7343fc4130
0x7343fc6140: aa0003e8 | mov x8, x0
0x7343fc6144: 2a1303e0 | mov w0, w19
0x7343fc6148: b9400108 | ldr w8, [x8]
0x7343fc614c: 7100111f | cmp w8, #4
0x7343fc6150: 54000041 | b.ne 0x7343fc6158
0x7343fc6154: 2a1f03e0 | mov w0, wzr
0x7343fc6158: f9400bf3 | ldr x19, [sp, #0x10]
0x7343fc615c: a8c27bfd | ldp x29, x30, [sp], #0x20
0x7343fc6160: d65f03c0 | ret
...
core-parser> disas close
LIB: /apex/com.android.runtime/lib64/bionic/libc.so
close: [7343f4daf0, 7343f4db38]
0x7343f4daf0: 1401e184 | b 0x7343fc6100
0x7343f4daf4: f9000bf3 | str x19, [sp, #0x10]
0x7343f4daf8: 910003fd | mov x29, sp
0x7343f4dafc: aa1f03e1 | mov x1, xzr
0x7343f4db00: 9401dbc0 | bl 0x7343fc4a00
0x7343f4db04: 3100041f | cmn w0, #1
0x7343f4db08: 54000121 | b.ne 0x7343f4db2c
0x7343f4db0c: 2a0003f3 | mov w19, w0
0x7343f4db10: 9401d988 | bl 0x7343fc4130
0x7343f4db14: aa0003e8 | mov x8, x0
0x7343f4db18: 2a1303e0 | mov w0, w19
0x7343f4db1c: b9400108 | ldr w8, [x8]
0x7343f4db20: 7100111f | cmp w8, #4
0x7343f4db24: 54000041 | b.ne 0x7343f4db2c
0x7343f4db28: 2a1f03e0 | mov w0, wzr
0x7343f4db2c: f9400bf3 | ldr x19, [sp, #0x10]
0x7343f4db30: a8c27bfd | ldp x29, x30, [sp], #0x20
0x7343f4db34: d65f03c0 | ret
将 close 的第一条指令修改为直接跳转到 0x7343fc6100(hook_close)位置。
启动应用程序
# killall -9 usap64 // 删掉 app 进程缓存
01-28 18:58:15.649 13270 13270 I opencore: Init inject opencore-1.4.16 environment..
01-28 19:00:24.215 5895 5895 I opencore: Wait (5984) coredump
01-28 19:00:24.217 5984 5984 I opencore: Coredump /sdcard/Android/data/a.b.c/files/core.a.b.c_5895_1769598024 ...
01-28 19:00:29.501 5984 5984 I opencore: Finish done.
core-parser> bt
"main" sysTid=5895 Native
| group="main" daemon=0 prio=5 target=0x0 uncaught_exception=0x0
| tid=1 sCount=0 flags=0 obj=0x728b83e0 self=0xb40000712140d7b0 env=0xb4000071b1414710
| stack=0x7ffe1f5000-0x7ffe1f7000 stackSize=0x7ff000 handle=0x7357535118
| mutexes=0xb40000712140df50 held=
x0 0x000000000000005d x1 0x0000000000000000 x2 0x000000706ed634b8 x3 0x000000000000005d
x4 0xffffffffffffffff x5 0x0000007ffe9ed650 x6 0x0000000000000039 x7 0x7f7f7f7f7f7f7f7f
x8 0x0000007ffe9ed243 x9 0x0000000000000065 x10 0x0000000000000033 x11 0x0000000000000002
x12 0x0000007ffe9ecd14 x13 0xffffff80ffffffd8 x14 0x0000000000000010 x15 0x00000000ffffffa5
x16 0x0000006ff4a2ea68 x17 0x0000007343f4daf0 x18 0x0000007356464000 x19 0x0000006ff4a39930
x20 0x0000006ff4a39000 x21 0x000000706ed634b8 x22 0x000000000000005d x23 0x0000000000000045
x24 0x0000007ffe9ed240 x25 0x0000006ff4a39000 x26 0x0000006ff4a39000 x27 0x0000006ff4a30000
x28 0x0000006ff4a393e0 fp 0x0000007ffe9ed0d0
lr 0x0000006ff4967de4 sp 0x0000007ffe9ed0d0 pc 0x0000007343fc6128 pst 0x0000000060001000
Native: #0 0000007343fc6128 /apex/com.android.runtime/lib64/bionic/libc.so+0xf2128
Native: #1 0000006ff4967de0
Native: #2 0000006ff494e9bc
Native: #3 0000006ff49482d4
Native: #4 000000707d53966c art::JavaVMExt::LoadNativeLibrary(_JNIEnv*, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, _jobject*, _jclass*, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >*)+0x4ec
Native: #5 000000706b8ee480 JVM_NativeLoad+0x170
Native: #6 00000000715d2078 art_jni_trampoline+0x98
JavaKt: #00 0000000000000000 java.lang.Runtime.nativeLoad
JavaKt: #01 000000707cb0c56c java.lang.Runtime.loadLibrary0
JavaKt: #02 000000707cb0c4f8 java.lang.Runtime.loadLibrary0
JavaKt: #03 000000707cb17b90 java.lang.System.loadLibrary
JavaKt: #04 0000006ff947cafc com.AppGuard.AppGuard.XLVQB.load
JavaKt: #05 0000006ff947b74e com.AppGuard.AppGuard.NZMXG.instantiateClassLoader
JavaKt: #06 000000707c1516c6 android.app.LoadedApk.createOrUpdateClassLoaderLocked
JavaKt: #07 000000707c150378 android.app.LoadedApk.getResources
JavaKt: #08 000000707c0e0b30 android.app.ContextImpl.createAppContext
JavaKt: #09 000000707c0ae14e android.app.ActivityThread.handleBindApplication
JavaKt: #10 000000707c0aa8fc android.app.ActivityThread.-$$Nest$mhandleBindApplication
JavaKt: #11 000000707c0a5780 android.app.ActivityThread$H.handleMessage
JavaKt: #12 000000707afaabae android.os.Handler.dispatchMessage
JavaKt: #13 000000707afd3ed2 android.os.Looper.loopOnce
JavaKt: #14 000000707afd4800 android.os.Looper.loop
JavaKt: #15 000000707c0b3072 android.app.ActivityThread.main
JavaKt: #16 0000000000000000 java.lang.reflect.Method.invoke
JavaKt: #17 0000007079f6bbfa com.android.internal.os.RuntimeInit$MethodAndArgsCaller.run
JavaKt: #18 0000007079f70ae0 com.android.internal.os.ZygoteInit.main
core-parser>
core-parser> disas _ZN14perfetto_hprof17g_signal_pipe_fdsE
LIB: /apex/com.android.art/lib64/libperfetto_hprof.so
SYMBOL: _ZN14perfetto_hprof17g_signal_pipe_fdsE
* perfetto_hprof::g_signal_pipe_fds: 0x706ed634b8
core-parser> rd 0x706ed634b8
706ed634b8: 0000005d00000045 E...]...
core-parser>
core-parser> f 1
JavaKt: #01 000000707cb0c56c java.lang.Runtime.loadLibrary0(java.lang.ClassLoader, java.lang.Class, java.lang.String)
{
Location: /apex/com.android.art/javalib/core-oj.jar
art::ArtMethod: 0x6fc7bcf0
dex_pc_ptr: 0x707cb0c56c
quick_frame: 0x7ffe9ef810
frame_pc: 0x71384bd8
method_header: 0x71384a8c
DEX CODE:
0x707cb0c564: 020c | move-result-object v2
0x707cb0c566: 2107 | move-object v1, v2
0x707cb0c568: 0138 0010 | if-eqz v1, 0x707cb0c588 //+16
0x707cb0c56c: 3071 0dbb 0761 | invoke-static {v1, v6, v7}, java.lang.String java.lang.Runtime.nativeLoad(java.lang.String, java.lang.ClassLoader, java.lang.Class) // method@3515
{
v0 = w24 v1 = w26 v5 = w25 v6 = w22
v7 = w23 v8 = w24
}
OAT CODE:
0x71384bb4: 14000002 | b 0x71384bbc
0x71384bb8: aa0003fa | mov x26, x0
0x71384bbc: 340003da | cbz w26, 0x71384c34
0x71384bc0: aa1603e2 | mov x2, x22
0x71384bc4: aa1703e3 | mov x3, x23
0x71384bc8: aa1a03e1 | mov x1, x26
0x71384bcc: f0ff47a0 | adrp x0, 0x6fc7b000
0x71384bd0: 9135c000 | add x0, x0, #0xd70
0x71384bd4: f9400c1e | ldr x30, [x0, #0x18]
0x71384bd8: d63f03c0 | blr x30
0x71384bdc: 350000c0 | cbnz w0, 0x71384bf4
0x71384be0: aa0003fb | mov x27, x0
{
x19 = 0xb40000712140d7b0 x20 = 0x0000000000000000 x21 = 0xb40000712140d870 x22 = 0x00000000023bcc70
x23 = 0x00000000af2bdde0 x24 = 0x00000000023bec58 x25 = 0x000000006fa5cd10 x26 = 0x00000000023bef70
x27 = 0x0000000000000018 x28 = 0x0000007ffe9ef990 fp = 0x0000007ffe9ef96c lr = 0x0000000071384bdc
}
}
core-parser> p 0x00000000023bef70
Size: 0x70
Object Name: java.lang.String
[0x10] virutal char[] values = "/data/app/~~gKv2P9XhKhqKOOzrwlg7NQ==/a.b.c-SUPK3pUVM0I0SmBiF2mlGg==/lib/arm64/libAppGuard.so"
[0x0c] private int hash = 0
[0x08] private final int count = 96
// extends java.lang.Object
[0x04] private transient int shadow$_monitor_ = 0
[0x00] private transient java.lang.Class shadow$_klass_ = 0x6faa0a80
core-parser>
core-parser> vtor 0000006ff4967de0
* VIRTUAL: 0x6ff4967de0
* PHYSICAL: 0x9e501de0
* OFFSET: 0x34de0
* OR: 0x74bb701de0
* MMAP: 0x0
* OVERLAY: 0x0
[6ff4933000, 6ff4a24000) rwx 00000f1000 00000f1000 [] [*]
core-parser>
可以看到在加载 libAppGuard.so 的过程中,在匿名代码段 <anonymous:6ff4933000> 中调用了 close 函数关闭了 g_signal_pipe_fds[1] 管道,于是 perfetto_hprof_listener 线程异常退出。
匿名代码段
core-parser> rd 6ff4933000 -e 6ff4a24000 -f /data/6ff4933000.bin
Saved [/data/6ff4933000.bin].
core-parser>
该代码片段和一年前的一模一样,埋下了这个隐患,直到 ART 代码更新,新增 GetNativeNiceness 函数后开始发生错误。
后记
perfetto_hprof_listener 线程异常退出时没有将自己从 thread_list_ 列表中移除,这属于 ART 的 BUG,但是其修复依赖 Google Mainline 的更新,周期一般都挺长的,而应用端更新相对较快,期望更多开发者能尽快更新。转达不易,感谢相互告知!