前言
MTE、HWASAN 等内存检测方案在检测内存踩踏时采用的是标记(tag)设计,这种设计存在一个弊端:当踩踏内存的指针标记与被踩内存的标记恰好相同时,无法检测出来。这种情况下,可以结合业务场景判断是否能借助内存访问权限来定位问题,下面介绍如何通过数据结构膨胀来设置访问权限。
/// Example structure used throughout the article: three 64-bit members, of
/// which B is the one whose memory is observed to be corrupted.
struct S {
    uint64_t A;  ///< Member laid out before B.
    uint64_t B;  ///< Member under investigation.
    uint64_t C;  ///< Member laid out after B.
};
例如现有一个数据结构 S,其中有数据成员 A、B、C,现发现成员 B 的内存总是被破坏,我们需要找出改写成员 B 的位置。
结构膨胀
假设这个数据结构 S 的大小 < 4K,它在内存上有两种排布:落在同一个内存页(4K)内,或横跨两个内存页。由于访问权限只能以页为粒度设置,需要在被保护成员的前后各填充一页大小(0x1000 字节)的数据,使该成员所在的页内不再包含结构体的其他成员。
内存保护
假设 B 的地址为 0x7F,1234,1234,那么它所在页的起始地址为 0x7F,1234,1234 & ~0xFFF = 0x7F,1234,1000,B 的结束地址则是 0x7F,1234,1234 + sizeof(B)。需要保护的内存范围取决于 sizeof(B) 的值,即 B 是否落在同一页内,因此大小的计算公式为页上界 - 页下界,也就是结束地址向上对齐到 4K 的值减去起始地址向下对齐到 4K 的值。
AlignUp(0x7F,1234,1234 + sizeof(B), 4096) - AlignDown(0x7F,1234,1234, 4096)
Size = AlignUp(Ptr + sizeof(B), 4K) - AlignDown(Ptr, 4K)
// int mprotect(void *addr, size_t len, int prot);
// Read only
mprotect(AlignDown(Ptr, 4K), Size, PROT_READ)
// Read / Write
mprotect(AlignDown(Ptr, 4K), Size, PROT_READ | PROT_WRITE)
例子
// main.cpp
#include "GlobalLog.h"
#include <thread>
#include <dlfcn.h>
/// Thread body: appends a record to the global log every 10 ms, forever.
void push() {
    const auto interval = std::chrono::milliseconds(10);
    for (;;) {
        gLog.record();
        std::this_thread::sleep_for(interval);
    }
}
/// Thread body: once per second, loads libsecond.so, calls the function it
/// exports under the mangled name `_Z13unused_secondv`, then unloads the
/// library again. Never returns.
///
/// A failed dlopen() or dlsym() is skipped for that round instead of calling
/// through a null pointer.
void load() {
    using unused_second = void (*)();
    while (true) {
        std::this_thread::sleep_for(std::chrono::milliseconds(1000));
        void *handle = dlopen("/data/local/tmp/libsecond.so", RTLD_LAZY);
        if (handle == nullptr) {
            continue;  // Library could not be loaded; retry after the next sleep.
        }
        void *sym = dlsym(handle, "_Z13unused_secondv");
        // dlsym() yields a null pointer when the symbol is not found; calling
        // it would crash, so only invoke a resolved symbol.
        if (sym != nullptr) {
            reinterpret_cast<unused_second>(sym)();
        }
        dlclose(handle);
    }
}
int main() {
std::thread t1(push);
std::thread t2(load);
t1.join();
t2.join();
return 0;
}
// GlobalLog.h
#ifndef PADDINGSTRUCTTEST_GLOBALLOG_H
#define PADDINGSTRUCTTEST_GLOBALLOG_H
#include <string>
#include <deque>
#include <iostream>
// #define PADDING_CHECK
/// Small log container used to demonstrate the padding technique.
///
/// When PADDING_CHECK is defined, `mEntries` is surrounded by two 0x1000-byte
/// filler arrays so that the page(s) holding `mEntries` contain no other
/// member of this object. The declaration order of the data members is
/// therefore significant and must not be changed.
class GlobalLog {
public:
    /// Stores a copy of `n` as the log's name.
    GlobalLog(std::string n) : name(n) {}

    /// Appends one entry to the log (defined in GlobalLog.cpp).
    void record();

private:
    std::string name;
#ifdef PADDING_CHECK
    // Filler placed in front of mEntries; first byte is 's', the rest is zero.
    __attribute__((unused)) const char padding_start[0x1000] = {'s'};
#endif
    std::deque<std::string> mEntries;
#ifdef PADDING_CHECK
    // Filler placed behind mEntries; first byte is 'e', the rest is zero.
    __attribute__((unused)) const char padding_end[0x1000] = {'e'};
#endif
};
extern GlobalLog gLog;
#endif //PADDINGSTRUCTTEST_GLOBALLOG_H
// GlobalLog.cpp
#include "GlobalLog.h"
#include <stdint.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <iostream>
GlobalLog gLog("GlobalLog");
/// Rounds `x` down to a multiple of `n` by clearing its low bits.
/// The result is a multiple of `n` only when `n` is a power of two
/// (callers in this file pass 0x1000).
uint64_t align_down(uint64_t x, uint64_t n) {
    const uint64_t low_bits = n - 1;
    return x & ~low_bits;
}
/// Rounds `x` up to a multiple of `n`: adds `n - 1`, then rounds the sum
/// down with align_down(). A value that is already aligned is returned
/// unchanged.
uint64_t align_up(uint64_t x, uint64_t n) {
    const uint64_t bumped = x + (n - 1);
    return align_down(bumped, n);
}
/// Appends the fixed message "test_msg" to mEntries.
///
/// With PADDING_CHECK defined, the page range covering the mEntries member is
/// made writable only for the duration of the push_back and is set back to
/// read-only afterwards. Only the member object itself (sizeof(mEntries)) is
/// covered by the range. A failing mprotect() is reported on stderr, because a
/// silent failure would disable the check without any indication.
void GlobalLog::record() {
#ifdef PADDING_CHECK
    // Page-aligned range around mEntries; 0x1000 is the page size this file
    // assumes.
    const uint64_t guard_begin = align_down((uint64_t)&mEntries, 0x1000);
    const uint64_t guard_len =
        align_up((uint64_t)&mEntries + sizeof(mEntries), 0x1000) - guard_begin;

    // Open the write window for the legitimate writer.
    if (mprotect((void *)guard_begin, guard_len, PROT_READ | PROT_WRITE) != 0) {
        std::cerr << "GlobalLog::record: mprotect(PROT_READ | PROT_WRITE) failed\n";
    }
#endif
    mEntries.push_back("test_msg");
#ifdef PADDING_CHECK
    // Close the window again: any later write to these pages faults.
    if (mprotect((void *)guard_begin, guard_len, PROT_READ) != 0) {
        std::cerr << "GlobalLog::record: mprotect(PROT_READ) failed\n";
    }
#endif
}
// second.cpp
#include "GlobalLog.h"
#include <iostream>
/// Prints the address of the global `gLog` object to stdout, followed by a
/// newline and a flush. Leaves std::cout in hexadecimal mode, as std::hex is
/// a sticky manipulator.
void __attribute__((unused)) unused_second() {
    std::cout << std::hex;
    std::cout << &gLog;
    std::cout << std::endl;
}
// CMakeLists.txt
# Build script for the padding demo: one static library, one shared library
# and one executable.
cmake_minimum_required(VERSION 3.21.1)
project(padding_tester)
# Static library built from GlobalLog.cpp.
add_library(global-lib STATIC GlobalLog.cpp)
# Shared library built from second.cpp.
add_library(second SHARED second.cpp)
# global-lib is linked into the shared library ...
target_link_libraries(second global-lib)
# NOTE(review): -rdynamic presumably exports the executable's symbols to
# dlopen'ed libraries; -ldl / -pthread are presumably for dlopen and
# std::thread in main.cpp - confirm against the toolchain documentation.
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -rdynamic -ldl -pthread")
add_executable(padding_tester main.cpp)
# ... and into the executable as well, so both link against the same archive.
target_link_libraries(padding_tester global-lib)
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
-DANDROID_ABI="arm64-v8a" \
-DANDROID_NDK=$ANDROID_NDK \
-DANDROID_PLATFORM=android-30 \
-DCMAKE_BUILD_TYPE=$BUILD_TYPE \
-DCMAKE_BUILD_TARGET=android \
CMakeLists.txt \
-B android
make -C android -j8
测试
139|matisse:/data/local/tmp # ./padding_tester
0x56b50af388
0x56b50af388
0x56b50af388
0x56b50af388
0x56b50af388
0x56b50af388
0x56b50af388
0x56b50af388
Segmentation fault
pid: 19598, tid: 19599, name: padding_tester >>> ./padding_tester <<<
uid: 0
tagged_addr_ctrl: 0000000000000001
signal 11 (SIGSEGV), code 1 (SEGV_MAPERR), fault addr 0x10
Cause: null pointer dereference
x0 00000056b50af388 x1 0000007066416bb8 x2 00000070e6a029c8 x3 00000070e6a029d8
x4 0000000000000000 x5 0000000000000000 x6 0000000000000001 x7 00000000001f5000
x8 0000000000000000 x9 0000000000989680 x10 0000000000000055 x11 0000000000000055
x12 0000000000000000 x13 0000000000000000 x14 0000000000000800 x15 000000001a4820ee
x16 00000056b50ae1a0 x17 00000070e6f83370 x18 0000007065e06000 x19 00000056b50af388
x20 0000007066417000 x21 0000007066416cb0 x22 0000000000004c8e x23 0000000000004c8e
x24 0000007066416cb0 x25 0000007066416cb0 x26 0000007066416ff8 x27 00000000000fc000
x28 00000000000fe000 x29 0000007066416bd0
lr 00000056b5081e04 sp 0000007066416bb0 pc 00000056b50821f8 pst 0000000060001000
backtrace:
#00 pc 00000000000241f8 /data/local/tmp/padding_tester (GlobalLog::record()+188) (BuildId: c37bc1b67c1d1a283ac77a3ed8a7335c48fa0b65)
#01 pc 0000000000023e00 /data/local/tmp/padding_tester (push()+36) (BuildId: c37bc1b67c1d1a283ac77a3ed8a7335c48fa0b65)
#02 pc 0000000000024010 /data/local/tmp/padding_tester (void* std::__ndk1::__thread_proxy<std::__ndk1::tuple<std::__ndk1::unique_ptr<std::__ndk1::__thread_struct, std::__ndk1::default_delete<std::__ndk1::__thread_struct> >, void (*)()> >(void*)+44) (BuildId: c37bc1b67c1d1a283ac77a3ed8a7335c48fa0b65)
#03 pc 00000000000eae70 /apex/com.android.runtime/lib64/bionic/libc.so (__pthread_start(void*)+204) (BuildId: 4cbc2a7636b3e0b018386ca8427ddf9b)
#04 pc 000000000008a82c /apex/com.android.runtime/lib64/bionic/libc.so (__start_thread+64) (BuildId: 4cbc2a7636b3e0b018386ca8427ddf9b)
整个程序有且仅有一处通过 gLog 这个变量执行 mEntries.push_back("test_msg");。这个 tombstone 的解析这里不做过多展开,这一次 Native Crash 的直接原因是 mEntries 所在的内存被破坏,导致 push_back 执行过程中发生段错误。接下来我们打开宏定义 PADDING_CHECK,重新编译程序并测试。
pid: 19902, tid: 19905, name: padding_tester >>> ./padding_tester <<<
uid: 0
tagged_addr_ctrl: 0000000000000001
signal 11 (SIGSEGV), code 1 (SEGV_MAPERR), fault addr 0x5fbde5f000
x0 0000005fbde5e471 x1 0000000000000000 x2 0000000000000420 x3 0000005fbde5f000
x4 0000005fbde5f4a0 x5 0000000000000004 x6 6f6c2f617461642f x7 2f706d742f6c6163
x8 0000000000000073 x9 2e90e08f834e33cb x10 6f4c6c61626f6c47 x11 0000000000000067
x12 7362696c2f706d74 x13 6f732e646e6f6365 x14 000000000281a7a7 x15 0000000000000000
x16 0000007495ded008 x17 00000075195611a0 x18 0000007496df2000 x19 0000005fbde5e458
x20 00000074989c1000 x21 0000007495dec888 x22 0000000000000000 x23 000000751dcf4104
x24 000000751dcef359 x25 000000751de0d000 x26 000000751dcf4c7e x27 0000000000000002
x28 0000007495d81da8 x29 00000074989bf950
lr 0000007495d81e10 sp 00000074989bf930 pc 00000075195612a0 pst 0000000020001000
backtrace:
#00 pc 00000000000852a0 /apex/com.android.runtime/lib64/bionic/libc.so (memset+256) (BuildId: 4cbc2a7636b3e0b018386ca8427ddf9b)
#01 pc 000000000005ae0c /data/local/tmp/libsecond.so (_GLOBAL__sub_I_GlobalLog.cpp+100) (BuildId: ba6fec2c86fc767de42d0292e21713e734e2ea2e)
#02 pc 0000000000051294 /apex/com.android.runtime/bin/linker64 (__dl__ZN6soinfo17call_constructorsEv+628) (BuildId: 402a7650c18b31ec79a6bce8358d1074)
#03 pc 000000000003bd30 /apex/com.android.runtime/bin/linker64 (__dl__Z9do_dlopenPKciPK17android_dlextinfoPKv+2052) (BuildId: 402a7650c18b31ec79a6bce8358d1074)
#04 pc 00000000000371d8 /apex/com.android.runtime/bin/linker64 (__loader_dlopen+76) (BuildId: 402a7650c18b31ec79a6bce8358d1074)
#05 pc 0000000000001024 /apex/com.android.runtime/lib64/bionic/libdl.so (dlopen+16) (BuildId: 050162f4b068d2ea4e05bf86f04ca900)
#06 pc 0000000000023e94 /data/local/tmp/padding_tester (load()+64) (BuildId: 56085108fcd1b328f00f00ccb7ba7fc2f7705f28)
#07 pc 0000000000024050 /data/local/tmp/padding_tester (void* std::__ndk1::__thread_proxy<std::__ndk1::tuple<std::__ndk1::unique_ptr<std::__ndk1::__thread_struct, std::__ndk1::default_delete<std::__ndk1::__thread_struct> >, void (*)()> >(void*)+44) (BuildId: 56085108fcd1b328f00f00ccb7ba7fc2f7705f28)
#08 pc 00000000000eae70 /apex/com.android.runtime/lib64/bionic/libc.so (__pthread_start(void*)+204) (BuildId: 4cbc2a7636b3e0b018386ca8427ddf9b)
#09 pc 000000000008a82c /apex/com.android.runtime/lib64/bionic/libc.so (__start_thread+64) (BuildId: 4cbc2a7636b3e0b018386ca8427ddf9b)
可以发现,该问题的原因是 gLog 全局变量被重复初始化:dlopen 加载 libsecond.so 时会再次执行 GlobalLog.cpp 的全局初始化函数(_GLOBAL__sub_I_GlobalLog.cpp),导致 mEntries 的内存被重置。因此其中一个线程在 push_back 的过程中,内存被另一个线程重置,表现为内存踩踏的现象。
解决方案
该问题的根本原因是可执行文件与 libsecond.so 都链接了 global-lib.a,两者的 gLog 变量使用同一个地址,加载 libsecond.so 时对其重复初始化,导致内存被重置。具体的 Native Crash 分析这里就不一一展开,可以参考上一篇《如何理解Native Crash问题》
- 追加 -Wl,-Bsymbolic [优缺点明显,剥离全局变量能修复此问题,但也会引入其它问题]
- 在使用变量前完成所有的初始化,缺点是库文件加载后不能再被卸载,也就是不能进行热插拔