dlopen/dlsym art实战

815 阅读5分钟

最近看了 抖音 Android 性能优化系列:新一代全能型性能分析工具 Rhea,也心痒痒的想自己做个demo,先从 dl 开始吧,搞了几天,写了一个demo so库来练手时,一点毛病都没,一到设备上 dl libart 时却各种崩溃,现在终于没崩溃了,记录一下,虽然就几行代码。

0 实验环境

Android N

[ro.build.version.sdk]: [24]

使用 kwai-linker 绕过系统 dlopen so 限制

kwai-linker

1 调用 int32_t Runtime::GetStat(int kind)

runtime.cc 源码

从系统中拉取 libart.so 使用 nm 命令查看符号表

# List the symbols of libart.so and keep the art::Runtime ones
nm libart.so | grep '_ZN3art7Runtime'

readelf 也可查看

# Same lookup through the ELF symbol tables (wide output)
readelf -W -s libart.so | grep '_ZN3art7Runtime'

从符号表找到 Runtime 实例和 GetStat方法,直接开撸

void *libHandle = kwai::linker::DlFcn::dlopen("libart.so", RTLD_NOW);

// static Runtime* instance_;
void** runtime = (void**) kwai::linker::DlFcn::dlsym(libHandle, "_ZN3art7Runtime9instance_E");
ALOG("runtime addr is , %x", *runtime);

// int32_t Runtime::GetStat(int kind)
int (*GetStat)(void*, int);
GetStat = (int (*)(void*, int)) kwai::linker::DlFcn::dlsym(libHandle, "_ZN3art7Runtime7GetStatEi");
ALOG("GetStat addr is , %x", GetStat);

for (int i = 0; i <= 6; ++i) {
    int c = GetStat(*runtime, 1<<i);
    ALOG("GetStat %d, %d", i, c);
}

// The Runtime object can also be obtained from the JavaVM (technique found in the epic source code).
// Mirror of the leading two pointer-sized fields that the snippet below reads
// through a JavaVM*. Field order and size must stay exactly as declared.
struct JavaVMExt {
    void* functions;  // first slot; not read by this snippet
    void* runtime;    // second slot; read below as the Runtime pointer
};

JavaVM* javaVM;
env->GetJavaVM(&javaVM);
JavaVMExt* javaVMExt = (JavaVMExt*)javaVM;
void* runtime = javaVMExt->runtime;

2 修改 gLogVerbosity->systrace_lock_logging

logging.cc 源码

为了能抓取 thinlock 的 trace,需要修改 gLogVerbosity->systrace_lock_logging 的值。在符号表中找到该符号后,直接转型为 LogVerbosity 即可修改;LogVerbosity 的定义是从源码中直接复制过来的。

// Hand-copied layout used to overlay the `gLogVerbosity` symbol of libart.so.
// Only the position of each flag matters: the struct is never constructed
// here, it is cast onto memory owned by the runtime. Do not reorder, add or
// remove fields.
struct LogVerbosity {
  bool class_linker;  // Enabled with "-verbose:class".
  bool collector;
  bool compiler;
  bool deopt;
  bool gc;
  bool heap;
  bool jdwp;
  bool jit;
  bool jni;
  bool monitor;
  bool oat;
  bool profiler;
  bool signals;
  bool simulator;
  bool startup;
  bool third_party_jni;  // Enabled with "-verbose:third-party-jni".
  bool threads;
  bool verifier;
  bool image;
  bool systrace_lock_logging;  // Enabled with "-verbose:sys-locks".
};

// Guards against an accidental edit of the field list: 20 flags, no padding.
static_assert(sizeof(LogVerbosity) == 20 * sizeof(bool),
              "LogVerbosity must stay a packed list of 20 bool flags");

LogVerbosity* logVerbosity = (LogVerbosity*)kwai::linker::DlFcn::dlsym(libHandle, "_ZN3art13gLogVerbosityE");
ALOG("LogVerbosity %x", logVerbosity);
ALOG("LogVerbosity %d", logVerbosity->systrace_lock_logging);
logVerbosity->systrace_lock_logging = true;
ALOG("LogVerbosity %d", logVerbosity->systrace_lock_logging);

3 获取 Art Thread 对象

native Thread 会把自己的地址保存在 java Thread 对象的 nativePeer 字段里。

同时 JNIEnvExt 里也保存着 Thread 对象,可以强转 JNIEnv 得到 JNIEnvExt 对象。

还可以从 Thread 的静态方法获取当前的 Thread 对象

// 1 从当前 java Thread 对象的字段 nativePeer 获取
jclass threadClass = env->FindClass("java/lang/Thread");
jmethodID currentThreadMethodId = env->GetStaticMethodID(threadClass, "currentThread", "()Ljava/lang/Thread;");
jobject currentThreadObject = env->CallStaticObjectMethod(threadClass, currentThreadMethodId);
jfieldID nativePeerField = env->GetFieldID(threadClass, "nativePeer", "J");
jlong nativePeer = env->GetLongField(currentThreadObject, nativePeerField);

// 2. Read it from the JNIEnv.
// Mirror of the leading two pointer-sized fields that are read through a
// JNIEnv*; the second slot is used as the runtime's Thread object.
// Field order and size must stay exactly as declared.
struct JNIEnvExt {
    void* functions;
    void* thread; // self
};
JNIEnvExt* envExt = (JNIEnvExt*)env;
void* artThread = envExt->thread; 

// 3 从 Thread 的静态方法获取
// static Thread* CurrentFromGdb()
void* (*currentThread)();
currentThread = reinterpret_cast<void* (*)()>(kwai::linker::DlFcn::dlsym(
        libHandle, "_ZN3art6Thread14CurrentFromGdbEv"));

// dump thread
// void ShortDump(std::ostream& os) const;
void (*dumpThread)(void* thread, std::ostream& os);
dumpThread = reinterpret_cast<void (*)(void*, std::ostream&)>(kwai::linker::DlFcn::dlsym(
        libHandle, "_ZNK3art6Thread9ShortDumpERNSt3__113basic_ostreamIcNS1_11char_traitsIcEEEE"));
        
// Logs the short dump of a runtime Thread object.
//
// thread: pointer to the runtime's Thread object (e.g. the value returned by
//         currentThread()). Nothing is dumped when it is null or when the
//         ShortDump symbol could not be resolved.
// NOTE(review): this hands a std::ostringstream built by this library to
// code inside libart.so; it assumes both sides use a compatible std::ostream
// ABI — confirm for the toolchain in use.
void printThread(void* thread) {
    if (dumpThread == nullptr || thread == nullptr) {
        ALOGE("dumpThread unavailable for thread %p", thread);
        return;
    }
    std::ostringstream oss;
    dumpThread(thread, oss);
    // %p instead of %x: the argument is a pointer, not an unsigned int.
    ALOGE("dumpThread %p %s", thread, oss.str().c_str());
}

4 获取 Heap 对象

第一种是将 runtime.h 的字段定义复制出来,只需要到 heap_ 字段就行。然后将 runtime 强转成 Runtime_7X,就可以直接获取到 heap。这段代码是 epic 代码里发现的,这里只复制了N版本的

// Hand-copied prefix of the runtime's Runtime object, declared only as far as
// heap_. It is never constructed; a Runtime pointer is cast to this type so
// heap_ can be read at the matching position. Field order, types and sizes
// must stay exactly as declared.
// NOTE(review): the std::string / std::vector members are assumed to have the
// same size as the ones the runtime was built with — confirm for the
// toolchain in use.
struct Runtime_7X {
    uint64_t callee_save_methods_[3];
    void* pre_allocated_OutOfMemoryError_;
    void* pre_allocated_NoClassDefFoundError_;
    void* resolution_method_;
    void* imt_conflict_method_;
    void* imt_unimplemented_method_;
    void* sentinel_;

    int instruction_set_;
    uint32_t callee_save_method_frame_infos_[9];

    void* compiler_callbacks_;
    bool is_zygote_;
    bool must_relocate_;
    bool is_concurrent_gc_enabled_;
    bool is_explicit_gc_disabled_;
    bool dex2oat_enabled_;
    bool image_dex2oat_enabled_;

    std::string compiler_executable_;
    std::string patchoat_executable_;
    std::vector<std::string> compiler_options_;
    std::vector<std::string> image_compiler_options_;
    std::string image_location_;

    std::string boot_class_path_string_;
    std::string class_path_string_;
    std::vector<std::string> properties_;

    // The default stack size for managed threads created by the runtime.
    size_t default_stack_size_;

    // Last mirrored field: the value this struct exists to read.
    void* heap_;
};

Runtime_7X* runtime7X = (Runtime_7X*)runtime_instance_;
void* heap = runtime7X->heap_;

第二种是根据 runtime.h 里字段的定义顺序,先找到 JavaVMExt* java_vm_ 的偏移量,再往回偏移一定距离找到 heap_。此代码是从 whale 代码里看到的。

// Excerpt of the field declarations in runtime.h, from heap_ up to java_vm_.
// The order of these fields is what the offset arithmetic in the scan relies on.
gc::Heap* heap_;
std::unique_ptr<ArenaPool> jit_arena_pool_;
std::unique_ptr<ArenaPool> arena_pool_;
std::unique_ptr<ArenaPool> low_4gb_arena_pool_;
std::unique_ptr<LinearAlloc> linear_alloc_;
size_t max_spins_before_thin_lock_inflation_;
MonitorList* monitor_list_;
MonitorPool* monitor_pool_;
ThreadList* thread_list_;
InternTable* intern_table_;
ClassLinker* class_linker_;
SignalCatcher* signal_catcher_;
std::string stack_trace_file_;      // sizeof(std::string) == 3 pointer-sized slots
JavaVMExt* java_vm_;                // find the offset of the vm pointer, then walk backwards from it
// Returns the address `offset` bytes past `ptr`, converted to R.
template<typename R>
static inline R OffsetOf(void* ptr, size_t offset) {
    const auto base = reinterpret_cast<intptr_t>(ptr);
    return (R) (base + offset);
}

// Reads the T stored `offset` bytes past `ptr`.
template<typename T>
static inline T MemberOf(void* ptr, size_t offset) {
    T* const slot = OffsetOf<T *>(ptr, offset);
    return *slot;
}
   
// 32-bit layout: every pointer-sized slot is 4 bytes wide.
int kPointerSize = 4;
// The Runtime pointer sits one slot past the start of the JavaVMExt.
auto runtime = MemberOf<void*>(vm, kPointerSize);
void* heap = nullptr;

// Probe 100 consecutive slots of the Runtime object, starting at byte 200.
size_t start = 200;
size_t end = start + (100 * kPointerSize);
for (size_t offset = start; offset != end; offset += kPointerSize) {
    // Keep probing until the slot that holds java_vm_ is found.
    if (MemberOf<ptr_t>(runtime, offset) != vm) {
        continue;
    }
    // Step back over the std::string (3 slots), SignalCatcher* and ClassLinker*.
    size_t class_linker_pos = offset - (kPointerSize * 3) - (2 * kPointerSize);
    size_t intern_table_pos = class_linker_pos - kPointerSize;
    size_t thread_list_pos = intern_table_pos - kPointerSize;
    // heap_ lies 8 slots before thread_list_.
    size_t heap_pos = thread_list_pos - (8 * kPointerSize);

    heap = MemberOf<void*>(runtime, heap_pos);
    break;
}

第三种方式是创建一个 partial Runtime,只包含 runtime 的部分字段(从 heap_ 到 java_vm_),然后计算 offset,原理和第二种类似。

// Mirror of the slice of the Runtime object that runs from heap_ to java_vm_.
// It is overlaid on Runtime memory so that, once the address of java_vm_ is
// known, heap_ can be read by name instead of by hand-computed offsets.
// Field order and sizes must stay exactly as declared.
struct PartialRuntimeN {
    void* heap_;
    void* jit_arena_pool_;
    void* arena_pool_;
    void* low_4gb_arena_pool_;
    void* linear_alloc_;
    size_t max_spins_before_thin_lock_inflation_;
    void* monitor_list_;
    void* monitor_pool_;
    void* thread_list_;
    void* intern_table_;
    void* class_linker_;   //1x4 (one 4-byte slot)
    void* signal_catcher_;    // 1x4 (one 4-byte slot)
    std::string stack_trace_file_;    // 3x4 (three 4-byte slots)
    void* java_vm_;
};

// 32-bit layout: every pointer-sized slot is 4 bytes wide.
int kPointerSize = 4;
// The Runtime pointer sits one slot past the start of the JavaVMExt.
auto runtime = MemberOf<void*>(vm, kPointerSize);
void* heap = nullptr;

// Probe 100 consecutive slots of the Runtime object, starting at byte 200.
size_t start = 200;
size_t end = start + (100 * kPointerSize);
for (size_t offset = start; offset != end; offset += kPointerSize) {
    // Keep probing until the slot that holds java_vm_ is found.
    if (MemberOf<ptr_t>(runtime, offset) != vm) {
        continue;
    }
    // Rebase from the java_vm_ slot to the start of the mirrored slice.
    char* javaVmSlot = reinterpret_cast<char*>(runtime) + offset;
    auto* partialRuntimeN = reinterpret_cast<PartialRuntimeN*>(
            javaVmSlot - offsetof(PartialRuntimeN, java_vm_));
    heap = partialRuntimeN->heap_;
    break;
}

5 请求 concurrent gc

// void Heap::RequestConcurrentGC(Thread* self, bool force_full)
void (*requestConcurrentGC)(void* heap, void* self, bool force_full);
requestConcurrentGC = reinterpret_cast<void (*)(void*, void*, bool)>(kwai::linker::DlFcn::dlsym(
        libHandle, "_ZN3art2gc4Heap19RequestConcurrentGCEPNS_6ThreadEb"));


requestConcurrentGC(heap, currentThread(), true);

也可以直接通过 VMRuntime 来调用

// Reflectively obtain the VMRuntime singleton and invoke requestConcurrentGC() on it.
val clazz = Class.forName("dalvik.system.VMRuntime")
val runtime = clazz.getDeclaredMethod("getRuntime").invoke(null)
clazz.getDeclaredMethod("requestConcurrentGC").invoke(runtime)

6 打印 gc log

LogGC 只有在 GcCause 为 kGcCauseExplicit 或者 gc pause 超过阈值时才打印。因此传入 kGcCauseExplicit,强制打印最后一次 gc 的 log。

// GC cause codes passed by value to the Heap functions resolved from
// libart.so. The numeric values are spelled out because they are part of the
// call contract: they must not shift if an entry is ever inserted.
enum GcCause {
    kGcCauseForAlloc = 0,
    kGcCauseBackground = 1,
    kGcCauseExplicit = 2,
    kGcCauseForNativeAlloc = 3,
    kGcCauseCollectorTransition = 4,
    kGcCauseDisableMovingGc = 5,
    kGcCauseTrim = 6,
    kGcCauseInstrumentation = 7,
    kGcCauseAddRemoveAppImageSpace = 8,
    kGcCauseHomogeneousSpaceCompact = 9,
    kGcCauseClassLinker = 10,
};
// GC type codes passed by value to / returned from the Heap functions
// resolved from libart.so. Values are spelled out so they cannot shift.
enum GcType {
    kGcTypeNone = 0,
    kGcTypeSticky = 1,
    kGcTypePartial = 2,
    kGcTypeFull = 3,
    kGcTypeMax = 4,
};
//void LogGC(GcCause gc_cause, collector::GarbageCollector* collector);
void (*logGc)(void* heap, GcCause, void* collector);
logGc = reinterpret_cast<void (*)(void*, GcCause, void*)>(kwai::linker::DlFcn::dlsym(
        libHandle, "_ZN3art2gc4Heap5LogGCENS0_7GcCauseEPNS0_9collector16GarbageCollectorE"));

//collector::GarbageCollector* FindCollectorByGcType(collector::GcType gc_type);
void* (*findCollectorByGcType)(void* heap, GcType);
findCollectorByGcType = reinterpret_cast<void* (*)(void*, GcType)>(kwai::linker::DlFcn::dlsym(
        libHandle, "_ZN3art2gc4Heap21FindCollectorByGcTypeENS0_9collector6GcTypeE"));
        
void* collector = findCollectorByGcType(heap, kGcTypeFull);
logGc(heap, kGcCauseExplicit, collector);

7 dump heap spaces

打印 art 里管理的所有 space

//std::string Heap::DumpSpaces() const
std::string (*dumpSpace)(void *);
dumpSpace = reinterpret_cast<std::string (*)(void *)>(kwai::linker::DlFcn::dlsym(
        libHandle, "_ZNK3art2gc4Heap10DumpSpacesEv"));
        
if (dumpSpace != nullptr) {
    std::string spaces(dumpSpace(heap));
    std::string token;
    std::istringstream tokenStream(spaces);
    while (std::getline(tokenStream, token, '\n')) {
        ALOG("%s", token.c_str());
    }
}

8 请求内存 Compact

请求进行内存整理。在 N 版本上,一般在应用退到后台或者 OOM 前才会去做 Compact 操作。

// Result codes returned by value from Heap::PerformHomogeneousSpaceCompact.
// Values are spelled out so they cannot shift if an entry is ever inserted.
enum HomogeneousSpaceCompactResult {
    kSuccess = 0,
    kErrorReject = 1,
    kErrorUnsupported = 2,
    kErrorVMShuttingDown = 3,
};
// HomogeneousSpaceCompactResult PerformHomogeneousSpaceCompact()
// Member function resolved as a plain function pointer; the single parameter
// stands in for `this`.
HomogeneousSpaceCompactResult (*performHomogeneousSpaceCompact)(void* heap);
performHomogeneousSpaceCompact  = reinterpret_cast<HomogeneousSpaceCompactResult (*)(void*)>(kwai::linker::DlFcn::dlsym(
        libHandle, "_ZN3art2gc4Heap30PerformHomogeneousSpaceCompactEv"));

// Report kErrorUnsupported instead of calling through a null pointer when the
// symbol was not resolved or no heap was found.
HomogeneousSpaceCompactResult hscResult =
        (performHomogeneousSpaceCompact != nullptr && heap != nullptr)
                ? performHomogeneousSpaceCompact(heap)
                : kErrorUnsupported;

// On success the following log lines are printed:
// I/art: Starting a blocking GC HomogeneousSpaceCompact
// I/art: HomogeneousSpaceCompact marksweep + semispace GC freed 1882(51KB) AllocSpace objects, 0(0B) LOS objects, 39% free, 5MB/9MB, paused 44.782ms total 44.782ms

9 直接调用 gc

直接调用 gc 操作,可以传入不同的 GcType

//collector::GcType Heap::CollectGarbageInternal(collector::GcType gc_type, GcCause gc_cause, bool clear_soft_references)
// Member function resolved as a plain function pointer; the first parameter
// stands in for `this`.
GcType (*collectGarbageInternal)(void* heap, GcType, GcCause, bool clear_soft_references);
collectGarbageInternal = reinterpret_cast<GcType (*)(void*, GcType, GcCause, bool )>(kwai::linker::DlFcn::dlsym(
        libHandle, "_ZN3art2gc4Heap22CollectGarbageInternalENS0_9collector6GcTypeENS0_7GcCauseEb"));

// Run a full, explicit GC without clearing soft references. kGcTypeNone is
// reported instead of calling through a null pointer when the symbol was not
// resolved or no heap was found.
GcType gcType = (collectGarbageInternal != nullptr && heap != nullptr)
        ? collectGarbageInternal(heap, kGcTypeFull, kGcCauseExplicit, false)
        : kGcTypeNone;

// I/art: Starting a blocking GC Explicit
// I/art: Explicit concurrent mark sweep GC freed 3925(176KB) AllocSpace objects, 0(0B) LOS objects, 40% free, 5MB/8MB, paused 446us total 30.759ms

参考文档

抖音 Android 性能优化系列:新一代全能型性能分析工具 Rhea

epic dl的使用

whale

KOOM dl的使用

linphone dl的使用

Dynamic Library Programming