hook 应用 trace 日志上回讲到 atrace 的实现，其实主要就是写几个系统属性和节点，然后 APP 再将 t

上回讲到 atrace 的实现，其实主要就是写几个系统属性和节点，然后 APP 再将 trace log 写入 /sys/kernel/debug/tracing/trace_marker。因此，如果想不通过命令行去使用 atrace 的功能，只需要开启 trace 功能和 hook 对这个文件节点的写入操作，即可实现此功能。

分析 `hook` 点

APP 在上层一般是使用 Trace.beginSection 方法来记录 trace 日志，可以看到只有 isTagEnabled(TRACE_TAG_APP) 为 true 时，才能打日志

  public static void beginSection(String sectionName) {
      if (isTagEnabled(TRACE_TAG_APP)) {
          nativeTraceBegin(TRACE_TAG_APP, sectionName);
      }
  }

而 isTagEnabled 是取决于 sEnabledTags 静态变量第 TRACE_TAG_APP（1L << 12）是否置 1

public static boolean isTagEnabled(long traceTag) {
    long tags = sEnabledTags;
    if (tags == TRACE_TAG_NOT_READY) {
        tags = cacheEnabledTags();
    }
    return (tags & traceTag) != 0;
}
  
private static long cacheEnabledTags() {
    long tags = nativeGetEnabledTags();
    sEnabledTags = tags;
    return tags;
}


static jlong android_os_Trace_nativeGetEnabledTags(JNIEnv* env, jclass clazz) {
    return atrace_get_enabled_tags();
}

static inline uint64_t atrace_get_enabled_tags()
{
    atrace_init();
    return atrace_enabled_tags;
}

通过 atrace 目录读取日志时，sEnabledTags 是由命令行参数控制的。

因此需要通过反射设置 sEnabledTags，同时设置 native 层的 atrace_enabled_tags 变量，因为 native 层也有一个类似的判断。

static inline void atrace_begin(uint64_t tag, const char* name)
{
    if (CC_UNLIKELY(atrace_is_tag_enabled(tag))) {
        void atrace_begin_body(const char*);
        atrace_begin_body(name);
    }
}

static inline uint64_t atrace_is_tag_enabled(uint64_t tag)
{
    return atrace_get_enabled_tags() & tag;
}

最终调用到 atrace_begin_body

void atrace_begin_body(const char* name)
{
    char buf[ATRACE_MESSAGE_LENGTH];

    int len = snprintf(buf, sizeof(buf), "B|%d|%s", getpid(), name);
    if (len >= (int) sizeof(buf)) {
        ALOGW("Truncated name in %s: %s\n", __FUNCTION__, name);
        len = sizeof(buf) - 1;
    }
    write(atrace_marker_fd, buf, len);
}

可以看到只是简单的写入 atrace_marker_fd，因此只要 hook 写入过程，再将日志写入到其他文件，即可获取 APP 的 trace 日志。

获取 `trace_marker` 的文件描述符和 `atrace_enabled_tags`

使用 kwai::linker 从 libcutils.so 找到 atrace_enabled_tags 和 atrace_marker_fd 符号。

以下代码是从 profilo 参考的，可以看到就是简单的 dlopen 和 dlsym 两部曲，有意思的是 atrace_enabled_tags 在源码里的定义是 uint64_t，确在这里强转成 std::atomic<uint64_t>。

uint64_t                atrace_enabled_tags  = ATRACE_TAG_NOT_READY;

int* atrace_marker_fd = nullptr;
std::atomic<uint64_t>* atrace_enabled_tags = nullptr;

auto sdk = android_get_device_api_level();
{
    ALOGE("SDK %d ", sdk);
    using kwai::linker::DlFcn;
    std::string lib_name("libcutils.so");
    std::string enabled_tags_sym("atrace_enabled_tags");
    std::string fd_sym("atrace_marker_fd");

    if (sdk < 18) {
        lib_name = "libutils.so";
        // android::Tracer::sEnabledTags
        enabled_tags_sym = "_ZN7android6Tracer12sEnabledTagsE";
        // android::Tracer::sTraceFD
        fd_sym = "_ZN7android6Tracer8sTraceFDE";
    }

    void* handle;
    if (sdk < 21) {
        handle = DlFcn::dlopen(lib_name.c_str(), RTLD_LOCAL);
    } else {
        handle = DlFcn::dlopen(lib_name.c_str(), RTLD_GLOBAL);
    }

    atrace_enabled_tags = reinterpret_cast<std::atomic<uint64_t>*>(
            DlFcn::dlsym(handle, enabled_tags_sym.c_str()));

    if (atrace_enabled_tags == nullptr) {
        throw std::runtime_error("Enabled Tags not defined");
    }

    atrace_marker_fd = reinterpret_cast<int*>(DlFcn::dlsym(handle, fd_sym.c_str()));

    if (*atrace_marker_fd == -1) {
        // This is a case that can happen for older Android version i.e. 4.4
        // in which scenario the marker fd is not initialized/opened  by Zygote.
        // Nevertheless for Profilo trace it is not necessary to have an open fd,
        // since all we really need is to ensure that we 'know' it is marker
        // fd to continue writing Profilo logs, thus the usage of marker fd
        // acting really as a placeholder for magic id.
        *atrace_marker_fd = -100;
    }

    DlFcn::dlclose(handle);
}

开启 `trace`

从源码分析得知，atrace_enabled_tags 是一个 int64_t，每一位代表一个 trace 标签，因此全部置 1 代表打开所有功能，这里在 native 设置，同时也需要在上层设置，反射调用 nativeGetEnabledTags，然后将值设置给 Trace.sEnabledTags。

atrace_enabled_tags->exchange(UINT64_MAX);

private object SystraceReflector {
    fun updateSystraceTags() {
        if (sTrace_sEnabledTags != null && sTrace_nativeGetEnabledTags != null) {
            try {
                sTrace_sEnabledTags?.set(
                    null,
                    sTrace_nativeGetEnabledTags?.invoke(null)
                )
            } catch (e: IllegalAccessException) {
            } catch (e: InvocationTargetException) {
            }
        }
    }

    private var sTrace_nativeGetEnabledTags: Method? = null
    private var sTrace_sEnabledTags: Field? = null

    init {
        var m: Method?
        try {
            m = Trace::class.java.getDeclaredMethod("nativeGetEnabledTags")
            m.setAccessible(true)
        } catch (e: NoSuchMethodException) {
            m = null
        }
        sTrace_nativeGetEnabledTags = m
        var f: Field?
        try {
            f = Trace::class.java.getDeclaredField("sEnabledTags")
            f.setAccessible(true)
        } catch (e: NoSuchFieldException) {
            f = null
        }
        sTrace_sEnabledTags = f
    }
}

`hook write` 调用

开启完后，接下来 hook 对 atrace_marker_fd 的写入，这里使用的 hook 库是 xhook。很简单，直接 hook 所有加载的 so 的 write 和 __write_chk 函数。按照 profilo 只 hook 以下 7 个 so 时，trace log 会遗漏，导致生成的 trace.html 里的函数出现 Did not finished，不知道遗漏了哪些 so，索性全部 hook 了，但是这样会导致多出来的结束标志位，但是不影响可视化，所有先将就着吧 :P 。

"libandroid_runtime.so",
"libui.so",
"libgui.so",
"libart.so",
"libhwui.so",
"libEGL.so",
"libcutils.so"

bool should_log_systrace(int fd, size_t count) {
    return systrace_installed && fd == *atrace_marker_fd;
}

ssize_t write_hook(int fd, const void* buf, size_t count) {
    if (should_log_systrace(fd, count)) {
        log_systrace(buf, count);
        return count;
    }
    return write(fd, buf, count);
}

size_t (*orig__write_chk)(int fd, const void* buf, size_t count, size_t buf_size);

size_t hook__write_chk(int fd, const void* buf, size_t count, size_t buf_size) {
    if (should_log_systrace(fd, count)) {
        log_systrace(buf, count);
        return count;
    }
    return orig__write_chk(fd, buf, count, buf_size);
}

void hookLoadedLibs() {
    xhook_register(".*\\.so$", "write", (void *) write_hook, nullptr);
    xhook_register(".*\\.so$", "__write_chk", (void *) hook__write_chk,
                   (void **) &orig__write_chk);
    // 忽略对当前 so 的 hook               
    xhook_ignore(".*toy_lib\\.so$", "write");
    xhook_ignore(".*toy_lib\\.so$", "__write_chk");

    xhook_refresh(0);
    xhook_clear();
}

保存日志

从之前文章得知，一个可以使用 systrace.py 生成可视化的 html 的 trace log，必须以下 3 行日志开头，可能 2 行也行，没测试。

TRACE:
# tracer: nop
#

同时日志的格式需要是一下格式

<线程名>-<线程id>  [000] ...1 <时间-秒>: tracing_mark_write: <B|E>|<进程id>|<TAG>

但是我们获取到的日志只有最后一小节 <B|E>|<进程id>|<TAG>，因此需要拼接好后再写入，这里需要获取线程名字、线程 id 和 MONOTONIC 时间。

线程 id 可以通过 gettid()获取。

线程名字可以从 /proc/<pid>/task/<tid>/comm 读取，pid 通过 getpid() 获取。

时间通过 clock_gettime(CLOCK_MONOTONIC, &t) 获取，需要从纳秒转成秒，把 int64_t 的转成 double 后再去转成秒，否则时间精度不对。

拼接完后，写入文件，这里在 /data/local/tmp 目录创建了一个文件，但是并没有创建成功，可能是权限的问题，只能临时收到在命令行创建了。为了代码简洁，这里是都是同步写入日志。cpp 基本没写过，都是临时摸索复制百度谷歌来的，写得有点辣眼睛，请多多包涵，大概意思就是那样了 :P。

int fd = 0;
std::map<int, std::string> mapThreadName;

std::string getThreadName(int tid) {
    std::map<int, std::string>::iterator iter;
    iter = mapThreadName.find(tid);

    if (iter != mapThreadName.end()) {
        return iter->second;
    } else {
        char fname[128];
        int len = sprintf(fname, "/proc/%d/task/%d/comm", getpid(), tid);
        int cfd = open(fname, O_RDONLY);
        if (cfd) {
            char name[32];
            int c = read(cfd, name,  32);
            if (c > 0) {
                mapThreadName[tid] = std::string(name, c - 1);
                return mapThreadName[tid];
            }
        }
    }
    return std::to_string(tid);
}

typedef int64_t nsecs_t; // nano-seconds
nsecs_t systemTime()
{
    struct timespec t;
    t.tv_sec = t.tv_nsec = 0;
    clock_gettime(CLOCK_MONOTONIC, &t);
    return nsecs_t(t.tv_sec)*1000000000LL + t.tv_nsec;
}


void log_systrace(const void* buf, size_t count) {
    const char* msg = reinterpret_cast<const char*>(buf);
    double f = double(systemTime()) / 1000000000;
    switch (msg[0]) {
        case 'B':
            ALOGE("%s", buf); //"B|<pid>|<name>"
            if (fd) {
                char buf1[1024];
                int len = snprintf(buf1, sizeof(buf1), "%s-%d [000] ...1 %.6f: tracing_mark_write: %s\n", getThreadName(gettid()).c_str(), gettid(), f, msg);
                write(fd, buf1, len);
            }
            break;
        case 'E':
            ALOGE("E");  // "E"
            if (fd) {
                char buf1[1024];
                int len = snprintf(buf1, sizeof(buf1), "%s-%d [000] ...1 %.6f: tracing_mark_write: E\n", getThreadName(gettid()).c_str(), gettid(), f);
                write(fd, buf1, len);
            }
            break;
        default:
            return;
    }
}

void installAtrace() {
    fd = open("/data/local/tmp/mytrace.txt", O_WRONLY | O_CREAT, 0666);
    std::string head("TRACE:\n# tracer: nop\n#\n");
    write(fd, head.c_str(), head.size());
    
    // 开始 hook 后，打开 trace 功能
}

查看日志文件，可以看到正确生成了所需的格式

TRACE:
# tracer: nop
#
RenderThread-20036 [000] ...1 241751.465787: tracing_mark_write: B|19945|AttachCurrentThread
RenderThread-20036 [000] ...1 241751.468185: tracing_mark_write: E
com.example.toy-19945 [000] ...1 241751.468293: tracing_mark_write: B|19945|postAndWait
com.example.toy-19945 [000] ...1 241751.477535: tracing_mark_write: E
com.example.toy-19945 [000] ...1 241751.493119: tracing_mark_write: B|19945|postAndWait
com.example.toy-19945 [000] ...1 241751.493345: tracing_mark_write: E
com.example.toy-19945 [000] ...1 241751.493457: tracing_mark_write: B|19945|postAndWait
com.example.toy-19945 [000] ...1 241751.493575: tracing_mark_write: E
RenderThread-20036 [000] ...1 241751.494002: tracing_mark_write: B|19945|notifyFramePending
RenderThread-20036 [000] ...1 241751.494100: tracing_mark_write: E
com.example.toy-19945 [000] ...1 241751.499166: tracing_mark_write: E
com.example.toy-19945 [000] ...1 241751.499489: tracing_mark_write: B|19945|postAndWait
com.example.toy-19945 [000] ...1 241751.499661: tracing_mark_write: E
com.example.toy-19945 [000] ...1 241751.501794: tracing_mark_write: B|19945|Choreographer#doFrame
com.example.toy-19945 [000] ...1 241751.501954: tracing_mark_write: B|19945|input
com.example.toy-19945 [000] ...1 241751.502065: tracing_mark_write: E
com.example.toy-19945 [000] ...1 241751.502129: tracing_mark_write: B|19945|traversal