三种 cpu 分析的方式。

201 阅读7分钟

timer_create

通过 timer_create,我们可以创建一个定时器,根据每个线程消耗的 cpu 时间,发送 signal 到对应的线程去处理,默认这个信号是 SIGPROF。

正常的使用流程是先通过 timer_create 获取 timer,再通过 timer_settime 去设置 timer 的 interval。

// Create a per-thread timer that measures this thread's own CPU time
// (CLOCK_THREAD_CPUTIME_ID) and delivers SIGPROF directly to the
// thread identified by mp.procid (SIGEV_THREAD_ID).
var timerid int32
var sevp sigevent
sevp.notify = _SIGEV_THREAD_ID
sevp.signo = _SIGPROF
sevp.sigev_notify_thread_id = int32(mp.procid)
ret := timer_create(_CLOCK_THREAD_CPUTIME_ID, &sevp, &timerid)
if ret != 0 {
  // If we cannot create a timer for this M, leave profileTimerValid false
  // to fall back to the process-wide setitimer profiler.
  return
}

// Arm the timer; spec carries the initial delay (it_value) and the
// period (it_interval) between subsequent expirations.
ret = timer_settime(timerid, 0, spec, nil)

在 go 里头,由于有 goroutine 的存在,因此如何为每个线程去开启采样也是比较麻烦的。

当前是在 execute 函数中 hook,当每次调度 goroutine 到新的 m 上去运行的时候,就会为当前线程设置。

// execute schedules gp onto the current M: it wires up gp<->M,
// transitions gp from _Grunnable to _Grunning, and jumps into gp's
// saved context via gogo. If inheritTime is false, the switch counts
// against the P's scheduling quantum (schedtick).
func execute(gp *g, inheritTime bool) {
    mp := getg().m

    if goroutineProfile.active {
       // Make sure that gp has had its stack written out to the goroutine
       // profile, exactly as it was when the goroutine profiler first stopped
       // the world.
       tryRecordGoroutineProfile(gp, osyield)
    }

    // Assign gp.m before entering _Grunning so running Gs have an
    // M.
    mp.curg = gp
    gp.m = mp
    casgstatus(gp, _Grunnable, _Grunning)
    gp.waitsince = 0
    gp.preempt = false
    gp.stackguard0 = gp.stack.lo + stackGuard
    if !inheritTime {
       mp.p.ptr().schedtick++
    }

    // Check whether the profiler needs to be turned on or off.
    // This is the per-thread hook the article describes: whenever a
    // goroutine is scheduled onto an M whose profiling rate differs
    // from the requested one, the M (re)arms its own CPU-time timer.
    hz := sched.profilehz
    if mp.profilehz != hz {
       setThreadCPUProfiler(hz)
    }

    trace := traceAcquire()
    if trace.ok() {
       // GoSysExit has to happen when we have a P, but before GoStart.
       // So we emit it here.
       if !goexperiment.ExecTracer2 && gp.syscallsp != 0 {
          trace.GoSysExit(true)
       }
       trace.GoStart()
       traceRelease(trace)
    }

    gogo(&gp.sched)
}

当接收到信号的时候,可以通过 ucontext 访问中断时候的寄存器以及堆栈信息。

signalHandler(int signo, siginfo_t* siginfo, void* ucontext)

在 java 中,语言没有内置,而是通过一个外部的线程去为其他的线程设置。

在 cpuEngine 中。 首先找到当前的所有线程:

// Create a CPU timer for every thread currently alive in the process.
// Returns 0 if at least one timer was created, the resource-limit
// error if one was hit (stops immediately), or the last error seen.
int CpuEngine::createForAllThreads() {
    int result = EPERM;

    ThreadList* thread_list = OS::listThreads();
    int tid;
    while ((tid = thread_list->next()) != -1) {
        int err = createForThread(tid);
        if (isResourceLimit(err)) {
            // No point trying further threads once a limit is hit.
            result = err;
            break;
        }
        if (result != 0) {
            // Keep updating the outcome until one thread succeeds;
            // a single success makes the whole call succeed.
            result = err;
        }
    }
    delete thread_list;

    return result;
}

同时,还要添加一个 thread_hook,为新启动的线程 create_timer:

// Locate the GOT entry for pthread_setspecific() so thread start/end
// can be intercepted. Returns true once _pthread_entry is set.
bool CpuEngine::setupThreadHook() {
    // Already resolved on a previous call.
    if (_pthread_entry != NULL) {
        return true;
    }

    // No JVM loaded: nothing to patch, point at a harmless dummy slot.
    if (!VM::loaded()) {
        static void* dummy_pthread_entry;
        _pthread_entry = &dummy_pthread_entry;
        return true;
    }

    // Depending on Zing version, pthread_setspecific is called either from libazsys.so or from libjvm.so
    if (VM::isZing()) {
        CodeCache* libazsys = Profiler::instance()->findLibraryByName("libazsys");
        if (libazsys != NULL) {
            _pthread_entry = libazsys->findImport(im_pthread_setspecific);
            if (_pthread_entry != NULL) {
                return true;
            }
        }
    }

    // Fall back to the JVM library itself.
    CodeCache* lib = Profiler::instance()->findJvmLibrary("libj9thr");
    if (lib == NULL) {
        return false;
    }
    _pthread_entry = lib->findImport(im_pthread_setspecific);
    return _pthread_entry != NULL;
}

添加 pthread_setspecific_hook 在 pthread_setspecific 上:

// Intercept thread creation/termination by patching libjvm's GOT entry for pthread_setspecific().
// HotSpot puts VMThread into TLS on thread start, and resets on thread end.
static int pthread_setspecific_hook(pthread_key_t key, const void* value) {
    if (key != VMThread::key()) {
        return pthread_setspecific(key, value);
    }
    if (pthread_getspecific(key) == value) {
        return 0;
    }

    if (value != NULL) {
        int result = pthread_setspecific(key, value);
        CpuEngine::onThreadStart();
        return result;
    } else {
        CpuEngine::onThreadEnd();
        return pthread_setspecific(key, value);
    }
}

然后该函数会在 perf_event 中 以及 ctimer_linux 中使用。

ctimer 中为每个线程设置 timer 的函数:

为每个线程设置 timer

// Create and arm a CPU-time timer for thread `tid` that delivers
// `_signal` to that exact thread each time its interval of CPU time
// elapses. Returns 0 on success, -1 on failure or if another caller
// already registered a timer for this tid.
int CTimer::createForThread(int tid) {
    if (tid >= _max_timers) {
        Log::warn("tid[%d] > pid_max[%d]. Restart profiler after changing pid_max", tid, _max_timers);
        return -1;
    }

    struct sigevent sev;
    sev.sigev_value.sival_ptr = NULL;
    sev.sigev_signo = _signal;
    sev.sigev_notify = SIGEV_THREAD_ID;
    // Store the target tid in the int slot immediately after
    // sigev_notify — the kernel's sigev_notify_thread_id field, which
    // some libc headers do not expose under a portable name.
    ((int*)&sev.sigev_notify)[1] = tid;

    // Use raw syscalls, since libc wrapper allows only predefined clocks
    clockid_t clock = thread_cpu_clock(tid);
    int timer;
    if (syscall(__NR_timer_create, clock, &sev, &timer) < 0) {
        return -1;
    }

    // Kernel timer ID may start with zero, but we use zero as an empty slot
    if (!__sync_bool_compare_and_swap(&_timers[tid], 0, timer + 1)) {
        // Lost race
        syscall(__NR_timer_delete, timer);
        return -1;
    }

    // Arm the timer: first expiry after one interval, then periodic.
    struct itimerspec ts;
    ts.it_interval.tv_sec = (time_t)(_interval / 1000000000);
    ts.it_interval.tv_nsec = _interval % 1000000000;
    ts.it_value = ts.it_interval;
    syscall(__NR_timer_settime, timer, 0, &ts, NULL);
    return 0;
}

timer_settime 需要设置一个结构体:

其中, it_value 表示在系统调用后被唤醒的时间,而 it_interval 表示唤醒的间隔。

// itimerspec mirrors the kernel's struct itimerspec:
// it_value is the delay until the first expiration,
// it_interval is the period between subsequent expirations.
type itimerspec struct {
    it_interval timespec
    it_value    timespec
}

假设我们想要 100 Hz 的采样频率,那么在 java 里头,我们就会每隔 10ms 唤醒一次,并且下一次唤醒时间在 10ms 之后。

但是,在 go 里头,做了一些优化:

基本上来说,启动时间随机化。

// The period of the timer should be 1/Hz. For every "1/Hz" of additional
// work, the user should expect one additional sample in the profile.
//
// But to scale down to very small amounts of application work, to observe
// even CPU usage of "one tenth" of the requested period, set the initial
// timing delay in a different way: So that "one tenth" of a period of CPU
// spend shows up as a 10% chance of one sample (for an expected value of
// 0.1 samples), and so that "two and six tenths" periods of CPU spend show
// up as a 60% chance of 3 samples and a 40% chance of 2 samples (for an
// expected value of 2.6). Set the initial delay to a value in the uniform
// random distribution between 0 and the desired period. And because "0"
// means "disable timer", add 1 so the half-open interval [0,period) turns
// into (0,period].
//
// Otherwise, this would show up as a bias away from short-lived threads and
// from threads that are only occasionally active: for example, when the
// garbage collector runs on a mostly-idle system, the additional threads it
// activates may do a couple milliseconds of GC-related work and nothing
// else in the few seconds that the profiler observes.
spec := new(itimerspec)
spec.it_value.setNsec(1 + int64(fastrandn(uint32(1e9/hz))))
spec.it_interval.setNsec(1e9 / int64(hz))

itimer

通过 setitimer 系统调用,可以每经过一段物理时间,向进程发送一次信号,然后进程收到信号后去处理。

这是在 go 1.18 以前的性能分析方式,async profiler 同样也是如此:

// Enable the Go signal handler if not enabled.
if atomic.Cas(&handlingSig[_SIGPROF], 0, 1) {
  h := getsig(_SIGPROF)
  // If no signal handler was installed before, then we record
  // _SIG_IGN here. When we turn off profiling (below) we'll start
  // ignoring SIGPROF signals. We do this, rather than change
  // to SIG_DFL, because there may be a pending SIGPROF
  // signal that has not yet been delivered to some other thread.
  // If we change to SIG_DFL when turning off profiling, the
  // program will crash when that SIGPROF is delivered. We assume
  // that programs that use profiling don't want to crash on a
  // stray SIGPROF. See issue 19320.
  // We do the change here instead of when turning off profiling,
  // because there we may race with a signal handler running
  // concurrently, in particular, sigfwdgo may observe _SIG_DFL and
  // die. See issue 43828.
  if h == _SIG_DFL {
   h = _SIG_IGN
  }
  atomic.Storeuintptr(&fwdSig[_SIGPROF], h)
  setsig(_SIGPROF, abi.FuncPCABIInternal(sighandler))
}

// Arm the process-wide profiling timer: ITIMER_PROF counts CPU time
// consumed by the whole process and delivers SIGPROF every 1/hz
// seconds of CPU time, with no initial delay beyond one period.
var it itimerval
it.it_interval.tv_sec = 0
it.it_interval.set_usec(1000000 / hz)
it.it_value = it.it_interval
setitimer(_ITIMER_PROF, &it, nil)

而在 datadog python 中,情况又不太一样,

python 会有一个单独的 collector。每隔一段时间获取所有运行的 threadid,然后通过系统调用去获取 threadid 的 cpu 使用时间。

为了避免收集线程数过多,python 会限制 collect 运行的 wall time, 不超过百分之多少的系统运行时间。

perf event open

为每个线程设置 perf_event_open:

通过 perf_event_open 打开 fd。

通过 mmap open 读取文件缓冲区的文件。

通过 fcntl(fd, F_SETSIG, _signal), 设置 fd 准备好的时候所发送的信号。

设置 wakeup_events 为 1,每收集 1 个 event,即产生信号。

注册信号回调函数,通过 sighandler 记录函数栈。

结束后通过 ioctl 去刷新 fd。

// Per-sample signal handler: records a stack sample for each counter
// overflow, then re-arms the perf event for the next overflow.
void PerfEvents::signalHandler(int signo, siginfo_t* siginfo, void* ucontext) {
    // si_code <= 0 means the signal came from userspace (kill/raise),
    // not from a counter overflow; ignore it.
    if (siginfo->si_code <= 0) {
        return;
    }

    if (!_enabled) {
        // Profiling is off: just drain the ring so it doesn't fill up.
        resetBuffer(OS::threadId());
    } else {
        u64 counter = readCounter(siginfo, ucontext);
        ExecutionEvent event;
        Profiler::instance()->recordSample(ucontext, counter, PERF_SAMPLE, &event);
    }

    // Re-arm: clear the overflow counter and request one more wakeup
    // (wakeup_events == 1), so the next event raises a signal again.
    ioctl(siginfo->si_fd, PERF_EVENT_IOC_RESET, 0);
    ioctl(siginfo->si_fd, PERF_EVENT_IOC_REFRESH, 1);
}

事实上,该过程完全不需要 mmap 中的数据。

我们仅仅是通过 perf_event_open 为一个 event 设置一个溢出时候的信号处理。

获取到信号的时候再去进行堆栈展开。

相比较通过 sighandler 进行处理的形式,通过 poll 轮询文件,从 mmap 中进行读取更为有效。

stackoverflow.com/questions/7…

package main

import (
    "encoding/binary"
    "fmt"
    "os"
    "runtime"
    "sync/atomic"
    "syscall"
    "unsafe"

    "golang.org/x/sys/unix"
)

// main opens a software CPU-clock perf event for every CPU, mmaps each
// event's ring buffer, and polls it for records, printing the sampled
// data as it arrives.
//
// NOTE(review): the inner `for {}` loop never terminates, so only the
// event for the first CPU is ever drained; the remaining CPUs and the
// final Println are unreachable. Kept as-is to preserve the example's
// shape, but a real profiler would poll all fds together.
func main() {
	// Output file for recorded data (currently created but never written).
	file, err := os.Create("perf.data")
	if err != nil {
		fmt.Printf("Error creating perf.data: %v\n", err)
		return
	}
	defer file.Close()

	// Sample the software CPU-clock event at 100 Hz (PerfBitFreq makes
	// Sample a frequency rather than a period), recording only the IP.
	attr := unix.PerfEventAttr{
		Type:        unix.PERF_TYPE_SOFTWARE,
		Config:      unix.PERF_COUNT_SW_CPU_CLOCK,
		Size:        uint32(unsafe.Sizeof(unix.PerfEventAttr{})),
		Sample:      100,
		Bits:        unix.PerfBitFreq | unix.PerfBitComm,
		Sample_type: unix.PERF_SAMPLE_IP,
		Wakeup:      1, // deliver a poll wakeup after every event
	}

	// Open one perf event per CPU (pid == -1: all processes on that CPU).
	for i := 0; i < runtime.NumCPU(); i++ {
		// Fix: pass the loop CPU index. The original hard-coded CPU 1,
		// so every iteration observed the same CPU.
		fd, err := unix.PerfEventOpen(&attr, -1, i, -1, unix.PERF_FLAG_FD_CLOEXEC)
		if err != nil {
			fmt.Printf("Error opening perf event for CPU %d: %v\n", i, err)
			return
		}

		// Enable the counter for the event.
		// Fix: report errno; `err` here is the nil error from
		// PerfEventOpen above, so the original message printed "<nil>".
		if _, _, errno := syscall.Syscall(unix.SYS_IOCTL, uintptr(fd), unix.PERF_EVENT_IOC_ENABLE, 0); errno != 0 {
			fmt.Printf("Error enabling perf event for CPU %d: %v\n", i, errno)
			return
		}

		// Map the metadata page plus a 128-page (power-of-two) data ring.
		ring, err := unix.Mmap(fd, 0, (1+128)*unix.Getpagesize(), unix.PROT_READ|unix.PROT_WRITE, unix.MAP_SHARED)
		if err != nil {
			panic(err)
		}

		meta := (*unix.PerfEventMmapPage)(unsafe.Pointer(&ring[0]))
		fmt.Println(meta.Data_offset, meta.Data_size)

		ringdata := ring[meta.Data_offset:]

		for {
			fds := []unix.PollFd{{Fd: int32(fd), Events: unix.POLLIN}}
			_, err = unix.Ppoll(fds, nil, nil)
			if err != nil {
				panic(err)
			}
			fmt.Println("receive")
			if fds[0].Revents&unix.POLLIN != 0 {
				fmt.Println("read data")
			}

			tail := atomic.LoadUint64(&meta.Data_tail)
			fmt.Println(meta.Data_tail)
			// Head and tail values only ever grow, so we must take their value
			// modulo the size of the data segment of the ring.
			start := tail % uint64(len(ringdata))
			header := *(*RecordHeader)(unsafe.Pointer(&ringdata[start]))
			end := (tail + uint64(header.Size)) % uint64(len(ringdata))

			// NOTE(review): a record that wraps past the end of the ring
			// makes end < start, and this slice expression panics; a
			// robust reader must copy the two halves separately.
			rawData := ringdata[start:end][unsafe.Sizeof(RecordHeader{}):]

			// Notify the kernel of the last record we've seen.
			atomic.AddUint64(&meta.Data_tail, uint64(header.Size))

			// For PERF_RECORD_SAMPLE with PERF_SAMPLE_IP, the first u64
			// of the payload is the sampled instruction pointer.
			fmt.Println(fmt.Sprintf("%x", binary.LittleEndian.Uint64(rawData)), header.Type, rawData[0])
		}
	}

	fmt.Println("Perf data recorded.")
}

// RecordHeader mirrors the kernel's struct perf_event_header
// (type, misc, size) that prefixes every record in the mmap ring.
type RecordHeader struct {
    Type uint32
    Misc uint16
    Size uint16
}

perf record 在不同 linux 机器上有不同的版本,如果直接使用 perf_event_open 的话,也许能做到一些低版本做不到的事情:

  • 过滤 cpuidle_enter_state 的数据。
  • mmap-flush.一次从 mmap 读一批数据并 process,这个有点难理解,看样子是减少了最终 write 的次数。lore.kernel.org/lkml/201904…
  • 只记录 ip,callchain 等数据, 不获取 timestamp 等数据。
  • --no-buffering 表示设置 wakeup_events 为 1,poll ready 十分频繁。
  • aio,异步写入数据,避免在多 core cpu 机器上丢失跟踪数据。lore.kernel.org/all/e045b4a…
  • –proc-map,perf 每一次运行都会读取所有的进程 mmap 以及新启动进程的 mmap。这些操作其实可以 cache 在应用程序中以避免,但同时也需要方法去监听新 mmap 事件的产生。

perf_event_open 最终使用 local_clock() 函数去获取软件的时间:stackoverflow.com/questions/2…