timer_create
通过 timer_create,我们可以创建一个定时器,根据每个线程消耗的 cpu 时间,发送 signal 到对应的线程去处理,默认这个信号是 SIGPROF。
正常的使用流程是先通过 timer_create 获取 timer,再通过 timer_settime 去设置 timer 的 interval。
// Create a per-thread POSIX timer that measures this thread's own CPU time
// (CLOCK_THREAD_CPUTIME_ID) and delivers SIGPROF directly to the kernel
// thread backing this M (SIGEV_THREAD_ID).
var timerid int32
var sevp sigevent
sevp.notify = _SIGEV_THREAD_ID
sevp.signo = _SIGPROF
// mp.procid is the OS thread id of this M — the signal target.
sevp.sigev_notify_thread_id = int32(mp.procid)
ret := timer_create(_CLOCK_THREAD_CPUTIME_ID, &sevp, &timerid)
if ret != 0 {
	// If we cannot create a timer for this M, leave profileTimerValid false
	// to fall back to the process-wide setitimer profiler.
	return
}
// Arm the timer with the initial expiry and repeat interval carried in spec.
ret = timer_settime(timerid, 0, spec, nil)
在 go 里头,由于有 goroutine 的存在,因此如何为每个线程去开启采样也是比较麻烦的。
当前是在 execute 函数中 hook,当每次调度 goroutine 到新的 m 上去运行的时候,就会为当前线程设置。
// execute runs gp on the current M: binds gp to the M, transitions it from
// _Grunnable to _Grunning, re-arms the per-thread CPU profiler if the
// requested rate changed, emits trace events, and finally jumps into gp's
// saved context via gogo — control does not return through this function.
func execute(gp *g, inheritTime bool) {
	mp := getg().m

	if goroutineProfile.active {
		// Make sure that gp has had its stack written out to the goroutine
		// profile, exactly as it was when the goroutine profiler first stopped
		// the world.
		tryRecordGoroutineProfile(gp, osyield)
	}

	// Assign gp.m before entering _Grunning so running Gs have an
	// M.
	mp.curg = gp
	gp.m = mp
	casgstatus(gp, _Grunnable, _Grunning)
	gp.waitsince = 0
	gp.preempt = false
	gp.stackguard0 = gp.stack.lo + stackGuard
	if !inheritTime {
		// Fresh time slice: account one scheduling tick on the P.
		mp.p.ptr().schedtick++
	}

	// Check whether the profiler needs to be turned on or off.
	// This is the per-thread profiling hook: whenever a goroutine is scheduled
	// onto an M whose profiling rate is stale, the M's timer is reconfigured.
	hz := sched.profilehz
	if mp.profilehz != hz {
		setThreadCPUProfiler(hz)
	}

	trace := traceAcquire()
	if trace.ok() {
		// GoSysExit has to happen when we have a P, but before GoStart.
		// So we emit it here.
		if !goexperiment.ExecTracer2 && gp.syscallsp != 0 {
			trace.GoSysExit(true)
		}
		trace.GoStart()
		traceRelease(trace)
	}

	gogo(&gp.sched)
}
当接收到信号的时候,可以通过 ucontext 访问中断时刻的寄存器以及堆栈信息。
signalHandler(int signo, siginfo_t* siginfo, void* ucontext)
在 java 中,语言没有内置,而是通过一个外部的线程去为其他的线程设置。
在 cpuEngine 中。 首先找到当前的所有线程:
int CpuEngine::createForAllThreads() {
int result = EPERM;
ThreadList* thread_list = OS::listThreads();
for (int tid; (tid = thread_list->next()) != -1; ) {
int err = createForThread(tid);
if (isResourceLimit(err)) {
result = err;
break;
} else if (result != 0) {
result = err;
}
}
delete thread_list;
return result;
}
同时,还要添加一个 thread_hook,为新启动的线程 create_timer:
bool CpuEngine::setupThreadHook() {
if (_pthread_entry != NULL) {
return true;
}
if (!VM::loaded()) {
static void* dummy_pthread_entry;
_pthread_entry = &dummy_pthread_entry;
return true;
}
// Depending on Zing version, pthread_setspecific is called either from libazsys.so or from libjvm.so
if (VM::isZing()) {
CodeCache* libazsys = Profiler::instance()->findLibraryByName("libazsys");
if (libazsys != NULL && (_pthread_entry = libazsys->findImport(im_pthread_setspecific)) != NULL) {
return true;
}
}
CodeCache* lib = Profiler::instance()->findJvmLibrary("libj9thr");
return lib != NULL && (_pthread_entry = lib->findImport(im_pthread_setspecific)) != NULL;
}
添加 pthread_setspecific_hook 在 pthread_setspecific 上:
// Intercept thread creation/termination by patching libjvm's GOT entry for
// pthread_setspecific(). HotSpot stores the VMThread pointer into TLS when a
// thread starts and resets it to NULL when the thread ends, so watching this
// one key reveals both events.
static int pthread_setspecific_hook(pthread_key_t key, const void* value) {
    // Any key other than the VM thread key: plain pass-through.
    if (key != VMThread::key()) {
        return pthread_setspecific(key, value);
    }

    // Re-storing the current value changes nothing; report success directly.
    if (pthread_getspecific(key) == value) {
        return 0;
    }

    if (value == NULL) {
        // TLS being cleared: thread is terminating. Notify before the store.
        CpuEngine::onThreadEnd();
        return pthread_setspecific(key, value);
    }

    // New non-NULL value: a thread just started. Store TLS first, then notify.
    int rc = pthread_setspecific(key, value);
    CpuEngine::onThreadStart();
    return rc;
}
然后该函数会在 perf_event 中 以及 ctimer_linux 中使用。
ctimer 中为每个线程设置 timer 的函数:
为每个线程设置 timer
// Create and arm a CPU-time timer for one thread. Returns 0 on success, -1 on
// failure (tid out of range, timer_create failure, or a concurrent caller
// already installed a timer into this thread's slot).
int CTimer::createForThread(int tid) {
    // _timers is a flat array indexed by tid and sized from pid_max; a tid
    // beyond it cannot be tracked.
    if (tid >= _max_timers) {
        Log::warn("tid[%d] > pid_max[%d]. Restart profiler after changing pid_max", tid, _max_timers);
        return -1;
    }

    struct sigevent sev;
    sev.sigev_value.sival_ptr = NULL;
    sev.sigev_signo = _signal;
    sev.sigev_notify = SIGEV_THREAD_ID;
    // The kernel ABI expects the target thread id in the word that follows
    // sigev_notify (sigev_notify_thread_id); glibc's struct does not expose
    // that field portably, hence this layout cast.
    ((int*)&sev.sigev_notify)[1] = tid;

    // Use raw syscalls, since libc wrapper allows only predefined clocks
    clockid_t clock = thread_cpu_clock(tid);
    int timer;
    if (syscall(__NR_timer_create, clock, &sev, &timer) < 0) {
        return -1;
    }

    // Kernel timer ID may start with zero, but we use zero as an empty slot
    // — store timer + 1 so an occupied slot is always non-zero.
    if (!__sync_bool_compare_and_swap(&_timers[tid], 0, timer + 1)) {
        // Lost race: another caller registered this tid first; undo ours.
        syscall(__NR_timer_delete, timer);
        return -1;
    }

    // Arm the timer: first expiry and period are both _interval nanoseconds.
    struct itimerspec ts;
    ts.it_interval.tv_sec = (time_t)(_interval / 1000000000);
    ts.it_interval.tv_nsec = _interval % 1000000000;
    ts.it_value = ts.it_interval;
    syscall(__NR_timer_settime, timer, 0, &ts, NULL);
    return 0;
}
timer_settime 需要设置一个结构体:
其中, it_value 表示在系统调用后被唤醒的时间,而 it_interval 表示唤醒的间隔。
// itimerspec mirrors the kernel's struct itimerspec passed to timer_settime.
type itimerspec struct {
	it_interval timespec // period between expirations; zero makes the timer one-shot
	it_value    timespec // delay until the first expiration; zero disarms the timer
}
假设我们想要 100 Hz(每秒 100 次)的采样,那么在 java 里头,我们就会每隔 10ms 唤醒一次,并且下一次唤醒时间在 10ms 之后。
但是,在 go 里头,做了一些优化:
基本上来说,启动时间随机化。
// The period of the timer should be 1/Hz. For every "1/Hz" of additional
// work, the user should expect one additional sample in the profile.
//
// But to scale down to very small amounts of application work, to observe
// even CPU usage of "one tenth" of the requested period, set the initial
// timing delay in a different way: So that "one tenth" of a period of CPU
// spend shows up as a 10% chance of one sample (for an expected value of
// 0.1 samples), and so that "two and six tenths" periods of CPU spend show
// up as a 60% chance of 3 samples and a 40% chance of 2 samples (for an
// expected value of 2.6). Set the initial delay to a value in the uniform
// random distribution between 0 and the desired period. And because "0"
// means "disable timer", add 1 so the half-open interval [0,period) turns
// into (0,period].
//
// Otherwise, this would show up as a bias away from short-lived threads and
// from threads that are only occasionally active: for example, when the
// garbage collector runs on a mostly-idle system, the additional threads it
// activates may do a couple milliseconds of GC-related work and nothing
// else in the few seconds that the profiler observes.
//
// First expiry randomized within (0, period]; then tick every 1/hz seconds.
spec := new(itimerspec)
spec.it_value.setNsec(1 + int64(fastrandn(uint32(1e9/hz))))
spec.it_interval.setNsec(1e9 / int64(hz))
itimer
通过 setitimer 系统调用,可以每经过一段物理时间,向进程发送一次信号,然后进程收到信号后去处理。
这是在 go 1.18 以前的性能分析方式,async profiler 同样也是如此:
// Enable the Go signal handler if not enabled.
if atomic.Cas(&handlingSig[_SIGPROF], 0, 1) {
	h := getsig(_SIGPROF)
	// If no signal handler was installed before, then we record
	// _SIG_IGN here. When we turn off profiling (below) we'll start
	// ignoring SIGPROF signals. We do this, rather than change
	// to SIG_DFL, because there may be a pending SIGPROF
	// signal that has not yet been delivered to some other thread.
	// If we change to SIG_DFL when turning off profiling, the
	// program will crash when that SIGPROF is delivered. We assume
	// that programs that use profiling don't want to crash on a
	// stray SIGPROF. See issue 19320.
	// We do the change here instead of when turning off profiling,
	// because there we may race with a signal handler running
	// concurrently, in particular, sigfwdgo may observe _SIG_DFL and
	// die. See issue 43828.
	if h == _SIG_DFL {
		h = _SIG_IGN
	}
	atomic.Storeuintptr(&fwdSig[_SIGPROF], h)
	setsig(_SIGPROF, abi.FuncPCABIInternal(sighandler))
}

// Arm the process-wide profiling timer: the kernel delivers SIGPROF roughly
// every 1/hz of CPU time consumed by the whole process.
var it itimerval
it.it_interval.tv_sec = 0
it.it_interval.set_usec(1000000 / hz)
it.it_value = it.it_interval
setitimer(_ITIMER_PROF, &it, nil)
而在 datadog python 中,情况又不太一样,
python 会有一个单独的 collector。每隔一段时间获取所有运行的 threadid,然后通过系统调用去获取 threadid 的 cpu 使用时间。
为了避免收集线程数过多,python 会限制 collect 运行的 wall time, 不超过百分之多少的系统运行时间。
perf event open
为每个线程设置 perf_event_open:
通过 perf_event_open 打开 fd。
通过 mmap open 读取文件缓冲区的文件。
通过 fcntl(fd, F_SETSIG, _signal), 设置 fd 准备好的时候所发送的信号。
设置 wakeup_events 为 1,每收集 1 个 event,即产生信号。
注册信号回调函数,通过 sighandler 记录函数栈。
结束后通过 ioctl 去刷新 fd。
// Handler for the signal raised when this thread's perf_event fd overflows
// (the fd was configured via fcntl(F_SETSIG) with wakeup_events = 1).
void PerfEvents::signalHandler(int signo, siginfo_t* siginfo, void* ucontext) {
    // si_code > 0 means kernel-generated (fd overflow); si_code <= 0 means a
    // user-sent signal (kill/sigqueue), which must not be sampled.
    if (siginfo->si_code <= 0) {
        // Looks like an external signal; don't treat as a profiling event
        return;
    }

    if (_enabled) {
        u64 counter = readCounter(siginfo, ucontext);
        ExecutionEvent event;
        Profiler::instance()->recordSample(ucontext, counter, PERF_SAMPLE, &event);
    } else {
        // Profiling currently paused: just drain this thread's ring buffer.
        resetBuffer(OS::threadId());
    }

    // Clear the overflow condition, then re-arm the event for one more
    // overflow so the fd keeps signaling after each sample.
    ioctl(siginfo->si_fd, PERF_EVENT_IOC_RESET, 0);
    ioctl(siginfo->si_fd, PERF_EVENT_IOC_REFRESH, 1);
}
事实上,该过程完全不需要 mmap 中的数据。
我们仅仅是通过 perf_event_open 为一个 event 设置一个溢出时候的信号处理。
获取到信号的时候再去进行堆栈展开。
相比较通过 sighandler 进行处理的形式,通过 poll 轮询文件,从 mmap 中进行读取更为有效。
stackoverflow.com/questions/7…
package main
import (
"encoding/binary"
"fmt"
"os"
"runtime"
"sync/atomic"
"syscall"
"unsafe"
"golang.org/x/sys/unix"
)
func main() {
// Open output file
file, err := os.Create("perf.data")
if err != nil {
fmt.Printf("Error creating perf.data: %v\n", err)
return
}
defer file.Close()
// Define perf event attributes
attr := unix.PerfEventAttr{
Type: unix.PERF_TYPE_SOFTWARE,
Config: unix.PERF_COUNT_SW_CPU_CLOCK,
Size: uint32(unsafe.Sizeof(unix.PerfEventAttr{})),
Sample: 100,
Bits: unix.PerfBitFreq | unix.PerfBitComm,
Sample_type: unix.PERF_SAMPLE_IP,
Wakeup: 1,
}
// Open perf event for each CPU
for i := 0; i < runtime.NumCPU(); i++ { // Adjust the CPU range as needed
fd, err := unix.PerfEventOpen(&attr, -1, 1, -1, unix.PERF_FLAG_FD_CLOEXEC)
if err != nil {
fmt.Printf("Error opening perf event for CPU %d: %v\n", i, err)
return
}
// Enable the counter for the event
if _, _, errno := syscall.Syscall(unix.SYS_IOCTL, uintptr(fd), unix.PERF_EVENT_IOC_ENABLE, 0); errno != 0 {
fmt.Printf("Error enabling perf event for CPU %d: %v\n", i, err)
return
}
ring, err := unix.Mmap(fd, 0, (1+128)*unix.Getpagesize(), unix.PROT_READ|unix.PROT_WRITE, unix.MAP_SHARED)
if err != nil {
panic(err)
}
meta := (*unix.PerfEventMmapPage)(unsafe.Pointer(&ring[0]))
fmt.Println(meta.Data_offset, meta.Data_size)
ringdata := ring[meta.Data_offset:]
for {
fds := []unix.PollFd{{Fd: int32(fd), Events: unix.POLLIN}}
_, err = unix.Ppoll(fds, nil, nil)
if err != nil {
panic(err)
}
fmt.Println("receive")
if fds[0].Revents&unix.POLLIN != 0 {
fmt.Println("read data")
}
tail := atomic.LoadUint64(&meta.Data_tail)
fmt.Println(meta.Data_tail)
// Head and tail values only ever grow, so we must take their value
// modulo the size of the data segment of the ring.
start := tail % uint64(len(ringdata))
header := *(*RecordHeader)(unsafe.Pointer(&ringdata[start]))
end := (tail + uint64(header.Size)) % uint64(len(ringdata))
rawData := ringdata[start:end][unsafe.Sizeof(RecordHeader{}):]
// Notify the kernel of the last record we've seen.
atomic.AddUint64(&meta.Data_tail, uint64(header.Size))
// PERF_RECORDTYPE_SAMPLE
fmt.Println(fmt.Sprintf("%x", binary.LittleEndian.Uint64(rawData)), header.Type, rawData[0])
}
}
fmt.Println("Perf data recorded.")
}
// RecordHeader mirrors struct perf_event_header, which prefixes every record
// in the perf mmap ring buffer.
type RecordHeader struct {
	Type uint32 // PERF_RECORD_* record kind
	Misc uint16 // misc flag bits
	Size uint16 // total record size in bytes, header included
}
perf record 在不同 linux 机器上有不同的版本,如果直接使用 perf_event_open 的话,也许能做到一些低版本做不到的事情:
- 过滤 cpuidle_enter_state 的数据。
- mmap-flush.一次从 mmap 读一批数据并 process,这个有点难理解,看样子是减少了最终 write 的次数。lore.kernel.org/lkml/201904…
- 只记录 ip,callchain 等数据, 不获取 timestamp 等数据。
- --no-buffering 表示设置 wakeup_events 为 1,poll ready 十分频繁。
- aio,异步写入数据,避免在多 core cpu 机器上丢失跟踪数据。lore.kernel.org/all/e045b4a…
- –proc-map,perf 每一次运行都会读取所有的进程 mmap 以及新启动进程的 mmap。这些操作其实可以 cache 在应用程序中以避免,但同时也需要方法去监听新 mmap 事件的产生。
perf_event_open 最终使用 local_clock() 函数去获取软件的时间:stackoverflow.com/questions/2…