async-profiler implementation


wall

WallClock obtains call stacks by sending a signal to each profiled thread.
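A minimal sketch of the receiving side (hedged: the names follow the surrounding code, but the exact body is an assumption): each signaled thread runs the handler on its own stack and records a sample from the interrupted context.

void WallClock::signalHandler(int signo, siginfo_t* siginfo, void* ucontext) {
    ExecutionEvent event;                        // assumed event type
    event._thread_state = _sample_idle_threads ? getThreadState(ucontext)
                                               : THREAD_RUNNING;
    // recordSample walks the stack of the interrupted thread
    Profiler::instance()->recordSample(ucontext, _interval, EXECUTION_SAMPLE, &event);
}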

Each tick samples at most 8 threads, to keep the overhead down for services with many threads.

_interval controls the sleep time when only running threads are sampled.

When sleeping (idle) threads are sampled as well, _interval is adjusted according to the thread count so that sampling is spread out evenly. (For example, with a 100 ms interval and 80 threads in total, the loop wakes up every 10 ms and signals 8 threads each time.)
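A sketch of adjustInterval consistent with that description (close to, but not guaranteed to be, the actual source): divide the configured interval by the number of ticks needed to cover all threads, THREADS_PER_TICK at a time.

long WallClock::adjustInterval(long interval, int thread_count) {
    if (thread_count > THREADS_PER_TICK) {
        // e.g. 100 ms interval with 80 threads: 80 / 8 = 10 ticks,
        // so wake up every 10 ms and signal 8 threads per tick
        interval /= (thread_count + THREADS_PER_TICK - 1) / THREADS_PER_TICK;
    }
    return interval;
}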

// Maximum number of threads sampled in one iteration. This limit serves as a throttle
// when generating profiling signals. Otherwise applications with too many threads may
// suffer from a big profiling overhead. Also, keeping this limit low enough helps
// to avoid contention on a spin lock inside Profiler::recordSample().
const int THREADS_PER_TICK = 8;

// Set the hard limit for thread walking interval to 100 microseconds.
// Smaller intervals are practically unusable due to large overhead.
const long MIN_INTERVAL = 100000;


void WallClock::timerLoop() {
    int self = OS::threadId();
    ThreadFilter* thread_filter = Profiler::instance()->threadFilter();
    bool thread_filter_enabled = thread_filter->enabled();
    bool sample_idle_threads = _sample_idle_threads;

    ThreadList* thread_list = OS::listThreads();
    long long next_cycle_time = OS::nanotime();

    while (_running) {
        if (!_enabled) {
            OS::sleep(_interval);
            continue;
        }

        if (sample_idle_threads) {
            // Try to keep the wall clock interval stable, regardless of the number of profiled threads
            int estimated_thread_count = thread_filter_enabled ? thread_filter->size() : thread_list->size();
            next_cycle_time += adjustInterval(_interval, estimated_thread_count);
        }

        for (int count = 0; count < THREADS_PER_TICK; ) {
            int thread_id = thread_list->next();
            if (thread_id == -1) {
                thread_list->rewind();
                break;
            }

            if (thread_id == self || (thread_filter_enabled && !thread_filter->accept(thread_id))) {
                continue;
            }

            if (sample_idle_threads || OS::threadState(thread_id) == THREAD_RUNNING) {
                if (OS::sendSignalToThread(thread_id, _signal)) {
                    count++;
                }
            }
        }

        if (sample_idle_threads) {
            long long current_time = OS::nanotime();
            if (next_cycle_time - current_time > MIN_INTERVAL) {
                OS::sleep(next_cycle_time - current_time);
            } else {
                next_cycle_time = current_time + MIN_INTERVAL;
                OS::sleep(MIN_INTERVAL);
            }
        } else {
            OS::sleep(_interval);
        }
    }

    delete thread_list;
}

Whether a thread is in the THREAD_SLEEPING or THREAD_RUNNING state is determined from the PC address.

ThreadState WallClock::getThreadState(void* ucontext) {
    StackFrame frame(ucontext);
    uintptr_t pc = frame.pc();

    // Consider a thread sleeping, if it has been interrupted in the middle of syscall execution,
    // either when PC points to the syscall instruction, or if syscall has just returned with EINTR
    if (StackFrame::isSyscall((instruction_t*)pc)) {
        return THREAD_SLEEPING;
    }

    // Make sure the previous instruction address is readable
    uintptr_t prev_pc = pc - SYSCALL_SIZE;
    if ((pc & 0xfff) >= SYSCALL_SIZE || Profiler::instance()->findLibraryByAddress((instruction_t*)prev_pc) != NULL) {
        if (StackFrame::isSyscall((instruction_t*)prev_pc) && frame.checkInterruptedSyscall()) {
            return THREAD_SLEEPING;
        }
    }

    return THREAD_RUNNING;
}
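For illustration, on x86_64 the StackFrame::isSyscall check can be as simple as matching the 2-byte syscall opcode (a hypothetical sketch; the real code is per-architecture):

static inline bool isSyscallInstruction(const unsigned char* pc) {
    return pc[0] == 0x0f && pc[1] == 0x05;  // x86_64 "syscall" encoding
}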

mutex

The mutex (lock) profiler works by subscribing to two JVMTI events: JVMTI_EVENT_MONITOR_CONTENDED_ENTER and JVMTI_EVENT_MONITOR_CONTENDED_ENTERED.

It also registers a park hook to record functions that acquire java.util.concurrent locks such as ReentrantLock, ReentrantReadWriteLock, and Semaphore.

Error LockTracer::start(Arguments& args) {
    _ticks_to_nanos = 1e9 / TSC::frequency();
    _threshold = (jlong)(args._lock * (TSC::frequency() / 1e9));

    if (!_initialized) {
        initialize();
    }

    // Enable Java Monitor events
    jvmtiEnv* jvmti = VM::jvmti();
    jvmti->SetEventNotificationMode(JVMTI_ENABLE, JVMTI_EVENT_MONITOR_CONTENDED_ENTER, NULL);
    jvmti->SetEventNotificationMode(JVMTI_ENABLE, JVMTI_EVENT_MONITOR_CONTENDED_ENTERED, NULL);
    _start_time = TSC::ticks();

    // Intercept Unsafe.park() for tracing contended ReentrantLocks
    if (_orig_Unsafe_park != NULL) {
        bindUnsafePark(UnsafeParkHook);
    }

    return Error::OK;
}

bool LockTracer::isConcurrentLock(const char* lock_name) {
    // Do not count synchronizers other than ReentrantLock, ReentrantReadWriteLock and Semaphore
    return strncmp(lock_name, "Ljava/util/concurrent/locks/ReentrantLock", 41) == 0 ||
           strncmp(lock_name, "Ljava/util/concurrent/locks/ReentrantReadWriteLock", 50) == 0 ||
           strncmp(lock_name, "Ljava/util/concurrent/Semaphore", 31) == 0;
}

_threshold is not passed down to the JVM; instead, it is applied after the JVMTI_EVENT_MONITOR_CONTENDED_ENTERED event is received, to decide whether the contention is worth recording.
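A hedged sketch of that check (setEnterTime/getEnterTime and recordContendedLock are illustrative names, not the actual helpers): ENTER stamps when contention begins, ENTERED measures how long it lasted and only records a sample if the wait exceeded _threshold ticks.

void JNICALL LockTracer::MonitorContendedEnter(jvmtiEnv* jvmti, JNIEnv* env,
                                               jthread thread, jobject object) {
    setEnterTime(jvmti, thread, TSC::ticks());          // assumed helper
}

void JNICALL LockTracer::MonitorContendedEntered(jvmtiEnv* jvmti, JNIEnv* env,
                                                 jthread thread, jobject object) {
    jlong wait_ticks = TSC::ticks() - getEnterTime(jvmti, thread);
    if (wait_ticks >= _threshold) {
        recordContendedLock(env, object, wait_ticks);   // assumed recording helper
    }
}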

heap

  • Allocation inside a TLAB:

    • the current TLAB still has room (fast path);
    • the TLAB is too small, so a new TLAB is created and filled.
  • Allocation outside a TLAB.

HeapWord* MemAllocator::mem_allocate(Allocation& allocation) const {
  if (UseTLAB) {
    HeapWord* result = allocate_inside_tlab(allocation);
    if (result != NULL) {
      return result;
    }
  }

  return allocate_outside_tlab(allocation);
}

HeapWord* MemAllocator::allocate_inside_tlab(Allocation& allocation) const {
  assert(UseTLAB, "should use UseTLAB");

  // Try allocating from an existing TLAB.
  HeapWord* mem = _thread->tlab().allocate(_word_size);
  if (mem != NULL) {
    return mem;
  }

  // Try refilling the TLAB and allocating the object in it.
  return allocate_inside_tlab_slow(allocation);
}

Two kinds of callback events are received; see notify_allocation_jfr_sampler below.

size_in_bytes is the size of the object being allocated.

In other words, unlike Go, which records one sample per fixed 512 KB of allocation, these two events fire on every new-TLAB or outside-TLAB allocation, so the overall allocation picture can be recovered from them.

(Allocations that fit into an existing, sufficiently large TLAB are never recorded at all, which is presumably why most recorded allocations cluster at program startup.)

void MemAllocator::Allocation::notify_allocation_jfr_sampler() {
  HeapWord* mem = (HeapWord*)obj();
  size_t size_in_bytes = _allocator._word_size * HeapWordSize;

  if (_allocated_outside_tlab) {
    AllocTracer::send_allocation_outside_tlab(_allocator._klass, mem, size_in_bytes, _thread);
  } else if (_allocated_tlab_size != 0) {
    // TLAB was refilled
    AllocTracer::send_allocation_in_new_tlab(_allocator._klass, mem, _allocated_tlab_size * HeapWordSize,
                                             size_in_bytes, _thread);
  }
}

For send_allocation_in_new_tlab, total_size is the TLAB size.

For send_allocation_outside_tlab_event, total_size is alloc_size.
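How the "breakpoint trap" used by the handler below gets armed is, roughly, a code patch at the entry of those two VM functions (a hypothetical sketch; _entry, _saved_insn, and BREAKPOINT are illustrative names for the patch target, the saved instruction, and the trap opcode):

#include <sys/mman.h>

bool Trap::install() {
    // make the page containing the function entry writable
    uintptr_t page_start = _entry & ~(uintptr_t)0xfff;
    if (mprotect((void*)page_start, 0x1000, PROT_READ | PROT_WRITE | PROT_EXEC) != 0) {
        return false;
    }
    _saved_insn = *(instruction_t*)_entry;   // keep the original instruction
    *(instruction_t*)_entry = BREAKPOINT;    // e.g. 0xCC (int3) on x86
    return true;
}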

// Called whenever our breakpoint trap is hit
void AllocTracer::trapHandler(int signo, siginfo_t* siginfo, void* ucontext) {
    StackFrame frame(ucontext);
    EventType event_type;
    uintptr_t total_size;
    uintptr_t instance_size;

    // PC points either to BREAKPOINT instruction or to the next one
    if (_in_new_tlab.covers(frame.pc())) {
        // send_allocation_in_new_tlab(Klass* klass, HeapWord* obj, size_t tlab_size, size_t alloc_size, Thread* thread)
        // send_allocation_in_new_tlab_event(KlassHandle klass, size_t tlab_size, size_t alloc_size)
        event_type = ALLOC_SAMPLE;
        total_size = _trap_kind == 1 ? frame.arg2() : frame.arg1();
        instance_size = _trap_kind == 1 ? frame.arg3() : frame.arg2();
    } else if (_outside_tlab.covers(frame.pc())) {
        // send_allocation_outside_tlab(Klass* klass, HeapWord* obj, size_t alloc_size, Thread* thread)
        // send_allocation_outside_tlab_event(KlassHandle klass, size_t alloc_size);
        event_type = ALLOC_OUTSIDE_TLAB;
        total_size = _trap_kind == 1 ? frame.arg2() : frame.arg1();
        instance_size = 0;
    } else {
        // Not our trap
        Profiler::instance()->trapHandler(signo, siginfo, ucontext);
        return;
    }

    // Leave the trapped function by simulating "ret" instruction
    uintptr_t klass = frame.arg0();
    frame.ret();

    if (_enabled && updateCounter(_allocated_bytes, total_size, _interval)) {
        recordAllocation(ucontext, event_type, klass, total_size, instance_size);
    }
}
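updateCounter in the handler above implements interval-based sampling. A sketch with assumed semantics: atomically accumulate allocated bytes and report true whenever the running total crosses a multiple of _interval.

typedef unsigned long long u64;

static bool updateCounter(volatile u64& counter, u64 value, u64 interval) {
    if (interval <= 1) {
        return true;                          // sample every allocation
    }
    while (true) {
        u64 prev = counter;
        u64 next = prev + value;
        if (__sync_bool_compare_and_swap(&counter, prev, next)) {
            // record a sample only when an interval boundary is crossed
            return next / interval != prev / interval;
        }
    }
}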

for JDK 11+

--alloc uses the SetHeapSamplingInterval function to set an allocation sampling interval (e.g. every 512 KB allocated) and record one sample per interval (the sample is only reported when the allocation refills a new TLAB or happens outside one).
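A minimal sketch of that setup using the standard JVMTI 11 API (the capability and event names are part of JVMTI; the surrounding wiring is assumed):

jvmtiEnv* jvmti = VM::jvmti();
jvmtiCapabilities caps = {0};
caps.can_generate_sampled_object_alloc_events = 1;
jvmti->AddCapabilities(&caps);
jvmti->SetHeapSamplingInterval(512 * 1024);   // sample roughly every 512 KB allocated
jvmti->SetEventNotificationMode(JVMTI_ENABLE, JVMTI_EVENT_SAMPLED_OBJECT_ALLOC, NULL);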

In JDK 11+:

  • outside a TLAB, the allocation size is simply the size of the object;
  • inside a TLAB, the allocation size is the number of bytes the TLAB has filled since the last sample point, plus the size of the object being allocated.

void MemAllocator::Allocation::notify_allocation_jvmti_sampler() {
  // support for JVMTI VMObjectAlloc event (no-op if not enabled)
  JvmtiExport::vm_object_alloc_event_collector(obj());

  if (!ThreadHeapSampler::enabled()) {
    // Sampling disabled
    return;
  }

  if (!_allocated_outside_tlab && _allocated_tlab_size == 0 && !_tlab_end_reset_for_sample) {
    // Sample if it's a non-TLAB allocation, or a TLAB allocation that either refills the TLAB
    // or expands it due to taking a sampler induced slow path.
    return;
  }

  assert(JavaThread::current()->heap_sampler().add_sampling_collector(),
         "Should never return false.");

  // Only check if the sampler could actually sample something in this path.
  assert(!JvmtiExport::should_post_sampled_object_alloc() ||
         !JvmtiSampledObjectAllocEventCollector::object_alloc_is_safe_to_sample() ||
         _thread->heap_sampler().sampling_collector_present(),
         "Sampling collector not present.");

  if (JvmtiExport::should_post_sampled_object_alloc()) {
    // If we want to be sampling, protect the allocated object with a Handle
    // before doing the callback. The callback is done in the destructor of
    // the JvmtiSampledObjectAllocEventCollector.
    PreserveObj obj_h(_thread, _obj_ptr);
    JvmtiSampledObjectAllocEventCollector collector;
    size_t size_in_bytes = _allocator._word_size * HeapWordSize;
    ThreadLocalAllocBuffer& tlab = _thread->tlab();
    size_t bytes_since_last = _allocated_outside_tlab ? 0 : tlab.bytes_since_last_sample_point();
    _thread->heap_sampler().check_for_sampling(obj_h(), size_in_bytes, bytes_since_last);
  }

  assert(JavaThread::current()->heap_sampler().remove_sampling_collector(), "Should never return false.");

  if (_tlab_end_reset_for_sample || _allocated_tlab_size != 0) {
    _thread->tlab().set_sample_end();
  }
}
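On the profiler side, the sample arrives via the standard JVMTI SampledObjectAlloc callback (the signature is defined by JVMTI; the body here is an assumed sketch):

void JNICALL SampledObjectAlloc(jvmtiEnv* jvmti, JNIEnv* env, jthread thread,
                                jobject object, jclass object_klass, jlong size) {
    // record an allocation sample tagged with the object's class and size;
    // recordAllocation() stands in for whatever the profiler does next
    recordAllocation(env, object_klass, size);
}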

AsyncGetCallTrace

  • EXECUTION_SAMPLE relies on AsyncGetCallTrace to collect the call stack.
  • memalloc: on HotSpot builds that are neither OpenJ9 nor Zing, an internal JVM function is used (see initJvmFunctions below).
  • everything else uses getJavaTraceJvmti.

In other words, as long as we avoid the cpu and wall event types, AsyncGetCallTrace is never invoked (bugs.openjdk.org/browse/JDK-…), so it cannot possibly crash the JVM.

if (event_type <= EXECUTION_SAMPLE) {
    // Async events
    int java_frames = getJavaTraceAsync(ucontext, frames + num_frames, _max_stack_depth, &java_ctx);
    if (java_frames > 0 && java_ctx.pc != NULL && VMStructs::hasMethodStructs()) {
        NMethod* nmethod = CodeHeap::findNMethod(java_ctx.pc);
        if (nmethod != NULL) {
            fillFrameTypes(frames + num_frames, java_frames, nmethod);
        }
    }
    num_frames += java_frames;
} else if (event_type >= ALLOC_SAMPLE && event_type <= ALLOC_OUTSIDE_TLAB && _alloc_engine == &alloc_tracer) {
    if (VMStructs::_get_stack_trace != NULL) {
        // Object allocation in HotSpot happens at known places where it is safe to call JVM TI,
        // but not directly, since the thread is in_vm rather than in_native
        num_frames += getJavaTraceInternal(jvmti_frames + num_frames, frames + num_frames, _max_stack_depth);
    } else {
        num_frames += getJavaTraceAsync(ucontext, frames + num_frames, _max_stack_depth, &java_ctx);
    }
} else {
    // Lock events and instrumentation events can safely call synchronous JVM TI stack walker.
    // Skip Instrument.recordSample() method
    int start_depth = event_type == INSTRUMENTED_METHOD ? 1 : 0;
    num_frames += getJavaTraceJvmti(jvmti_frames + num_frames, frames + num_frames, start_depth, _max_stack_depth);
}

void VMStructs::initJvmFunctions() {
    if (!VM::isOpenJ9() && !VM::isZing()) {
        _get_stack_trace = (GetStackTraceFunc)_libjvm->findSymbolByPrefix("_ZN8JvmtiEnv13GetStackTraceEP10JavaThreadiiP");
    }
}
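For reference, AsyncGetCallTrace is an unexported HotSpot entry point with this well-known signature (profilers declare it themselves, since no public header ships it):

typedef struct {
    jint lineno;                 // BCI inside the method
    jmethodID method_id;
} ASGCT_CallFrame;

typedef struct {
    JNIEnv* env_id;
    jint num_frames;             // > 0: frames collected; <= 0: error code
    ASGCT_CallFrame* frames;
} ASGCT_CallTrace;

extern "C" void AsyncGetCallTrace(ASGCT_CallTrace* trace, jint depth, void* ucontext);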

stack walker

See the two articles below. Walking the saved frame pointers yields the FP chain; loading fp + 1 (the adjacent stack slot) then yields the chain of return addresses.

github.com/async-profi…

blogs.oracle.com/linux/post/…
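The layout that walkFP below relies on (the conventional x86_64 frame-pointer chain, where FRAME_PC_SLOT is 1):

//   higher addresses
//   +--------------------+
//   | return address     |  <- fp + 1 word (FRAME_PC_SLOT), yields the PC chain
//   +--------------------+
//   | caller's saved fp  |  <- fp, *fp links to the previous frame
//   +--------------------+
//   | locals ...         |
//   lower addresses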

int StackWalker::walkFP(void* ucontext, const void** callchain, int max_depth) {
    const void* pc;
    uintptr_t fp;
    uintptr_t prev_fp = (uintptr_t)&fp;
    uintptr_t bottom = prev_fp + MAX_WALK_SIZE;

    if (ucontext == NULL) {
        pc = __builtin_return_address(0);
        fp = (uintptr_t)__builtin_frame_address(1);
    } else {
        StackFrame frame(ucontext);
        pc = (const void*)frame.pc();
        fp = frame.fp();
    }

    int depth = 0;

    // Walk until the bottom of the stack or until the first Java frame
    while (depth < max_depth && !CodeHeap::contains(pc)) {
        callchain[depth++] = pc;

        // Check if the next frame is below on the current stack
        if (fp <= prev_fp || fp >= prev_fp + MAX_FRAME_SIZE || fp >= bottom) {
            break;
        }

        // Frame pointer must be word aligned
        if ((fp & (sizeof(uintptr_t) - 1)) != 0) {
            break;
        }

        pc = stripPointer(SafeAccess::load((void**)fp + FRAME_PC_SLOT));
        if (pc < (const void*)MIN_VALID_PC || pc > (const void*)-MIN_VALID_PC) {
            break;
        }

        prev_fp = fp;
        fp = *(uintptr_t*)fp;
    }

    return depth;
}
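SafeAccess::load in the loop above is the guarded read: walking a broken FP chain may touch unmapped memory, so the load is kept in a tiny, recognizable function and (by assumption here) the profiler's SIGSEGV handler detects a fault inside it and resumes past it instead of crashing. An illustrative stand-in:

__attribute__((noinline))
void* safe_load(void** ptr) {
    return *ptr;   // may fault; the SIGSEGV handler resumes past this load
}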