eBPF workflow
First, we need to load the compiled BPF program and attach it to the appropriate system call or other kernel hook.
In kernel space, the BPF program stores its data in an in-kernel structure (a BPF map); user space then reads that structure to analyze the data.
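As a rough sketch of this flow, the steps below condense the user-space snippets shown later in this post, using libbpfgo (imported as bpf). The run function and its arguments are made up for illustration; the module, program, and map names come from the code that follows.

func run(bpfObj []byte, perfEventFD int) error {
	// 1. Load the compiled BPF object into the kernel.
	m, err := bpf.NewModuleFromBuffer(bpfObj, "polarsignals")
	if err != nil {
		return err
	}
	defer m.Close()
	if err := m.BPFLoadObject(); err != nil {
		return err
	}
	// 2. Attach the sampling program to a hook point (here, a perf event fd).
	prog, err := m.GetProgram("do_sample")
	if err != nil {
		return err
	}
	if _, err := prog.AttachPerfEvent(perfEventFD); err != nil {
		return err
	}
	// 3. Read back the maps that the kernel-side program fills in.
	counts, err := m.GetMap("counts")
	if err != nil {
		return err
	}
	_ = counts // iterate and decode as shown in the sections below
	return nil
}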
Parca eBPF: initial stack profiling
This is the initial commit, containing only the minimal code needed to run. (github.com/parca-dev/p…)
A single file, polarsignals-agent.bpf.c, holds the BPF-side program.
Getting stack traces
We define a BPF map of type BPF_MAP_TYPE_STACK_TRACE.
The bpf_get_stackid() helper fetches the user- or kernel-space stack trace from ctx and returns a hashed stack trace ID. (ebpf-docs.dylanreimerink.nl/linux/helpe…)
One problem here: if the map is sized too small, hash collisions occur and stack traces are lost. (github.com/bpftrace/bp…)
#include <linux/bpf.h>
#include <linux/perf_event.h>
#include <linux/bpf_perf_event.h>
#include <bpf/bpf_helpers.h>

struct {
  __uint(type, BPF_MAP_TYPE_STACK_TRACE);
  __uint(key_size, sizeof(__u32));
  __uint(value_size, PERF_MAX_STACK_DEPTH * sizeof(__u64));
  __uint(max_entries, 10000);
} stack_traces SEC(".maps");

SEC("perf_event")
int print_stack_ids(struct bpf_perf_event_data *ctx)
{
  char fmt[] = "kern_stack_id=%d user_stack_id=%d\n";
  // bpf_get_stackid returns a negative error code on failure.
  int kern_stack_id = bpf_get_stackid(ctx, &stack_traces, 0);
  int user_stack_id = bpf_get_stackid(ctx, &stack_traces, BPF_F_USER_STACK);

  if (kern_stack_id >= 0 && user_stack_id >= 0)
    bpf_trace_printk(fmt, sizeof(fmt), kern_stack_id, user_stack_id);
  return 0;
}

char _license[] SEC("license") = "GPL";
Getting stack traces and their counts
- A stack_traces map stores the mapping from stack trace ID to stack trace.
- A counts map stores the mapping from {pid, stack trace ID} to a sample count.
// Stack Traces are slightly different
// in that the value is 1 big byte array
// of the stack addresses
#define BPF_STACK_TRACE(_name, _max_entries) \
struct bpf_map_def SEC("maps") _name = { \
.type = BPF_MAP_TYPE_STACK_TRACE, \
.key_size = sizeof(u32), \
.value_size = sizeof(size_t) * MAX_STACK_DEPTH, \
.max_entries = _max_entries, \
};
/*=============================== INTERNAL STRUCTS ===========================*/
typedef struct stack_count_key {
u32 pid;
int user_stack_id;
int kernel_stack_id;
} stack_count_key_t;
static __always_inline void *
bpf_map_lookup_or_try_init(void *map, const void *key, const void *init)
{
void *val;
long err;
val = bpf_map_lookup_elem(map, key);
if (val)
return val;
err = bpf_map_update_elem(map, key, init, BPF_NOEXIST);
// 17 == EEXIST
if (err && err != -17)
return 0;
return bpf_map_lookup_elem(map, key);
}
// This code gets a bit complex. Probably not suitable for casual hacking.
SEC("perf_event")
int do_sample(struct bpf_perf_event_data *ctx) {
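// bpf_get_current_pid_tgid() packs the thread group id (the user-visible PID)
// in the upper 32 bits and the thread id in the lower 32 bits.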
u64 id = bpf_get_current_pid_tgid();
u32 tgid = id >> 32;
u32 pid = id;
if (pid == 0)
return 0;
// create map key
stack_count_key_t key = {.pid = tgid};
// get stacks
key.user_stack_id = bpf_get_stackid(ctx, &stack_traces, BPF_F_USER_STACK);
key.kernel_stack_id = bpf_get_stackid(ctx, &stack_traces, 0);
u64 zero = 0;
u64 *count;
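// counts is a BPF hash map keyed by stack_count_key_t with a u64 count value;
// its definition is omitted from this excerpt.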
count = bpf_map_lookup_or_try_init(&counts, &key, &zero);
if (!count)
return 0;
__sync_fetch_and_add(count, 1);
return 0;
}
User-space handling
Loading the BPF program:
m, err := bpf.NewModuleFromBuffer(bpfObj, "polarsignals")
if err != nil {
return err
}
defer m.Close()
err = m.BPFLoadObject()
if err != nil {
return err
}
Hook the BPF program up to the perf events opened with perf_event_open.
Note: closing the corresponding fd with defer here turns out to be problematic. (github.com/parca-dev/p…)
cpus := runtime.NumCPU()
for i := 0; i < cpus; i++ {
// TODO(branz): Close the returned fd
fd, err := unix.PerfEventOpen(&unix.PerfEventAttr{
Type: unix.PERF_TYPE_SOFTWARE,
Config: unix.PERF_COUNT_SW_CPU_CLOCK,
Size: uint32(unsafe.Sizeof(unix.PerfEventAttr{})),
Sample: 100,
Bits: unix.PerfBitDisabled | unix.PerfBitFreq,
}, int(cgroup.Fd()), i, -1, unix.PERF_FLAG_PID_CGROUP)
if err != nil {
return err
}
prog, err := m.GetProgram("do_sample")
if err != nil {
return err
}
// Because this is fd based, even if our program crashes or is ended
// without proper shutdown, things get cleaned up appropriately.
// TODO(brancz): destroy the returned link via bpf_link__destroy
_, err = prog.AttachPerfEvent(fd)
if err != nil {
return fmt.Errorf("attach perf event: %w", err)
}
}
Get handles to the counts and stack_traces maps defined in the BPF program:
counts, err := m.GetMap("counts")
if err != nil {
return fmt.Errorf("get counts map: %w", err)
}
stackTraces, err := m.GetMap("stack_traces")
if err != nil {
return fmt.Errorf("get stack traces map: %w", err)
}
We then loop over the counts map to obtain the stack trace IDs, and use each stack trace ID to look up the stack_traces map and recover the stack trace:
it := counts.Iter(keySize)
for it.Next() {
sample := &Sample{}
// This byte slice is only valid for this iteration, so it must be
// copied if we want to do anything with it outside of this loop.
keyBytes := it.Key()
r := bytes.NewBuffer(keyBytes)
pidBytes := make([]byte, 4)
if _, err := io.ReadFull(r, pidBytes); err != nil {
return fmt.Errorf("read pid bytes: %w", err)
}
sample.Pid = byteOrder.Uint32(pidBytes)
userStackIDBytes := make([]byte, 4)
if _, err := io.ReadFull(r, userStackIDBytes); err != nil {
return fmt.Errorf("read user stack ID bytes: %w", err)
}
userStackID := int32(byteOrder.Uint32(userStackIDBytes))
kernelStackIDBytes := make([]byte, 4)
if _, err := io.ReadFull(r, kernelStackIDBytes); err != nil {
return fmt.Errorf("read kernel stack ID bytes: %w", err)
}
kernelStackID := int32(byteOrder.Uint32(kernelStackIDBytes))
valueBytes, err := counts.GetValue(keyBytes, 8)
if err != nil {
return fmt.Errorf("get count value: %w", err)
}
sample.Value = byteOrder.Uint64(valueBytes)
stackBytes, err := stackTraces.GetValue(userStackID, 8*stackDepth)
if err != nil {
profile.MissingStacks++
continue
}
stack := make([]uint64, stackDepth)
err = binary.Read(bytes.NewBuffer(stackBytes), byteOrder, stack)
if err != nil {
return fmt.Errorf("read stack trace: %w", err)
}
for _, addr := range stack {
if addr != uint64(0) {
sample.UserStack = append(sample.UserStack, addr)
}
}
if kernelStackID >= 0 {
stackBytes, err = stackTraces.GetValue(kernelStackID, 8*stackDepth)
if err != nil {
profile.MissingStacks++
continue
}
stack = make([]uint64, stackDepth)
err = binary.Read(bytes.NewBuffer(stackBytes), byteOrder, stack)
if err != nil {
return fmt.Errorf("read stack trace: %w", err)
}
for _, addr := range stack {
if addr != uint64(0) {
sample.KernelStack = append(sample.KernelStack, addr)
sym, err := ksym.Resolve(addr)
if err != nil && !errors.Is(err, ksym.FunctionNotFoundError) {
return err
}
sample.KernelStackStrings = append(sample.KernelStackStrings, sym.Name)
}
}
}
profile.Samples = append(profile.Samples, sample)
}
After each round, the maps are cleared. Because of how the map API is defined, we have to iterate over both maps one more time in order to delete their entries. (github.com/parca-dev/p…)
it = stackTraces.Iter(4)
var prev []byte = nil
for it.Next() {
if prev != nil {
err := stackTraces.DeleteKey(prev)
if err != nil {
level.Warn(p.logger).Log("msg", "failed to delete stack trace", "err", err)
}
}
key := it.Key()
prev = make([]byte, len(key))
copy(prev, key)
}
if prev != nil {
err := stackTraces.DeleteKey(prev)
if err != nil {
level.Warn(p.logger).Log("msg", "failed to delete stack trace", "err", err)
}
}
it = counts.Iter(keySize)
prev = nil
for it.Next() {
if prev != nil {
err := counts.DeleteKey(prev)
if err != nil {
level.Warn(p.logger).Log("msg", "failed to delete count", "err", err)
}
}
key := it.Key()
prev = make([]byte, len(key))
copy(prev, key)
}
if prev != nil {
err := counts.DeleteKey(prev)
if err != nil {
level.Warn(p.logger).Log("msg", "failed to delete count", "err", err)
}
}
eBPF optimizations
Reading the counts map in batches
Read as much data as possible from the counts map in a single call and delete the corresponding keys at the same time. (github.com/parca-dev/p…)
batchSize := p.bpfMaps.counts.GetMaxEntries()
level.Debug(p.logger).Log("msg", "fetching stack trace counts in batch", "batchSize", batchSize)
values, err = p.bpfMaps.counts.GetValueAndDeleteBatch(countKeysPtr, nil, unsafe.Pointer(&nextCountKey), batchSize)
processedCount := len(values)
Deleting stack traces in batches:
func (m bpfMaps) clean(stacks []int32, logger log.Logger) {
for _, stackID := range stacks {
err := m.stacks.DeleteKey(unsafe.Pointer(&stackID))
if err != nil {
if !errors.Is(err, syscall.ENOENT) {
// Continuing in case of an error as we still want to delete the rest of the
// stacks in the slice.
level.Debug(logger).Log("msg", "failed to delete stack trace", "errno", err)
}
}
}
}
However, this batch feature was later rolled back by the parca team because of its Linux kernel version requirement (github.com/parca-dev/p…): batch map operations need 5.2+, whereas eBPF itself is usually fine with 4.14. (github.com/parca-dev/p…)
Raising the rlimit
By default, the module mapped by libbpf is limited to 512 MB, so the memlock rlimit is raised.
rLimit := syscall.Rlimit{
Cur: uint64(defaultRlimit),
Max: uint64(defaultRlimit),
}
// RLIMIT_MEMLOCK is 0x8.
if err := syscall.Setrlimit(unix.RLIMIT_MEMLOCK, &rLimit); err != nil {
return fmt.Errorf("failed to increase rlimit: %w", err)
}
System-wide profiling
Previously, parca sampled each cgroup (pod) separately; now it profiles every process system-wide, which helps reduce the profiler's overhead.
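A hedged sketch of what this looks like: compared with the per-cgroup code above, the perf event is opened per CPU with pid set to -1 (all processes) and without PERF_FLAG_PID_CGROUP. The helper name openSystemWidePerfEvents is made up for illustration; it assumes the prog handle obtained earlier via m.GetProgram and the runtime, unsafe, and golang.org/x/sys/unix packages.

// Sketch: system-wide sampling opens one CPU-clock perf event per CPU for
// all processes, instead of one per CPU per cgroup. Fd/link cleanup omitted.
func openSystemWidePerfEvents(prog *bpf.BPFProg) error {
	for i := 0; i < runtime.NumCPU(); i++ {
		fd, err := unix.PerfEventOpen(&unix.PerfEventAttr{
			Type:   unix.PERF_TYPE_SOFTWARE,
			Config: unix.PERF_COUNT_SW_CPU_CLOCK,
			Size:   uint32(unsafe.Sizeof(unix.PerfEventAttr{})),
			Sample: 100,
			Bits:   unix.PerfBitDisabled | unix.PerfBitFreq,
		}, -1 /* pid: all processes */, i /* cpu */, -1 /* groupFd */, 0 /* flags */)
		if err != nil {
			return err
		}
		if _, err := prog.AttachPerfEvent(fd); err != nil {
			return err
		}
	}
	return nil
}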
Rust eBPF
The parca team once tried using Rust for the eBPF side, but rolled back to C when the Rust toolchain proved too immature to support DWARF stack unwinding. (github.com/parca-dev/p…)
DWARF unwinding
Go programs are all compiled with frame pointers, so their stacks can be obtained via bpf_get_stackid.
For C++ programs, however, parca implements its own DWARF-based stack unwinding.