Capturing call stacks with eBPF


The eBPF workflow

First, we load the compiled BPF program and attach it to the appropriate hook point (for this profiler, a perf event).

In the kernel, the BPF program stores its data in an in-kernel data structure (a BPF map); userspace then reads that map to analyze the data.

Analyzing parca's initial eBPF stack-sampling code

This is the initial commit, containing only the minimum code needed to run. (github.com/parca-dev/p…

A single polarsignals-agent.bpf.c file holds the BPF program.

Getting the stack trace

We define a BPF map of type BPF_MAP_TYPE_STACK_TRACE.

The helper bpf_get_stackid() captures the user-space or kernel-space stack trace from ctx and returns a hashed stack trace ID. (ebpf-docs.dylanreimerink.nl/linux/helpe…)

One problem here is that if the map is sized too small, hash collisions will cause stack traces to be lost. (github.com/bpftrace/bp…

#include <linux/bpf.h>
#include <linux/perf_event.h>
#include <linux/bpf_perf_event.h>
#include <bpf/bpf_helpers.h>

typedef __u32 u32;
typedef __u64 u64;

struct {
    __uint(type, BPF_MAP_TYPE_STACK_TRACE);
    __uint(key_size, sizeof(u32));
    __uint(value_size, PERF_MAX_STACK_DEPTH * sizeof(u64));
    __uint(max_entries, 10000);
} stack_traces SEC(".maps");

SEC("perf_event")
int print_stack_ids(struct bpf_perf_event_data *ctx)
{
    char fmt[] = "kern_stack_id=%d user_stack_id=%d";

    // Each call returns a hashed stack trace ID (negative on error); the
    // frames themselves are stored in the stack_traces map under that ID.
    int kern_stack_id = bpf_get_stackid(ctx, &stack_traces, 0);
    int user_stack_id = bpf_get_stackid(ctx, &stack_traces, BPF_F_USER_STACK);

    if (kern_stack_id >= 0 && user_stack_id >= 0) {
        bpf_trace_printk(fmt, sizeof(fmt), kern_stack_id, user_stack_id);
    }

    return 0;
}

char _license[] SEC("license") = "GPL";

Getting stack traces and their counts

  • The stack_traces map stores the mapping from stack trace ID to stack trace.
  • The counts map stores the mapping from {pid, stack trace IDs} to a sample count.

// Stack Traces are slightly different
// in that the value is 1 big byte array
// of the stack addresses
#define BPF_STACK_TRACE(_name, _max_entries) \
struct bpf_map_def SEC("maps") _name = { \
  .type = BPF_MAP_TYPE_STACK_TRACE, \
  .key_size = sizeof(u32), \
  .value_size = sizeof(size_t) * MAX_STACK_DEPTH, \
  .max_entries = _max_entries, \
};

/*=============================== INTERNAL STRUCTS ===========================*/

typedef struct stack_count_key {
    u32 pid;
    int user_stack_id;
    int kernel_stack_id;
} stack_count_key_t;
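
// Note: the maps referenced below (stack_traces and counts) are elided from
// this excerpt. As an assumed sketch (not necessarily the exact parca
// definitions), they could be declared like this: an instance of the stack
// trace map via the macro above, plus a hash map from stack_count_key_t to
// a u64 sample count.
BPF_STACK_TRACE(stack_traces, 10240);

struct bpf_map_def SEC("maps") counts = {
    .type = BPF_MAP_TYPE_HASH,
    .key_size = sizeof(stack_count_key_t),
    .value_size = sizeof(u64),
    .max_entries = 10240,
};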

static __always_inline void *
bpf_map_lookup_or_try_init(void *map, const void *key, const void *init)
{
	void *val;
	long err;

	val = bpf_map_lookup_elem(map, key);
	if (val)
		return val;

	err = bpf_map_update_elem(map, key, init, BPF_NOEXIST);
    // 17 == EEXIST
	if (err && err != -17)
		return 0;

	return bpf_map_lookup_elem(map, key);
}

// This code gets a bit complex. Probably not suitable for casual hacking.
SEC("perf_event")
int do_sample(struct bpf_perf_event_data *ctx) {
    u64 id = bpf_get_current_pid_tgid();
    u32 tgid = id >> 32;
    u32 pid = id;

    if (pid == 0)
        return 0;

    // create map key
    stack_count_key_t key = {.pid = tgid};

    // get stacks
    key.user_stack_id = bpf_get_stackid(ctx, &stack_traces, BPF_F_USER_STACK);
    key.kernel_stack_id = bpf_get_stackid(ctx, &stack_traces, 0);

    u64 zero = 0;
    u64 *count;
    count = bpf_map_lookup_or_try_init(&counts, &key, &zero);
    if (!count)
        return 0;

    __sync_fetch_and_add(count, 1);

    return 0;
}

Userspace processing

Load the BPF program:

	m, err := bpf.NewModuleFromBuffer(bpfObj, "polarsignals")
	if err != nil {
		return err
	}
	defer m.Close()

	err = m.BPFLoadObject()
	if err != nil {
		return err
	}

Attach the BPF program to the perf events opened with perf_event_open:

Note that closing the returned fd with defer here causes problems (github.com/parca-dev/p…

	cpus := runtime.NumCPU()
	for i := 0; i < cpus; i++ {
		// TODO(branz): Close the returned fd
		fd, err := unix.PerfEventOpen(&unix.PerfEventAttr{
			Type:   unix.PERF_TYPE_SOFTWARE,
			Config: unix.PERF_COUNT_SW_CPU_CLOCK,
			Size:   uint32(unsafe.Sizeof(unix.PerfEventAttr{})),
			Sample: 100,
			Bits:   unix.PerfBitDisabled | unix.PerfBitFreq,
		}, int(cgroup.Fd()), i, -1, unix.PERF_FLAG_PID_CGROUP)
		if err != nil {
			return err
		}

		prog, err := m.GetProgram("do_sample")
		if err != nil {
			return err
		}

		// Because this is fd based, even if our program crashes or is ended
		// without proper shutdown, things get cleaned up appropriately.

		// TODO(brancz): destroy the returned link via bpf_link__destroy
		_, err = prog.AttachPerfEvent(fd)
		if err != nil {
			return fmt.Errorf("attach perf event: %w", err)
		}
	}

Get handles to the counts and stack_traces maps defined in the BPF program:

	counts, err := m.GetMap("counts")
	if err != nil {
		return fmt.Errorf("get counts map: %w", err)
	}

	stackTraces, err := m.GetMap("stack_traces")
	if err != nil {
		return fmt.Errorf("get stack traces map: %w", err)
	}

Iterate over the counts map to obtain the stack trace IDs, then use each ID to read the corresponding stack trace from the stack_traces map:

		it := counts.Iter(keySize)
		for it.Next() {
			sample := &Sample{}
			// This byte slice is only valid for this iteration, so it must be
			// copied if we want to do anything with it outside of this loop.
			keyBytes := it.Key()

			r := bytes.NewBuffer(keyBytes)

			pidBytes := make([]byte, 4)
			if _, err := io.ReadFull(r, pidBytes); err != nil {
				return fmt.Errorf("read pid bytes: %w", err)
			}
			sample.Pid = byteOrder.Uint32(pidBytes)

			userStackIDBytes := make([]byte, 4)
			if _, err := io.ReadFull(r, userStackIDBytes); err != nil {
				return fmt.Errorf("read user stack ID bytes: %w", err)
			}
			userStackID := int32(byteOrder.Uint32(userStackIDBytes))

			kernelStackIDBytes := make([]byte, 4)
			if _, err := io.ReadFull(r, kernelStackIDBytes); err != nil {
				return fmt.Errorf("read kernel stack ID bytes: %w", err)
			}
			kernelStackID := int32(byteOrder.Uint32(kernelStackIDBytes))

			valueBytes, err := counts.GetValue(keyBytes, 8)
			if err != nil {
				return fmt.Errorf("get count value: %w", err)
			}
			sample.Value = byteOrder.Uint64(valueBytes)

			stackBytes, err := stackTraces.GetValue(userStackID, 8*stackDepth)
			if err != nil {
				profile.MissingStacks++
				continue
			}
			stack := make([]uint64, stackDepth)
			err = binary.Read(bytes.NewBuffer(stackBytes), byteOrder, stack)
			if err != nil {
				return fmt.Errorf("read stack trace: %w", err)
			}
			for _, addr := range stack {
				if addr != uint64(0) {
					sample.UserStack = append(sample.UserStack, addr)
				}
			}

			if kernelStackID >= 0 {
				stackBytes, err = stackTraces.GetValue(kernelStackID, 8*stackDepth)
				if err != nil {
					profile.MissingStacks++
					continue
				}

				stack = make([]uint64, stackDepth)
				err = binary.Read(bytes.NewBuffer(stackBytes), byteOrder, stack)
				if err != nil {
					return fmt.Errorf("read stack trace: %w", err)
				}

				for _, addr := range stack {
					if addr != uint64(0) {
						sample.KernelStack = append(sample.KernelStack, addr)
						sym, err := ksym.Resolve(addr)
						if err != nil && !errors.Is(err, ksym.FunctionNotFoundError) {
							return err
						}

						sample.KernelStackStrings = append(sample.KernelStackStrings, sym.Name)
					}
				}
			}
			profile.Samples = append(profile.Samples, sample)
		}

After each collection pass the maps are cleared. Because of how the API is defined, both maps have to be iterated over a second time in order to delete their entries (github.com/parca-dev/p…

		it = stackTraces.Iter(4)
		var prev []byte = nil
		for it.Next() {
			if prev != nil {
				err := stackTraces.DeleteKey(prev)
				if err != nil {
					level.Warn(p.logger).Log("msg", "failed to delete stack trace", "err", err)
				}
			}

			key := it.Key()
			prev = make([]byte, len(key))
			copy(prev, key)
		}
		if prev != nil {
			err := stackTraces.DeleteKey(prev)
			if err != nil {
				level.Warn(p.logger).Log("msg", "failed to delete stack trace", "err", err)
			}
		}

		it = counts.Iter(keySize)
		prev = nil
		for it.Next() {
			if prev != nil {
				err := counts.DeleteKey(prev)
				if err != nil {
					level.Warn(p.logger).Log("msg", "failed to delete count", "err", err)
				}
			}

			key := it.Key()
			prev = make([]byte, len(key))
			copy(prev, key)
		}
		if prev != nil {
			err := counts.DeleteKey(prev)
			if err != nil {
				level.Warn(p.logger).Log("msg", "failed to delete count", "err", err)
			}
		}

eBPF optimizations

Reading the counts map in batches

Read as many entries as possible from the counts map in one call and delete the corresponding keys at the same time. (github.com/parca-dev/p…

	batchSize := p.bpfMaps.counts.GetMaxEntries()
	level.Debug(p.logger).Log("msg", "fetching stack trace counts in batch", "batchSize", batchSize)

	values, err = p.bpfMaps.counts.GetValueAndDeleteBatch(countKeysPtr, nil, unsafe.Pointer(&nextCountKey), batchSize)
	processedCount := len(values)

Delete stack traces in batches:

func (m bpfMaps) clean(stacks []int32, logger log.Logger) {
	for _, stackID := range stacks {
		err := m.stacks.DeleteKey(unsafe.Pointer(&stackID))
		if err != nil {
			if !errors.Is(err, syscall.ENOENT) {
				// Continuing in case of an error as we still want to delete the rest of the
				// stacks in the slice.
				level.Debug(logger).Log("msg", "failed to delete stack trace", "errno", err)
			}
		}
	}
}

However, this batch feature was later reverted by the parca team because it imposes a Linux kernel version requirement (github.com/parca-dev/p…

Batch operations require kernel 5.2+, whereas 4.14 is normally enough for eBPF. (github.com/parca-dev/p…
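
For illustration only (this is not parca's code), the sketch below shows how an agent might check the running kernel release at startup before relying on the batch map APIs; it uses golang.org/x/sys/unix and the 5.2 threshold mentioned above.

package main

import (
	"bytes"
	"fmt"

	"golang.org/x/sys/unix"
)

// kernelRelease returns the running kernel release string, e.g. "5.10.0-...".
func kernelRelease() (string, error) {
	var uts unix.Utsname
	if err := unix.Uname(&uts); err != nil {
		return "", fmt.Errorf("uname: %w", err)
	}
	// Release is a fixed-size, NUL-terminated byte array.
	rel := uts.Release[:]
	if i := bytes.IndexByte(rel, 0); i >= 0 {
		rel = rel[:i]
	}
	return string(rel), nil
}

func main() {
	rel, err := kernelRelease()
	if err != nil {
		panic(err)
	}
	var major, minor int
	if _, err := fmt.Sscanf(rel, "%d.%d", &major, &minor); err != nil {
		panic(err)
	}
	// Fall back to the per-key iteration shown earlier on kernels older than 5.2.
	useBatch := major > 5 || (major == 5 && minor >= 2)
	fmt.Println("kernel:", rel, "use batch map ops:", useBatch)
}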

Increasing the rlimit

github.com/parca-dev/p…

By default, only a limited amount of memory may be locked for libbpf's BPF maps, so the agent raises the RLIMIT_MEMLOCK rlimit (defaultRlimit is 512 MB here):

	rLimit := syscall.Rlimit{
		Cur: uint64(defaultRlimit),
		Max: uint64(defaultRlimit),
	}

	// RLIMIT_MEMLOCK is 0x8.
	if err := syscall.Setrlimit(unix.RLIMIT_MEMLOCK, &rLimit); err != nil {
		return fmt.Errorf("failed to increase rlimit: %w", err)
	}

System-wide profiling

parca previously sampled per cgroup (pod); it now profiles every process system-wide, which helps reduce the profiler's overhead:

github.com/parca-dev/p…
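
For comparison, the per-cgroup loop earlier scoped each perf event to a cgroup fd via PERF_FLAG_PID_CGROUP. Below is a minimal sketch (an illustrative helper, not parca's actual change) of what a system-wide attach can look like, assuming the same unix.PerfEventOpen call: pid == -1 with a concrete CPU samples all processes on that CPU.

import (
	"fmt"
	"runtime"
	"unsafe"

	"golang.org/x/sys/unix"
)

// openSystemWidePerfEvents opens a CPU-clock sampling event on every CPU for
// all processes (pid == -1, flags == 0). The BPF program is then attached to
// each returned fd exactly as in the per-cgroup loop shown earlier.
func openSystemWidePerfEvents() ([]int, error) {
	fds := make([]int, 0, runtime.NumCPU())
	for i := 0; i < runtime.NumCPU(); i++ {
		fd, err := unix.PerfEventOpen(&unix.PerfEventAttr{
			Type:   unix.PERF_TYPE_SOFTWARE,
			Config: unix.PERF_COUNT_SW_CPU_CLOCK,
			Size:   uint32(unsafe.Sizeof(unix.PerfEventAttr{})),
			Sample: 100,
			Bits:   unix.PerfBitDisabled | unix.PerfBitFreq,
		}, -1 /* all processes */, i /* this CPU */, -1, 0)
		if err != nil {
			return nil, fmt.Errorf("perf_event_open on cpu %d: %w", i, err)
		}
		fds = append(fds, fd)
	}
	return fds, nil
}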

Rust eBPF

The parca team once tried to implement the eBPF side in Rust, but rolled back to C when the Rust toolchain proved immature and unable to support DWARF stack unwinding (github.com/parca-dev/p…

DWARF unwinding

All Go programs are built with frame pointers, so their stacks can be captured with bpf_get_stackid.

For C++ programs, however, parca implements its own DWARF-based stack unwinding.