Commit d2850ce0 authored by Daniel Borkmann

Merge branch 'bpf-libbpf-perf-rb-api'

Andrii Nakryiko says:

====================
This patchset adds a high-level API for setting up and polling perf buffers
associated with a BPF_MAP_TYPE_PERF_EVENT_ARRAY map. Details of the APIs are
described in the corresponding commits.

Patch #1 adds a set of APIs for setting up and working with a perf buffer.
Patch #2 enhances libbpf to auto-set the size of a PERF_EVENT_ARRAY map.
Patch #3 adds a selftest.
Patch #4 converts bpftool's map event_pipe to the new API.
Patch #5 updates the README to mention the perf_buffer_ prefix.

v6->v7:
- __x64_ syscall prefix (Yonghong);
v5->v6:
- fix C99 for loop variable initialization usage (Yonghong);
v4->v5:
- initialize perf_buffer_raw_opts in bpftool map event_pipe (Jakub);
- add perf_buffer_ to README;
v3->v4:
- fixed bpftool event_pipe cmd error handling (Jakub);
v2->v3:
- added perf_buffer__new_raw for more low-level control;
- converted bpftool map event_pipe to new API (Daniel);
- fixed bug with error handling in create_maps (Song);
v1->v2:
- add auto-sizing of PERF_EVENT_ARRAY maps;
====================
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
parents c3ec002e cd07a95f
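
As a concrete illustration of the high-level API before the diffs, a minimal
consumer could look like the sketch below. Only perf_buffer__new(),
perf_buffer__poll(), perf_buffer__free() and libbpf_get_error() are from this
series; the consume() helper, the page count of 8, the include paths, and the
callback bodies are illustrative assumptions, not code from the patchset:

#include <errno.h>
#include <stdio.h>
#include <bpf/libbpf.h>	/* or "libbpf.h" when building in-tree */

/* invoked for each sample written by the BPF side via bpf_perf_event_output() */
static void on_sample(void *ctx, int cpu, void *data, __u32 size)
{
	printf("sample of %u bytes on CPU %d\n", size, cpu);
}

/* invoked once per batch of samples the kernel had to drop */
static void on_lost(void *ctx, int cpu, __u64 cnt)
{
	fprintf(stderr, "lost %llu samples on CPU %d\n",
		(unsigned long long)cnt, cpu);
}

static int consume(int map_fd)	/* fd of a BPF_MAP_TYPE_PERF_EVENT_ARRAY map */
{
	struct perf_buffer_opts pb_opts = {};
	struct perf_buffer *pb;
	int err;

	pb_opts.sample_cb = on_sample;
	pb_opts.lost_cb = on_lost;
	/* 8 pages per per-CPU ring; rings are created and installed into the
	 * map for each possible CPU automatically
	 */
	pb = perf_buffer__new(map_fd, 8, &pb_opts);
	err = libbpf_get_error(pb);
	if (err)
		return err;

	/* callbacks fire from inside perf_buffer__poll(); 100 ms timeout */
	while ((err = perf_buffer__poll(pb, 100)) >= 0)
		;

	perf_buffer__free(pb);
	return err == -EINTR ? 0 : err;
}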
tools/bpf/bpftool/map_perf_ring.c:

@@ -28,7 +28,7 @@
 #define MMAP_PAGE_CNT	16
 
-static bool stop;
+static volatile bool stop;
 
 struct event_ring_info {
 	int fd;
@@ -44,32 +44,44 @@ struct perf_event_sample {
 	unsigned char data[];
 };
 
+struct perf_event_lost {
+	struct perf_event_header header;
+	__u64 id;
+	__u64 lost;
+};
+
 static void int_exit(int signo)
 {
 	fprintf(stderr, "Stopping...\n");
 	stop = true;
 }
 
+struct event_pipe_ctx {
+	bool all_cpus;
+	int cpu;
+	int idx;
+};
+
 static enum bpf_perf_event_ret
-print_bpf_output(struct perf_event_header *event, void *private_data)
+print_bpf_output(void *private_data, int cpu, struct perf_event_header *event)
 {
-	struct perf_event_sample *e = container_of(event, struct perf_event_sample,
+	struct perf_event_sample *e = container_of(event,
+						   struct perf_event_sample,
 						   header);
-	struct event_ring_info *ring = private_data;
-	struct {
-		struct perf_event_header header;
-		__u64 id;
-		__u64 lost;
-	} *lost = (typeof(lost))event;
+	struct perf_event_lost *lost = container_of(event,
+						    struct perf_event_lost,
+						    header);
+	struct event_pipe_ctx *ctx = private_data;
+	int idx = ctx->all_cpus ? cpu : ctx->idx;
 
 	if (json_output) {
 		jsonw_start_object(json_wtr);
 		jsonw_name(json_wtr, "type");
 		jsonw_uint(json_wtr, e->header.type);
 		jsonw_name(json_wtr, "cpu");
-		jsonw_uint(json_wtr, ring->cpu);
+		jsonw_uint(json_wtr, cpu);
 		jsonw_name(json_wtr, "index");
-		jsonw_uint(json_wtr, ring->key);
+		jsonw_uint(json_wtr, idx);
 		if (e->header.type == PERF_RECORD_SAMPLE) {
 			jsonw_name(json_wtr, "timestamp");
 			jsonw_uint(json_wtr, e->time);
@@ -89,7 +101,7 @@ print_bpf_output(struct perf_event_header *event, void *private_data)
 	if (e->header.type == PERF_RECORD_SAMPLE) {
 		printf("== @%lld.%09lld CPU: %d index: %d =====\n",
 		       e->time / 1000000000ULL, e->time % 1000000000ULL,
-		       ring->cpu, ring->key);
+		       cpu, idx);
 		fprint_hex(stdout, e->data, e->size, " ");
 		printf("\n");
 	} else if (e->header.type == PERF_RECORD_LOST) {
@@ -103,87 +115,25 @@ print_bpf_output(struct perf_event_header *event, void *private_data)
 	return LIBBPF_PERF_EVENT_CONT;
 }
 
-static void
-perf_event_read(struct event_ring_info *ring, void **buf, size_t *buf_len)
-{
-	enum bpf_perf_event_ret ret;
-
-	ret = bpf_perf_event_read_simple(ring->mem,
-					 MMAP_PAGE_CNT * get_page_size(),
-					 get_page_size(), buf, buf_len,
-					 print_bpf_output, ring);
-	if (ret != LIBBPF_PERF_EVENT_CONT) {
-		fprintf(stderr, "perf read loop failed with %d\n", ret);
-		stop = true;
-	}
-}
-
-static int perf_mmap_size(void)
-{
-	return get_page_size() * (MMAP_PAGE_CNT + 1);
-}
-
-static void *perf_event_mmap(int fd)
-{
-	int mmap_size = perf_mmap_size();
-	void *base;
-
-	base = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
-	if (base == MAP_FAILED) {
-		p_err("event mmap failed: %s\n", strerror(errno));
-		return NULL;
-	}
-
-	return base;
-}
-
-static void perf_event_unmap(void *mem)
-{
-	if (munmap(mem, perf_mmap_size()))
-		fprintf(stderr, "Can't unmap ring memory!\n");
-}
-
-static int bpf_perf_event_open(int map_fd, int key, int cpu)
+int do_event_pipe(int argc, char **argv)
 {
-	struct perf_event_attr attr = {
+	struct perf_event_attr perf_attr = {
 		.sample_type = PERF_SAMPLE_RAW | PERF_SAMPLE_TIME,
 		.type = PERF_TYPE_SOFTWARE,
 		.config = PERF_COUNT_SW_BPF_OUTPUT,
+		.sample_period = 1,
+		.wakeup_events = 1,
 	};
-	int pmu_fd;
-
-	pmu_fd = sys_perf_event_open(&attr, -1, cpu, -1, 0);
-	if (pmu_fd < 0) {
-		p_err("failed to open perf event %d for CPU %d", key, cpu);
-		return -1;
-	}
-
-	if (bpf_map_update_elem(map_fd, &key, &pmu_fd, BPF_ANY)) {
-		p_err("failed to update map for event %d for CPU %d", key, cpu);
-		goto err_close;
-	}
-
-	if (ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0)) {
-		p_err("failed to enable event %d for CPU %d", key, cpu);
-		goto err_close;
-	}
-
-	return pmu_fd;
-
-err_close:
-	close(pmu_fd);
-	return -1;
-}
-
-int do_event_pipe(int argc, char **argv)
-{
-	int i, nfds, map_fd, index = -1, cpu = -1;
 	struct bpf_map_info map_info = {};
-	struct event_ring_info *rings;
-	size_t tmp_buf_sz = 0;
-	void *tmp_buf = NULL;
-	struct pollfd *pfds;
+	struct perf_buffer_raw_opts opts = {};
+	struct event_pipe_ctx ctx = {
+		.all_cpus = true,
+		.cpu = -1,
+		.idx = -1,
+	};
+	struct perf_buffer *pb;
 	__u32 map_info_len;
-	bool do_all = true;
+	int err, map_fd;
 
 	map_info_len = sizeof(map_info);
 	map_fd = map_parse_fd_and_info(&argc, &argv, &map_info, &map_info_len);
@@ -205,7 +155,7 @@ int do_event_pipe(int argc, char **argv)
 			char *endptr;
 
 			NEXT_ARG();
-			cpu = strtoul(*argv, &endptr, 0);
+			ctx.cpu = strtoul(*argv, &endptr, 0);
 			if (*endptr) {
 				p_err("can't parse %s as CPU ID", **argv);
 				goto err_close_map;
@@ -216,7 +166,7 @@ int do_event_pipe(int argc, char **argv)
 			char *endptr;
 
 			NEXT_ARG();
-			index = strtoul(*argv, &endptr, 0);
+			ctx.idx = strtoul(*argv, &endptr, 0);
 			if (*endptr) {
 				p_err("can't parse %s as index", **argv);
 				goto err_close_map;
@@ -228,45 +178,32 @@ int do_event_pipe(int argc, char **argv)
 			goto err_close_map;
 		}
 
-		do_all = false;
+		ctx.all_cpus = false;
 	}
 
-	if (!do_all) {
-		if (index == -1 || cpu == -1) {
+	if (!ctx.all_cpus) {
+		if (ctx.idx == -1 || ctx.cpu == -1) {
 			p_err("cpu and index must be specified together");
 			goto err_close_map;
 		}
-
-		nfds = 1;
 	} else {
-		nfds = min(get_possible_cpus(), map_info.max_entries);
-		cpu = 0;
-		index = 0;
+		ctx.cpu = 0;
+		ctx.idx = 0;
 	}
 
-	rings = calloc(nfds, sizeof(rings[0]));
-	if (!rings)
+	opts.attr = &perf_attr;
+	opts.event_cb = print_bpf_output;
+	opts.ctx = &ctx;
+	opts.cpu_cnt = ctx.all_cpus ? 0 : 1;
+	opts.cpus = &ctx.cpu;
+	opts.map_keys = &ctx.idx;
+	pb = perf_buffer__new_raw(map_fd, MMAP_PAGE_CNT, &opts);
+	err = libbpf_get_error(pb);
+	if (err) {
+		p_err("failed to create perf buffer: %s (%d)",
+		      strerror(err), err);
 		goto err_close_map;
-
-	pfds = calloc(nfds, sizeof(pfds[0]));
-	if (!pfds)
-		goto err_free_rings;
-
-	for (i = 0; i < nfds; i++) {
-		rings[i].cpu = cpu + i;
-		rings[i].key = index + i;
-
-		rings[i].fd = bpf_perf_event_open(map_fd, rings[i].key,
-						  rings[i].cpu);
-		if (rings[i].fd < 0)
-			goto err_close_fds_prev;
-
-		rings[i].mem = perf_event_mmap(rings[i].fd);
-		if (!rings[i].mem)
-			goto err_close_fds_current;
-
-		pfds[i].fd = rings[i].fd;
-		pfds[i].events = POLLIN;
 	}
 
 	signal(SIGINT, int_exit);
@@ -277,34 +214,24 @@ int do_event_pipe(int argc, char **argv)
 		jsonw_start_array(json_wtr);
 
 	while (!stop) {
-		poll(pfds, nfds, 200);
-		for (i = 0; i < nfds; i++)
-			perf_event_read(&rings[i], &tmp_buf, &tmp_buf_sz);
+		err = perf_buffer__poll(pb, 200);
+		if (err < 0 && err != -EINTR) {
+			p_err("perf buffer polling failed: %s (%d)",
+			      strerror(err), err);
+			goto err_close_pb;
+		}
 	}
-	free(tmp_buf);
 
 	if (json_output)
 		jsonw_end_array(json_wtr);
 
-	for (i = 0; i < nfds; i++) {
-		perf_event_unmap(rings[i].mem);
-		close(rings[i].fd);
-	}
-	free(pfds);
-	free(rings);
+	perf_buffer__free(pb);
 	close(map_fd);
 
 	return 0;
 
-err_close_fds_prev:
-	while (i--) {
-		perf_event_unmap(rings[i].mem);
-err_close_fds_current:
-		close(rings[i].fd);
-	}
-	free(pfds);
-err_free_rings:
-	free(rings);
+err_close_pb:
+	perf_buffer__free(pb);
 err_close_map:
 	close(map_fd);
 	return -1;
tools/lib/bpf/README.rst:

@@ -9,7 +9,8 @@ described here. It's recommended to follow these conventions whenever a
 new function or type is added to keep libbpf API clean and consistent.
 
 All types and functions provided by libbpf API should have one of the
-following prefixes: ``bpf_``, ``btf_``, ``libbpf_``, ``xsk_``.
+following prefixes: ``bpf_``, ``btf_``, ``libbpf_``, ``xsk_``,
+``perf_buffer_``.
 
 System call wrappers
 --------------------
tools/lib/bpf/libbpf.c (perf_buffer implementation; diff collapsed in the original view)
tools/lib/bpf/libbpf.h:

@@ -358,6 +358,26 @@ LIBBPF_API int bpf_prog_load(const char *file, enum bpf_prog_type type,
 LIBBPF_API int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags);
 LIBBPF_API int bpf_get_link_xdp_id(int ifindex, __u32 *prog_id, __u32 flags);
 
+struct perf_buffer;
+
+typedef void (*perf_buffer_sample_fn)(void *ctx, int cpu,
+				      void *data, __u32 size);
+typedef void (*perf_buffer_lost_fn)(void *ctx, int cpu, __u64 cnt);
+
+/* common use perf buffer options */
+struct perf_buffer_opts {
+	/* if specified, sample_cb is called for each sample */
+	perf_buffer_sample_fn sample_cb;
+	/* if specified, lost_cb is called for each batch of lost samples */
+	perf_buffer_lost_fn lost_cb;
+	/* ctx is provided to sample_cb and lost_cb */
+	void *ctx;
+};
+
+LIBBPF_API struct perf_buffer *
+perf_buffer__new(int map_fd, size_t page_cnt,
+		 const struct perf_buffer_opts *opts);
+
 enum bpf_perf_event_ret {
 	LIBBPF_PERF_EVENT_DONE = 0,
 	LIBBPF_PERF_EVENT_ERROR = -1,
@@ -365,6 +385,35 @@ enum bpf_perf_event_ret {
 };
 
 struct perf_event_header;
+
+typedef enum bpf_perf_event_ret
+(*perf_buffer_event_fn)(void *ctx, int cpu, struct perf_event_header *event);
+
+/* raw perf buffer options, giving most power and control */
+struct perf_buffer_raw_opts {
+	/* perf event attrs passed directly into perf_event_open() */
+	struct perf_event_attr *attr;
+	/* raw event callback */
+	perf_buffer_event_fn event_cb;
+	/* ctx is provided to event_cb */
+	void *ctx;
+	/* if cpu_cnt == 0, open all on all possible CPUs (up to the number of
+	 * max_entries of given PERF_EVENT_ARRAY map)
+	 */
+	int cpu_cnt;
+	/* if cpu_cnt > 0, cpus is an array of CPUs to open ring buffers on */
+	int *cpus;
+	/* if cpu_cnt > 0, map_keys specify map keys to set per-CPU FDs for */
+	int *map_keys;
+};
+
+LIBBPF_API struct perf_buffer *
+perf_buffer__new_raw(int map_fd, size_t page_cnt,
+		     const struct perf_buffer_raw_opts *opts);
+
+LIBBPF_API void perf_buffer__free(struct perf_buffer *pb);
+LIBBPF_API int perf_buffer__poll(struct perf_buffer *pb, int timeout_ms);
+
 typedef enum bpf_perf_event_ret
 	(*bpf_perf_event_print_t)(struct perf_event_header *hdr,
 				  void *private_data);
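
To make the cpu_cnt/cpus/map_keys semantics above concrete, a minimal sketch
using the raw API to open a single ring on one CPU and one map index might
look as follows. Only perf_buffer__new_raw() and the types declared above are
from this series; the open_one_ring() helper, the choice of CPU 0 and key 0,
and the attr values (modeled on the bpftool conversion above) are illustrative
assumptions:

#include <linux/perf_event.h>
#include <bpf/libbpf.h>

static enum bpf_perf_event_ret
on_event(void *ctx, int cpu, struct perf_event_header *event)
{
	/* raw callback: distinguishing PERF_RECORD_SAMPLE, PERF_RECORD_LOST,
	 * etc. is up to the caller
	 */
	return LIBBPF_PERF_EVENT_CONT;
}

static struct perf_buffer *open_one_ring(int map_fd)
{
	struct perf_event_attr attr = {
		.sample_type = PERF_SAMPLE_RAW,
		.type = PERF_TYPE_SOFTWARE,
		.config = PERF_COUNT_SW_BPF_OUTPUT,
		.sample_period = 1,
		.wakeup_events = 1,
	};
	struct perf_buffer_raw_opts opts = {};
	int cpu = 0, map_key = 0;	/* illustrative: one ring on CPU 0, map key 0 */

	opts.attr = &attr;
	opts.event_cb = on_event;
	opts.cpu_cnt = 1;		/* just one ring ... */
	opts.cpus = &cpu;		/* ... on this CPU ... */
	opts.map_keys = &map_key;	/* ... installed at this map key */
	return perf_buffer__new_raw(map_fd, 8, &opts);
}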
tools/lib/bpf/libbpf.map:

@@ -179,4 +179,8 @@ LIBBPF_0.0.4 {
 		btf_dump__new;
 		btf__parse_elf;
 		libbpf_num_possible_cpus;
+		perf_buffer__free;
+		perf_buffer__new;
+		perf_buffer__new_raw;
+		perf_buffer__poll;
 } LIBBPF_0.0.3;
tools/testing/selftests/bpf/prog_tests/perf_buffer.c (new file):

// SPDX-License-Identifier: GPL-2.0
#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>
#include <sys/socket.h>
#include <test_progs.h>

#ifdef __x86_64__
#define SYS_KPROBE_NAME "__x64_sys_nanosleep"
#else
#define SYS_KPROBE_NAME "sys_nanosleep"
#endif

static void on_sample(void *ctx, int cpu, void *data, __u32 size)
{
	int cpu_data = *(int *)data, duration = 0;
	cpu_set_t *cpu_seen = ctx;

	CHECK(cpu_data != cpu, "check_cpu_data",
	      "cpu_data %d != cpu %d\n", cpu_data, cpu);
	CPU_SET(cpu, cpu_seen);
}

void test_perf_buffer(void)
{
	int err, prog_fd, nr_cpus, i, duration = 0;
	const char *prog_name = "kprobe/sys_nanosleep";
	const char *file = "./test_perf_buffer.o";
	struct perf_buffer_opts pb_opts = {};
	struct bpf_map *perf_buf_map;
	cpu_set_t cpu_set, cpu_seen;
	struct bpf_program *prog;
	struct bpf_object *obj;
	struct perf_buffer *pb;
	struct bpf_link *link;

	nr_cpus = libbpf_num_possible_cpus();
	if (CHECK(nr_cpus < 0, "nr_cpus", "err %d\n", nr_cpus))
		return;

	/* load program */
	err = bpf_prog_load(file, BPF_PROG_TYPE_KPROBE, &obj, &prog_fd);
	if (CHECK(err, "obj_load", "err %d errno %d\n", err, errno))
		return;

	prog = bpf_object__find_program_by_title(obj, prog_name);
	if (CHECK(!prog, "find_probe", "prog '%s' not found\n", prog_name))
		goto out_close;

	/* load map */
	perf_buf_map = bpf_object__find_map_by_name(obj, "perf_buf_map");
	if (CHECK(!perf_buf_map, "find_perf_buf_map", "not found\n"))
		goto out_close;

	/* attach kprobe */
	link = bpf_program__attach_kprobe(prog, false /* retprobe */,
					  SYS_KPROBE_NAME);
	if (CHECK(IS_ERR(link), "attach_kprobe", "err %ld\n", PTR_ERR(link)))
		goto out_close;

	/* set up perf buffer */
	pb_opts.sample_cb = on_sample;
	pb_opts.ctx = &cpu_seen;
	pb = perf_buffer__new(bpf_map__fd(perf_buf_map), 1, &pb_opts);
	if (CHECK(IS_ERR(pb), "perf_buf__new", "err %ld\n", PTR_ERR(pb)))
		goto out_detach;

	/* trigger kprobe on every CPU */
	CPU_ZERO(&cpu_seen);
	for (i = 0; i < nr_cpus; i++) {
		CPU_ZERO(&cpu_set);
		CPU_SET(i, &cpu_set);

		err = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set),
					     &cpu_set);
		if (err && CHECK(err, "set_affinity", "cpu #%d, err %d\n",
				 i, err))
			goto out_detach;
		usleep(1);
	}

	/* read perf buffer */
	err = perf_buffer__poll(pb, 100);
	if (CHECK(err < 0, "perf_buffer__poll", "err %d\n", err))
		goto out_free_pb;

	if (CHECK(CPU_COUNT(&cpu_seen) != nr_cpus, "seen_cpu_cnt",
		  "expect %d, seen %d\n", nr_cpus, CPU_COUNT(&cpu_seen)))
		goto out_free_pb;

out_free_pb:
	perf_buffer__free(pb);
out_detach:
	bpf_link__destroy(link);
out_close:
	bpf_object__close(obj);
}
tools/testing/selftests/bpf/progs/test_perf_buffer.c (new file):

// SPDX-License-Identifier: GPL-2.0
// Copyright (c) 2019 Facebook

#include <linux/ptrace.h>
#include <linux/bpf.h>
#include "bpf_helpers.h"

struct {
	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
	__uint(key_size, sizeof(int));
	__uint(value_size, sizeof(int));
} perf_buf_map SEC(".maps");

SEC("kprobe/sys_nanosleep")
int handle_sys_nanosleep_entry(struct pt_regs *ctx)
{
	int cpu = bpf_get_smp_processor_id();

	bpf_perf_event_output(ctx, &perf_buf_map, BPF_F_CURRENT_CPU,
			      &cpu, sizeof(cpu));
	return 0;
}

char _license[] SEC("license") = "GPL";
__u32 _version SEC("version") = 1;