Commit 4df20483 authored by David S. Miller

Merge branch 'bpf-perf-hw-sw-events'

Alexei Starovoitov says:

====================
perf, bpf: add support for bpf in sw/hw perf_events

this patch set is a follow up to the discussion:
https://lkml.kernel.org/r/20160804142853.GO6862@twins.programming.kicks-ass.net
It turned out to be simpler than what we discussed.

Patches 1-3 are bpf-side prep for the main patch 4
that adds bpf program as an overflow_handler to sw and hw perf_events.

Patches 5 and 6 are examples from myself and Brendan.

Peter,
to implement your suggestion to add ifdef CONFIG_BPF_SYSCALL
inside struct perf_event, I had to shuffle ifdefs in events/core.c
Please double check whether that is what you wanted to see.

v2->v3: fixed few more minor issues
v1->v2: fixed issues spotted by Peter and Daniel.
====================
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
parents 569e937e 72874418
...@@ -297,6 +297,10 @@ static inline struct bpf_prog *bpf_prog_add(struct bpf_prog *prog, int i) ...@@ -297,6 +297,10 @@ static inline struct bpf_prog *bpf_prog_add(struct bpf_prog *prog, int i)
static inline void bpf_prog_put(struct bpf_prog *prog) static inline void bpf_prog_put(struct bpf_prog *prog)
{ {
} }
/*
 * Stub variant of bpf_prog_inc(): takes no reference and always returns
 * ERR_PTR(-EOPNOTSUPP).
 * NOTE(review): it sits just before "#endif CONFIG_BPF_SYSCALL" next to the
 * empty bpf_prog_put() stub, so this is presumably the !CONFIG_BPF_SYSCALL
 * branch -- confirm against the full header.
 */
static inline struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog)
{
return ERR_PTR(-EOPNOTSUPP);
}
#endif /* CONFIG_BPF_SYSCALL */ #endif /* CONFIG_BPF_SYSCALL */
/* verifier prototypes for helper functions called from eBPF programs */ /* verifier prototypes for helper functions called from eBPF programs */
......
...@@ -679,6 +679,10 @@ struct perf_event { ...@@ -679,6 +679,10 @@ struct perf_event {
u64 (*clock)(void); u64 (*clock)(void);
perf_overflow_handler_t overflow_handler; perf_overflow_handler_t overflow_handler;
void *overflow_handler_context; void *overflow_handler_context;
#ifdef CONFIG_BPF_SYSCALL
perf_overflow_handler_t orig_overflow_handler;
struct bpf_prog *prog;
#endif
#ifdef CONFIG_EVENT_TRACING #ifdef CONFIG_EVENT_TRACING
struct trace_event_call *tp_event; struct trace_event_call *tp_event;
...@@ -788,6 +792,11 @@ struct perf_output_handle { ...@@ -788,6 +792,11 @@ struct perf_output_handle {
int page; int page;
}; };
/*
 * Kernel-side context object passed to a BPF program that runs as a perf
 * event overflow handler. bpf_overflow_handler() fills both pointers from its
 * own arguments, and pe_prog_convert_ctx_access() emits loads that go through
 * them.
 */
struct bpf_perf_event_data_kern {
/* register state handed to the overflow handler */
struct pt_regs *regs;
/* sample data handed to the overflow handler (its ->period is exposed) */
struct perf_sample_data *data;
};
#ifdef CONFIG_CGROUP_PERF #ifdef CONFIG_CGROUP_PERF
/* /*
......
...@@ -71,6 +71,7 @@ header-y += binfmts.h ...@@ -71,6 +71,7 @@ header-y += binfmts.h
header-y += blkpg.h header-y += blkpg.h
header-y += blktrace_api.h header-y += blktrace_api.h
header-y += bpf_common.h header-y += bpf_common.h
header-y += bpf_perf_event.h
header-y += bpf.h header-y += bpf.h
header-y += bpqether.h header-y += bpqether.h
header-y += bsg.h header-y += bsg.h
......
...@@ -95,6 +95,7 @@ enum bpf_prog_type { ...@@ -95,6 +95,7 @@ enum bpf_prog_type {
BPF_PROG_TYPE_SCHED_ACT, BPF_PROG_TYPE_SCHED_ACT,
BPF_PROG_TYPE_TRACEPOINT, BPF_PROG_TYPE_TRACEPOINT,
BPF_PROG_TYPE_XDP, BPF_PROG_TYPE_XDP,
BPF_PROG_TYPE_PERF_EVENT,
}; };
#define BPF_PSEUDO_MAP_FD 1 #define BPF_PSEUDO_MAP_FD 1
......
/* Copyright (c) 2016 Facebook
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
* License as published by the Free Software Foundation.
*/
#ifndef _UAPI__LINUX_BPF_PERF_EVENT_H__
#define _UAPI__LINUX_BPF_PERF_EVENT_H__
#include <linux/types.h>
#include <linux/ptrace.h>
/*
 * Context layout as seen by a BPF_PROG_TYPE_PERF_EVENT program.
 * The program addresses fields by their offsets in this structure; the
 * verifier-side helper pe_prog_convert_ctx_access() redirects those loads to
 * the kernel's own objects.
 */
struct bpf_perf_event_data {
/* register snapshot; the sample programs read regs.ip from it */
struct pt_regs regs;
/* loaded from perf_sample_data.period on the kernel side */
__u64 sample_period;
};
#endif /* _UAPI__LINUX_BPF_PERF_EVENT_H__ */
...@@ -2333,7 +2333,8 @@ static int do_check(struct verifier_env *env) ...@@ -2333,7 +2333,8 @@ static int do_check(struct verifier_env *env)
if (err) if (err)
return err; return err;
if (BPF_SIZE(insn->code) != BPF_W) { if (BPF_SIZE(insn->code) != BPF_W &&
BPF_SIZE(insn->code) != BPF_DW) {
insn_idx++; insn_idx++;
continue; continue;
} }
...@@ -2510,6 +2511,20 @@ static int do_check(struct verifier_env *env) ...@@ -2510,6 +2511,20 @@ static int do_check(struct verifier_env *env)
return 0; return 0;
} }
/*
 * Reject map/program pairings that must not be combined.
 *
 * Currently the only restriction: a BPF_PROG_TYPE_PERF_EVENT program may not
 * use a hash or per-cpu hash map created with BPF_F_NO_PREALLOC.
 * NOTE(review): presumably because element allocation is unsafe in the
 * context perf overflow handlers run in -- confirm with the commit log.
 *
 * Returns 0 when the pairing is allowed, -EINVAL (after logging through
 * verbose()) otherwise.
 */
static int check_map_prog_compatibility(struct bpf_map *map,
					struct bpf_prog *prog)
{
	/* Only perf_event programs are restricted. */
	if (prog->type != BPF_PROG_TYPE_PERF_EVENT)
		return 0;

	/* Only the two hash flavours are restricted. */
	switch (map->map_type) {
	case BPF_MAP_TYPE_HASH:
	case BPF_MAP_TYPE_PERCPU_HASH:
		break;
	default:
		return 0;
	}

	/* Preallocated hash maps are fine. */
	if (!(map->map_flags & BPF_F_NO_PREALLOC))
		return 0;

	verbose("perf_event programs can only use preallocated hash map\n");
	return -EINVAL;
}
/* look for pseudo eBPF instructions that access map FDs and /* look for pseudo eBPF instructions that access map FDs and
* replace them with actual map pointers * replace them with actual map pointers
*/ */
...@@ -2517,7 +2532,7 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env) ...@@ -2517,7 +2532,7 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env)
{ {
struct bpf_insn *insn = env->prog->insnsi; struct bpf_insn *insn = env->prog->insnsi;
int insn_cnt = env->prog->len; int insn_cnt = env->prog->len;
int i, j; int i, j, err;
for (i = 0; i < insn_cnt; i++, insn++) { for (i = 0; i < insn_cnt; i++, insn++) {
if (BPF_CLASS(insn->code) == BPF_LDX && if (BPF_CLASS(insn->code) == BPF_LDX &&
...@@ -2561,6 +2576,12 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env) ...@@ -2561,6 +2576,12 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env)
return PTR_ERR(map); return PTR_ERR(map);
} }
err = check_map_prog_compatibility(map, env->prog);
if (err) {
fdput(f);
return err;
}
/* store map pointer inside BPF_LD_IMM64 instruction */ /* store map pointer inside BPF_LD_IMM64 instruction */
insn[0].imm = (u32) (unsigned long) map; insn[0].imm = (u32) (unsigned long) map;
insn[1].imm = ((u64) (unsigned long) map) >> 32; insn[1].imm = ((u64) (unsigned long) map) >> 32;
...@@ -2642,9 +2663,11 @@ static int convert_ctx_accesses(struct verifier_env *env) ...@@ -2642,9 +2663,11 @@ static int convert_ctx_accesses(struct verifier_env *env)
for (i = 0; i < insn_cnt; i++, insn++) { for (i = 0; i < insn_cnt; i++, insn++) {
u32 insn_delta, cnt; u32 insn_delta, cnt;
if (insn->code == (BPF_LDX | BPF_MEM | BPF_W)) if (insn->code == (BPF_LDX | BPF_MEM | BPF_W) ||
insn->code == (BPF_LDX | BPF_MEM | BPF_DW))
type = BPF_READ; type = BPF_READ;
else if (insn->code == (BPF_STX | BPF_MEM | BPF_W)) else if (insn->code == (BPF_STX | BPF_MEM | BPF_W) ||
insn->code == (BPF_STX | BPF_MEM | BPF_DW))
type = BPF_WRITE; type = BPF_WRITE;
else else
continue; continue;
......
...@@ -7022,7 +7022,7 @@ static int __perf_event_overflow(struct perf_event *event, ...@@ -7022,7 +7022,7 @@ static int __perf_event_overflow(struct perf_event *event,
irq_work_queue(&event->pending); irq_work_queue(&event->pending);
} }
event->overflow_handler(event, data, regs); READ_ONCE(event->overflow_handler)(event, data, regs);
if (*perf_event_fasync(event) && event->pending_kill) { if (*perf_event_fasync(event) && event->pending_kill) {
event->pending_wakeup = 1; event->pending_wakeup = 1;
...@@ -7637,11 +7637,83 @@ static void perf_event_free_filter(struct perf_event *event) ...@@ -7637,11 +7637,83 @@ static void perf_event_free_filter(struct perf_event *event)
ftrace_profile_free_filter(event); ftrace_profile_free_filter(event);
} }
#ifdef CONFIG_BPF_SYSCALL
/*
 * Overflow handler installed on a perf event that has a BPF program attached.
 *
 * Runs event->prog with a bpf_perf_event_data_kern context built from @data
 * and @regs. If the program returns non-zero, the sample is forwarded to the
 * handler that was installed before the program was attached
 * (event->orig_overflow_handler); a zero return drops the sample here.
 */
static void bpf_overflow_handler(struct perf_event *event,
struct perf_sample_data *data,
struct pt_regs *regs)
{
struct bpf_perf_event_data_kern ctx = {
.data = data,
.regs = regs,
};
/* stays 0 when the program is skipped, so a skipped run drops the sample */
int ret = 0;
preempt_disable();
/*
 * bpf_prog_active is a per-cpu counter; anything other than 1 after the
 * increment means the program is not run this time.
 * NOTE(review): presumably guards against re-entering BPF on this CPU.
 */
if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1))
goto out;
rcu_read_lock();
ret = BPF_PROG_RUN(event->prog, (void *)&ctx);
rcu_read_unlock();
out:
/* undo the increment on both the run and the skip path */
__this_cpu_dec(bpf_prog_active);
preempt_enable();
if (!ret)
return;
event->orig_overflow_handler(event, data, regs);
}
/*
 * Attach the BPF program referred to by @prog_fd as @event's overflow handler.
 *
 * Returns 0 on success, -EINVAL if the event already carries an
 * overflow_handler_context, -EEXIST if a program is already attached, or the
 * error from bpf_prog_get_type() (the fd must be a BPF_PROG_TYPE_PERF_EVENT
 * program).
 */
static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd)
{
struct bpf_prog *prog;
if (event->overflow_handler_context)
/* hw breakpoint or kernel counter */
return -EINVAL;
if (event->prog)
return -EEXIST;
prog = bpf_prog_get_type(prog_fd, BPF_PROG_TYPE_PERF_EVENT);
if (IS_ERR(prog))
return PTR_ERR(prog);
/*
 * Fill in prog and the saved handler before publishing
 * bpf_overflow_handler, which dereferences both.
 */
event->prog = prog;
event->orig_overflow_handler = READ_ONCE(event->overflow_handler);
WRITE_ONCE(event->overflow_handler, bpf_overflow_handler);
return 0;
}
/*
 * Detach the BPF overflow-handler program from @event, if one is attached.
 * Restores the saved original handler, clears event->prog and drops the
 * program reference. Does nothing when no program is attached.
 */
static void perf_event_free_bpf_handler(struct perf_event *event)
{
struct bpf_prog *prog = event->prog;
if (!prog)
return;
/* put the original handler back before the program reference is dropped */
WRITE_ONCE(event->overflow_handler, event->orig_overflow_handler);
event->prog = NULL;
bpf_prog_put(prog);
}
#else
/* !CONFIG_BPF_SYSCALL variant: attaching a BPF handler is not supported. */
static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd)
{
return -EOPNOTSUPP;
}
/* !CONFIG_BPF_SYSCALL variant: nothing can be attached, so nothing to free. */
static void perf_event_free_bpf_handler(struct perf_event *event)
{
}
#endif
static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
{ {
bool is_kprobe, is_tracepoint; bool is_kprobe, is_tracepoint;
struct bpf_prog *prog; struct bpf_prog *prog;
if (event->attr.type == PERF_TYPE_HARDWARE ||
event->attr.type == PERF_TYPE_SOFTWARE)
return perf_event_set_bpf_handler(event, prog_fd);
if (event->attr.type != PERF_TYPE_TRACEPOINT) if (event->attr.type != PERF_TYPE_TRACEPOINT)
return -EINVAL; return -EINVAL;
...@@ -7682,6 +7754,8 @@ static void perf_event_free_bpf_prog(struct perf_event *event) ...@@ -7682,6 +7754,8 @@ static void perf_event_free_bpf_prog(struct perf_event *event)
{ {
struct bpf_prog *prog; struct bpf_prog *prog;
perf_event_free_bpf_handler(event);
if (!event->tp_event) if (!event->tp_event)
return; return;
...@@ -8998,6 +9072,19 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, ...@@ -8998,6 +9072,19 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
if (!overflow_handler && parent_event) { if (!overflow_handler && parent_event) {
overflow_handler = parent_event->overflow_handler; overflow_handler = parent_event->overflow_handler;
context = parent_event->overflow_handler_context; context = parent_event->overflow_handler_context;
#ifdef CONFIG_BPF_SYSCALL
if (overflow_handler == bpf_overflow_handler) {
struct bpf_prog *prog = bpf_prog_inc(parent_event->prog);
if (IS_ERR(prog)) {
err = PTR_ERR(prog);
goto err_ns;
}
event->prog = prog;
event->orig_overflow_handler =
parent_event->orig_overflow_handler;
}
#endif
} }
if (overflow_handler) { if (overflow_handler) {
......
/* Copyright (c) 2011-2015 PLUMgrid, http://plumgrid.com /* Copyright (c) 2011-2015 PLUMgrid, http://plumgrid.com
* Copyright (c) 2016 Facebook
* *
* This program is free software; you can redistribute it and/or * This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public * modify it under the terms of version 2 of the GNU General Public
...@@ -8,6 +9,7 @@ ...@@ -8,6 +9,7 @@
#include <linux/types.h> #include <linux/types.h>
#include <linux/slab.h> #include <linux/slab.h>
#include <linux/bpf.h> #include <linux/bpf.h>
#include <linux/bpf_perf_event.h>
#include <linux/filter.h> #include <linux/filter.h>
#include <linux/uaccess.h> #include <linux/uaccess.h>
#include <linux/ctype.h> #include <linux/ctype.h>
...@@ -552,10 +554,69 @@ static struct bpf_prog_type_list tracepoint_tl = { ...@@ -552,10 +554,69 @@ static struct bpf_prog_type_list tracepoint_tl = {
.type = BPF_PROG_TYPE_TRACEPOINT, .type = BPF_PROG_TYPE_TRACEPOINT,
}; };
/*
 * Verifier callback: decide whether a perf_event program may access its
 * context at byte offset @off with an access of @size bytes.
 *
 * Only reads inside struct bpf_perf_event_data are allowed, they must be
 * aligned to their own size, and the size must be exactly sizeof(u64) for
 * sample_period or sizeof(long) for any other offset. @reg_type is not
 * touched.
 */
static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type,
				    enum bpf_reg_type *reg_type)
{
	size_t expected;

	/* Reject anything outside the context, and anything that is not a read. */
	if (off < 0 || off >= sizeof(struct bpf_perf_event_data) ||
	    type != BPF_READ)
		return false;

	/* Accesses must be naturally aligned. */
	if (off % size != 0)
		return false;

	expected = (off == offsetof(struct bpf_perf_event_data, sample_period)) ?
			sizeof(u64) : sizeof(long);

	return size == expected;
}
/*
 * Verifier callback: rewrite one context load of a perf_event program into
 * loads against the kernel-side struct bpf_perf_event_data_kern.
 *
 * @ctx_off is the offset the program used within struct bpf_perf_event_data.
 * Two instructions are written to @insn_buf: the first loads the relevant
 * pointer (data or regs) out of the kernel context in @src_reg into @dst_reg,
 * the second loads the requested field through that pointer.
 * @type and @prog are unused here.
 *
 * Returns the number of instructions written.
 */
static u32 pe_prog_convert_ctx_access(enum bpf_access_type type, int dst_reg,
int src_reg, int ctx_off,
struct bpf_insn *insn_buf,
struct bpf_prog *prog)
{
struct bpf_insn *insn = insn_buf;
switch (ctx_off) {
case offsetof(struct bpf_perf_event_data, sample_period):
/* the second load below is a fixed 8-byte (BPF_DW) load */
BUILD_BUG_ON(FIELD_SIZEOF(struct perf_sample_data, period) != sizeof(u64));
/* dst = ctx->data */
*insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct bpf_perf_event_data_kern, data)),
dst_reg, src_reg,
offsetof(struct bpf_perf_event_data_kern, data));
/* dst = dst->period */
*insn++ = BPF_LDX_MEM(BPF_DW, dst_reg, dst_reg,
offsetof(struct perf_sample_data, period));
break;
default:
/* every other offset is treated as a long-sized slot inside pt_regs */
/* dst = ctx->regs */
*insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct bpf_perf_event_data_kern, regs)),
dst_reg, src_reg,
offsetof(struct bpf_perf_event_data_kern, regs));
/* dst = *(long *)((char *)dst + ctx_off) */
*insn++ = BPF_LDX_MEM(bytes_to_bpf_size(sizeof(long)),
dst_reg, dst_reg, ctx_off);
break;
}
return insn - insn_buf;
}
/*
 * Verifier hooks for BPF_PROG_TYPE_PERF_EVENT programs. Helper lookup reuses
 * tp_prog_func_proto; context access checking and rewriting use the pe_prog_*
 * callbacks.
 */
static const struct bpf_verifier_ops perf_event_prog_ops = {
.get_func_proto = tp_prog_func_proto,
.is_valid_access = pe_prog_is_valid_access,
.convert_ctx_access = pe_prog_convert_ctx_access,
};
/* Registration record tying BPF_PROG_TYPE_PERF_EVENT to perf_event_prog_ops. */
static struct bpf_prog_type_list perf_event_tl = {
.ops = &perf_event_prog_ops,
.type = BPF_PROG_TYPE_PERF_EVENT,
};
static int __init register_kprobe_prog_ops(void) static int __init register_kprobe_prog_ops(void)
{ {
bpf_register_prog_type(&kprobe_tl); bpf_register_prog_type(&kprobe_tl);
bpf_register_prog_type(&tracepoint_tl); bpf_register_prog_type(&tracepoint_tl);
bpf_register_prog_type(&perf_event_tl);
return 0; return 0;
} }
late_initcall(register_kprobe_prog_ops); late_initcall(register_kprobe_prog_ops);
...@@ -25,6 +25,8 @@ hostprogs-y += test_cgrp2_array_pin ...@@ -25,6 +25,8 @@ hostprogs-y += test_cgrp2_array_pin
hostprogs-y += xdp1 hostprogs-y += xdp1
hostprogs-y += xdp2 hostprogs-y += xdp2
hostprogs-y += test_current_task_under_cgroup hostprogs-y += test_current_task_under_cgroup
hostprogs-y += trace_event
hostprogs-y += sampleip
test_verifier-objs := test_verifier.o libbpf.o test_verifier-objs := test_verifier.o libbpf.o
test_maps-objs := test_maps.o libbpf.o test_maps-objs := test_maps.o libbpf.o
...@@ -52,6 +54,8 @@ xdp1-objs := bpf_load.o libbpf.o xdp1_user.o ...@@ -52,6 +54,8 @@ xdp1-objs := bpf_load.o libbpf.o xdp1_user.o
xdp2-objs := bpf_load.o libbpf.o xdp1_user.o xdp2-objs := bpf_load.o libbpf.o xdp1_user.o
test_current_task_under_cgroup-objs := bpf_load.o libbpf.o \ test_current_task_under_cgroup-objs := bpf_load.o libbpf.o \
test_current_task_under_cgroup_user.o test_current_task_under_cgroup_user.o
trace_event-objs := bpf_load.o libbpf.o trace_event_user.o
sampleip-objs := bpf_load.o libbpf.o sampleip_user.o
# Tell kbuild to always build the programs # Tell kbuild to always build the programs
always := $(hostprogs-y) always := $(hostprogs-y)
...@@ -79,6 +83,8 @@ always += test_cgrp2_tc_kern.o ...@@ -79,6 +83,8 @@ always += test_cgrp2_tc_kern.o
always += xdp1_kern.o always += xdp1_kern.o
always += xdp2_kern.o always += xdp2_kern.o
always += test_current_task_under_cgroup_kern.o always += test_current_task_under_cgroup_kern.o
always += trace_event_kern.o
always += sampleip_kern.o
HOSTCFLAGS += -I$(objtree)/usr/include HOSTCFLAGS += -I$(objtree)/usr/include
...@@ -103,6 +109,8 @@ HOSTLOADLIBES_test_overhead += -lelf -lrt ...@@ -103,6 +109,8 @@ HOSTLOADLIBES_test_overhead += -lelf -lrt
HOSTLOADLIBES_xdp1 += -lelf HOSTLOADLIBES_xdp1 += -lelf
HOSTLOADLIBES_xdp2 += -lelf HOSTLOADLIBES_xdp2 += -lelf
HOSTLOADLIBES_test_current_task_under_cgroup += -lelf HOSTLOADLIBES_test_current_task_under_cgroup += -lelf
HOSTLOADLIBES_trace_event += -lelf
HOSTLOADLIBES_sampleip += -lelf
# Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline: # Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline:
# make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang # make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang
......
...@@ -55,6 +55,8 @@ static int (*bpf_skb_get_tunnel_opt)(void *ctx, void *md, int size) = ...@@ -55,6 +55,8 @@ static int (*bpf_skb_get_tunnel_opt)(void *ctx, void *md, int size) =
(void *) BPF_FUNC_skb_get_tunnel_opt; (void *) BPF_FUNC_skb_get_tunnel_opt;
static int (*bpf_skb_set_tunnel_opt)(void *ctx, void *md, int size) = static int (*bpf_skb_set_tunnel_opt)(void *ctx, void *md, int size) =
(void *) BPF_FUNC_skb_set_tunnel_opt; (void *) BPF_FUNC_skb_set_tunnel_opt;
/*
 * Helper-call stub in the same style as the neighbouring declarations: the
 * helper id BPF_FUNC_get_prandom_u32 is cast to a function pointer.
 * NOTE(review): declared as returning unsigned long long although the helper
 * name suggests a 32-bit value -- confirm the intended width.
 */
static unsigned long long (*bpf_get_prandom_u32)(void) =
(void *) BPF_FUNC_get_prandom_u32;
/* llvm builtin functions that eBPF C program may use to /* llvm builtin functions that eBPF C program may use to
* emit BPF_LD_ABS and BPF_LD_IND instructions * emit BPF_LD_ABS and BPF_LD_IND instructions
......
...@@ -51,6 +51,7 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size) ...@@ -51,6 +51,7 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
bool is_kretprobe = strncmp(event, "kretprobe/", 10) == 0; bool is_kretprobe = strncmp(event, "kretprobe/", 10) == 0;
bool is_tracepoint = strncmp(event, "tracepoint/", 11) == 0; bool is_tracepoint = strncmp(event, "tracepoint/", 11) == 0;
bool is_xdp = strncmp(event, "xdp", 3) == 0; bool is_xdp = strncmp(event, "xdp", 3) == 0;
bool is_perf_event = strncmp(event, "perf_event", 10) == 0;
enum bpf_prog_type prog_type; enum bpf_prog_type prog_type;
char buf[256]; char buf[256];
int fd, efd, err, id; int fd, efd, err, id;
...@@ -69,6 +70,8 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size) ...@@ -69,6 +70,8 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
prog_type = BPF_PROG_TYPE_TRACEPOINT; prog_type = BPF_PROG_TYPE_TRACEPOINT;
} else if (is_xdp) { } else if (is_xdp) {
prog_type = BPF_PROG_TYPE_XDP; prog_type = BPF_PROG_TYPE_XDP;
} else if (is_perf_event) {
prog_type = BPF_PROG_TYPE_PERF_EVENT;
} else { } else {
printf("Unknown event '%s'\n", event); printf("Unknown event '%s'\n", event);
return -1; return -1;
...@@ -82,7 +85,7 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size) ...@@ -82,7 +85,7 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
prog_fd[prog_cnt++] = fd; prog_fd[prog_cnt++] = fd;
if (is_xdp) if (is_xdp || is_perf_event)
return 0; return 0;
if (is_socket) { if (is_socket) {
...@@ -326,6 +329,7 @@ int load_bpf_file(char *path) ...@@ -326,6 +329,7 @@ int load_bpf_file(char *path)
memcmp(shname_prog, "kretprobe/", 10) == 0 || memcmp(shname_prog, "kretprobe/", 10) == 0 ||
memcmp(shname_prog, "tracepoint/", 11) == 0 || memcmp(shname_prog, "tracepoint/", 11) == 0 ||
memcmp(shname_prog, "xdp", 3) == 0 || memcmp(shname_prog, "xdp", 3) == 0 ||
memcmp(shname_prog, "perf_event", 10) == 0 ||
memcmp(shname_prog, "socket", 6) == 0) memcmp(shname_prog, "socket", 6) == 0)
load_and_attach(shname_prog, insns, data_prog->d_size); load_and_attach(shname_prog, insns, data_prog->d_size);
} }
...@@ -344,6 +348,7 @@ int load_bpf_file(char *path) ...@@ -344,6 +348,7 @@ int load_bpf_file(char *path)
memcmp(shname, "kretprobe/", 10) == 0 || memcmp(shname, "kretprobe/", 10) == 0 ||
memcmp(shname, "tracepoint/", 11) == 0 || memcmp(shname, "tracepoint/", 11) == 0 ||
memcmp(shname, "xdp", 3) == 0 || memcmp(shname, "xdp", 3) == 0 ||
memcmp(shname, "perf_event", 10) == 0 ||
memcmp(shname, "socket", 6) == 0) memcmp(shname, "socket", 6) == 0)
load_and_attach(shname, data->d_buf, data->d_size); load_and_attach(shname, data->d_buf, data->d_size);
} }
......
/* Copyright 2016 Netflix, Inc.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
* License as published by the Free Software Foundation.
*/
#include <linux/version.h>
#include <linux/ptrace.h>
#include <uapi/linux/bpf.h>
#include <uapi/linux/bpf_perf_event.h>
#include "bpf_helpers.h"
#define MAX_IPS 8192
/*
 * Hash map from sampled instruction pointer (u64 key) to hit count (u32
 * value), holding at most MAX_IPS distinct addresses.
 */
struct bpf_map_def SEC("maps") ip_map = {
.type = BPF_MAP_TYPE_HASH,
.key_size = sizeof(u64),
.value_size = sizeof(u32),
.max_entries = MAX_IPS,
};
/*
 * perf_event program: count one hit for the sampled instruction pointer.
 *
 * Looks up ctx->regs.ip in ip_map and bumps its counter, or inserts the
 * address with an initial count of 1 when it has not been seen yet.
 * Always returns 0.
 */
SEC("perf_event")
int do_sample(struct bpf_perf_event_data *ctx)
{
	u64 ip;
	u32 *value, init_val = 1;

	ip = ctx->regs.ip;
	value = bpf_map_lookup_elem(&ip_map, &ip);
	if (value)
		/*
		 * ip_map is a plain (not per-cpu) hash map and the program is
		 * attached to one event per CPU, so a plain "+= 1" can lose
		 * updates; use an atomic add instead.
		 */
		__sync_fetch_and_add(value, 1);
	else
		/* E2BIG not tested for this example only */
		bpf_map_update_elem(&ip_map, &ip, &init_val, BPF_NOEXIST);

	return 0;
}
char _license[] SEC("license") = "GPL";
/*
* sampleip: sample instruction pointer and frequency count in a BPF map.
*
* Copyright 2016 Netflix, Inc.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
* License as published by the Free Software Foundation.
*/
#include <stdio.h>
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <errno.h>
#include <signal.h>
#include <string.h>
#include <assert.h>
#include <linux/perf_event.h>
#include <linux/ptrace.h>
#include <linux/bpf.h>
#include <sys/ioctl.h>
#include "libbpf.h"
#include "bpf_load.h"
#define DEFAULT_FREQ 99
#define DEFAULT_SECS 5
#define MAX_IPS 8192
#define PAGE_OFFSET 0xffff880000000000
static int nr_cpus;
/* Print the command-line synopsis to stdout. */
static void usage(void)
{
	static const char *const help[] = {
		"USAGE: sampleip [-F freq] [duration]\n",
		" -F freq # sample frequency (Hertz), default 99\n",
		" duration # sampling duration (seconds), default 5\n",
	};
	size_t line;

	for (line = 0; line < sizeof(help) / sizeof(help[0]); line++)
		fputs(help[line], stdout);
}
static int sampling_start(int *pmu_fd, int freq)
{
int i;
struct perf_event_attr pe_sample_attr = {
.type = PERF_TYPE_SOFTWARE,
.freq = 1,
.sample_period = freq,
.config = PERF_COUNT_SW_CPU_CLOCK,
.inherit = 1,
};
for (i = 0; i < nr_cpus; i++) {
pmu_fd[i] = perf_event_open(&pe_sample_attr, -1 /* pid */, i,
-1 /* group_fd */, 0 /* flags */);
if (pmu_fd[i] < 0) {
fprintf(stderr, "ERROR: Initializing perf sampling\n");
return 1;
}
assert(ioctl(pmu_fd[i], PERF_EVENT_IOC_SET_BPF,
prog_fd[0]) == 0);
assert(ioctl(pmu_fd[i], PERF_EVENT_IOC_ENABLE, 0) == 0);
}
return 0;
}
/* Close the nr_cpus perf event descriptors stored in @pmu_fd. */
static void sampling_end(int *pmu_fd)
{
	int cpu = 0;

	while (cpu < nr_cpus) {
		close(pmu_fd[cpu]);
		cpu++;
	}
}
/* One ip_map entry copied to user space: sampled address and its hit count. */
struct ipcount {
	__u64 ip;
	__u32 count;
};

/* used for sorting */
struct ipcount counts[MAX_IPS];

/*
 * qsort() comparator ordering struct ipcount entries by ascending count.
 *
 * Uses comparisons instead of "a - b": the difference of two __u32 values is
 * itself unsigned and, once converted to int, has the wrong sign whenever the
 * counts differ by more than INT_MAX.
 *
 * Returns <0, 0 or >0 when @p1's count is below, equal to or above @p2's.
 */
static int count_cmp(const void *p1, const void *p2)
{
	const struct ipcount *a = p1;
	const struct ipcount *b = p2;

	return (a->count > b->count) - (a->count < b->count);
}
/*
 * Dump the IP -> count map behind @fd as a table sorted by ascending count.
 *
 * Entries are copied into the global counts[] array (at most MAX_IPS of
 * them), sorted with count_cmp() and printed; addresses above PAGE_OFFSET are
 * resolved through ksym_search(), everything else is labelled "(user)".
 * Warns when the table is full, since samples may then have been dropped.
 */
static void print_ip_map(int fd)
{
	struct ksym *sym;
	__u64 key, next_key;
	__u32 value;
	int i, max;

	printf("%-19s %-32s %s\n", "ADDR", "KSYM", "COUNT");

	/* fetch IPs and counts; never write past the end of counts[] */
	key = 0, i = 0;
	while (i < MAX_IPS && bpf_get_next_key(fd, &key, &next_key) == 0) {
		key = next_key;
		/* a failed lookup leaves value unset: skip the entry */
		if (bpf_lookup_elem(fd, &next_key, &value) != 0)
			continue;
		counts[i].ip = next_key;
		counts[i++].count = value;
	}
	max = i;

	/* sort and print */
	qsort(counts, max, sizeof(struct ipcount), count_cmp);
	for (i = 0; i < max; i++) {
		if (counts[i].ip > PAGE_OFFSET) {
			sym = ksym_search(counts[i].ip);
			printf("0x%-17llx %-32s %u\n", counts[i].ip, sym->name,
			       counts[i].count);
		} else {
			printf("0x%-17llx %-32s %u\n", counts[i].ip, "(user)",
			       counts[i].count);
		}
	}

	if (max == MAX_IPS) {
		printf("WARNING: IP hash was full (max %d entries); ", max);
		printf("may have dropped samples\n");
	}
}
/*
 * SIGINT handler (installed in main()): print whatever has been collected in
 * map_fd[0] so far and terminate with status 0. @sig is unused.
 * NOTE(review): printf()/exit() are not async-signal-safe; acceptable for a
 * sample tool, but do not copy this pattern into production code.
 */
static void int_exit(int sig)
{
printf("\n");
print_ip_map(map_fd[0]);
exit(0);
}
/*
 * sampleip entry point.
 *
 * Usage: sampleip [-F freq] [duration]
 * Loads "<argv[0]>_kern.o", samples the instruction pointer on every CPU at
 * @freq Hertz for @secs seconds (or until Ctrl-C) and prints the per-address
 * hit counts.
 *
 * Exit status: 0 on success or after -h / an unknown option, 1 on bad
 * arguments or a setup failure, 2 when /proc/kallsyms cannot be loaded.
 */
int main(int argc, char **argv)
{
	char filename[256];
	int *pmu_fd, opt, freq = DEFAULT_FREQ, secs = DEFAULT_SECS;

	/* process arguments */
	while ((opt = getopt(argc, argv, "F:h")) != -1) {
		switch (opt) {
		case 'F':
			freq = atoi(optarg);
			break;
		case 'h':
		default:
			usage();
			return 0;
		}
	}
	if (argc - optind == 1)
		secs = atoi(argv[optind]);
	/*
	 * atoi() yields 0 for non-numeric input; negative values are just as
	 * meaningless (a negative secs would turn into a huge sleep()).
	 */
	if (freq <= 0 || secs <= 0) {
		usage();
		return 1;
	}

	/* initialize kernel symbol translation */
	if (load_kallsyms()) {
		fprintf(stderr, "ERROR: loading /proc/kallsyms\n");
		return 2;
	}

	/* create perf FDs for each CPU */
	nr_cpus = sysconf(_SC_NPROCESSORS_CONF);
	if (nr_cpus <= 0) {
		/* sysconf() returns -1 on error; do not size malloc() with it */
		fprintf(stderr, "ERROR: querying number of CPUs\n");
		return 1;
	}
	pmu_fd = malloc(nr_cpus * sizeof(int));
	if (pmu_fd == NULL) {
		fprintf(stderr, "ERROR: malloc of pmu_fd\n");
		return 1;
	}

	/* load BPF program */
	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
	if (load_bpf_file(filename)) {
		fprintf(stderr, "ERROR: loading BPF program (errno %d):\n",
			errno);
		if (strcmp(bpf_log_buf, "") == 0)
			fprintf(stderr, "Try: ulimit -l unlimited\n");
		else
			fprintf(stderr, "%s", bpf_log_buf);
		free(pmu_fd);
		return 1;
	}
	signal(SIGINT, int_exit);

	/* do sampling */
	printf("Sampling at %d Hertz for %d seconds. Ctrl-C also ends.\n",
	       freq, secs);
	if (sampling_start(pmu_fd, freq) != 0) {
		free(pmu_fd);
		return 1;
	}
	sleep(secs);
	sampling_end(pmu_fd);
	free(pmu_fd);

	/* output sample counts */
	print_ip_map(map_fd[0]);

	return 0;
}
/* Copyright (c) 2016 Facebook
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
* License as published by the Free Software Foundation.
*/
#include <linux/ptrace.h>
#include <linux/version.h>
#include <uapi/linux/bpf.h>
#include <uapi/linux/bpf_perf_event.h>
#include <uapi/linux/perf_event.h>
#include "bpf_helpers.h"
/*
 * Key of the "counts" map: task name plus the ids returned by
 * bpf_get_stackid() for the kernel and user stacks of the sample.
 */
struct key_t {
char comm[TASK_COMM_LEN];
u32 kernstack;
u32 userstack;
};
/* Hash map: struct key_t (comm + stack ids) -> number of samples (u64). */
struct bpf_map_def SEC("maps") counts = {
.type = BPF_MAP_TYPE_HASH,
.key_size = sizeof(struct key_t),
.value_size = sizeof(u64),
.max_entries = 10000,
};
/*
 * Stack-trace map filled through bpf_get_stackid(): u32 stack id -> up to
 * PERF_MAX_STACK_DEPTH u64 entries.
 */
struct bpf_map_def SEC("maps") stackmap = {
.type = BPF_MAP_TYPE_STACK_TRACE,
.key_size = sizeof(u32),
.value_size = PERF_MAX_STACK_DEPTH * sizeof(u64),
.max_entries = 10000,
};
#define KERN_STACKID_FLAGS (0 | BPF_F_FAST_STACK_CMP)
#define USER_STACKID_FLAGS (0 | BPF_F_FAST_STACK_CMP | BPF_F_USER_STACK)
/*
 * perf_event program: count samples per (task name, kernel stack, user
 * stack).
 *
 * Samples whose period is below 10000 are ignored. If neither stack id could
 * be obtained, a diagnostic line is written with bpf_trace_printk() and the
 * sample is not counted. Always returns 0.
 */
SEC("perf_event")
int bpf_prog1(struct bpf_perf_event_data *ctx)
{
	char fmt[] = "CPU-%d period %lld ip %llx";
	u32 cpu = bpf_get_smp_processor_id();
	struct key_t key;
	u64 *val, one = 1;

	if (ctx->sample_period < 10000)
		/* ignore warmup */
		return 0;
	bpf_get_current_comm(&key.comm, sizeof(key.comm));
	key.kernstack = bpf_get_stackid(ctx, &stackmap, KERN_STACKID_FLAGS);
	key.userstack = bpf_get_stackid(ctx, &stackmap, USER_STACKID_FLAGS);
	/* the ids are stored as u32; a negative value marks a failed lookup */
	if ((int)key.kernstack < 0 && (int)key.userstack < 0) {
		bpf_trace_printk(fmt, sizeof(fmt), cpu, ctx->sample_period,
				 ctx->regs.ip);
		return 0;
	}

	val = bpf_map_lookup_elem(&counts, &key);
	if (val)
		/*
		 * "counts" is a plain hash map shared by every event the
		 * program is attached to, so increment atomically rather than
		 * with a racy read-modify-write.
		 */
		__sync_fetch_and_add(val, 1);
	else
		bpf_map_update_elem(&counts, &key, &one, BPF_NOEXIST);
	return 0;
}
char _license[] SEC("license") = "GPL";
/* Copyright (c) 2016 Facebook
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
* License as published by the Free Software Foundation.
*/
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
#include <fcntl.h>
#include <poll.h>
#include <sys/ioctl.h>
#include <linux/perf_event.h>
#include <linux/bpf.h>
#include <signal.h>
#include <assert.h>
#include <errno.h>
#include <sys/resource.h>
#include "libbpf.h"
#include "bpf_load.h"
#define SAMPLE_FREQ 50
static bool sys_read_seen, sys_write_seen;
static void print_ksym(__u64 addr)
{
struct ksym *sym;
if (!addr)
return;
sym = ksym_search(addr);
printf("%s;", sym->name);
if (!strcmp(sym->name, "sys_read"))
sys_read_seen = true;
else if (!strcmp(sym->name, "sys_write"))
sys_write_seen = true;
}
/* Print a non-zero @addr in hex followed by ';'; zero addresses are skipped. */
static void print_addr(__u64 addr)
{
	if (addr != 0)
		printf("%llx;", addr);
}
#define TASK_COMM_LEN 16
/*
 * User-space view of the "counts" map key: task name plus kernel and user
 * stack ids.
 * NOTE(review): must stay layout-compatible with struct key_t in the _kern.c
 * program.
 */
struct key_t {
char comm[TASK_COMM_LEN];
__u32 kernstack;
__u32 userstack;
};
/*
 * Print one folded stack line: "<count> <comm>;<kernel frames>-;<user frames>".
 *
 * Frames are looked up in the stack map (map_fd[1]) by the ids stored in
 * @key and printed outermost first; "---;" stands in for a stack that cannot
 * be looked up. Afterwards a one-time warning is printed if either id reports
 * a stack map collision (-EEXIST), otherwise the raw ids are printed when
 * both are errors.
 */
static void print_stack(struct key_t *key, __u64 count)
{
	__u64 ip[PERF_MAX_STACK_DEPTH] = {};
	static bool warned;
	bool collision;
	int i;

	/* count is unsigned: print it with a matching conversion */
	printf("%3llu %s;", (unsigned long long)count, key->comm);
	if (bpf_lookup_elem(map_fd[1], &key->kernstack, ip) != 0) {
		printf("---;");
	} else {
		for (i = PERF_MAX_STACK_DEPTH - 1; i >= 0; i--)
			print_ksym(ip[i]);
	}
	printf("-;");
	if (bpf_lookup_elem(map_fd[1], &key->userstack, ip) != 0) {
		printf("---;");
	} else {
		for (i = PERF_MAX_STACK_DEPTH - 1; i >= 0; i--)
			print_addr(ip[i]);
	}
	printf("\n");

	/* the ids are stored as __u32 but carry negative errno values on failure */
	collision = (int)key->kernstack == -EEXIST ||
		    (int)key->userstack == -EEXIST;
	if (collision && !warned) {
		printf("stackmap collisions seen. Consider increasing size\n");
		warned = true;
	} else if ((int)key->kernstack < 0 && (int)key->userstack < 0) {
		printf("err stackid %d %d\n", (int)key->kernstack,
		       (int)key->userstack);
	}
}
/*
 * Terminate the test. Installed as the SIGINT handler and also called
 * directly from main() and print_stacks(); @sig is unused.
 * kill(0, ...) signals every process in the caller's process group, which
 * takes down the forked trace-pipe reader as well.
 * NOTE(review): the group includes this process, so exit(0) is presumably
 * never reached.
 */
static void int_exit(int sig)
{
kill(0, SIGKILL);
exit(0);
}
/*
 * Print and drain everything collected so far.
 *
 * Walks the counts map (map_fd[0]), prints each entry with print_stack() and
 * deletes it, then checks that both sys_read and sys_write showed up in some
 * kernel stack (calling int_exit() if not) and finally empties the stack map
 * (map_fd[1]).
 */
static void print_stacks(void)
{
	struct key_t key = {}, next_key;
	__u64 value;
	__u32 stackid = 0, next_id;
	int fd = map_fd[0], stack_map = map_fd[1];

	sys_read_seen = sys_write_seen = false;
	while (bpf_get_next_key(fd, &key, &next_key) == 0) {
		/* only print when the lookup actually filled in value */
		if (bpf_lookup_elem(fd, &next_key, &value) == 0)
			print_stack(&next_key, value);
		bpf_delete_elem(fd, &next_key);
		key = next_key;
	}

	if (!sys_read_seen || !sys_write_seen) {
		printf("BUG kernel stack doesn't contain sys_read() and sys_write()\n");
		int_exit(0);
	}

	/* clear stack map */
	while (bpf_get_next_key(stack_map, &stackid, &next_id) == 0) {
		bpf_delete_elem(stack_map, &next_id);
		stackid = next_id;
	}
}
/*
 * Run the test with one perf event per CPU.
 *
 * Opens @attr on every configured CPU, attaches prog_fd[0] and enables the
 * event, generates load with dd, prints the collected stacks and closes every
 * descriptor again. Any setup failure is reported and ends this test run
 * after cleaning up.
 */
static void test_perf_event_all_cpu(struct perf_event_attr *attr)
{
	int nr_cpus = sysconf(_SC_NPROCESSORS_CONF);
	int *pmu_fd;
	int i, opened = 0;

	if (nr_cpus <= 0) {
		/* sysconf() returns -1 on error */
		printf("failed to query number of cpus\n");
		return;
	}
	pmu_fd = malloc(nr_cpus * sizeof(*pmu_fd));
	if (!pmu_fd) {
		printf("failed to allocate pmu_fd array\n");
		return;
	}

	/* open perf_event on all cpus */
	for (i = 0; i < nr_cpus; i++) {
		pmu_fd[i] = perf_event_open(attr, -1, i, -1, 0);
		if (pmu_fd[i] < 0) {
			printf("perf_event_open failed\n");
			goto all_cpu_err;
		}
		opened++;
		/*
		 * Do not wrap the ioctls in assert(): with NDEBUG they would
		 * not be executed at all.
		 */
		if (ioctl(pmu_fd[i], PERF_EVENT_IOC_SET_BPF, prog_fd[0]) != 0 ||
		    ioctl(pmu_fd[i], PERF_EVENT_IOC_ENABLE, 0) != 0) {
			printf("attaching bpf program failed: %s\n",
			       strerror(errno));
			goto all_cpu_err;
		}
	}
	system("dd if=/dev/zero of=/dev/null count=5000k");
	print_stacks();
all_cpu_err:
	/* close exactly the descriptors that were opened */
	while (--opened >= 0)
		close(pmu_fd[opened]);
	free(pmu_fd);
}
/*
 * Run the test with a single perf event bound to the current task (pid 0,
 * any CPU): attach prog_fd[0], enable the event, generate load with dd and
 * print the collected stacks. Setup failures are reported and end this test
 * run.
 */
static void test_perf_event_task(struct perf_event_attr *attr)
{
	int pmu_fd;

	/* open task bound event */
	pmu_fd = perf_event_open(attr, 0, -1, -1, 0);
	if (pmu_fd < 0) {
		printf("perf_event_open failed\n");
		return;
	}
	/* keep the ioctls outside assert() so NDEBUG builds still run them */
	if (ioctl(pmu_fd, PERF_EVENT_IOC_SET_BPF, prog_fd[0]) != 0 ||
	    ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0) != 0) {
		printf("attaching bpf program failed: %s\n", strerror(errno));
		close(pmu_fd);
		return;
	}
	system("dd if=/dev/zero of=/dev/null count=5000k");
	print_stacks();
	close(pmu_fd);
}
/*
 * Exercise the BPF program on a hardware (CPU cycles) and a software (CPU
 * clock) event, each sampled at SAMPLE_FREQ Hertz with inheritance enabled.
 * For each event type the all-CPU test runs first, then the task-bound test.
 */
static void test_bpf_perf_event(void)
{
	struct perf_event_attr attr_type_hw = {
		.type = PERF_TYPE_HARDWARE,
		.config = PERF_COUNT_HW_CPU_CYCLES,
		.freq = 1,
		.sample_freq = SAMPLE_FREQ,
		.inherit = 1,
	};
	struct perf_event_attr attr_type_sw = {
		.type = PERF_TYPE_SOFTWARE,
		.config = PERF_COUNT_SW_CPU_CLOCK,
		.freq = 1,
		.sample_freq = SAMPLE_FREQ,
		.inherit = 1,
	};
	struct perf_event_attr *variants[] = { &attr_type_hw, &attr_type_sw };
	unsigned int v;

	for (v = 0; v < sizeof(variants) / sizeof(variants[0]); v++) {
		test_perf_event_all_cpu(variants[v]);
		test_perf_event_task(variants[v]);
	}
}
/*
 * trace_event entry point.
 *
 * Lifts the locked-memory limit, loads kallsyms and "<argv[0]>_kern.o", forks
 * a child that streams the trace pipe, runs the perf event tests and finally
 * tears everything down through int_exit().
 *
 * Exit status: 1 when kallsyms cannot be processed, 2 when the BPF object
 * fails to load, 3 when fork() fails.
 */
int main(int argc, char **argv)
{
	struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
	char filename[256];
	pid_t pid;

	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
	setrlimit(RLIMIT_MEMLOCK, &r);

	signal(SIGINT, int_exit);

	if (load_kallsyms()) {
		printf("failed to process /proc/kallsyms\n");
		return 1;
	}

	if (load_bpf_file(filename)) {
		printf("%s", bpf_log_buf);
		return 2;
	}

	pid = fork();
	if (pid < 0) {
		/* without the child nobody drains the trace pipe */
		printf("fork failed: %s\n", strerror(errno));
		return 3;
	}
	if (pid == 0) {
		read_trace_pipe();
		return 0;
	}
	test_bpf_perf_event();

	int_exit(0);
	return 0;
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment