Commit 0d722379 authored by Yonghong Song's avatar Yonghong Song

introduce {attach|detach}_raw_tracepoint API

The motivation comes from pull request #1689.
It attached a kprobe bpf program to kernel function
ttwu_do_wakeup for more accurate tracing.
Unfortunately, it broke runqlat.py in my
4.17 environment since ttwu_do_wakeup function
is inlined in my kernel with gcc 7.3.1.

4.17 introduced raw_tracepoint and this patch
added the relevant API to bcc. With this,
we can use tracepoints
sched:{sched_wakeup, sched_wakeup_new, sched_switch}
to measure runq latency more reliably.
Signed-off-by: default avatarYonghong Song <yhs@fb.com>
parent 67cc2ff4
......@@ -14,6 +14,7 @@ This guide is incomplete. If something feels missing, check the bcc and kernel s
- [4. uprobes](#4-uprobes)
- [5. uretprobes](#5-uretprobes)
- [6. USDT probes](#6-usdt-probes)
- [7. Raw Tracepoints](#7-raw-tracepoints)
- [Data](#data)
- [1. bpf_probe_read()](#1-bpf_probe_read)
- [2. bpf_probe_read_str()](#2-bpf_probe_read_str)
......@@ -61,6 +62,7 @@ This guide is incomplete. If something feels missing, check the bcc and kernel s
- [4. attach_uprobe()](#4-attach_uprobe)
- [5. attach_uretprobe()](#5-attach_uretprobe)
- [6. USDT.enable_probe()](#6-usdtenable_probe)
- [7. attach_raw_tracepoint()](#7-attach_raw_tracepoint)
- [Debug Output](#debug-output)
- [1. trace_print()](#1-trace_print)
- [2. trace_fields()](#2-trace_fields)
......@@ -237,6 +239,35 @@ Examples in situ:
[search /examples](https://github.com/iovisor/bcc/search?q=bpf_usdt_readarg+path%3Aexamples&type=Code),
[search /tools](https://github.com/iovisor/bcc/search?q=bpf_usdt_readarg+path%3Atools&type=Code)
### 7. Raw Tracepoints
Syntax: RAW_TRACEPOINT_PROBE(*event*)
This is a macro that instruments the raw tracepoint defined by *event*.
The argument is a pointer to struct ```bpf_raw_tracepoint_args```, which is defined in [bpf.h](https://github.com/iovisor/bcc/blob/master/src/cc/compat/linux/bpf.h). The struct field ```args``` contains all parameters of the raw tracepoint where you can found at linux tree [include/trace/events](https://github.com/torvalds/linux/tree/master/include/trace/events)
directory.
For example:
```C
RAW_TRACEPOINT_PROBE(sched_switch)
{
// TP_PROTO(bool preempt, struct task_struct *prev, struct task_struct *next)
struct task_struct *prev = (struct task_struct *)ctx->args[1];
struct task_struct *next= (struct task_struct *)ctx->args[2];
s32 prev_tgid, next_tgid;
bpf_probe_read(&prev_tgid, sizeof(prev->tgid), &prev->tgid);
bpf_probe_read(&next_tgid, sizeof(next->tgid), &next->tgid);
bpf_trace_printk("%d -> %d\\n", prev_tgid, next_tgid);
}
```
This instruments the sched:sched_switch tracepoint, and prints the prev and next tgid.
Examples in situ:
[search /tools](https://github.com/iovisor/bcc/search?q=RAW_TRACEPOINT_PROBE+path%3Atools&type=Code)
## Data
### 1. bpf_probe_read()
......@@ -993,6 +1024,23 @@ Examples in situ:
[search /examples](https://github.com/iovisor/bcc/search?q=enable_probe+path%3Aexamples+language%3Apython&type=Code),
[search /tools](https://github.com/iovisor/bcc/search?q=enable_probe+path%3Atools+language%3Apython&type=Code)
### 7. attach_raw_tracepoint()
Syntax: ```BPF.attach_raw_tracepoint(tp="tracepoint", fn_name="name")```
Instruments the kernel raw tracepoint described by ```tracepoint``` (```event``` only, no ```category```), and when hit, runs the BPF function ```name()```.
This is an explicit way to instrument tracepoints. The ```RAW_TRACEPOINT_PROBE``` syntax, covered in the earlier raw tracepoints section, is an alternate method.
For example:
```Python
b.attach_raw_tracepoint("sched_swtich", "do_trace")
```
Examples in situ:
[search /tools](https://github.com/iovisor/bcc/search?q=attach_raw_tracepoint+path%3Atools+language%3Apython&type=Code)
## Debug Output
### 1. trace_print()
......
......@@ -683,6 +683,9 @@ int bpf_usdt_readarg_p(int argc, struct pt_regs *ctx, void *buf, u64 len) asm("l
#define TRACEPOINT_PROBE(category, event) \
int tracepoint__##category##__##event(struct tracepoint__##category##__##event *args)
#define RAW_TRACEPOINT_PROBE(event) \
int raw_tracepoint__##event(struct bpf_raw_tracepoint_args *ctx)
#define TP_DATA_LOC_READ_CONST(dst, field, length) \
do { \
unsigned short __offset = args->data_loc_##field & 0xFFFF; \
......
......@@ -1061,6 +1061,21 @@ int bpf_detach_tracepoint(const char *tp_category, const char *tp_name) {
return 0;
}
int bpf_attach_raw_tracepoint(int progfd, char *tp_name)
{
union bpf_attr attr;
int ret;
bzero(&attr, sizeof(attr));
attr.raw_tracepoint.name = ptr_to_u64(tp_name);
attr.raw_tracepoint.prog_fd = progfd;
ret = syscall(__NR_bpf, BPF_RAW_TRACEPOINT_OPEN, &attr, sizeof(attr));
if (ret < 0)
fprintf(stderr, "bpf_attach_raw_tracepoint (%s): %s\n", tp_name, strerror(errno));
return ret;
}
void * bpf_open_perf_buffer(perf_reader_raw_cb raw_cb,
perf_reader_lost_cb lost_cb, void *cb_cookie,
int pid, int cpu, int page_cnt) {
......
......@@ -81,6 +81,8 @@ int bpf_attach_tracepoint(int progfd, const char *tp_category,
const char *tp_name);
int bpf_detach_tracepoint(const char *tp_category, const char *tp_name);
int bpf_attach_raw_tracepoint(int progfd, char *tp_name);
void * bpf_open_perf_buffer(perf_reader_raw_cb raw_cb,
perf_reader_lost_cb lost_cb, void *cb_cookie,
int pid, int cpu, int page_cnt);
......
......@@ -135,6 +135,17 @@ class BPF(object):
TRACEPOINT = 5
XDP = 6
PERF_EVENT = 7
CGROUP_SKB = 8
CGROUP_SOCK = 9
LWT_IN = 10
LWT_OUT = 11
LWT_XMIT = 12
SOCK_OPS = 13
SK_SKB = 14
CGROUP_DEVICE = 15
SK_MSG = 16
RAW_TRACEPOINT = 17
CGROUP_SOCK_ADDR = 18
# from xdp_action uapi/linux/bpf.h
XDP_ABORTED = 0
......@@ -267,6 +278,7 @@ class BPF(object):
self.kprobe_fds = {}
self.uprobe_fds = {}
self.tracepoint_fds = {}
self.raw_tracepoint_fds = {}
self.perf_buffers = {}
self.open_perf_events = {}
self.tracefile = None
......@@ -310,7 +322,8 @@ class BPF(object):
for usdt_context in usdt_contexts:
usdt_context.attach_uprobes(self)
# If any "kprobe__" or "tracepoint__" prefixed functions were defined,
# If any "kprobe__" or "tracepoint__" or "raw_tracepoint__"
# prefixed functions were defined,
# they will be loaded and attached here.
self._trace_autoload()
......@@ -725,6 +738,52 @@ class BPF(object):
self.tracepoint_fds[tp] = fd
return self
def attach_raw_tracepoint(self, tp=b"", fn_name=b""):
"""attach_raw_tracepoint(self, tp=b"", fn_name=b"")
Run the bpf function denoted by fn_name every time the kernel tracepoint
specified by 'tp' is hit. The bpf function should be loaded as a
RAW_TRACEPOINT type. The fn_name is the kernel tracepoint name,
e.g., sched_switch, sys_enter_bind, etc.
Examples:
BPF(text).attach_raw_tracepoint(tp="sched_switch", fn_name="on_switch")
"""
tp = _assert_is_bytes(tp)
if tp in self.raw_tracepoint_fds:
raise Exception("Raw tracepoint %s has been attached" % tp)
fn_name = _assert_is_bytes(fn_name)
fn = self.load_func(fn_name, BPF.RAW_TRACEPOINT)
fd = lib.bpf_attach_raw_tracepoint(fn.fd, tp)
if fd < 0:
raise Exception("Failed to attach BPF to raw tracepoint")
self.raw_tracepoint_fds[tp] = fd;
return self
def detach_raw_tracepoint(self, tp=b""):
"""detach_raw_tracepoint(tp="")
Stop running the bpf function that is attached to the kernel tracepoint
specified by 'tp'.
Example: bpf.detach_raw_tracepoint("sched_switch")
"""
tp = _assert_is_bytes(tp)
if tp not in self.raw_tracepoint_fds:
raise Exception("Raw tracepoint %s is not attached" % tp)
os.close(self.raw_tracepoint_fds[tp])
del self.raw_tracepoint_fds[tp]
@staticmethod
def support_raw_tracepoint():
# kernel symbol "bpf_find_raw_tracepoint" indicates raw_tracepint support
if BPF.ksymname("bpf_find_raw_tracepoint") != -1:
return True
return False
def detach_tracepoint(self, tp=b""):
"""detach_tracepoint(tp="")
......@@ -954,6 +1013,10 @@ class BPF(object):
fn = self.load_func(func_name, BPF.TRACEPOINT)
tp = fn.name[len(b"tracepoint__"):].replace(b"__", b":")
self.attach_tracepoint(tp=tp, fn_name=fn.name)
elif func_name.startswith(b"raw_tracepoint__"):
fn = self.load_func(func_name, BPF.RAW_TRACEPOINT)
tp = fn.name[len(b"raw_tracepoint__"):]
self.attach_raw_tracepoint(tp=tp, fn_name=fn.name)
def trace_open(self, nonblocking=False):
"""trace_open(nonblocking=False)
......@@ -1154,6 +1217,8 @@ class BPF(object):
self.detach_uprobe_event(k)
for k, v in list(self.tracepoint_fds.items()):
self.detach_tracepoint(k)
for k, v in list(self.raw_tracepoint_fds.items()):
self.detach_raw_tracepoint(k)
# Clean up opened perf ring buffer and perf events
table_keys = list(self.tables.keys())
......
......@@ -100,6 +100,8 @@ lib.bpf_attach_tracepoint.restype = ct.c_int
lib.bpf_attach_tracepoint.argtypes = [ct.c_int, ct.c_char_p, ct.c_char_p]
lib.bpf_detach_tracepoint.restype = ct.c_int
lib.bpf_detach_tracepoint.argtypes = [ct.c_char_p, ct.c_char_p]
lib.bpf_attach_raw_tracepoint.restype = ct.c_int
lib.bpf_attach_raw_tracepoint.argtypes = [ct.c_int, ct.c_char_p]
lib.bpf_open_perf_buffer.restype = ct.c_void_p
lib.bpf_open_perf_buffer.argtypes = [_RAW_CB_TYPE, _LOST_CB_TYPE, ct.py_object, ct.c_int, ct.c_int, ct.c_int]
lib.bpf_open_perf_event.restype = ct.c_int
......
......@@ -95,7 +95,9 @@ static int trace_enqueue(u32 tgid, u32 pid)
start.update(&pid, &ts);
return 0;
}
"""
bpf_text_kprobe = """
int trace_wake_up_new_task(struct pt_regs *ctx, struct task_struct *p)
{
return trace_enqueue(p->tgid, p->pid);
......@@ -144,6 +146,76 @@ int trace_run(struct pt_regs *ctx, struct task_struct *prev)
}
"""
bpf_text_raw_tp = """
RAW_TRACEPOINT_PROBE(sched_wakeup)
{
// TP_PROTO(struct task_struct *p)
struct task_struct *p = (struct task_struct *)ctx->args[0];
u32 tgid, pid;
bpf_probe_read(&tgid, sizeof(tgid), &p->tgid);
bpf_probe_read(&pid, sizeof(pid), &p->pid);
return trace_enqueue(tgid, pid);
}
RAW_TRACEPOINT_PROBE(sched_wakeup_new)
{
// TP_PROTO(struct task_struct *p)
struct task_struct *p = (struct task_struct *)ctx->args[0];
u32 tgid, pid;
bpf_probe_read(&tgid, sizeof(tgid), &p->tgid);
bpf_probe_read(&pid, sizeof(pid), &p->pid);
return trace_enqueue(tgid, pid);
}
RAW_TRACEPOINT_PROBE(sched_switch)
{
// TP_PROTO(bool preempt, struct task_struct *prev, struct task_struct *next)
struct task_struct *prev = (struct task_struct *)ctx->args[1];
struct task_struct *next= (struct task_struct *)ctx->args[2];
u32 pid, tgid;
long state;
// ivcsw: treat like an enqueue event and store timestamp
bpf_probe_read(&state, sizeof(long), &prev->state);
if (state == TASK_RUNNING) {
bpf_probe_read(&tgid, sizeof(prev->tgid), &prev->tgid);
bpf_probe_read(&pid, sizeof(prev->pid), &prev->pid);
if (!(FILTER)) {
u64 ts = bpf_ktime_get_ns();
start.update(&pid, &ts);
}
}
bpf_probe_read(&tgid, sizeof(next->tgid), &next->tgid);
bpf_probe_read(&pid, sizeof(next->pid), &next->pid);
if (FILTER)
return 0;
u64 *tsp, delta;
// fetch timestamp and calculate delta
tsp = start.lookup(&pid);
if (tsp == 0) {
return 0; // missed enqueue
}
delta = bpf_ktime_get_ns() - *tsp;
FACTOR
// store as histogram
STORE
start.delete(&pid);
return 0;
}
"""
is_support_raw_tp = BPF.support_raw_tracepoint()
if is_support_raw_tp:
bpf_text += bpf_text_raw_tp
else:
bpf_text += bpf_text_kprobe
# code substitutions
if args.pid:
# pid from userspace point of view is thread group from kernel pov
......@@ -186,9 +258,10 @@ if debug or args.ebpf:
# load BPF program
b = BPF(text=bpf_text)
b.attach_kprobe(event="ttwu_do_wakeup", fn_name="trace_ttwu_do_wakeup")
b.attach_kprobe(event="wake_up_new_task", fn_name="trace_wake_up_new_task")
b.attach_kprobe(event="finish_task_switch", fn_name="trace_run")
if not is_support_raw_tp:
b.attach_kprobe(event="ttwu_do_wakeup", fn_name="trace_ttwu_do_wakeup")
b.attach_kprobe(event="wake_up_new_task", fn_name="trace_wake_up_new_task")
b.attach_kprobe(event="finish_task_switch", fn_name="trace_run")
print("Tracing run queue latency... Hit Ctrl-C to end.")
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment