Commit 266d6f69 authored by Mark Drayton's avatar Mark Drayton

Use BPF_PERF_OUTPUT for fileslower and stacksnoop

parent 40cf791c
...@@ -31,7 +31,9 @@ ...@@ -31,7 +31,9 @@
from __future__ import print_function from __future__ import print_function
from bcc import BPF from bcc import BPF
import argparse import argparse
import ctypes as ct
import signal import signal
import time
# arguments # arguments
examples = """examples: examples = """examples:
...@@ -60,17 +62,31 @@ def signal_ignore(signal, frame): ...@@ -60,17 +62,31 @@ def signal_ignore(signal, frame):
bpf_text = """ bpf_text = """
#include <uapi/linux/ptrace.h> #include <uapi/linux/ptrace.h>
#include <linux/fs.h> #include <linux/fs.h>
#include <linux/sched.h>
#define TRACE_READ 0 enum trace_mode {
#define TRACE_WRITE 1 MODE_READ,
MODE_WRITE
};
struct val_t { struct val_t {
u32 sz; u32 sz;
u64 ts; u64 ts;
char name[DNAME_INLINE_LEN]; char name[DNAME_INLINE_LEN];
char comm[TASK_COMM_LEN];
};
struct data_t {
enum trace_mode mode;
u32 pid;
u32 sz;
u64 delta_us;
char name[DNAME_INLINE_LEN];
char comm[TASK_COMM_LEN];
}; };
BPF_HASH(entryinfo, pid_t, struct val_t); BPF_HASH(entryinfo, pid_t, struct val_t);
BPF_PERF_OUTPUT(events);
// store timestamp and size on entry // store timestamp and size on entry
static int trace_rw_entry(struct pt_regs *ctx, struct file *file, static int trace_rw_entry(struct pt_regs *ctx, struct file *file,
...@@ -92,6 +108,7 @@ static int trace_rw_entry(struct pt_regs *ctx, struct file *file, ...@@ -92,6 +108,7 @@ static int trace_rw_entry(struct pt_regs *ctx, struct file *file,
val.sz = count; val.sz = count;
val.ts = bpf_ktime_get_ns(); val.ts = bpf_ktime_get_ns();
__builtin_memcpy(&val.name, de->d_iname, sizeof(val.name)); __builtin_memcpy(&val.name, de->d_iname, sizeof(val.name));
bpf_get_current_comm(&val.comm, sizeof(val.comm));
entryinfo.update(&pid, &val); entryinfo.update(&pid, &val);
return 0; return 0;
...@@ -131,23 +148,26 @@ static int trace_rw_return(struct pt_regs *ctx, int type) ...@@ -131,23 +148,26 @@ static int trace_rw_return(struct pt_regs *ctx, int type)
if (delta_us < MIN_US) if (delta_us < MIN_US)
return 0; return 0;
if (type == TRACE_READ) { struct data_t data = {};
bpf_trace_printk("R %d %d %s\\n", valp->sz, delta_us, valp->name); data.mode = type;
} else { data.pid = pid;
bpf_trace_printk("W %d %d %s\\n", valp->sz, delta_us, valp->name); data.sz = valp->sz;
} data.delta_us = delta_us;
bpf_probe_read(&data.name, sizeof(data.name), valp->name);
bpf_probe_read(&data.comm, sizeof(data.comm), valp->comm);
events.perf_submit(ctx, &data, sizeof(data));
return 0; return 0;
} }
int trace_read_return(struct pt_regs *ctx) int trace_read_return(struct pt_regs *ctx)
{ {
return trace_rw_return(ctx, TRACE_READ); return trace_rw_return(ctx, MODE_READ);
} }
int trace_write_return(struct pt_regs *ctx) int trace_write_return(struct pt_regs *ctx)
{ {
return trace_rw_return(ctx, TRACE_WRITE); return trace_rw_return(ctx, MODE_WRITE);
} }
""" """
...@@ -171,26 +191,40 @@ b.attach_kprobe(event="__vfs_write", fn_name="trace_write_entry") ...@@ -171,26 +191,40 @@ b.attach_kprobe(event="__vfs_write", fn_name="trace_write_entry")
b.attach_kretprobe(event="__vfs_read", fn_name="trace_read_return") b.attach_kretprobe(event="__vfs_read", fn_name="trace_read_return")
b.attach_kretprobe(event="__vfs_write", fn_name="trace_write_return") b.attach_kretprobe(event="__vfs_write", fn_name="trace_write_return")
TASK_COMM_LEN = 16 # linux/sched.h
DNAME_INLINE_LEN = 32 # linux/dcache.h
class Data(ct.Structure):
_fields_ = [
("mode", ct.c_int),
("pid", ct.c_uint),
("sz", ct.c_uint),
("delta_us", ct.c_ulonglong),
("name", ct.c_char * DNAME_INLINE_LEN),
("comm", ct.c_char * TASK_COMM_LEN),
]
mode_s = {
0: 'R',
1: 'W',
}
# header # header
print("Tracing sync read/writes slower than %d ms" % min_ms) print("Tracing sync read/writes slower than %d ms" % min_ms)
print("%-8s %-14s %-6s %1s %-7s %7s %s" % ("TIME(s)", "COMM", "PID", "D", print("%-8s %-14s %-6s %1s %-7s %7s %s" % ("TIME(s)", "COMM", "PID", "D",
"BYTES", "LAT(ms)", "FILENAME")) "BYTES", "LAT(ms)", "FILENAME"))
start_ts = 0 start_ts = time.time()
# format output def print_event(cpu, data, size):
while 1: event = ct.cast(data, ct.POINTER(Data)).contents
(task, pid, cpu, flags, ts, msg) = b.trace_fields()
args = msg.split(" ", 3) ms = float(event.delta_us) / 1000
(type_s, sz, delta_us_s) = (args[0], args[1], args[2])
try:
filename = args[3]
except:
filename = "?"
if start_ts == 0:
start_ts = ts
ms = float(int(delta_us_s, 10)) / 1000
print("%-8.3f %-14.14s %-6s %1s %-7s %7.2f %s" % ( print("%-8.3f %-14.14s %-6s %1s %-7s %7.2f %s" % (
ts - start_ts, task, pid, type_s, sz, ms, filename)) time.time() - start_ts, event.comm, event.pid, mode_s[event.mode],
event.sz, ms, event.name))
b["events"].open_perf_buffer(print_event)
while 1:
b.kprobe_poll()
...@@ -17,17 +17,30 @@ limitations under the License. ...@@ -17,17 +17,30 @@ limitations under the License.
local program = [[ local program = [[
#include <uapi/linux/ptrace.h> #include <uapi/linux/ptrace.h>
#include <linux/sched.h>
struct data_t {
u64 stack_id;
u32 pid;
char comm[TASK_COMM_LEN];
};
BPF_STACK_TRACE(stack_traces, 128) BPF_STACK_TRACE(stack_traces, 128)
BPF_PERF_OUTPUT(events);
void trace_stack(struct pt_regs *ctx) { void trace_stack(struct pt_regs *ctx) {
u32 pid = bpf_get_current_pid_tgid();
FILTER FILTER
int stack_id = stack_traces.get_stackid(ctx, BPF_F_REUSE_STACKID); struct data_t data = {};
if (stack_id >= 0) data.stack_id = stack_traces.get_stackid(ctx, BPF_F_REUSE_STACKID),
bpf_trace_printk("stack_id=%d\n", stack_id); data.pid = pid;
bpf_get_current_comm(&data.comm, sizeof(data.comm));
events.perf_submit(ctx, &data, sizeof(data));
} }
]] ]]
local ffi = require("ffi")
return function(BPF, utils) return function(BPF, utils)
local parser = utils.argparse("stacksnoop", local parser = utils.argparse("stacksnoop",
"Trace and print kernel stack traces for a kernel function") "Trace and print kernel stack traces for a kernel function")
...@@ -41,11 +54,7 @@ return function(BPF, utils) ...@@ -41,11 +54,7 @@ return function(BPF, utils)
local filter = "" local filter = ""
if args.pid then if args.pid then
filter = [[ filter = "if (pid != %d) { return; }" % args.pid
u32 pid;
pid = bpf_get_current_pid_tgid();
if (pid != %d) { return; }
]] % args.pid
end end
local text = program:gsub("FILTER", filter) local text = program:gsub("FILTER", filter)
...@@ -53,40 +62,50 @@ return function(BPF, utils) ...@@ -53,40 +62,50 @@ return function(BPF, utils)
bpf:attach_kprobe{event=args.fn, fn_name="trace_stack"} bpf:attach_kprobe{event=args.fn, fn_name="trace_stack"}
if BPF.num_open_kprobes() == 0 then if BPF.num_open_kprobes() == 0 then
print("Function \"%s\" not found. Exiting." % args.fn) print("Function \"%s\" not found. Exiting." % {args.fn})
return return
end end
if args.verbose then if args.verbose then
print("%-18s %-12s %-6s %-3s %s" % print("%-18s %-12s %-6s %-3s %s" %
{"TIME(s)", "COMM", "PID", "CPU", "SYSCALL"}) {"TIME(s)", "COMM", "PID", "CPU", "FUNCTION"})
else else
print("%-18s %s" % {"TIME(s)", "SYSCALL"}) print("%-18s %s" % {"TIME(s)", "FUNCTION"})
end end
local stack_traces = bpf:get_table("stack_traces") local stack_traces = bpf:get_table("stack_traces")
local pipe = bpf:pipe() local start_ts = utils.posix.time_ns()
while true do local function print_event(cpu, event)
local task, pid, cpu, flags, ts, msg = pipe:trace_fields() local ts = (utils.posix.time_ns() - start_ts) / 1e9
local stack_id = string.match(msg, "stack_id=(%d+)")
if stack_id then if args.verbose then
if args.verbose then print("%-18.9f %-12.12s %-6d %-3d %s" %
print("%-18.9f %-12.12s %-6d %-3d %s" % {ts, task, pid, cpu, args.fn}) {ts, ffi.string(event.comm), event.pid, cpu, args.fn})
else else
print("%-18.9f %s" % {ts, args.fn}) print("%-18.9f %s" % {ts, args.fn})
end end
for addr in stack_traces:walk(tonumber(stack_id)) do for addr in stack_traces:walk(tonumber(event.stack_id)) do
local sym, offset = ksym:resolve(addr) local sym, offset = ksym:resolve(addr)
if args.offset then if args.offset then
print("\t%-16p %s+0x%x" % {addr, sym, tonumber(offset)}) print("\t%-16p %s+0x%x" % {addr, sym, tonumber(offset)})
else else
print("\t%-16p %s" % {addr, sym}) print("\t%-16p %s" % {addr, sym})
end
end end
end end
print() print()
end end
local TASK_COMM_LEN = 16 -- linux/sched.h
bpf:get_table("events"):open_perf_buffer(print_event, [[
struct {
uint64_t stack_id;
uint32_t pid;
char comm[%d];
}
]] % {TASK_COMM_LEN})
bpf:kprobe_poll_loop()
end end
...@@ -20,7 +20,8 @@ ...@@ -20,7 +20,8 @@
from __future__ import print_function from __future__ import print_function
from bcc import BPF from bcc import BPF
import argparse import argparse
import re import ctypes as ct
import time
# arguments # arguments
examples = """examples: examples = """examples:
...@@ -50,20 +51,30 @@ debug = 0 ...@@ -50,20 +51,30 @@ debug = 0
# define BPF program # define BPF program
bpf_text = """ bpf_text = """
#include <uapi/linux/ptrace.h> #include <uapi/linux/ptrace.h>
#include <linux/sched.h>
struct data_t {
u64 stack_id;
u32 pid;
char comm[TASK_COMM_LEN];
};
BPF_STACK_TRACE(stack_traces, 128) BPF_STACK_TRACE(stack_traces, 128)
BPF_PERF_OUTPUT(events);
void trace_stack(struct pt_regs *ctx) { void trace_stack(struct pt_regs *ctx) {
u32 pid = bpf_get_current_pid_tgid();
FILTER FILTER
int stack_id = stack_traces.get_stackid(ctx, BPF_F_REUSE_STACKID); struct data_t data = {};
if (stack_id >= 0) data.stack_id = stack_traces.get_stackid(ctx, BPF_F_REUSE_STACKID),
bpf_trace_printk("stack_id=%d\\n", stack_id); data.pid = pid;
bpf_get_current_comm(&data.comm, sizeof(data.comm));
events.perf_submit(ctx, &data, sizeof(data));
} }
""" """
if args.pid: if args.pid:
bpf_text = bpf_text.replace('FILTER', bpf_text = bpf_text.replace('FILTER',
('u32 pid; pid = bpf_get_current_pid_tgid(); ' + 'if (pid != %s) { return; }' % args.pid)
'if (pid != %s) { return; }') % (args.pid))
else: else:
bpf_text = bpf_text.replace('FILTER', '') bpf_text = bpf_text.replace('FILTER', '')
if debug: if debug:
...@@ -72,33 +83,48 @@ if debug: ...@@ -72,33 +83,48 @@ if debug:
# initialize BPF # initialize BPF
b = BPF(text=bpf_text) b = BPF(text=bpf_text)
b.attach_kprobe(event=function, fn_name="trace_stack") b.attach_kprobe(event=function, fn_name="trace_stack")
TASK_COMM_LEN = 16 # linux/sched.h
class Data(ct.Structure):
_fields_ = [
("stack_id", ct.c_ulonglong),
("pid", ct.c_uint),
("comm", ct.c_char * TASK_COMM_LEN),
]
matched = b.num_open_kprobes() matched = b.num_open_kprobes()
if matched == 0: if matched == 0:
print("Function \"%s\" not found. Exiting." % function) print("Function \"%s\" not found. Exiting." % function)
exit() exit()
stack_traces = b.get_table("stack_traces") stack_traces = b.get_table("stack_traces")
msg_regexp = re.compile("stack_id=(\d+)") start_ts = time.time()
# header # header
if verbose: if verbose:
print("%-18s %-12s %-6s %-3s %s" % ("TIME(s)", "COMM", "PID", "CPU", "SYSCALL")) print("%-18s %-12s %-6s %-3s %s" %
("TIME(s)", "COMM", "PID", "CPU", "FUNCTION"))
else: else:
print("%-18s %s" % ("TIME(s)", "SYSCALL")) print("%-18s %s" % ("TIME(s)", "FUNCTION"))
def print_event(cpu, data, size):
event = ct.cast(data, ct.POINTER(Data)).contents
ts = time.time() - start_ts
if verbose:
print("%-18.9f %-12.12s %-6d %-3d %s" % (ts, event.comm, event.pid, cpu,
function))
else:
print("%-18.9f %s" % (ts, function))
for addr in stack_traces.walk(event.stack_id):
sym = b.ksymaddr(addr) if offset else b.ksym(addr)
print("\t%016x %s" % (addr, sym))
print()
# format output b["events"].open_perf_buffer(print_event)
while 1: while 1:
(task, pid, cpu, flags, ts, msg) = b.trace_fields() b.kprobe_poll()
m = msg_regexp.match(msg)
if m:
if verbose:
print("%-18.9f %-12.12s %-6d %-3d %s" % (ts, task, pid, cpu, function))
else:
print("%-18.9f %s" % (ts, function))
stack_id = int(m.group(1))
for addr in stack_traces.walk(stack_id):
sym = b.ksymaddr(addr) if offset else b.ksym(addr)
print("\t%016x %s" % (addr, sym))
print()
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment