Commit 592414e0 authored by Vicent Marti's avatar Vicent Marti

stacksnoop: Resolve stacks using a StackTrace table

Instead of manually walking the stack and printing each frame to the
trace log, we can use a BPF_STACK_TRACE table to store the stack traces,
and print to the trace log their IDs every time they are traced.

The output of the tool has been slightly modified: we no longer prefix
each line of the stack trace with the timestamp and the other headers,
since the whole stack trace is fetched in one go from the table and the
information would be highly redundant.

The man page and the examples have been updated to reflect the new
output.
parent 3a137db2
...@@ -9,10 +9,7 @@ kernel stack back trace for that call. This shows the ancestry of function ...@@ -9,10 +9,7 @@ kernel stack back trace for that call. This shows the ancestry of function
calls, and is a quick way to investigate low frequency kernel functions and calls, and is a quick way to investigate low frequency kernel functions and
their cause. For high frequency kernel functions, see stackcount. their cause. For high frequency kernel functions, see stackcount.
The stack depth is currently limited to 10 (+1 for the current instruction This tool only works on Linux 4.6+. Stack traces are obtained using the new BPF_STACK_TRACE` APIs.
pointer).
This currently only works on x86_64. Check for future versions.
.SH REQUIREMENTS .SH REQUIREMENTS
CONFIG_BPF and bcc. CONFIG_BPF and bcc.
.SH OPTIONS .SH OPTIONS
......
#!/usr/bin/python
#
# stacksnoop Trace a kernel function and print all kernel stack traces.
# For Linux, uses BCC, eBPF, and currently x86_64 only. Inline C.
#
# USAGE: stacksnoop [-h] [-p PID] [-s] [-v] function
#
# The current implementation uses an unrolled loop for x86_64, and was written
# as a proof of concept. This implementation should be replaced in the future
# with an appropriate bpf_ call, when available.
#
# The stack depth is limited to 10 (+1 for the current instruction pointer).
# This could be tunable in a future version.
#
# Copyright 2016 Netflix, Inc.
# Licensed under the Apache License, Version 2.0 (the "License")
#
# 12-Jan-2016 Brendan Gregg Created this.
from __future__ import print_function
from bcc import BPF
import argparse
# arguments
examples = """examples:
./stacksnoop ext4_sync_fs # print kernel stack traces for ext4_sync_fs
./stacksnoop -s ext4_sync_fs # ... also show symbol offsets
./stacksnoop -v ext4_sync_fs # ... show extra columns
./stacksnoop -p 185 ext4_sync_fs # ... only when PID 185 is on-CPU
"""
parser = argparse.ArgumentParser(
description="Trace and print kernel stack traces for a kernel function",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog=examples)
parser.add_argument("-p", "--pid",
help="trace this PID only")
parser.add_argument("-s", "--offset", action="store_true",
help="show address offsets")
parser.add_argument("-v", "--verbose", action="store_true",
help="print more fields")
parser.add_argument("function",
help="kernel function name")
args = parser.parse_args()
function = args.function
offset = args.offset
verbose = args.verbose
debug = 0
# define BPF program
bpf_text = """
#include <uapi/linux/ptrace.h>
static int print_frame(u64 *bp, int *depth) {
if (*bp) {
// The following stack walker is x86_64 specific
u64 ret = 0;
if (bpf_probe_read(&ret, sizeof(ret), (void *)(*bp+8)))
return 0;
if (ret < __START_KERNEL_map)
return 0;
bpf_trace_printk("r%d: %llx\\n", *depth, ret);
if (bpf_probe_read(bp, sizeof(*bp), (void *)*bp))
return 0;
*depth += 1;
return 1;
}
return 0;
}
void trace_stack(struct pt_regs *ctx) {
FILTER
u64 bp = 0;
int depth = 0;
bpf_trace_printk("\\n");
if (ctx->ip)
bpf_trace_printk("ip: %llx\\n", ctx->ip);
bp = ctx->bp;
// unrolled loop, 10 frames deep:
if (!print_frame(&bp, &depth)) return;
if (!print_frame(&bp, &depth)) return;
if (!print_frame(&bp, &depth)) return;
if (!print_frame(&bp, &depth)) return;
if (!print_frame(&bp, &depth)) return;
if (!print_frame(&bp, &depth)) return;
if (!print_frame(&bp, &depth)) return;
if (!print_frame(&bp, &depth)) return;
if (!print_frame(&bp, &depth)) return;
if (!print_frame(&bp, &depth)) return;
};
"""
if args.pid:
bpf_text = bpf_text.replace('FILTER',
('u32 pid; pid = bpf_get_current_pid_tgid(); ' +
'if (pid != %s) { return; }') % (args.pid))
else:
bpf_text = bpf_text.replace('FILTER', '')
if debug:
print(bpf_text)
# initialize BPF
b = BPF(text=bpf_text)
b.attach_kprobe(event=function, fn_name="trace_stack")
matched = b.num_open_kprobes()
if matched == 0:
print("Function \"%s\" not found. Exiting." % function)
exit()
# header
if verbose:
print("%-18s %-12s %-6s %-3s %s" % ("TIME(s)", "COMM", "PID", "CPU",
"STACK"))
else:
print("%-18s %s" % ("TIME(s)", "STACK"))
# format output
while 1:
(task, pid, cpu, flags, ts, msg) = b.trace_fields()
if msg != "":
(reg, addr) = msg.split(" ")
if offset:
ip = b.ksymaddr(int(addr, 16))
else:
ip = b.ksym(int(addr, 16))
msg = msg + " " + ip
if verbose:
print("%-18.9f %-12.12s %-6d %-3d %s" % (ts, task, pid, cpu, msg))
else:
print("%-18.9f %s" % (ts, msg))
...@@ -20,6 +20,7 @@ ...@@ -20,6 +20,7 @@
from __future__ import print_function from __future__ import print_function
from bcc import BPF from bcc import BPF
import argparse import argparse
import re
# arguments # arguments
examples = """examples: examples = """examples:
...@@ -50,45 +51,14 @@ debug = 0 ...@@ -50,45 +51,14 @@ debug = 0
bpf_text = """ bpf_text = """
#include <uapi/linux/ptrace.h> #include <uapi/linux/ptrace.h>
static int print_frame(u64 *bp, int *depth) { BPF_STACK_TRACE(stack_traces, 128)
if (*bp) {
// The following stack walker is x86_64 specific
u64 ret = 0;
if (bpf_probe_read(&ret, sizeof(ret), (void *)(*bp+8)))
return 0;
if (ret < __START_KERNEL_map)
return 0;
bpf_trace_printk("r%d: %llx\\n", *depth, ret);
if (bpf_probe_read(bp, sizeof(*bp), (void *)*bp))
return 0;
*depth += 1;
return 1;
}
return 0;
}
void trace_stack(struct pt_regs *ctx) { void trace_stack(struct pt_regs *ctx) {
FILTER FILTER
u64 bp = 0; int stack_id = stack_traces.get_stackid(ctx, BPF_F_REUSE_STACKID);
int depth = 0; if (stack_id >= 0)
bpf_trace_printk("stack_id=%d\\n", stack_id);
bpf_trace_printk("\\n"); }
if (ctx->ip)
bpf_trace_printk("ip: %llx\\n", ctx->ip);
bp = ctx->bp;
// unrolled loop, 10 frames deep:
if (!print_frame(&bp, &depth)) return;
if (!print_frame(&bp, &depth)) return;
if (!print_frame(&bp, &depth)) return;
if (!print_frame(&bp, &depth)) return;
if (!print_frame(&bp, &depth)) return;
if (!print_frame(&bp, &depth)) return;
if (!print_frame(&bp, &depth)) return;
if (!print_frame(&bp, &depth)) return;
if (!print_frame(&bp, &depth)) return;
if (!print_frame(&bp, &depth)) return;
};
""" """
if args.pid: if args.pid:
bpf_text = bpf_text.replace('FILTER', bpf_text = bpf_text.replace('FILTER',
...@@ -107,24 +77,28 @@ if matched == 0: ...@@ -107,24 +77,28 @@ if matched == 0:
print("Function \"%s\" not found. Exiting." % function) print("Function \"%s\" not found. Exiting." % function)
exit() exit()
stack_traces = b.get_table("stack_traces")
msg_regexp = re.compile("stack_id=(\d+)")
# header # header
if verbose: if verbose:
print("%-18s %-12s %-6s %-3s %s" % ("TIME(s)", "COMM", "PID", "CPU", print("%-18s %-12s %-6s %-3s %s" % ("TIME(s)", "COMM", "PID", "CPU", "SYSCALL"))
"STACK"))
else: else:
print("%-18s %s" % ("TIME(s)", "STACK")) print("%-18s %s" % ("TIME(s)", "SYSCALL"))
# format output # format output
while 1: while 1:
(task, pid, cpu, flags, ts, msg) = b.trace_fields() (task, pid, cpu, flags, ts, msg) = b.trace_fields()
if msg != "": m = msg_regexp.match(msg)
(reg, addr) = msg.split(" ") if m:
if offset: if verbose:
ip = b.ksymaddr(int(addr, 16)) print("%-18.9f %-12.12s %-6d %-3d %s" % (ts, task, pid, cpu, function))
else: else:
ip = b.ksym(int(addr, 16)) print("%-18.9f %s" % (ts, function))
msg = msg + " " + ip
if verbose: stack_id = int(m.group(1))
print("%-18.9f %-12.12s %-6d %-3d %s" % (ts, task, pid, cpu, msg)) for addr in stack_traces.walk(stack_id):
else: sym = b.ksymaddr(addr) if offset else b.ksym(addr)
print("%-18.9f %s" % (ts, msg)) print("\t%016x %s" % (addr, sym))
print()
...@@ -3,27 +3,20 @@ Demonstrations of stacksnoop, the Linux eBPF/bcc version. ...@@ -3,27 +3,20 @@ Demonstrations of stacksnoop, the Linux eBPF/bcc version.
This program traces the given kernel function and prints the kernel stack trace This program traces the given kernel function and prints the kernel stack trace
for every call. This tool is useful for studying low frequency kernel functions, for every call. This tool is useful for studying low frequency kernel functions,
to see how they were invoked. For example, tracing the ext4_sync_fs() call: to see how they were invoked. For example, tracing the submit_bio() call:
# ./stacksnoop ext4_sync_fs # ./stacksnoop submit_bio
TIME(s) STACK TIME(s) SYSCALL
42005194.132250004 3592.838736000 submit_bio
42005194.132253997 ip: ffffffff81280461 ext4_sync_fs ffffffff813bd961 submit_bio
42005194.132256001 r0: ffffffff811ed7f9 iterate_supers ffffffff81257c12 submit_bh
42005194.132257000 r1: ffffffff8121ba25 sys_sync ffffffff81301948 jbd2_journal_commit_transaction
42005194.132257000 r2: ffffffff81775cb6 entry_SYSCALL_64_fastpath ffffffff8130653a kjournald2
42005194.132275000 ffffffff810a2df8 kthread
42005194.132275999 ip: ffffffff81280461 ext4_sync_fs ffffffff8183a122 ret_from_fork
42005194.132275999 r0: ffffffff811ed7f9 iterate_supers
42005194.132276997 r1: ffffffff8121ba35 sys_sync This shows that submit_bio() was called by submit_bh(), which was called
42005194.132276997 r2: ffffffff81775cb6 entry_SYSCALL_64_fastpath by jbd2_journal_commit_transaction(), and so on.
This shows that ext4_sync_fs() was called by iterate_supers(), which was called
by sys_sync(), and so on. (It tells me that this was a syscall invoked sync,
so an application has requested it.)
The "ip" refers to the instruction pointer, and the "r#" refers to the return
address for each stack frame.
For high frequency functions, see stackcount, which summarizes in-kernel for For high frequency functions, see stackcount, which summarizes in-kernel for
efficiency. If you don't know if your function is low or high frequency, try efficiency. If you don't know if your function is low or high frequency, try
...@@ -32,20 +25,17 @@ funccount. ...@@ -32,20 +25,17 @@ funccount.
The -v option includes more fields, including the on-CPU process (COMM and PID): The -v option includes more fields, including the on-CPU process (COMM and PID):
# ./stacksnoop -v ext4_sync_fs # ./stacksnoop -v submit_bio
TIME(s) COMM PID CPU STACK TIME(s) COMM PID CPU SYSCALL
42005557.056332998 sync 22352 1 3734.855027000 jbd2/dm-0-8 313 0 submit_bio
42005557.056336999 sync 22352 1 ip: ffffffff81280461 ext4_sync_fs ffffffff813bd961 submit_bio
42005557.056339003 sync 22352 1 r0: ffffffff811ed7f9 iterate_supers ffffffff81257c12 submit_bh
42005557.056340002 sync 22352 1 r1: ffffffff8121ba25 sys_sync ffffffff81301948 jbd2_journal_commit_transaction
42005557.056340002 sync 22352 1 r2: ffffffff81775cb6 entry_SYSCALL_64_fastpath ffffffff8130653a kjournald2
42005557.056358002 sync 22352 1 ffffffff810a2df8 kthread
42005557.056358002 sync 22352 1 ip: ffffffff81280461 ext4_sync_fs ffffffff8183a122 ret_from_fork
42005557.056359001 sync 22352 1 r0: ffffffff811ed7f9 iterate_supers
42005557.056359999 sync 22352 1 r1: ffffffff8121ba35 sys_sync This identifies the application issuing the sync syscall: the jbd2 process
42005557.056359999 sync 22352 1 r2: ffffffff81775cb6 entry_SYSCALL_64_fastpath
This identifies the application issuing the sync syscall: the sync(1) command
(COMM column). (COMM column).
...@@ -53,29 +43,32 @@ Here's another example, showing the path to second_overflow() and on-CPU ...@@ -53,29 +43,32 @@ Here's another example, showing the path to second_overflow() and on-CPU
process: process:
# ./stacksnoop -v second_overflow # ./stacksnoop -v second_overflow
TIME(s) COMM PID CPU STACK TIME(s) COMM PID CPU SYSCALL
42005696.529449999 <idle> 0 0 3837.526433000 <idle> 0 1 second_overflow
42005696.529457003 <idle> 0 0 ip: ffffffff810e5701 second_overflow ffffffff810fac41 second_overflow
42005696.529459000 <idle> 0 0 r0: ffffffff810ecb1b tick_do_update_jiffies64 ffffffff81102320 tick_do_update_jiffies64
42005696.529459998 <idle> 0 0 r1: ffffffff810ed6e0 tick_irq_enter ffffffff81102bf0 tick_irq_enter
42005696.529459998 <idle> 0 0 r2: ffffffff8107a195 irq_enter ffffffff810882ac irq_enter
42005696.529460996 <idle> 0 0 r3: ffffffff8146bb6f xen_evtchn_do_upcall ffffffff8183c7df smp_apic_timer_interrupt
42005696.529460996 <idle> 0 0 r4: ffffffff81777a2e xen_do_hypervisor_callback ffffffff8183aae2 apic_timer_interrupt
42005697.616295002 <idle> 0 0 ffffffff81038f9e default_idle
42005697.616301000 <idle> 0 0 ip: ffffffff810e5701 second_overflow ffffffff8103979f arch_cpu_idle
42005697.616302997 <idle> 0 0 r0: ffffffff810ecb1b tick_do_update_jiffies64 ffffffff810c69da default_idle_call
42005697.616304003 <idle> 0 0 r1: ffffffff810ed6e0 tick_irq_enter ffffffff810c6cd7 cpu_startup_entry
42005697.616304003 <idle> 0 0 r2: ffffffff8107a195 irq_enter ffffffff81051cbe start_secondary
42005697.616305001 <idle> 0 0 r3: ffffffff8146bb6f xen_evtchn_do_upcall
42005697.616305001 <idle> 0 0 r4: ffffffff81777a2e xen_do_hypervisor_callback 3838.526953000 <idle> 0 1 second_overflow
42005698.556240998 <idle> 0 1 ffffffff810fac41 second_overflow
42005698.556247003 <idle> 0 1 ip: ffffffff810e5701 second_overflow ffffffff81102320 tick_do_update_jiffies64
42005698.556249000 <idle> 0 1 r0: ffffffff810ecb1b tick_do_update_jiffies64 ffffffff81102bf0 tick_irq_enter
42005698.556249000 <idle> 0 1 r1: ffffffff810ed6e0 tick_irq_enter ffffffff810882ac irq_enter
42005698.556249999 <idle> 0 1 r2: ffffffff8107a195 irq_enter ffffffff8183c7df smp_apic_timer_interrupt
42005698.556249999 <idle> 0 1 r3: ffffffff8146bb6f xen_evtchn_do_upcall ffffffff8183aae2 apic_timer_interrupt
42005698.556250997 <idle> 0 1 r4: ffffffff81777a2e xen_do_hypervisor_callback ffffffff81038f9e default_idle
[...] ffffffff8103979f arch_cpu_idle
ffffffff810c69da default_idle_call
ffffffff810c6cd7 cpu_startup_entry
ffffffff81051cbe start_secondary
This fires every second (see TIME(s)), and is from tick_do_update_jiffies64(). This fires every second (see TIME(s)), and is from tick_do_update_jiffies64().
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment