Commit 592414e0 authored by Vicent Marti's avatar Vicent Marti

stacksnoop: Resolve stacks using a StackTrace table

Instead of manually walking the stack and printing each frame to the
trace log, we can use a BPF_STACK_TRACE table to store the stack traces,
and print to the trace log their IDs every time they are traced.

The output of the tool has been slightly modified: we no longer prefix
each line of the stack trace with the timestamp and the other headers,
since the whole stack trace is fetched in one go from the table and the
information would be highly redundant.

The man page and the examples have been updated to reflect the new
output.
parent 3a137db2
......@@ -9,10 +9,7 @@ kernel stack back trace for that call. This shows the ancestry of function
calls, and is a quick way to investigate low frequency kernel functions and
their cause. For high frequency kernel functions, see stackcount.
The stack depth is currently limited to 10 (+1 for the current instruction
pointer).
This currently only works on x86_64. Check for future versions.
This tool only works on Linux 4.6+. Stack traces are obtained using the new BPF_STACK_TRACE` APIs.
.SH REQUIREMENTS
CONFIG_BPF and bcc.
.SH OPTIONS
......
#!/usr/bin/python
#
# stacksnoop Trace a kernel function and print all kernel stack traces.
# For Linux, uses BCC, eBPF, and currently x86_64 only. Inline C.
#
# USAGE: stacksnoop [-h] [-p PID] [-s] [-v] function
#
# The current implementation uses an unrolled loop for x86_64, and was written
# as a proof of concept. This implementation should be replaced in the future
# with an appropriate bpf_ call, when available.
#
# The stack depth is limited to 10 (+1 for the current instruction pointer).
# This could be tunable in a future version.
#
# Copyright 2016 Netflix, Inc.
# Licensed under the Apache License, Version 2.0 (the "License")
#
# 12-Jan-2016 Brendan Gregg Created this.
from __future__ import print_function
from bcc import BPF
import argparse
# arguments
examples = """examples:
./stacksnoop ext4_sync_fs # print kernel stack traces for ext4_sync_fs
./stacksnoop -s ext4_sync_fs # ... also show symbol offsets
./stacksnoop -v ext4_sync_fs # ... show extra columns
./stacksnoop -p 185 ext4_sync_fs # ... only when PID 185 is on-CPU
"""
parser = argparse.ArgumentParser(
description="Trace and print kernel stack traces for a kernel function",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog=examples)
parser.add_argument("-p", "--pid",
help="trace this PID only")
parser.add_argument("-s", "--offset", action="store_true",
help="show address offsets")
parser.add_argument("-v", "--verbose", action="store_true",
help="print more fields")
parser.add_argument("function",
help="kernel function name")
args = parser.parse_args()
function = args.function
offset = args.offset
verbose = args.verbose
debug = 0
# define BPF program
bpf_text = """
#include <uapi/linux/ptrace.h>
static int print_frame(u64 *bp, int *depth) {
if (*bp) {
// The following stack walker is x86_64 specific
u64 ret = 0;
if (bpf_probe_read(&ret, sizeof(ret), (void *)(*bp+8)))
return 0;
if (ret < __START_KERNEL_map)
return 0;
bpf_trace_printk("r%d: %llx\\n", *depth, ret);
if (bpf_probe_read(bp, sizeof(*bp), (void *)*bp))
return 0;
*depth += 1;
return 1;
}
return 0;
}
void trace_stack(struct pt_regs *ctx) {
FILTER
u64 bp = 0;
int depth = 0;
bpf_trace_printk("\\n");
if (ctx->ip)
bpf_trace_printk("ip: %llx\\n", ctx->ip);
bp = ctx->bp;
// unrolled loop, 10 frames deep:
if (!print_frame(&bp, &depth)) return;
if (!print_frame(&bp, &depth)) return;
if (!print_frame(&bp, &depth)) return;
if (!print_frame(&bp, &depth)) return;
if (!print_frame(&bp, &depth)) return;
if (!print_frame(&bp, &depth)) return;
if (!print_frame(&bp, &depth)) return;
if (!print_frame(&bp, &depth)) return;
if (!print_frame(&bp, &depth)) return;
if (!print_frame(&bp, &depth)) return;
};
"""
if args.pid:
bpf_text = bpf_text.replace('FILTER',
('u32 pid; pid = bpf_get_current_pid_tgid(); ' +
'if (pid != %s) { return; }') % (args.pid))
else:
bpf_text = bpf_text.replace('FILTER', '')
if debug:
print(bpf_text)
# initialize BPF
b = BPF(text=bpf_text)
b.attach_kprobe(event=function, fn_name="trace_stack")
matched = b.num_open_kprobes()
if matched == 0:
print("Function \"%s\" not found. Exiting." % function)
exit()
# header
if verbose:
print("%-18s %-12s %-6s %-3s %s" % ("TIME(s)", "COMM", "PID", "CPU",
"STACK"))
else:
print("%-18s %s" % ("TIME(s)", "STACK"))
# format output
while 1:
(task, pid, cpu, flags, ts, msg) = b.trace_fields()
if msg != "":
(reg, addr) = msg.split(" ")
if offset:
ip = b.ksymaddr(int(addr, 16))
else:
ip = b.ksym(int(addr, 16))
msg = msg + " " + ip
if verbose:
print("%-18.9f %-12.12s %-6d %-3d %s" % (ts, task, pid, cpu, msg))
else:
print("%-18.9f %s" % (ts, msg))
......@@ -20,6 +20,7 @@
from __future__ import print_function
from bcc import BPF
import argparse
import re
# arguments
examples = """examples:
......@@ -50,45 +51,14 @@ debug = 0
bpf_text = """
#include <uapi/linux/ptrace.h>
static int print_frame(u64 *bp, int *depth) {
if (*bp) {
// The following stack walker is x86_64 specific
u64 ret = 0;
if (bpf_probe_read(&ret, sizeof(ret), (void *)(*bp+8)))
return 0;
if (ret < __START_KERNEL_map)
return 0;
bpf_trace_printk("r%d: %llx\\n", *depth, ret);
if (bpf_probe_read(bp, sizeof(*bp), (void *)*bp))
return 0;
*depth += 1;
return 1;
}
return 0;
}
BPF_STACK_TRACE(stack_traces, 128)
void trace_stack(struct pt_regs *ctx) {
FILTER
u64 bp = 0;
int depth = 0;
bpf_trace_printk("\\n");
if (ctx->ip)
bpf_trace_printk("ip: %llx\\n", ctx->ip);
bp = ctx->bp;
// unrolled loop, 10 frames deep:
if (!print_frame(&bp, &depth)) return;
if (!print_frame(&bp, &depth)) return;
if (!print_frame(&bp, &depth)) return;
if (!print_frame(&bp, &depth)) return;
if (!print_frame(&bp, &depth)) return;
if (!print_frame(&bp, &depth)) return;
if (!print_frame(&bp, &depth)) return;
if (!print_frame(&bp, &depth)) return;
if (!print_frame(&bp, &depth)) return;
if (!print_frame(&bp, &depth)) return;
};
int stack_id = stack_traces.get_stackid(ctx, BPF_F_REUSE_STACKID);
if (stack_id >= 0)
bpf_trace_printk("stack_id=%d\\n", stack_id);
}
"""
if args.pid:
bpf_text = bpf_text.replace('FILTER',
......@@ -107,24 +77,28 @@ if matched == 0:
print("Function \"%s\" not found. Exiting." % function)
exit()
stack_traces = b.get_table("stack_traces")
msg_regexp = re.compile("stack_id=(\d+)")
# header
if verbose:
print("%-18s %-12s %-6s %-3s %s" % ("TIME(s)", "COMM", "PID", "CPU",
"STACK"))
print("%-18s %-12s %-6s %-3s %s" % ("TIME(s)", "COMM", "PID", "CPU", "SYSCALL"))
else:
print("%-18s %s" % ("TIME(s)", "STACK"))
print("%-18s %s" % ("TIME(s)", "SYSCALL"))
# format output
while 1:
(task, pid, cpu, flags, ts, msg) = b.trace_fields()
if msg != "":
(reg, addr) = msg.split(" ")
if offset:
ip = b.ksymaddr(int(addr, 16))
m = msg_regexp.match(msg)
if m:
if verbose:
print("%-18.9f %-12.12s %-6d %-3d %s" % (ts, task, pid, cpu, function))
else:
ip = b.ksym(int(addr, 16))
msg = msg + " " + ip
if verbose:
print("%-18.9f %-12.12s %-6d %-3d %s" % (ts, task, pid, cpu, msg))
else:
print("%-18.9f %s" % (ts, msg))
print("%-18.9f %s" % (ts, function))
stack_id = int(m.group(1))
for addr in stack_traces.walk(stack_id):
sym = b.ksymaddr(addr) if offset else b.ksym(addr)
print("\t%016x %s" % (addr, sym))
print()
......@@ -3,27 +3,20 @@ Demonstrations of stacksnoop, the Linux eBPF/bcc version.
This program traces the given kernel function and prints the kernel stack trace
for every call. This tool is useful for studying low frequency kernel functions,
to see how they were invoked. For example, tracing the ext4_sync_fs() call:
# ./stacksnoop ext4_sync_fs
TIME(s) STACK
42005194.132250004
42005194.132253997 ip: ffffffff81280461 ext4_sync_fs
42005194.132256001 r0: ffffffff811ed7f9 iterate_supers
42005194.132257000 r1: ffffffff8121ba25 sys_sync
42005194.132257000 r2: ffffffff81775cb6 entry_SYSCALL_64_fastpath
42005194.132275000
42005194.132275999 ip: ffffffff81280461 ext4_sync_fs
42005194.132275999 r0: ffffffff811ed7f9 iterate_supers
42005194.132276997 r1: ffffffff8121ba35 sys_sync
42005194.132276997 r2: ffffffff81775cb6 entry_SYSCALL_64_fastpath
This shows that ext4_sync_fs() was called by iterate_supers(), which was called
by sys_sync(), and so on. (It tells me that this was a syscall invoked sync,
so an application has requested it.)
The "ip" refers to the instruction pointer, and the "r#" refers to the return
address for each stack frame.
to see how they were invoked. For example, tracing the submit_bio() call:
# ./stacksnoop submit_bio
TIME(s) SYSCALL
3592.838736000 submit_bio
ffffffff813bd961 submit_bio
ffffffff81257c12 submit_bh
ffffffff81301948 jbd2_journal_commit_transaction
ffffffff8130653a kjournald2
ffffffff810a2df8 kthread
ffffffff8183a122 ret_from_fork
This shows that submit_bio() was called by submit_bh(), which was called
by jbd2_journal_commit_transaction(), and so on.
For high frequency functions, see stackcount, which summarizes in-kernel for
efficiency. If you don't know if your function is low or high frequency, try
......@@ -32,20 +25,17 @@ funccount.
The -v option includes more fields, including the on-CPU process (COMM and PID):
# ./stacksnoop -v ext4_sync_fs
TIME(s) COMM PID CPU STACK
42005557.056332998 sync 22352 1
42005557.056336999 sync 22352 1 ip: ffffffff81280461 ext4_sync_fs
42005557.056339003 sync 22352 1 r0: ffffffff811ed7f9 iterate_supers
42005557.056340002 sync 22352 1 r1: ffffffff8121ba25 sys_sync
42005557.056340002 sync 22352 1 r2: ffffffff81775cb6 entry_SYSCALL_64_fastpath
42005557.056358002 sync 22352 1
42005557.056358002 sync 22352 1 ip: ffffffff81280461 ext4_sync_fs
42005557.056359001 sync 22352 1 r0: ffffffff811ed7f9 iterate_supers
42005557.056359999 sync 22352 1 r1: ffffffff8121ba35 sys_sync
42005557.056359999 sync 22352 1 r2: ffffffff81775cb6 entry_SYSCALL_64_fastpath
This identifies the application issuing the sync syscall: the sync(1) command
# ./stacksnoop -v submit_bio
TIME(s) COMM PID CPU SYSCALL
3734.855027000 jbd2/dm-0-8 313 0 submit_bio
ffffffff813bd961 submit_bio
ffffffff81257c12 submit_bh
ffffffff81301948 jbd2_journal_commit_transaction
ffffffff8130653a kjournald2
ffffffff810a2df8 kthread
ffffffff8183a122 ret_from_fork
This identifies the application issuing the sync syscall: the jbd2 process
(COMM column).
......@@ -53,29 +43,32 @@ Here's another example, showing the path to second_overflow() and on-CPU
process:
# ./stacksnoop -v second_overflow
TIME(s) COMM PID CPU STACK
42005696.529449999 <idle> 0 0
42005696.529457003 <idle> 0 0 ip: ffffffff810e5701 second_overflow
42005696.529459000 <idle> 0 0 r0: ffffffff810ecb1b tick_do_update_jiffies64
42005696.529459998 <idle> 0 0 r1: ffffffff810ed6e0 tick_irq_enter
42005696.529459998 <idle> 0 0 r2: ffffffff8107a195 irq_enter
42005696.529460996 <idle> 0 0 r3: ffffffff8146bb6f xen_evtchn_do_upcall
42005696.529460996 <idle> 0 0 r4: ffffffff81777a2e xen_do_hypervisor_callback
42005697.616295002 <idle> 0 0
42005697.616301000 <idle> 0 0 ip: ffffffff810e5701 second_overflow
42005697.616302997 <idle> 0 0 r0: ffffffff810ecb1b tick_do_update_jiffies64
42005697.616304003 <idle> 0 0 r1: ffffffff810ed6e0 tick_irq_enter
42005697.616304003 <idle> 0 0 r2: ffffffff8107a195 irq_enter
42005697.616305001 <idle> 0 0 r3: ffffffff8146bb6f xen_evtchn_do_upcall
42005697.616305001 <idle> 0 0 r4: ffffffff81777a2e xen_do_hypervisor_callback
42005698.556240998 <idle> 0 1
42005698.556247003 <idle> 0 1 ip: ffffffff810e5701 second_overflow
42005698.556249000 <idle> 0 1 r0: ffffffff810ecb1b tick_do_update_jiffies64
42005698.556249000 <idle> 0 1 r1: ffffffff810ed6e0 tick_irq_enter
42005698.556249999 <idle> 0 1 r2: ffffffff8107a195 irq_enter
42005698.556249999 <idle> 0 1 r3: ffffffff8146bb6f xen_evtchn_do_upcall
42005698.556250997 <idle> 0 1 r4: ffffffff81777a2e xen_do_hypervisor_callback
[...]
TIME(s) COMM PID CPU SYSCALL
3837.526433000 <idle> 0 1 second_overflow
ffffffff810fac41 second_overflow
ffffffff81102320 tick_do_update_jiffies64
ffffffff81102bf0 tick_irq_enter
ffffffff810882ac irq_enter
ffffffff8183c7df smp_apic_timer_interrupt
ffffffff8183aae2 apic_timer_interrupt
ffffffff81038f9e default_idle
ffffffff8103979f arch_cpu_idle
ffffffff810c69da default_idle_call
ffffffff810c6cd7 cpu_startup_entry
ffffffff81051cbe start_secondary
3838.526953000 <idle> 0 1 second_overflow
ffffffff810fac41 second_overflow
ffffffff81102320 tick_do_update_jiffies64
ffffffff81102bf0 tick_irq_enter
ffffffff810882ac irq_enter
ffffffff8183c7df smp_apic_timer_interrupt
ffffffff8183aae2 apic_timer_interrupt
ffffffff81038f9e default_idle
ffffffff8103979f arch_cpu_idle
ffffffff810c69da default_idle_call
ffffffff810c6cd7 cpu_startup_entry
ffffffff81051cbe start_secondary
This fires every second (see TIME(s)), and is from tick_do_update_jiffies64().
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment