Commit 58889898 authored by Brendan Gregg's avatar Brendan Gregg Committed by Sasha Goldshtein

stackcount: add -K, -U, -d, -D, and -f (#1335)

* stackcount: add -K and -U

* stackcount: add -D for duration

* stackcount: add -f for folded output
parent d9641bef
......@@ -2,14 +2,14 @@
.SH NAME
stackcount \- Count function calls and their stack traces. Uses Linux eBPF/bcc.
.SH SYNOPSIS
.B stackcount [\-h] [\-p PID] [\-i INTERVAL] [\-T] [\-r] [\-s]
[\-P] [\-v] [\-d] pattern
.B stackcount [\-h] [\-p PID] [\-i INTERVAL] [\-D DURATION] [\-T] [\-r] [\-s]
[\-P] [\-K] [\-U] [\-v] [\-d] [\-f] pattern
.SH DESCRIPTION
stackcount traces functions and frequency counts them with their entire
stack trace, summarized in-kernel for efficiency. This allows higher
frequency events to be studied. The output consists of unique stack traces,
and their occurrence counts. In addition to kernel and user functions, kernel
tracepoints and USDT tracepoint are also supported.
stack trace, kernel stack and user stack, summarized in-kernel for efficiency.
This allows higher frequency events to be studied. The output consists of
unique stack traces, and their occurrence counts. In addition to kernel and
user functions, kernel tracepoints and USDT tracepoint are also supported.
The pattern is a string with optional '*' wildcards, similar to file globbing.
If you'd prefer to use regular expressions, use the \-r option.
......@@ -38,11 +38,19 @@ Include a timestamp with interval output.
Show raw addresses.
.TP
\-d
Print a delimiter ("--") in-between the kernel and user stacks.
.TP
\-\-debug
Print the source of the BPF program when loading it (for debugging purposes).
.TP
\-i interval
Summary interval, in seconds.
.TP
\-D duration
Total duration of trace, in seconds.
\-f
Folded output format.
.TP
\-p PID
Trace this process ID only (filtered in-kernel).
.TP
......@@ -52,47 +60,59 @@ A function name, or a search pattern. Can include wildcards ("*"). If the
\-r option is used, can include regular expressions.
.SH EXAMPLES
.TP
Count kernel stack traces for submit_bio():
Count kernel and user stack traces for submit_bio():
#
.B stackcount submit_bio
.TP
Count kernel stack traces for ip_output():
Count stacks with a delimiter for submit_bio():
#
.B stackcount \-d submit_bio
.TP
Count kernel stack trace only for submit_bio():
#
.B stackcount \-K submit_bio
.TP
Count user stack trace only for submit_bio():
#
.B stackcount \-U submit_bio
.TP
Count stack traces for ip_output():
#
.B stackcount ip_output
.TP
Show symbol offsets:
#
.B stackcount -s ip_output
.B stackcount \-s ip_output
.TP
Show offsets and raw addresses (verbose):
#
.B stackcount -sv ip_output
.B stackcount \-sv ip_output
.TP
Count kernel stacks for kernel functions matching tcp_send*:
Count stacks for kernel functions matching tcp_send*:
#
.B stackcount 'tcp_send*'
.TP
Same as previous, but using regular expressions:
#
.B stackcount -r '^tcp_send.*'
.B stackcount \-r '^tcp_send.*'
.TP
Output every 5 seconds, with timestamps:
#
.B stackcount -Ti 5 ip_output
.B stackcount \-Ti 5 ip_output
.TP
Only count stacks when PID 185 is on-CPU:
#
.B stackcount -p 185 ip_output
.B stackcount \-p 185 ip_output
.TP
Count user stacks for dynamic heap allocations with malloc in PID 185:
#
.B stackcount -p 185 c:malloc
.B stackcount \-p 185 c:malloc
.TP
Count user stacks for thread creation (USDT tracepoint) in PID 185:
#
.B stackcount -p 185 u:pthread:pthread_create
.B stackcount \-p 185 u:pthread:pthread_create
.TP
Count kernel stacks for context switch events using a kernel tracepoint:
Count stacks for context switch events using a kernel tracepoint:
#
.B stackcount t:sched:sched_switch
.SH OVERHEAD
......
......@@ -3,8 +3,8 @@
# stackcount Count events and their stack traces.
# For Linux, uses BCC, eBPF.
#
# USAGE: stackcount [-h] [-p PID] [-i INTERVAL] [-T] [-r] [-s]
# [-P] [-v] pattern
# USAGE: stackcount.py [-h] [-p PID] [-i INTERVAL] [-D DURATION] [-T] [-r] [-s]
# [-P] [-K] [-U] [-v] [-d] [-f] [--debug]
#
# The pattern is a string with optional '*' wildcards, similar to file
# globbing. If you'd prefer to use regular expressions, use the -r option.
......@@ -27,7 +27,8 @@ import traceback
debug = False
class Probe(object):
def __init__(self, pattern, use_regex=False, pid=None, per_pid=False):
def __init__(self, pattern, kernel_stack, user_stack, use_regex=False,
pid=None, per_pid=False):
"""Init a new probe.
Init the probe from the pattern provided by the user. The supported
......@@ -41,6 +42,8 @@ class Probe(object):
t:cat:event -- probe a kernel tracepoint
u:lib:probe -- probe a USDT tracepoint
"""
self.kernel_stack = kernel_stack
self.user_stack = user_stack
parts = pattern.split(':')
if len(parts) == 1:
parts = ["p", "", parts[0]]
......@@ -103,28 +106,51 @@ class Probe(object):
self.pattern)
def load(self):
ctx_name = "ctx"
stack_trace = ""
if self.user_stack:
stack_trace += """
key.user_stack_id = stack_traces.get_stackid(
%s, BPF_F_REUSE_STACKID | BPF_F_USER_STACK
);""" % (ctx_name)
else:
stack_trace += "key.user_stack_id = -1;"
if self.kernel_stack:
stack_trace += """
key.kernel_stack_id = stack_traces.get_stackid(
%s, BPF_F_REUSE_STACKID
);""" % (ctx_name)
else:
stack_trace += "key.kernel_stack_id = -1;"
trace_count_text = """
int trace_count(void *ctx) {
FILTER
struct key_t key = {};
key.pid = GET_PID;
key.stackid = stack_traces.get_stackid(ctx, STACK_FLAGS);
key.tgid = GET_TGID;
STORE_COMM
%s
u64 zero = 0;
u64 *val = counts.lookup_or_init(&key, &zero);
(*val)++;
return 0;
}
"""
trace_count_text = trace_count_text % (stack_trace)
bpf_text = """#include <uapi/linux/ptrace.h>
#include <linux/sched.h>
struct key_t {
u32 pid;
int stackid;
// no pid (thread ID) so that we do not needlessly split this key
u32 tgid;
int kernel_stack_id;
int user_stack_id;
char name[TASK_COMM_LEN];
};
BPF_HASH(counts, struct key_t);
BPF_STACK_TRACE(stack_traces, 1024);
"""
# We really mean the tgid from the kernel's perspective, which is in
......@@ -138,18 +164,18 @@ BPF_STACK_TRACE(stack_traces, 1024);
# We need per-pid statistics when tracing a user-space process, because
# the meaning of the symbols depends on the pid. We also need them if
# per-pid statistics were requested with -P.
if self.per_pid or not self.is_kernel_probe():
trace_count_text = trace_count_text.replace('GET_PID',
# per-pid statistics were requested with -P, or for user stacks.
if self.per_pid or not self.is_kernel_probe() or self.user_stack:
trace_count_text = trace_count_text.replace('GET_TGID',
'bpf_get_current_pid_tgid() >> 32')
trace_count_text = trace_count_text.replace('STORE_COMM',
'bpf_get_current_comm(&key.name, sizeof(key.name));')
else:
# kernel stacks only. skip splitting on PID so these aggregate
# together, and don't store the process name.
trace_count_text = trace_count_text.replace(
'GET_PID', '0xffffffff')
stack_flags = 'BPF_F_REUSE_STACKID'
if not self.is_kernel_probe():
stack_flags += '| BPF_F_USER_STACK' # can't do both U *and* K
trace_count_text = trace_count_text.replace('STACK_FLAGS', stack_flags)
'GET_TGID', '0xffffffff')
trace_count_text = trace_count_text.replace('STORE_COMM', '')
self.usdt = None
if self.type == "u":
......@@ -182,6 +208,7 @@ class Tool(object):
def __init__(self):
examples = """examples:
./stackcount submit_bio # count kernel stack traces for submit_bio
./stackcount -d ip_output # include a user/kernel stack delimiter
./stackcount -s ip_output # show symbol offsets
./stackcount -sv ip_output # show offsets and raw addresses (verbose)
./stackcount 'tcp_send*' # count stacks for funcs matching tcp_send*
......@@ -191,6 +218,8 @@ class Tool(object):
./stackcount -p 185 c:malloc # count stacks for malloc in PID 185
./stackcount t:sched:sched_fork # count stacks for sched_fork tracepoint
./stackcount -p 185 u:node:* # count stacks for all USDT probes in node
./stackcount -K t:sched:sched_switch # kernel stacks only
./stackcount -U t:sched:sched_switch # user stacks only
"""
parser = argparse.ArgumentParser(
description="Count events and their stack traces",
......@@ -198,8 +227,10 @@ class Tool(object):
epilog=examples)
parser.add_argument("-p", "--pid", type=int,
help="trace this PID only")
parser.add_argument("-i", "--interval", default=99999999,
parser.add_argument("-i", "--interval",
help="summary interval, seconds")
parser.add_argument("-D", "--duration",
help="total duration of trace, seconds")
parser.add_argument("-T", "--timestamp", action="store_true",
help="include timestamp on output")
parser.add_argument("-r", "--regexp", action="store_true",
......@@ -208,19 +239,56 @@ class Tool(object):
help="show address offsets")
parser.add_argument("-P", "--perpid", action="store_true",
help="display stacks separately for each process")
parser.add_argument("-K", "--kernel-stacks-only",
action="store_true", help="kernel stack only", default=False)
parser.add_argument("-U", "--user-stacks-only",
action="store_true", help="user stack only", default=False)
parser.add_argument("-v", "--verbose", action="store_true",
help="show raw addresses")
parser.add_argument("-d", "--debug", action="store_true",
parser.add_argument("-d", "--delimited", action="store_true",
help="insert delimiter between kernel/user stacks")
parser.add_argument("-f", "--folded", action="store_true",
help="output folded format")
parser.add_argument("--debug", action="store_true",
help="print BPF program before starting (for debugging purposes)")
parser.add_argument("pattern",
help="search expression for events")
self.args = parser.parse_args()
global debug
debug = self.args.debug
self.probe = Probe(self.args.pattern, self.args.regexp,
self.args.pid, self.args.perpid)
def _print_frame(self, addr, pid):
if self.args.duration and not self.args.interval:
self.args.interval = self.args.duration
if not self.args.interval:
self.args.interval = 99999999
if self.args.kernel_stacks_only and self.args.user_stacks_only:
print("ERROR: -K and -U are mutually exclusive. If you want " +
"both stacks, that is the default.")
exit()
if not self.args.kernel_stacks_only and not self.args.user_stacks_only:
self.kernel_stack = True
self.user_stack = True
else:
self.kernel_stack = self.args.kernel_stacks_only
self.user_stack = self.args.user_stacks_only
self.probe = Probe(self.args.pattern,
self.kernel_stack, self.user_stack,
self.args.regexp, self.args.pid, self.args.perpid)
self.need_delimiter = self.args.delimited and not (
self.args.kernel_stacks_only or self.args.user_stacks_only)
def _print_kframe(self, addr):
print(" ", end="")
if self.args.verbose:
print("%-16x " % addr, end="")
if self.args.offset:
print("%s" % self.probe.bpf.ksym(addr, show_offset=True))
else:
print("%s" % self.probe.bpf.ksym(addr))
def _print_uframe(self, addr, pid):
print(" ", end="")
if self.args.verbose:
print("%-16x " % addr, end="")
......@@ -233,33 +301,30 @@ class Tool(object):
def _signal_ignore(signal, frame):
print()
def _comm_for_pid(self, pid):
if pid in self.comm_cache:
return self.comm_cache[pid]
try:
comm = " %s [%d]" % (
open("/proc/%d/comm" % pid).read().strip(),
pid)
self.comm_cache[pid] = comm
return comm
except:
return " unknown process [%d]" % pid
def _print_comm(self, comm, pid):
print(" %s [%d]" % (comm, pid))
def run(self):
self.probe.load()
self.probe.attach()
if not self.args.folded:
print("Tracing %d functions for \"%s\"... Hit Ctrl-C to end." %
(self.probe.matched, self.args.pattern))
b = self.probe.bpf
exiting = 0 if self.args.interval else 1
seconds = 0
while True:
try:
sleep(int(self.args.interval))
seconds += int(self.args.interval)
except KeyboardInterrupt:
exiting = 1
# as cleanup can take many seconds, trap Ctrl-C:
signal.signal(signal.SIGINT, Tool._signal_ignore)
if self.args.duration and seconds >= int(self.args.duration):
exiting = 1
if not self.args.folded:
print()
if self.args.timestamp:
print("%-8s\n" % strftime("%H:%M:%S"), end="")
......@@ -269,15 +334,36 @@ class Tool(object):
self.comm_cache = {}
for k, v in sorted(counts.items(),
key=lambda counts: counts[1].value):
for addr in stack_traces.walk(k.stackid):
pid = -1 if self.probe.is_kernel_probe() else k.pid
self._print_frame(addr, pid)
if not self.args.pid and k.pid != 0xffffffff:
print(self._comm_for_pid(k.pid))
user_stack = [] if k.user_stack_id < 0 else \
stack_traces.walk(k.user_stack_id)
kernel_stack = [] if k.kernel_stack_id < 0 else \
stack_traces.walk(k.kernel_stack_id)
if self.args.folded:
# print folded stack output
user_stack = list(user_stack)
kernel_stack = list(kernel_stack)
line = [k.name.decode()] + \
[b.sym(addr, k.tgid) for addr in
reversed(user_stack)] + \
(self.need_delimiter and ["-"] or []) + \
[b.ksym(addr) for addr in reversed(kernel_stack)]
print("%s %d" % (";".join(line), v.value))
else:
# print multi-line stack output
for addr in kernel_stack:
self._print_kframe(addr)
if self.need_delimiter:
print(" --")
for addr in user_stack:
self._print_uframe(addr, k.tgid)
if not self.args.pid and k.tgid != 0xffffffff:
self._print_comm(k.name, k.tgid)
print(" %d\n" % v.value)
counts.clear()
if exiting:
if not self.args.folded:
print("Detaching...")
exit()
......
......@@ -3,55 +3,118 @@ Demonstrations of stackcount, the Linux eBPF/bcc version.
This program traces functions and frequency counts them with their entire
stack trace, summarized in-kernel for efficiency. For example, counting
stack traces that led to submit_bio(), which creates block device I/O:
stack traces that led to the submit_bio() kernel function, which creates
block device I/O:
# ./stackcount submit_bio
Tracing 1 functions for "submit_bio"... Hit Ctrl-C to end.
^C
submit_bio
ext4_writepages
do_writepages
__filemap_fdatawrite_range
filemap_flush
ext4_alloc_da_blocks
ext4_release_file
__fput
____fput
task_work_run
exit_to_usermode_loop
syscall_return_slowpath
entry_SYSCALL_64_fastpath
[unknown]
[unknown]
tar [15069]
5
submit_bio
ext4_bio_write_page
mpage_submit_page
mpage_map_and_submit_buffers
ext4_writepages
do_writepages
__filemap_fdatawrite_range
filemap_flush
ext4_alloc_da_blocks
ext4_release_file
__fput
____fput
task_work_run
exit_to_usermode_loop
syscall_return_slowpath
entry_SYSCALL_64_fastpath
[unknown]
[unknown]
tar [15069]
15
submit_bio
ext4_readpages
__do_page_cache_readahead
ondemand_readahead
page_cache_async_readahead
generic_file_read_iter
__vfs_read
vfs_read
sys_read
entry_SYSCALL_64_fastpath
[unknown]
tar [15069]
113
Detaching...
The output shows unique stack traces, in order from leaf (on-CPU) to root,
followed by their occurrence count. The last stack trace in the above output
shows syscall handling, sys_read(), vfs_read(), and then "readahead" functions:
looks like an application issued file read has triggered read ahead. The
application can be seen after the stack trace, in this case, "tar [15069]"
for the "tar" command, PID 15069.
The order of printed stack traces is from least to most frequent. The most
frequent in this case, the ext4_rename() stack, was taken 113 times during
tracing.
The "[unknown]" frames are from user-level, since this simple workload is
the tar command, which apparently has been compiled without frame pointers.
It's a common compiler optimization, but it breaks frame pointer-based stack
walkers. Similar broken stacks will be seen by other profilers and debuggers
that use frame pointers. Hopefully your application preserves them so that
the user-level stack trace is visible. So how does one get frame pointers, if
your application doesn't have them to start with? For the current bcc (until
it supports other stack walkers), you need to be running a application binaries
that preserves frame pointers, eg, using gcc's -fno-omit-frame-pointer. That's
about all I'll say here: this is a big topic that is not bcc/BPF specific.
It can be useful to trace the path to submit_bio to explain unusual rates of
disk IOPS. These could have in-kernel origins (eg, background scrub).
Now adding the -d option to delimit kernel and user stacks:
# ./stackcount -d submit_bio
Tracing 1 functions for "submit_bio"... Hit Ctrl-C to end.
^C
submit_bio
submit_bh
journal_submit_commit_record.isra.13
journal_submit_commit_record
jbd2_journal_commit_transaction
kjournald2
kthread
ret_from_fork
mb_cache_list
--
jbd2/xvda1-8 [405]
1
submit_bio
__block_write_full_page.constprop.39
block_write_full_page
blkdev_writepage
__writepage
write_cache_pages
generic_writepages
do_writepages
__writeback_single_inode
writeback_sb_inodes
__writeback_inodes_wb
2
submit_bio
__block_write_full_page.constprop.39
block_write_full_page
blkdev_writepage
__writepage
write_cache_pages
generic_writepages
do_writepages
__filemap_fdatawrite_range
filemap_fdatawrite
fdatawrite_one_bdev
36
submit_bio
submit_bh
jbd2_journal_commit_transaction
kjournald2
kthread
ret_from_fork
mb_cache_list
38
--
jbd2/xvda1-8 [405]
2
submit_bio
ext4_writepages
......@@ -59,138 +122,220 @@ Tracing 1 functions for "submit_bio"... Hit Ctrl-C to end.
__filemap_fdatawrite_range
filemap_flush
ext4_alloc_da_blocks
ext4_rename
ext4_rename2
vfs_rename
sys_rename
ext4_release_file
__fput
____fput
task_work_run
exit_to_usermode_loop
syscall_return_slowpath
entry_SYSCALL_64_fastpath
79
--
[unknown]
[unknown]
tar [15187]
5
Detaching...
submit_bio
ext4_bio_write_page
mpage_submit_page
mpage_map_and_submit_buffers
ext4_writepages
do_writepages
__filemap_fdatawrite_range
filemap_flush
ext4_alloc_da_blocks
ext4_release_file
__fput
____fput
task_work_run
exit_to_usermode_loop
syscall_return_slowpath
entry_SYSCALL_64_fastpath
--
[unknown]
[unknown]
tar [15187]
15
The output shows unique stack traces, in order from leaf (on-CPU) to root,
followed by their occurrence count. The last stack trace in the above output
shows syscall handling, ext4_rename(), and filemap_flush(): looks like an
application issued file rename has caused back end disk I/O due to ext4
block allocation and a filemap_flush(). I'd have to browse the code to those
functions to confirm!
submit_bio
ext4_readpages
__do_page_cache_readahead
ondemand_readahead
page_cache_async_readahead
generic_file_read_iter
__vfs_read
vfs_read
sys_read
entry_SYSCALL_64_fastpath
--
[unknown]
[unknown]
[unknown]
tar [15187]
171
The order of printed stack traces is from least to most frequent. The most
frequent in this case, the ext4_rename() stack, was taken 79 times during
tracing.
Detaching...
It can be useful to trace the path to submit_bio to explain unusual rates of
disk IOPS. These could have in-kernel origins (eg, background scrub).
A "--" is printed between the kernel and user stacks.
As another example, here are the code paths that led to ip_output(), which
sends a packet at the IP level:
As a different example, here is the kernel function hrtimer_init_sleeper():
# ./stackcount ip_output
Tracing 1 functions for "ip_output"... Hit Ctrl-C to end.
# ./stackcount.py -d hrtimer_init_sleeper
Tracing 1 functions for "hrtimer_init_sleeper"... Hit Ctrl-C to end.
^C
ip_output
ip_queue_xmit
tcp_transmit_skb
tcp_write_xmit
tcp_tsq_handler.part.32
tcp_tasklet_func
tasklet_action
__do_softirq
do_softirq_own_stack
do_softirq
__local_bh_enable_ip
hrtimer_init_sleeper
do_futex
SyS_futex
entry_SYSCALL_64_fastpath
--
[unknown]
containerd [16020]
1
ip_output
ip_queue_xmit
tcp_transmit_skb
tcp_send_ack
tcp_send_delayed_ack
__tcp_ack_snd_check
tcp_rcv_established
tcp_v4_do_rcv
tcp_v4_rcv
ip_local_deliver_finish
ip_local_deliver
hrtimer_init_sleeper
do_futex
SyS_futex
entry_SYSCALL_64_fastpath
--
__pthread_cond_timedwait
Monitor::IWait(Thread*, long)
Monitor::wait(bool, long, bool)
CompileQueue::get()
CompileBroker::compiler_thread_loop()
JavaThread::thread_main_inner()
JavaThread::run()
java_start(Thread*)
start_thread
java [4996]
1
ip_output
ip_queue_xmit
tcp_transmit_skb
tcp_send_ack
tcp_send_delayed_ack
__tcp_ack_snd_check
tcp_rcv_established
tcp_v4_do_rcv
tcp_v4_rcv
ip_local_deliver_finish
ip_local_deliver
hrtimer_init_sleeper
do_futex
SyS_futex
entry_SYSCALL_64_fastpath
--
[unknown]
[unknown]
containerd [16020]
1
ip_output
ip_queue_xmit
tcp_transmit_skb
tcp_send_ack
tcp_delack_timer_handler
tcp_delack_timer
call_timer_fn
run_timer_softirq
__do_softirq
irq_exit
xen_evtchn_do_upcall
1
hrtimer_init_sleeper
do_futex
SyS_futex
entry_SYSCALL_64_fastpath
--
__pthread_cond_timedwait
VMThread::loop()
VMThread::run()
java_start(Thread*)
start_thread
java [4996]
3
hrtimer_init_sleeper
do_futex
SyS_futex
entry_SYSCALL_64_fastpath
--
[unknown]
dockerd [16008]
4
hrtimer_init_sleeper
do_futex
SyS_futex
entry_SYSCALL_64_fastpath
--
[unknown]
[unknown]
dockerd [16008]
4
hrtimer_init_sleeper
do_futex
SyS_futex
entry_SYSCALL_64_fastpath
--
__pthread_cond_timedwait
Lio/netty/util/ThreadDeathWatcher$Watcher;::run
Interpreter
Interpreter
call_stub
JavaCalls::call_helper(JavaValue*, methodHandle*, JavaCallArguments*, Thread*)
JavaCalls::call_virtual(JavaValue*, KlassHandle, Symbol*, Symbol*, JavaCallArguments*, Thread*)
JavaCalls::call_virtual(JavaValue*, Handle, KlassHandle, Symbol*, Symbol*, Thread*)
thread_entry(JavaThread*, Thread*)
JavaThread::thread_main_inner()
JavaThread::run()
java_start(Thread*)
start_thread
java [4996]
4
hrtimer_init_sleeper
do_futex
SyS_futex
entry_SYSCALL_64_fastpath
--
__pthread_cond_timedwait
clock_gettime
[unknown]
java [4996]
79
Detaching...
I was just trying to find a more interesting example. This output includes
some Java stacks where user-level has been walked correctly (even includes a
JIT symbol translation). dockerd and containerd don't have frame pointers
(grumble), but Java does (which is running with -XX:+PreserveFramePointer).
Here's another kernel function, ip_output():
# ./stackcount.py -d ip_output
Tracing 1 functions for "ip_output"... Hit Ctrl-C to end.
^C
ip_output
ip_queue_xmit
tcp_transmit_skb
tcp_write_xmit
__tcp_push_pending_frames
tcp_rcv_established
tcp_v4_do_rcv
release_sock
tcp_push
tcp_sendmsg
inet_sendmsg
sock_sendmsg
3
ip_output
ip_queue_xmit
tcp_transmit_skb
tcp_write_xmit
tcp_tsq_handler.part.32
tcp_tasklet_func
tasklet_action
__do_softirq
run_ksoftirqd
smpboot_thread_fn
kthread
3
sock_write_iter
__vfs_write
vfs_write
SyS_write
entry_SYSCALL_64_fastpath
--
__write_nocancel
[unknown]
sshd [15015]
5
ip_output
ip_queue_xmit
tcp_transmit_skb
tcp_write_xmit
__tcp_push_pending_frames
tcp_rcv_established
tcp_v4_do_rcv
tcp_v4_rcv
ip_local_deliver_finish
ip_local_deliver
ip_rcv_finish
4
ip_output
ip_queue_xmit
tcp_transmit_skb
tcp_send_ack
tcp_cleanup_rbuf
tcp_recvmsg
inet_recvmsg
sock_recvmsg
sock_read_iter
__vfs_read
vfs_read
tcp_push
tcp_sendmsg
inet_sendmsg
sock_sendmsg
sock_write_iter
__vfs_write
vfs_write
SyS_write
entry_SYSCALL_64_fastpath
--
__write_nocancel
[unknown]
[unknown]
sshd [8234]
5
ip_output
......@@ -198,14 +343,30 @@ Tracing 1 functions for "ip_output"... Hit Ctrl-C to end.
tcp_transmit_skb
tcp_write_xmit
__tcp_push_pending_frames
tcp_rcv_established
tcp_v4_do_rcv
release_sock
tcp_push
tcp_sendmsg
inet_sendmsg
sock_sendmsg
9
sock_write_iter
__vfs_write
vfs_write
SyS_write
entry_SYSCALL_64_fastpath
--
__write_nocancel
sshd [15015]
7
Detaching...
This time just sshd is triggering ip_output() calls.
Watch what happens if I filter on kernel stacks only (-K) for ip_output():
# ./stackcount.py -K ip_output
Tracing 1 functions for "ip_output"... Hit Ctrl-C to end.
^C
ip_output
ip_queue_xmit
tcp_transmit_skb
......@@ -217,61 +378,45 @@ Tracing 1 functions for "ip_output"... Hit Ctrl-C to end.
sock_sendmsg
sock_write_iter
__vfs_write
51
vfs_write
SyS_write
entry_SYSCALL_64_fastpath
13
ip_output
ip_queue_xmit
tcp_transmit_skb
tcp_write_xmit
__tcp_push_pending_frames
tcp_rcv_established
tcp_v4_do_rcv
tcp_v4_rcv
ip_local_deliver_finish
ip_local_deliver
ip_rcv_finish
171
Detaching...
ip_output
ip_queue_xmit
tcp_transmit_skb
tcp_write_xmit
__tcp_push_pending_frames
tcp_rcv_established
tcp_v4_do_rcv
tcp_v4_rcv
ip_local_deliver_finish
ip_local_deliver
ip_rcv_finish
994
They have grouped together as a single unique stack, since the kernel part
was the same.
ip_output
ip_queue_xmit
tcp_transmit_skb
tcp_write_xmit
tcp_tsq_handler.part.32
tcp_tasklet_func
tasklet_action
__do_softirq
irq_exit
xen_evtchn_do_upcall
xen_do_hypervisor_callback
1002
Detaching...
Here is just the user stacks, fetched during the kernel function ip_output():
The last two most frequent stack traces are via tcp_transmit_skb(). Note the
send last begins with ip_rcv_finish(), for a local receive, which then becomes
a transmit: likely pushing more frames when processing the newly received ones.
# ./stackcount.py -U ip_output
Tracing 1 functions for "ip_output"... Hit Ctrl-C to end.
^C
[unknown]
snmpd [1645]
1
__write_nocancel
[unknown]
[unknown]
sshd [8234]
3
__write_nocancel
sshd [15015]
4
As may be obvious, this is a great tool for quickly understanding kernel code
flow.
I should really run a custom sshd with frame pointers so we can see its
stack trace...
User-space functions can also be traced if a library name is provided. For
example, to quickly identify code locations that allocate heap memory:
example, to quickly identify code locations that allocate heap memory for
PID 4902 (using -p), by tracing malloc from libc ("c:malloc"):
# ./stackcount -l c -p 4902 malloc
# ./stackcount -p 4902 c:malloc
Tracing 1 functions for "malloc"... Hit Ctrl-C to end.
^C
malloc
......@@ -290,10 +435,12 @@ Tracing 1 functions for "malloc"... Hit Ctrl-C to end.
Detaching...
Kernel stacks are absent as this didn't enter kernel code.
Note that user-space uses of stackcount can be somewhat more limited because
a lot of user-space libraries and binaries are compiled without debuginfo, or
with frame-pointer omission (-fomit-frame-pointer), which makes it impossible
to reliably obtain the stack trace.
a lot of user-space libraries and binaries are compiled without frame-pointers
as discussed earlier (the -fomit-frame-pointer compiler default) or are used
without debuginfo.
In addition to kernel and user-space functions, kernel tracepoints and USDT
......@@ -313,132 +460,275 @@ Tracing 1 functions for "u:pthread:pthread_create"... Hit Ctrl-C to end.
[unknown]
7
You can use "readelf -n file" to see if it has USDT tracepoints.
Similarly, to determine where context switching is happening in the kernel,
use the sched:sched_switch kernel tracepoint:
# ./stackcount t:sched:sched_switch
... (omitted for brevity)
__schedule
schedule
worker_thread
kthread
ret_from_fork
kworker/0:2 [25482]
1
__schedule
schedule
schedule_hrtimeout_range_clock
schedule_hrtimeout_range
poll_schedule_timeout
do_select
core_sys_select
SyS_select
ep_poll
SyS_epoll_wait
entry_SYSCALL_64_fastpath
40
epoll_wait
Lsun/nio/ch/SelectorImpl;::lockAndDoSelect
Lsun/nio/ch/SelectorImpl;::select
Lio/netty/channel/nio/NioEventLoop;::select
Lio/netty/channel/nio/NioEventLoop;::run
Interpreter
Interpreter
call_stub
JavaCalls::call_helper(JavaValue*, methodHandle*, JavaCallArguments*, Thread*)
JavaCalls::call_virtual(JavaValue*, KlassHandle, Symbol*, Symbol*, JavaCallArguments*, Thread*)
JavaCalls::call_virtual(JavaValue*, Handle, KlassHandle, Symbol*, Symbol*, Thread*)
thread_entry(JavaThread*, Thread*)
JavaThread::thread_main_inner()
JavaThread::run()
java_start(Thread*)
start_thread
java [4996]
1
... (omitted for brevity)
__schedule
schedule
schedule_preempt_disabled
cpu_startup_entry
start_secondary
85
xen_play_dead
arch_cpu_idle_dead
cpu_startup_entry
cpu_bringup_and_idle
swapper/1 [0]
289
A -i option can be used to set an output interval, and -T to include a
timestamp. For example:
# ./stackcount -Ti 1 submit_bio
# ./stackcount.py -Tdi 1 submit_bio
Tracing 1 functions for "submit_bio"... Hit Ctrl-C to end.
01:11:37
06:05:13
06:05:14
submit_bio
submit_bh
journal_submit_commit_record.isra.13
jbd2_journal_commit_transaction
kjournald2
xfs_do_writepage
write_cache_pages
xfs_vm_writepages
do_writepages
__writeback_single_inode
writeback_sb_inodes
__writeback_inodes_wb
wb_writeback
wb_workfn
process_one_work
worker_thread
kthread
ret_from_fork
mb_cache_list
--
kworker/u16:1 [15686]
1
submit_bio
ext4_writepages
do_writepages
__filemap_fdatawrite_range
filemap_flush
ext4_alloc_da_blocks
ext4_rename
ext4_rename2
vfs_rename
sys_rename
process_one_work
worker_thread
kthread
ret_from_fork
--
kworker/u16:0 [16007]
1
submit_bio
xfs_buf_submit
xlog_bdstrat
xlog_sync
xlog_state_release_iclog
_xfs_log_force
xfs_log_force
xfs_fs_sync_fs
sync_fs_one_sb
iterate_supers
sys_sync
entry_SYSCALL_64_fastpath
20
--
[unknown]
sync [16039]
1
submit_bio
submit_bh
journal_submit_commit_record
jbd2_journal_commit_transaction
kjournald2
kthread
ret_from_fork
mb_cache_list
39
--
jbd2/xvda1-8 [405]
1
submit_bio
process_one_work
worker_thread
kthread
ret_from_fork
--
kworker/0:2 [25482]
2
01:11:38
submit_bio
ext4_writepages
do_writepages
__filemap_fdatawrite_range
filemap_flush
ext4_alloc_da_blocks
ext4_rename
ext4_rename2
vfs_rename
sys_rename
entry_SYSCALL_64_fastpath
20
__writeback_single_inode
writeback_sb_inodes
__writeback_inodes_wb
wb_writeback
wb_workfn
process_one_work
worker_thread
kthread
ret_from_fork
--
kworker/u16:0 [16007]
4
submit_bio
xfs_vm_writepages
do_writepages
__writeback_single_inode
writeback_sb_inodes
__writeback_inodes_wb
wb_writeback
wb_workfn
process_one_work
worker_thread
kthread
ret_from_fork
--
kworker/u16:1 [15686]
5
01:11:39
submit_bio
ext4_writepages
__block_write_full_page
block_write_full_page
blkdev_writepage
__writepage
write_cache_pages
generic_writepages
blkdev_writepages
do_writepages
__filemap_fdatawrite_range
filemap_flush
ext4_alloc_da_blocks
ext4_rename
ext4_rename2
vfs_rename
sys_rename
filemap_fdatawrite
fdatawrite_one_bdev
iterate_bdevs
sys_sync
entry_SYSCALL_64_fastpath
20
--
[unknown]
sync [16039]
7
^C
01:11:39
submit_bio
submit_bh
jbd2_journal_commit_transaction
kjournald2
kthread
ret_from_fork
--
jbd2/xvda1-8 [405]
8
submit_bio
ext4_bio_write_page
mpage_submit_page
mpage_map_and_submit_buffers
ext4_writepages
do_writepages
__filemap_fdatawrite_range
filemap_flush
ext4_alloc_da_blocks
ext4_rename
ext4_rename2
vfs_rename
sys_rename
entry_SYSCALL_64_fastpath
20
__writeback_single_inode
writeback_sb_inodes
__writeback_inodes_wb
wb_writeback
wb_workfn
process_one_work
worker_thread
kthread
ret_from_fork
--
kworker/u16:0 [16007]
8
submit_bio
__block_write_full_page
block_write_full_page
blkdev_writepage
__writepage
write_cache_pages
generic_writepages
blkdev_writepages
do_writepages
__writeback_single_inode
writeback_sb_inodes
__writeback_inodes_wb
wb_writeback
wb_workfn
process_one_work
worker_thread
kthread
ret_from_fork
--
kworker/u16:0 [16007]
60
06:05:15
06:05:16
Detaching...
This only included output for the 06:05:14 interval. The other internals
did not span block device I/O.
The -s output prints the return instruction offset for each function (aka
symbol offset). Eg:
# ./stackcount -s tcp_sendmsg
# ./stackcount.py -s tcp_sendmsg
Tracing 1 functions for "tcp_sendmsg"... Hit Ctrl-C to end.
^C
tcp_sendmsg0x1
sock_sendmsg0x38
sock_write_iter0x78
__vfs_write0xaa
vfs_write0xa9
sys_write0x46
entry_SYSCALL_64_fastpath0x16
29
tcp_sendmsg+0x1
sock_sendmsg+0x38
sock_write_iter+0x85
__vfs_write+0xe3
vfs_write+0xb8
SyS_write+0x55
entry_SYSCALL_64_fastpath+0x1e
__write_nocancel+0x7
sshd [15015]
3
tcp_sendmsg+0x1
sock_sendmsg+0x38
sock_write_iter+0x85
__vfs_write+0xe3
vfs_write+0xb8
SyS_write+0x55
entry_SYSCALL_64_fastpath+0x1e
__write_nocancel+0x7
sshd [8234]
3
Detaching...
......@@ -446,9 +736,40 @@ If it wasn't clear how one function called another, knowing the instruction
offset can help you locate the lines of code from a disassembly dump.
A wildcard can also be used. Eg, all functions beginning with "tcp_send":
The -v output is verbose, and shows raw addresses:
# ./stackcount -s 'tcp_send*'
./stackcount.py -v tcp_sendmsg
Tracing 1 functions for "tcp_sendmsg"... Hit Ctrl-C to end.
^C
ffffffff817b05c1 tcp_sendmsg
ffffffff8173ea48 sock_sendmsg
ffffffff8173eae5 sock_write_iter
ffffffff81232b33 __vfs_write
ffffffff812331b8 vfs_write
ffffffff81234625 SyS_write
ffffffff818739bb entry_SYSCALL_64_fastpath
7f120511e6e0 __write_nocancel
sshd [8234]
3
ffffffff817b05c1 tcp_sendmsg
ffffffff8173ea48 sock_sendmsg
ffffffff8173eae5 sock_write_iter
ffffffff81232b33 __vfs_write
ffffffff812331b8 vfs_write
ffffffff81234625 SyS_write
ffffffff818739bb entry_SYSCALL_64_fastpath
7f919c5a26e0 __write_nocancel
sshd [15015]
11
Detaching...
A wildcard can also be used. Eg, all functions beginning with "tcp_send",
kernel stacks only (-K) with offsets (-s):
# ./stackcount -Ks 'tcp_send*'
Tracing 14 functions for "tcp_send*"... Hit Ctrl-C to end.
^C
tcp_send_delayed_ack0x1
......@@ -501,39 +822,69 @@ Detaching...
Use -r to allow regular expressions.
The -f option will emit folded output, which can be used as input to other
tools including flame graphs. For example, with delimiters as well:
# ./stackcount.py -df t:sched:sched_switch
^Csnmp-pass;[unknown];[unknown];[unknown];[unknown];[unknown];-;entry_SYSCALL_64_fastpath;SyS_select;core_sys_select;do_select;poll_schedule_timeout;schedule_hrtimeout_range;schedule_hrtimeout_range_clock;schedule;__schedule 1
kworker/7:0;-;ret_from_fork;kthread;worker_thread;schedule;__schedule 1
watchdog/0;-;ret_from_fork;kthread;smpboot_thread_fn;schedule;__schedule 1
snmp-pass;[unknown];[unknown];[unknown];[unknown];[unknown];-;entry_SYSCALL_64_fastpath;SyS_select;core_sys_select;do_select;poll_schedule_timeout;schedule_hrtimeout_range;schedule_hrtimeout_range_clock;schedule;__schedule 1
svscan;[unknown];-;entry_SYSCALL_64_fastpath;SyS_nanosleep;hrtimer_nanosleep;do_nanosleep;schedule;__schedule 1
python;[unknown];__select_nocancel;-;entry_SYSCALL_64_fastpath;SyS_select;core_sys_select;do_select;poll_schedule_timeout;schedule_hrtimeout_range;schedule_hrtimeout_range_clock;schedule;__schedule 1
kworker/2:0;-;ret_from_fork;kthread;worker_thread;schedule;__schedule 1
[...]
Flame graphs visualize stack traces. For information about them and links
to open source software, see http://www.brendangregg.com/flamegraphs.html .
This folded output can be piped directly into flamegraph.pl (the Perl version).
USAGE message:
# ./stackcount -h
usage: stackcount [-h] [-p PID] [-i INTERVAL] [-T] [-r] [-s]
[-l LIBRARY] [-v] [-d] pattern
usage: stackcount [-h] [-p PID] [-i INTERVAL] [-D DURATION] [-T] [-r] [-s]
[-P] [-K] [-U] [-v] [-d] [-f] [--debug]
pattern
Count function calls and their stack traces
Count events and their stack traces
positional arguments:
pattern search expression for functions
pattern search expression for events
optional arguments:
-h, --help show this help message and exit
-p PID, --pid PID trace this PID only
-i INTERVAL, --interval INTERVAL
summary interval, seconds
-D DURATION, --duration DURATION
total duration of trace, seconds
-T, --timestamp include timestamp on output
-r, --regexp use regular expressions. Default is "*" wildcards
only.
-s, --offset show address offsets
-l, --library trace user-space functions from this library or executable
-P, --perpid display stacks separately for each process
-K, --kernel-stacks-only
kernel stack only
-U, --user-stacks-only
user stack only
-v, --verbose show raw addresses
-d, --debug print BPF program before starting (for debugging purposes)
-d, --delimited insert delimiter between kernel/user stacks
-f, --folded output folded format
--debug print BPF program before starting (for debugging
purposes)
examples:
./stackcount submit_bio # count kernel stack traces for submit_bio
./stackcount ip_output # count kernel stack traces for ip_output
./stackcount -d ip_output # include a user/kernel stack delimiter
./stackcount -s ip_output # show symbol offsets
./stackcount -sv ip_output # show offsets and raw addresses (verbose)
./stackcount 'tcp_send*' # count stacks for funcs matching tcp_send*
./stackcount -r '^tcp_send.*' # same as above, using regular expressions
./stackcount -Ti 5 ip_output # output every 5 seconds, with timestamps
./stackcount -p 185 ip_output # count ip_output stacks for PID 185 only
./stackcount -p 185 -l c malloc # count stacks for malloc in PID 185
./stackcount t:sched:sched_fork # count stacks for the sched_fork tracepoint
./stackcount -p 185 c:malloc # count stacks for malloc in PID 185
./stackcount t:sched:sched_fork # count stacks for sched_fork tracepoint
./stackcount -p 185 u:node:* # count stacks for all USDT probes in node
./stackcount -K t:sched:sched_switch # kernel stacks only
./stackcount -U t:sched:sched_switch # user stacks only
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment