Commit 69ceaca0 authored by Sasha Goldshtein's avatar Sasha Goldshtein Committed by GitHub

softirqs: Migrate to kernel tracepoints instead of kprobes (#1091)

This commit migrates softirqs to use kernel tracepoints instead of
kprobes. Because tracepoints only provide the vector number and not
the function name, we use a conversion table, which is borrowed from
kernel/softirq.c, to translate the vector number to a display name.
This table is expected to be fairly stable. Notably, new names have
not been added since approximately 2009, and the last rename (without
adding or removing a name) was in 2014.

Resolves #1031.
parent 08b8bd69
......@@ -9,10 +9,10 @@ show this time as either totals or histogram distributions. A system-wide
summary of this time is shown by the %soft column of mpstat(1), and soft IRQ
event counts (but not times) are available in /proc/softirqs.
WARNING: This currently uses dynamic tracing of various soft interrupt
functions, and can easily not work with different kernel versions. Check and
adjust the code as necessary. Also try in a test environment and ensure this
tool is safe before use. Future versions should switch to tracepoints.
This tool uses the irq:softirq_enter and irq:softirq_exit kernel tracepoints,
which is a stable tracing mechanism. BPF programs can attach to tracepoints
from Linux 4.7 only. An older version of this tool is available in tools/old,
and uses kprobes instead of tracepoints.
Since this uses BPF, only the root user can use this tool.
.SH REQUIREMENTS
......@@ -87,7 +87,7 @@ example usage, output, and commentary for this tool.
Linux
.SH STABILITY
Unstable - in development.
.SH AUTHOR
Brendan Gregg
.SH AUTHORS
Brendan Gregg, Sasha Goldshtein
.SH SEE ALSO
hardirqs(8)
......@@ -242,11 +242,9 @@ class SmokeTests(TestCase):
def test_slabratetop(self):
self.run_with_duration("slabratetop.py 1 1")
@skipUnless(kernel_version_ge(4,7), "requires kernel >= 4.7")
def test_softirqs(self):
# TODO Temporary disabled as softirqs.py doesn't work on recent
# kernels (can't find some of its attach targets). Need to revisit
# it to use the softirq tracepoints. Tracked in bcc#1031.
# self.run_with_duration("softirqs.py 1 1")
self.run_with_duration("softirqs.py 1 1")
pass
@skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4")
......
#!/usr/bin/python
# @lint-avoid-python-3-compatibility-imports
#
# softirqs Summarize soft IRQ (interrupt) event time.
# For Linux, uses BCC, eBPF.
#
# USAGE: softirqs [-h] [-T] [-N] [-d] [interval] [count]
#
# Copyright (c) 2015 Brendan Gregg.
# Licensed under the Apache License, Version 2.0 (the "License")
#
# 20-Oct-2015 Brendan Gregg Created this.
from __future__ import print_function
from bcc import BPF
from time import sleep, strftime
import argparse
# arguments
examples = """examples:
./softirqs # sum soft irq event time
./softirqs -d # show soft irq event time as histograms
./softirqs 1 10 # print 1 second summaries, 10 times
./softirqs -NT 1 # 1s summaries, nanoseconds, and timestamps
"""
parser = argparse.ArgumentParser(
description="Summarize soft irq event time as histograms",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog=examples)
parser.add_argument("-T", "--timestamp", action="store_true",
help="include timestamp on output")
parser.add_argument("-N", "--nanoseconds", action="store_true",
help="output in nanoseconds")
parser.add_argument("-d", "--dist", action="store_true",
help="show distributions as histograms")
parser.add_argument("interval", nargs="?", default=99999999,
help="output interval, in seconds")
parser.add_argument("count", nargs="?", default=99999999,
help="number of outputs")
args = parser.parse_args()
countdown = int(args.count)
if args.nanoseconds:
factor = 1
label = "nsecs"
else:
factor = 1000
label = "usecs"
debug = 0
# define BPF program
bpf_text = """
#include <uapi/linux/ptrace.h>
typedef struct irq_key {
u64 ip;
u64 slot;
} irq_key_t;
BPF_HASH(start, u32);
BPF_HASH(iptr, u32);
BPF_HISTOGRAM(dist, irq_key_t);
// time IRQ
int trace_start(struct pt_regs *ctx)
{
u32 pid = bpf_get_current_pid_tgid();
u64 ip = PT_REGS_IP(ctx), ts = bpf_ktime_get_ns();
start.update(&pid, &ts);
iptr.update(&pid, &ip);
return 0;
}
int trace_completion(struct pt_regs *ctx)
{
u64 *tsp, delta, ip, *ipp;
u32 pid = bpf_get_current_pid_tgid();
// fetch timestamp and calculate delta
tsp = start.lookup(&pid);
ipp = iptr.lookup(&pid);
if (tsp == 0 || ipp == 0) {
return 0; // missed start
}
delta = bpf_ktime_get_ns() - *tsp;
ip = *ipp;
// store as sum or histogram
STORE
start.delete(&pid);
iptr.delete(&pid);
return 0;
}
"""
# code substitutions
if args.dist:
bpf_text = bpf_text.replace('STORE',
'irq_key_t key = {.ip = ip, .slot = bpf_log2l(delta)};' +
'dist.increment(key);')
else:
bpf_text = bpf_text.replace('STORE',
'irq_key_t key = {.ip = ip, .slot = 0 /* ignore */};' +
'u64 zero = 0, *vp = dist.lookup_or_init(&key, &zero);' +
'(*vp) += delta;')
if debug:
print(bpf_text)
# load BPF program
b = BPF(text=bpf_text)
# this should really use irq:softirq_entry/exit tracepoints; for now the
# soft irq functions are individually traced (search your kernel for
# open_softirq() calls, and adjust the following list as needed).
for softirqfunc in ("blk_iopoll_softirq", "blk_done_softirq",
"rcu_process_callbacks", "run_rebalance_domains", "tasklet_action",
"tasklet_hi_action", "run_timer_softirq", "net_tx_action",
"net_rx_action"):
b.attach_kprobe(event=softirqfunc, fn_name="trace_start")
b.attach_kretprobe(event=softirqfunc, fn_name="trace_completion")
print("Tracing soft irq event time... Hit Ctrl-C to end.")
# output
exiting = 0 if args.interval else 1
dist = b.get_table("dist")
while (1):
try:
sleep(int(args.interval))
except KeyboardInterrupt:
exiting = 1
print()
if args.timestamp:
print("%-8s\n" % strftime("%H:%M:%S"), end="")
if args.dist:
dist.print_log2_hist(label, "softirq", section_print_fn=b.ksym)
else:
print("%-26s %11s" % ("SOFTIRQ", "TOTAL_" + label))
for k, v in sorted(dist.items(), key=lambda dist: dist[1].value):
print("%-26s %11d" % (b.ksym(k.ip), v.value / factor))
dist.clear()
countdown -= 1
if exiting or countdown == 0:
exit()
......@@ -9,7 +9,8 @@
# Copyright (c) 2015 Brendan Gregg.
# Licensed under the Apache License, Version 2.0 (the "License")
#
# 20-Oct-2015 Brendan Gregg Created this.
# 20-Oct-2015 Brendan Gregg Created this.
# 03-Apr-2017 Sasha Goldshtein Migrated to kernel tracepoints.
from __future__ import print_function
from bcc import BPF
......@@ -24,7 +25,7 @@ examples = """examples:
./softirqs -NT 1 # 1s summaries, nanoseconds, and timestamps
"""
parser = argparse.ArgumentParser(
description="Summarize soft irq event time as histograms",
description="Summarize soft irq event time as histograms.",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog=examples)
parser.add_argument("-T", "--timestamp", action="store_true",
......@@ -52,42 +53,49 @@ bpf_text = """
#include <uapi/linux/ptrace.h>
typedef struct irq_key {
u64 ip;
u32 vec;
u64 slot;
} irq_key_t;
BPF_HASH(start, u32);
typedef struct account_val {
u64 ts;
u32 vec;
} account_val_t;
BPF_HASH(start, u32, account_val_t);
BPF_HASH(iptr, u32);
BPF_HISTOGRAM(dist, irq_key_t);
// time IRQ
int trace_start(struct pt_regs *ctx)
TRACEPOINT_PROBE(irq, softirq_entry)
{
u32 pid = bpf_get_current_pid_tgid();
u64 ip = PT_REGS_IP(ctx), ts = bpf_ktime_get_ns();
start.update(&pid, &ts);
iptr.update(&pid, &ip);
account_val_t val = {};
val.ts = bpf_ktime_get_ns();
val.vec = args->vec;
start.update(&pid, &val);
return 0;
}
int trace_completion(struct pt_regs *ctx)
TRACEPOINT_PROBE(irq, softirq_exit)
{
u64 *tsp, delta, ip, *ipp;
u64 delta;
u32 vec;
u32 pid = bpf_get_current_pid_tgid();
account_val_t *valp;
irq_key_t key = {0};
// fetch timestamp and calculate delta
tsp = start.lookup(&pid);
ipp = iptr.lookup(&pid);
if (tsp == 0 || ipp == 0) {
valp = start.lookup(&pid);
if (valp == 0) {
return 0; // missed start
}
delta = bpf_ktime_get_ns() - *tsp;
ip = *ipp;
delta = bpf_ktime_get_ns() - valp->ts;
vec = valp->vec;
// store as sum or histogram
STORE
start.delete(&pid);
iptr.delete(&pid);
return 0;
}
"""
......@@ -95,12 +103,12 @@ int trace_completion(struct pt_regs *ctx)
# code substitutions
if args.dist:
bpf_text = bpf_text.replace('STORE',
'irq_key_t key = {.ip = ip, .slot = bpf_log2l(delta)};' +
'key.vec = vec; key.slot = bpf_log2l(delta); ' +
'dist.increment(key);')
else:
bpf_text = bpf_text.replace('STORE',
'irq_key_t key = {.ip = ip, .slot = 0 /* ignore */};' +
'u64 zero = 0, *vp = dist.lookup_or_init(&key, &zero);' +
'key.vec = valp->vec; ' +
'u64 zero = 0, *vp = dist.lookup_or_init(&key, &zero); ' +
'(*vp) += delta;')
if debug:
print(bpf_text)
......@@ -108,15 +116,11 @@ if debug:
# load BPF program
b = BPF(text=bpf_text)
# this should really use irq:softirq_entry/exit tracepoints; for now the
# soft irq functions are individually traced (search your kernel for
# open_softirq() calls, and adjust the following list as needed).
for softirqfunc in ("blk_iopoll_softirq", "blk_done_softirq",
"rcu_process_callbacks", "run_rebalance_domains", "tasklet_action",
"tasklet_hi_action", "run_timer_softirq", "net_tx_action",
"net_rx_action"):
b.attach_kprobe(event=softirqfunc, fn_name="trace_start")
b.attach_kretprobe(event=softirqfunc, fn_name="trace_completion")
def vec_to_name(vec):
# copied from softirq_to_name() in kernel/softirq.c
# may need updates if new softirq handlers are added
return ["hi", "timer", "net_tx", "net_rx", "block", "irq_poll",
"tasklet", "sched", "hrtimer", "rcu"][vec]
print("Tracing soft irq event time... Hit Ctrl-C to end.")
......@@ -134,11 +138,11 @@ while (1):
print("%-8s\n" % strftime("%H:%M:%S"), end="")
if args.dist:
dist.print_log2_hist(label, "softirq", section_print_fn=b.ksym)
dist.print_log2_hist(label, "softirq", section_print_fn=vec_to_name)
else:
print("%-26s %11s" % ("SOFTIRQ", "TOTAL_" + label))
print("%-16s %11s" % ("SOFTIRQ", "TOTAL_" + label))
for k, v in sorted(dist.items(), key=lambda dist: dist[1].value):
print("%-26s %11d" % (b.ksym(k.ip), v.value / factor))
print("%-16s %11d" % (vec_to_name(k.vec), v.value / factor))
dist.clear()
countdown -= 1
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment