Commit 69ceaca0 authored by Sasha Goldshtein's avatar Sasha Goldshtein Committed by GitHub

softirqs: Migrate to kernel tracepoints instead of kprobes (#1091)

This commit migrates softirqs to use kernel tracepoints instead of
kprobes. Because tracepoints only provide the vector number and not
the function name, we use a conversion table, which is borrowed from
kernel/softirq.c, to translate the vector number to a display name.
This table is expected to be fairly stable. Notably, new names have
not been added since approximately 2009, and the last rename (without
adding or removing a name) was in 2014.

Resolves #1031.
parent 08b8bd69
...@@ -9,10 +9,10 @@ show this time as either totals or histogram distributions. A system-wide ...@@ -9,10 +9,10 @@ show this time as either totals or histogram distributions. A system-wide
summary of this time is shown by the %soft column of mpstat(1), and soft IRQ summary of this time is shown by the %soft column of mpstat(1), and soft IRQ
event counts (but not times) are available in /proc/softirqs. event counts (but not times) are available in /proc/softirqs.
WARNING: This currently uses dynamic tracing of various soft interrupt This tool uses the irq:softirq_entry and irq:softirq_exit kernel tracepoints,
functions, and can easily not work with different kernel versions. Check and which are a stable tracing mechanism. BPF programs can attach to tracepoints
adjust the code as necessary. Also try in a test environment and ensure this from Linux 4.7 only. An older version of this tool is available in tools/old,
tool is safe before use. Future versions should switch to tracepoints. and uses kprobes instead of tracepoints.
Since this uses BPF, only the root user can use this tool. Since this uses BPF, only the root user can use this tool.
.SH REQUIREMENTS .SH REQUIREMENTS
...@@ -87,7 +87,7 @@ example usage, output, and commentary for this tool. ...@@ -87,7 +87,7 @@ example usage, output, and commentary for this tool.
Linux Linux
.SH STABILITY .SH STABILITY
Unstable - in development. Unstable - in development.
.SH AUTHOR .SH AUTHORS
Brendan Gregg Brendan Gregg, Sasha Goldshtein
.SH SEE ALSO .SH SEE ALSO
hardirqs(8) hardirqs(8)
...@@ -242,11 +242,9 @@ class SmokeTests(TestCase): ...@@ -242,11 +242,9 @@ class SmokeTests(TestCase):
def test_slabratetop(self): def test_slabratetop(self):
self.run_with_duration("slabratetop.py 1 1") self.run_with_duration("slabratetop.py 1 1")
@skipUnless(kernel_version_ge(4,7), "requires kernel >= 4.7")
def test_softirqs(self): def test_softirqs(self):
# TODO Temporary disabled as softirqs.py doesn't work on recent self.run_with_duration("softirqs.py 1 1")
# kernels (can't find some of its attach targets). Need to revisit
# it to use the softirq tracepoints. Tracked in bcc#1031.
# self.run_with_duration("softirqs.py 1 1")
pass pass
@skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4") @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4")
......
#!/usr/bin/python
# @lint-avoid-python-3-compatibility-imports
#
# softirqs Summarize soft IRQ (interrupt) event time.
# For Linux, uses BCC, eBPF.
#
# USAGE: softirqs [-h] [-T] [-N] [-d] [interval] [count]
#
# Copyright (c) 2015 Brendan Gregg.
# Licensed under the Apache License, Version 2.0 (the "License")
#
# 20-Oct-2015 Brendan Gregg Created this.
from __future__ import print_function
from bcc import BPF
from time import sleep, strftime
import argparse
# arguments
# Command-line handling: build the usage examples shown at the end of --help,
# declare the options, parse them, and derive the output unit settings.
# NOTE: the text between the triple quotes is runtime help output, not comments.
examples = """examples:
./softirqs # sum soft irq event time
./softirqs -d # show soft irq event time as histograms
./softirqs 1 10 # print 1 second summaries, 10 times
./softirqs -NT 1 # 1s summaries, nanoseconds, and timestamps
"""
# RawDescriptionHelpFormatter keeps the epilog's line breaks as written, so
# the examples above are printed one per line.
parser = argparse.ArgumentParser(
description="Summarize soft irq event time as histograms",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog=examples)
parser.add_argument("-T", "--timestamp", action="store_true",
help="include timestamp on output")
parser.add_argument("-N", "--nanoseconds", action="store_true",
help="output in nanoseconds")
parser.add_argument("-d", "--dist", action="store_true",
help="show distributions as histograms")
# interval and count are optional positionals; the large defaults act as
# "effectively forever" when they are omitted.
parser.add_argument("interval", nargs="?", default=99999999,
help="output interval, in seconds")
parser.add_argument("count", nargs="?", default=99999999,
help="number of outputs")
args = parser.parse_args()
# count is a string when given on the command line, hence the int() conversion.
countdown = int(args.count)
# factor is the divisor applied to the summed nanosecond totals when they are
# printed: 1 leaves nanoseconds, 1000 converts to microseconds. label is the
# matching unit name used in the output headers.
if args.nanoseconds:
factor = 1
label = "nsecs"
else:
factor = 1000
label = "usecs"
# set to a non-zero value to print the generated BPF program before loading it
debug = 0
# define BPF program
# C source of the BPF program, held as a Python string and handed to BPF()
# further down. Summary of what the C text does:
#   - trace_start stores the current bpf_ktime_get_ns() timestamp in "start"
#     and the probed instruction pointer in "iptr", both keyed by the u32
#     taken from bpf_get_current_pid_tgid().
#   - trace_completion looks both entries up, returns early if either is
#     missing, computes the elapsed nanoseconds, runs the STORE placeholder,
#     and deletes both entries.
#   - STORE is not valid C; it is replaced with the real statements by the
#     "code substitutions" step before the program is compiled.
# The string content is runtime input to the compiler, so do not add Python
# comments inside it.
bpf_text = """
#include <uapi/linux/ptrace.h>
typedef struct irq_key {
u64 ip;
u64 slot;
} irq_key_t;
BPF_HASH(start, u32);
BPF_HASH(iptr, u32);
BPF_HISTOGRAM(dist, irq_key_t);
// time IRQ
int trace_start(struct pt_regs *ctx)
{
u32 pid = bpf_get_current_pid_tgid();
u64 ip = PT_REGS_IP(ctx), ts = bpf_ktime_get_ns();
start.update(&pid, &ts);
iptr.update(&pid, &ip);
return 0;
}
int trace_completion(struct pt_regs *ctx)
{
u64 *tsp, delta, ip, *ipp;
u32 pid = bpf_get_current_pid_tgid();
// fetch timestamp and calculate delta
tsp = start.lookup(&pid);
ipp = iptr.lookup(&pid);
if (tsp == 0 || ipp == 0) {
return 0; // missed start
}
delta = bpf_ktime_get_ns() - *tsp;
ip = *ipp;
// store as sum or histogram
STORE
start.delete(&pid);
iptr.delete(&pid);
return 0;
}
"""
# code substitutions
# Replace the STORE placeholder in the BPF source with the statements for the
# selected output mode.
if args.dist:
# histogram mode (-d): key is (ip, log2 bucket of delta) and the matching
# dist entry is incremented by one per event
bpf_text = bpf_text.replace('STORE',
'irq_key_t key = {.ip = ip, .slot = bpf_log2l(delta)};' +
'dist.increment(key);')
else:
# summary mode: slot is fixed at 0 so there is one entry per ip, and the
# elapsed nanoseconds are added to that entry's running total
bpf_text = bpf_text.replace('STORE',
'irq_key_t key = {.ip = ip, .slot = 0 /* ignore */};' +
'u64 zero = 0, *vp = dist.lookup_or_init(&key, &zero);' +
'(*vp) += delta;')
# optional dump of the final program text, controlled by the debug flag
if debug:
print(bpf_text)
# load BPF program
b = BPF(text=bpf_text)
# this should really use irq:softirq_entry/exit tracepoints; for now the
# soft irq functions are individually traced (search your kernel for
# open_softirq() calls, and adjust the following list as needed).
# Each listed kernel function gets a kprobe running trace_start and a
# kretprobe running trace_completion.
# NOTE(review): the names are hard-coded; attachment presumably fails on
# kernels where one of these functions does not exist -- confirm against the
# target kernel before use.
for softirqfunc in ("blk_iopoll_softirq", "blk_done_softirq",
"rcu_process_callbacks", "run_rebalance_domains", "tasklet_action",
"tasklet_hi_action", "run_timer_softirq", "net_tx_action",
"net_rx_action"):
b.attach_kprobe(event=softirqfunc, fn_name="trace_start")
b.attach_kretprobe(event=softirqfunc, fn_name="trace_completion")
print("Tracing soft irq event time... Hit Ctrl-C to end.")
# output
# Report loop: wait one interval, print the collected data, clear the table,
# and stop once the requested number of reports has been printed or the user
# has interrupted the wait.
# a falsy interval sets exiting up front, so the exit check fires after the
# first report
exiting = 0 if args.interval else 1
dist = b.get_table("dist")
while (1):
try:
# interval is a string when given on the command line, hence int()
sleep(int(args.interval))
except KeyboardInterrupt:
# Ctrl-C during the wait: remember to exit, but still print what was
# collected so far
exiting = 1
print()
if args.timestamp:
# -T: the format string itself carries the newline, so end="" avoids a
# second one
print("%-8s\n" % strftime("%H:%M:%S"), end="")
if args.dist:
# one log2 histogram per traced function; b.ksym is passed to turn the
# stored instruction pointer into the section label
dist.print_log2_hist(label, "softirq", section_print_fn=b.ksym)
else:
print("%-26s %11s" % ("SOFTIRQ", "TOTAL_" + label))
# rows are sorted by accumulated time, smallest first. The lambda's
# parameter is also named dist; inside the lambda it is one (key, value)
# pair, not the table.
for k, v in sorted(dist.items(), key=lambda dist: dist[1].value):
# totals are stored in nanoseconds; factor converts to the chosen unit
print("%-26s %11d" % (b.ksym(k.ip), v.value / factor))
# reset the table so each report covers only its own interval
dist.clear()
countdown -= 1
if exiting or countdown == 0:
exit()
...@@ -10,6 +10,7 @@ ...@@ -10,6 +10,7 @@
# Licensed under the Apache License, Version 2.0 (the "License") # Licensed under the Apache License, Version 2.0 (the "License")
# #
# 20-Oct-2015 Brendan Gregg Created this. # 20-Oct-2015 Brendan Gregg Created this.
# 03-Apr-2017 Sasha Goldshtein Migrated to kernel tracepoints.
from __future__ import print_function from __future__ import print_function
from bcc import BPF from bcc import BPF
...@@ -24,7 +25,7 @@ examples = """examples: ...@@ -24,7 +25,7 @@ examples = """examples:
./softirqs -NT 1 # 1s summaries, nanoseconds, and timestamps ./softirqs -NT 1 # 1s summaries, nanoseconds, and timestamps
""" """
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
description="Summarize soft irq event time as histograms", description="Summarize soft irq event time as histograms.",
formatter_class=argparse.RawDescriptionHelpFormatter, formatter_class=argparse.RawDescriptionHelpFormatter,
epilog=examples) epilog=examples)
parser.add_argument("-T", "--timestamp", action="store_true", parser.add_argument("-T", "--timestamp", action="store_true",
...@@ -52,42 +53,49 @@ bpf_text = """ ...@@ -52,42 +53,49 @@ bpf_text = """
#include <uapi/linux/ptrace.h> #include <uapi/linux/ptrace.h>
typedef struct irq_key { typedef struct irq_key {
u64 ip; u32 vec;
u64 slot; u64 slot;
} irq_key_t; } irq_key_t;
BPF_HASH(start, u32);
typedef struct account_val {
u64 ts;
u32 vec;
} account_val_t;
BPF_HASH(start, u32, account_val_t);
BPF_HASH(iptr, u32); BPF_HASH(iptr, u32);
BPF_HISTOGRAM(dist, irq_key_t); BPF_HISTOGRAM(dist, irq_key_t);
// time IRQ TRACEPOINT_PROBE(irq, softirq_entry)
int trace_start(struct pt_regs *ctx)
{ {
u32 pid = bpf_get_current_pid_tgid(); u32 pid = bpf_get_current_pid_tgid();
u64 ip = PT_REGS_IP(ctx), ts = bpf_ktime_get_ns(); account_val_t val = {};
start.update(&pid, &ts); val.ts = bpf_ktime_get_ns();
iptr.update(&pid, &ip); val.vec = args->vec;
start.update(&pid, &val);
return 0; return 0;
} }
int trace_completion(struct pt_regs *ctx) TRACEPOINT_PROBE(irq, softirq_exit)
{ {
u64 *tsp, delta, ip, *ipp; u64 delta;
u32 vec;
u32 pid = bpf_get_current_pid_tgid(); u32 pid = bpf_get_current_pid_tgid();
account_val_t *valp;
irq_key_t key = {0};
// fetch timestamp and calculate delta // fetch timestamp and calculate delta
tsp = start.lookup(&pid); valp = start.lookup(&pid);
ipp = iptr.lookup(&pid); if (valp == 0) {
if (tsp == 0 || ipp == 0) {
return 0; // missed start return 0; // missed start
} }
delta = bpf_ktime_get_ns() - *tsp; delta = bpf_ktime_get_ns() - valp->ts;
ip = *ipp; vec = valp->vec;
// store as sum or histogram // store as sum or histogram
STORE STORE
start.delete(&pid); start.delete(&pid);
iptr.delete(&pid);
return 0; return 0;
} }
""" """
...@@ -95,12 +103,12 @@ int trace_completion(struct pt_regs *ctx) ...@@ -95,12 +103,12 @@ int trace_completion(struct pt_regs *ctx)
# code substitutions # code substitutions
if args.dist: if args.dist:
bpf_text = bpf_text.replace('STORE', bpf_text = bpf_text.replace('STORE',
'irq_key_t key = {.ip = ip, .slot = bpf_log2l(delta)};' + 'key.vec = vec; key.slot = bpf_log2l(delta); ' +
'dist.increment(key);') 'dist.increment(key);')
else: else:
bpf_text = bpf_text.replace('STORE', bpf_text = bpf_text.replace('STORE',
'irq_key_t key = {.ip = ip, .slot = 0 /* ignore */};' + 'key.vec = valp->vec; ' +
'u64 zero = 0, *vp = dist.lookup_or_init(&key, &zero);' + 'u64 zero = 0, *vp = dist.lookup_or_init(&key, &zero); ' +
'(*vp) += delta;') '(*vp) += delta;')
if debug: if debug:
print(bpf_text) print(bpf_text)
...@@ -108,15 +116,11 @@ if debug: ...@@ -108,15 +116,11 @@ if debug:
# load BPF program # load BPF program
b = BPF(text=bpf_text) b = BPF(text=bpf_text)
# this should really use irq:softirq_entry/exit tracepoints; for now the def vec_to_name(vec):
# soft irq functions are individually traced (search your kernel for # copied from softirq_to_name() in kernel/softirq.c
# open_softirq() calls, and adjust the following list as needed). # may need updates if new softirq handlers are added
for softirqfunc in ("blk_iopoll_softirq", "blk_done_softirq", return ["hi", "timer", "net_tx", "net_rx", "block", "irq_poll",
"rcu_process_callbacks", "run_rebalance_domains", "tasklet_action", "tasklet", "sched", "hrtimer", "rcu"][vec]
"tasklet_hi_action", "run_timer_softirq", "net_tx_action",
"net_rx_action"):
b.attach_kprobe(event=softirqfunc, fn_name="trace_start")
b.attach_kretprobe(event=softirqfunc, fn_name="trace_completion")
print("Tracing soft irq event time... Hit Ctrl-C to end.") print("Tracing soft irq event time... Hit Ctrl-C to end.")
...@@ -134,11 +138,11 @@ while (1): ...@@ -134,11 +138,11 @@ while (1):
print("%-8s\n" % strftime("%H:%M:%S"), end="") print("%-8s\n" % strftime("%H:%M:%S"), end="")
if args.dist: if args.dist:
dist.print_log2_hist(label, "softirq", section_print_fn=b.ksym) dist.print_log2_hist(label, "softirq", section_print_fn=vec_to_name)
else: else:
print("%-26s %11s" % ("SOFTIRQ", "TOTAL_" + label)) print("%-16s %11s" % ("SOFTIRQ", "TOTAL_" + label))
for k, v in sorted(dist.items(), key=lambda dist: dist[1].value): for k, v in sorted(dist.items(), key=lambda dist: dist[1].value):
print("%-26s %11d" % (b.ksym(k.ip), v.value / factor)) print("%-16s %11d" % (vec_to_name(k.vec), v.value / factor))
dist.clear() dist.clear()
countdown -= 1 countdown -= 1
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment