memleak: Resolve stacks with a StackTrace table

Instead of manually walking the (kernel) stack inside the eBPF code, create a `BPF_STACK_TRACE` table and store the stack IDs in the alloc info struct. This greatly simplifies the leak detection code: instead of storing the full stack trace as a string to de-duplicate the allocation point for the different allocations, we can store the `stack_id`. Since we're creating stack IDs with `BPF_F_REUSE_STACKID`, the kernel will take care of deduplication for us. Additionally, the `StackDecoder` class has been specialized into a `UStackDecoder` and `KStackDecoder` (for userland and kernel stacks, respectively). This lets us pass the decode class directly to `stack_traces.walk()` to automate symbol resolution. A new class, `Allocation` has been created to encapsulate what previously was a 2-element tuple of `(count, size)`. This

memleak: Resolve stacks with a StackTrace table
Instead of manually walking the (kernel) stack inside the eBPF code, create a `BPF_STACK_TRACE` table and store the stack IDs in the alloc info struct. This greatly simplifies the leak detection code: instead of storing the full stack trace as a string to de-duplicate the allocation point for the different allocations, we can store the `stack_id`. Since we're creating stack IDs with `BPF_F_REUSE_STACKID`, the kernel will take care of deduplication for us. Additionally, the `StackDecoder` class has been specialized into a `UStackDecoder` and `KStackDecoder` (for userland and kernel stacks, respectively). This lets us pass the decode class directly to `stack_traces.walk()` to automate symbol resolution. A new class, `Allocation` has been created to encapsulate what previously was a 2-element tuple of `(count, size)`. This
e25ae03e · Vicent Marti · 478636be · e25ae03e · e25ae03e · e25ae03e
Commit e25ae03e authored Mar 25, 2016 by Vicent Marti
Expand all Hide whitespace changes
Inline Side-by-side

Showing with 369 additions and 74 deletions

man/man8/memleak.8 man/man8/memleak.8 +2 -12

tools/memleak.py tools/memleak.py +46 -62

tools/old/memleak.py tools/old/memleak.py +321 -0

No files found.
--- a/man/man8/memleak.8
+++ b/man/man8/memleak.8
@@ -3,7 +3,7 @@
 memleak \- Print a summary of outstanding allocations and their call stacks to detect memory leaks. Uses Linux eBPF/bcc.
 .SH SYNOPSIS
 .B memleak [-h] [-p PID] [-t] [-a] [-o OLDER] [-c COMMAND] [-s SAMPLE_RATE]
-[-d STACK_DEPTH] [-T TOP] [-z MIN_SIZE] [-Z MAX_SIZE] [INTERVAL] [COUNT]
+[-T TOP] [-z MIN_SIZE] [-Z MAX_SIZE] [INTERVAL] [COUNT]
 .SH DESCRIPTION
 memleak traces and matches memory allocation and deallocation requests, and
 collects call stacks for each allocation. memleak can then print a summary
@@ -15,10 +15,7 @@ When tracing all processes, memleak instruments kmalloc and kfree.
 memleak may introduce significant overhead when tracing processes that allocate
 and free many blocks very quickly. See the OVERHEAD section below.

-The stack depth is limited to 10 by default (+1 for the current instruction pointer),
-but it can be controlled using the \-d switch if deeper stacks are required.
-
-This currently only works on x86_64. Check for future versions.
+This tool only works on Linux 4.6+. Stack traces are obtained using the new BPF_STACK_TRACE` APIs.
 .SH REQUIREMENTS
 CONFIG_BPF and bcc.
 .SH OPTIONS
@@ -45,10 +42,6 @@ Run the specified command and trace its allocations only. This traces malloc and
 \-s SAMPLE_RATE
 Record roughly every SAMPLE_RATE-th allocation to reduce overhead.
 .TP
-\-d STACK_DEPTH
-Capture STACK_DEPTH frames (or less) when obtaining allocation call stacks.
-The default value is 10.
-.TP
 \-t TOP
 Print only the top TOP stacks (sorted by size).
 The default value is 10.
@@ -106,9 +99,6 @@ placed in a typical period of 10 seconds:
 #
 .B perf stat -a -e 'probe:__kmalloc' -- sleep 10

-Another setting that may help reduce overhead is lowering the number of stack
-frames captured and parsed by memleak for each allocation, using the \-d switch.
-
 .SH SOURCE
 This is from bcc.
 .IP

--- a/tools/memleak.py
+++ b/tools/memleak.py
@@ -44,29 +44,33 @@ class Time(object):
                        raise OSError(errno_, os.strerror(errno_))
                return t.tv_sec * 1e9 + t.tv_nsec

-class StackDecoder(object):
+class KStackDecoder(object):
+        def refresh(self):
+                pass
+
+        def __call__(self, addr):
+                return "%s [kernel] (%x)" % (BPF.ksym(addr), addr)
+
+class UStackDecoder(object):
        def __init__(self, pid):
                self.pid = pid
-                if pid != -1:
-                        self.proc_sym = ProcessSymbols(pid)
+                self.proc_sym = ProcessSymbols(pid)

        def refresh(self):
-                if self.pid != -1:
-                        self.proc_sym.refresh_code_ranges()
-
-        def decode_stack(self, info, is_kernel_trace):
-                stack = ""
-                if info.num_frames <= 0:
-                        return "???"
-                for i in range(0, info.num_frames):
-                        addr = info.callstack[i]
-                        if is_kernel_trace:
-                                stack += " %s [kernel] (%x) ;" % \
-                                        (BPF.ksym(addr), addr)
-                        else:
-                                stack += " %s (%x) ;" % \
-                                        (self.proc_sym.decode_addr(addr), addr)
-                return stack
+                self.proc_sym.refresh_code_ranges()
+
+        def __call__(self, addr):
+                return "%s (%x)" % (self.proc_sym.decode_addr(addr), addr)
+
+class Allocation(object):
+    def __init__(self, stack, size):
+        self.stack = stack
+        self.count = 1
+        self.size = size
+
+    def update(self, size):
+        self.count += 1
+        self.size += size

 def run_command_get_output(command):
        p = subprocess.Popen(command.split(),
@@ -125,8 +129,6 @@ parser.add_argument("-c", "--command",
        help="execute and trace the specified command")
 parser.add_argument("-s", "--sample-rate", default=1, type=int,
        help="sample every N-th allocation to decrease the overhead")
-parser.add_argument("-d", "--stack-depth", default=10, type=int,
-        help="maximum stack depth to capture")
 parser.add_argument("-T", "--top", type=int, default=10,
        help="display only this many top allocating stacks (by size)")
 parser.add_argument("-z", "--min-size", type=int,
@@ -144,7 +146,6 @@ interval = args.interval
 min_age_ns = 1e6 * args.older
 sample_every_n = args.sample_rate
 num_prints = args.count
-max_stack_size = args.stack_depth + 2
 top_stacks = args.top
 min_size = args.min_size
 max_size = args.max_size
@@ -163,33 +164,12 @@ bpf_source = """
 struct alloc_info_t {
        u64 size;
        u64 timestamp_ns;
-        int num_frames;
-        u64 callstack[MAX_STACK_SIZE];
+        int stack_id;
 };

 BPF_HASH(sizes, u64);
 BPF_HASH(allocs, u64, struct alloc_info_t);
-
-// Adapted from https://github.com/iovisor/bcc/tools/offcputime.py
-static u64 get_frame(u64 *bp) {
-        if (*bp) {
-                // The following stack walker is x86_64 specific
-                u64 ret = 0;
-                if (bpf_probe_read(&ret, sizeof(ret), (void *)(*bp+8)))
-                        return 0;
-                if (bpf_probe_read(bp, sizeof(*bp), (void *)*bp))
-                        *bp = 0;
-                return ret;
-        }
-        return 0;
-}
-static int grab_stack(struct pt_regs *ctx, struct alloc_info_t *info)
-{
-        int depth = 0;
-        u64 bp = ctx->bp;
-        GRAB_ONE_FRAME
-        return depth;
-}
+BPF_STACK_TRACE(stack_traces, 1024)

 int alloc_enter(struct pt_regs *ctx, size_t size)
 {
@@ -223,12 +203,12 @@ int alloc_exit(struct pt_regs *ctx)
        sizes.delete(&pid);

        info.timestamp_ns = bpf_ktime_get_ns();
-        info.num_frames = grab_stack(ctx, &info) - 2;
+        info.stack_id = stack_traces.get_stackid(ctx, STACK_FLAGS);
        allocs.update(&address, &info);

        if (SHOULD_PRINT) {
-                bpf_trace_printk("alloc exited, size = %lu, result = %lx, frames = %d\\n",
-                                 info.size, address, info.num_frames);
+                bpf_trace_printk("alloc exited, size = %lu, result = %lx\\n",
+                                 info.size, address);
        }
        return 0;
 }
@@ -251,9 +231,6 @@ int free_enter(struct pt_regs *ctx, void *address)
 """
 bpf_source = bpf_source.replace("SHOULD_PRINT", "1" if trace_all else "0")
 bpf_source = bpf_source.replace("SAMPLE_EVERY_N", str(sample_every_n))
-bpf_source = bpf_source.replace("GRAB_ONE_FRAME", max_stack_size *
-        "\tif (!(info->callstack[depth++] = get_frame(&bp))) return depth;\n")
-bpf_source = bpf_source.replace("MAX_STACK_SIZE", str(max_stack_size))

 size_filter = ""
 if min_size is not None and max_size is not None:
@@ -265,6 +242,11 @@ elif max_size is not None:
        size_filter = "if (size > %d) return 0;" % max_size
 bpf_source = bpf_source.replace("SIZE_FILTER", size_filter)

+stack_flags = "BPF_F_REUSE_STACKID"
+if not kernel_trace:
+        stack_flags += "|BPF_F_USER_STACK"
+bpf_source = bpf_source.replace("STACK_FLAGS", stack_flags)
+
 bpf_program = BPF(text=bpf_source)

 if not kernel_trace:
@@ -281,29 +263,31 @@ else:
        bpf_program.attach_kretprobe(event="__kmalloc", fn_name="alloc_exit")
        bpf_program.attach_kprobe(event="kfree", fn_name="free_enter")

-decoder = StackDecoder(pid)
+decoder = KStackDecoder() if kernel_trace else UStackDecoder(pid)

 def print_outstanding():
-        stacks = {}
        print("[%s] Top %d stacks with outstanding allocations:" %
              (datetime.now().strftime("%H:%M:%S"), top_stacks))
-        allocs = bpf_program.get_table("allocs")
+        alloc_info = {}
+        allocs = bpf_program["allocs"]
+        stack_traces = bpf_program["stack_traces"]
        for address, info in sorted(allocs.items(), key=lambda a: a[1].size):
                if Time.monotonic_time() - min_age_ns < info.timestamp_ns:
                        continue
-                stack = decoder.decode_stack(info, kernel_trace)
-                if stack in stacks:
-                        stacks[stack] = (stacks[stack][0] + 1,
-                                         stacks[stack][1] + info.size)
+                if info.stack_id < 0:
+                        continue
+                if info.stack_id in alloc_info:
+                        alloc_info[info.stack_id].update(info.size)
                else:
-                        stacks[stack] = (1, info.size)
+                        stack = list(stack_traces.walk(info.stack_id, decoder))
+                        alloc_info[info.stack_id] = Allocation(stack, info.size)
                if args.show_allocs:
                        print("\taddr = %x size = %s" %
                              (address.value, info.size))
-        to_show = sorted(stacks.items(), key=lambda s: s[1][1])[-top_stacks:]
-        for stack, (count, size) in to_show:
+        to_show = sorted(alloc_info.values(), key=lambda a: a.size)[-top_stacks:]
+        for alloc in to_show:
                print("\t%d bytes in %d allocations from stack\n\t\t%s" %
-                      (size, count, stack.replace(";", "\n\t\t")))
+                      (alloc.size, alloc.count, "\n\t\t".join(alloc.stack)))

 count_so_far = 0
 while True:

--- a/tools/old/memleak.py
+++ b/tools/old/memleak.py