Merge pull request #772 from goldshtn/strcmp

trace, argdist: STRCMP helper function

Merge pull request #772 from goldshtn/strcmp
trace, argdist: STRCMP helper function
315998d8 · Brendan Gregg · GitHub · 797c3ec1 · 3286c063 · 315998d8
Commit 315998d8 authored Oct 19, 2016 by Brendan Gregg Committed by GitHub Oct 19, 2016
6 changed files
--- a/man/man8/argdist.8
+++ b/man/man8/argdist.8
@@ -111,6 +111,10 @@ Only parameter values that pass the filter will be collected. This is any valid
 C expression that refers to the parameter values, such as "fd == 1 && length > 16".
 The $entry, $retval, and $latency variables can be used here as well, in return
 probes.
+The filter expression may also use the STRCMP pseudo-function to compare
+a predefined string to a string argument. For example: STRCMP("test.txt", file).
+The order of arguments is important: the first argument MUST be a quoted
+literal string, and the second argument can be a runtime string.
 .TP
 .B [label]
 The label that will be displayed when printing the probed values. By default,

--- a/man/man8/trace.8
+++ b/man/man8/trace.8
@@ -94,6 +94,12 @@ Note that only arg1-arg6 are supported, and only if the function is using the
 standard x86_64 convention where the first six arguments are in the RDI, RSI, 
 RDX, RCX, R8, R9 registers. If no predicate is specified, all function 
 invocations are traced.
+The predicate expression may also use the STRCMP pseudo-function to compare
+a predefined string to a string argument. For example: STRCMP("test", arg1).
+The order of arguments is important: the first argument MUST be a quoted
+literal string, and the second argument can be a runtime string, most typically
+an argument. 
 .TP
 .B ["format string"[, arguments]]
 A printf-style format string that will be used for the trace message. You can

--- a/tools/argdist.py
+++ b/tools/argdist.py
@@ -19,6 +19,7 @@ import sys
 class Probe(object):
        next_probe_index = 0
+        streq_index = 0
        aliases = {"$PID": "bpf_get_current_pid_tgid()"}
        def _substitute_aliases(self, expr):
@@ -174,6 +175,7 @@ u64 __time = bpf_ktime_get_ns();
        def __init__(self, tool, type, specifier):
                self.usdt_ctx = None
+                self.streq_functions = ""
                self.pid = tool.args.pid
                self.cumulative = tool.args.cumulative or False
                self.raw_spec = specifier
@@ -242,9 +244,32 @@ u64 __time = bpf_ktime_get_ns();
                self.usdt_ctx.enable_probe(
                        self.function, self.probe_func_name)
+        def _generate_streq_function(self, string):
+                fname = "streq_%d" % Probe.streq_index
+                Probe.streq_index += 1
+                self.streq_functions += """
+static inline bool %s(char const *ignored, char const *str) {
+        char needle[] = %s;
+        char haystack[sizeof(needle)];
+        bpf_probe_read(&haystack, sizeof(haystack), (void *)str);
+        for (int i = 0; i < sizeof(needle); ++i) {
+                if (needle[i] != haystack[i]) {
+                        return false;
+                }
+        }
+        return true;
+}
+                """ % (fname, string)
+                return fname
        def _substitute_exprs(self):
                def repl(expr):
                        expr = self._substitute_aliases(expr)
+                        matches = re.finditer('STRCMP\\(("[^"]+\\")', expr)
+                        for match in matches:
+                                string = match.group(1)
+                                fname = self._generate_streq_function(string)
+                                expr = expr.replace("STRCMP", fname, 1)
                        return expr.replace("$retval", "PT_REGS_RC(ctx)")
                for i in range(0, len(self.exprs)):
                        self.exprs[i] = repl(self.exprs[i])
@@ -370,7 +395,7 @@ DATA_DECL
                program = program.replace("COLLECT", collect)
                program = program.replace("PREFIX", prefix)
-                return program
+                return self.streq_functions + program
        def _attach_u(self):
                libpath = BPF.find_library(self.library)

--- a/tools/argdist_example.txt
+++ b/tools/argdist_example.txt
@@ -277,6 +277,45 @@ t:net:net_dev_start_xmit():u16:args->protocol
 Note that to discover the format of the net:net_dev_start_xmit tracepoint, you
 use the tplist tool (tplist -v net:net_dev_start_xmit).
+Occasionally, it is useful to filter certain expressions by string. This is not
+trivially supported by BPF, but argdist provides a STRCMP helper you can use in
+filter expressions. For example, to get a histogram of latencies opening a
+specific file, run this:
+# argdist -c -H 'r:c:open(char *file):u64:$latency/1000:STRCMP("test.txt",$entry(file))'
+[02:16:38]
+[02:16:39]
+[02:16:40]
+     $latency/1000       : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 2        |****************************************|
+[02:16:41]
+     $latency/1000       : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 1        |**********                              |
+        16 -> 31         : 4        |****************************************|
+[02:16:42]
+     $latency/1000       : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 1        |********                                |
+        16 -> 31         : 5        |****************************************|
+[02:16:43]
+     $latency/1000       : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 1        |********                                |
+        16 -> 31         : 5        |****************************************|
 Here's a final example that finds how many write() system calls are performed
 by each process on the system:

--- a/tools/trace.py
+++ b/tools/trace.py
@@ -46,6 +46,7 @@ class Time(object):
 class Probe(object):
        probe_count = 0
+        streq_index = 0
        max_events = None
        event_count = 0
        first_ts = 0
@@ -61,6 +62,7 @@ class Probe(object):
        def __init__(self, probe, string_size, kernel_stack, user_stack):
                self.usdt = None
+                self.streq_functions = ""
                self.raw_probe = probe
                self.string_size = string_size
                self.kernel_stack = kernel_stack
@@ -159,7 +161,7 @@ class Probe(object):
                self._bail("unrecognized USDT probe %s" % self.usdt_name)
        def _parse_filter(self, filt):
-                self.filter = self._replace_args(filt)
+                self.filter = self._rewrite_expr(filt)
        def _parse_types(self, fmt):
                for match in re.finditer(
@@ -178,14 +180,14 @@ class Probe(object):
                        return
                action = action.strip()
-                match = re.search(r'(\".*\"),?(.*)', action)
+                match = re.search(r'(\".*?\"),?(.*)', action)
                if match is None:
                        self._bail("expected format string in \"s")
                self.raw_format = match.group(1)
                self._parse_types(self.raw_format)
-                for part in match.group(2).split(','):
+                for part in re.split('(?<!"),', match.group(2)):
-                        part = self._replace_args(part)
+                        part = self._rewrite_expr(part)
                        if len(part) > 0:
                                self.values.append(part)
@@ -204,7 +206,25 @@ class Probe(object):
                "$cpu": "bpf_get_smp_processor_id()"
        }
-        def _replace_args(self, expr):
+        def _generate_streq_function(self, string):
+                fname = "streq_%d" % Probe.streq_index
+                Probe.streq_index += 1
+                self.streq_functions += """
+static inline bool %s(char const *ignored, unsigned long str) {
+        char needle[] = %s;
+        char haystack[sizeof(needle)];
+        bpf_probe_read(&haystack, sizeof(haystack), (void *)str);
+        for (int i = 0; i < sizeof(needle); ++i) {
+                if (needle[i] != haystack[i]) {
+                        return false;
+                }
+        }
+        return true;
+}
+                """ % (fname, string)
+                return fname
+        def _rewrite_expr(self, expr):
                for alias, replacement in Probe.aliases.items():
                        # For USDT probes, we replace argN values with the
                        # actual arguments for that probe obtained using
@@ -212,6 +232,11 @@ class Probe(object):
                        if alias.startswith("arg") and self.probe_type == "u":
                                continue
                        expr = expr.replace(alias, replacement)
+                matches = re.finditer('STRCMP\\(("[^"]+\\")', expr)
+                for match in matches:
+                        string = match.group(1)
+                        fname = self._generate_streq_function(string)
+                        expr = expr.replace("STRCMP", fname, 1)
                return expr
        p_type = {"u": ct.c_uint, "d": ct.c_int,
@@ -405,7 +430,7 @@ BPF_PERF_OUTPUT(%s);
                               self.struct_name, data_fields,
                               stack_trace, self.events_name, ctx_name)
-                return data_decl + "\n" + text
+                return self.streq_functions + data_decl + "\n" + text
        @classmethod
        def _time_off_str(cls, timestamp_ns):
@@ -526,7 +551,7 @@ trace 'p:c:write (arg1 == 1) "writing %d bytes to STDOUT", arg3'
        Trace the write() call from libc to monitor writes to STDOUT
 trace 'r::__kmalloc (retval == 0) "kmalloc failed!"
        Trace returns from __kmalloc which returned a null pointer
-trace 'r:c:malloc (retval) "allocated = %p", retval
+trace 'r:c:malloc (retval) "allocated = %x", retval
        Trace returns from malloc and print non-NULL allocated buffers
 trace 't:block:block_rq_complete "sectors=%d", args->nr_sector'
        Trace the block_rq_complete kernel tracepoint and print # of tx sectors

--- a/tools/trace_example.txt
+++ b/tools/trace_example.txt
@@ -136,6 +136,16 @@ In the previous invocation, arg1 and arg2 are the class name and method name
 for the Ruby method being invoked.
+Occasionally, it can be useful to filter specific strings. For example, you
+might be interested in open() calls that open a specific file:
+# trace 'p:c:open (STRCMP("test.txt", arg1)) "opening %s", arg1'
+TIME     PID    COMM         FUNC             -
+01:43:15 10938  cat          open             opening test.txt
+01:43:20 10939  cat          open             opening test.txt
+^C
 As a final example, let's trace open syscalls for a specific process. By 
 default, tracing is system-wide, but the -p switch overrides this:
@@ -202,7 +212,7 @@ trace 'p:c:write (arg1 == 1) "writing %d bytes to STDOUT", arg3'
        Trace the write() call from libc to monitor writes to STDOUT
 trace 'r::__kmalloc (retval == 0) "kmalloc failed!"
        Trace returns from __kmalloc which returned a null pointer
-trace 'r:c:malloc (retval) "allocated = %p", retval
+trace 'r:c:malloc (retval) "allocated = %x", retval
        Trace returns from malloc and print non-NULL allocated buffers
 trace 't:block:block_rq_complete "sectors=%d", args->nr_sector'
        Trace the block_rq_complete kernel tracepoint and print # of tx sectors