Commit b9b6a2ea authored by Arnaldo Carvalho de Melo

perf trace: Do not hardcode the size of the tracepoint common_ fields

We shouldn't hardcode the size of the tracepoint common_ fields, use the
offset of the 'id'/'__syscall_nr' field in the sys_enter event instead.

This caused the augmented syscalls code to fail on a particular build of a
PREEMPT_RT_FULL kernel where these extra 'common_migrate_disable' and
'common_padding' fields were before the syscall id one:

  # cat /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/format
  name: sys_enter
  ID: 22
  format:
	field:unsigned short common_type;	offset:0;	size:2;	signed:0;
	field:unsigned char common_flags;	offset:2;	size:1;	signed:0;
	field:unsigned char common_preempt_count;	offset:3;	size:1;	signed:0;
	field:int common_pid;	offset:4;	size:4;	signed:1;
	field:unsigned short common_migrate_disable;	offset:8;	size:2;	signed:0;
	field:unsigned short common_padding;	offset:10;	size:2;	signed:0;

	field:long id;	offset:16;	size:8;	signed:1;
	field:unsigned long args[6];	offset:24;	size:48;	signed:0;

  print fmt: "NR %ld (%lx, %lx, %lx, %lx, %lx, %lx)", REC->id, REC->args[0], REC->args[1], REC->args[2], REC->args[3], REC->args[4], REC->args[5]
  #

All those 'common_' prefixed fields are zeroed when they hit a BPF tracepoint
hook, so we had better just discard them, i.e. somehow pass an offset to the
BPF program from the start of the ctx and make adjustments in the 'perf trace'
handlers to adjust the offset of the syscall arg offsets obtained from tracefs.

Till then, fix it the quick way and add this to the augmented_raw_syscalls.c to
get it to work in such kernels:

  diff --git a/tools/perf/examples/bpf/augmented_raw_syscalls.c b/tools/perf/examples/bpf/augmented_raw_syscalls.c
  index 53c233370fae..1f746f931e13 100644
  --- a/tools/perf/examples/bpf/augmented_raw_syscalls.c
  +++ b/tools/perf/examples/bpf/augmented_raw_syscalls.c
  @@ -38,12 +38,14 @@ struct bpf_map SEC("maps") syscalls = {

   struct syscall_enter_args {
          unsigned long long common_tp_fields;
  +       long               rt_common_tp_fields;
          long               syscall_nr;
          unsigned long      args[6];
   };

   struct syscall_exit_args {
          unsigned long long common_tp_fields;
  +       long               rt_common_tp_fields;
          long               syscall_nr;
          long               ret;
   };

Just to check that this was the case. Fix it properly later, for now remove the
hardcoding of the offset in the 'perf trace' side and document the situation
with this patch.

Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Luis Cláudio Gonçalves <lclaudio@redhat.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Wang Nan <wangnan0@huawei.com>
Link: https://lkml.kernel.org/n/tip-2pqavrktqkliu5b9nzouio21@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
parent 14541b1e
...@@ -112,8 +112,9 @@ struct trace { ...@@ -112,8 +112,9 @@ struct trace {
} stats; } stats;
unsigned int max_stack; unsigned int max_stack;
unsigned int min_stack; unsigned int min_stack;
bool sort_events; int raw_augmented_syscalls_args_size;
bool raw_augmented_syscalls; bool raw_augmented_syscalls;
bool sort_events;
bool not_ev_qualifier; bool not_ev_qualifier;
bool live; bool live;
bool full_time; bool full_time;
...@@ -283,12 +284,17 @@ static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel) ...@@ -283,12 +284,17 @@ static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel)
return -ENOENT; return -ENOENT;
} }
static int perf_evsel__init_augmented_syscall_tp(struct perf_evsel *evsel) static int perf_evsel__init_augmented_syscall_tp(struct perf_evsel *evsel, struct perf_evsel *tp)
{ {
struct syscall_tp *sc = evsel->priv = malloc(sizeof(struct syscall_tp)); struct syscall_tp *sc = evsel->priv = malloc(sizeof(struct syscall_tp));
if (evsel->priv != NULL) { /* field, sizeof_field, offsetof_field */ if (evsel->priv != NULL) {
if (__tp_field__init_uint(&sc->id, sizeof(long), sizeof(long long), evsel->needs_swap)) struct tep_format_field *syscall_id = perf_evsel__field(tp, "id");
if (syscall_id == NULL)
syscall_id = perf_evsel__field(tp, "__syscall_nr");
if (syscall_id == NULL)
goto out_delete;
if (__tp_field__init_uint(&sc->id, syscall_id->size, syscall_id->offset, evsel->needs_swap))
goto out_delete; goto out_delete;
return 0; return 0;
...@@ -1768,16 +1774,16 @@ static int trace__fprintf_sample(struct trace *trace, struct perf_evsel *evsel, ...@@ -1768,16 +1774,16 @@ static int trace__fprintf_sample(struct trace *trace, struct perf_evsel *evsel,
return printed; return printed;
} }
static void *syscall__augmented_args(struct syscall *sc, struct perf_sample *sample, int *augmented_args_size, bool raw_augmented) static void *syscall__augmented_args(struct syscall *sc, struct perf_sample *sample, int *augmented_args_size, int raw_augmented_args_size)
{ {
void *augmented_args = NULL; void *augmented_args = NULL;
/* /*
* For now with BPF raw_augmented we hook into raw_syscalls:sys_enter * For now with BPF raw_augmented we hook into raw_syscalls:sys_enter
* and there we get all 6 syscall args plus the tracepoint common * and there we get all 6 syscall args plus the tracepoint common fields
* fields (sizeof(long)) and the syscall_nr (another long). So we check * that gets calculated at the start and the syscall_nr (another long).
* if that is the case and if so don't look after the sc->args_size, * So we check if that is the case and if so don't look after the
* but always after the full raw_syscalls:sys_enter payload, which is * sc->args_size but always after the full raw_syscalls:sys_enter payload,
* fixed. * which is fixed.
* *
* We'll revisit this later to pass s->args_size to the BPF augmenter * We'll revisit this later to pass s->args_size to the BPF augmenter
* (now tools/perf/examples/bpf/augmented_raw_syscalls.c, so that it * (now tools/perf/examples/bpf/augmented_raw_syscalls.c, so that it
...@@ -1785,7 +1791,7 @@ static void *syscall__augmented_args(struct syscall *sc, struct perf_sample *sam ...@@ -1785,7 +1791,7 @@ static void *syscall__augmented_args(struct syscall *sc, struct perf_sample *sam
* use syscalls:sys_enter_NAME, so that we reduce the kernel/userspace * use syscalls:sys_enter_NAME, so that we reduce the kernel/userspace
* traffic to just what is needed for each syscall. * traffic to just what is needed for each syscall.
*/ */
int args_size = raw_augmented ? (8 * (int)sizeof(long)) : sc->args_size; int args_size = raw_augmented_args_size ?: sc->args_size;
*augmented_args_size = sample->raw_size - args_size; *augmented_args_size = sample->raw_size - args_size;
if (*augmented_args_size > 0) if (*augmented_args_size > 0)
...@@ -1839,7 +1845,7 @@ static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel, ...@@ -1839,7 +1845,7 @@ static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
* here and avoid using augmented syscalls when the evsel is the raw_syscalls one. * here and avoid using augmented syscalls when the evsel is the raw_syscalls one.
*/ */
if (evsel != trace->syscalls.events.sys_enter) if (evsel != trace->syscalls.events.sys_enter)
augmented_args = syscall__augmented_args(sc, sample, &augmented_args_size, trace->raw_augmented_syscalls); augmented_args = syscall__augmented_args(sc, sample, &augmented_args_size, trace->raw_augmented_syscalls_args_size);
ttrace->entry_time = sample->time; ttrace->entry_time = sample->time;
msg = ttrace->entry_str; msg = ttrace->entry_str;
printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name); printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);
...@@ -1897,7 +1903,7 @@ static int trace__fprintf_sys_enter(struct trace *trace, struct perf_evsel *evse ...@@ -1897,7 +1903,7 @@ static int trace__fprintf_sys_enter(struct trace *trace, struct perf_evsel *evse
goto out_put; goto out_put;
args = perf_evsel__sc_tp_ptr(evsel, args, sample); args = perf_evsel__sc_tp_ptr(evsel, args, sample);
augmented_args = syscall__augmented_args(sc, sample, &augmented_args_size, trace->raw_augmented_syscalls); augmented_args = syscall__augmented_args(sc, sample, &augmented_args_size, trace->raw_augmented_syscalls_args_size);
syscall__scnprintf_args(sc, msg, sizeof(msg), args, augmented_args, augmented_args_size, trace, thread); syscall__scnprintf_args(sc, msg, sizeof(msg), args, augmented_args, augmented_args_size, trace, thread);
fprintf(trace->output, "%s", msg); fprintf(trace->output, "%s", msg);
err = 0; err = 0;
...@@ -3814,13 +3820,6 @@ int cmd_trace(int argc, const char **argv) ...@@ -3814,13 +3820,6 @@ int cmd_trace(int argc, const char **argv)
* syscall. * syscall.
*/ */
if (trace.syscalls.events.augmented) { if (trace.syscalls.events.augmented) {
evsel = trace.syscalls.events.augmented;
if (perf_evsel__init_augmented_syscall_tp(evsel) ||
perf_evsel__init_augmented_syscall_tp_args(evsel))
goto out;
evsel->handler = trace__sys_enter;
evlist__for_each_entry(trace.evlist, evsel) { evlist__for_each_entry(trace.evlist, evsel) {
bool raw_syscalls_sys_exit = strcmp(perf_evsel__name(evsel), "raw_syscalls:sys_exit") == 0; bool raw_syscalls_sys_exit = strcmp(perf_evsel__name(evsel), "raw_syscalls:sys_exit") == 0;
...@@ -3829,9 +3828,41 @@ int cmd_trace(int argc, const char **argv) ...@@ -3829,9 +3828,41 @@ int cmd_trace(int argc, const char **argv)
goto init_augmented_syscall_tp; goto init_augmented_syscall_tp;
} }
if (strcmp(perf_evsel__name(evsel), "raw_syscalls:sys_enter") == 0) {
struct perf_evsel *augmented = trace.syscalls.events.augmented;
if (perf_evsel__init_augmented_syscall_tp(augmented, evsel) ||
perf_evsel__init_augmented_syscall_tp_args(augmented))
goto out;
augmented->handler = trace__sys_enter;
}
if (strstarts(perf_evsel__name(evsel), "syscalls:sys_exit_")) { if (strstarts(perf_evsel__name(evsel), "syscalls:sys_exit_")) {
struct syscall_tp *sc;
init_augmented_syscall_tp: init_augmented_syscall_tp:
perf_evsel__init_augmented_syscall_tp(evsel); if (perf_evsel__init_augmented_syscall_tp(evsel, evsel))
goto out;
sc = evsel->priv;
/*
* For now with BPF raw_augmented we hook into
* raw_syscalls:sys_enter and there we get all
* 6 syscall args plus the tracepoint common
* fields and the syscall_nr (another long).
* So we check if that is the case and if so
* don't look after the sc->args_size but
* always after the full raw_syscalls:sys_enter
* payload, which is fixed.
*
* We'll revisit this later to pass
* s->args_size to the BPF augmenter (now
* tools/perf/examples/bpf/augmented_raw_syscalls.c,
* so that it copies only what we need for each
* syscall, like what happens when we use
* syscalls:sys_enter_NAME, so that we reduce
* the kernel/userspace traffic to just what is
* needed for each syscall.
*/
if (trace.raw_augmented_syscalls)
trace.raw_augmented_syscalls_args_size = (6 + 1) * sizeof(long) + sc->id.offset;
perf_evsel__init_augmented_syscall_tp_ret(evsel); perf_evsel__init_augmented_syscall_tp_ret(evsel);
evsel->handler = trace__sys_exit; evsel->handler = trace__sys_exit;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment