Commit 32b8af82, authored by Jiri Olsa, committed by Arnaldo Carvalho de Melo

perf stat: Introduce --per-thread option

Currently the values of all tasks given through the -p option's PID
arguments get aggregated and printed as single values.

Add the --per-thread option to print values per thread.

  $ perf stat  -e cycles,instructions --per-thread -p 30190,30242
  ^C
   Performance counter stats for process id '30190,30242':

               cat-30190                     0      cycles
               yes-30242         3,842,525,421      cycles
               cat-30190                     0      instructions
               yes-30242        10,370,817,010      instructions

         1.143155657 seconds time elapsed

Also works under interval mode:

  $ perf stat  -e cycles,instructions --per-thread -p 30190,30242 -I 1000
  #           time             comm-pid                  counts unit events
       1.000073435              cat-30190                89,058      cycles
       1.000073435              yes-30242         3,360,786,902      cycles                     (100.00%)
       1.000073435              cat-30190                14,066      instructions
       1.000073435              yes-30242         9,069,937,462      instructions
       2.000204830              cat-30190                     0      cycles
       2.000204830              yes-30242         3,351,667,626      cycles
       2.000204830              cat-30190                     0      instructions
       2.000204830              yes-30242         9,045,796,885      instructions
  ^C     2.771286639              cat-30190                     0      cycles
       2.771286639              yes-30242         2,593,884,166      cycles
       2.771286639              cat-30190                     0      instructions
       2.771286639              yes-30242         7,001,171,191      instructions

It works only with the -t and -p options; otherwise the following error
is printed:

  $ perf stat  -e cycles --per-thread  -I 1000 ls
  The --per-thread option is only available when monitoring via -p -t options.
      -p, --pid <pid>       stat events on existing process id
      -t, --tid <tid>       stat events on existing thread id
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: David Ahern <dsahern@gmail.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Stephane Eranian <eranian@google.com>
Link: http://lkml.kernel.org/r/1435310967-14570-23-git-send-email-jolsa@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
parent d4f63a47
...@@ -144,6 +144,10 @@ is a useful mode to detect imbalance between physical cores. To enable this mod ...@@ -144,6 +144,10 @@ is a useful mode to detect imbalance between physical cores. To enable this mod
use --per-core in addition to -a. (system-wide). The output includes the use --per-core in addition to -a. (system-wide). The output includes the
core number and the number of online logical processors on that physical processor. core number and the number of online logical processors on that physical processor.
--per-thread::
Aggregate counts per monitored thread, when monitoring threads (-t option)
or processes (-p option).
-D msecs:: -D msecs::
--delay msecs:: --delay msecs::
After starting the program, wait msecs before measuring. This is useful to After starting the program, wait msecs before measuring. This is useful to
......
...@@ -231,6 +231,7 @@ process_counter_values(struct perf_evsel *evsel, int cpu, int thread, ...@@ -231,6 +231,7 @@ process_counter_values(struct perf_evsel *evsel, int cpu, int thread,
count = &zero; count = &zero;
switch (aggr_mode) { switch (aggr_mode) {
case AGGR_THREAD:
case AGGR_CORE: case AGGR_CORE:
case AGGR_SOCKET: case AGGR_SOCKET:
case AGGR_NONE: case AGGR_NONE:
...@@ -602,6 +603,14 @@ static void aggr_printout(struct perf_evsel *evsel, int id, int nr) ...@@ -602,6 +603,14 @@ static void aggr_printout(struct perf_evsel *evsel, int id, int nr)
csv_output ? 0 : -4, csv_output ? 0 : -4,
perf_evsel__cpus(evsel)->map[id], csv_sep); perf_evsel__cpus(evsel)->map[id], csv_sep);
break; break;
case AGGR_THREAD:
fprintf(output, "%*s-%*d%s",
csv_output ? 0 : 16,
thread_map__comm(evsel->threads, id),
csv_output ? 0 : -8,
thread_map__pid(evsel->threads, id),
csv_sep);
break;
case AGGR_GLOBAL: case AGGR_GLOBAL:
default: default:
break; break;
...@@ -750,6 +759,40 @@ static void print_aggr(char *prefix) ...@@ -750,6 +759,40 @@ static void print_aggr(char *prefix)
} }
} }
/*
 * Print one output line per monitored thread for the given counter.
 *
 * For every thread index in counter->threads, the val/ena/run values are
 * summed over every cpu index in counter->cpus. The summed value is
 * multiplied by counter->scale and handed to nsec_printout() or
 * abs_printout(), after which print_noise() (skipped for CSV output) and
 * print_running() are called and the line is terminated with '\n'.
 *
 * @counter: event whose per-thread counts get printed
 * @prefix:  optional string written at the start of every line, may be NULL
 */
static void print_aggr_thread(struct perf_evsel *counter, char *prefix)
{
	int thread_cnt = thread_map__nr(counter->threads);
	int cpu_cnt = cpu_map__nr(counter->cpus);
	int t;

	for (t = 0; t < thread_cnt; t++) {
		u64 val_sum = 0, ena_sum = 0, run_sum = 0;
		double scaled;
		int c;

		/* Fold this thread's counts from all cpus into single totals. */
		for (c = 0; c < cpu_cnt; c++) {
			val_sum += perf_counts(counter->counts, c, t)->val;
			ena_sum += perf_counts(counter->counts, c, t)->ena;
			run_sum += perf_counts(counter->counts, c, t)->run;
		}

		if (prefix)
			fputs(prefix, output);

		scaled = val_sum * counter->scale;

		/* The thread index is passed as the id argument of the printout helpers. */
		if (!nsec_counter(counter))
			abs_printout(t, 0, counter, scaled);
		else
			nsec_printout(t, 0, counter, scaled);

		if (!csv_output)
			print_noise(counter, 1.0);

		print_running(run_sum, ena_sum);
		fputc('\n', output);
	}
}
/* /*
* Print out the results of a single counter: * Print out the results of a single counter:
* aggregated counts in system-wide mode * aggregated counts in system-wide mode
...@@ -876,6 +919,9 @@ static void print_interval(char *prefix, struct timespec *ts) ...@@ -876,6 +919,9 @@ static void print_interval(char *prefix, struct timespec *ts)
case AGGR_NONE: case AGGR_NONE:
fprintf(output, "# time CPU counts %*s events\n", unit_width, "unit"); fprintf(output, "# time CPU counts %*s events\n", unit_width, "unit");
break; break;
case AGGR_THREAD:
fprintf(output, "# time comm-pid counts %*s events\n", unit_width, "unit");
break;
case AGGR_GLOBAL: case AGGR_GLOBAL:
default: default:
fprintf(output, "# time counts %*s events\n", unit_width, "unit"); fprintf(output, "# time counts %*s events\n", unit_width, "unit");
...@@ -944,6 +990,10 @@ static void print_counters(struct timespec *ts, int argc, const char **argv) ...@@ -944,6 +990,10 @@ static void print_counters(struct timespec *ts, int argc, const char **argv)
case AGGR_SOCKET: case AGGR_SOCKET:
print_aggr(prefix); print_aggr(prefix);
break; break;
case AGGR_THREAD:
evlist__for_each(evsel_list, counter)
print_aggr_thread(counter, prefix);
break;
case AGGR_GLOBAL: case AGGR_GLOBAL:
evlist__for_each(evsel_list, counter) evlist__for_each(evsel_list, counter)
print_counter_aggr(counter, prefix); print_counter_aggr(counter, prefix);
...@@ -1031,6 +1081,7 @@ static int perf_stat_init_aggr_mode(void) ...@@ -1031,6 +1081,7 @@ static int perf_stat_init_aggr_mode(void)
break; break;
case AGGR_NONE: case AGGR_NONE:
case AGGR_GLOBAL: case AGGR_GLOBAL:
case AGGR_THREAD:
default: default:
break; break;
} }
...@@ -1255,6 +1306,8 @@ int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused) ...@@ -1255,6 +1306,8 @@ int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused)
"aggregate counts per processor socket", AGGR_SOCKET), "aggregate counts per processor socket", AGGR_SOCKET),
OPT_SET_UINT(0, "per-core", &aggr_mode, OPT_SET_UINT(0, "per-core", &aggr_mode,
"aggregate counts per physical processor core", AGGR_CORE), "aggregate counts per physical processor core", AGGR_CORE),
OPT_SET_UINT(0, "per-thread", &aggr_mode,
"aggregate counts per thread", AGGR_THREAD),
OPT_UINTEGER('D', "delay", &initial_delay, OPT_UINTEGER('D', "delay", &initial_delay,
"ms to wait before starting measurement after program start"), "ms to wait before starting measurement after program start"),
OPT_END() OPT_END()
...@@ -1346,8 +1399,19 @@ int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused) ...@@ -1346,8 +1399,19 @@ int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused)
run_count = 1; run_count = 1;
} }
/* no_aggr, cgroup are for system-wide only */ if ((aggr_mode == AGGR_THREAD) && !target__has_task(&target)) {
if ((aggr_mode != AGGR_GLOBAL || nr_cgroups) && fprintf(stderr, "The --per-thread option is only available "
"when monitoring via -p -t options.\n");
parse_options_usage(NULL, options, "p", 1);
parse_options_usage(NULL, options, "t", 1);
goto out;
}
/*
* no_aggr, cgroup are for system-wide only
* --per-thread is aggregated per thread, we dont mix it with cpu mode
*/
if (((aggr_mode != AGGR_GLOBAL && aggr_mode != AGGR_THREAD) || nr_cgroups) &&
!target__has_cpu(&target)) { !target__has_cpu(&target)) {
fprintf(stderr, "both cgroup and no-aggregation " fprintf(stderr, "both cgroup and no-aggregation "
"modes only available in system-wide mode\n"); "modes only available in system-wide mode\n");
...@@ -1375,6 +1439,14 @@ int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused) ...@@ -1375,6 +1439,14 @@ int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused)
} }
goto out; goto out;
} }
/*
* Initialize thread_map with comm names,
* so we could print it out on output.
*/
if (aggr_mode == AGGR_THREAD)
thread_map__read_comms(evsel_list->threads);
if (interval && interval < 100) { if (interval && interval < 100) {
pr_err("print interval must be >= 100ms\n"); pr_err("print interval must be >= 100ms\n");
parse_options_usage(stat_usage, options, "I", 1); parse_options_usage(stat_usage, options, "I", 1);
......
...@@ -30,6 +30,7 @@ enum aggr_mode { ...@@ -30,6 +30,7 @@ enum aggr_mode {
AGGR_GLOBAL, AGGR_GLOBAL,
AGGR_SOCKET, AGGR_SOCKET,
AGGR_CORE, AGGR_CORE,
AGGR_THREAD,
}; };
struct perf_counts_values { struct perf_counts_values {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment