Commit 2b4b2621 authored by Dave Marchevsky's avatar Dave Marchevsky Committed by Daniel Borkmann

selftests/bpf: Add benchmark for local_storage RCU Tasks Trace usage

This benchmark measures grace period latency and kthread cpu usage of
RCU Tasks Trace when many processes are creating/deleting BPF
local_storage. Intent here is to quantify improvement on these metrics
after Paul's recent RCU Tasks patches [0].

Specifically, fork 15k tasks which call a bpf prog that creates/destroys
task local_storage and sleep in a loop, resulting in many
call_rcu_tasks_trace calls.

To determine grace period latency, trace time elapsed between
rcu_tasks_trace_pregp_step and rcu_tasks_trace_postgp; for cpu usage
look at rcu_task_trace_kthread's stime in /proc/PID/stat.

On my virtualized test environment (Skylake, 8 cpus) benchmark results
demonstrate significant improvement:

BEFORE Paul's patches:

  SUMMARY tasks_trace grace period latency        avg 22298.551 us stddev 1302.165 us
  SUMMARY ticks per tasks_trace grace period      avg 2.291 stddev 0.324

AFTER Paul's patches:

  SUMMARY tasks_trace grace period latency        avg 16969.197 us  stddev 2525.053 us
  SUMMARY ticks per tasks_trace grace period      avg 1.146 stddev 0.178

Note that since these patches are not in bpf-next benchmarking was done
by cherry-picking this patch onto rcu tree.

  [0] https://lore.kernel.org/rcu/20220620225402.GA3842369@paulmck-ThinkPad-P17-Gen-1/Signed-off-by: default avatarDave Marchevsky <davemarchevsky@fb.com>
Signed-off-by: default avatarDaniel Borkmann <daniel@iogearbox.net>
Acked-by: default avatarPaul E. McKenney <paulmck@kernel.org>
Acked-by: default avatarMartin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/20220705190018.3239050-1-davemarchevsky@fb.com
parent 935dc35c
...@@ -574,6 +574,7 @@ $(OUTPUT)/bench_bpf_loop.o: $(OUTPUT)/bpf_loop_bench.skel.h ...@@ -574,6 +574,7 @@ $(OUTPUT)/bench_bpf_loop.o: $(OUTPUT)/bpf_loop_bench.skel.h
$(OUTPUT)/bench_strncmp.o: $(OUTPUT)/strncmp_bench.skel.h $(OUTPUT)/bench_strncmp.o: $(OUTPUT)/strncmp_bench.skel.h
$(OUTPUT)/bench_bpf_hashmap_full_update.o: $(OUTPUT)/bpf_hashmap_full_update_bench.skel.h $(OUTPUT)/bench_bpf_hashmap_full_update.o: $(OUTPUT)/bpf_hashmap_full_update_bench.skel.h
$(OUTPUT)/bench_local_storage.o: $(OUTPUT)/local_storage_bench.skel.h $(OUTPUT)/bench_local_storage.o: $(OUTPUT)/local_storage_bench.skel.h
$(OUTPUT)/bench_local_storage_rcu_tasks_trace.o: $(OUTPUT)/local_storage_rcu_tasks_trace_bench.skel.h
$(OUTPUT)/bench.o: bench.h testing_helpers.h $(BPFOBJ) $(OUTPUT)/bench.o: bench.h testing_helpers.h $(BPFOBJ)
$(OUTPUT)/bench: LDLIBS += -lm $(OUTPUT)/bench: LDLIBS += -lm
$(OUTPUT)/bench: $(OUTPUT)/bench.o \ $(OUTPUT)/bench: $(OUTPUT)/bench.o \
...@@ -587,7 +588,8 @@ $(OUTPUT)/bench: $(OUTPUT)/bench.o \ ...@@ -587,7 +588,8 @@ $(OUTPUT)/bench: $(OUTPUT)/bench.o \
$(OUTPUT)/bench_bpf_loop.o \ $(OUTPUT)/bench_bpf_loop.o \
$(OUTPUT)/bench_strncmp.o \ $(OUTPUT)/bench_strncmp.o \
$(OUTPUT)/bench_bpf_hashmap_full_update.o \ $(OUTPUT)/bench_bpf_hashmap_full_update.o \
$(OUTPUT)/bench_local_storage.o $(OUTPUT)/bench_local_storage.o \
$(OUTPUT)/bench_local_storage_rcu_tasks_trace.o
$(call msg,BINARY,,$@) $(call msg,BINARY,,$@)
$(Q)$(CC) $(CFLAGS) $(LDFLAGS) $(filter %.a %.o,$^) $(LDLIBS) -o $@ $(Q)$(CC) $(CFLAGS) $(LDFLAGS) $(filter %.a %.o,$^) $(LDLIBS) -o $@
......
...@@ -79,6 +79,43 @@ void hits_drops_report_progress(int iter, struct bench_res *res, long delta_ns) ...@@ -79,6 +79,43 @@ void hits_drops_report_progress(int iter, struct bench_res *res, long delta_ns)
hits_per_sec, hits_per_prod, drops_per_sec, hits_per_sec + drops_per_sec); hits_per_sec, hits_per_prod, drops_per_sec, hits_per_sec + drops_per_sec);
} }
void
grace_period_latency_basic_stats(struct bench_res res[], int res_cnt, struct basic_stats *gp_stat)
{
int i;
memset(gp_stat, 0, sizeof(struct basic_stats));
for (i = 0; i < res_cnt; i++)
gp_stat->mean += res[i].gp_ns / 1000.0 / (double)res[i].gp_ct / (0.0 + res_cnt);
#define IT_MEAN_DIFF (res[i].gp_ns / 1000.0 / (double)res[i].gp_ct - gp_stat->mean)
if (res_cnt > 1) {
for (i = 0; i < res_cnt; i++)
gp_stat->stddev += (IT_MEAN_DIFF * IT_MEAN_DIFF) / (res_cnt - 1.0);
}
gp_stat->stddev = sqrt(gp_stat->stddev);
#undef IT_MEAN_DIFF
}
void
grace_period_ticks_basic_stats(struct bench_res res[], int res_cnt, struct basic_stats *gp_stat)
{
int i;
memset(gp_stat, 0, sizeof(struct basic_stats));
for (i = 0; i < res_cnt; i++)
gp_stat->mean += res[i].stime / (double)res[i].gp_ct / (0.0 + res_cnt);
#define IT_MEAN_DIFF (res[i].stime / (double)res[i].gp_ct - gp_stat->mean)
if (res_cnt > 1) {
for (i = 0; i < res_cnt; i++)
gp_stat->stddev += (IT_MEAN_DIFF * IT_MEAN_DIFF) / (res_cnt - 1.0);
}
gp_stat->stddev = sqrt(gp_stat->stddev);
#undef IT_MEAN_DIFF
}
void hits_drops_report_final(struct bench_res res[], int res_cnt) void hits_drops_report_final(struct bench_res res[], int res_cnt)
{ {
int i; int i;
...@@ -236,6 +273,7 @@ extern struct argp bench_ringbufs_argp; ...@@ -236,6 +273,7 @@ extern struct argp bench_ringbufs_argp;
extern struct argp bench_bloom_map_argp; extern struct argp bench_bloom_map_argp;
extern struct argp bench_bpf_loop_argp; extern struct argp bench_bpf_loop_argp;
extern struct argp bench_local_storage_argp; extern struct argp bench_local_storage_argp;
extern struct argp bench_local_storage_rcu_tasks_trace_argp;
extern struct argp bench_strncmp_argp; extern struct argp bench_strncmp_argp;
static const struct argp_child bench_parsers[] = { static const struct argp_child bench_parsers[] = {
...@@ -244,6 +282,8 @@ static const struct argp_child bench_parsers[] = { ...@@ -244,6 +282,8 @@ static const struct argp_child bench_parsers[] = {
{ &bench_bpf_loop_argp, 0, "bpf_loop helper benchmark", 0 }, { &bench_bpf_loop_argp, 0, "bpf_loop helper benchmark", 0 },
{ &bench_local_storage_argp, 0, "local_storage benchmark", 0 }, { &bench_local_storage_argp, 0, "local_storage benchmark", 0 },
{ &bench_strncmp_argp, 0, "bpf_strncmp helper benchmark", 0 }, { &bench_strncmp_argp, 0, "bpf_strncmp helper benchmark", 0 },
{ &bench_local_storage_rcu_tasks_trace_argp, 0,
"local_storage RCU Tasks Trace slowdown benchmark", 0 },
{}, {},
}; };
...@@ -449,6 +489,7 @@ extern const struct bench bench_bpf_hashmap_full_update; ...@@ -449,6 +489,7 @@ extern const struct bench bench_bpf_hashmap_full_update;
extern const struct bench bench_local_storage_cache_seq_get; extern const struct bench bench_local_storage_cache_seq_get;
extern const struct bench bench_local_storage_cache_interleaved_get; extern const struct bench bench_local_storage_cache_interleaved_get;
extern const struct bench bench_local_storage_cache_hashmap_control; extern const struct bench bench_local_storage_cache_hashmap_control;
extern const struct bench bench_local_storage_tasks_trace;
static const struct bench *benchs[] = { static const struct bench *benchs[] = {
&bench_count_global, &bench_count_global,
...@@ -487,6 +528,7 @@ static const struct bench *benchs[] = { ...@@ -487,6 +528,7 @@ static const struct bench *benchs[] = {
&bench_local_storage_cache_seq_get, &bench_local_storage_cache_seq_get,
&bench_local_storage_cache_interleaved_get, &bench_local_storage_cache_interleaved_get,
&bench_local_storage_cache_hashmap_control, &bench_local_storage_cache_hashmap_control,
&bench_local_storage_tasks_trace,
}; };
static void setup_benchmark() static void setup_benchmark()
......
...@@ -30,11 +30,19 @@ struct env { ...@@ -30,11 +30,19 @@ struct env {
struct cpu_set cons_cpus; struct cpu_set cons_cpus;
}; };
struct basic_stats {
double mean;
double stddev;
};
struct bench_res { struct bench_res {
long hits; long hits;
long drops; long drops;
long false_hits; long false_hits;
long important_hits; long important_hits;
unsigned long gp_ns;
unsigned long gp_ct;
unsigned int stime;
}; };
struct bench { struct bench {
...@@ -65,6 +73,10 @@ void ops_report_final(struct bench_res res[], int res_cnt); ...@@ -65,6 +73,10 @@ void ops_report_final(struct bench_res res[], int res_cnt);
void local_storage_report_progress(int iter, struct bench_res *res, void local_storage_report_progress(int iter, struct bench_res *res,
long delta_ns); long delta_ns);
void local_storage_report_final(struct bench_res res[], int res_cnt); void local_storage_report_final(struct bench_res res[], int res_cnt);
void grace_period_latency_basic_stats(struct bench_res res[], int res_cnt,
struct basic_stats *gp_stat);
void grace_period_ticks_basic_stats(struct bench_res res[], int res_cnt,
struct basic_stats *gp_stat);
static inline __u64 get_time_ns(void) static inline __u64 get_time_ns(void)
{ {
......
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */
#include <argp.h>
#include <sys/prctl.h>
#include "local_storage_rcu_tasks_trace_bench.skel.h"
#include "bench.h"
#include <signal.h>
static struct {
__u32 nr_procs;
__u32 kthread_pid;
bool quiet;
} args = {
.nr_procs = 1000,
.kthread_pid = 0,
.quiet = false,
};
enum {
ARG_NR_PROCS = 7000,
ARG_KTHREAD_PID = 7001,
ARG_QUIET = 7002,
};
static const struct argp_option opts[] = {
{ "nr_procs", ARG_NR_PROCS, "NR_PROCS", 0,
"Set number of user processes to spin up"},
{ "kthread_pid", ARG_KTHREAD_PID, "PID", 0,
"Pid of rcu_tasks_trace kthread for ticks tracking"},
{ "quiet", ARG_QUIET, "{0,1}", 0,
"If true, don't report progress"},
{},
};
static error_t parse_arg(int key, char *arg, struct argp_state *state)
{
long ret;
switch (key) {
case ARG_NR_PROCS:
ret = strtol(arg, NULL, 10);
if (ret < 1 || ret > UINT_MAX) {
fprintf(stderr, "invalid nr_procs\n");
argp_usage(state);
}
args.nr_procs = ret;
break;
case ARG_KTHREAD_PID:
ret = strtol(arg, NULL, 10);
if (ret < 1) {
fprintf(stderr, "invalid kthread_pid\n");
argp_usage(state);
}
args.kthread_pid = ret;
break;
case ARG_QUIET:
ret = strtol(arg, NULL, 10);
if (ret < 0 || ret > 1) {
fprintf(stderr, "invalid quiet %ld\n", ret);
argp_usage(state);
}
args.quiet = ret;
break;
break;
default:
return ARGP_ERR_UNKNOWN;
}
return 0;
}
const struct argp bench_local_storage_rcu_tasks_trace_argp = {
.options = opts,
.parser = parse_arg,
};
#define MAX_SLEEP_PROCS 150000
static void validate(void)
{
if (env.producer_cnt != 1) {
fprintf(stderr, "benchmark doesn't support multi-producer!\n");
exit(1);
}
if (env.consumer_cnt != 1) {
fprintf(stderr, "benchmark doesn't support multi-consumer!\n");
exit(1);
}
if (args.nr_procs > MAX_SLEEP_PROCS) {
fprintf(stderr, "benchmark supports up to %u sleeper procs!\n",
MAX_SLEEP_PROCS);
exit(1);
}
}
static long kthread_pid_ticks(void)
{
char procfs_path[100];
long stime;
FILE *f;
if (!args.kthread_pid)
return -1;
sprintf(procfs_path, "/proc/%u/stat", args.kthread_pid);
f = fopen(procfs_path, "r");
if (!f) {
fprintf(stderr, "couldn't open %s, exiting\n", procfs_path);
goto err_out;
}
if (fscanf(f, "%*s %*s %*s %*s %*s %*s %*s %*s %*s %*s %*s %*s %*s %*s %ld", &stime) != 1) {
fprintf(stderr, "fscanf of %s failed, exiting\n", procfs_path);
goto err_out;
}
fclose(f);
return stime;
err_out:
if (f)
fclose(f);
exit(1);
return 0;
}
static struct {
struct local_storage_rcu_tasks_trace_bench *skel;
long prev_kthread_stime;
} ctx;
static void sleep_and_loop(void)
{
while (true) {
sleep(rand() % 4);
syscall(__NR_getpgid);
}
}
static void local_storage_tasks_trace_setup(void)
{
int i, err, forkret, runner_pid;
runner_pid = getpid();
for (i = 0; i < args.nr_procs; i++) {
forkret = fork();
if (forkret < 0) {
fprintf(stderr, "Error forking sleeper proc %u of %u, exiting\n", i,
args.nr_procs);
goto err_out;
}
if (!forkret) {
err = prctl(PR_SET_PDEATHSIG, SIGKILL);
if (err < 0) {
fprintf(stderr, "prctl failed with err %d, exiting\n", errno);
goto err_out;
}
if (getppid() != runner_pid) {
fprintf(stderr, "Runner died while spinning up procs, exiting\n");
goto err_out;
}
sleep_and_loop();
}
}
printf("Spun up %u procs (our pid %d)\n", args.nr_procs, runner_pid);
setup_libbpf();
ctx.skel = local_storage_rcu_tasks_trace_bench__open_and_load();
if (!ctx.skel) {
fprintf(stderr, "Error doing open_and_load, exiting\n");
goto err_out;
}
ctx.prev_kthread_stime = kthread_pid_ticks();
if (!bpf_program__attach(ctx.skel->progs.get_local)) {
fprintf(stderr, "Error attaching bpf program\n");
goto err_out;
}
if (!bpf_program__attach(ctx.skel->progs.pregp_step)) {
fprintf(stderr, "Error attaching bpf program\n");
goto err_out;
}
if (!bpf_program__attach(ctx.skel->progs.postgp)) {
fprintf(stderr, "Error attaching bpf program\n");
goto err_out;
}
return;
err_out:
exit(1);
}
static void measure(struct bench_res *res)
{
long ticks;
res->gp_ct = atomic_swap(&ctx.skel->bss->gp_hits, 0);
res->gp_ns = atomic_swap(&ctx.skel->bss->gp_times, 0);
ticks = kthread_pid_ticks();
res->stime = ticks - ctx.prev_kthread_stime;
ctx.prev_kthread_stime = ticks;
}
static void *consumer(void *input)
{
return NULL;
}
static void *producer(void *input)
{
while (true)
syscall(__NR_getpgid);
return NULL;
}
static void report_progress(int iter, struct bench_res *res, long delta_ns)
{
if (ctx.skel->bss->unexpected) {
fprintf(stderr, "Error: Unexpected order of bpf prog calls (postgp after pregp).");
fprintf(stderr, "Data can't be trusted, exiting\n");
exit(1);
}
if (args.quiet)
return;
printf("Iter %d\t avg tasks_trace grace period latency\t%lf ns\n",
iter, res->gp_ns / (double)res->gp_ct);
printf("Iter %d\t avg ticks per tasks_trace grace period\t%lf\n",
iter, res->stime / (double)res->gp_ct);
}
static void report_final(struct bench_res res[], int res_cnt)
{
struct basic_stats gp_stat;
grace_period_latency_basic_stats(res, res_cnt, &gp_stat);
printf("SUMMARY tasks_trace grace period latency");
printf("\tavg %.3lf us\tstddev %.3lf us\n", gp_stat.mean, gp_stat.stddev);
grace_period_ticks_basic_stats(res, res_cnt, &gp_stat);
printf("SUMMARY ticks per tasks_trace grace period");
printf("\tavg %.3lf\tstddev %.3lf\n", gp_stat.mean, gp_stat.stddev);
}
/* local-storage-tasks-trace: Benchmark performance of BPF local_storage's use
* of RCU Tasks-Trace.
*
* Stress RCU Tasks Trace by forking many tasks, all of which do no work aside
* from sleep() loop, and creating/destroying BPF task-local storage on wakeup.
* The number of forked tasks is configurable.
*
* exercising code paths which call call_rcu_tasks_trace while there are many
* thousands of tasks on the system should result in RCU Tasks-Trace having to
* do a noticeable amount of work.
*
* This should be observable by measuring rcu_tasks_trace_kthread CPU usage
* after the grace period has ended, or by measuring grace period latency.
*
* This benchmark uses both approaches, attaching to rcu_tasks_trace_pregp_step
* and rcu_tasks_trace_postgp functions to measure grace period latency and
* using /proc/PID/stat to measure rcu_tasks_trace_kthread kernel ticks
*/
const struct bench bench_local_storage_tasks_trace = {
.name = "local-storage-tasks-trace",
.validate = validate,
.setup = local_storage_tasks_trace_setup,
.producer_thread = producer,
.consumer_thread = consumer,
.measure = measure,
.report_progress = report_progress,
.report_final = report_final,
};
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
kthread_pid=`pgrep rcu_tasks_trace_kthread`
if [ -z $kthread_pid ]; then
echo "error: Couldn't find rcu_tasks_trace_kthread"
exit 1
fi
./bench --nr_procs 15000 --kthread_pid $kthread_pid -d 600 --quiet 1 local-storage-tasks-trace
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include "bpf_misc.h"
struct {
__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
__uint(map_flags, BPF_F_NO_PREALLOC);
__type(key, int);
__type(value, int);
} task_storage SEC(".maps");
long hits;
long gp_hits;
long gp_times;
long current_gp_start;
long unexpected;
bool postgp_seen;
SEC("fentry/" SYS_PREFIX "sys_getpgid")
int get_local(void *ctx)
{
struct task_struct *task;
int idx;
int *s;
idx = 0;
task = bpf_get_current_task_btf();
s = bpf_task_storage_get(&task_storage, task, &idx,
BPF_LOCAL_STORAGE_GET_F_CREATE);
if (!s)
return 0;
*s = 3;
bpf_task_storage_delete(&task_storage, task);
__sync_add_and_fetch(&hits, 1);
return 0;
}
SEC("fentry/rcu_tasks_trace_pregp_step")
int pregp_step(struct pt_regs *ctx)
{
current_gp_start = bpf_ktime_get_ns();
return 0;
}
SEC("fentry/rcu_tasks_trace_postgp")
int postgp(struct pt_regs *ctx)
{
if (!current_gp_start && postgp_seen) {
/* Will only happen if prog tracing rcu_tasks_trace_pregp_step doesn't
* execute before this prog
*/
__sync_add_and_fetch(&unexpected, 1);
return 0;
}
__sync_add_and_fetch(&gp_times, bpf_ktime_get_ns() - current_gp_start);
__sync_add_and_fetch(&gp_hits, 1);
current_gp_start = 0;
postgp_seen = true;
return 0;
}
char _license[] SEC("license") = "GPL";
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment