Commit 78d0b161 authored by Andrii Nakryiko's avatar Andrii Nakryiko Committed by Masami Hiramatsu (Google)

objpool: cache nr_possible_cpus() and avoid caching nr_cpu_ids

Profiling shows that calling nr_possible_cpus() in objpool_pop() takes
a noticeable amount of CPU (when profiled on 80-core machine), as we
need to recalculate number of set bits in a CPU bit mask. This number
can't change, so there is no point in paying the price for recalculating
it. As such, cache this value in struct objpool_head and use it in
objpool_pop().

On the other hand, cached pool->nr_cpus isn't necessary, as it's not
used in hot path and is also a pretty trivial value to retrieve. So drop
pool->nr_cpus in favor of using nr_cpu_ids everywhere. This way the size
of struct objpool_head remains the same, which is a nice bonus.

Same BPF selftests benchmarks were used to evaluate the effect. Using
changes in previous patch (inlining of objpool_pop/objpool_push) as
baseline, here are the differences:

BASELINE
========
kretprobe      :    9.937 ± 0.174M/s
kretprobe-multi:   10.440 ± 0.108M/s

AFTER
=====
kretprobe      :   10.106 ± 0.120M/s (+1.7%)
kretprobe-multi:   10.515 ± 0.180M/s (+0.7%)

Link: https://lore.kernel.org/all/20240424215214.3956041-3-andrii@kernel.org/

Cc: Matt (Qiang) Wu <wuqiang.matt@bytedance.com>
Signed-off-by: default avatarAndrii Nakryiko <andrii@kernel.org>
Signed-off-by: default avatarMasami Hiramatsu (Google) <mhiramat@kernel.org>
parent a3b00f10
...@@ -73,7 +73,7 @@ typedef int (*objpool_fini_cb)(struct objpool_head *head, void *context); ...@@ -73,7 +73,7 @@ typedef int (*objpool_fini_cb)(struct objpool_head *head, void *context);
* struct objpool_head - object pooling metadata * struct objpool_head - object pooling metadata
* @obj_size: object size, aligned to sizeof(void *) * @obj_size: object size, aligned to sizeof(void *)
* @nr_objs: total objs (to be pre-allocated with objpool) * @nr_objs: total objs (to be pre-allocated with objpool)
* @nr_cpus: local copy of nr_cpu_ids * @nr_possible_cpus: cached value of num_possible_cpus()
* @capacity: max objs can be managed by one objpool_slot * @capacity: max objs can be managed by one objpool_slot
* @gfp: gfp flags for kmalloc & vmalloc * @gfp: gfp flags for kmalloc & vmalloc
* @ref: refcount of objpool * @ref: refcount of objpool
...@@ -85,7 +85,7 @@ typedef int (*objpool_fini_cb)(struct objpool_head *head, void *context); ...@@ -85,7 +85,7 @@ typedef int (*objpool_fini_cb)(struct objpool_head *head, void *context);
struct objpool_head { struct objpool_head {
int obj_size; int obj_size;
int nr_objs; int nr_objs;
int nr_cpus; int nr_possible_cpus;
int capacity; int capacity;
gfp_t gfp; gfp_t gfp;
refcount_t ref; refcount_t ref;
...@@ -176,7 +176,7 @@ static inline void *objpool_pop(struct objpool_head *pool) ...@@ -176,7 +176,7 @@ static inline void *objpool_pop(struct objpool_head *pool)
raw_local_irq_save(flags); raw_local_irq_save(flags);
cpu = raw_smp_processor_id(); cpu = raw_smp_processor_id();
for (i = 0; i < num_possible_cpus(); i++) { for (i = 0; i < pool->nr_possible_cpus; i++) {
obj = __objpool_try_get_slot(pool, cpu); obj = __objpool_try_get_slot(pool, cpu);
if (obj) if (obj)
break; break;
......
...@@ -50,7 +50,7 @@ objpool_init_percpu_slots(struct objpool_head *pool, int nr_objs, ...@@ -50,7 +50,7 @@ objpool_init_percpu_slots(struct objpool_head *pool, int nr_objs,
{ {
int i, cpu_count = 0; int i, cpu_count = 0;
for (i = 0; i < pool->nr_cpus; i++) { for (i = 0; i < nr_cpu_ids; i++) {
struct objpool_slot *slot; struct objpool_slot *slot;
int nodes, size, rc; int nodes, size, rc;
...@@ -60,8 +60,8 @@ objpool_init_percpu_slots(struct objpool_head *pool, int nr_objs, ...@@ -60,8 +60,8 @@ objpool_init_percpu_slots(struct objpool_head *pool, int nr_objs,
continue; continue;
/* compute how many objects to be allocated with this slot */ /* compute how many objects to be allocated with this slot */
nodes = nr_objs / num_possible_cpus(); nodes = nr_objs / pool->nr_possible_cpus;
if (cpu_count < (nr_objs % num_possible_cpus())) if (cpu_count < (nr_objs % pool->nr_possible_cpus))
nodes++; nodes++;
cpu_count++; cpu_count++;
...@@ -103,7 +103,7 @@ static void objpool_fini_percpu_slots(struct objpool_head *pool) ...@@ -103,7 +103,7 @@ static void objpool_fini_percpu_slots(struct objpool_head *pool)
if (!pool->cpu_slots) if (!pool->cpu_slots)
return; return;
for (i = 0; i < pool->nr_cpus; i++) for (i = 0; i < nr_cpu_ids; i++)
kvfree(pool->cpu_slots[i]); kvfree(pool->cpu_slots[i]);
kfree(pool->cpu_slots); kfree(pool->cpu_slots);
} }
...@@ -130,13 +130,13 @@ int objpool_init(struct objpool_head *pool, int nr_objs, int object_size, ...@@ -130,13 +130,13 @@ int objpool_init(struct objpool_head *pool, int nr_objs, int object_size,
/* initialize objpool pool */ /* initialize objpool pool */
memset(pool, 0, sizeof(struct objpool_head)); memset(pool, 0, sizeof(struct objpool_head));
pool->nr_cpus = nr_cpu_ids; pool->nr_possible_cpus = num_possible_cpus();
pool->obj_size = object_size; pool->obj_size = object_size;
pool->capacity = capacity; pool->capacity = capacity;
pool->gfp = gfp & ~__GFP_ZERO; pool->gfp = gfp & ~__GFP_ZERO;
pool->context = context; pool->context = context;
pool->release = release; pool->release = release;
slot_size = pool->nr_cpus * sizeof(struct objpool_slot); slot_size = nr_cpu_ids * sizeof(struct objpool_slot);
pool->cpu_slots = kzalloc(slot_size, pool->gfp); pool->cpu_slots = kzalloc(slot_size, pool->gfp);
if (!pool->cpu_slots) if (!pool->cpu_slots)
return -ENOMEM; return -ENOMEM;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment