Commit 9f2c6e96 authored by Alexei Starovoitov, committed by Daniel Borkmann

bpf: Optimize rcu_barrier usage between hash map and bpf_mem_alloc.

User space might be creating and destroying a lot of hash maps. Synchronous
rcu_barrier() calls in the hash map destruction path delay freeing of hash
buckets and other map memory and may cause an artificial OOM situation under
stress. Optimize rcu_barrier usage between the bpf hash map and bpf_mem_alloc:
- Remove rcu_barrier() from the hash map, since htab doesn't use call_rcu()
  directly and there are no callbacks to wait for.
- bpf_mem_alloc has a call_rcu_in_progress flag that indicates pending
  callbacks. Use it to avoid barriers in the fast path.
- When barriers are needed, copy bpf_mem_alloc into a temporary structure
  and wait for the RCU barriers in a worker, so the rest of the hash map
  freeing can proceed (see the sketch below the trailer).
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20220902211058.60789-17-alexei.starovoitov@gmail.com
parent bfc03c15
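
A condensed, self-contained sketch of the scheme described above, assuming a
hypothetical destroy_obj()/deferred_free helper pair (the real patch adds
destroy_mem_alloc() and free_mem_alloc_deferred() in the memalloc.c hunk
below): if no RCU callbacks are in flight, free immediately; otherwise move
the per-cpu state into a heap-allocated copy and let a workqueue item run the
barrier and the final free.

#include <linux/percpu.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

struct deferred_free {
        struct work_struct work;
        void __percpu *caches;  /* state taken over from the dying object */
};

static void deferred_free_fn(struct work_struct *work)
{
        struct deferred_free *df = container_of(work, struct deferred_free, work);

        /* Wait for any pending call_rcu() callbacks, then free the memory. */
        rcu_barrier();
        free_percpu(df->caches);
        kfree(df);
}

static void destroy_obj(void __percpu *caches, int rcu_in_progress)
{
        struct deferred_free *df;

        if (!rcu_in_progress) {
                /* Fast path: no callbacks pending, no barrier needed. */
                free_percpu(caches);
                return;
        }

        df = kmalloc(sizeof(*df), GFP_KERNEL);
        if (!df) {
                /* Slow path: wait for the barrier inline. */
                rcu_barrier();
                free_percpu(caches);
                return;
        }

        /* Hand the state to a worker so the caller can keep freeing memory. */
        df->caches = caches;
        INIT_WORK(&df->work, deferred_free_fn);
        queue_work(system_unbound_wq, &df->work);
}
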
@@ -3,6 +3,7 @@
 #ifndef _BPF_MEM_ALLOC_H
 #define _BPF_MEM_ALLOC_H
 #include <linux/compiler_types.h>
+#include <linux/workqueue.h>
 
 struct bpf_mem_cache;
 struct bpf_mem_caches;
@@ -10,6 +11,7 @@ struct bpf_mem_caches;
 struct bpf_mem_alloc {
         struct bpf_mem_caches __percpu *caches;
         struct bpf_mem_cache __percpu *cache;
+        struct work_struct work;
 };
 
 int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu);
......
@@ -1546,10 +1546,10 @@ static void htab_map_free(struct bpf_map *map)
          * There is no need to synchronize_rcu() here to protect map elements.
          */
 
-        /* some of free_htab_elem() callbacks for elements of this map may
-         * not have executed. Wait for them.
+        /* htab no longer uses call_rcu() directly. bpf_mem_alloc does it
+         * underneath and is responsible for waiting for callbacks to finish
+         * during bpf_mem_alloc_destroy().
          */
-        rcu_barrier();
         if (!htab_is_prealloc(htab)) {
                 delete_all_elements(htab);
         } else {
......
@@ -414,10 +414,9 @@ static void drain_mem_cache(struct bpf_mem_cache *c)
 {
         struct llist_node *llnode, *t;
 
-        /* The caller has done rcu_barrier() and no progs are using this
-         * bpf_mem_cache, but htab_map_free() called bpf_mem_cache_free() for
-         * all remaining elements and they can be in free_by_rcu or in
-         * waiting_for_gp lists, so drain those lists now.
+        /* No progs are using this bpf_mem_cache, but htab_map_free() called
+         * bpf_mem_cache_free() for all remaining elements and they can be in
+         * free_by_rcu or in waiting_for_gp lists, so drain those lists now.
          */
         llist_for_each_safe(llnode, t, __llist_del_all(&c->free_by_rcu))
                 free_one(c, llnode);
@@ -429,42 +428,91 @@ static void drain_mem_cache(struct bpf_mem_cache *c)
                 free_one(c, llnode);
 }
 
+static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma)
+{
+        free_percpu(ma->cache);
+        free_percpu(ma->caches);
+        ma->cache = NULL;
+        ma->caches = NULL;
+}
+
+static void free_mem_alloc(struct bpf_mem_alloc *ma)
+{
+        /* waiting_for_gp lists were drained, but __free_rcu might
+         * still execute. Wait for it now before we free the percpu caches.
+         */
+        rcu_barrier_tasks_trace();
+        rcu_barrier();
+        free_mem_alloc_no_barrier(ma);
+}
+
+static void free_mem_alloc_deferred(struct work_struct *work)
+{
+        struct bpf_mem_alloc *ma = container_of(work, struct bpf_mem_alloc, work);
+
+        free_mem_alloc(ma);
+        kfree(ma);
+}
+
+static void destroy_mem_alloc(struct bpf_mem_alloc *ma, int rcu_in_progress)
+{
+        struct bpf_mem_alloc *copy;
+
+        if (!rcu_in_progress) {
+                /* Fast path. No callbacks are pending, hence no need to do
+                 * rcu_barrier-s.
+                 */
+                free_mem_alloc_no_barrier(ma);
+                return;
+        }
+
+        copy = kmalloc(sizeof(*ma), GFP_KERNEL);
+        if (!copy) {
+                /* Slow path with inline barrier-s */
+                free_mem_alloc(ma);
+                return;
+        }
+
+        /* Defer barriers into the worker to let the rest of the map memory be freed */
+        copy->cache = ma->cache;
+        ma->cache = NULL;
+        copy->caches = ma->caches;
+        ma->caches = NULL;
+        INIT_WORK(&copy->work, free_mem_alloc_deferred);
+        queue_work(system_unbound_wq, &copy->work);
+}
+
 void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
 {
         struct bpf_mem_caches *cc;
         struct bpf_mem_cache *c;
-        int cpu, i;
+        int cpu, i, rcu_in_progress;
 
         if (ma->cache) {
+                rcu_in_progress = 0;
                 for_each_possible_cpu(cpu) {
                         c = per_cpu_ptr(ma->cache, cpu);
                         drain_mem_cache(c);
+                        rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
                 }
                 /* objcg is the same across cpus */
                 if (c->objcg)
                         obj_cgroup_put(c->objcg);
-                /* c->waiting_for_gp list was drained, but __free_rcu might
-                 * still execute. Wait for it now before we free 'c'.
-                 */
-                rcu_barrier_tasks_trace();
-                rcu_barrier();
-                free_percpu(ma->cache);
-                ma->cache = NULL;
+                destroy_mem_alloc(ma, rcu_in_progress);
         }
         if (ma->caches) {
+                rcu_in_progress = 0;
                 for_each_possible_cpu(cpu) {
                         cc = per_cpu_ptr(ma->caches, cpu);
                         for (i = 0; i < NUM_CACHES; i++) {
                                 c = &cc->cache[i];
                                 drain_mem_cache(c);
+                                rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
                         }
                 }
                 if (c->objcg)
                         obj_cgroup_put(c->objcg);
-                rcu_barrier_tasks_trace();
-                rcu_barrier();
-                free_percpu(ma->caches);
-                ma->caches = NULL;
+                destroy_mem_alloc(ma, rcu_in_progress);
         }
 }
......
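
Design note on the deferred path above: rcu_barrier_tasks_trace() and
rcu_barrier() sleep until all pending callbacks of the corresponding flavor
have run, which can span full grace periods, so queueing them on
system_unbound_wq keeps map destruction from stalling on that wait, as the
commit message intends. If the kmalloc() of the temporary bpf_mem_alloc copy
fails, destroy_mem_alloc() falls back to running the barriers inline, so
correctness never depends on that allocation succeeding.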