Commit 968a3b92 authored by Daniel Borkmann

Merge branch 'bpf-mem-cache-free-rcu'

Alexei Starovoitov says:

====================
v3->v4:
- extra patch 14 from Hou to check for object leaks.
- fixed the race/leak in free_by_rcu_ttrace. Extra hunk in patch 8.
- added Acks and fixed typos.

v2->v3:
- dropped _tail optimization for free_by_rcu_ttrace
- new patch 5 to refactor inc/dec of c->active
- change 'draining' logic in patch 7
- add rcu_barrier in patch 12
- __llist_add -> llist_add(waiting_for_gp_ttrace) in patch 9 to fix a race
- David's Ack in patch 13 and explanation that migrate_disable cannot be removed just yet.

v1->v2:
- Fixed race condition spotted by Hou. Patch 7.

v1:

Introduce bpf_mem_cache_free_rcu(), which is similar to kfree_rcu() except that
the objects go through an additional RCU tasks trace grace period before being
freed into slab.

Patches 1-9 - a bunch of prep work
Patch 10 - a patch from Paul that exports rcu_request_urgent_qs_task().
Patch 12 - the main bpf_mem_cache_free_rcu patch.
Patch 13 - use it in bpf_cpumask.

bpf_local_storage, bpf_obj_drop, qp-trie will be other users eventually.

With an additional hack patch to htab that replaces bpf_mem_cache_free with bpf_mem_cache_free_rcu,
the benchmark results are as follows:
- map_perf_test 4 8 16348 1000000
Performance drops from 800k to 600k. Waiting for the RCU GP makes objects cache cold.

- bench htab-mem -a -p 8
20% drop in performance and a big increase in memory, from 3 Mbyte to 50 Mbyte, as expected.

- bench htab-mem -a -p 16 --use-case add_del_on_diff_cpu
Same performance and better memory consumption.
Before these patches this bench would OOM (with or without 'reuse after GP').
Patch 8 addresses the issue.

In the end, the performance drop and the additional memory consumption due to _rcu()
were expected and came out within a reasonable margin.
Without Paul's patch 10 the memory consumption in 'bench htab-mem' is in Gbytes,
which wouldn't be acceptable.

Patch 8 is a heuristic to address the 'alloc on one cpu, free on another' issue.
It works well in practice. One can probably construct an artificial benchmark
to make the heuristic ineffective, but we have to trade off performance, code complexity,
and memory consumption.

The life cycle of objects:
alloc: dequeue free_llist
free: enqueue free_llist
free_llist above high watermark -> free_by_rcu_ttrace
free_rcu: enqueue free_by_rcu -> waiting_for_gp
after RCU GP waiting_for_gp -> free_by_rcu_ttrace
free_by_rcu_ttrace -> waiting_for_gp_ttrace -> slab
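
Not the real struct bpf_mem_cache layout; just a rough, abridged sketch of the
per-cpu lists named above so the transitions are easier to follow:

#include <linux/llist.h>

struct bpf_mem_cache_sketch {
	struct llist_head free_llist;		/* hot per-cpu freelist: alloc pops, free pushes */
	struct llist_head free_by_rcu;		/* bpf_mem_cache_free_rcu() enqueues here */
	struct llist_head waiting_for_gp;	/* parked here while a regular RCU GP is pending */
	struct llist_head free_by_rcu_ttrace;	/* after the RCU GP, and free_llist overflow past the high watermark */
	struct llist_head waiting_for_gp_ttrace;/* parked here for an RCU tasks trace GP, then returned to slab */
};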
====================
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
parents c21de5fc 4ed8b5bc
@@ -27,10 +27,12 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma);
/* kmalloc/kfree equivalent: */
void *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size);
void bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr);
+void bpf_mem_free_rcu(struct bpf_mem_alloc *ma, void *ptr);
/* kmem_cache_alloc/free equivalent: */
void *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma);
void bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr);
+void bpf_mem_cache_free_rcu(struct bpf_mem_alloc *ma, void *ptr);
void bpf_mem_cache_raw_free(void *ptr);
void *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags);
@@ -138,6 +138,8 @@ static inline int rcu_needs_cpu(void)
return 0;
}
+static inline void rcu_request_urgent_qs_task(struct task_struct *t) { }
/*
* Take advantage of the fact that there is only one CPU, which
* allows us to ignore virtualization-based context switches.
@@ -21,6 +21,7 @@ void rcu_softirq_qs(void);
void rcu_note_context_switch(bool preempt);
int rcu_needs_cpu(void);
void rcu_cpu_stall_reset(void);
+void rcu_request_urgent_qs_task(struct task_struct *t);
/*
* Note a virtualization-based context switch. This is simply a
@@ -9,7 +9,6 @@
/**
* struct bpf_cpumask - refcounted BPF cpumask wrapper structure
* @cpumask: The actual cpumask embedded in the struct.
-* @rcu: The RCU head used to free the cpumask with RCU safety.
* @usage: Object reference counter. When the refcount goes to 0, the
* memory is released back to the BPF allocator, which provides
* RCU safety.
@@ -25,7 +24,6 @@
*/
struct bpf_cpumask {
cpumask_t cpumask;
-struct rcu_head rcu;
refcount_t usage;
};
@@ -82,16 +80,6 @@ __bpf_kfunc struct bpf_cpumask *bpf_cpumask_acquire(struct bpf_cpumask *cpumask)
return cpumask;
}
-static void cpumask_free_cb(struct rcu_head *head)
-{
-struct bpf_cpumask *cpumask;
-cpumask = container_of(head, struct bpf_cpumask, rcu);
-migrate_disable();
-bpf_mem_cache_free(&bpf_cpumask_ma, cpumask);
-migrate_enable();
-}
/**
* bpf_cpumask_release() - Release a previously acquired BPF cpumask.
* @cpumask: The cpumask being released.
@@ -102,8 +90,12 @@ static void cpumask_free_cb(struct rcu_head *head)
*/
__bpf_kfunc void bpf_cpumask_release(struct bpf_cpumask *cpumask)
{
-if (refcount_dec_and_test(&cpumask->usage))
-call_rcu(&cpumask->rcu, cpumask_free_cb);
+if (!refcount_dec_and_test(&cpumask->usage))
+return;
+
+migrate_disable();
+bpf_mem_cache_free_rcu(&bpf_cpumask_ma, cpumask);
+migrate_enable();
}
/**
@@ -493,7 +493,6 @@ static inline void rcu_expedite_gp(void) { }
static inline void rcu_unexpedite_gp(void) { }
static inline void rcu_async_hurry(void) { }
static inline void rcu_async_relax(void) { }
-static inline void rcu_request_urgent_qs_task(struct task_struct *t) { }
#else /* #ifdef CONFIG_TINY_RCU */
bool rcu_gp_is_normal(void); /* Internal RCU use. */
bool rcu_gp_is_expedited(void); /* Internal RCU use. */
@@ -508,7 +507,6 @@ void show_rcu_tasks_gp_kthreads(void);
#else /* #ifdef CONFIG_TASKS_RCU_GENERIC */
static inline void show_rcu_tasks_gp_kthreads(void) {}
#endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */
-void rcu_request_urgent_qs_task(struct task_struct *t);
#endif /* #else #ifdef CONFIG_TINY_RCU */
#define RCU_SCHEDULER_INACTIVE 0
@@ -96,7 +96,7 @@ static __always_inline
int list_push_pop_multiple(struct bpf_spin_lock *lock, struct bpf_list_head *head, bool leave_in_map)
{
struct bpf_list_node *n;
-struct foo *f[8], *pf;
+struct foo *f[200], *pf;
int i;
/* Loop following this check adds nodes 2-at-a-time in order to