Commit 50b045a8 authored by Daniel Borkmann, committed by Alexei Starovoitov

bpf, lru: avoid messing with eviction heuristics upon syscall lookup

One of the biggest issues we face right now with picking the LRU map
over a regular hash table is that a map walk out of user space, for
example, to just dump the existing entries or to remove certain ones,
will completely mess up LRU eviction heuristics and wrong entries
such as just-created ones will get evicted instead. The reason for
this is that we mark an entry as "in use" via bpf_lru_node_set_ref()
from the system call lookup side as well. Thus upon walk, all entries
get marked, so the information about the actual least recently used
ones is lost.

In the case of Cilium, where the LRU map can be used (among other
things) as a BPF based connection tracker, this current behavior
causes disruption upon control plane changes that need to walk the
map from user space to evict certain entries. The discussion at
bpfconf [0] concluded that we should simply remove the marking from
the system call side, as no good use case could be found where it is
actually needed there. Therefore, this patch removes the marking for
the regular and per-CPU LRU flavors. If there should ever be a need
in the future, the behavior could be made selectable via a map
creation flag, but for the reason mentioned above we avoid that here.

  [0] http://vger.kernel.org/bpfconf.html
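
For context, a rough sketch (not part of this diff) of how the parent
commit c6110222, which introduced the map_lookup_elem_sys_only
callback, wires it into the BPF_MAP_LOOKUP_ELEM syscall path; the LRU
flavor hooked up below then takes the non-marking lookup:

	/* In the syscall lookup path, prefer the sys-only callback when
	 * the map type provides one, otherwise fall back to the regular
	 * (marking) lookup.
	 */
	if (map->ops->map_lookup_elem_sys_only)
		ptr = map->ops->map_lookup_elem_sys_only(map, key);
	else
		ptr = map->ops->map_lookup_elem(map, key);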

Fixes: 29ba732a ("bpf: Add BPF_MAP_TYPE_LRU_HASH")
Fixes: 8f844938 ("bpf: Add BPF_MAP_TYPE_LRU_PERCPU_HASH")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
parent c6110222
@@ -527,11 +527,13 @@ static u32 htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
 	return insn - insn_buf;
 }
 
-static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key)
+static __always_inline void *__htab_lru_map_lookup_elem(struct bpf_map *map,
+							 void *key, const bool mark)
 {
 	struct htab_elem *l = __htab_map_lookup_elem(map, key);
 
 	if (l) {
-		bpf_lru_node_set_ref(&l->lru_node);
+		if (mark)
+			bpf_lru_node_set_ref(&l->lru_node);
 		return l->key + round_up(map->key_size, 8);
 	}
@@ -539,6 +541,16 @@ static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key)
 	return NULL;
 }
 
+static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key)
+{
+	return __htab_lru_map_lookup_elem(map, key, true);
+}
+
+static void *htab_lru_map_lookup_elem_sys(struct bpf_map *map, void *key)
+{
+	return __htab_lru_map_lookup_elem(map, key, false);
+}
+
 static u32 htab_lru_map_gen_lookup(struct bpf_map *map,
 				   struct bpf_insn *insn_buf)
 {
@@ -1250,6 +1262,7 @@ const struct bpf_map_ops htab_lru_map_ops = {
 	.map_free = htab_map_free,
 	.map_get_next_key = htab_map_get_next_key,
 	.map_lookup_elem = htab_lru_map_lookup_elem,
+	.map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys,
 	.map_update_elem = htab_lru_map_update_elem,
 	.map_delete_elem = htab_lru_map_delete_elem,
 	.map_gen_lookup = htab_lru_map_gen_lookup,
@@ -1281,7 +1294,6 @@ static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key)
 
 int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
 {
-	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
 	struct htab_elem *l;
 	void __percpu *pptr;
 	int ret = -ENOENT;
@@ -1297,8 +1309,9 @@ int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
 	l = __htab_map_lookup_elem(map, key);
 	if (!l)
 		goto out;
-	if (htab_is_lru(htab))
-		bpf_lru_node_set_ref(&l->lru_node);
+	/* We do not mark LRU map element here in order to not mess up
+	 * eviction heuristics when user space does a map walk.
+	 */
 	pptr = htab_elem_get_ptr(l, map->key_size);
 	for_each_possible_cpu(cpu) {
 		bpf_long_memcpy(value + off,