Commit a6ed3ea6 authored by Alexei Starovoitov, committed by David S. Miller

bpf: restore behavior of bpf_map_update_elem

The introduction of pre-allocated hash elements inadvertently broke
the behavior of bpf hash maps where users expected to call
bpf_map_update_elem() without considering that the map can be full.
Some programs do:
old_value = bpf_map_lookup_elem(map, key);
if (old_value) {
  ... prepare new_value on stack ...
  bpf_map_update_elem(map, key, new_value);
}
Before pre-alloc, update() of an existing element would work even in
the 'map full' condition. Restore this behavior.

The above program could have updated old_value in place instead of
calling update(), which would be faster, and most programs take that
approach; but sometimes the values are large and programs use the
update() helper to replace the element atomically.
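
For illustration, a minimal sketch (not part of this commit) of the two
approaches, written in the legacy samples/bpf style; the map name, the
section name and the value layout are made up for the example:

#include <uapi/linux/bpf.h>
#include <uapi/linux/ptrace.h>
#include "bpf_helpers.h"

struct val {
	__u64 counters[16];		/* a "large" value */
};

struct bpf_map_def SEC("maps") my_map = {
	.type		= BPF_MAP_TYPE_HASH,
	.key_size	= sizeof(__u32),
	.value_size	= sizeof(struct val),
	.max_entries	= 1024,
};

SEC("kprobe/some_func")
int update_example(struct pt_regs *ctx)
{
	struct val *old_value, new_value = {};
	__u32 key = 0;

	old_value = bpf_map_lookup_elem(&my_map, &key);
	if (!old_value)
		return 0;

	/* approach 1: modify the shared value in place; fastest, but a
	 * concurrent reader may observe a mix of old and new fields while
	 * several stores are in flight
	 */
	old_value->counters[0]++;
	old_value->counters[1]++;

	/* approach 2: prepare new_value on the stack and swap the whole
	 * element in atomically; with this fix it succeeds even when the
	 * map is full, because the key already exists
	 */
	bpf_map_update_elem(&my_map, &key, &new_value, BPF_ANY);
	return 0;
}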
Note that we cannot simply update the element's value in place the way
the percpu hash map does; instead we have to allocate num_possible_cpus()
extra elements and use this reserve when the map is full.
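
From user space the restored semantics can be seen with a test along
these lines (a sketch written against today's libbpf API, which did not
exist in this form at the time of the commit; needs CAP_BPF/root, error
handling omitted):

#include <stdio.h>
#include <bpf/bpf.h>

int main(void)
{
	long long key, value = 1;
	int fd, err;

	/* preallocated hash map (the default) with room for exactly 2 entries */
	fd = bpf_map_create(BPF_MAP_TYPE_HASH, "demo", sizeof(key),
			    sizeof(value), 2, NULL);

	for (key = 0; key < 2; key++)	/* fill the map completely */
		bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST);

	key = 0;
	value = 42;
	/* the map is full, but the key already exists: with this patch the
	 * update succeeds instead of failing with E2BIG
	 */
	err = bpf_map_update_elem(fd, &key, &value, BPF_ANY);
	printf("update of existing key in full map: %d\n", err);

	key = 2;
	/* inserting a genuinely new key into the full map still fails */
	err = bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST);
	printf("insert of new key into full map: %d\n", err);
	return 0;
}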

Fixes: 6c905981 ("bpf: pre-allocate hash map elements")
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent 5e3b724e
@@ -26,11 +26,18 @@ struct bpf_htab {
 	struct bucket *buckets;
 	void *elems;
 	struct pcpu_freelist freelist;
+	void __percpu *extra_elems;
 	atomic_t count;	/* number of elements in this hashtable */
 	u32 n_buckets;	/* number of hash buckets */
 	u32 elem_size;	/* size of each element in bytes */
 };
 
+enum extra_elem_state {
+	HTAB_NOT_AN_EXTRA_ELEM = 0,
+	HTAB_EXTRA_ELEM_FREE,
+	HTAB_EXTRA_ELEM_USED
+};
+
 /* each htab element is struct htab_elem + key + value */
 struct htab_elem {
 	union {
@@ -38,7 +45,10 @@ struct htab_elem {
 		struct bpf_htab *htab;
 		struct pcpu_freelist_node fnode;
 	};
-	struct rcu_head rcu;
+	union {
+		struct rcu_head rcu;
+		enum extra_elem_state state;
+	};
 	u32 hash;
 	char key[0] __aligned(8);
 };
@@ -113,6 +123,23 @@ static int prealloc_elems_and_freelist(struct bpf_htab *htab)
 	return err;
 }
 
+static int alloc_extra_elems(struct bpf_htab *htab)
+{
+	void __percpu *pptr;
+	int cpu;
+
+	pptr = __alloc_percpu_gfp(htab->elem_size, 8, GFP_USER | __GFP_NOWARN);
+	if (!pptr)
+		return -ENOMEM;
+
+	for_each_possible_cpu(cpu) {
+		((struct htab_elem *)per_cpu_ptr(pptr, cpu))->state =
+			HTAB_EXTRA_ELEM_FREE;
+	}
+	htab->extra_elems = pptr;
+	return 0;
+}
+
 /* Called from syscall */
 static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 {
@@ -185,6 +212,8 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 	if (percpu)
 		cost += (u64) round_up(htab->map.value_size, 8) *
 			num_possible_cpus() * htab->map.max_entries;
+	else
+		cost += (u64) htab->elem_size * num_possible_cpus();
 
 	if (cost >= U32_MAX - PAGE_SIZE)
 		/* make sure page count doesn't overflow */
@@ -212,14 +241,22 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 		raw_spin_lock_init(&htab->buckets[i].lock);
 	}
 
+	if (!percpu) {
+		err = alloc_extra_elems(htab);
+		if (err)
+			goto free_buckets;
+	}
+
 	if (!(attr->map_flags & BPF_F_NO_PREALLOC)) {
 		err = prealloc_elems_and_freelist(htab);
 		if (err)
-			goto free_buckets;
+			goto free_extra_elems;
 	}
 
 	return &htab->map;
 
+free_extra_elems:
+	free_percpu(htab->extra_elems);
 free_buckets:
 	kvfree(htab->buckets);
 free_htab:
@@ -349,7 +386,6 @@ static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
 	if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
 		free_percpu(htab_elem_get_ptr(l, htab->map.key_size));
 	kfree(l);
-
 }
 
 static void htab_elem_free_rcu(struct rcu_head *head)
@@ -370,6 +406,11 @@ static void htab_elem_free_rcu(struct rcu_head *head)
 
 static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
 {
+	if (l->state == HTAB_EXTRA_ELEM_USED) {
+		l->state = HTAB_EXTRA_ELEM_FREE;
+		return;
+	}
+
 	if (!(htab->map.map_flags & BPF_F_NO_PREALLOC)) {
 		pcpu_freelist_push(&htab->freelist, &l->fnode);
 	} else {
@@ -381,26 +422,45 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
 static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
 					 void *value, u32 key_size, u32 hash,
-					 bool percpu, bool onallcpus)
+					 bool percpu, bool onallcpus,
+					 bool old_elem_exists)
 {
 	u32 size = htab->map.value_size;
 	bool prealloc = !(htab->map.map_flags & BPF_F_NO_PREALLOC);
 	struct htab_elem *l_new;
 	void __percpu *pptr;
+	int err = 0;
 
 	if (prealloc) {
 		l_new = (struct htab_elem *)pcpu_freelist_pop(&htab->freelist);
 		if (!l_new)
-			return ERR_PTR(-E2BIG);
+			err = -E2BIG;
 	} else {
 		if (atomic_inc_return(&htab->count) > htab->map.max_entries) {
 			atomic_dec(&htab->count);
-			return ERR_PTR(-E2BIG);
+			err = -E2BIG;
+		} else {
+			l_new = kmalloc(htab->elem_size,
+					GFP_ATOMIC | __GFP_NOWARN);
+			if (!l_new)
+				return ERR_PTR(-ENOMEM);
 		}
-		l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN);
-		if (!l_new)
-			return ERR_PTR(-ENOMEM);
+	}
+
+	if (err) {
+		if (!old_elem_exists)
+			return ERR_PTR(err);
+
+		/* if we're updating the existing element and the hash table
+		 * is full, use per-cpu extra elems
+		 */
+		l_new = this_cpu_ptr(htab->extra_elems);
+		if (l_new->state != HTAB_EXTRA_ELEM_FREE)
+			return ERR_PTR(-E2BIG);
+		l_new->state = HTAB_EXTRA_ELEM_USED;
+	} else {
+		l_new->state = HTAB_NOT_AN_EXTRA_ELEM;
 	}
 
 	memcpy(l_new->key, key, key_size);
 
 	if (percpu) {
@@ -489,7 +549,8 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 	if (ret)
 		goto err;
 
-	l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false);
+	l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false,
+				!!l_old);
 	if (IS_ERR(l_new)) {
 		/* all pre-allocated elements are in use or memory exhausted */
 		ret = PTR_ERR(l_new);
@@ -563,7 +624,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
 		}
 	} else {
 		l_new = alloc_htab_elem(htab, key, value, key_size,
-					hash, true, onallcpus);
+					hash, true, onallcpus, false);
 		if (IS_ERR(l_new)) {
 			ret = PTR_ERR(l_new);
 			goto err;
@@ -652,6 +713,7 @@ static void htab_map_free(struct bpf_map *map)
 		htab_free_elems(htab);
 		pcpu_freelist_destroy(&htab->freelist);
 	}
+	free_percpu(htab->extra_elems);
 	kvfree(htab->buckets);
 	kfree(htab);
 }