Commit a96a44ab authored by Martin KaFai Lau's avatar Martin KaFai Lau Committed by Daniel Borkmann

bpf: bpf_sk_storage: Fix invalid wait context lockdep report

'./test_progs -t test_local_storage' reported a splat:

[   27.137569] =============================
[   27.138122] [ BUG: Invalid wait context ]
[   27.138650] 6.5.0-03980-gd11ae1b1 #247 Tainted: G           O
[   27.139542] -----------------------------
[   27.140106] test_progs/1729 is trying to lock:
[   27.140713] ffff8883ef047b88 (stock_lock){-.-.}-{3:3}, at: local_lock_acquire+0x9/0x130
[   27.141834] other info that might help us debug this:
[   27.142437] context-{5:5}
[   27.142856] 2 locks held by test_progs/1729:
[   27.143352]  #0: ffffffff84bcd9c0 (rcu_read_lock){....}-{1:3}, at: rcu_lock_acquire+0x4/0x40
[   27.144492]  #1: ffff888107deb2c0 (&storage->lock){..-.}-{2:2}, at: bpf_local_storage_update+0x39e/0x8e0
[   27.145855] stack backtrace:
[   27.146274] CPU: 0 PID: 1729 Comm: test_progs Tainted: G           O       6.5.0-03980-gd11ae1b1 #247
[   27.147550] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014
[   27.149127] Call Trace:
[   27.149490]  <TASK>
[   27.149867]  dump_stack_lvl+0x130/0x1d0
[   27.152609]  dump_stack+0x14/0x20
[   27.153131]  __lock_acquire+0x1657/0x2220
[   27.153677]  lock_acquire+0x1b8/0x510
[   27.157908]  local_lock_acquire+0x29/0x130
[   27.159048]  obj_cgroup_charge+0xf4/0x3c0
[   27.160794]  slab_pre_alloc_hook+0x28e/0x2b0
[   27.161931]  __kmem_cache_alloc_node+0x51/0x210
[   27.163557]  __kmalloc+0xaa/0x210
[   27.164593]  bpf_map_kzalloc+0xbc/0x170
[   27.165147]  bpf_selem_alloc+0x130/0x510
[   27.166295]  bpf_local_storage_update+0x5aa/0x8e0
[   27.167042]  bpf_fd_sk_storage_update_elem+0xdb/0x1a0
[   27.169199]  bpf_map_update_value+0x415/0x4f0
[   27.169871]  map_update_elem+0x413/0x550
[   27.170330]  __sys_bpf+0x5e9/0x640
[   27.174065]  __x64_sys_bpf+0x80/0x90
[   27.174568]  do_syscall_64+0x48/0xa0
[   27.175201]  entry_SYSCALL_64_after_hwframe+0x6e/0xd8
[   27.175932] RIP: 0033:0x7effb40e41ad
[   27.176357] Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d8
[   27.179028] RSP: 002b:00007ffe64c21fc8 EFLAGS: 00000202 ORIG_RAX: 0000000000000141
[   27.180088] RAX: ffffffffffffffda RBX: 00007ffe64c22768 RCX: 00007effb40e41ad
[   27.181082] RDX: 0000000000000020 RSI: 00007ffe64c22008 RDI: 0000000000000002
[   27.182030] RBP: 00007ffe64c21ff0 R08: 0000000000000000 R09: 00007ffe64c22788
[   27.183038] R10: 0000000000000064 R11: 0000000000000202 R12: 0000000000000000
[   27.184006] R13: 00007ffe64c22788 R14: 00007effb42a1000 R15: 0000000000000000
[   27.184958]  </TASK>

It complains about acquiring a local_lock while holding a raw_spin_lock.
It means it should not allocate memory while holding a raw_spin_lock
since it is not safe for RT.

raw_spin_lock is needed because bpf_local_storage supports tracing
context. In particular for task local storage, it is easy to
get a "current" task PTR_TO_BTF_ID in tracing bpf prog.
However, task (and cgroup) local storage has already been moved to
bpf mem allocator which can be used after raw_spin_lock.

The splat is for the sk storage. For sk (and inode) storage,
it has not been moved to bpf mem allocator. Using raw_spin_lock or not,
kzalloc(GFP_ATOMIC) could theoretically be unsafe in tracing context.
However, the local storage helper requires a verifier accepted
sk pointer (PTR_TO_BTF_ID), it is hypothetical if that (mean running
a bpf prog in a kzalloc unsafe context and also able to hold a verifier
accepted sk pointer) could happen.

This patch avoids kzalloc after raw_spin_lock to silent the splat.
There is an existing kzalloc before the raw_spin_lock. At that point,
a kzalloc is very likely required because a lookup has just been done
before. Thus, this patch always does the kzalloc before acquiring
the raw_spin_lock and remove the later kzalloc usage after the
raw_spin_lock. After this change, it will have a charge and then
uncharge during the syscall bpf_map_update_elem() code path.
This patch opts for simplicity and not continue the old
optimization to save one charge and uncharge.

This issue is dated back to the very first commit of bpf_sk_storage
which had been refactored multiple times to create task, inode, and
cgroup storage. This patch uses a Fixes tag with a more recent
commit that should be easier to do backport.

Fixes: b00fa38a ("bpf: Enable non-atomic allocations in local storage")
Signed-off-by: default avatarMartin KaFai Lau <martin.lau@kernel.org>
Signed-off-by: default avatarDaniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20230901231129.578493-2-martin.lau@linux.dev
parent a192103a
......@@ -553,7 +553,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
void *value, u64 map_flags, gfp_t gfp_flags)
{
struct bpf_local_storage_data *old_sdata = NULL;
struct bpf_local_storage_elem *selem = NULL;
struct bpf_local_storage_elem *alloc_selem, *selem = NULL;
struct bpf_local_storage *local_storage;
unsigned long flags;
int err;
......@@ -607,11 +607,12 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
}
}
if (gfp_flags == GFP_KERNEL) {
selem = bpf_selem_alloc(smap, owner, value, true, gfp_flags);
if (!selem)
/* A lookup has just been done before and concluded a new selem is
* needed. The chance of an unnecessary alloc is unlikely.
*/
alloc_selem = selem = bpf_selem_alloc(smap, owner, value, true, gfp_flags);
if (!alloc_selem)
return ERR_PTR(-ENOMEM);
}
raw_spin_lock_irqsave(&local_storage->lock, flags);
......@@ -623,13 +624,13 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
* simple.
*/
err = -EAGAIN;
goto unlock_err;
goto unlock;
}
old_sdata = bpf_local_storage_lookup(local_storage, smap, false);
err = check_flags(old_sdata, map_flags);
if (err)
goto unlock_err;
goto unlock;
if (old_sdata && (map_flags & BPF_F_LOCK)) {
copy_map_value_locked(&smap->map, old_sdata->data, value,
......@@ -638,23 +639,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
goto unlock;
}
if (gfp_flags != GFP_KERNEL) {
/* local_storage->lock is held. Hence, we are sure
* we can unlink and uncharge the old_sdata successfully
* later. Hence, instead of charging the new selem now
* and then uncharge the old selem later (which may cause
* a potential but unnecessary charge failure), avoid taking
* a charge at all here (the "!old_sdata" check) and the
* old_sdata will not be uncharged later during
* bpf_selem_unlink_storage_nolock().
*/
selem = bpf_selem_alloc(smap, owner, value, !old_sdata, gfp_flags);
if (!selem) {
err = -ENOMEM;
goto unlock_err;
}
}
alloc_selem = NULL;
/* First, link the new selem to the map */
bpf_selem_link_map(smap, selem);
......@@ -665,20 +650,16 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
if (old_sdata) {
bpf_selem_unlink_map(SELEM(old_sdata));
bpf_selem_unlink_storage_nolock(local_storage, SELEM(old_sdata),
false, false);
true, false);
}
unlock:
raw_spin_unlock_irqrestore(&local_storage->lock, flags);
return SDATA(selem);
unlock_err:
raw_spin_unlock_irqrestore(&local_storage->lock, flags);
if (selem) {
if (alloc_selem) {
mem_uncharge(smap, owner, smap->elem_size);
bpf_selem_free(selem, smap, true);
bpf_selem_free(alloc_selem, smap, true);
}
return ERR_PTR(err);
return err ? ERR_PTR(err) : SDATA(selem);
}
static u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment