Commit 30985537 authored by Kent Overstreet's avatar Kent Overstreet Committed by Kent Overstreet

bcachefs: Fix usage of six lock's percpu mode

Six locks have a percpu mode, which we use for interior btree nodes, as
well as btree key cache keys for the subvolumes btree. We've been
switching locks back and forth between percpu and non percpu mode as
needed, but it turns out this is racy - when we're reusing an existing
node, other threads could be attempting to lock it while we're switching
it between modes.

This patch fixes this by never switching 'struct btree' between the two
modes, and instead segragating them between two different freed lists.
Signed-off-by: default avatarKent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: default avatarKent Overstreet <kent.overstreet@linux.dev>
parent 5b3f7805
......@@ -40,6 +40,14 @@ static inline unsigned btree_cache_can_free(struct btree_cache *bc)
return max_t(int, 0, bc->used - bc->reserve);
}
static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b)
{
if (b->c.lock.readers)
list_move(&b->list, &bc->freed_pcpu);
else
list_move(&b->list, &bc->freed_nonpcpu);
}
static void btree_node_data_free(struct bch_fs *c, struct btree *b)
{
struct btree_cache *bc = &c->btree_cache;
......@@ -56,7 +64,8 @@ static void btree_node_data_free(struct bch_fs *c, struct btree *b)
b->aux_data = NULL;
bc->used--;
list_move(&b->list, &bc->freed);
btree_node_to_freedlist(bc, b);
}
static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg,
......@@ -162,11 +171,6 @@ int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
b->c.level = level;
b->c.btree_id = id;
if (level)
six_lock_pcpu_alloc(&b->c.lock);
else
six_lock_pcpu_free_rcu(&b->c.lock);
mutex_lock(&bc->lock);
ret = __bch2_btree_node_hash_insert(bc, b);
if (!ret)
......@@ -432,8 +436,10 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c)
BUG_ON(atomic_read(&c->btree_cache.dirty));
while (!list_empty(&bc->freed)) {
b = list_first_entry(&bc->freed, struct btree, list);
list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu);
while (!list_empty(&bc->freed_nonpcpu)) {
b = list_first_entry(&bc->freed_nonpcpu, struct btree, list);
list_del(&b->list);
six_lock_pcpu_free(&b->c.lock);
kfree(b);
......@@ -487,7 +493,8 @@ void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
mutex_init(&bc->lock);
INIT_LIST_HEAD(&bc->live);
INIT_LIST_HEAD(&bc->freeable);
INIT_LIST_HEAD(&bc->freed);
INIT_LIST_HEAD(&bc->freed_pcpu);
INIT_LIST_HEAD(&bc->freed_nonpcpu);
}
/*
......@@ -562,9 +569,12 @@ static struct btree *btree_node_cannibalize(struct bch_fs *c)
}
}
struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c)
struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c, bool pcpu_read_locks)
{
struct btree_cache *bc = &c->btree_cache;
struct list_head *freed = pcpu_read_locks
? &bc->freed_pcpu
: &bc->freed_nonpcpu;
struct btree *b, *b2;
u64 start_time = local_clock();
unsigned flags;
......@@ -576,7 +586,7 @@ struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c)
* We never free struct btree itself, just the memory that holds the on
* disk node. Check the freed list before allocating a new one:
*/
list_for_each_entry(b, &bc->freed, list)
list_for_each_entry(b, freed, list)
if (!btree_node_reclaim(c, b)) {
list_del_init(&b->list);
goto got_node;
......@@ -586,6 +596,9 @@ struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c)
if (!b)
goto err_locked;
if (pcpu_read_locks)
six_lock_pcpu_alloc(&b->c.lock);
BUG_ON(!six_trylock_intent(&b->c.lock));
BUG_ON(!six_trylock_write(&b->c.lock));
got_node:
......@@ -598,7 +611,7 @@ struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c)
if (!btree_node_reclaim(c, b2)) {
swap(b->data, b2->data);
swap(b->aux_data, b2->aux_data);
list_move(&b2->list, &bc->freed);
btree_node_to_freedlist(bc, b2);
six_unlock_write(&b2->c.lock);
six_unlock_intent(&b2->c.lock);
goto got_mem;
......@@ -643,7 +656,7 @@ struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c)
if (b) {
swap(b->data, b2->data);
swap(b->aux_data, b2->aux_data);
list_move(&b2->list, &bc->freed);
btree_node_to_freedlist(bc, b2);
six_unlock_write(&b2->c.lock);
six_unlock_intent(&b2->c.lock);
} else {
......@@ -688,7 +701,7 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
return ERR_PTR(-EINTR);
}
b = bch2_btree_node_mem_alloc(c);
b = bch2_btree_node_mem_alloc(c, level != 0);
if (trans && b == ERR_PTR(-ENOMEM)) {
trans->memory_allocation_failure = true;
......
......@@ -20,7 +20,7 @@ void bch2_btree_cache_cannibalize_unlock(struct bch_fs *);
int bch2_btree_cache_cannibalize_lock(struct bch_fs *, struct closure *);
struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *);
struct btree *bch2_btree_node_mem_alloc(struct bch_fs *);
struct btree *bch2_btree_node_mem_alloc(struct bch_fs *, bool);
struct btree *bch2_btree_node_get(struct btree_trans *, struct btree_path *,
const struct bkey_i *, unsigned,
......
......@@ -1542,7 +1542,7 @@ int bch2_btree_root_read(struct bch_fs *c, enum btree_id id,
closure_sync(&cl);
} while (ret);
b = bch2_btree_node_mem_alloc(c);
b = bch2_btree_node_mem_alloc(c, level != 0);
bch2_btree_cache_cannibalize_unlock(c);
BUG_ON(IS_ERR(b));
......
......@@ -166,13 +166,13 @@ btree_key_cache_create(struct bch_fs *c,
}
was_new = false;
} else {
if (btree_id == BTREE_ID_subvolumes)
six_lock_pcpu_alloc(&ck->c.lock);
else
six_lock_pcpu_free(&ck->c.lock);
}
if (btree_id == BTREE_ID_subvolumes)
six_lock_pcpu_alloc(&ck->c.lock);
else
six_lock_pcpu_free(&ck->c.lock);
ck->c.level = 0;
ck->c.btree_id = btree_id;
ck->key.btree_id = btree_id;
......
......@@ -152,7 +152,8 @@ struct btree_cache {
struct mutex lock;
struct list_head live;
struct list_head freeable;
struct list_head freed;
struct list_head freed_pcpu;
struct list_head freed_nonpcpu;
/* Number of elements in live + freeable lists */
unsigned used;
......
......@@ -181,6 +181,7 @@ static void bch2_btree_node_free_inmem(struct btree_trans *trans,
static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
struct disk_reservation *res,
struct closure *cl,
bool interior_node,
unsigned flags)
{
struct write_point *wp;
......@@ -242,7 +243,7 @@ static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
bch2_open_bucket_get(c, wp, &ob);
bch2_alloc_sectors_done(c, wp);
mem_alloc:
b = bch2_btree_node_mem_alloc(c);
b = bch2_btree_node_mem_alloc(c, interior_node);
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
......@@ -260,12 +261,13 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev
{
struct bch_fs *c = as->c;
struct btree *b;
struct prealloc_nodes *p = &as->prealloc_nodes[!!level];
int ret;
BUG_ON(level >= BTREE_MAX_DEPTH);
BUG_ON(!as->nr_prealloc_nodes);
BUG_ON(!p->nr);
b = as->prealloc_nodes[--as->nr_prealloc_nodes];
b = p->b[--p->nr];
six_lock_intent(&b->c.lock, NULL, NULL);
six_lock_write(&b->c.lock, NULL, NULL);
......@@ -377,43 +379,49 @@ static struct btree *__btree_root_alloc(struct btree_update *as, unsigned level)
static void bch2_btree_reserve_put(struct btree_update *as)
{
struct bch_fs *c = as->c;
struct prealloc_nodes *p;
mutex_lock(&c->btree_reserve_cache_lock);
while (as->nr_prealloc_nodes) {
struct btree *b = as->prealloc_nodes[--as->nr_prealloc_nodes];
for (p = as->prealloc_nodes;
p < as->prealloc_nodes + ARRAY_SIZE(as->prealloc_nodes);
p++) {
while (p->nr) {
struct btree *b = p->b[--p->nr];
six_lock_intent(&b->c.lock, NULL, NULL);
six_lock_write(&b->c.lock, NULL, NULL);
six_lock_intent(&b->c.lock, NULL, NULL);
six_lock_write(&b->c.lock, NULL, NULL);
if (c->btree_reserve_cache_nr <
ARRAY_SIZE(c->btree_reserve_cache)) {
struct btree_alloc *a =
&c->btree_reserve_cache[c->btree_reserve_cache_nr++];
if (c->btree_reserve_cache_nr <
ARRAY_SIZE(c->btree_reserve_cache)) {
struct btree_alloc *a =
&c->btree_reserve_cache[c->btree_reserve_cache_nr++];
a->ob = b->ob;
b->ob.nr = 0;
bkey_copy(&a->k, &b->key);
} else {
bch2_open_buckets_put(c, &b->ob);
}
a->ob = b->ob;
b->ob.nr = 0;
bkey_copy(&a->k, &b->key);
} else {
bch2_open_buckets_put(c, &b->ob);
}
__btree_node_free(c, b);
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
__btree_node_free(c, b);
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
}
}
mutex_unlock(&c->btree_reserve_cache_lock);
}
static int bch2_btree_reserve_get(struct btree_update *as, unsigned nr_nodes,
static int bch2_btree_reserve_get(struct btree_update *as, unsigned nr_nodes[2],
unsigned flags, struct closure *cl)
{
struct bch_fs *c = as->c;
struct btree *b;
unsigned interior;
int ret;
BUG_ON(nr_nodes > BTREE_RESERVE_MAX);
BUG_ON(nr_nodes[0] + nr_nodes[1] > BTREE_RESERVE_MAX);
/*
* Protects reaping from the btree node cache and using the btree node
......@@ -423,23 +431,28 @@ static int bch2_btree_reserve_get(struct btree_update *as, unsigned nr_nodes,
if (ret)
return ret;
while (as->nr_prealloc_nodes < nr_nodes) {
b = __bch2_btree_node_alloc(c, &as->disk_res,
flags & BTREE_INSERT_NOWAIT
? NULL : cl, flags);
if (IS_ERR(b)) {
ret = PTR_ERR(b);
goto err_free;
}
for (interior = 0; interior < 2; interior++) {
struct prealloc_nodes *p = as->prealloc_nodes + interior;
while (p->nr < nr_nodes[interior]) {
b = __bch2_btree_node_alloc(c, &as->disk_res,
flags & BTREE_INSERT_NOWAIT
? NULL : cl,
interior, flags);
if (IS_ERR(b)) {
ret = PTR_ERR(b);
goto err;
}
as->prealloc_nodes[as->nr_prealloc_nodes++] = b;
p->b[p->nr++] = b;
}
}
bch2_btree_cache_cannibalize_unlock(c);
return 0;
err_free:
err:
bch2_btree_cache_cannibalize_unlock(c);
trace_btree_reserve_get_fail(c, nr_nodes, cl);
trace_btree_reserve_get_fail(c, nr_nodes[0] + nr_nodes[1], cl);
return ret;
}
......@@ -942,7 +955,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
u64 start_time = local_clock();
int disk_res_flags = (flags & BTREE_INSERT_NOFAIL)
? BCH_DISK_RESERVATION_NOFAIL : 0;
unsigned nr_nodes;
unsigned nr_nodes[2];
unsigned update_level = level;
int journal_flags = 0;
int ret = 0;
......@@ -954,10 +967,11 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
closure_init_stack(&cl);
retry:
nr_nodes = 0;
nr_nodes[0] = nr_nodes[1] = 0;
update_level = level;
while (1) {
nr_nodes += 1 + split;
nr_nodes[!!update_level] += 1 + split;
update_level++;
if (!btree_path_node(path, update_level))
......@@ -972,7 +986,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
/* Might have to allocate a new root: */
if (update_level < BTREE_MAX_DEPTH)
nr_nodes += 1;
nr_nodes[1] += 1;
if (!bch2_btree_path_upgrade(trans, path, U8_MAX)) {
trace_trans_restart_iter_upgrade(trans->fn, _RET_IP_,
......@@ -1050,7 +1064,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
}
ret = bch2_disk_reservation_get(c, &as->disk_res,
nr_nodes * btree_sectors(c),
(nr_nodes[0] + nr_nodes[1]) * btree_sectors(c),
c->opts.metadata_replicas,
disk_res_flags);
if (ret)
......@@ -1085,11 +1099,6 @@ static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b)
list_del_init(&b->list);
mutex_unlock(&c->btree_cache.lock);
if (b->c.level)
six_lock_pcpu_alloc(&b->c.lock);
else
six_lock_pcpu_free(&b->c.lock);
mutex_lock(&c->btree_root_lock);
BUG_ON(btree_node_root(c, b) &&
(b->c.level < btree_node_root(c, b)->c.level ||
......@@ -2015,7 +2024,7 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite
return -EINTR;
}
new_hash = bch2_btree_node_mem_alloc(c);
new_hash = bch2_btree_node_mem_alloc(c, false);
}
path->intent_ref++;
......@@ -2091,7 +2100,7 @@ void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id)
closure_sync(&cl);
} while (ret);
b = bch2_btree_node_mem_alloc(c);
b = bch2_btree_node_mem_alloc(c, false);
bch2_btree_cache_cannibalize_unlock(c);
set_btree_node_fake(b);
......
......@@ -76,8 +76,10 @@ struct btree_update {
struct journal_entry_pin journal;
/* Preallocated nodes we reserve when we start the update: */
struct btree *prealloc_nodes[BTREE_UPDATE_NODES_MAX];
unsigned nr_prealloc_nodes;
struct prealloc_nodes {
struct btree *b[BTREE_UPDATE_NODES_MAX];
unsigned nr;
} prealloc_nodes[2];
/* Nodes being freed: */
struct keylist old_keys;
......
......@@ -443,6 +443,11 @@ static void bch2_cached_btree_node_to_text(struct printbuf *out, struct bch_fs *
bch2_flags_to_text(out, bch2_btree_node_flags, b->flags);
pr_newline(out);
pr_buf(out, "pcpu read locks: ");
pr_tab(out);
pr_buf(out, "%u", b->c.lock.readers != NULL);
pr_newline(out);
pr_buf(out, "written:");
pr_tab(out);
pr_buf(out, "%u", b->written);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment