Commit 1f0f731f authored by Kent Overstreet

bcachefs: Btree splits now only take the locks they need

Previously, bch2_btree_update_start() would always take all intent
locks, all the way up to the root.

We finally have data from users for whom this became a scalability
issue, so this patch changes bch2_btree_update_start() to take only the
locks it needs.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
parent 969576ec
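For illustration, here is a minimal standalone sketch of the locking strategy this patch moves to (simplified stand-in types and helpers, not bcachefs's actual API): climb from the level being updated, upgrading to one more intent lock per level, and stop as soon as a parent has room for the keys the split below it will insert.

/*
 * Illustration only: simplified stand-ins, not bcachefs's real types.
 */
#include <stdbool.h>
#include <stdio.h>

#define MAX_DEPTH	4
#define KEY_U64S	3	/* size of one btree pointer key, say */

struct node {
	unsigned	free_u64s;	/* space remaining in this node */
	bool		intent_locked;
};

static bool insert_fits(const struct node *n, unsigned u64s)
{
	return n->free_u64s >= u64s;
}

/*
 * Climb from @update_level, taking one intent lock per level, stopping
 * as soon as a parent can absorb the keys the split below will insert;
 * only a path of entirely full nodes locks all the way to the root.
 * Returns the highest level locked.
 */
static unsigned lock_levels_for_insert(struct node path[MAX_DEPTH],
				       unsigned update_level, bool split)
{
	while (1) {
		update_level++;
		if (update_level >= MAX_DEPTH)
			return MAX_DEPTH;	/* may need a new root */

		/* upgrade to an intent lock on this one level only */
		path[update_level].intent_locked = true;

		/* a splitting child adds two keys here, otherwise one */
		if (insert_fits(&path[update_level], KEY_U64S * (1 + split)))
			return update_level;

		split = true;	/* this node is full: it splits too */
	}
}

int main(void)
{
	/* full leaf, roomy parent: only levels 0 and 1 get locked */
	struct node path[MAX_DEPTH] = {
		{ .free_u64s = 0 }, { .free_u64s = 64 },
		{ .free_u64s = 64 }, { .free_u64s = 64 },
	};

	path[0].intent_locked = true;	/* leaf already intent-locked */
	printf("locked up to level %u\n",
	       lock_levels_for_insert(path, 0, true));
	return 0;
}

The patch itself additionally reserves nr_nodes[] entries for every level that will split, and restarts the transaction (the new BCH_ERR_transaction_restart_split_race case below) if a node above the levels it locked turns out to need splitting after all.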
fs/bcachefs/btree_update_interior.c
@@ -1070,23 +1070,23 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
 		nr_nodes[!!update_level] += 1 + split;
 		update_level++;

-		if (!btree_path_node(path, update_level))
-			break;
+		ret = bch2_btree_path_upgrade(trans, path, update_level + 1);
+		if (ret)
+			return ERR_PTR(ret);

-		/*
-		 * XXX: figure out how far we might need to split,
-		 * instead of locking/reserving all the way to the root:
-		 */
-		split = update_level + 1 < BTREE_MAX_DEPTH;
-	}
+		if (!btree_path_node(path, update_level)) {
+			/* Allocating new root? */
+			nr_nodes[1] += split;
+			update_level = BTREE_MAX_DEPTH;
+			break;
+		}

-	/* Might have to allocate a new root: */
-	if (update_level < BTREE_MAX_DEPTH)
-		nr_nodes[1] += 1;
+		if (bch2_btree_node_insert_fits(c, path->l[update_level].b,
+				BKEY_BTREE_PTR_U64s_MAX * (1 + split)))
+			break;

-	ret = bch2_btree_path_upgrade(trans, path, U8_MAX);
-	if (ret)
-		return ERR_PTR(ret);
+		split = true;
+	}

 	if (flags & BTREE_INSERT_GC_LOCK_HELD)
 		lockdep_assert_held(&c->gc_lock);
@@ -1108,6 +1108,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
 	as->mode	= BTREE_INTERIOR_NO_UPDATE;
 	as->took_gc_lock = !(flags & BTREE_INSERT_GC_LOCK_HELD);
 	as->btree_id	= path->btree_id;
+	as->update_level = update_level;
 	INIT_LIST_HEAD(&as->list);
 	INIT_LIST_HEAD(&as->unwritten_list);
 	INIT_LIST_HEAD(&as->write_blocked_list);
@@ -1511,7 +1512,7 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans,
 	int ret = 0;

 	BUG_ON(!parent && (b != btree_node_root(c, b)));
-	BUG_ON(!btree_node_intent_locked(path, btree_node_root(c, b)->c.level));
+	BUG_ON(parent && !btree_node_intent_locked(path, b->c.level + 1));

 	bch2_btree_interior_update_will_free_node(as, b);
@@ -1698,7 +1699,7 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t
 	int ret;

 	lockdep_assert_held(&c->gc_lock);
-	BUG_ON(!btree_node_intent_locked(path, btree_node_root(c, b)->c.level));
+	BUG_ON(!btree_node_intent_locked(path, b->c.level));
 	BUG_ON(!b->c.level);
 	BUG_ON(!as || as->b);
 	bch2_verify_keylist_sorted(keys);
@@ -1738,6 +1739,13 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t
 	btree_node_interior_verify(c, b);
 	return 0;
 split:
+	/*
+	 * We could attempt to avoid the transaction restart, by calling
+	 * bch2_btree_path_upgrade() and allocating more nodes:
+	 */
+	if (b->c.level >= as->update_level)
+		return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race);
+
 	return btree_split(as, trans, path, b, keys, flags);
 }
@@ -1763,7 +1771,7 @@ int bch2_btree_split_leaf(struct btree_trans *trans,
 	bch2_btree_update_done(as, trans);

-	for (l = path->level + 1; btree_path_node(path, l) && !ret; l++)
+	for (l = path->level + 1; btree_node_intent_locked(path, l) && !ret; l++)
 		ret = bch2_foreground_maybe_merge(trans, path, l, flags);

 	return ret;
fs/bcachefs/btree_update_interior.h
@@ -52,6 +52,7 @@ struct btree_update {
 	unsigned		took_gc_lock:1;

 	enum btree_id		btree_id;
+	unsigned		update_level;

 	struct disk_reservation	disk_res;
 	struct journal_preres	journal_preres;