Commit 54ca47e1 authored by Kent Overstreet, committed by Kent Overstreet

bcachefs: Kill bch2_btree_node_get_sibling()

This patch reworks the btree node merge path to use a second btree
iterator to get the sibling node - which means
bch2_btree_node_get_sibling() can be deleted. It also uses
bch2_btree_iter_traverse_all() if necessary - which means it should be
more reliable. We don't currently even try to make it work when
trans->nounlock is set (i.e. after a BTREE_INSERT_NOUNLOCK transaction
commit) - hopefully this will be a worthwhile tradeoff.
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
parent 1259cc31
...@@ -913,136 +913,6 @@ struct btree *bch2_btree_node_get_noiter(struct bch_fs *c, ...@@ -913,136 +913,6 @@ struct btree *bch2_btree_node_get_noiter(struct bch_fs *c,
return b; return b;
} }
/*
 * bch2_btree_node_get_sibling - look up and intent-lock the sibling of @b
 * @c:		filesystem
 * @iter:	iterator that currently has @b (and its parent) locked
 * @b:		node whose sibling we want, at level b->c.level
 * @sib:	btree_prev_sib or btree_next_sib
 *
 * Returns the sibling node with an intent lock held, NULL if there is no
 * sibling (no parent locked, @b is first/last in the parent, or a topology
 * error was detected), or ERR_PTR(-EINTR) if locks were lost and the
 * transaction must be restarted.
 */
struct btree *bch2_btree_node_get_sibling(struct bch_fs *c,
					  struct btree_iter *iter,
					  struct btree *b,
					  enum btree_node_sibling sib)
{
	struct btree_trans *trans = iter->trans;
	struct btree *parent;
	struct btree_node_iter node_iter;
	struct bkey_packed *k;
	struct bkey_buf tmp;
	struct btree *ret = NULL;
	unsigned level = b->c.level;

	bch2_bkey_buf_init(&tmp);

	/* No parent node in the iterator means @b is the root: no sibling. */
	parent = btree_iter_node(iter, level + 1);
	if (!parent)
		return NULL;

	/*
	 * There's a corner case where a btree_iter might have a node locked
	 * that is just outside its current pos - when
	 * bch2_btree_iter_set_pos_same_leaf() gets to the end of the node.
	 *
	 * But the lock ordering checks in __bch2_btree_node_lock() go off of
	 * iter->pos, not the node's key: so if the iterator is marked as
	 * needing to be traversed, we risk deadlock if we don't bail out here:
	 */
	if (iter->uptodate >= BTREE_ITER_NEED_TRAVERSE)
		return ERR_PTR(-EINTR);

	/* We need the parent locked to walk its keys for the sibling: */
	if (!bch2_btree_node_relock(iter, level + 1)) {
		ret = ERR_PTR(-EINTR);
		goto out;
	}

	/*
	 * Work on a copy of the parent's node iterator so we don't perturb
	 * @iter's position; it must currently point at @b's key:
	 */
	node_iter = iter->l[parent->c.level].iter;

	k = bch2_btree_node_iter_peek_all(&node_iter, parent);
	BUG_ON(bkey_cmp_left_packed(parent, k, &b->key.k.p));

	/* Step one key backwards or forwards to find the sibling's key: */
	k = sib == btree_prev_sib
		? bch2_btree_node_iter_prev(&node_iter, parent)
		: (bch2_btree_node_iter_advance(&node_iter, parent),
		   bch2_btree_node_iter_peek(&node_iter, parent));

	/* NULL means @b is the first/last child - no sibling: */
	if (!k)
		goto out;

	bch2_bkey_buf_unpack(&tmp, c, parent, k);

	ret = bch2_btree_node_get(c, iter, tmp.k, level,
				  SIX_LOCK_intent, _THIS_IP_);

	if (PTR_ERR_OR_ZERO(ret) == -EINTR && !trans->nounlock) {
		struct btree_iter *linked;

		if (!bch2_btree_node_relock(iter, level + 1))
			goto out;

		/*
		 * We might have got -EINTR because trylock failed, and we're
		 * holding other locks that would cause us to deadlock:
		 */
		trans_for_each_iter(trans, linked)
			if (btree_iter_lock_cmp(iter, linked) < 0)
				__bch2_btree_iter_unlock(linked);

		/*
		 * Drop @b's lock before blocking on the previous sibling -
		 * presumably because a prev sibling orders before @b in the
		 * locking order (NOTE(review): confirm against
		 * __bch2_btree_node_lock()):
		 */
		if (sib == btree_prev_sib)
			btree_node_unlock(iter, level);

		/* Retry, this time allowed to block: */
		ret = bch2_btree_node_get(c, iter, tmp.k, level,
					  SIX_LOCK_intent, _THIS_IP_);

		/*
		 * before btree_iter_relock() calls btree_iter_verify_locks():
		 */
		if (btree_lock_want(iter, level + 1) == BTREE_NODE_UNLOCKED)
			btree_node_unlock(iter, level + 1);

		/*
		 * If we dropped @b's lock above and can't retake it, give the
		 * sibling back and restart:
		 */
		if (!bch2_btree_node_relock(iter, level)) {
			btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK);

			if (!IS_ERR(ret)) {
				six_unlock_intent(&ret->c.lock);
				ret = ERR_PTR(-EINTR);
			}
		}

		bch2_trans_relock(trans);
	}
out:
	/* Drop the parent lock if the iterator doesn't want to keep it: */
	if (btree_lock_want(iter, level + 1) == BTREE_NODE_UNLOCKED)
		btree_node_unlock(iter, level + 1);

	/*
	 * On restart, bump locks_want so the retraverse takes intent locks
	 * through the parent level (NOTE(review): confirm intent):
	 */
	if (PTR_ERR_OR_ZERO(ret) == -EINTR)
		bch2_btree_iter_upgrade(iter, level + 2);

	BUG_ON(!IS_ERR(ret) && !btree_node_locked(iter, level));

	if (!IS_ERR_OR_NULL(ret)) {
		/*
		 * Topology check: order the pair as (n1 = left, n2 = right)
		 * and verify they are adjacent - n1's max key + 1 must equal
		 * n2's min key:
		 */
		struct btree *n1 = ret, *n2 = b;

		if (sib != btree_prev_sib)
			swap(n1, n2);

		if (bpos_cmp(bpos_successor(n1->key.k.p),
			     n2->data->min_key)) {
			char buf1[200], buf2[200];

			bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(&n1->key));
			bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(&n2->key));

			/* Report the inconsistency and pretend there's no sibling: */
			bch2_fs_inconsistent(c, "btree topology error at btree %s level %u:\n"
					     "prev: %s\n"
					     "next: %s\n",
					     bch2_btree_ids[iter->btree_id], level,
					     buf1, buf2);

			six_unlock_intent(&ret->c.lock);
			ret = NULL;
		}
	}

	bch2_btree_trans_verify_locks(trans);
	bch2_bkey_buf_exit(&tmp, c);

	return ret;
}
void bch2_btree_node_prefetch(struct bch_fs *c, struct btree_iter *iter, void bch2_btree_node_prefetch(struct bch_fs *c, struct btree_iter *iter,
const struct bkey_i *k, const struct bkey_i *k,
enum btree_id btree_id, unsigned level) enum btree_id btree_id, unsigned level)
...@@ -1082,7 +952,7 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, ...@@ -1082,7 +952,7 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
" format: u64s %u fields %u %u %u %u %u\n" " format: u64s %u fields %u %u %u %u %u\n"
" unpack fn len: %u\n" " unpack fn len: %u\n"
" bytes used %zu/%zu (%zu%% full)\n" " bytes used %zu/%zu (%zu%% full)\n"
" sib u64s: %u, %u (merge threshold %zu)\n" " sib u64s: %u, %u (merge threshold %u)\n"
" nr packed keys %u\n" " nr packed keys %u\n"
" nr unpacked keys %u\n" " nr unpacked keys %u\n"
" floats %zu\n" " floats %zu\n"
...@@ -1099,7 +969,7 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, ...@@ -1099,7 +969,7 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
b->nr.live_u64s * 100 / btree_max_u64s(c), b->nr.live_u64s * 100 / btree_max_u64s(c),
b->sib_u64s[0], b->sib_u64s[0],
b->sib_u64s[1], b->sib_u64s[1],
BTREE_FOREGROUND_MERGE_THRESHOLD(c), c->btree_foreground_merge_threshold,
b->nr.packed_keys, b->nr.packed_keys,
b->nr.unpacked_keys, b->nr.unpacked_keys,
stats.floats, stats.floats,
......
...@@ -26,9 +26,6 @@ struct btree *bch2_btree_node_get(struct bch_fs *, struct btree_iter *, ...@@ -26,9 +26,6 @@ struct btree *bch2_btree_node_get(struct bch_fs *, struct btree_iter *,
struct btree *bch2_btree_node_get_noiter(struct bch_fs *, const struct bkey_i *, struct btree *bch2_btree_node_get_noiter(struct bch_fs *, const struct bkey_i *,
enum btree_id, unsigned, bool); enum btree_id, unsigned, bool);
struct btree *bch2_btree_node_get_sibling(struct bch_fs *, struct btree_iter *,
struct btree *, enum btree_node_sibling);
void bch2_btree_node_prefetch(struct bch_fs *, struct btree_iter *, void bch2_btree_node_prefetch(struct bch_fs *, struct btree_iter *,
const struct bkey_i *, enum btree_id, unsigned); const struct bkey_i *, enum btree_id, unsigned);
......
...@@ -1539,36 +1539,50 @@ void __bch2_foreground_maybe_merge(struct bch_fs *c, ...@@ -1539,36 +1539,50 @@ void __bch2_foreground_maybe_merge(struct bch_fs *c,
enum btree_node_sibling sib) enum btree_node_sibling sib)
{ {
struct btree_trans *trans = iter->trans; struct btree_trans *trans = iter->trans;
struct btree_iter *sib_iter = NULL;
struct btree_update *as; struct btree_update *as;
struct bkey_format_state new_s; struct bkey_format_state new_s;
struct bkey_format new_f; struct bkey_format new_f;
struct bkey_i delete; struct bkey_i delete;
struct btree *b, *m, *n, *prev, *next, *parent; struct btree *b, *m, *n, *prev, *next, *parent;
struct bpos sib_pos;
size_t sib_u64s; size_t sib_u64s;
int ret = 0; int ret = 0;
if (trans->nounlock)
return;
BUG_ON(!btree_node_locked(iter, level)); BUG_ON(!btree_node_locked(iter, level));
retry: retry:
ret = bch2_btree_iter_traverse(iter);
if (ret)
goto err;
BUG_ON(!btree_node_locked(iter, level)); BUG_ON(!btree_node_locked(iter, level));
b = iter->l[level].b; b = iter->l[level].b;
parent = btree_node_parent(iter, b); if ((sib == btree_prev_sib && !bpos_cmp(b->data->min_key, POS_MIN)) ||
if (!parent) (sib == btree_next_sib && !bpos_cmp(b->data->max_key, POS_MAX))) {
b->sib_u64s[sib] = U16_MAX;
goto out; goto out;
}
if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c)) sib_pos = sib == btree_prev_sib
goto out; ? bpos_predecessor(b->data->min_key)
: bpos_successor(b->data->max_key);
/* XXX: can't be holding read locks */ sib_iter = bch2_trans_get_node_iter(trans, iter->btree_id,
m = bch2_btree_node_get_sibling(c, iter, b, sib); sib_pos, U8_MAX, level,
if (IS_ERR(m)) { BTREE_ITER_INTENT);
ret = PTR_ERR(m); ret = bch2_btree_iter_traverse(sib_iter);
if (ret)
goto err; goto err;
}
/* NULL means no sibling: */ m = sib_iter->l[level].b;
if (!m) {
if (btree_node_parent(iter, b) !=
btree_node_parent(sib_iter, m)) {
b->sib_u64s[sib] = U16_MAX; b->sib_u64s[sib] = U16_MAX;
goto out; goto out;
} }
...@@ -1581,6 +1595,8 @@ void __bch2_foreground_maybe_merge(struct bch_fs *c, ...@@ -1581,6 +1595,8 @@ void __bch2_foreground_maybe_merge(struct bch_fs *c,
next = m; next = m;
} }
BUG_ON(bkey_cmp(bpos_successor(prev->data->max_key), next->data->min_key));
bch2_bkey_format_init(&new_s); bch2_bkey_format_init(&new_s);
bch2_bkey_format_add_pos(&new_s, prev->data->min_key); bch2_bkey_format_add_pos(&new_s, prev->data->min_key);
__bch2_btree_calc_format(&new_s, prev); __bch2_btree_calc_format(&new_s, prev);
...@@ -1598,23 +1614,21 @@ void __bch2_foreground_maybe_merge(struct bch_fs *c, ...@@ -1598,23 +1614,21 @@ void __bch2_foreground_maybe_merge(struct bch_fs *c,
} }
sib_u64s = min(sib_u64s, btree_max_u64s(c)); sib_u64s = min(sib_u64s, btree_max_u64s(c));
sib_u64s = min(sib_u64s, (size_t) U16_MAX - 1);
b->sib_u64s[sib] = sib_u64s; b->sib_u64s[sib] = sib_u64s;
if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c)) { if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold)
six_unlock_intent(&m->c.lock);
goto out; goto out;
}
parent = btree_node_parent(iter, b);
as = bch2_btree_update_start(iter, level, as = bch2_btree_update_start(iter, level,
btree_update_reserve_required(c, parent) + 1, btree_update_reserve_required(c, parent) + 1,
flags| flags|
BTREE_INSERT_NOFAIL| BTREE_INSERT_NOFAIL|
BTREE_INSERT_USE_RESERVE); BTREE_INSERT_USE_RESERVE);
ret = PTR_ERR_OR_ZERO(as); ret = PTR_ERR_OR_ZERO(as);
if (ret) { if (ret)
six_unlock_intent(&m->c.lock);
goto err; goto err;
}
trace_btree_merge(c, b); trace_btree_merge(c, b);
...@@ -1648,6 +1662,7 @@ void __bch2_foreground_maybe_merge(struct bch_fs *c, ...@@ -1648,6 +1662,7 @@ void __bch2_foreground_maybe_merge(struct bch_fs *c,
bch2_btree_update_get_open_buckets(as, n); bch2_btree_update_get_open_buckets(as, n);
six_lock_increment(&b->c.lock, SIX_LOCK_intent); six_lock_increment(&b->c.lock, SIX_LOCK_intent);
six_lock_increment(&m->c.lock, SIX_LOCK_intent);
bch2_btree_iter_node_drop(iter, b); bch2_btree_iter_node_drop(iter, b);
bch2_btree_iter_node_drop(iter, m); bch2_btree_iter_node_drop(iter, m);
...@@ -1663,6 +1678,7 @@ void __bch2_foreground_maybe_merge(struct bch_fs *c, ...@@ -1663,6 +1678,7 @@ void __bch2_foreground_maybe_merge(struct bch_fs *c,
bch2_btree_update_done(as); bch2_btree_update_done(as);
out: out:
bch2_btree_trans_verify_locks(trans); bch2_btree_trans_verify_locks(trans);
bch2_trans_iter_free(trans, sib_iter);
/* /*
* Don't downgrade locks here: we're called after successful insert, * Don't downgrade locks here: we're called after successful insert,
...@@ -1675,11 +1691,16 @@ void __bch2_foreground_maybe_merge(struct bch_fs *c, ...@@ -1675,11 +1691,16 @@ void __bch2_foreground_maybe_merge(struct bch_fs *c,
*/ */
return; return;
err: err:
BUG_ON(ret == -EAGAIN && (flags & BTREE_INSERT_NOUNLOCK)); bch2_trans_iter_put(trans, sib_iter);
sib_iter = NULL;
if (ret == -EINTR && bch2_trans_relock(trans)) {
ret = 0;
goto retry;
}
if (ret == -EINTR && !(flags & BTREE_INSERT_NOUNLOCK)) { if (ret == -EINTR && !(flags & BTREE_INSERT_NOUNLOCK)) {
bch2_trans_unlock(trans); ret = bch2_btree_iter_traverse_all(trans);
ret = bch2_btree_iter_traverse(iter);
if (!ret) if (!ret)
goto retry; goto retry;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment