Commit 19d54324 authored by Kent Overstreet's avatar Kent Overstreet Committed by Kent Overstreet

bcachefs: Really don't hold btree locks while btree IOs are in flight

This is something we've attempted to stick to for quite some time, as it
helps guarantee filesystem latency - but there's a few remaining paths
that this patch fixes.

This is also necessary for an upcoming patch to update btree pointers
after every btree write - since the btree write completion path will now
be doing btree operations.
Signed-off-by: default avatarKent Overstreet <kent.overstreet@linux.dev>
parent e3a67bdb
...@@ -187,6 +187,17 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush) ...@@ -187,6 +187,17 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
int ret = 0; int ret = 0;
lockdep_assert_held(&bc->lock); lockdep_assert_held(&bc->lock);
wait_on_io:
if (b->flags & ((1U << BTREE_NODE_dirty)|
(1U << BTREE_NODE_read_in_flight)|
(1U << BTREE_NODE_write_in_flight))) {
if (!flush)
return -ENOMEM;
/* XXX: waiting on IO with btree cache lock held */
bch2_btree_node_wait_on_read(b);
bch2_btree_node_wait_on_write(b);
}
if (!six_trylock_intent(&b->c.lock)) if (!six_trylock_intent(&b->c.lock))
return -ENOMEM; return -ENOMEM;
...@@ -194,25 +205,26 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush) ...@@ -194,25 +205,26 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
if (!six_trylock_write(&b->c.lock)) if (!six_trylock_write(&b->c.lock))
goto out_unlock_intent; goto out_unlock_intent;
/* recheck under lock */
if (b->flags & ((1U << BTREE_NODE_read_in_flight)|
(1U << BTREE_NODE_write_in_flight))) {
if (!flush)
goto out_unlock;
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
goto wait_on_io;
}
if (btree_node_noevict(b)) if (btree_node_noevict(b))
goto out_unlock; goto out_unlock;
if (!btree_node_may_write(b)) if (!btree_node_may_write(b))
goto out_unlock; goto out_unlock;
if (btree_node_dirty(b) && if (btree_node_dirty(b)) {
if (!flush ||
test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags)) test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags))
goto out_unlock; goto out_unlock;
if (btree_node_dirty(b) ||
btree_node_write_in_flight(b) ||
btree_node_read_in_flight(b)) {
if (!flush)
goto out_unlock;
wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
TASK_UNINTERRUPTIBLE);
/* /*
* Using the underscore version because we don't want to compact * Using the underscore version because we don't want to compact
* bsets after the write, since this node is about to be evicted * bsets after the write, since this node is about to be evicted
...@@ -224,8 +236,9 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush) ...@@ -224,8 +236,9 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
else else
__bch2_btree_node_write(c, b); __bch2_btree_node_write(c, b);
/* wait for any in flight btree write */ six_unlock_write(&b->c.lock);
btree_node_wait_on_io(b); six_unlock_intent(&b->c.lock);
goto wait_on_io;
} }
out: out:
if (b->hash_val && !ret) if (b->hash_val && !ret)
...@@ -581,6 +594,7 @@ struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c) ...@@ -581,6 +594,7 @@ struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c)
} }
BUG_ON(btree_node_hashed(b)); BUG_ON(btree_node_hashed(b));
BUG_ON(btree_node_dirty(b));
BUG_ON(btree_node_write_in_flight(b)); BUG_ON(btree_node_write_in_flight(b));
out: out:
b->flags = 0; b->flags = 0;
...@@ -634,6 +648,7 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, ...@@ -634,6 +648,7 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
{ {
struct btree_cache *bc = &c->btree_cache; struct btree_cache *bc = &c->btree_cache;
struct btree *b; struct btree *b;
u32 seq;
BUG_ON(level + 1 >= BTREE_MAX_DEPTH); BUG_ON(level + 1 >= BTREE_MAX_DEPTH);
/* /*
...@@ -663,31 +678,31 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, ...@@ -663,31 +678,31 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
return NULL; return NULL;
} }
set_btree_node_read_in_flight(b);
six_unlock_write(&b->c.lock);
seq = b->c.lock.state.seq;
six_unlock_intent(&b->c.lock);
/* Unlock before doing IO: */ /* Unlock before doing IO: */
if (iter && sync) if (iter && sync)
bch2_trans_unlock(iter->trans); bch2_trans_unlock(iter->trans);
bch2_btree_node_read(c, b, sync); bch2_btree_node_read(c, b, sync);
six_unlock_write(&b->c.lock); if (!sync)
if (!sync) {
six_unlock_intent(&b->c.lock);
return NULL; return NULL;
}
/* /*
* XXX: this will probably always fail because btree_iter_relock() * XXX: this will probably always fail because btree_iter_relock()
* currently fails for iterators that aren't pointed at a valid btree * currently fails for iterators that aren't pointed at a valid btree
* node * node
*/ */
if (iter && !bch2_trans_relock(iter->trans)) { if (iter && !bch2_trans_relock(iter->trans))
six_unlock_intent(&b->c.lock);
return ERR_PTR(-EINTR); return ERR_PTR(-EINTR);
}
if (lock_type == SIX_LOCK_read) if (!six_relock_type(&b->c.lock, lock_type, seq))
six_lock_downgrade(&b->c.lock); return ERR_PTR(-EINTR);
return b; return b;
} }
...@@ -831,11 +846,12 @@ struct btree *bch2_btree_node_get(struct bch_fs *c, struct btree_iter *iter, ...@@ -831,11 +846,12 @@ struct btree *bch2_btree_node_get(struct bch_fs *c, struct btree_iter *iter,
} }
if (unlikely(btree_node_read_in_flight(b))) { if (unlikely(btree_node_read_in_flight(b))) {
u32 seq = b->c.lock.state.seq;
six_unlock_type(&b->c.lock, lock_type); six_unlock_type(&b->c.lock, lock_type);
bch2_trans_unlock(iter->trans); bch2_trans_unlock(iter->trans);
wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, bch2_btree_node_wait_on_read(b);
TASK_UNINTERRUPTIBLE);
/* /*
* XXX: check if this always fails - btree_iter_relock() * XXX: check if this always fails - btree_iter_relock()
...@@ -844,6 +860,8 @@ struct btree *bch2_btree_node_get(struct bch_fs *c, struct btree_iter *iter, ...@@ -844,6 +860,8 @@ struct btree *bch2_btree_node_get(struct bch_fs *c, struct btree_iter *iter,
*/ */
if (iter && !bch2_trans_relock(iter->trans)) if (iter && !bch2_trans_relock(iter->trans))
return ERR_PTR(-EINTR); return ERR_PTR(-EINTR);
if (!six_relock_type(&b->c.lock, lock_type, seq))
goto retry; goto retry;
} }
...@@ -923,8 +941,7 @@ struct btree *bch2_btree_node_get_noiter(struct bch_fs *c, ...@@ -923,8 +941,7 @@ struct btree *bch2_btree_node_get_noiter(struct bch_fs *c,
} }
/* XXX: waiting on IO with btree locks held: */ /* XXX: waiting on IO with btree locks held: */
wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, __bch2_btree_node_wait_on_read(b);
TASK_UNINTERRUPTIBLE);
prefetch(b->aux_data); prefetch(b->aux_data);
...@@ -979,16 +996,24 @@ void bch2_btree_node_evict(struct bch_fs *c, const struct bkey_i *k) ...@@ -979,16 +996,24 @@ void bch2_btree_node_evict(struct bch_fs *c, const struct bkey_i *k)
b = btree_cache_find(bc, k); b = btree_cache_find(bc, k);
if (!b) if (!b)
return; return;
wait_on_io:
/* not allowed to wait on io with btree locks held: */
/* XXX we're called from btree_gc which will be holding other btree
* nodes locked
* */
__bch2_btree_node_wait_on_read(b);
__bch2_btree_node_wait_on_write(b);
six_lock_intent(&b->c.lock, NULL, NULL); six_lock_intent(&b->c.lock, NULL, NULL);
six_lock_write(&b->c.lock, NULL, NULL); six_lock_write(&b->c.lock, NULL, NULL);
wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, if (btree_node_dirty(b)) {
TASK_UNINTERRUPTIBLE);
__bch2_btree_node_write(c, b); __bch2_btree_node_write(c, b);
six_unlock_write(&b->c.lock);
/* wait for any in flight btree write */ six_unlock_intent(&b->c.lock);
btree_node_wait_on_io(b); goto wait_on_io;
}
BUG_ON(btree_node_dirty(b)); BUG_ON(btree_node_dirty(b));
......
...@@ -22,6 +22,44 @@ ...@@ -22,6 +22,44 @@
#include <linux/sched/mm.h> #include <linux/sched/mm.h>
void bch2_btree_node_io_unlock(struct btree *b)
{
EBUG_ON(!btree_node_write_in_flight(b));
clear_btree_node_write_in_flight(b);
wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
}
void bch2_btree_node_io_lock(struct btree *b)
{
wait_on_bit_lock_io(&b->flags, BTREE_NODE_write_in_flight,
TASK_UNINTERRUPTIBLE);
}
void __bch2_btree_node_wait_on_read(struct btree *b)
{
wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
TASK_UNINTERRUPTIBLE);
}
void __bch2_btree_node_wait_on_write(struct btree *b)
{
wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight,
TASK_UNINTERRUPTIBLE);
}
void bch2_btree_node_wait_on_read(struct btree *b)
{
wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
TASK_UNINTERRUPTIBLE);
}
void bch2_btree_node_wait_on_write(struct btree *b)
{
wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight,
TASK_UNINTERRUPTIBLE);
}
static void verify_no_dups(struct btree *b, static void verify_no_dups(struct btree *b,
struct bkey_packed *start, struct bkey_packed *start,
struct bkey_packed *end) struct bkey_packed *end)
...@@ -432,7 +470,8 @@ void bch2_btree_init_next(struct btree_trans *trans, ...@@ -432,7 +470,8 @@ void bch2_btree_init_next(struct btree_trans *trans,
EBUG_ON(iter && iter->l[b->c.level].b != b); EBUG_ON(iter && iter->l[b->c.level].b != b);
BUG_ON(bset_written(b, bset(b, &b->set[1]))); BUG_ON(bset_written(b, bset(b, &b->set[1])));
if (b->nsets == MAX_BSETS) { if (b->nsets == MAX_BSETS &&
!btree_node_write_in_flight(b)) {
unsigned log_u64s[] = { unsigned log_u64s[] = {
ilog2(bset_u64s(&b->set[0])), ilog2(bset_u64s(&b->set[0])),
ilog2(bset_u64s(&b->set[1])), ilog2(bset_u64s(&b->set[1])),
...@@ -1402,8 +1441,6 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b, ...@@ -1402,8 +1441,6 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
btree_pos_to_text(&PBUF(buf), c, b); btree_pos_to_text(&PBUF(buf), c, b);
trace_btree_read(c, b); trace_btree_read(c, b);
set_btree_node_read_in_flight(b);
if (bch2_verify_all_btree_replicas && if (bch2_verify_all_btree_replicas &&
!btree_node_read_all_replicas(c, b, sync)) !btree_node_read_all_replicas(c, b, sync))
return; return;
...@@ -1480,6 +1517,8 @@ int bch2_btree_root_read(struct bch_fs *c, enum btree_id id, ...@@ -1480,6 +1517,8 @@ int bch2_btree_root_read(struct bch_fs *c, enum btree_id id,
bkey_copy(&b->key, k); bkey_copy(&b->key, k);
BUG_ON(bch2_btree_node_hash_insert(&c->btree_cache, b, level, id)); BUG_ON(bch2_btree_node_hash_insert(&c->btree_cache, b, level, id));
set_btree_node_read_in_flight(b);
bch2_btree_node_read(c, b, true); bch2_btree_node_read(c, b, true);
if (btree_node_read_error(b)) { if (btree_node_read_error(b)) {
...@@ -1525,7 +1564,7 @@ static void btree_node_write_done(struct bch_fs *c, struct btree *b) ...@@ -1525,7 +1564,7 @@ static void btree_node_write_done(struct bch_fs *c, struct btree *b)
struct btree_write *w = btree_prev_write(b); struct btree_write *w = btree_prev_write(b);
bch2_btree_complete_write(c, b, w); bch2_btree_complete_write(c, b, w);
btree_node_io_unlock(b); bch2_btree_node_io_unlock(b);
} }
static void bch2_btree_node_write_error(struct bch_fs *c, static void bch2_btree_node_write_error(struct bch_fs *c,
...@@ -1707,6 +1746,8 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b) ...@@ -1707,6 +1746,8 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b)
bool validate_before_checksum = false; bool validate_before_checksum = false;
void *data; void *data;
BUG_ON(btree_node_write_in_flight(b));
if (test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags)) if (test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags))
return; return;
...@@ -1734,7 +1775,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b) ...@@ -1734,7 +1775,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b)
* XXX waiting on btree writes with btree locks held - * XXX waiting on btree writes with btree locks held -
* this can deadlock, and we hit the write error path * this can deadlock, and we hit the write error path
*/ */
btree_node_wait_on_io(b); bch2_btree_node_wait_on_write(b);
continue; continue;
} }
......
...@@ -52,24 +52,12 @@ struct btree_write_bio { ...@@ -52,24 +52,12 @@ struct btree_write_bio {
struct bch_write_bio wbio; struct bch_write_bio wbio;
}; };
static inline void btree_node_io_unlock(struct btree *b) void bch2_btree_node_io_unlock(struct btree *);
{ void bch2_btree_node_io_lock(struct btree *);
EBUG_ON(!btree_node_write_in_flight(b)); void __bch2_btree_node_wait_on_read(struct btree *);
clear_btree_node_write_in_flight(b); void __bch2_btree_node_wait_on_write(struct btree *);
wake_up_bit(&b->flags, BTREE_NODE_write_in_flight); void bch2_btree_node_wait_on_read(struct btree *);
} void bch2_btree_node_wait_on_write(struct btree *);
static inline void btree_node_io_lock(struct btree *b)
{
wait_on_bit_lock_io(&b->flags, BTREE_NODE_write_in_flight,
TASK_UNINTERRUPTIBLE);
}
static inline void btree_node_wait_on_io(struct btree *b)
{
wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight,
TASK_UNINTERRUPTIBLE);
}
static inline bool btree_node_may_write(struct btree *b) static inline bool btree_node_may_write(struct btree *b)
{ {
...@@ -169,7 +157,7 @@ static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b, ...@@ -169,7 +157,7 @@ static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b,
} }
six_unlock_type(&b->c.lock, lock_held); six_unlock_type(&b->c.lock, lock_held);
btree_node_wait_on_io(b); bch2_btree_node_wait_on_write(b);
btree_node_lock_type(c, b, lock_held); btree_node_lock_type(c, b, lock_held);
} }
} }
......
...@@ -567,7 +567,7 @@ static void btree_update_nodes_written(struct btree_update *as) ...@@ -567,7 +567,7 @@ static void btree_update_nodes_written(struct btree_update *as)
six_unlock_read(&old->c.lock); six_unlock_read(&old->c.lock);
if (seq == as->old_nodes_seq[i]) if (seq == as->old_nodes_seq[i])
btree_node_wait_on_io(old); bch2_btree_node_wait_on_write(old);
} }
/* /*
......
...@@ -133,7 +133,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) ...@@ -133,7 +133,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
if (c->opts.nochanges) if (c->opts.nochanges)
return; return;
btree_node_io_lock(b); bch2_btree_node_io_lock(b);
mutex_lock(&c->verify_lock); mutex_lock(&c->verify_lock);
if (!c->verify_ondisk) { if (!c->verify_ondisk) {
...@@ -176,7 +176,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) ...@@ -176,7 +176,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
} }
out: out:
mutex_unlock(&c->verify_lock); mutex_unlock(&c->verify_lock);
btree_node_io_unlock(b); bch2_btree_node_io_unlock(b);
} }
#ifdef CONFIG_DEBUG_FS #ifdef CONFIG_DEBUG_FS
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment