Commit d8601afc authored by Kent Overstreet, committed by Kent Overstreet

bcachefs: Simplify journal replay

With BTREE_ITER_WITH_JOURNAL, there are no longer any restrictions on the
order in which we have to replay keys from the journal, and we can also
start up journal reclaim right away - and delete a bunch of code.
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
parent 8e432d98
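
The heart of the change is in bch2_journal_replay() below: the old three ordered passes (alloc keys into the key cache, then interior nodes, then leaves), gated by the BCH_FS_ALLOC_REPLAY_DONE, BCH_FS_BTREE_INTERIOR_REPLAY_DONE and JOURNAL_RECLAIM_STARTED bits, collapse into one loop over keys sorted by journal sequence number alone. As a rough illustration, here is a minimal standalone C sketch of that single-pass scheme; the struct and field names are simplified stand-ins for struct journal_key, not the real bcachefs types:

/* Toy model of single-pass journal replay; illustrative, not bcachefs code. */
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <stdint.h>

/* Stand-in for struct journal_key, reduced to what ordering needs: */
struct key {
	uint64_t journal_seq;	/* position in the journal */
	unsigned level;		/* btree level; no longer affects replay order */
	bool	 allocated;	/* synthesized during recovery, not read from the journal */
};

/* The new comparator: journal sequence number only. */
static int seq_cmp(const void *_l, const void *_r)
{
	const struct key *l = _l, *r = _r;

	return (l->journal_seq > r->journal_seq) -
	       (l->journal_seq < r->journal_seq);
}

int main(void)
{
	struct key keys[] = {
		{ .journal_seq = 7, .level = 1 },
		{ .journal_seq = 3, .level = 0 },
		{ .journal_seq = 3, .level = 0, .allocated = true },
	};
	size_t nr = sizeof(keys) / sizeof(keys[0]);
	uint64_t replay_pos = 0;

	qsort(keys, nr, sizeof(keys[0]), seq_cmp);

	for (size_t i = 0; i < nr; i++) {
		/*
		 * Only keys that actually came from the journal advance the
		 * replay position (and so release journal pins); allocated
		 * keys have no journal sequence of their own.
		 */
		if (!keys[i].allocated)
			replay_pos = keys[i].journal_seq;

		printf("replay seq %llu level %u (position now %llu)\n",
		       (unsigned long long)keys[i].journal_seq,
		       keys[i].level,
		       (unsigned long long)replay_pos);
	}
	return 0;
}

Note how a key's btree level no longer influences ordering, and only keys actually read from the journal advance the replay position - mirroring the new journal_sort_seq_cmp() and the !k->allocated check in the hunks that follow.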
@@ -902,8 +902,7 @@ static void discard_one_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b)
 static bool allocator_thread_running(struct bch_dev *ca)
 {
 	unsigned state = ca->mi.state == BCH_MEMBER_STATE_rw &&
-		test_bit(BCH_FS_ALLOCATOR_RUNNING, &ca->fs->flags) &&
-		test_bit(BCH_FS_ALLOC_REPLAY_DONE, &ca->fs->flags)
+		test_bit(BCH_FS_ALLOCATOR_RUNNING, &ca->fs->flags)
 		? ALLOCATOR_running
 		: ALLOCATOR_stopped;
 	alloc_thread_set_state(ca, state);
@@ -510,8 +510,6 @@ enum {
 	BCH_FS_INITIAL_GC_DONE,
 	BCH_FS_INITIAL_GC_UNFIXED,
 	BCH_FS_TOPOLOGY_REPAIR_DONE,
-	BCH_FS_ALLOC_REPLAY_DONE,
-	BCH_FS_BTREE_INTERIOR_REPLAY_DONE,
 	BCH_FS_FSCK_DONE,
 	BCH_FS_STARTED,
 	BCH_FS_RW,
@@ -16,8 +16,7 @@ static inline bool bch2_btree_key_cache_must_wait(struct bch_fs *c)
 	size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys);
 	size_t max_dirty = 4096 + (nr_keys * 3) / 4;
 
-	return nr_dirty > max_dirty &&
-		test_bit(JOURNAL_RECLAIM_STARTED, &c->journal.flags);
+	return nr_dirty > max_dirty;
 }
 
 int bch2_btree_key_cache_journal_flush(struct journal *,
@@ -45,7 +45,7 @@ static void btree_node_interior_verify(struct bch_fs *c, struct btree *b)
 	BUG_ON(!b->c.level);
 
-	if (!test_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags))
+	if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))
 		return;
 
 	bch2_btree_node_iter_init_from_start(&iter, b);
@@ -1851,9 +1851,6 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b)
 {
 	struct async_btree_rewrite *a;
 
-	if (!test_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags))
-		return;
-
 	if (!percpu_ref_tryget(&c->writes))
 		return;
@@ -206,9 +206,6 @@ static bool btree_insert_key_leaf(struct btree_trans *trans,
 	int old_live_u64s = b->nr.live_u64s;
 	int live_u64s_added, u64s_added;
 
-	EBUG_ON(!insert->level &&
-		!test_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags));
-
 	if (unlikely(!bch2_btree_bset_insert_key(trans, insert->path, b,
 					&insert_l(insert)->iter, insert->k)))
 		return false;
@@ -489,9 +489,6 @@ static size_t journal_flush_pins(struct journal *j, u64 seq_to_flush,
 	u64 seq;
 	int err;
 
-	if (!test_bit(JOURNAL_RECLAIM_STARTED, &j->flags))
-		return 0;
-
 	lockdep_assert_held(&j->reclaim_lock);
 
 	while (1) {
@@ -689,8 +686,6 @@ static int bch2_journal_reclaim_thread(void *arg)
 	set_freezable();
 
-	kthread_wait_freezable(test_bit(JOURNAL_RECLAIM_STARTED, &j->flags));
-
 	j->last_flushed = jiffies;
 
 	while (!ret && !kthread_should_stop()) {
@@ -148,7 +148,6 @@ enum journal_space_from {
 enum {
 	JOURNAL_REPLAY_DONE,
 	JOURNAL_STARTED,
-	JOURNAL_RECLAIM_STARTED,
 	JOURNAL_NEED_WRITE,
 	JOURNAL_MAY_GET_UNRESERVED,
 	JOURNAL_MAY_SKIP_FLUSH,
@@ -474,8 +474,8 @@ static void replay_now_at(struct journal *j, u64 seq)
 		bch2_journal_pin_put(j, j->replay_journal_seq++);
 }
 
-static int __bch2_journal_replay_key(struct btree_trans *trans,
-				     struct journal_key *k)
+static int bch2_journal_replay_key(struct btree_trans *trans,
+				   struct journal_key *k)
 {
 	struct btree_iter iter;
 	unsigned iter_flags =
@@ -484,7 +484,7 @@ static int __bch2_journal_replay_key(struct btree_trans *trans,
 	int ret;
 
 	if (!k->level && k->btree_id == BTREE_ID_alloc)
-		iter_flags |= BTREE_ITER_CACHED|BTREE_ITER_CACHED_NOFILL;
+		iter_flags |= BTREE_ITER_CACHED;
 
 	bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
 				  BTREE_MAX_DEPTH, k->level,
@@ -503,29 +503,12 @@ static int __bch2_journal_replay_key(struct btree_trans *trans,
 	return ret;
 }
 
-static int bch2_journal_replay_key(struct bch_fs *c, struct journal_key *k)
-{
-	unsigned commit_flags =
-		BTREE_INSERT_LAZY_RW|
-		BTREE_INSERT_NOFAIL|
-		BTREE_INSERT_JOURNAL_RESERVED;
-
-	if (!k->allocated)
-		commit_flags |= BTREE_INSERT_JOURNAL_REPLAY;
-
-	return bch2_trans_do(c, NULL, NULL, commit_flags,
-			     __bch2_journal_replay_key(&trans, k));
-}
-
 static int journal_sort_seq_cmp(const void *_l, const void *_r)
 {
 	const struct journal_key *l = *((const struct journal_key **)_l);
 	const struct journal_key *r = *((const struct journal_key **)_r);
 
-	return cmp_int(r->level, l->level) ?:
-		cmp_int(l->journal_seq, r->journal_seq) ?:
-		cmp_int(l->btree_id, r->btree_id) ?:
-		bpos_cmp(l->k->k.p, r->k->k.p);
+	return cmp_int(l->journal_seq, r->journal_seq);
 }
 
 static int bch2_journal_replay(struct bch_fs *c)
@@ -533,10 +516,7 @@ static int bch2_journal_replay(struct bch_fs *c)
 	struct journal_keys *keys = &c->journal_keys;
 	struct journal_key **keys_sorted, *k;
 	struct journal *j = &c->journal;
-	struct bch_dev *ca;
-	unsigned idx;
 	size_t i;
-	u64 seq;
 	int ret;
 
 	keys_sorted = kmalloc_array(sizeof(*keys_sorted), keys->nr, GFP_KERNEL);
@@ -555,73 +535,25 @@ static int bch2_journal_replay(struct bch_fs *c)
 		replay_now_at(j, keys->journal_seq_base);
 	}
-	seq = j->replay_journal_seq;
-
-	/*
-	 * First replay updates to the alloc btree - these will only update the
-	 * btree key cache:
-	 */
-	for (i = 0; i < keys->nr; i++) {
-		k = keys_sorted[i];
-
-		cond_resched();
-
-		if (!k->level && k->btree_id == BTREE_ID_alloc) {
-			j->replay_journal_seq = keys->journal_seq_base + k->journal_seq;
-			ret = bch2_journal_replay_key(c, k);
-			if (ret)
-				goto err;
-		}
-	}
-
-	/* Now we can start the allocator threads: */
-	set_bit(BCH_FS_ALLOC_REPLAY_DONE, &c->flags);
-	for_each_member_device(ca, c, idx)
-		bch2_wake_allocator(ca);
-
-	/*
-	 * Next replay updates to interior btree nodes:
-	 */
-	for (i = 0; i < keys->nr; i++) {
-		k = keys_sorted[i];
-
-		cond_resched();
-
-		if (k->level) {
-			j->replay_journal_seq = keys->journal_seq_base + k->journal_seq;
-			ret = bch2_journal_replay_key(c, k);
-			if (ret)
-				goto err;
-		}
-	}
-
-	/*
-	 * Now that the btree is in a consistent state, we can start journal
-	 * reclaim (which will be flushing entries from the btree key cache back
-	 * to the btree:
-	 */
-	set_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags);
-	set_bit(JOURNAL_RECLAIM_STARTED, &j->flags);
-	journal_reclaim_kick(j);
-
-	j->replay_journal_seq = seq;
-
-	/*
-	 * Now replay leaf node updates:
-	 */
 	for (i = 0; i < keys->nr; i++) {
 		k = keys_sorted[i];
 
 		cond_resched();
 
-		if (k->level || k->btree_id == BTREE_ID_alloc)
-			continue;
-
-		replay_now_at(j, keys->journal_seq_base + k->journal_seq);
+		if (!k->allocated)
+			replay_now_at(j, keys->journal_seq_base + k->journal_seq);
 
-		ret = bch2_journal_replay_key(c, k);
-		if (ret)
+		ret = bch2_trans_do(c, NULL, NULL,
+				    BTREE_INSERT_LAZY_RW|
+				    BTREE_INSERT_NOFAIL|
+				    BTREE_INSERT_JOURNAL_RESERVED|
+				    (!k->allocated ? BTREE_INSERT_JOURNAL_REPLAY : 0),
+			     bch2_journal_replay_key(&trans, k));
+		if (ret) {
+			bch_err(c, "journal replay: error %d while replaying key at btree %s level %u",
+				ret, bch2_btree_ids[k->btree_id], k->level);
 			goto err;
+		}
 	}
 
 	replay_now_at(j, j->replay_journal_seq_end);
@@ -629,14 +561,9 @@ static int bch2_journal_replay(struct bch_fs *c)
 	bch2_journal_set_replay_done(j);
 	bch2_journal_flush_all_pins(j);
 
-	kfree(keys_sorted);
-
-	return bch2_journal_error(j);
+	ret = bch2_journal_error(j);
 err:
-	bch_err(c, "journal replay: error %d while replaying key at btree %s level %u",
-		ret, bch2_btree_ids[k->btree_id], k->level);
-
 	kfree(keys_sorted);
 	return ret;
 }
@@ -1215,7 +1142,8 @@ int bch2_fs_recovery(struct bch_fs *c)
 	ret = bch2_journal_replay(c);
 	if (ret)
 		goto err;
-	bch_verbose(c, "journal replay done");
+	if (c->opts.verbose || !c->sb.clean)
+		bch_info(c, "journal replay done");
 
 	if (test_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags) &&
 	    !c->opts.nochanges) {
@@ -1385,10 +1313,6 @@ int bch2_fs_initialize(struct bch_fs *c)
 	for (i = 0; i < BTREE_ID_NR; i++)
 		bch2_btree_root_alloc(c, i);
 
-	set_bit(BCH_FS_ALLOC_REPLAY_DONE, &c->flags);
-	set_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags);
-	set_bit(JOURNAL_RECLAIM_STARTED, &c->journal.flags);
-
 	err = "unable to allocate journal buckets";
 	for_each_online_member(ca, c, i) {
 		ret = bch2_dev_journal_alloc(ca);