Commit 6357d607 authored by Kent Overstreet, committed by Kent Overstreet

bcachefs: Journal updates to interior nodes

Previously, the btree has always been self-contained and internally
consistent on disk without needing anything from the journal - the journal
just contained pointers to the btree roots.

However, this meant that btree node split or compact operations - i.e.
anything that changes btree node topology and involves updates to
interior nodes - required the interior btree node to be written
immediately, which meant emitting a btree node write that's mostly empty
(using 4k of space on disk, if the filesystem blocksize is 4k, to write
perhaps only ~100 bytes of new keys).

More importantly, this meant most btree node writes had to be FUA, and
consumer drives have a history of slow and/or buggy FUA support - other
filesystems have been bitten by this.

This patch changes the interior btree update path to journal updates to
interior nodes, after the writes for the new btree nodes have completed.
Best of all, it turns out to simplify the interior node update path
somewhat.
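
In outline, the interior-node path in btree_update_nodes_written() now
reserves journal space for the keys destined for the parent node and emits
them as journal entries once the new nodes' writes have completed. A
condensed sketch of that flow, pulled from the diff below (error handling
and locking omitted - not the verbatim kernel code):

	journal_u64s = 0;
	for_each_keylist_key(&as->parent_keys, k)
		journal_u64s += jset_u64s(k->k.u64s);

	ret = bch2_journal_res_get(&c->journal, &res, journal_u64s,
				   JOURNAL_RES_GET_RESERVED);

	for_each_keylist_key(&as->parent_keys, k)
		bch2_journal_add_entry(&c->journal, &res,
				       BCH_JSET_ENTRY_btree_keys,
				       as->btree_id, as->level,
				       k, k->k.u64s);

	bch2_journal_res_put(&c->journal, &res);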
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
parent f44a6a71
@@ -1316,7 +1316,8 @@ LE64_BITMASK(BCH_SB_ERASURE_CODE,	struct bch_sb, flags[3], 0, 16);
 	x(new_extent_overwrite,		9)	\
 	x(incompressible,		10)	\
 	x(btree_ptr_v2,			11)	\
-	x(extents_above_btree_updates,	12)
+	x(extents_above_btree_updates,	12)	\
+	x(btree_updates_journalled,	13)
 
 #define BCH_SB_FEATURES_ALL				\
 	((1ULL << BCH_FEATURE_new_siphash)|		\
......
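For context, each x(name, nr) entry in the list above becomes a feature
enum constant that the superblock feature masks shift by. A minimal sketch
of the usual x-macro expansion (the wrapper-macro and enum names here are
assumed for illustration, not shown in this hunk):

	#define x(f, nr)	BCH_FEATURE_##f = nr,
	enum bch_sb_feature {
		BCH_SB_FEATURES()	/* assumed wrapper macro for the x() entries */
	};
	#undef x

	/* which is what the feature masks shift by, e.g.: */
	/* features[0] |= 1ULL << BCH_FEATURE_btree_updates_journalled; */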
@@ -1260,7 +1260,6 @@ void bch2_btree_complete_write(struct bch_fs *c, struct btree *b,
 		closure_put(&((struct btree_update *) new)->cl);
 
 	bch2_journal_pin_drop(&c->journal, &w->journal);
-	closure_wake_up(&w->wait);
 }
 
 static void btree_node_write_done(struct bch_fs *c, struct btree *b)
@@ -1618,9 +1617,6 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
 	wbio->wbio.bio.bi_end_io	= btree_node_write_endio;
 	wbio->wbio.bio.bi_private	= b;
 
-	if (b->c.level || !b->written)
-		wbio->wbio.bio.bi_opf |= REQ_FUA;
-
 	bch2_bio_map(&wbio->wbio.bio, data, sectors_to_write << 9);
 
 	/*
@@ -1794,12 +1790,11 @@ ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *c, char *buf)
 	rcu_read_lock();
 	for_each_cached_btree(b, c, tbl, i, pos) {
 		unsigned long flags = READ_ONCE(b->flags);
-		unsigned idx = (flags & (1 << BTREE_NODE_write_idx)) != 0;
 
 		if (!(flags & (1 << BTREE_NODE_dirty)))
 			continue;
 
-		pr_buf(&out, "%p d %u n %u l %u w %u b %u r %u:%lu c %u p %u\n",
+		pr_buf(&out, "%p d %u n %u l %u w %u b %u r %u:%lu\n",
 		       b,
 		       (flags & (1 << BTREE_NODE_dirty)) != 0,
 		       (flags & (1 << BTREE_NODE_need_write)) != 0,
@@ -1807,9 +1802,7 @@ ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *c, char *buf)
 		       b->written,
 		       !list_empty_careful(&b->write_blocked),
 		       b->will_make_reachable != 0,
-		       b->will_make_reachable & 1,
-		       b->writes[ idx].wait.list.first != NULL,
-		       b->writes[!idx].wait.list.first != NULL);
+		       b->will_make_reachable & 1);
 	}
 	rcu_read_unlock();
......
@@ -102,19 +102,20 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *);
 void bch2_btree_node_write(struct bch_fs *, struct btree *,
 			   enum six_lock_type);
 
-static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b)
+static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b,
+					    enum six_lock_type lock_held)
 {
 	while (b->written &&
 	       btree_node_need_write(b) &&
 	       btree_node_may_write(b)) {
 		if (!btree_node_write_in_flight(b)) {
-			bch2_btree_node_write(c, b, SIX_LOCK_read);
+			bch2_btree_node_write(c, b, lock_held);
 			break;
 		}
 
 		six_unlock_read(&b->c.lock);
 		btree_node_wait_on_io(b);
-		btree_node_lock_type(c, b, SIX_LOCK_read);
+		btree_node_lock_type(c, b, lock_held);
 	}
 }
 
@@ -131,7 +132,7 @@ do {									\
 		new |= (1 << BTREE_NODE_need_write);			\
 	} while ((v = cmpxchg(&(_b)->flags, old, new)) != old);		\
 									\
-	btree_node_write_if_need(_c, _b);				\
+	btree_node_write_if_need(_c, _b, SIX_LOCK_read);		\
 } while (0)
 
 void bch2_btree_flush_all_reads(struct bch_fs *);
......
@@ -53,7 +53,6 @@ struct bset_tree {
 
 struct btree_write {
 	struct journal_entry_pin	journal;
-	struct closure_waitlist		wait;
 };
 
 struct btree_alloc {
@@ -547,8 +546,6 @@ static inline bool btree_node_type_needs_gc(enum btree_node_type type)
 struct btree_root {
 	struct btree		*b;
 
-	struct btree_update	*as;
-
 	/* On disk root - see async splits: */
 	__BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
 	u8			level;
......
@@ -12,6 +12,7 @@ void bch2_btree_node_lock_for_insert(struct bch_fs *, struct btree *,
 				     struct btree_iter *);
 bool bch2_btree_bset_insert_key(struct btree_iter *, struct btree *,
 				struct btree_node_iter *, struct bkey_i *);
+void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64);
 
 enum btree_insert_flags {
 	__BTREE_INSERT_NOUNLOCK,
......
@@ -24,7 +24,6 @@
 static void btree_node_will_make_reachable(struct btree_update *,
 					   struct btree *);
 static void btree_update_drop_new_node(struct bch_fs *, struct btree *);
-static void bch2_btree_set_root_ondisk(struct bch_fs *, struct btree *, int);
 
 /* Debug code: */
 
@@ -260,16 +259,17 @@ void bch2_btree_node_free_inmem(struct bch_fs *c, struct btree *b,
 }
 
 static void bch2_btree_node_free_ondisk(struct bch_fs *c,
-					struct pending_btree_node_free *pending)
+					struct pending_btree_node_free *pending,
+					u64 journal_seq)
 {
 	BUG_ON(!pending->index_update_done);
 
 	bch2_mark_key(c, bkey_i_to_s_c(&pending->key),
-		      0, 0, NULL, 0, BTREE_TRIGGER_OVERWRITE);
+		      0, 0, NULL, journal_seq, BTREE_TRIGGER_OVERWRITE);
 
 	if (gc_visited(c, gc_phase(GC_PHASE_PENDING_DELETE)))
 		bch2_mark_key(c, bkey_i_to_s_c(&pending->key),
-			      0, 0, NULL, 0,
+			      0, 0, NULL, journal_seq,
 			      BTREE_TRIGGER_OVERWRITE|
 			      BTREE_TRIGGER_GC);
 }
@@ -585,10 +585,13 @@ static void bch2_btree_update_free(struct btree_update *as)
 {
 	struct bch_fs *c = as->c;
 
+	bch2_journal_preres_put(&c->journal, &as->journal_preres);
+
+	bch2_journal_pin_drop(&c->journal, &as->journal);
 	bch2_journal_pin_flush(&c->journal, &as->journal);
 
-	BUG_ON(as->nr_new_nodes);
-	BUG_ON(as->nr_pending);
+	BUG_ON((as->nr_new_nodes || as->nr_pending) &&
+	       !bch2_journal_error(&c->journal));;
 
 	if (as->reserve)
 		bch2_btree_reserve_put(c, as->reserve);
@@ -603,13 +606,10 @@ static void bch2_btree_update_free(struct btree_update *as)
 	mutex_unlock(&c->btree_interior_update_lock);
 }
 
-static void btree_update_nodes_reachable(struct closure *cl)
+static void btree_update_nodes_reachable(struct btree_update *as, u64 seq)
 {
-	struct btree_update *as = container_of(cl, struct btree_update, cl);
 	struct bch_fs *c = as->c;
 
-	bch2_journal_pin_drop(&c->journal, &as->journal);
-
 	mutex_lock(&c->btree_interior_update_lock);
 
 	while (as->nr_new_nodes) {
@@ -630,39 +630,22 @@ static void btree_update_nodes_reachable(struct closure *cl)
 	}
 
 	while (as->nr_pending)
-		bch2_btree_node_free_ondisk(c, &as->pending[--as->nr_pending]);
+		bch2_btree_node_free_ondisk(c, &as->pending[--as->nr_pending],
+					    seq);
 
 	mutex_unlock(&c->btree_interior_update_lock);
-
-	closure_wake_up(&as->wait);
-
-	bch2_btree_update_free(as);
-}
-
-static void btree_update_wait_on_journal(struct closure *cl)
-{
-	struct btree_update *as = container_of(cl, struct btree_update, cl);
-	struct bch_fs *c = as->c;
-	int ret;
-
-	ret = bch2_journal_open_seq_async(&c->journal, as->journal_seq, cl);
-	if (ret == -EAGAIN) {
-		continue_at(cl, btree_update_wait_on_journal, system_wq);
-		return;
-	}
-	if (ret < 0)
-		goto err;
-
-	bch2_journal_flush_seq_async(&c->journal, as->journal_seq, cl);
-err:
-	continue_at(cl, btree_update_nodes_reachable, system_wq);
 }
 
 static void btree_update_nodes_written(struct closure *cl)
 {
 	struct btree_update *as = container_of(cl, struct btree_update, cl);
+	struct journal_res res = { 0 };
 	struct bch_fs *c = as->c;
 	struct btree *b;
+	struct bset *i;
+	struct bkey_i *k;
+	unsigned journal_u64s = 0;
+	int ret;
 
 	/*
 	 * We did an update to a parent node where the pointers we added pointed
@@ -671,7 +654,7 @@ static void btree_update_nodes_written(struct closure *cl)
 	 */
 	mutex_lock(&c->btree_interior_update_lock);
 	as->nodes_written = true;
-retry:
+again:
 	as = list_first_entry_or_null(&c->btree_interior_updates_unwritten,
 				      struct btree_update, unwritten_list);
 	if (!as || !as->nodes_written) {
@@ -679,31 +662,53 @@ static void btree_update_nodes_written(struct closure *cl)
 		return;
 	}
 
+	b = as->b;
+	if (b && !six_trylock_intent(&b->c.lock)) {
+		mutex_unlock(&c->btree_interior_update_lock);
+		btree_node_lock_type(c, b, SIX_LOCK_intent);
+		six_unlock_intent(&b->c.lock);
+		goto out;
+	}
+
+	journal_u64s = 0;
+
+	if (as->mode != BTREE_INTERIOR_UPDATING_ROOT)
+		for_each_keylist_key(&as->parent_keys, k)
+			journal_u64s += jset_u64s(k->k.u64s);
+
+	ret = bch2_journal_res_get(&c->journal, &res, journal_u64s,
+				   JOURNAL_RES_GET_RESERVED);
+	if (ret) {
+		BUG_ON(!bch2_journal_error(&c->journal));
+		/* can't unblock btree writes */
+		goto free_update;
+	}
+
+	if (as->mode != BTREE_INTERIOR_UPDATING_ROOT)
+		for_each_keylist_key(&as->parent_keys, k)
+			bch2_journal_add_entry(&c->journal, &res,
+					       BCH_JSET_ENTRY_btree_keys,
+					       as->btree_id,
+					       as->level,
+					       k, k->k.u64s);
+
 	switch (as->mode) {
 	case BTREE_INTERIOR_NO_UPDATE:
 		BUG();
 	case BTREE_INTERIOR_UPDATING_NODE:
-		/* The usual case: */
-		b = READ_ONCE(as->b);
-
-		if (!six_trylock_read(&b->c.lock)) {
-			mutex_unlock(&c->btree_interior_update_lock);
-			btree_node_lock_type(c, b, SIX_LOCK_read);
-			six_unlock_read(&b->c.lock);
-			mutex_lock(&c->btree_interior_update_lock);
-			goto retry;
-		}
-
-		BUG_ON(!btree_node_dirty(b));
-		closure_wait(&btree_current_write(b)->wait, &as->cl);
+		/* @b is the node we did the final insert into: */
+		BUG_ON(!res.ref);
 
+		six_lock_write(&b->c.lock, NULL, NULL);
 		list_del(&as->write_blocked_list);
 
-		/*
-		 * for flush_held_btree_writes() waiting on updates to flush or
-		 * nodes to be writeable:
-		 */
-		closure_wake_up(&c->btree_interior_update_wait);
+		i = btree_bset_last(b);
+		i->journal_seq = cpu_to_le64(
+			max(res.seq,
+			    le64_to_cpu(i->journal_seq)));
+
+		bch2_btree_add_journal_pin(c, b, res.seq);
+		six_unlock_write(&b->c.lock);
 
 		list_del(&as->unwritten_list);
 		mutex_unlock(&c->btree_interior_update_lock);
@@ -712,82 +717,51 @@ static void btree_update_nodes_written(struct closure *cl)
 		 * b->write_blocked prevented it from being written, so
 		 * write it now if it needs to be written:
 		 */
-		bch2_btree_node_write_cond(c, b, true);
-		six_unlock_read(&b->c.lock);
-
-		continue_at(&as->cl, btree_update_nodes_reachable, system_wq);
+		btree_node_write_if_need(c, b, SIX_LOCK_intent);
+		six_unlock_intent(&b->c.lock);
 		break;
 
 	case BTREE_INTERIOR_UPDATING_AS:
-		/*
-		 * The btree node we originally updated has been freed and is
-		 * being rewritten - so we need to write anything here, we just
-		 * need to signal to that btree_update that it's ok to make the
-		 * new replacement node visible:
-		 */
-		closure_put(&as->parent_as->cl);
-
-		/*
-		 * and then we have to wait on that btree_update to finish:
-		 */
-		closure_wait(&as->parent_as->wait, &as->cl);
+		BUG_ON(b);
 
 		list_del(&as->unwritten_list);
 		mutex_unlock(&c->btree_interior_update_lock);
-
-		continue_at(&as->cl, btree_update_nodes_reachable, system_wq);
 		break;
 
-	case BTREE_INTERIOR_UPDATING_ROOT:
-		/* b is the new btree root: */
-		b = READ_ONCE(as->b);
-
-		if (!six_trylock_read(&b->c.lock)) {
-			mutex_unlock(&c->btree_interior_update_lock);
-			btree_node_lock_type(c, b, SIX_LOCK_read);
-			six_unlock_read(&b->c.lock);
-			mutex_lock(&c->btree_interior_update_lock);
-			goto retry;
-		}
-
-		BUG_ON(c->btree_roots[b->c.btree_id].as != as);
-		c->btree_roots[b->c.btree_id].as = NULL;
+	case BTREE_INTERIOR_UPDATING_ROOT: {
+		struct btree_root *r = &c->btree_roots[as->btree_id];
 
-		bch2_btree_set_root_ondisk(c, b, WRITE);
+		BUG_ON(b);
 
-		/*
-		 * We don't have to wait anything anything here (before
-		 * btree_update_nodes_reachable frees the old nodes
-		 * ondisk) - we've ensured that the very next journal write will
-		 * have the pointer to the new root, and before the allocator
-		 * can reuse the old nodes it'll have to do a journal commit:
-		 */
-		six_unlock_read(&b->c.lock);
+		mutex_lock(&c->btree_root_lock);
+		bkey_copy(&r->key, as->parent_keys.keys);
+		r->level = as->level;
+		r->alive = true;
+		c->btree_roots_dirty = true;
+		mutex_unlock(&c->btree_root_lock);
 
 		list_del(&as->unwritten_list);
 		mutex_unlock(&c->btree_interior_update_lock);
+		break;
+	}
+	}
 
-		/*
-		 * Bit of funny circularity going on here we have to break:
-		 *
-		 * We have to drop our journal pin before writing the journal
-		 * entry that points to the new btree root: else, we could
-		 * deadlock if the journal currently happens to be full.
-		 *
-		 * This mean we're dropping the journal pin _before_ the new
-		 * nodes are technically reachable - but this is safe, because
-		 * after the bch2_btree_set_root_ondisk() call above they will
-		 * be reachable as of the very next journal write:
-		 */
 	bch2_journal_pin_drop(&c->journal, &as->journal);
-
-	as->journal_seq = bch2_journal_last_unwritten_seq(&c->journal);
-
-	btree_update_wait_on_journal(&as->cl);
-	break;
-	}
+	bch2_journal_res_put(&c->journal, &res);
+	bch2_journal_preres_put(&c->journal, &as->journal_preres);
+
+	btree_update_nodes_reachable(as, res.seq);
+free_update:
+	bch2_btree_update_free(as);
+	/*
+	 * for flush_held_btree_writes() waiting on updates to flush or
+	 * nodes to be writeable:
+	 */
+	closure_wake_up(&c->btree_interior_update_wait);
+out:
 	mutex_lock(&c->btree_interior_update_lock);
-	goto retry;
+	goto again;
 }
 
 /*
@@ -806,46 +780,10 @@ static void btree_update_updated_node(struct btree_update *as, struct btree *b)
 
 	as->mode	= BTREE_INTERIOR_UPDATING_NODE;
 	as->b		= b;
+	as->level	= b->c.level;
 	list_add(&as->write_blocked_list, &b->write_blocked);
 
 	mutex_unlock(&c->btree_interior_update_lock);
-
-	/*
-	 * In general, when you're staging things in a journal that will later
-	 * be written elsewhere, and you also want to guarantee ordering: that
-	 * is, if you have updates a, b, c, after a crash you should never see c
-	 * and not a or b - there's a problem:
-	 *
-	 * If the final destination of the update(s) (i.e. btree node) can be
-	 * written/flushed _before_ the relevant journal entry - oops, that
-	 * breaks ordering, since the various leaf nodes can be written in any
-	 * order.
-	 *
-	 * Normally we use bset->journal_seq to deal with this - if during
-	 * recovery we find a btree node write that's newer than the newest
-	 * journal entry, we just ignore it - we don't need it, anything we're
-	 * supposed to have (that we reported as completed via fsync()) will
-	 * still be in the journal, and as far as the state of the journal is
-	 * concerned that btree node write never happened.
-	 *
-	 * That breaks when we're rewriting/splitting/merging nodes, since we're
-	 * mixing btree node writes that haven't happened yet with previously
-	 * written data that has been reported as completed to the journal.
-	 *
-	 * Thus, before making the new nodes reachable, we have to wait the
-	 * newest journal sequence number we have data for to be written (if it
-	 * hasn't been yet).
-	 */
-	bch2_journal_wait_on_seq(&c->journal, as->journal_seq, &as->cl);
-}
-
-static void interior_update_flush(struct journal *j,
-				  struct journal_entry_pin *pin, u64 seq)
-{
-	struct btree_update *as =
-		container_of(pin, struct btree_update, journal);
-
-	bch2_journal_flush_seq_async(j, as->journal_seq, NULL);
 }
 
 static void btree_update_reparent(struct btree_update *as,
@@ -853,10 +791,10 @@ static void btree_update_reparent(struct btree_update *as,
 {
 	struct bch_fs *c = as->c;
 
+	lockdep_assert_held(&c->btree_interior_update_lock);
+
 	child->b = NULL;
 	child->mode = BTREE_INTERIOR_UPDATING_AS;
-	child->parent_as = as;
-	closure_get(&as->cl);
 
 	/*
 	 * When we write a new btree root, we have to drop our journal pin
@@ -867,46 +805,24 @@ static void btree_update_reparent(struct btree_update *as,
 	 * just transfer the journal pin to the new interior update so
 	 * btree_update_nodes_written() can drop it.
 	 */
-	bch2_journal_pin_copy(&c->journal, &as->journal,
-			      &child->journal, interior_update_flush);
+	bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal, NULL);
 	bch2_journal_pin_drop(&c->journal, &child->journal);
-
-	as->journal_seq = max(as->journal_seq, child->journal_seq);
 }
 
-static void btree_update_updated_root(struct btree_update *as)
+static void btree_update_updated_root(struct btree_update *as, struct btree *b)
 {
 	struct bch_fs *c = as->c;
-	struct btree_root *r = &c->btree_roots[as->btree_id];
-
-	mutex_lock(&c->btree_interior_update_lock);
-	list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);
 
 	BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE);
+	BUG_ON(!bch2_keylist_empty(&as->parent_keys));
 
-	/*
-	 * Old root might not be persistent yet - if so, redirect its
-	 * btree_update operation to point to us:
-	 */
-	if (r->as)
-		btree_update_reparent(as, r->as);
+	mutex_lock(&c->btree_interior_update_lock);
+	list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);
 
 	as->mode	= BTREE_INTERIOR_UPDATING_ROOT;
-	as->b		= r->b;
-	r->as		= as;
+	as->level	= b->c.level;
+	bch2_keylist_add(&as->parent_keys, &b->key);
 
 	mutex_unlock(&c->btree_interior_update_lock);
-
-	/*
-	 * When we're rewriting nodes and updating interior nodes, there's an
-	 * issue with updates that haven't been written in the journal getting
-	 * mixed together with older data - see btree_update_updated_node()
-	 * for the explanation.
-	 *
-	 * However, this doesn't affect us when we're writing a new btree root -
-	 * because to make that new root reachable we have to write out a new
-	 * journal entry, which must necessarily be newer than as->journal_seq.
-	 */
 }
 
 static void btree_node_will_make_reachable(struct btree_update *as,
@@ -983,10 +899,8 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as,
 					       struct btree *b)
 {
 	struct bch_fs *c = as->c;
-	struct closure *cl, *cl_n;
 	struct btree_update *p, *n;
 	struct btree_write *w;
-	struct bset_tree *t;
 
 	set_btree_node_dying(b);
 
@@ -995,18 +909,6 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as,
 
 	btree_interior_update_add_node_reference(as, b);
 
-	/*
-	 * Does this node have data that hasn't been written in the journal?
-	 *
-	 * If so, we have to wait for the corresponding journal entry to be
-	 * written before making the new nodes reachable - we can't just carry
-	 * over the bset->journal_seq tracking, since we'll be mixing those keys
-	 * in with keys that aren't in the journal anymore:
-	 */
-	for_each_bset(b, t)
-		as->journal_seq = max(as->journal_seq,
-				      le64_to_cpu(bset(b, t)->journal_seq));
-
 	mutex_lock(&c->btree_interior_update_lock);
 
 	/*
@@ -1030,16 +932,6 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as,
 	clear_btree_node_dirty(b);
 	clear_btree_node_need_write(b);
 
-	w = btree_current_write(b);
-
-	/*
-	 * Does this node have any btree_update operations waiting on this node
-	 * to be written?
-	 *
-	 * If so, wake them up when this btree_update operation is reachable:
-	 */
-	llist_for_each_entry_safe(cl, cl_n, llist_del_all(&w->wait.list), list)
-		llist_add(&cl->list, &as->wait.list);
-
 	/*
 	 * Does this node have unwritten data that has a pin on the journal?
 	 *
@@ -1049,13 +941,12 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as,
 	 * oldest pin of any of the nodes we're freeing. We'll release the pin
 	 * when the new nodes are persistent and reachable on disk:
 	 */
-	bch2_journal_pin_copy(&c->journal, &as->journal,
-			      &w->journal, interior_update_flush);
+	w = btree_current_write(b);
+	bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL);
 	bch2_journal_pin_drop(&c->journal, &w->journal);
 
 	w = btree_prev_write(b);
-	bch2_journal_pin_copy(&c->journal, &as->journal,
-			      &w->journal, interior_update_flush);
+	bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL);
 	bch2_journal_pin_drop(&c->journal, &w->journal);
 
 	mutex_unlock(&c->btree_interior_update_lock);
@@ -1078,6 +969,7 @@ bch2_btree_update_start(struct bch_fs *c, enum btree_id id,
 {
 	struct btree_reserve *reserve;
 	struct btree_update *as;
+	int ret;
 
 	reserve = bch2_btree_reserve_get(c, nr_nodes, flags, cl);
 	if (IS_ERR(reserve))
@@ -1094,6 +986,15 @@ bch2_btree_update_start(struct bch_fs *c, enum btree_id id,
 
 	bch2_keylist_init(&as->parent_keys, as->inline_keys);
 
+	ret = bch2_journal_preres_get(&c->journal, &as->journal_preres,
+				      jset_u64s(BKEY_BTREE_PTR_U64s_MAX) * 3, 0);
+	if (ret) {
+		bch2_btree_reserve_put(c, reserve);
+		closure_debug_destroy(&as->cl);
+		mempool_free(as, &c->btree_interior_update_pool);
+		return ERR_PTR(ret);
+	}
+
 	mutex_lock(&c->btree_interior_update_lock);
 	list_add_tail(&as->list, &c->btree_interior_update_list);
 	mutex_unlock(&c->btree_interior_update_lock);
@@ -1153,22 +1054,6 @@ static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b)
 	mutex_unlock(&c->btree_interior_update_lock);
 }
 
-static void bch2_btree_set_root_ondisk(struct bch_fs *c, struct btree *b, int rw)
-{
-	struct btree_root *r = &c->btree_roots[b->c.btree_id];
-
-	mutex_lock(&c->btree_root_lock);
-
-	BUG_ON(b != r->b);
-	bkey_copy(&r->key, &b->key);
-	r->level = b->c.level;
-	r->alive = true;
-	if (rw == WRITE)
-		c->btree_roots_dirty = true;
-
-	mutex_unlock(&c->btree_root_lock);
-}
-
 /**
  * bch_btree_set_root - update the root in memory and on disk
  *
@@ -1201,7 +1086,7 @@ static void bch2_btree_set_root(struct btree_update *as, struct btree *b,
 
 	bch2_btree_set_root_inmem(as, b);
 
-	btree_update_updated_root(as);
+	btree_update_updated_root(as, b);
 
 	/*
 	 * Unlock old root after new root is visible:
@@ -1471,6 +1356,7 @@ static void btree_split(struct btree_update *as, struct btree *b,
 		bch2_btree_build_aux_trees(n1);
 		six_unlock_write(&n1->c.lock);
 
+		if (parent)
 			bch2_keylist_add(&as->parent_keys, &n1->key);
 	}
 
@@ -1545,12 +1431,8 @@ bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b,
 	       (bkey_cmp_packed(b, k, &insert->k) >= 0))
 		;
 
-	while (!bch2_keylist_empty(keys)) {
-		insert = bch2_keylist_front(keys);
-
+	for_each_keylist_key(keys, insert)
 		bch2_insert_fixup_btree_ptr(as, b, iter, insert, &node_iter);
-		bch2_keylist_pop_front(keys);
-	}
 
 	btree_update_updated_node(as, b);
 
@@ -2107,7 +1989,7 @@ static void __bch2_btree_node_update_key(struct bch_fs *c,
 		bkey_copy(&b->key, new_key);
 	}
 
-	btree_update_updated_root(as);
+	btree_update_updated_root(as, b);
 	bch2_btree_node_unlock_write(b, iter);
 }
......
@@ -69,8 +69,10 @@ struct btree_update {
 	unsigned			nodes_written:1;
 
 	enum btree_id			btree_id;
+	u8				level;
 
 	struct btree_reserve		*reserve;
+	struct journal_preres		journal_preres;
 
 	/*
 	 * BTREE_INTERIOR_UPDATING_NODE:
@@ -83,18 +85,6 @@ struct btree_update {
 	struct btree			*b;
 	struct list_head		write_blocked_list;
 
-	/*
-	 * BTREE_INTERIOR_UPDATING_AS: btree node we updated was freed, so now
-	 * we're now blocking another btree_update
-	 * @parent_as - btree_update that's waiting on our nodes to finish
-	 * writing, before it can make new nodes visible on disk
-	 * @wait - list of child btree_updates that are waiting on this
-	 * btree_update to make all the new nodes visible before they can free
-	 * their old btree nodes
-	 */
-	struct btree_update		*parent_as;
-	struct closure_waitlist		wait;
-
 	/*
 	 * We may be freeing nodes that were dirty, and thus had journal entries
 	 * pinned: we need to transfer the oldest of those pins to the
@@ -103,8 +93,6 @@ struct btree_update {
 	 */
 	struct journal_entry_pin	journal;
 
-	u64				journal_seq;
-
 	/*
 	 * Nodes being freed:
 	 * Protected by c->btree_node_pending_free_lock
......
@@ -145,6 +145,17 @@ static void btree_node_flush1(struct journal *j, struct journal_entry_pin *pin,
 	return __btree_node_flush(j, pin, 1, seq);
 }
 
+inline void bch2_btree_add_journal_pin(struct bch_fs *c,
+				       struct btree *b, u64 seq)
+{
+	struct btree_write *w = btree_current_write(b);
+
+	bch2_journal_pin_add(&c->journal, seq, &w->journal,
+			     btree_node_write_idx(b) == 0
+			     ? btree_node_flush0
+			     : btree_node_flush1);
+}
+
 static inline void __btree_journal_key(struct btree_trans *trans,
 				       enum btree_id btree_id,
 				       struct bkey_i *insert)
@@ -173,10 +184,6 @@ static void bch2_btree_journal_key(struct btree_trans *trans,
 	struct bch_fs *c = trans->c;
 	struct journal *j = &c->journal;
 	struct btree *b = iter_l(iter)->b;
-	struct btree_write *w = btree_current_write(b);
-	u64 seq = likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
-		? trans->journal_res.seq
-		: j->replay_journal_seq;
 
 	EBUG_ON(trans->journal_res.ref !=
 		!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY));
@@ -187,10 +194,10 @@ static void bch2_btree_journal_key(struct btree_trans *trans,
 			cpu_to_le64(trans->journal_res.seq);
 	}
 
-	bch2_journal_pin_add(j, seq, &w->journal,
-			     btree_node_write_idx(b) == 0
-			     ? btree_node_flush0
-			     : btree_node_flush1);
+	bch2_btree_add_journal_pin(c, b,
+			likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
+			? trans->journal_res.seq
+			: j->replay_journal_seq);
 
 	if (unlikely(!btree_node_dirty(b)))
 		set_btree_node_dirty(b);
......
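With bch2_btree_add_journal_pin() factored out, both the leaf insert path
and the new interior-node path now take their journal pin through the same
helper; schematically (condensed from the hunks above, not new code):

	/* leaf inserts, in bch2_btree_journal_key(): */
	bch2_btree_add_journal_pin(c, b,
			likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
			? trans->journal_res.seq
			: j->replay_journal_seq);

	/* interior updates, in btree_update_nodes_written(): */
	bch2_btree_add_journal_pin(c, b, res.seq);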
@@ -958,6 +958,7 @@ int bch2_fs_mark_dirty(struct bch_fs *c)
 	c->disk_sb.sb->compat[0] &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA);
 	c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_extent_overwrite;
 	c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_extents_above_btree_updates;
+	c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_btree_updates_journalled;
 
 	ret = bch2_write_super(c);
 	mutex_unlock(&c->sb_lock);
@@ -1090,6 +1091,7 @@ void bch2_fs_mark_clean(struct bch_fs *c)
 	c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_INFO;
 	c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA;
 	c->disk_sb.sb->features[0] &= ~(1ULL << BCH_FEATURE_extents_above_btree_updates);
+	c->disk_sb.sb->features[0] &= ~(1ULL << BCH_FEATURE_btree_updates_journalled);
 
 	u64s = sizeof(*sb_clean) / sizeof(u64) + c->journal.entry_u64s_reserved;
......