Commit 6357d607 authored by Kent Overstreet, committed by Kent Overstreet

bcachefs: Journal updates to interior nodes

Previously, the btree has always been self-contained and internally
consistent on disk, without needing anything from the journal - the
journal just contained pointers to the btree roots.

However, this meant that btree node split or compact operations - i.e.
anything that changes btree node topology and involves updates to
interior nodes - would require the interior btree node to be written
immediately, which meant emitting a btree node write that's mostly empty
(using 4k of space on disk if the filesystem blocksize is 4k to write
perhaps ~100 bytes of new keys - i.e. ~97% of the block is padding).

More importantly, this meant most btree node writes had to be FUA, and
consumer drives have a history of slow and/or buggy FUA support - other
filesystems have been bitten by this.

This patch changes the interior btree update path to journal updates to
interior nodes, after the writes for the new btree nodes have completed.
Best of all, it turns out to simplify the interior node update path
somewhat.
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
parent f44a6a71
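
Before the diff itself, a standalone sketch of the ordering the patch
establishes. This is an editorial illustration with made-up step names,
not the bcachefs API; the real logic lives in btree_update_nodes_written()
below.

#include <stdio.h>

/* Editorial model of the new interior-update ordering: keys destined
 * for an interior node are journalled only once the new child nodes
 * have been written, so the interior node itself can be written back
 * lazily and without FUA - after a crash, journal replay supplies the
 * interior keys. */
static void write_new_child_nodes(void)
{
	puts("1: write the new btree nodes, wait for completion");
}

static void journal_interior_keys(void)
{
	puts("2: journal the keys being added to the interior node");
}

static void defer_interior_write(void)
{
	puts("3: the interior node write is now deferred and non-FUA");
}

static void free_replaced_nodes(void)
{
	puts("4: free the old nodes once that journal entry is persistent");
}

int main(void)
{
	write_new_child_nodes();
	journal_interior_keys();
	defer_interior_write();
	free_replaced_nodes();
	return 0;
}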
@@ -1316,7 +1316,8 @@ LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16);
x(new_extent_overwrite, 9) \
x(incompressible, 10) \
x(btree_ptr_v2, 11) \
x(extents_above_btree_updates, 12)
x(extents_above_btree_updates, 12) \
x(btree_updates_journalled, 13)
#define BCH_SB_FEATURES_ALL \
((1ULL << BCH_FEATURE_new_siphash)| \
......
@@ -1260,7 +1260,6 @@ void bch2_btree_complete_write(struct bch_fs *c, struct btree *b,
closure_put(&((struct btree_update *) new)->cl);
bch2_journal_pin_drop(&c->journal, &w->journal);
closure_wake_up(&w->wait);
}
static void btree_node_write_done(struct bch_fs *c, struct btree *b)
@@ -1618,9 +1617,6 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
wbio->wbio.bio.bi_end_io = btree_node_write_endio;
wbio->wbio.bio.bi_private = b;
if (b->c.level || !b->written)
wbio->wbio.bio.bi_opf |= REQ_FUA;
bch2_bio_map(&wbio->wbio.bio, data, sectors_to_write << 9);
/*
@@ -1794,12 +1790,11 @@ ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *c, char *buf)
rcu_read_lock();
for_each_cached_btree(b, c, tbl, i, pos) {
unsigned long flags = READ_ONCE(b->flags);
unsigned idx = (flags & (1 << BTREE_NODE_write_idx)) != 0;
if (!(flags & (1 << BTREE_NODE_dirty)))
continue;
pr_buf(&out, "%p d %u n %u l %u w %u b %u r %u:%lu c %u p %u\n",
pr_buf(&out, "%p d %u n %u l %u w %u b %u r %u:%lu\n",
b,
(flags & (1 << BTREE_NODE_dirty)) != 0,
(flags & (1 << BTREE_NODE_need_write)) != 0,
@@ -1807,9 +1802,7 @@ ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *c, char *buf)
b->written,
!list_empty_careful(&b->write_blocked),
b->will_make_reachable != 0,
b->will_make_reachable & 1,
b->writes[ idx].wait.list.first != NULL,
b->writes[!idx].wait.list.first != NULL);
b->will_make_reachable & 1);
}
rcu_read_unlock();
......
@@ -102,19 +102,20 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *);
void bch2_btree_node_write(struct bch_fs *, struct btree *,
enum six_lock_type);
static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b)
static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b,
enum six_lock_type lock_held)
{
while (b->written &&
btree_node_need_write(b) &&
btree_node_may_write(b)) {
if (!btree_node_write_in_flight(b)) {
bch2_btree_node_write(c, b, SIX_LOCK_read);
bch2_btree_node_write(c, b, lock_held);
break;
}
six_unlock_read(&b->c.lock);
btree_node_wait_on_io(b);
btree_node_lock_type(c, b, SIX_LOCK_read);
btree_node_lock_type(c, b, lock_held);
}
}
@@ -131,7 +132,7 @@ do { \
new |= (1 << BTREE_NODE_need_write); \
} while ((v = cmpxchg(&(_b)->flags, old, new)) != old); \
\
btree_node_write_if_need(_c, _b); \
btree_node_write_if_need(_c, _b, SIX_LOCK_read); \
} while (0)
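
Aside: the cmpxchg loop in the macro above is the standard
read-modify-CAS idiom for setting a flag bit without a lock. A minimal
standalone C11 equivalent (the flag bit and names are illustrative, not
the kernel's):

#include <stdatomic.h>
#include <stdio.h>

#define BTREE_NODE_need_write	2	/* assumed bit position */

static _Atomic unsigned long node_flags;

static void set_need_write(void)
{
	unsigned long old = atomic_load(&node_flags);
	unsigned long new;

	/* retry until no other CPU raced with us on the flags word */
	do {
		new = old | (1UL << BTREE_NODE_need_write);
	} while (!atomic_compare_exchange_weak(&node_flags, &old, new));
}

int main(void)
{
	set_need_write();
	printf("flags = %#lx\n", (unsigned long) atomic_load(&node_flags));
	return 0;
}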
void bch2_btree_flush_all_reads(struct bch_fs *);
......
@@ -53,7 +53,6 @@ struct bset_tree {
struct btree_write {
struct journal_entry_pin journal;
struct closure_waitlist wait;
};
struct btree_alloc {
@@ -547,8 +546,6 @@ static inline bool btree_node_type_needs_gc(enum btree_node_type type)
struct btree_root {
struct btree *b;
struct btree_update *as;
/* On disk root - see async splits: */
__BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
u8 level;
......
@@ -12,6 +12,7 @@ void bch2_btree_node_lock_for_insert(struct bch_fs *, struct btree *,
struct btree_iter *);
bool bch2_btree_bset_insert_key(struct btree_iter *, struct btree *,
struct btree_node_iter *, struct bkey_i *);
void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64);
enum btree_insert_flags {
__BTREE_INSERT_NOUNLOCK,
......
@@ -24,7 +24,6 @@
static void btree_node_will_make_reachable(struct btree_update *,
struct btree *);
static void btree_update_drop_new_node(struct bch_fs *, struct btree *);
static void bch2_btree_set_root_ondisk(struct bch_fs *, struct btree *, int);
/* Debug code: */
@@ -260,16 +259,17 @@ void bch2_btree_node_free_inmem(struct bch_fs *c, struct btree *b,
}
static void bch2_btree_node_free_ondisk(struct bch_fs *c,
struct pending_btree_node_free *pending)
struct pending_btree_node_free *pending,
u64 journal_seq)
{
BUG_ON(!pending->index_update_done);
bch2_mark_key(c, bkey_i_to_s_c(&pending->key),
0, 0, NULL, 0, BTREE_TRIGGER_OVERWRITE);
0, 0, NULL, journal_seq, BTREE_TRIGGER_OVERWRITE);
if (gc_visited(c, gc_phase(GC_PHASE_PENDING_DELETE)))
bch2_mark_key(c, bkey_i_to_s_c(&pending->key),
0, 0, NULL, 0,
0, 0, NULL, journal_seq,
BTREE_TRIGGER_OVERWRITE|
BTREE_TRIGGER_GC);
}
@@ -585,10 +585,13 @@ static void bch2_btree_update_free(struct btree_update *as)
{
struct bch_fs *c = as->c;
bch2_journal_preres_put(&c->journal, &as->journal_preres);
bch2_journal_pin_drop(&c->journal, &as->journal);
bch2_journal_pin_flush(&c->journal, &as->journal);
BUG_ON(as->nr_new_nodes);
BUG_ON(as->nr_pending);
BUG_ON((as->nr_new_nodes || as->nr_pending) &&
!bch2_journal_error(&c->journal));
if (as->reserve)
bch2_btree_reserve_put(c, as->reserve);
@@ -603,13 +606,10 @@ static void bch2_btree_update_free(struct btree_update *as)
mutex_unlock(&c->btree_interior_update_lock);
}
static void btree_update_nodes_reachable(struct closure *cl)
static void btree_update_nodes_reachable(struct btree_update *as, u64 seq)
{
struct btree_update *as = container_of(cl, struct btree_update, cl);
struct bch_fs *c = as->c;
bch2_journal_pin_drop(&c->journal, &as->journal);
mutex_lock(&c->btree_interior_update_lock);
while (as->nr_new_nodes) {
@@ -630,39 +630,22 @@ static void btree_update_nodes_reachable(struct closure *cl)
}
while (as->nr_pending)
bch2_btree_node_free_ondisk(c, &as->pending[--as->nr_pending]);
bch2_btree_node_free_ondisk(c, &as->pending[--as->nr_pending],
seq);
mutex_unlock(&c->btree_interior_update_lock);
closure_wake_up(&as->wait);
bch2_btree_update_free(as);
}
static void btree_update_wait_on_journal(struct closure *cl)
{
struct btree_update *as = container_of(cl, struct btree_update, cl);
struct bch_fs *c = as->c;
int ret;
ret = bch2_journal_open_seq_async(&c->journal, as->journal_seq, cl);
if (ret == -EAGAIN) {
continue_at(cl, btree_update_wait_on_journal, system_wq);
return;
}
if (ret < 0)
goto err;
bch2_journal_flush_seq_async(&c->journal, as->journal_seq, cl);
err:
continue_at(cl, btree_update_nodes_reachable, system_wq);
}
static void btree_update_nodes_written(struct closure *cl)
{
struct btree_update *as = container_of(cl, struct btree_update, cl);
struct journal_res res = { 0 };
struct bch_fs *c = as->c;
struct btree *b;
struct bset *i;
struct bkey_i *k;
unsigned journal_u64s = 0;
int ret;
/*
* We did an update to a parent node where the pointers we added pointed
@@ -671,7 +654,7 @@ static void btree_update_nodes_written(struct closure *cl)
*/
mutex_lock(&c->btree_interior_update_lock);
as->nodes_written = true;
retry:
again:
as = list_first_entry_or_null(&c->btree_interior_updates_unwritten,
struct btree_update, unwritten_list);
if (!as || !as->nodes_written) {
@@ -679,31 +662,53 @@ static void btree_update_nodes_written(struct closure *cl)
return;
}
b = as->b;
if (b && !six_trylock_intent(&b->c.lock)) {
mutex_unlock(&c->btree_interior_update_lock);
btree_node_lock_type(c, b, SIX_LOCK_intent);
six_unlock_intent(&b->c.lock);
goto out;
}
journal_u64s = 0;
if (as->mode != BTREE_INTERIOR_UPDATING_ROOT)
for_each_keylist_key(&as->parent_keys, k)
journal_u64s += jset_u64s(k->k.u64s);
ret = bch2_journal_res_get(&c->journal, &res, journal_u64s,
JOURNAL_RES_GET_RESERVED);
if (ret) {
BUG_ON(!bch2_journal_error(&c->journal));
/* can't unblock btree writes */
goto free_update;
}
if (as->mode != BTREE_INTERIOR_UPDATING_ROOT)
for_each_keylist_key(&as->parent_keys, k)
bch2_journal_add_entry(&c->journal, &res,
BCH_JSET_ENTRY_btree_keys,
as->btree_id,
as->level,
k, k->k.u64s);
switch (as->mode) {
case BTREE_INTERIOR_NO_UPDATE:
BUG();
case BTREE_INTERIOR_UPDATING_NODE:
/* The usual case: */
b = READ_ONCE(as->b);
if (!six_trylock_read(&b->c.lock)) {
mutex_unlock(&c->btree_interior_update_lock);
btree_node_lock_type(c, b, SIX_LOCK_read);
six_unlock_read(&b->c.lock);
mutex_lock(&c->btree_interior_update_lock);
goto retry;
}
BUG_ON(!btree_node_dirty(b));
closure_wait(&btree_current_write(b)->wait, &as->cl);
/* @b is the node we did the final insert into: */
BUG_ON(!res.ref);
six_lock_write(&b->c.lock, NULL, NULL);
list_del(&as->write_blocked_list);
/*
* for flush_held_btree_writes() waiting on updates to flush or
* nodes to be writeable:
*/
closure_wake_up(&c->btree_interior_update_wait);
i = btree_bset_last(b);
i->journal_seq = cpu_to_le64(
max(res.seq,
le64_to_cpu(i->journal_seq)));
bch2_btree_add_journal_pin(c, b, res.seq);
six_unlock_write(&b->c.lock);
list_del(&as->unwritten_list);
mutex_unlock(&c->btree_interior_update_lock);
@@ -712,82 +717,51 @@ static void btree_update_nodes_written(struct closure *cl)
* b->write_blocked prevented it from being written, so
* write it now if it needs to be written:
*/
bch2_btree_node_write_cond(c, b, true);
six_unlock_read(&b->c.lock);
continue_at(&as->cl, btree_update_nodes_reachable, system_wq);
btree_node_write_if_need(c, b, SIX_LOCK_intent);
six_unlock_intent(&b->c.lock);
break;
case BTREE_INTERIOR_UPDATING_AS:
/*
* The btree node we originally updated has been freed and is
* being rewritten - so we don't need to write anything here, we just
* need to signal to that btree_update that it's ok to make the
* new replacement node visible:
*/
closure_put(&as->parent_as->cl);
/*
* and then we have to wait on that btree_update to finish:
*/
closure_wait(&as->parent_as->wait, &as->cl);
BUG_ON(b);
list_del(&as->unwritten_list);
mutex_unlock(&c->btree_interior_update_lock);
continue_at(&as->cl, btree_update_nodes_reachable, system_wq);
break;
case BTREE_INTERIOR_UPDATING_ROOT:
/* b is the new btree root: */
b = READ_ONCE(as->b);
if (!six_trylock_read(&b->c.lock)) {
mutex_unlock(&c->btree_interior_update_lock);
btree_node_lock_type(c, b, SIX_LOCK_read);
six_unlock_read(&b->c.lock);
mutex_lock(&c->btree_interior_update_lock);
goto retry;
}
BUG_ON(c->btree_roots[b->c.btree_id].as != as);
c->btree_roots[b->c.btree_id].as = NULL;
case BTREE_INTERIOR_UPDATING_ROOT: {
struct btree_root *r = &c->btree_roots[as->btree_id];
bch2_btree_set_root_ondisk(c, b, WRITE);
BUG_ON(b);
/*
* We don't have to wait on anything here (before
* btree_update_nodes_reachable frees the old nodes
* ondisk) - we've ensured that the very next journal write will
* have the pointer to the new root, and before the allocator
* can reuse the old nodes it'll have to do a journal commit:
*/
six_unlock_read(&b->c.lock);
mutex_lock(&c->btree_root_lock);
bkey_copy(&r->key, as->parent_keys.keys);
r->level = as->level;
r->alive = true;
c->btree_roots_dirty = true;
mutex_unlock(&c->btree_root_lock);
list_del(&as->unwritten_list);
mutex_unlock(&c->btree_interior_update_lock);
break;
}
}
/*
* Bit of funny circularity going on here that we have to break:
*
* We have to drop our journal pin before writing the journal
* entry that points to the new btree root: else, we could
* deadlock if the journal currently happens to be full.
*
* This means we're dropping the journal pin _before_ the new
* nodes are technically reachable - but this is safe, because
* after the bch2_btree_set_root_ondisk() call above they will
* be reachable as of the very next journal write:
*/
bch2_journal_pin_drop(&c->journal, &as->journal);
as->journal_seq = bch2_journal_last_unwritten_seq(&c->journal);
btree_update_wait_on_journal(&as->cl);
break;
}
bch2_journal_res_put(&c->journal, &res);
bch2_journal_preres_put(&c->journal, &as->journal_preres);
btree_update_nodes_reachable(as, res.seq);
free_update:
bch2_btree_update_free(as);
/*
* for flush_held_btree_writes() waiting on updates to flush or
* nodes to be writeable:
*/
closure_wake_up(&c->btree_interior_update_wait);
out:
mutex_lock(&c->btree_interior_update_lock);
goto retry;
goto again;
}
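
Aside: the six_trylock_intent()/btree_node_lock_type() dance above avoids
blocking on a node lock while btree_interior_update_lock is held; on
contention the mutex is dropped, the node lock is waited for, and the
whole iteration restarts. A standalone pthread sketch of that pattern
(illustrative only, not the kernel locking API):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t node_lock = PTHREAD_MUTEX_INITIALIZER;

static void process_one_update(void)
{
	pthread_mutex_lock(&list_lock);

	/* must not block on node_lock while holding list_lock */
	while (pthread_mutex_trylock(&node_lock)) {
		pthread_mutex_unlock(&list_lock);
		/* wait for the node lock without list_lock held ... */
		pthread_mutex_lock(&node_lock);
		pthread_mutex_unlock(&node_lock);
		/* ... then retake the list lock and retry */
		pthread_mutex_lock(&list_lock);
	}

	/* do the work with both locks held */
	pthread_mutex_unlock(&node_lock);
	pthread_mutex_unlock(&list_lock);
}

int main(void)
{
	process_one_update();
	puts("done");
	return 0;
}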
/*
@@ -806,46 +780,10 @@ static void btree_update_updated_node(struct btree_update *as, struct btree *b)
as->mode = BTREE_INTERIOR_UPDATING_NODE;
as->b = b;
as->level = b->c.level;
list_add(&as->write_blocked_list, &b->write_blocked);
mutex_unlock(&c->btree_interior_update_lock);
/*
* In general, when you're staging things in a journal that will later
* be written elsewhere, and you also want to guarantee ordering: that
* is, if you have updates a, b, c, after a crash you should never see c
* and not a or b - there's a problem:
*
* If the final destination of the update(s) (i.e. btree node) can be
* written/flushed _before_ the relevant journal entry - oops, that
* breaks ordering, since the various leaf nodes can be written in any
* order.
*
* Normally we use bset->journal_seq to deal with this - if during
* recovery we find a btree node write that's newer than the newest
* journal entry, we just ignore it - we don't need it, anything we're
* supposed to have (that we reported as completed via fsync()) will
* still be in the journal, and as far as the state of the journal is
* concerned that btree node write never happened.
*
* That breaks when we're rewriting/splitting/merging nodes, since we're
* mixing btree node writes that haven't happened yet with previously
* written data that has been reported as completed to the journal.
*
* Thus, before making the new nodes reachable, we have to wait for the
* newest journal sequence number we have data for to be written (if it
* hasn't been yet).
*/
bch2_journal_wait_on_seq(&c->journal, as->journal_seq, &as->cl);
}
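
Aside: the removed comment above describes the bset->journal_seq rule that
the old ordering depended on. A toy standalone restatement of just that
rule (the function name and sequence numbers are illustrative):

#include <stdint.h>
#include <stdio.h>

/* At recovery, a bset stamped with a journal_seq newer than the newest
 * journal entry is ignored - as far as the journal is concerned, that
 * btree node write never happened. */
static int bset_usable_at_recovery(uint64_t bset_seq, uint64_t newest_journal_seq)
{
	return bset_seq <= newest_journal_seq;
}

int main(void)
{
	printf("%d\n", bset_usable_at_recovery(100, 105));	/* 1: keep */
	printf("%d\n", bset_usable_at_recovery(110, 105));	/* 0: ignore */
	return 0;
}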
static void interior_update_flush(struct journal *j,
struct journal_entry_pin *pin, u64 seq)
{
struct btree_update *as =
container_of(pin, struct btree_update, journal);
bch2_journal_flush_seq_async(j, as->journal_seq, NULL);
}
static void btree_update_reparent(struct btree_update *as,
@@ -853,10 +791,10 @@ static void btree_update_reparent(struct btree_update *as,
{
struct bch_fs *c = as->c;
lockdep_assert_held(&c->btree_interior_update_lock);
child->b = NULL;
child->mode = BTREE_INTERIOR_UPDATING_AS;
child->parent_as = as;
closure_get(&as->cl);
/*
* When we write a new btree root, we have to drop our journal pin
@@ -867,46 +805,24 @@ static void btree_update_reparent(struct btree_update *as,
* just transfer the journal pin to the new interior update so
* btree_update_nodes_written() can drop it.
*/
bch2_journal_pin_copy(&c->journal, &as->journal,
&child->journal, interior_update_flush);
bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal, NULL);
bch2_journal_pin_drop(&c->journal, &child->journal);
as->journal_seq = max(as->journal_seq, child->journal_seq);
}
static void btree_update_updated_root(struct btree_update *as)
static void btree_update_updated_root(struct btree_update *as, struct btree *b)
{
struct bch_fs *c = as->c;
struct btree_root *r = &c->btree_roots[as->btree_id];
mutex_lock(&c->btree_interior_update_lock);
list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);
BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE);
BUG_ON(!bch2_keylist_empty(&as->parent_keys));
/*
* Old root might not be persistent yet - if so, redirect its
* btree_update operation to point to us:
*/
if (r->as)
btree_update_reparent(as, r->as);
mutex_lock(&c->btree_interior_update_lock);
list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);
as->mode = BTREE_INTERIOR_UPDATING_ROOT;
as->b = r->b;
r->as = as;
as->level = b->c.level;
bch2_keylist_add(&as->parent_keys, &b->key);
mutex_unlock(&c->btree_interior_update_lock);
/*
* When we're rewriting nodes and updating interior nodes, there's an
* issue with updates that haven't been written in the journal getting
* mixed together with older data - see btree_update_updated_node()
* for the explanation.
*
* However, this doesn't affect us when we're writing a new btree root -
* because to make that new root reachable we have to write out a new
* journal entry, which must necessarily be newer than as->journal_seq.
*/
}
static void btree_node_will_make_reachable(struct btree_update *as,
@@ -983,10 +899,8 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as,
struct btree *b)
{
struct bch_fs *c = as->c;
struct closure *cl, *cl_n;
struct btree_update *p, *n;
struct btree_write *w;
struct bset_tree *t;
set_btree_node_dying(b);
@@ -995,18 +909,6 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as,
btree_interior_update_add_node_reference(as, b);
/*
* Does this node have data that hasn't been written in the journal?
*
* If so, we have to wait for the corresponding journal entry to be
* written before making the new nodes reachable - we can't just carry
* over the bset->journal_seq tracking, since we'll be mixing those keys
* in with keys that aren't in the journal anymore:
*/
for_each_bset(b, t)
as->journal_seq = max(as->journal_seq,
le64_to_cpu(bset(b, t)->journal_seq));
mutex_lock(&c->btree_interior_update_lock);
/*
@@ -1030,16 +932,6 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as,
clear_btree_node_dirty(b);
clear_btree_node_need_write(b);
w = btree_current_write(b);
/*
* Does this node have any btree_update operations waiting on this node
* to be written?
*
* If so, wake them up when this btree_update operation is reachable:
*/
llist_for_each_entry_safe(cl, cl_n, llist_del_all(&w->wait.list), list)
llist_add(&cl->list, &as->wait.list);
/*
* Does this node have unwritten data that has a pin on the journal?
@@ -1049,13 +941,12 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as,
* oldest pin of any of the nodes we're freeing. We'll release the pin
* when the new nodes are persistent and reachable on disk:
*/
bch2_journal_pin_copy(&c->journal, &as->journal,
&w->journal, interior_update_flush);
w = btree_current_write(b);
bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL);
bch2_journal_pin_drop(&c->journal, &w->journal);
w = btree_prev_write(b);
bch2_journal_pin_copy(&c->journal, &as->journal,
&w->journal, interior_update_flush);
bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL);
bch2_journal_pin_drop(&c->journal, &w->journal);
mutex_unlock(&c->btree_interior_update_lock);
@@ -1078,6 +969,7 @@ bch2_btree_update_start(struct bch_fs *c, enum btree_id id,
{
struct btree_reserve *reserve;
struct btree_update *as;
int ret;
reserve = bch2_btree_reserve_get(c, nr_nodes, flags, cl);
if (IS_ERR(reserve))
@@ -1094,6 +986,15 @@ bch2_btree_update_start(struct bch_fs *c, enum btree_id id,
bch2_keylist_init(&as->parent_keys, as->inline_keys);
ret = bch2_journal_preres_get(&c->journal, &as->journal_preres,
jset_u64s(BKEY_BTREE_PTR_U64s_MAX) * 3, 0);
if (ret) {
bch2_btree_reserve_put(c, reserve);
closure_debug_destroy(&as->cl);
mempool_free(as, &c->btree_interior_update_pool);
return ERR_PTR(ret);
}
mutex_lock(&c->btree_interior_update_lock);
list_add_tail(&as->list, &c->btree_interior_update_list);
mutex_unlock(&c->btree_interior_update_lock);
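
Aside: the pre-reservation above sizes journal space for up to three
worst-case btree pointer keys, matching the per-key jset_u64s()
accounting in btree_update_nodes_written(). A standalone sketch of the
arithmetic, where the 1-u64 jset_entry header and the 9-u64 maximum key
size are assumed values for illustration:

#include <stdio.h>

#define JSET_ENTRY_HDR_U64S	1	/* assumed sizeof(struct jset_entry) / 8 */
#define BTREE_PTR_KEY_U64S_MAX	9	/* assumed stand-in for BKEY_BTREE_PTR_U64s_MAX */

/* each journalled key costs its own u64s plus one entry header */
static unsigned jset_u64s(unsigned u64s)
{
	return u64s + JSET_ENTRY_HDR_U64S;
}

int main(void)
{
	unsigned preres = jset_u64s(BTREE_PTR_KEY_U64S_MAX) * 3;

	printf("reserved: %u u64s = %u bytes\n", preres, preres * 8);
	return 0;
}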
@@ -1153,22 +1054,6 @@ static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b)
mutex_unlock(&c->btree_interior_update_lock);
}
static void bch2_btree_set_root_ondisk(struct bch_fs *c, struct btree *b, int rw)
{
struct btree_root *r = &c->btree_roots[b->c.btree_id];
mutex_lock(&c->btree_root_lock);
BUG_ON(b != r->b);
bkey_copy(&r->key, &b->key);
r->level = b->c.level;
r->alive = true;
if (rw == WRITE)
c->btree_roots_dirty = true;
mutex_unlock(&c->btree_root_lock);
}
/**
* bch_btree_set_root - update the root in memory and on disk
*
@@ -1201,7 +1086,7 @@ static void bch2_btree_set_root(struct btree_update *as, struct btree *b,
bch2_btree_set_root_inmem(as, b);
btree_update_updated_root(as);
btree_update_updated_root(as, b);
/*
* Unlock old root after new root is visible:
@@ -1471,6 +1356,7 @@ static void btree_split(struct btree_update *as, struct btree *b,
bch2_btree_build_aux_trees(n1);
six_unlock_write(&n1->c.lock);
if (parent)
bch2_keylist_add(&as->parent_keys, &n1->key);
}
@@ -1545,12 +1431,8 @@ bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b,
(bkey_cmp_packed(b, k, &insert->k) >= 0))
;
while (!bch2_keylist_empty(keys)) {
insert = bch2_keylist_front(keys);
for_each_keylist_key(keys, insert)
bch2_insert_fixup_btree_ptr(as, b, iter, insert, &node_iter);
bch2_keylist_pop_front(keys);
}
btree_update_updated_node(as, b);
@@ -2107,7 +1989,7 @@ static void __bch2_btree_node_update_key(struct bch_fs *c,
bkey_copy(&b->key, new_key);
}
btree_update_updated_root(as);
btree_update_updated_root(as, b);
bch2_btree_node_unlock_write(b, iter);
}
......
@@ -69,8 +69,10 @@ struct btree_update {
unsigned nodes_written:1;
enum btree_id btree_id;
u8 level;
struct btree_reserve *reserve;
struct journal_preres journal_preres;
/*
* BTREE_INTERIOR_UPDATING_NODE:
@@ -83,18 +85,6 @@ struct btree_update {
struct btree *b;
struct list_head write_blocked_list;
/*
* BTREE_INTERIOR_UPDATING_AS: btree node we updated was freed, so now
* we're blocking another btree_update
* @parent_as - btree_update that's waiting on our nodes to finish
* writing, before it can make new nodes visible on disk
* @wait - list of child btree_updates that are waiting on this
* btree_update to make all the new nodes visible before they can free
* their old btree nodes
*/
struct btree_update *parent_as;
struct closure_waitlist wait;
/*
* We may be freeing nodes that were dirty, and thus had journal entries
* pinned: we need to transfer the oldest of those pins to the
@@ -103,8 +93,6 @@ struct btree_update {
*/
struct journal_entry_pin journal;
u64 journal_seq;
/*
* Nodes being freed:
* Protected by c->btree_node_pending_free_lock
......
@@ -145,6 +145,17 @@ static void btree_node_flush1(struct journal *j, struct journal_entry_pin *pin,
return __btree_node_flush(j, pin, 1, seq);
}
inline void bch2_btree_add_journal_pin(struct bch_fs *c,
struct btree *b, u64 seq)
{
struct btree_write *w = btree_current_write(b);
bch2_journal_pin_add(&c->journal, seq, &w->journal,
btree_node_write_idx(b) == 0
? btree_node_flush0
: btree_node_flush1);
}
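
Aside: btree nodes double-buffer their writes, and bch2_btree_add_journal_pin()
above attaches the journal pin to whichever write slot is current, choosing
the matching flush callback. A simplified standalone model of the slot
selection (structure and names are illustrative):

#include <stdio.h>

struct btree_write { unsigned long long pin_seq; };

struct btree {
	struct btree_write writes[2];	/* two write slots, used alternately */
	unsigned write_idx;		/* slot the next write will use */
};

static void add_journal_pin(struct btree *b, unsigned long long seq)
{
	/* analogous to picking btree_node_flush0 vs btree_node_flush1 */
	b->writes[b->write_idx].pin_seq = seq;
}

int main(void)
{
	struct btree b = { .write_idx = 0 };

	add_journal_pin(&b, 42);
	b.write_idx ^= 1;		/* a node write started: flip slots */
	add_journal_pin(&b, 43);

	printf("slot pins: %llu %llu\n",
	       b.writes[0].pin_seq, b.writes[1].pin_seq);
	return 0;
}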
static inline void __btree_journal_key(struct btree_trans *trans,
enum btree_id btree_id,
struct bkey_i *insert)
@@ -173,10 +184,6 @@ static void bch2_btree_journal_key(struct btree_trans *trans,
struct bch_fs *c = trans->c;
struct journal *j = &c->journal;
struct btree *b = iter_l(iter)->b;
struct btree_write *w = btree_current_write(b);
u64 seq = likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
? trans->journal_res.seq
: j->replay_journal_seq;
EBUG_ON(trans->journal_res.ref !=
!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY));
@@ -187,10 +194,10 @@ static void bch2_btree_journal_key(struct btree_trans *trans,
cpu_to_le64(trans->journal_res.seq);
}
bch2_journal_pin_add(j, seq, &w->journal,
btree_node_write_idx(b) == 0
? btree_node_flush0
: btree_node_flush1);
bch2_btree_add_journal_pin(c, b,
likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
? trans->journal_res.seq
: j->replay_journal_seq);
if (unlikely(!btree_node_dirty(b)))
set_btree_node_dirty(b);
......
@@ -958,6 +958,7 @@ int bch2_fs_mark_dirty(struct bch_fs *c)
c->disk_sb.sb->compat[0] &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA);
c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_extent_overwrite;
c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_extents_above_btree_updates;
c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_btree_updates_journalled;
ret = bch2_write_super(c);
mutex_unlock(&c->sb_lock);
@@ -1090,6 +1091,7 @@ void bch2_fs_mark_clean(struct bch_fs *c)
c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_INFO;
c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA;
c->disk_sb.sb->features[0] &= ~(1ULL << BCH_FEATURE_extents_above_btree_updates);
c->disk_sb.sb->features[0] &= ~(1ULL << BCH_FEATURE_btree_updates_journalled);
u64s = sizeof(*sb_clean) / sizeof(u64) + c->journal.entry_u64s_reserved;
......