bcachefs: Journal updates to interior nodes

Previously, the btree has always been self contained and internally consistent on disk without anything from the journal - the journal just contained pointers to the btree roots. However, this meant that btree node split or compact operations - i.e. anything that changes btree node topology and involves updates to interior nodes - would require that interior btree node to be written immediately, which means emitting a btree node write that's mostly empty (using 4k of space on disk if the filesystemm blocksize is 4k to only write perhaps ~100 bytes of new keys). More importantly, this meant most btree node writes had to be FUA, and consumer drives have a history of slow and/or buggy FUA support - other filesystes have been bit by this. This patch changes the interior btree update path to journal updates to interior nodes, after the writes for the new btree nodes have completed. Best of all, it turns out to simplify the interior node update path somewhat. Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com> Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>

bcachefs: Journal updates to interior nodes
Previously, the btree has always been self contained and internally consistent on disk without anything from the journal - the journal just contained pointers to the btree roots. However, this meant that btree node split or compact operations - i.e. anything that changes btree node topology and involves updates to interior nodes - would require that interior btree node to be written immediately, which means emitting a btree node write that's mostly empty (using 4k of space on disk if the filesystemm blocksize is 4k to only write perhaps ~100 bytes of new keys). More importantly, this meant most btree node writes had to be FUA, and consumer drives have a history of slow and/or buggy FUA support - other filesystes have been bit by this. This patch changes the interior btree update path to journal updates to interior nodes, after the writes for the new btree nodes have completed. Best of all, it turns out to simplify the interior node update path somewhat. Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com> Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
6357d607 · Kent Overstreet · Kent Overstreet · f44a6a71 · 6357d607 · 6357d607
Commit 6357d607 authored Feb 08, 2020 by Kent Overstreet Committed by Kent Overstreet Oct 22, 2023
9 changed files
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -1316,7 +1316,8 @@ LE64_BITMASK(BCH_SB_ERASURE_CODE,	struct bch_sb, flags[3],  0, 16);
 	x(new_extent_overwrite,		9)	\
 	x(incompressible,		10)	\
 	x(btree_ptr_v2,			11)	\
-	x(extents_above_btree_updates,	12)
+	x(extents_above_btree_updates,	12)	\
+	x(btree_updates_journalled,	13)

 #define BCH_SB_FEATURES_ALL				\
 	((1ULL << BCH_FEATURE_new_siphash)|		\

--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -1260,7 +1260,6 @@ void bch2_btree_complete_write(struct bch_fs *c, struct btree *b,
 		closure_put(&((struct btree_update *) new)->cl);

 	bch2_journal_pin_drop(&c->journal, &w->journal);
-	closure_wake_up(&w->wait);
 }

 static void btree_node_write_done(struct bch_fs *c, struct btree *b)
@@ -1618,9 +1617,6 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
 	wbio->wbio.bio.bi_end_io	= btree_node_write_endio;
 	wbio->wbio.bio.bi_private	= b;

-	if (b->c.level || !b->written)
-		wbio->wbio.bio.bi_opf |= REQ_FUA;
-
 	bch2_bio_map(&wbio->wbio.bio, data, sectors_to_write << 9);

 	/*
@@ -1794,12 +1790,11 @@ ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *c, char *buf)
 	rcu_read_lock();
 	for_each_cached_btree(b, c, tbl, i, pos) {
 		unsigned long flags = READ_ONCE(b->flags);
-		unsigned idx = (flags & (1 << BTREE_NODE_write_idx)) != 0;

 		if (!(flags & (1 << BTREE_NODE_dirty)))
 			continue;

-		pr_buf(&out, "%p d %u n %u l %u w %u b %u r %u:%lu c %u p %u\n",
+		pr_buf(&out, "%p d %u n %u l %u w %u b %u r %u:%lu\n",
 		       b,
 		       (flags & (1 << BTREE_NODE_dirty)) != 0,
 		       (flags & (1 << BTREE_NODE_need_write)) != 0,
@@ -1807,9 +1802,7 @@ ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *c, char *buf)
 		       b->written,
 		       !list_empty_careful(&b->write_blocked),
 		       b->will_make_reachable != 0,
-		       b->will_make_reachable & 1,
-		       b->writes[ idx].wait.list.first != NULL,
-		       b->writes[!idx].wait.list.first != NULL);
+		       b->will_make_reachable & 1);
 	}
 	rcu_read_unlock();


--- a/fs/bcachefs/btree_io.h
+++ b/fs/bcachefs/btree_io.h
@@ -102,19 +102,20 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *);
 void bch2_btree_node_write(struct bch_fs *, struct btree *,
 			  enum six_lock_type);

-static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b)
+static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b,
+					    enum six_lock_type lock_held)
 {
 	while (b->written &&
 	       btree_node_need_write(b) &&
 	       btree_node_may_write(b)) {
 		if (!btree_node_write_in_flight(b)) {
-			bch2_btree_node_write(c, b, SIX_LOCK_read);
+			bch2_btree_node_write(c, b, lock_held);
 			break;
 		}

 		six_unlock_read(&b->c.lock);
 		btree_node_wait_on_io(b);
-		btree_node_lock_type(c, b, SIX_LOCK_read);
+		btree_node_lock_type(c, b, lock_held);
 	}
 }

@@ -131,7 +132,7 @@ do {									\
 		new |= (1 << BTREE_NODE_need_write);			\
 	} while ((v = cmpxchg(&(_b)->flags, old, new)) != old);		\
 									\
-	btree_node_write_if_need(_c, _b);				\
+	btree_node_write_if_need(_c, _b, SIX_LOCK_read);		\
 } while (0)

 void bch2_btree_flush_all_reads(struct bch_fs *);

--- a/fs/bcachefs/btree_types.h
+++ b/fs/bcachefs/btree_types.h
@@ -53,7 +53,6 @@ struct bset_tree {

 struct btree_write {
 	struct journal_entry_pin	journal;
-	struct closure_waitlist		wait;
 };

 struct btree_alloc {
@@ -547,8 +546,6 @@ static inline bool btree_node_type_needs_gc(enum btree_node_type type)
 struct btree_root {
 	struct btree		*b;

-	struct btree_update	*as;
-
 	/* On disk root - see async splits: */
 	__BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
 	u8			level;

--- a/fs/bcachefs/btree_update.h
+++ b/fs/bcachefs/btree_update.h
@@ -12,6 +12,7 @@ void bch2_btree_node_lock_for_insert(struct bch_fs *, struct btree *,
 				     struct btree_iter *);
 bool bch2_btree_bset_insert_key(struct btree_iter *, struct btree *,
 				struct btree_node_iter *, struct bkey_i *);
+void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64);

 enum btree_insert_flags {
 	__BTREE_INSERT_NOUNLOCK,

--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
--- a/fs/bcachefs/btree_update_interior.h
+++ b/fs/bcachefs/btree_update_interior.h
@@ -69,8 +69,10 @@ struct btree_update {
 	unsigned			nodes_written:1;

 	enum btree_id			btree_id;
+	u8				level;

 	struct btree_reserve		*reserve;
+	struct journal_preres		journal_preres;

 	/*
 	 * BTREE_INTERIOR_UPDATING_NODE:
@@ -83,18 +85,6 @@ struct btree_update {
 	struct btree			*b;
 	struct list_head		write_blocked_list;

-	/*
-	 * BTREE_INTERIOR_UPDATING_AS: btree node we updated was freed, so now
-	 * we're now blocking another btree_update
-	 * @parent_as - btree_update that's waiting on our nodes to finish
-	 * writing, before it can make new nodes visible on disk
-	 * @wait - list of child btree_updates that are waiting on this
-	 * btree_update to make all the new nodes visible before they can free
-	 * their old btree nodes
-	 */
-	struct btree_update		*parent_as;
-	struct closure_waitlist		wait;
-
 	/*
 	 * We may be freeing nodes that were dirty, and thus had journal entries
 	 * pinned: we need to transfer the oldest of those pins to the
@@ -103,8 +93,6 @@ struct btree_update {
 	 */
 	struct journal_entry_pin	journal;

-	u64				journal_seq;
-
 	/*
 	 * Nodes being freed:
 	 * Protected by c->btree_node_pending_free_lock

--- a/fs/bcachefs/btree_update_leaf.c
+++ b/fs/bcachefs/btree_update_leaf.c
@@ -145,6 +145,17 @@ static void btree_node_flush1(struct journal *j, struct journal_entry_pin *pin,
 	return __btree_node_flush(j, pin, 1, seq);
 }

+inline void bch2_btree_add_journal_pin(struct bch_fs *c,
+				       struct btree *b, u64 seq)
+{
+	struct btree_write *w = btree_current_write(b);
+
+	bch2_journal_pin_add(&c->journal, seq, &w->journal,
+			     btree_node_write_idx(b) == 0
+			     ? btree_node_flush0
+			     : btree_node_flush1);
+}
+
 static inline void __btree_journal_key(struct btree_trans *trans,
 				       enum btree_id btree_id,
 				       struct bkey_i *insert)
@@ -173,10 +184,6 @@ static void bch2_btree_journal_key(struct btree_trans *trans,
 	struct bch_fs *c = trans->c;
 	struct journal *j = &c->journal;
 	struct btree *b = iter_l(iter)->b;
-	struct btree_write *w = btree_current_write(b);
-	u64 seq = likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
-		? trans->journal_res.seq
-		: j->replay_journal_seq;

 	EBUG_ON(trans->journal_res.ref !=
 		!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY));
@@ -187,10 +194,10 @@ static void bch2_btree_journal_key(struct btree_trans *trans,
 			cpu_to_le64(trans->journal_res.seq);
 	}

-	bch2_journal_pin_add(j, seq, &w->journal,
-			     btree_node_write_idx(b) == 0
-			     ? btree_node_flush0
-			     : btree_node_flush1);
+	bch2_btree_add_journal_pin(c, b,
+		likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
+			? trans->journal_res.seq
+			: j->replay_journal_seq);

 	if (unlikely(!btree_node_dirty(b)))
 		set_btree_node_dirty(b);

--- a/fs/bcachefs/super-io.c
+++ b/fs/bcachefs/super-io.c
@@ -958,6 +958,7 @@ int bch2_fs_mark_dirty(struct bch_fs *c)
 	c->disk_sb.sb->compat[0] &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA);
 	c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_extent_overwrite;
 	c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_extents_above_btree_updates;
+	c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_btree_updates_journalled;
 	ret = bch2_write_super(c);
 	mutex_unlock(&c->sb_lock);

@@ -1090,6 +1091,7 @@ void bch2_fs_mark_clean(struct bch_fs *c)
 	c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_INFO;
 	c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA;
 	c->disk_sb.sb->features[0] &= ~(1ULL << BCH_FEATURE_extents_above_btree_updates);
+	c->disk_sb.sb->features[0] &= ~(1ULL << BCH_FEATURE_btree_updates_journalled);

 	u64s = sizeof(*sb_clean) / sizeof(u64) + c->journal.entry_u64s_reserved;