Commit 6357d607 authored by Kent Overstreet's avatar Kent Overstreet Committed by Kent Overstreet

bcachefs: Journal updates to interior nodes

Previously, the btree has always been self contained and internally
consistent on disk without anything from the journal - the journal just
contained pointers to the btree roots.

However, this meant that btree node split or compact operations - i.e.
anything that changes btree node topology and involves updates to
interior nodes - would require that interior btree node to be written
immediately, which means emitting a btree node write that's mostly empty
(using 4k of space on disk if the filesystemm blocksize is 4k to only
write perhaps ~100 bytes of new keys).

More importantly, this meant most btree node writes had to be FUA, and
consumer drives have a history of slow and/or buggy FUA support - other
filesystes have been bit by this.

This patch changes the interior btree update path to journal updates to
interior nodes, after the writes for the new btree nodes have completed.
Best of all, it turns out to simplify the interior node update path
somewhat.
Signed-off-by: default avatarKent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: default avatarKent Overstreet <kent.overstreet@linux.dev>
parent f44a6a71
......@@ -1316,7 +1316,8 @@ LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16);
x(new_extent_overwrite, 9) \
x(incompressible, 10) \
x(btree_ptr_v2, 11) \
x(extents_above_btree_updates, 12)
x(extents_above_btree_updates, 12) \
x(btree_updates_journalled, 13)
#define BCH_SB_FEATURES_ALL \
((1ULL << BCH_FEATURE_new_siphash)| \
......
......@@ -1260,7 +1260,6 @@ void bch2_btree_complete_write(struct bch_fs *c, struct btree *b,
closure_put(&((struct btree_update *) new)->cl);
bch2_journal_pin_drop(&c->journal, &w->journal);
closure_wake_up(&w->wait);
}
static void btree_node_write_done(struct bch_fs *c, struct btree *b)
......@@ -1618,9 +1617,6 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
wbio->wbio.bio.bi_end_io = btree_node_write_endio;
wbio->wbio.bio.bi_private = b;
if (b->c.level || !b->written)
wbio->wbio.bio.bi_opf |= REQ_FUA;
bch2_bio_map(&wbio->wbio.bio, data, sectors_to_write << 9);
/*
......@@ -1794,12 +1790,11 @@ ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *c, char *buf)
rcu_read_lock();
for_each_cached_btree(b, c, tbl, i, pos) {
unsigned long flags = READ_ONCE(b->flags);
unsigned idx = (flags & (1 << BTREE_NODE_write_idx)) != 0;
if (!(flags & (1 << BTREE_NODE_dirty)))
continue;
pr_buf(&out, "%p d %u n %u l %u w %u b %u r %u:%lu c %u p %u\n",
pr_buf(&out, "%p d %u n %u l %u w %u b %u r %u:%lu\n",
b,
(flags & (1 << BTREE_NODE_dirty)) != 0,
(flags & (1 << BTREE_NODE_need_write)) != 0,
......@@ -1807,9 +1802,7 @@ ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *c, char *buf)
b->written,
!list_empty_careful(&b->write_blocked),
b->will_make_reachable != 0,
b->will_make_reachable & 1,
b->writes[ idx].wait.list.first != NULL,
b->writes[!idx].wait.list.first != NULL);
b->will_make_reachable & 1);
}
rcu_read_unlock();
......
......@@ -102,19 +102,20 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *);
void bch2_btree_node_write(struct bch_fs *, struct btree *,
enum six_lock_type);
static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b)
static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b,
enum six_lock_type lock_held)
{
while (b->written &&
btree_node_need_write(b) &&
btree_node_may_write(b)) {
if (!btree_node_write_in_flight(b)) {
bch2_btree_node_write(c, b, SIX_LOCK_read);
bch2_btree_node_write(c, b, lock_held);
break;
}
six_unlock_read(&b->c.lock);
btree_node_wait_on_io(b);
btree_node_lock_type(c, b, SIX_LOCK_read);
btree_node_lock_type(c, b, lock_held);
}
}
......@@ -131,7 +132,7 @@ do { \
new |= (1 << BTREE_NODE_need_write); \
} while ((v = cmpxchg(&(_b)->flags, old, new)) != old); \
\
btree_node_write_if_need(_c, _b); \
btree_node_write_if_need(_c, _b, SIX_LOCK_read); \
} while (0)
void bch2_btree_flush_all_reads(struct bch_fs *);
......
......@@ -53,7 +53,6 @@ struct bset_tree {
struct btree_write {
struct journal_entry_pin journal;
struct closure_waitlist wait;
};
struct btree_alloc {
......@@ -547,8 +546,6 @@ static inline bool btree_node_type_needs_gc(enum btree_node_type type)
struct btree_root {
struct btree *b;
struct btree_update *as;
/* On disk root - see async splits: */
__BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
u8 level;
......
......@@ -12,6 +12,7 @@ void bch2_btree_node_lock_for_insert(struct bch_fs *, struct btree *,
struct btree_iter *);
bool bch2_btree_bset_insert_key(struct btree_iter *, struct btree *,
struct btree_node_iter *, struct bkey_i *);
void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64);
enum btree_insert_flags {
__BTREE_INSERT_NOUNLOCK,
......
This diff is collapsed.
......@@ -69,8 +69,10 @@ struct btree_update {
unsigned nodes_written:1;
enum btree_id btree_id;
u8 level;
struct btree_reserve *reserve;
struct journal_preres journal_preres;
/*
* BTREE_INTERIOR_UPDATING_NODE:
......@@ -83,18 +85,6 @@ struct btree_update {
struct btree *b;
struct list_head write_blocked_list;
/*
* BTREE_INTERIOR_UPDATING_AS: btree node we updated was freed, so now
* we're now blocking another btree_update
* @parent_as - btree_update that's waiting on our nodes to finish
* writing, before it can make new nodes visible on disk
* @wait - list of child btree_updates that are waiting on this
* btree_update to make all the new nodes visible before they can free
* their old btree nodes
*/
struct btree_update *parent_as;
struct closure_waitlist wait;
/*
* We may be freeing nodes that were dirty, and thus had journal entries
* pinned: we need to transfer the oldest of those pins to the
......@@ -103,8 +93,6 @@ struct btree_update {
*/
struct journal_entry_pin journal;
u64 journal_seq;
/*
* Nodes being freed:
* Protected by c->btree_node_pending_free_lock
......
......@@ -145,6 +145,17 @@ static void btree_node_flush1(struct journal *j, struct journal_entry_pin *pin,
return __btree_node_flush(j, pin, 1, seq);
}
inline void bch2_btree_add_journal_pin(struct bch_fs *c,
struct btree *b, u64 seq)
{
struct btree_write *w = btree_current_write(b);
bch2_journal_pin_add(&c->journal, seq, &w->journal,
btree_node_write_idx(b) == 0
? btree_node_flush0
: btree_node_flush1);
}
static inline void __btree_journal_key(struct btree_trans *trans,
enum btree_id btree_id,
struct bkey_i *insert)
......@@ -173,10 +184,6 @@ static void bch2_btree_journal_key(struct btree_trans *trans,
struct bch_fs *c = trans->c;
struct journal *j = &c->journal;
struct btree *b = iter_l(iter)->b;
struct btree_write *w = btree_current_write(b);
u64 seq = likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
? trans->journal_res.seq
: j->replay_journal_seq;
EBUG_ON(trans->journal_res.ref !=
!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY));
......@@ -187,10 +194,10 @@ static void bch2_btree_journal_key(struct btree_trans *trans,
cpu_to_le64(trans->journal_res.seq);
}
bch2_journal_pin_add(j, seq, &w->journal,
btree_node_write_idx(b) == 0
? btree_node_flush0
: btree_node_flush1);
bch2_btree_add_journal_pin(c, b,
likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
? trans->journal_res.seq
: j->replay_journal_seq);
if (unlikely(!btree_node_dirty(b)))
set_btree_node_dirty(b);
......
......@@ -958,6 +958,7 @@ int bch2_fs_mark_dirty(struct bch_fs *c)
c->disk_sb.sb->compat[0] &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA);
c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_extent_overwrite;
c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_extents_above_btree_updates;
c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_btree_updates_journalled;
ret = bch2_write_super(c);
mutex_unlock(&c->sb_lock);
......@@ -1090,6 +1091,7 @@ void bch2_fs_mark_clean(struct bch_fs *c)
c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_INFO;
c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA;
c->disk_sb.sb->features[0] &= ~(1ULL << BCH_FEATURE_extents_above_btree_updates);
c->disk_sb.sb->features[0] &= ~(1ULL << BCH_FEATURE_btree_updates_journalled);
u64s = sizeof(*sb_clean) / sizeof(u64) + c->journal.entry_u64s_reserved;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment