bcachefs: Assorted journal refactoring

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>

bcachefs: Assorted journal refactoring
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
d16b4a77 · Kent Overstreet · Kent Overstreet · ecf37a4a · d16b4a77 · d16b4a77
Commit d16b4a77 authored Feb 18, 2019 by Kent Overstreet Committed by Kent Overstreet Oct 22, 2023
5 changed files
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -17,23 +17,14 @@
 #include "super-io.h"
 #include "trace.h"

-static bool journal_entry_is_open(struct journal *j)
+static bool __journal_entry_is_open(union journal_res_state state)
 {
-	return j->reservations.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL;
+	return state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL;
 }

-void bch2_journal_buf_put_slowpath(struct journal *j, bool need_write_just_set)
+static bool journal_entry_is_open(struct journal *j)
 {
-	struct journal_buf *w = journal_prev_buf(j);
-
-	atomic_dec_bug(&journal_seq_pin(j, le64_to_cpu(w->data->seq))->count);
-
-	if (!need_write_just_set &&
-	    test_bit(JOURNAL_NEED_WRITE, &j->flags))
-		bch2_time_stats_update(j->delay_time,
-				       j->need_write_time);
-
-	closure_call(&j->io, bch2_journal_write, system_highpri_wq, NULL);
+	return __journal_entry_is_open(j->reservations);
 }

 static void journal_pin_new_entry(struct journal *j, int count)
@@ -77,39 +68,76 @@ static inline bool journal_entry_empty(struct jset *j)
 	return true;
 }

-static enum {
-	JOURNAL_ENTRY_ERROR,
-	JOURNAL_ENTRY_INUSE,
-	JOURNAL_ENTRY_CLOSED,
-	JOURNAL_UNLOCKED,
-} journal_buf_switch(struct journal *j, bool need_write_just_set)
+void bch2_journal_halt(struct journal *j)
+{
+	union journal_res_state old, new;
+	u64 v = atomic64_read(&j->reservations.counter);
+
+	do {
+		old.v = new.v = v;
+		if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL)
+			return;
+
+		new.cur_entry_offset = JOURNAL_ENTRY_ERROR_VAL;
+	} while ((v = atomic64_cmpxchg(&j->reservations.counter,
+				       old.v, new.v)) != old.v);
+
+	journal_wake(j);
+	closure_wake_up(&journal_cur_buf(j)->wait);
+	closure_wake_up(&journal_prev_buf(j)->wait);
+}
+
+/* journal entry close/open: */
+
+void __bch2_journal_buf_put(struct journal *j, bool need_write_just_set)
+{
+	struct journal_buf *w = journal_prev_buf(j);
+
+	atomic_dec_bug(&journal_seq_pin(j, le64_to_cpu(w->data->seq))->count);
+
+	if (!need_write_just_set &&
+	    test_bit(JOURNAL_NEED_WRITE, &j->flags))
+		bch2_time_stats_update(j->delay_time,
+				       j->need_write_time);
+
+	clear_bit(JOURNAL_NEED_WRITE, &j->flags);
+
+	closure_call(&j->io, bch2_journal_write, system_highpri_wq, NULL);
+}
+
+/*
+ * Returns true if journal entry is now closed:
+ */
+static bool __journal_entry_close(struct journal *j)
 {
 	struct bch_fs *c = container_of(j, struct bch_fs, journal);
 	struct journal_buf *buf = journal_cur_buf(j);
 	union journal_res_state old, new;
 	u64 v = atomic64_read(&j->reservations.counter);
+	bool set_need_write = false;
+	unsigned sectors;

 	lockdep_assert_held(&j->lock);

 	do {
 		old.v = new.v = v;
 		if (old.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL)
-			return JOURNAL_ENTRY_CLOSED;
+			return true;

 		if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) {
 			/* this entry will never be written: */
 			closure_wake_up(&buf->wait);
-			return JOURNAL_ENTRY_ERROR;
+			return true;
 		}

-		if (new.prev_buf_unwritten)
-			return JOURNAL_ENTRY_INUSE;
+		if (!test_bit(JOURNAL_NEED_WRITE, &j->flags)) {
+			set_bit(JOURNAL_NEED_WRITE, &j->flags);
+			j->need_write_time = local_clock();
+			set_need_write = true;
+		}

-		/*
-		 * avoid race between setting buf->data->u64s and
-		 * journal_res_put starting write:
-		 */
-		journal_state_inc(&new);
+		if (new.prev_buf_unwritten)
+			return false;

 		new.cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL;
 		new.idx++;
@@ -119,15 +147,12 @@ static enum {
 	} while ((v = atomic64_cmpxchg(&j->reservations.counter,
 				       old.v, new.v)) != old.v);

-	clear_bit(JOURNAL_NEED_WRITE, &j->flags);
-
 	buf->data->u64s		= cpu_to_le32(old.cur_entry_offset);

-	j->prev_buf_sectors =
-		vstruct_blocks_plus(buf->data, c->block_bits,
-				    buf->u64s_reserved) *
-		c->opts.block_size;
-	BUG_ON(j->prev_buf_sectors > j->cur_buf_sectors);
+	sectors = vstruct_blocks_plus(buf->data, c->block_bits,
+				      buf->u64s_reserved) << c->block_bits;
+	BUG_ON(sectors > buf->sectors);
+	buf->sectors = sectors;

 	bkey_extent_init(&buf->key);

@@ -163,32 +188,22 @@ static enum {
 	bch2_journal_buf_init(j);

 	cancel_delayed_work(&j->write_work);
-	spin_unlock(&j->lock);

 	/* ugh - might be called from __journal_res_get() under wait_event() */
 	__set_current_state(TASK_RUNNING);
-	bch2_journal_buf_put(j, old.idx, need_write_just_set);
-
-	return JOURNAL_UNLOCKED;
+	bch2_journal_buf_put(j, old.idx, set_need_write);
+	return true;
 }

-void bch2_journal_halt(struct journal *j)
+static bool journal_entry_close(struct journal *j)
 {
-	union journal_res_state old, new;
-	u64 v = atomic64_read(&j->reservations.counter);
-
-	do {
-		old.v = new.v = v;
-		if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL)
-			return;
+	bool ret;

-		new.cur_entry_offset = JOURNAL_ENTRY_ERROR_VAL;
-	} while ((v = atomic64_cmpxchg(&j->reservations.counter,
-				       old.v, new.v)) != old.v);
+	spin_lock(&j->lock);
+	ret = __journal_entry_close(j);
+	spin_unlock(&j->lock);

-	journal_wake(j);
-	closure_wake_up(&journal_cur_buf(j)->wait);
-	closure_wake_up(&journal_prev_buf(j)->wait);
+	return ret;
 }

 /*
@@ -196,17 +211,16 @@ void bch2_journal_halt(struct journal *j)
 * journal reservation - journal entry is open means journal is dirty:
 *
 * returns:
- * 1:		success
- * 0:		journal currently full (must wait)
- * -EROFS:	insufficient rw devices
- * -EIO:	journal error
+ * 0:		success
+ * -ENOSPC:	journal currently full, must invoke reclaim
+ * -EAGAIN:	journal blocked, must wait
+ * -EROFS:	insufficient rw devices or journal error
 */
 static int journal_entry_open(struct journal *j)
 {
 	struct journal_buf *buf = journal_cur_buf(j);
 	union journal_res_state old, new;
-	ssize_t u64s;
-	int sectors;
+	int u64s, ret;
 	u64 v;

 	lockdep_assert_held(&j->lock);
@@ -216,29 +230,22 @@ static int journal_entry_open(struct journal *j)
 		return -EAGAIN;

 	if (!fifo_free(&j->pin))
-		return 0;
+		return -ENOSPC;

-	sectors = bch2_journal_entry_sectors(j);
-	if (sectors <= 0)
-		return sectors;
+	ret = bch2_journal_space_available(j);
+	if (ret)
+		return ret;

-	buf->disk_sectors	= sectors;
 	buf->u64s_reserved	= j->entry_u64s_reserved;
+	buf->disk_sectors	= j->cur_entry_sectors;
+	buf->sectors		= min(buf->disk_sectors, buf->buf_size >> 9);

-	sectors = min_t(unsigned, sectors, buf->size >> 9);
-	j->cur_buf_sectors	= sectors;
-
-	u64s = (sectors << 9) / sizeof(u64);
-
-	/* Subtract the journal header */
-	u64s -= sizeof(struct jset) / sizeof(u64);
-	u64s -= buf->u64s_reserved;
-	u64s  = max_t(ssize_t, 0L, u64s);
-
-	BUG_ON(u64s >= JOURNAL_ENTRY_CLOSED_VAL);
+	u64s = (int) (buf->sectors << 9) / sizeof(u64) -
+		journal_entry_overhead(j);
+	u64s  = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1);

 	if (u64s <= le32_to_cpu(buf->data->u64s))
-		return 0;
+		return -ENOSPC;

 	/*
 	 * Must be set before marking the journal entry as open:
@@ -250,10 +257,11 @@ static int journal_entry_open(struct journal *j)
 		old.v = new.v = v;

 		if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL)
-			return -EIO;
+			return -EROFS;

 		/* Handle any already added entries */
 		new.cur_entry_offset = le32_to_cpu(buf->data->u64s);
+		journal_state_inc(&new);
 	} while ((v = atomic64_cmpxchg(&j->reservations.counter,
 				       old.v, new.v)) != old.v);

@@ -266,48 +274,16 @@ static int journal_entry_open(struct journal *j)
 			 &j->write_work,
 			 msecs_to_jiffies(j->write_delay_ms));
 	journal_wake(j);
-	return 1;
-}
-
-static bool __journal_entry_close(struct journal *j)
-{
-	bool set_need_write;
-
-	if (!journal_entry_is_open(j)) {
-		spin_unlock(&j->lock);
-		return true;
-	}
-
-	set_need_write = !test_and_set_bit(JOURNAL_NEED_WRITE, &j->flags);
-	if (set_need_write)
-		j->need_write_time = local_clock();
-
-	switch (journal_buf_switch(j, set_need_write)) {
-	case JOURNAL_ENTRY_INUSE:
-		spin_unlock(&j->lock);
-		return false;
-	default:
-		spin_unlock(&j->lock);
-		fallthrough;
-	case JOURNAL_UNLOCKED:
-		return false;
-	}
-}
-
-static bool journal_entry_close(struct journal *j)
-{
-	spin_lock(&j->lock);
-	return __journal_entry_close(j);
+	return 0;
 }

 static bool journal_quiesced(struct journal *j)
 {
-	bool ret;
+	union journal_res_state state = READ_ONCE(j->reservations);
+	bool ret = !state.prev_buf_unwritten && !__journal_entry_is_open(state);

-	spin_lock(&j->lock);
-	ret = !j->reservations.prev_buf_unwritten &&
-		!journal_entry_is_open(j);
-	__journal_entry_close(j);
+	if (!ret)
+		journal_entry_close(j);
 	return ret;
 }

@@ -357,7 +333,11 @@ static int __journal_res_get(struct journal *j, struct journal_res *res,
 	if (journal_res_get_fast(j, res, flags))
 		return 0;

+	if (bch2_journal_error(j))
+		return -EROFS;
+
 	spin_lock(&j->lock);
+
 	/*
 	 * Recheck after taking the lock, so we don't race with another thread
 	 * that just did journal_entry_open() and call journal_entry_close()
@@ -375,56 +355,42 @@ static int __journal_res_get(struct journal *j, struct journal_res *res,
 	 */
 	buf = journal_cur_buf(j);
 	if (journal_entry_is_open(j) &&
-	    buf->size >> 9 < buf->disk_sectors &&
-	    buf->size < JOURNAL_ENTRY_SIZE_MAX)
-		j->buf_size_want = max(j->buf_size_want, buf->size << 1);
+	    buf->buf_size >> 9 < buf->disk_sectors &&
+	    buf->buf_size < JOURNAL_ENTRY_SIZE_MAX)
+		j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1);

-	/*
-	 * Close the current journal entry if necessary, then try to start a new
-	 * one:
-	 */
-	switch (journal_buf_switch(j, false)) {
-	case JOURNAL_ENTRY_ERROR:
-		spin_unlock(&j->lock);
-		return -EROFS;
-	case JOURNAL_ENTRY_INUSE:
+	if (journal_entry_is_open(j) &&
+	    !__journal_entry_close(j)) {
 		/*
-		 * The current journal entry is still open, but we failed to get
-		 * a journal reservation because there's not enough space in it,
-		 * and we can't close it and start another because we haven't
-		 * finished writing out the previous entry:
+		 * We failed to get a reservation on the current open journal
+		 * entry because it's full, and we can't close it because
+		 * there's still a previous one in flight:
 		 */
-		spin_unlock(&j->lock);
 		trace_journal_entry_full(c);
-		goto blocked;
-	case JOURNAL_ENTRY_CLOSED:
-		break;
-	case JOURNAL_UNLOCKED:
-		goto retry;
+		ret = -EAGAIN;
+	} else {
+		ret = journal_entry_open(j);
 	}

-	/* We now have a new, closed journal buf - see if we can open it: */
-	ret = journal_entry_open(j);
+	if ((ret == -EAGAIN || ret == -ENOSPC) &&
+	    !j->res_get_blocked_start)
+		j->res_get_blocked_start = local_clock() ?: 1;
+
 	spin_unlock(&j->lock);

-	if (ret < 0)
-		return ret;
-	if (ret)
+	if (!ret)
 		goto retry;
+	if (ret == -ENOSPC) {
+		/*
+		 * Journal is full - can't rely on reclaim from work item due to
+		 * freezing:
+		 */
+		trace_journal_full(c);
+		bch2_journal_reclaim_work(&j->reclaim_work.work);
+		ret = -EAGAIN;
+	}

-	/* Journal's full, we have to wait */
-
-	/*
-	 * Direct reclaim - can't rely on reclaim from work item
-	 * due to freezing..
-	 */
-	bch2_journal_reclaim_work(&j->reclaim_work.work);
-
-	trace_journal_full(c);
-blocked:
-	if (!j->res_get_blocked_start)
-		j->res_get_blocked_start = local_clock() ?: 1;
-	return -EAGAIN;
+	return ret;
 }

 /*
@@ -461,7 +427,7 @@ void bch2_journal_entry_res_resize(struct journal *j,

 	j->entry_u64s_reserved += d;
 	if (d <= 0)
-		goto out_unlock;
+		goto out;

 	j->cur_entry_u64s -= d;
 	smp_mb();
@@ -474,15 +440,12 @@ void bch2_journal_entry_res_resize(struct journal *j,
 		 * Not enough room in current journal entry, have to flush it:
 		 */
 		__journal_entry_close(j);
-		goto out;
+	} else {
+		journal_cur_buf(j)->u64s_reserved += d;
 	}
-
-	journal_cur_buf(j)->u64s_reserved += d;
-out_unlock:
-	spin_unlock(&j->lock);
 out:
+	spin_unlock(&j->lock);
 	res->u64s += d;
-	return;
 }

 /* journal flushing: */
@@ -512,47 +475,47 @@ int bch2_journal_open_seq_async(struct journal *j, u64 seq, struct closure *cl)
 {
 	struct bch_fs *c = container_of(j, struct bch_fs, journal);
 	int ret;
-retry:
+
 	spin_lock(&j->lock);

-	if (seq < journal_cur_seq(j) ||
+	/*
+	 * Can't try to open more than one sequence number ahead:
+	 */
+	BUG_ON(journal_cur_seq(j) < seq && !journal_entry_is_open(j));
+
+	if (journal_cur_seq(j) > seq ||
 	    journal_entry_is_open(j)) {
 		spin_unlock(&j->lock);
 		return 0;
 	}

-	if (journal_cur_seq(j) < seq) {
-		switch (journal_buf_switch(j, false)) {
-		case JOURNAL_ENTRY_ERROR:
-			spin_unlock(&j->lock);
-			return -EROFS;
-		case JOURNAL_ENTRY_INUSE:
-			/* haven't finished writing out the previous one: */
-			trace_journal_entry_full(c);
-			goto blocked;
-		case JOURNAL_ENTRY_CLOSED:
-			break;
-		case JOURNAL_UNLOCKED:
-			goto retry;
-		}
-	}
-
-	BUG_ON(journal_cur_seq(j) < seq);
+	if (journal_cur_seq(j) < seq &&
+	    !__journal_entry_close(j)) {
+		/* haven't finished writing out the previous one: */
+		trace_journal_entry_full(c);
+		ret = -EAGAIN;
+	} else {
+		BUG_ON(journal_cur_seq(j) != seq);

-	ret = journal_entry_open(j);
-	if (ret) {
-		spin_unlock(&j->lock);
-		return ret < 0 ? ret : 0;
+		ret = journal_entry_open(j);
 	}
-blocked:
-	if (!j->res_get_blocked_start)
+
+	if ((ret == -EAGAIN || ret == -ENOSPC) &&
+	    !j->res_get_blocked_start)
 		j->res_get_blocked_start = local_clock() ?: 1;

-	closure_wait(&j->async_wait, cl);
+	if (ret == -EAGAIN || ret == -ENOSPC)
+		closure_wait(&j->async_wait, cl);
+
 	spin_unlock(&j->lock);

-	bch2_journal_reclaim_work(&j->reclaim_work.work);
-	return -EAGAIN;
+	if (ret == -ENOSPC) {
+		trace_journal_full(c);
+		bch2_journal_reclaim_work(&j->reclaim_work.work);
+		ret = -EAGAIN;
+	}
+
+	return ret;
 }

 static int journal_seq_error(struct journal *j, u64 seq)
@@ -635,8 +598,7 @@ void bch2_journal_flush_seq_async(struct journal *j, u64 seq,

 	if (seq == journal_cur_seq(j))
 		__journal_entry_close(j);
-	else
-		spin_unlock(&j->lock);
+	spin_unlock(&j->lock);
 }

 static int journal_seq_flushed(struct journal *j, u64 seq)
@@ -648,8 +610,7 @@ static int journal_seq_flushed(struct journal *j, u64 seq)

 	if (seq == journal_cur_seq(j))
 		__journal_entry_close(j);
-	else
-		spin_unlock(&j->lock);
+	spin_unlock(&j->lock);

 	return ret;
 }
@@ -783,7 +744,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
 		goto err;

 	journal_buckets = bch2_sb_resize_journal(&ca->disk_sb,
-				nr + sizeof(*journal_buckets) / sizeof(u64));
+						 nr + sizeof(*journal_buckets) / sizeof(u64));
 	if (!journal_buckets)
 		goto err;

@@ -846,9 +807,9 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
 		ja->nr++;

 		bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_JOURNAL,
-				ca->mi.bucket_size,
-				gc_phase(GC_PHASE_SB),
-				0);
+					  ca->mi.bucket_size,
+					  gc_phase(GC_PHASE_SB),
+					  0);

 		if (c) {
 			spin_unlock(&c->journal.lock);
@@ -899,7 +860,7 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
 		 */

 		if (bch2_disk_reservation_get(c, &disk_res,
-				bucket_to_sector(ca, nr - ja->nr), 1, 0)) {
+					      bucket_to_sector(ca, nr - ja->nr), 1, 0)) {
 			mutex_unlock(&c->sb_lock);
 			return -ENOSPC;
 		}
@@ -996,7 +957,7 @@ void bch2_fs_journal_start(struct journal *j)
 		journal_pin_new_entry(j, 0);

 	/*
-	 * journal_buf_switch() only inits the next journal entry when it
+	 * __journal_entry_close() only inits the next journal entry when it
 	 * closes an open journal entry - the very first journal entry gets
 	 * initialized here:
 	 */
@@ -1063,8 +1024,8 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)

 void bch2_fs_journal_exit(struct journal *j)
 {
-	kvpfree(j->buf[1].data, j->buf[1].size);
-	kvpfree(j->buf[0].data, j->buf[0].size);
+	kvpfree(j->buf[1].data, j->buf[1].buf_size);
+	kvpfree(j->buf[0].data, j->buf[0].buf_size);
 	free_fifo(&j->pin);
 }

@@ -1088,8 +1049,8 @@ int bch2_fs_journal_init(struct journal *j)

 	lockdep_init_map(&j->res_map, "journal res", &res_key, 0);

-	j->buf[0].size		= JOURNAL_ENTRY_SIZE_MIN;
-	j->buf[1].size		= JOURNAL_ENTRY_SIZE_MIN;
+	j->buf[0].buf_size	= JOURNAL_ENTRY_SIZE_MIN;
+	j->buf[1].buf_size	= JOURNAL_ENTRY_SIZE_MIN;
 	j->write_delay_ms	= 1000;
 	j->reclaim_delay_ms	= 100;

@@ -1102,8 +1063,8 @@ int bch2_fs_journal_init(struct journal *j)
 		 { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v);

 	if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
-	    !(j->buf[0].data = kvpmalloc(j->buf[0].size, GFP_KERNEL)) ||
-	    !(j->buf[1].data = kvpmalloc(j->buf[1].size, GFP_KERNEL))) {
+	    !(j->buf[0].data = kvpmalloc(j->buf[0].buf_size, GFP_KERNEL)) ||
+	    !(j->buf[1].data = kvpmalloc(j->buf[1].buf_size, GFP_KERNEL))) {
 		ret = -ENOMEM;
 		goto out;
 	}

--- a/fs/bcachefs/journal.h
+++ b/fs/bcachefs/journal.h
@@ -179,6 +179,11 @@ static inline unsigned jset_u64s(unsigned u64s)
 	return u64s + sizeof(struct jset_entry) / sizeof(u64);
 }

+static inline int journal_entry_overhead(struct journal *j)
+{
+	return sizeof(struct jset) / sizeof(u64) + j->entry_u64s_reserved;
+}
+
 static inline struct jset_entry *
 bch2_journal_add_entry_noreservation(struct journal_buf *buf, size_t u64s)
 {
@@ -225,7 +230,7 @@ static inline void bch2_journal_add_keys(struct journal *j, struct journal_res *
 			       id, 0, k, k->k.u64s);
 }

-void bch2_journal_buf_put_slowpath(struct journal *, bool);
+void __bch2_journal_buf_put(struct journal *, bool);

 static inline void bch2_journal_buf_put(struct journal *j, unsigned idx,
 				       bool need_write_just_set)
@@ -236,17 +241,10 @@ static inline void bch2_journal_buf_put(struct journal *j, unsigned idx,
 				    .buf0_count = idx == 0,
 				    .buf1_count = idx == 1,
 				    }).v, &j->reservations.counter);
-
-	EBUG_ON(s.idx != idx && !s.prev_buf_unwritten);
-
-	/*
-	 * Do not initiate a journal write if the journal is in an error state
-	 * (previous journal entry write may have failed)
-	 */
-	if (s.idx != idx &&
-	    !journal_state_count(s, idx) &&
-	    s.cur_entry_offset != JOURNAL_ENTRY_ERROR_VAL)
-		bch2_journal_buf_put_slowpath(j, need_write_just_set);
+	if (!journal_state_count(s, idx)) {
+		EBUG_ON(s.idx == idx || !s.prev_buf_unwritten);
+		__bch2_journal_buf_put(j, need_write_just_set);
+	}
 }

 /*
@@ -333,6 +331,8 @@ static inline int bch2_journal_res_get(struct journal *j, struct journal_res *re
 	return 0;
 }

+/* journal_entry_res: */
+
 void bch2_journal_entry_res_resize(struct journal *,
 				   struct journal_entry_res *,
 				   unsigned);

--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -902,13 +902,16 @@ static unsigned journal_dev_buckets_available(struct journal *j,
 	return available;
 }

-/* returns number of sectors available for next journal entry: */
-int bch2_journal_entry_sectors(struct journal *j)
+int bch2_journal_space_available(struct journal *j)
 {
 	struct bch_fs *c = container_of(j, struct bch_fs, journal);
 	struct bch_dev *ca;
-	unsigned sectors_available = UINT_MAX;
+	unsigned sectors_next_entry	= UINT_MAX;
 	unsigned i, nr_online = 0, nr_devs = 0;
+	unsigned unwritten_sectors = j->reservations.prev_buf_unwritten
+		? journal_prev_buf(j)->sectors
+		: 0;
+	int ret = 0;

 	lockdep_assert_held(&j->lock);

@@ -921,16 +924,16 @@ int bch2_journal_entry_sectors(struct journal *j)
 		if (!ja->nr)
 			continue;

+		nr_online++;
+
 		buckets_this_device = journal_dev_buckets_available(j, ja);
 		sectors_this_device = ja->sectors_free;

-		nr_online++;
-
 		/*
 		 * We that we don't allocate the space for a journal entry
 		 * until we write it out - thus, account for it here:
 		 */
-		if (j->prev_buf_sectors >= sectors_this_device) {
+		if (unwritten_sectors >= sectors_this_device) {
 			if (!buckets_this_device)
 				continue;

@@ -938,7 +941,7 @@ int bch2_journal_entry_sectors(struct journal *j)
 			sectors_this_device = ca->mi.bucket_size;
 		}

-		sectors_this_device -= j->prev_buf_sectors;
+		sectors_this_device -= unwritten_sectors;

 		if (buckets_this_device)
 			sectors_this_device = ca->mi.bucket_size;
@@ -946,19 +949,26 @@ int bch2_journal_entry_sectors(struct journal *j)
 		if (!sectors_this_device)
 			continue;

-		sectors_available = min(sectors_available,
-					sectors_this_device);
+		sectors_next_entry = min(sectors_next_entry,
+					 sectors_this_device);
+
 		nr_devs++;
 	}
 	rcu_read_unlock();

-	if (nr_online < c->opts.metadata_replicas_required)
-		return -EROFS;
+	if (nr_online < c->opts.metadata_replicas_required) {
+		ret = -EROFS;
+		sectors_next_entry = 0;
+	} else if (!sectors_next_entry ||
+		   nr_devs < min_t(unsigned, nr_online,
+				   c->opts.metadata_replicas)) {
+		ret = -ENOSPC;
+		sectors_next_entry = 0;
+	}

-	if (nr_devs < min_t(unsigned, nr_online, c->opts.metadata_replicas))
-		return 0;
+	WRITE_ONCE(j->cur_entry_sectors, sectors_next_entry);

-	return sectors_available;
+	return ret;
 }

 static void __journal_write_alloc(struct journal *j,
@@ -1059,9 +1069,6 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w,
 	__journal_write_alloc(j, w, &devs_sorted,
 			      sectors, &replicas, replicas_want);
 done:
-	if (replicas >= replicas_want)
-		j->prev_buf_sectors = 0;
-
 	spin_unlock(&j->lock);
 	rcu_read_unlock();

@@ -1117,17 +1124,17 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
 	unsigned new_size = READ_ONCE(j->buf_size_want);
 	void *new_buf;

-	if (buf->size >= new_size)
+	if (buf->buf_size >= new_size)
 		return;

 	new_buf = kvpmalloc(new_size, GFP_NOIO|__GFP_NOWARN);
 	if (!new_buf)
 		return;

-	memcpy(new_buf, buf->data, buf->size);
-	kvpfree(buf->data, buf->size);
+	memcpy(new_buf, buf->data, buf->buf_size);
+	kvpfree(buf->data, buf->buf_size);
 	buf->data	= new_buf;
-	buf->size	= new_size;
+	buf->buf_size	= new_size;
 }

 static void journal_write_done(struct closure *cl)
@@ -1227,15 +1234,14 @@ void bch2_journal_write(struct closure *cl)

 	j->write_start_time = local_clock();

-	start	= vstruct_last(w->data);
+	start	= vstruct_last(jset);
 	end	= bch2_journal_super_entries_add_common(c, start,
 						le64_to_cpu(jset->seq));
 	u64s	= (u64 *) end - (u64 *) start;
 	BUG_ON(u64s > j->entry_u64s_reserved);

-	le32_add_cpu(&w->data->u64s, u64s);
-	BUG_ON(vstruct_sectors(jset, c->block_bits) >
-	       w->disk_sectors);
+	le32_add_cpu(&jset->u64s, u64s);
+	BUG_ON(vstruct_sectors(jset, c->block_bits) > w->sectors);

 	journal_write_compact(jset);

@@ -1273,10 +1279,10 @@ void bch2_journal_write(struct closure *cl)
 		goto err;

 	sectors = vstruct_sectors(jset, c->block_bits);
-	BUG_ON(sectors > j->prev_buf_sectors);
+	BUG_ON(sectors > w->sectors);

-	bytes = vstruct_bytes(w->data);
-	memset((void *) w->data + bytes, 0, (sectors << 9) - bytes);
+	bytes = vstruct_bytes(jset);
+	memset((void *) jset + bytes, 0, (sectors << 9) - bytes);

 	if (journal_write_alloc(j, w, sectors)) {
 		bch2_journal_halt(j);
@@ -1286,6 +1292,12 @@ void bch2_journal_write(struct closure *cl)
 		return;
 	}

+	/*
+	 * write is allocated, no longer need to account for it in
+	 * bch2_journal_space_available:
+	 */
+	w->sectors = 0;
+
 	/*
 	 * XXX: we really should just disable the entire journal in nochanges
 	 * mode
@@ -1316,7 +1328,7 @@ void bch2_journal_write(struct closure *cl)
 		trace_journal_write(bio);
 		closure_bio_submit(bio, cl);

-		ca->journal.bucket_seq[ca->journal.cur_idx] = le64_to_cpu(w->data->seq);
+		ca->journal.bucket_seq[ca->journal.cur_idx] = le64_to_cpu(jset->seq);
 	}

 	for_each_rw_member(ca, c, i)

--- a/fs/bcachefs/journal_io.h
+++ b/fs/bcachefs/journal_io.h
@@ -40,7 +40,7 @@ int bch2_journal_read(struct bch_fs *, struct list_head *);
 void bch2_journal_entries_free(struct list_head *);
 int bch2_journal_replay(struct bch_fs *, struct list_head *);

-int bch2_journal_entry_sectors(struct journal *);
+int bch2_journal_space_available(struct journal *);
 void bch2_journal_write(struct closure *);

 #endif /* _BCACHEFS_JOURNAL_IO_H */
--- a/fs/bcachefs/journal_types.h
+++ b/fs/bcachefs/journal_types.h
@@ -22,8 +22,10 @@ struct journal_buf {

 	struct closure_waitlist	wait;

-	unsigned		size;
-	unsigned		disk_sectors;
+	unsigned		buf_size;	/* size in bytes of @data */
+	unsigned		sectors;	/* maximum size for current entry */
+	unsigned		disk_sectors;	/* maximum size entry could have been, if
+						   buf_size was bigger */
 	unsigned		u64s_reserved;
 	/* bloom filter: */
 	unsigned long		has_inode[1024 / sizeof(unsigned long)];
@@ -129,9 +131,14 @@ struct journal {
 	unsigned long		flags;

 	union journal_res_state reservations;
+
+	/* Max size of current journal entry */
 	unsigned		cur_entry_u64s;
-	unsigned		prev_buf_sectors;
-	unsigned		cur_buf_sectors;
+	unsigned		cur_entry_sectors;
+
+	/* Reserved space in journal entry to be used just prior to write */
+	unsigned		entry_u64s_reserved;
+
 	unsigned		buf_size_want;

 	/*
@@ -159,9 +166,6 @@ struct journal {
 	u64			seq_ondisk;
 	u64			last_seq_ondisk;

-	/* Reserved space in journal entry to be used just prior to write */
-	unsigned		entry_u64s_reserved;
-
 	/*
 	 * FIFO of journal entries whose btree updates have not yet been
 	 * written out.