Commit ed0e24c0 authored by Kent Overstreet's avatar Kent Overstreet Committed by Kent Overstreet

bcachefs: Be more precise with journal error reporting

We were incorrectly detecting a journal deadlock - the journal filling
up - when only the journal pin fifo had filled up; if the journal pin
fifo is full that just means we need to wait on reclaim.

This plumbs through better error reporting so we can better discriminate
in the journal_res_get path what's going on.
Signed-off-by: default avatarKent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: default avatarKent Overstreet <kent.overstreet@linux.dev>
parent d8ebed7d
......@@ -234,7 +234,7 @@ static int journal_entry_open(struct journal *j)
BUG_ON(journal_entry_is_open(j));
if (j->blocked)
return -EAGAIN;
return cur_entry_blocked;
if (j->cur_entry_error)
return j->cur_entry_error;
......@@ -250,7 +250,7 @@ static int journal_entry_open(struct journal *j)
u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1);
if (u64s <= le32_to_cpu(buf->data->u64s))
return -ENOSPC;
return cur_entry_journal_full;
/*
* Must be set before marking the journal entry as open:
......@@ -262,7 +262,7 @@ static int journal_entry_open(struct journal *j)
old.v = new.v = v;
if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL)
return -EROFS;
return cur_entry_insufficient_devices;
/* Handle any already added entries */
new.cur_entry_offset = le32_to_cpu(buf->data->u64s);
......@@ -375,7 +375,7 @@ static int __journal_res_get(struct journal *j, struct journal_res *res,
* Don't want to close current journal entry, just need to
* invoke reclaim:
*/
ret = -ENOSPC;
ret = cur_entry_journal_full;
goto unlock;
}
......@@ -398,14 +398,16 @@ static int __journal_res_get(struct journal *j, struct journal_res *res,
* there's still a previous one in flight:
*/
trace_journal_entry_full(c);
ret = -EAGAIN;
ret = cur_entry_blocked;
} else {
ret = journal_entry_open(j);
}
unlock:
if ((ret == -EAGAIN || ret == -ENOSPC) &&
!j->res_get_blocked_start)
if ((ret && ret != cur_entry_insufficient_devices) &&
!j->res_get_blocked_start) {
j->res_get_blocked_start = local_clock() ?: 1;
trace_journal_full(c);
}
can_discard = j->can_discard;
spin_unlock(&j->lock);
......@@ -413,41 +415,39 @@ static int __journal_res_get(struct journal *j, struct journal_res *res,
if (!ret)
goto retry;
if (ret == -ENOSPC) {
if (WARN_ONCE(!can_discard && (flags & JOURNAL_RES_GET_RESERVED),
"JOURNAL_RES_GET_RESERVED set but journal full")) {
char *buf;
buf = kmalloc(4096, GFP_NOFS);
if (buf) {
bch2_journal_debug_to_text(&_PBUF(buf, 4096), j);
pr_err("\n%s", buf);
kfree(buf);
}
if (WARN_ONCE(ret == cur_entry_journal_full &&
!can_discard &&
(flags & JOURNAL_RES_GET_RESERVED),
"JOURNAL_RES_GET_RESERVED set but journal full")) {
char *buf;
buf = kmalloc(4096, GFP_NOFS);
if (buf) {
bch2_journal_debug_to_text(&_PBUF(buf, 4096), j);
pr_err("\n%s", buf);
kfree(buf);
}
}
/*
* Journal is full - can't rely on reclaim from work item due to
* freezing:
*/
trace_journal_full(c);
if (!(flags & JOURNAL_RES_GET_NONBLOCK)) {
if (can_discard) {
bch2_journal_do_discards(j);
goto retry;
}
if (mutex_trylock(&j->reclaim_lock)) {
bch2_journal_reclaim(j);
mutex_unlock(&j->reclaim_lock);
}
/*
* Journal is full - can't rely on reclaim from work item due to
* freezing:
*/
if ((ret == cur_entry_journal_full ||
ret == cur_entry_journal_pin_full) &&
!(flags & JOURNAL_RES_GET_NONBLOCK)) {
if (can_discard) {
bch2_journal_do_discards(j);
goto retry;
}
ret = -EAGAIN;
if (mutex_trylock(&j->reclaim_lock)) {
bch2_journal_reclaim(j);
mutex_unlock(&j->reclaim_lock);
}
}
return ret;
return ret == cur_entry_insufficient_devices ? -EROFS : -EAGAIN;
}
/*
......@@ -1072,6 +1072,7 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
"last_seq_ondisk:\t%llu\n"
"prereserved:\t\t%u/%u\n"
"current entry sectors:\t%u\n"
"current entry error:\t%u\n"
"current entry:\t\t",
fifo_used(&j->pin),
journal_cur_seq(j),
......@@ -1079,7 +1080,8 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
j->last_seq_ondisk,
j->prereserved.reserved,
j->prereserved.remaining,
j->cur_entry_sectors);
j->cur_entry_sectors,
j->cur_entry_error);
switch (s.cur_entry_offset) {
case JOURNAL_ENTRY_ERROR_VAL:
......
......@@ -164,12 +164,12 @@ void bch2_journal_space_available(struct journal *j)
j->can_discard = can_discard;
if (nr_online < c->opts.metadata_replicas_required) {
ret = -EROFS;
ret = cur_entry_insufficient_devices;
goto out;
}
if (!fifo_free(&j->pin)) {
ret = -ENOSPC;
ret = cur_entry_journal_pin_full;
goto out;
}
......@@ -180,7 +180,7 @@ void bch2_journal_space_available(struct journal *j)
clean = __journal_space_available(j, nr_devs_want, journal_space_clean);
if (!discarded.next_entry)
ret = -ENOSPC;
ret = cur_entry_journal_full;
overhead = DIV_ROUND_UP(clean.remaining, max_entry_size) *
journal_entry_overhead(j);
......
......@@ -146,7 +146,13 @@ struct journal {
* 0, or -ENOSPC if waiting on journal reclaim, or -EROFS if
* insufficient devices:
*/
int cur_entry_error;
enum {
cur_entry_ok,
cur_entry_blocked,
cur_entry_journal_full,
cur_entry_journal_pin_full,
cur_entry_insufficient_devices,
} cur_entry_error;
union journal_preres_state prereserved;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment