Commit 2940295c authored by Kent Overstreet, committed by Kent Overstreet

bcachefs: Be more careful about JOURNAL_RES_GET_RESERVED

JOURNAL_RES_GET_RESERVED should only be used for updates that need to be
done to free up space in the journal. In particular, when we're flushing
keys from the key cache, if we're flushing them out of order we shouldn't
be using it, since we'd be using up our remaining space in the journal
without dropping a pin that lets us make forward progress.
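
As a rough illustration of that rule (a sketch with stand-in names, not the
bcachefs identifiers): reserved journal space is only justified when the
entry being flushed holds the oldest pin, i.e. flushing it lets the journal
tail advance.

  /* Sketch only: RES_GET_RESERVED and the sequence numbers are stand-ins. */
  #include <stdint.h>
  #include <stdio.h>

  #define RES_GET_RESERVED (1 << 0)

  /* Grant reserved space only when flushing the oldest pin, since that is
   * the flush that actually frees up journal space. */
  static unsigned journal_res_flags(uint64_t pin_seq, uint64_t oldest_pin_seq)
  {
          return pin_seq == oldest_pin_seq ? RES_GET_RESERVED : 0;
  }

  int main(void)
  {
          printf("%u\n", journal_res_flags(10, 10)); /* in order: reserved */
          printf("%u\n", journal_res_flags(12, 10)); /* out of order: not */
          return 0;
  }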

With this patch, BTREE_INSERT_JOURNAL_RECLAIM without
BTREE_INSERT_JOURNAL_RESERVED may return -EAGAIN - we can't wait on
journal reclaim if we're already in journal reclaim.
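
A minimal sketch of that bail-out (the real check is the
bch2_trans_commit_error() change in the diff below; the flag values here are
illustrative only):

  #include <errno.h>

  #define BTREE_INSERT_JOURNAL_RESERVED (1 << 0) /* illustrative values */
  #define BTREE_INSERT_JOURNAL_RECLAIM  (1 << 1)

  /* When short on journal space: a transaction running from journal reclaim
   * without a reservation must not block waiting for reclaim (we're already
   * inside it), so it fails with -EAGAIN instead. */
  static int on_journal_res_shortage(unsigned trans_flags)
  {
          if ((trans_flags & BTREE_INSERT_JOURNAL_RECLAIM) &&
              !(trans_flags & BTREE_INSERT_JOURNAL_RESERVED))
                  return -EAGAIN;

          return 0; /* ok to wait for journal space */
  }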

This means we need to propagate these errors up to journal reclaim,
indicating that flushing a journal pin should be retried in the future.
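
Roughly, the reclaim loop now treats the pin flush callback as fallible
(simplified from the journal_flush_pins() change below; the types here are
stand-ins): a nonzero return stops the pass without counting the pin as
flushed, so the pin stays set and is retried on a later pass.

  #include <stddef.h>

  struct pin;
  typedef int (*pin_flush_fn)(struct pin *); /* returned void before this patch */

  struct pin {
          pin_flush_fn flush;
  };

  static size_t flush_pins_sketch(struct pin **pins, size_t nr)
  {
          size_t i, nr_flushed = 0;

          for (i = 0; i < nr; i++) {
                  int err = pins[i]->flush(pins[i]);

                  if (err)
                          break; /* e.g. -EAGAIN: leave this pin for a later pass */

                  nr_flushed++;
          }

          return nr_flushed;
  }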

This is prep work for a patch that changes the way journal reclaim works:
flushing key cache keys because the btree key cache is too dirty will be
split out from journal reclaim that runs because we need space in the
journal.
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
parent 6167f7c8
@@ -353,6 +353,7 @@ int bch2_btree_iter_traverse_cached(struct btree_iter *iter)
 static int btree_key_cache_flush_pos(struct btree_trans *trans,
				      struct bkey_cached_key key,
				      u64 journal_seq,
+				     unsigned commit_flags,
				      bool evict)
 {
	struct bch_fs *c = trans->c;
@@ -391,12 +392,17 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
				  BTREE_INSERT_NOUNLOCK|
				  BTREE_INSERT_NOCHECK_RW|
				  BTREE_INSERT_NOFAIL|
-				  BTREE_INSERT_JOURNAL_RESERVED|
-				  BTREE_INSERT_JOURNAL_RECLAIM);
+				  (ck->journal.seq == journal_last_seq(j)
+				   ? BTREE_INSERT_JOURNAL_RESERVED
+				   : 0)|
+				  commit_flags);
 err:
	if (ret == -EINTR)
		goto retry;
+	if (ret == -EAGAIN)
+		goto out;
	if (ret) {
		bch2_fs_fatal_err_on(!bch2_journal_error(j), c,
			"error flushing key cache: %i", ret);
@@ -439,15 +445,16 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
	return ret;
 }
-static void btree_key_cache_journal_flush(struct journal *j,
+static int btree_key_cache_journal_flush(struct journal *j,
					  struct journal_entry_pin *pin,
					  u64 seq)
 {
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct bkey_cached *ck =
		container_of(pin, struct bkey_cached, journal);
	struct bkey_cached_key key;
	struct btree_trans trans;
+	int ret = 0;
	int srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
@@ -462,10 +469,13 @@ static void btree_key_cache_journal_flush(struct journal *j,
	six_unlock_read(&ck->c.lock);
	bch2_trans_init(&trans, c, 0, 0);
-	btree_key_cache_flush_pos(&trans, key, seq, false);
+	ret = btree_key_cache_flush_pos(&trans, key, seq,
+				BTREE_INSERT_JOURNAL_RECLAIM, false);
	bch2_trans_exit(&trans);
 unlock:
	srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
+	return ret;
 }
 /*
@@ -481,7 +491,7 @@ int bch2_btree_key_cache_flush(struct btree_trans *trans,
	if (!bch2_btree_key_cache_find(c, id, pos))
		return 0;
-	return btree_key_cache_flush_pos(trans, key, 0, true);
+	return btree_key_cache_flush_pos(trans, key, 0, 0, true);
 }
 bool bch2_btree_insert_key_cached(struct btree_trans *trans,
...
@@ -916,10 +916,12 @@ bch2_btree_update_start(struct btree_iter *iter, unsigned level,
	struct closure cl;
	int disk_res_flags = (flags & BTREE_INSERT_NOFAIL)
		? BCH_DISK_RESERVATION_NOFAIL : 0;
-	int journal_flags = (flags & BTREE_INSERT_JOURNAL_RESERVED)
-		? JOURNAL_RES_GET_RECLAIM : 0;
+	int journal_flags = 0;
	int ret = 0;
+	if (flags & BTREE_INSERT_JOURNAL_RESERVED)
+		journal_flags |= JOURNAL_RES_GET_RESERVED;
	closure_init_stack(&cl);
 retry:
	/*
@@ -982,6 +984,9 @@ bch2_btree_update_start(struct btree_iter *iter, unsigned level,
		bch2_trans_unlock(trans);
+		if (flags & BTREE_INSERT_JOURNAL_RECLAIM)
+			goto err;
		ret = bch2_journal_preres_get(&c->journal, &as->journal_preres,
				BTREE_UPDATE_JOURNAL_RES,
				journal_flags);
...
@@ -134,7 +134,7 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter,
	return true;
 }
-static void __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
+static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
			       unsigned i, u64 seq)
 {
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
@@ -145,14 +145,15 @@ static void __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
	bch2_btree_node_write_cond(c, b,
		(btree_current_write(b) == w && w->journal.seq == seq));
	six_unlock_read(&b->c.lock);
+	return 0;
 }
-static void btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq)
+static int btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq)
 {
	return __btree_node_flush(j, pin, 0, seq);
 }
-static void btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq)
+static int btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq)
 {
	return __btree_node_flush(j, pin, 1, seq);
 }
@@ -563,8 +564,8 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
	ret = bch2_journal_preres_get(&c->journal,
			&trans->journal_preres, trans->journal_preres_u64s,
			JOURNAL_RES_GET_NONBLOCK|
-			((trans->flags & BTREE_INSERT_JOURNAL_RECLAIM)
-			 ? JOURNAL_RES_GET_RECLAIM : 0));
+			((trans->flags & BTREE_INSERT_JOURNAL_RESERVED)
+			 ? JOURNAL_RES_GET_RESERVED : 0));
	if (unlikely(ret == -EAGAIN))
		ret = bch2_trans_journal_preres_get_cold(trans,
					trans->journal_preres_u64s);
@@ -721,6 +722,10 @@ int bch2_trans_commit_error(struct btree_trans *trans,
	case BTREE_INSERT_NEED_JOURNAL_RES:
		bch2_trans_unlock(trans);
+		if ((trans->flags & BTREE_INSERT_JOURNAL_RECLAIM) &&
+		    !(trans->flags & BTREE_INSERT_JOURNAL_RESERVED))
+			return -EAGAIN;
		ret = bch2_trans_journal_res_get(trans, JOURNAL_RES_GET_CHECK);
		if (ret)
			return ret;
...
@@ -11,6 +11,7 @@
 #include "btree_gc.h"
 #include "btree_update.h"
 #include "buckets.h"
+#include "error.h"
 #include "journal.h"
 #include "journal_io.h"
 #include "journal_reclaim.h"
@@ -449,6 +450,27 @@ static int __journal_res_get(struct journal *j, struct journal_res *res,
	if (!ret)
		goto retry;
+	if ((ret == cur_entry_journal_full ||
+	     ret == cur_entry_journal_pin_full) &&
+	    !can_discard &&
+	    j->reservations.idx == j->reservations.unwritten_idx &&
+	    (flags & JOURNAL_RES_GET_RESERVED)) {
+		char *journal_debug_buf = kmalloc(4096, GFP_ATOMIC);
+
+		bch_err(c, "Journal stuck!");
+		if (journal_debug_buf) {
+			bch2_journal_debug_to_text(&_PBUF(journal_debug_buf, 4096), j);
+			bch_err(c, "%s", journal_debug_buf);
+			bch2_journal_pins_to_text(&_PBUF(journal_debug_buf, 4096), j);
+			bch_err(c, "Journal pins:\n%s", journal_debug_buf);
+			kfree(journal_debug_buf);
+		}
+
+		bch2_fatal_error(c);
+		dump_stack();
+	}
	/*
	 * Journal is full - can't rely on reclaim from work item due to
	 * freezing:
@@ -1169,6 +1191,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
		 "last_seq_ondisk:\t%llu\n"
		 "flushed_seq_ondisk:\t%llu\n"
		 "prereserved:\t\t%u/%u\n"
+		 "each entry reserved:\t%u\n"
		 "nr flush writes:\t%llu\n"
		 "nr noflush writes:\t%llu\n"
		 "nr direct reclaim:\t%llu\n"
@@ -1183,6 +1206,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
		 j->flushed_seq_ondisk,
		 j->prereserved.reserved,
		 j->prereserved.remaining,
+		 j->entry_u64s_reserved,
		 j->nr_flush_writes,
		 j->nr_noflush_writes,
		 j->nr_direct_reclaim,
...
@@ -308,7 +308,6 @@ int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *,
 #define JOURNAL_RES_GET_NONBLOCK	(1 << 0)
 #define JOURNAL_RES_GET_CHECK		(1 << 1)
 #define JOURNAL_RES_GET_RESERVED	(1 << 2)
-#define JOURNAL_RES_GET_RECLAIM		(1 << 3)
 static inline int journal_res_get_fast(struct journal *j,
				        struct journal_res *res,
@@ -446,7 +445,7 @@ static inline int bch2_journal_preres_get_fast(struct journal *j,
		 * into the reclaim path and deadlock:
		 */
-		if (!(flags & JOURNAL_RES_GET_RECLAIM) &&
+		if (!(flags & JOURNAL_RES_GET_RESERVED) &&
		    new.reserved > new.remaining)
			return 0;
	} while ((v = atomic64_cmpxchg(&j->prereserved.counter,
...
@@ -239,7 +239,7 @@ void bch2_journal_space_available(struct journal *j)
	u64s_remaining  = (u64) clean << 6;
	u64s_remaining -= (u64) total << 3;
	u64s_remaining  = max(0LL, u64s_remaining);
-	u64s_remaining /= 2;
+	u64s_remaining /= 4;
	u64s_remaining  = min_t(u64, u64s_remaining, U32_MAX);
 out:
	j->cur_entry_sectors = !ret ? j->space[journal_space_discarded].next_entry : 0;
@@ -353,6 +353,9 @@ static inline void __journal_pin_drop(struct journal *j,
	if (!journal_pin_active(pin))
		return;
+	if (j->flush_in_progress == pin)
+		j->flush_in_progress_dropped = true;
+
	pin_list = journal_seq_pin(j, pin->seq);
	pin->seq = 0;
	list_del_init(&pin->list);
@@ -439,34 +442,27 @@ journal_get_next_pin(struct journal *j, u64 max_seq, u64 *seq)
	struct journal_entry_pin_list *pin_list;
	struct journal_entry_pin *ret = NULL;
-	if (!test_bit(JOURNAL_RECLAIM_STARTED, &j->flags))
-		return NULL;
-	spin_lock(&j->lock);
	fifo_for_each_entry_ptr(pin_list, &j->pin, *seq)
		if (*seq > max_seq ||
		    (ret = list_first_entry_or_null(&pin_list->list,
				struct journal_entry_pin, list)))
			break;
-	if (ret) {
-		list_move(&ret->list, &pin_list->flushed);
-		BUG_ON(j->flush_in_progress);
-		j->flush_in_progress = ret;
-	}
-	spin_unlock(&j->lock);
	return ret;
 }
 /* returns true if we did work */
-static u64 journal_flush_pins(struct journal *j, u64 seq_to_flush,
-			      unsigned min_nr)
+static size_t journal_flush_pins(struct journal *j, u64 seq_to_flush,
+				 unsigned min_nr)
 {
	struct journal_entry_pin *pin;
-	u64 seq, ret = 0;
+	size_t nr_flushed = 0;
+	journal_pin_flush_fn flush_fn;
+	u64 seq;
+	int err;
+	if (!test_bit(JOURNAL_RECLAIM_STARTED, &j->flags))
+		return 0;
	lockdep_assert_held(&j->reclaim_lock);
@@ -475,23 +471,42 @@ static u64 journal_flush_pins(struct journal *j, u64 seq_to_flush,
		j->last_flushed = jiffies;
+		spin_lock(&j->lock);
		pin = journal_get_next_pin(j, min_nr
				? U64_MAX : seq_to_flush, &seq);
+		if (pin) {
+			BUG_ON(j->flush_in_progress);
+			j->flush_in_progress = pin;
+			j->flush_in_progress_dropped = false;
+			flush_fn = pin->flush;
+		}
+		spin_unlock(&j->lock);
		if (!pin)
			break;
		if (min_nr)
			min_nr--;
-		pin->flush(j, pin, seq);
-		BUG_ON(j->flush_in_progress != pin);
+		err = flush_fn(j, pin, seq);
+		spin_lock(&j->lock);
+		/* Pin might have been dropped or rearmed: */
+		if (likely(!err && !j->flush_in_progress_dropped))
+			list_move(&pin->list, &journal_seq_pin(j, seq)->flushed);
		j->flush_in_progress = NULL;
+		j->flush_in_progress_dropped = false;
+		spin_unlock(&j->lock);
		wake_up(&j->pin_flush_wait);
-		ret++;
+		if (err)
+			break;
+		nr_flushed++;
	}
-	return ret;
+	return nr_flushed;
 }
 static u64 journal_seq_to_flush(struct journal *j)
@@ -556,8 +571,8 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct)
 {
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	bool kthread = (current->flags & PF_KTHREAD) != 0;
-	u64 seq_to_flush, nr_flushed = 0;
-	size_t min_nr;
+	u64 seq_to_flush;
+	size_t min_nr, nr_flushed;
	unsigned flags;
	int ret = 0;
...
@@ -50,7 +50,7 @@ struct journal_entry_pin_list {
 struct journal;
 struct journal_entry_pin;
-typedef void (*journal_pin_flush_fn)(struct journal *j,
+typedef int (*journal_pin_flush_fn)(struct journal *j,
				struct journal_entry_pin *, u64);
 struct journal_entry_pin {
@@ -251,6 +251,7 @@ struct journal {
	unsigned long		last_flushed;
	struct journal_entry_pin *flush_in_progress;
+	bool			flush_in_progress_dropped;
	wait_queue_head_t	pin_flush_wait;
	/* protects advancing ja->discard_idx: */
...