Commit ebb84d09, authored and committed by Kent Overstreet

bcachefs: Increase journal pipelining

This patch increases the maximum journal buffers in flight from 2 to 4 -
this will be particularly helpful when in the future we stop requiring
flush+fua for every journal write.
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
parent 5db43418
......@@ -23,7 +23,7 @@ static u64 last_unwritten_seq(struct journal *j)
lockdep_assert_held(&j->lock);
return journal_cur_seq(j) - s.prev_buf_unwritten;
return journal_cur_seq(j) - ((s.idx - s.unwritten_idx) & JOURNAL_BUF_MASK);
}
static inline bool journal_seq_unwritten(struct journal *j, u64 seq)
......@@ -51,7 +51,7 @@ journal_seq_to_buf(struct journal *j, u64 seq)
j->reservations.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL);
if (journal_seq_unwritten(j, seq)) {
buf = j->buf + (seq & 1);
buf = j->buf + (seq & JOURNAL_BUF_MASK);
EBUG_ON(le64_to_cpu(buf->data->seq) != seq);
}
return buf;
......@@ -108,15 +108,8 @@ void bch2_journal_halt(struct journal *j)
/* journal entry close/open: */
void __bch2_journal_buf_put(struct journal *j, bool need_write_just_set)
void __bch2_journal_buf_put(struct journal *j)
{
if (!need_write_just_set &&
test_bit(JOURNAL_NEED_WRITE, &j->flags))
bch2_time_stats_update(j->delay_time,
j->need_write_time);
clear_bit(JOURNAL_NEED_WRITE, &j->flags);
closure_call(&j->io, bch2_journal_write, system_highpri_wq, NULL);
}
......@@ -129,7 +122,6 @@ static bool __journal_entry_close(struct journal *j)
struct journal_buf *buf = journal_cur_buf(j);
union journal_res_state old, new;
u64 v = atomic64_read(&j->reservations.counter);
bool set_need_write = false;
unsigned sectors;
lockdep_assert_held(&j->lock);
......@@ -148,15 +140,13 @@ static bool __journal_entry_close(struct journal *j)
if (!test_bit(JOURNAL_NEED_WRITE, &j->flags)) {
set_bit(JOURNAL_NEED_WRITE, &j->flags);
j->need_write_time = local_clock();
set_need_write = true;
}
if (new.prev_buf_unwritten)
return false;
new.cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL;
new.idx++;
new.prev_buf_unwritten = 1;
if (new.idx == new.unwritten_idx)
return false;
BUG_ON(journal_state_count(new, new.idx));
} while ((v = atomic64_cmpxchg(&j->reservations.counter,
......@@ -190,24 +180,44 @@ static bool __journal_entry_close(struct journal *j)
*/
buf->data->last_seq = cpu_to_le64(journal_last_seq(j));
__bch2_journal_pin_put(j, le64_to_cpu(buf->data->seq));
journal_pin_new_entry(j, 1);
bch2_journal_buf_init(j);
cancel_delayed_work(&j->write_work);
clear_bit(JOURNAL_NEED_WRITE, &j->flags);
bch2_journal_space_available(j);
bch2_journal_buf_put(j, old.idx, set_need_write);
bch2_journal_buf_put(j, old.idx);
return true;
}
static bool journal_entry_want_write(struct journal *j)
{
union journal_res_state s = READ_ONCE(j->reservations);
bool ret = false;
/*
* Don't close it yet if we already have a write in flight, but do set
* NEED_WRITE:
*/
if (s.idx != s.unwritten_idx)
set_bit(JOURNAL_NEED_WRITE, &j->flags);
else
ret = __journal_entry_close(j);
return ret;
}
static bool journal_entry_close(struct journal *j)
{
bool ret;
spin_lock(&j->lock);
ret = __journal_entry_close(j);
ret = journal_entry_want_write(j);
spin_unlock(&j->lock);
return ret;
......@@ -289,8 +299,8 @@ static int journal_entry_open(struct journal *j)
static bool journal_quiesced(struct journal *j)
{
union journal_res_state state = READ_ONCE(j->reservations);
bool ret = !state.prev_buf_unwritten && !__journal_entry_is_open(state);
union journal_res_state s = READ_ONCE(j->reservations);
bool ret = s.idx == s.unwritten_idx && !__journal_entry_is_open(s);
if (!ret)
journal_entry_close(j);
......@@ -317,17 +327,29 @@ static void journal_write_work(struct work_struct *work)
u64 bch2_inode_journal_seq(struct journal *j, u64 inode)
{
size_t h = hash_64(inode, ilog2(sizeof(j->buf[0].has_inode) * 8));
u64 seq = 0;
union journal_res_state s;
unsigned i;
u64 seq;
if (!test_bit(h, j->buf[0].has_inode) &&
!test_bit(h, j->buf[1].has_inode))
return 0;
spin_lock(&j->lock);
if (test_bit(h, journal_cur_buf(j)->has_inode))
seq = journal_cur_seq(j);
else if (test_bit(h, journal_prev_buf(j)->has_inode))
seq = journal_cur_seq(j) - 1;
s = READ_ONCE(j->reservations);
i = s.idx;
while (1) {
if (test_bit(h, j->buf[i].has_inode))
goto out;
if (i == s.unwritten_idx)
break;
i = (i - 1) & JOURNAL_BUF_MASK;
seq--;
}
seq = 0;
out:
spin_unlock(&j->lock);
return seq;
......@@ -574,7 +596,7 @@ int bch2_journal_flush_seq_async(struct journal *j, u64 seq,
BUG();
if (seq == journal_cur_seq(j))
__journal_entry_close(j);
journal_entry_want_write(j);
out:
spin_unlock(&j->lock);
return ret;
......@@ -863,15 +885,18 @@ int bch2_dev_journal_alloc(struct bch_dev *ca)
static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx)
{
union journal_res_state state;
struct journal_buf *w;
bool ret;
bool ret = false;
unsigned i;
spin_lock(&j->lock);
state = READ_ONCE(j->reservations);
w = j->buf + !state.idx;
i = state.idx;
ret = state.prev_buf_unwritten &&
bch2_bkey_has_device(bkey_i_to_s_c(&w->key), dev_idx);
while (i != state.unwritten_idx) {
i = (i - 1) & JOURNAL_BUF_MASK;
if (bch2_bkey_has_device(bkey_i_to_s_c(&j->buf[i].key), dev_idx))
ret = true;
}
spin_unlock(&j->lock);
return ret;
......@@ -957,7 +982,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq,
journal_pin_new_entry(j, 1);
j->reservations.idx = journal_cur_seq(j);
j->reservations.idx = j->reservations.unwritten_idx = journal_cur_seq(j);
bch2_journal_buf_init(j);
......@@ -1015,8 +1040,10 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
void bch2_fs_journal_exit(struct journal *j)
{
kvpfree(j->buf[1].data, j->buf[1].buf_size);
kvpfree(j->buf[0].data, j->buf[0].buf_size);
unsigned i;
for (i = 0; i < ARRAY_SIZE(j->buf); i++)
kvpfree(j->buf[i].data, j->buf[i].buf_size);
free_fifo(&j->pin);
}
......@@ -1024,6 +1051,7 @@ int bch2_fs_journal_init(struct journal *j)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
static struct lock_class_key res_key;
unsigned i;
int ret = 0;
pr_verbose_init(c->opts, "");
......@@ -1038,8 +1066,6 @@ int bch2_fs_journal_init(struct journal *j)
lockdep_init_map(&j->res_map, "journal res", &res_key, 0);
j->buf[0].buf_size = JOURNAL_ENTRY_SIZE_MIN;
j->buf[1].buf_size = JOURNAL_ENTRY_SIZE_MIN;
j->write_delay_ms = 1000;
j->reclaim_delay_ms = 100;
......@@ -1051,13 +1077,20 @@ int bch2_fs_journal_init(struct journal *j)
((union journal_res_state)
{ .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v);
if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
!(j->buf[0].data = kvpmalloc(j->buf[0].buf_size, GFP_KERNEL)) ||
!(j->buf[1].data = kvpmalloc(j->buf[1].buf_size, GFP_KERNEL))) {
if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL))) {
ret = -ENOMEM;
goto out;
}
for (i = 0; i < ARRAY_SIZE(j->buf); i++) {
j->buf[i].buf_size = JOURNAL_ENTRY_SIZE_MIN;
j->buf[i].data = kvpmalloc(j->buf[i].buf_size, GFP_KERNEL);
if (!j->buf[i].data) {
ret = -ENOMEM;
goto out;
}
}
j->pin.front = j->pin.back = 1;
out:
pr_verbose_init(c->opts, "ret %i", ret);
......@@ -1071,7 +1104,7 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
struct bch_fs *c = container_of(j, struct bch_fs, journal);
union journal_res_state s;
struct bch_dev *ca;
unsigned iter;
unsigned i;
rcu_read_lock();
spin_lock(&j->lock);
......@@ -1114,16 +1147,16 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
}
pr_buf(out,
"current entry refs:\t%u\n"
"prev entry unwritten:\t",
journal_state_count(s, s.idx));
if (s.prev_buf_unwritten)
pr_buf(out, "yes, ref %u sectors %u\n",
journal_state_count(s, !s.idx),
journal_prev_buf(j)->sectors);
else
pr_buf(out, "no\n");
"current entry:\tidx %u refcount %u\n",
s.idx, journal_state_count(s, s.idx));
i = s.idx;
while (i != s.unwritten_idx) {
i = (i - 1) & JOURNAL_BUF_MASK;
pr_buf(out, "unwritten entry:\tidx %u refcount %u sectors %u\n",
i, journal_state_count(s, i), j->buf[i].sectors);
}
pr_buf(out,
"need write:\t\t%i\n"
......@@ -1131,7 +1164,7 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
test_bit(JOURNAL_NEED_WRITE, &j->flags),
test_bit(JOURNAL_REPLAY_DONE, &j->flags));
for_each_member_device_rcu(ca, c, iter,
for_each_member_device_rcu(ca, c, i,
&c->rw_devs[BCH_DATA_journal]) {
struct journal_device *ja = &ca->journal;
......@@ -1146,7 +1179,7 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
"\tdirty_idx_ondisk\t%u (seq %llu)\n"
"\tdirty_idx\t\t%u (seq %llu)\n"
"\tcur_idx\t\t%u (seq %llu)\n",
iter, ja->nr,
i, ja->nr,
bch2_journal_dev_buckets_available(j, ja, journal_space_discarded),
ja->sectors_free,
ja->discard_idx,
......
......@@ -127,11 +127,6 @@ static inline struct journal_buf *journal_cur_buf(struct journal *j)
return j->buf + j->reservations.idx;
}
/*
 * Buffer slot other than the currently open one: !idx maps idx 0 to slot 1
 * and any nonzero idx to slot 0.
 */
static inline struct journal_buf *journal_prev_buf(struct journal *j)
{
	return &j->buf[!j->reservations.idx];
}
/* Sequence number of oldest dirty journal entry */
static inline u64 journal_last_seq(struct journal *j)
......@@ -151,13 +146,21 @@ void bch2_journal_set_has_inum(struct journal *, u64, u64);
static inline int journal_state_count(union journal_res_state s, int idx)
{
return idx == 0 ? s.buf0_count : s.buf1_count;
switch (idx) {
case 0: return s.buf0_count;
case 1: return s.buf1_count;
case 2: return s.buf2_count;
case 3: return s.buf3_count;
}
BUG();
}
/*
 * Bump the bufN_count field whose N equals s->idx, leaving the other counts
 * untouched. Each line adds the 0/1 result of a comparison, so no branch is
 * needed to pick the field.
 */
static inline void journal_state_inc(union journal_res_state *s)
{
	const unsigned cur = s->idx;

	s->buf0_count += (cur == 0);
	s->buf1_count += (cur == 1);
	s->buf2_count += (cur == 2);
	s->buf3_count += (cur == 3);
}
static inline void bch2_journal_set_has_inode(struct journal *j,
......@@ -257,21 +260,24 @@ static inline bool journal_entry_empty(struct jset *j)
return true;
}
void __bch2_journal_buf_put(struct journal *, bool);
void __bch2_journal_buf_put(struct journal *);
static inline void bch2_journal_buf_put(struct journal *j, unsigned idx,
bool need_write_just_set)
static inline void bch2_journal_buf_put(struct journal *j, unsigned idx)
{
union journal_res_state s;
s.v = atomic64_sub_return(((union journal_res_state) {
.buf0_count = idx == 0,
.buf1_count = idx == 1,
.buf2_count = idx == 2,
.buf3_count = idx == 3,
}).v, &j->reservations.counter);
if (!journal_state_count(s, idx)) {
EBUG_ON(s.idx == idx || !s.prev_buf_unwritten);
__bch2_journal_buf_put(j, need_write_just_set);
}
EBUG_ON(((s.idx - idx) & 3) >
((s.idx - s.unwritten_idx) & 3));
if (!journal_state_count(s, idx) && idx == s.unwritten_idx)
__bch2_journal_buf_put(j);
}
/*
......@@ -291,7 +297,7 @@ static inline void bch2_journal_res_put(struct journal *j,
BCH_JSET_ENTRY_btree_keys,
0, 0, NULL, 0);
bch2_journal_buf_put(j, res->idx, false);
bch2_journal_buf_put(j, res->idx);
res->ref = 0;
}
......@@ -327,11 +333,18 @@ static inline int journal_res_get_fast(struct journal *j,
!test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags))
return 0;
if (flags & JOURNAL_RES_GET_CHECK)
return 1;
new.cur_entry_offset += res->u64s;
journal_state_inc(&new);
/*
* If the refcount would overflow, we have to wait:
* XXX - tracepoint this:
*/
if (!journal_state_count(new, new.idx))
return 0;
if (flags & JOURNAL_RES_GET_CHECK)
return 1;
} while ((v = atomic64_cmpxchg(&j->reservations.counter,
old.v, new.v)) != old.v);
......
......@@ -950,16 +950,23 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
buf->buf_size = new_size;
}
/* Journal buffer at slot j->reservations.unwritten_idx. */
static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j)
{
	return &j->buf[j->reservations.unwritten_idx];
}
static void journal_write_done(struct closure *cl)
{
struct journal *j = container_of(cl, struct journal, io);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct journal_buf *w = journal_prev_buf(j);
struct journal_buf *w = journal_last_unwritten_buf(j);
struct bch_devs_list devs =
bch2_bkey_devs(bkey_i_to_s_c(&w->key));
struct bch_replicas_padded replicas;
union journal_res_state old, new;
u64 seq = le64_to_cpu(w->data->seq);
u64 last_seq = le64_to_cpu(w->data->last_seq);
u64 v;
int err = 0;
bch2_time_stats_update(j->write_time, j->write_start_time);
......@@ -998,9 +1005,14 @@ static void journal_write_done(struct closure *cl)
/* also must come before signalling write completion: */
closure_debug_destroy(cl);
BUG_ON(!j->reservations.prev_buf_unwritten);
atomic64_sub(((union journal_res_state) { .prev_buf_unwritten = 1 }).v,
&j->reservations.counter);
v = atomic64_read(&j->reservations.counter);
do {
old.v = new.v = v;
BUG_ON(new.idx == new.unwritten_idx);
new.unwritten_idx++;
} while ((v = atomic64_cmpxchg(&j->reservations.counter,
old.v, new.v)) != old.v);
closure_wake_up(&w->wait);
journal_wake(j);
......@@ -1008,6 +1020,10 @@ static void journal_write_done(struct closure *cl)
if (test_bit(JOURNAL_NEED_WRITE, &j->flags))
mod_delayed_work(system_freezable_wq, &j->write_work, 0);
spin_unlock(&j->lock);
if (new.unwritten_idx != new.idx &&
!journal_state_count(new, new.unwritten_idx))
closure_call(&j->io, bch2_journal_write, system_highpri_wq, NULL);
}
static void journal_write_endio(struct bio *bio)
......@@ -1018,7 +1034,7 @@ static void journal_write_endio(struct bio *bio)
if (bch2_dev_io_err_on(bio->bi_status, ca, "journal write error: %s",
bch2_blk_status_to_str(bio->bi_status)) ||
bch2_meta_write_fault("journal")) {
struct journal_buf *w = journal_prev_buf(j);
struct journal_buf *w = journal_last_unwritten_buf(j);
unsigned long flags;
spin_lock_irqsave(&j->err_lock, flags);
......@@ -1035,7 +1051,7 @@ void bch2_journal_write(struct closure *cl)
struct journal *j = container_of(cl, struct journal, io);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bch_dev *ca;
struct journal_buf *w = journal_prev_buf(j);
struct journal_buf *w = journal_last_unwritten_buf(j);
struct jset_entry *start, *end;
struct jset *jset;
struct bio *bio;
......@@ -1046,8 +1062,6 @@ void bch2_journal_write(struct closure *cl)
BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
bch2_journal_pin_put(j, le64_to_cpu(w->data->seq));
journal_buf_realloc(j, w);
jset = w->data;
......
......@@ -58,6 +58,19 @@ static void journal_set_remaining(struct journal *j, unsigned u64s_remaining)
old.v, new.v)) != old.v);
}
/*
 * Walk buffer slots starting at *idx, stopping before j->reservations.idx,
 * and return the first nonzero ->sectors found.
 *
 * *idx is advanced (modulo JOURNAL_BUF_NR) past every slot examined, so a
 * caller can invoke this repeatedly to visit each nonzero slot in turn.
 * Returns 0 once *idx has caught up with j->reservations.idx.
 */
static inline unsigned get_unwritten_sectors(struct journal *j, unsigned *idx)
{
	while (*idx != j->reservations.idx) {
		unsigned nr = j->buf[*idx].sectors;

		*idx = (*idx + 1) & JOURNAL_BUF_MASK;
		if (nr)
			return nr;
	}

	return 0;
}
static struct journal_space {
unsigned next_entry;
unsigned remaining;
......@@ -69,15 +82,14 @@ static struct journal_space {
unsigned sectors_next_entry = UINT_MAX;
unsigned sectors_total = UINT_MAX;
unsigned i, nr_devs = 0;
unsigned unwritten_sectors = j->reservations.prev_buf_unwritten
? journal_prev_buf(j)->sectors
: 0;
unsigned unwritten_sectors;
rcu_read_lock();
for_each_member_device_rcu(ca, c, i,
&c->rw_devs[BCH_DATA_journal]) {
struct journal_device *ja = &ca->journal;
unsigned buckets_this_device, sectors_this_device;
unsigned idx = j->reservations.unwritten_idx;
if (!ja->nr)
continue;
......@@ -89,15 +101,19 @@ static struct journal_space {
* Note that we don't allocate the space for a journal entry
* until we write it out - thus, account for it here:
*/
while ((unwritten_sectors = get_unwritten_sectors(j, &idx))) {
if (unwritten_sectors >= sectors_this_device) {
if (!buckets_this_device)
continue;
if (!buckets_this_device) {
sectors_this_device = 0;
break;
}
buckets_this_device--;
sectors_this_device = ca->mi.bucket_size;
}
sectors_this_device -= unwritten_sectors;
}
if (sectors_this_device < ca->mi.bucket_size &&
buckets_this_device) {
......@@ -277,6 +293,14 @@ static void bch2_journal_reclaim_fast(struct journal *j)
bch2_journal_space_available(j);
}
/*
 * Drop one count on the pin list for journal sequence number @seq; if that
 * was the last one, run bch2_journal_reclaim_fast().
 *
 * NOTE(review): presumably the variant for callers that already hold j->lock
 * (it is called from __journal_entry_close(), which asserts the lock) - confirm.
 */
void __bch2_journal_pin_put(struct journal *j, u64 seq)
{
	struct journal_entry_pin_list *pl = journal_seq_pin(j, seq);

	if (!atomic_dec_and_test(&pl->count))
		return;

	bch2_journal_reclaim_fast(j);
}
void bch2_journal_pin_put(struct journal *j, u64 seq)
{
struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);
......
......@@ -39,6 +39,7 @@ journal_seq_pin(struct journal *j, u64 seq)
return &j->pin.data[seq & j->pin.mask];
}
void __bch2_journal_pin_put(struct journal *, u64);
void bch2_journal_pin_put(struct journal *, u64);
void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *);
......
......@@ -11,13 +11,13 @@
struct journal_res;
#define JOURNAL_BUF_BITS 1
#define JOURNAL_BUF_BITS 2
#define JOURNAL_BUF_NR (1U << JOURNAL_BUF_BITS)
#define JOURNAL_BUF_MASK (JOURNAL_BUF_NR - 1)
/*
* We put two of these in struct journal; we used them for writes to the
* journal that are being staged or in flight.
* We put JOURNAL_BUF_NR of these in struct journal; we use them for writes to
* the journal that are being staged or in flight.
*/
struct journal_buf {
struct jset *data;
......@@ -85,10 +85,12 @@ union journal_res_state {
struct {
u64 cur_entry_offset:20,
idx:1,
prev_buf_unwritten:1,
buf0_count:21,
buf1_count:21;
idx:2,
unwritten_idx:2,
buf0_count:10,
buf1_count:10,
buf2_count:10,
buf3_count:10;
};
};
......@@ -169,7 +171,7 @@ struct journal {
* JOURNAL_BUF_NR journal entries -- one is currently open for new entries,
* the others are possibly being written out.
*/
struct journal_buf buf[2];
struct journal_buf buf[JOURNAL_BUF_NR];
spinlock_t lock;
......
......@@ -1048,13 +1048,13 @@ int bch2_fs_recovery(struct bch_fs *c)
if (!c->sb.clean) {
ret = bch2_journal_seq_blacklist_add(c,
journal_seq,
journal_seq + 4);
journal_seq + 8);
if (ret) {
bch_err(c, "error creating new journal seq blacklist entry");
goto err;
}
journal_seq += 4;
journal_seq += 8;
/*
* The superblock needs to be written before we do any btree
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment