Commit 3636ed48 authored by Kent Overstreet, committed by Kent Overstreet

bcachefs: Deferred btree updates

Will be used in the future for inode updates, which will be very helpful
for multithreaded workloads that have to update the inode with every
extent update (appends, or updates that change i_sectors)

Also will be used eventually for fully persistent alloc info

However - we still need a mechanism for reserving space in the journal
prior to getting a journal reservation, so it's not technically safe to
make use of this just yet; we could deadlock with the journal full
(although that's not likely to be an issue in practice)
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
parent eb863265
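
For orientation, a rough sketch of how a caller might eventually drive the interface added by this patch: allocate a deferred_update, submit keys through it with BTREE_INSERT_DEFERRED, then flush and free it when done. This is not code from the commit; the wrapper function, the choice of BTREE_ID_INODES, and the single-entry use of __bch2_btree_insert_at are illustrative assumptions, and only the declarations themselves come from the diff below.

/*
 * Illustrative sketch only, not part of this commit.  Names marked as
 * assumptions above; see the caveat about journal space reservation in
 * the commit message.
 */
static int example_deferred_update(struct bch_fs *c, struct bkey_i *k,
				   u64 *journal_seq)
{
	struct deferred_update *d;
	struct btree_insert_entry entry;
	int ret;

	/* room (in u64s) for the largest key we'll write through d: */
	d = bch2_deferred_update_alloc(c, BTREE_ID_INODES, k->k.u64s);

	entry = BTREE_INSERT_DEFERRED(d, k);

	/*
	 * Journals the key and repoints d's journal pin at the new
	 * reservation; the btree itself is only updated when the pin
	 * is flushed:
	 */
	ret = __bch2_btree_insert_at(&(struct btree_insert) {
		.c		= c,
		.journal_seq	= journal_seq,
		.flags		= BTREE_INSERT_NOFAIL,
		.nr		= 1,
		.entries	= &entry,
	});

	/* flushes the pending key to the btree and drops the pin: */
	bch2_deferred_update_free(c, d);
	return ret;
}
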
@@ -245,9 +245,28 @@ struct btree_iter {
 #define BTREE_ITER_MAX		8
 
+struct deferred_update {
+	struct journal_entry_pin journal;
+
+	spinlock_t		lock;
+	unsigned		gen;
+
+	u8			allocated_u64s;
+	enum btree_id		btree_id;
+
+	/* must be last: */
+	struct bkey_i		k;
+};
+
 struct btree_insert_entry {
-	struct btree_iter	*iter;
 	struct bkey_i		*k;
+
+	union {
+		struct btree_iter	*iter;
+		struct deferred_update	*d;
+	};
+
+	bool			deferred;
 };
 
 struct btree_trans {
...
@@ -16,6 +16,11 @@ bool bch2_btree_bset_insert_key(struct btree_iter *, struct btree *,
 void bch2_btree_journal_key(struct btree_insert *trans, struct btree_iter *,
			    struct bkey_i *);
 
+void bch2_deferred_update_free(struct bch_fs *,
+			       struct deferred_update *);
+struct deferred_update *
+bch2_deferred_update_alloc(struct bch_fs *, enum btree_id, unsigned);
+
 /* Normal update interface: */
 
 struct btree_insert {
@@ -38,6 +43,13 @@ int __bch2_btree_insert_at(struct btree_insert *);
		.k		= (_k),				\
	})
 
+#define BTREE_INSERT_DEFERRED(_d, _k)				\
+	((struct btree_insert_entry) {				\
+		.k		= (_k),				\
+		.d		= (_d),				\
+		.deferred	= true,				\
+	})
+
 /**
  * bch_btree_insert_at - insert one or more keys at iterator positions
  * @iter:		btree iterator
...
@@ -8,6 +8,7 @@
 #include "btree_locking.h"
 #include "buckets.h"
 #include "debug.h"
+#include "error.h"
 #include "extents.h"
 #include "journal.h"
 #include "journal_reclaim.h"
@@ -126,6 +127,27 @@ static void btree_node_flush1(struct journal *j, struct journal_entry_pin *pin,
	return __btree_node_flush(j, pin, 1, seq);
 }
 
+static inline void __btree_journal_key(struct btree_insert *trans,
+				       enum btree_id btree_id,
+				       struct bkey_i *insert)
+{
+	struct journal *j = &trans->c->journal;
+	u64 seq = trans->journal_res.seq;
+	bool needs_whiteout = insert->k.needs_whiteout;
+
+	/* ick */
+	insert->k.needs_whiteout = false;
+	bch2_journal_add_keys(j, &trans->journal_res,
+			      btree_id, insert);
+	insert->k.needs_whiteout = needs_whiteout;
+
+	bch2_journal_set_has_inode(j, &trans->journal_res,
+				   insert->k.p.inode);
+
+	if (trans->journal_seq)
+		*trans->journal_seq = seq;
+}
+
 void bch2_btree_journal_key(struct btree_insert *trans,
			    struct btree_iter *iter,
			    struct bkey_i *insert)
@@ -140,21 +162,9 @@ void bch2_btree_journal_key(struct btree_insert *trans,
	       !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY));
 
	if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) {
-		u64 seq = trans->journal_res.seq;
-		bool needs_whiteout = insert->k.needs_whiteout;
-
-		/* ick */
-		insert->k.needs_whiteout = false;
-		bch2_journal_add_keys(j, &trans->journal_res,
-				      iter->btree_id, insert);
-		insert->k.needs_whiteout = needs_whiteout;
-
-		bch2_journal_set_has_inode(j, &trans->journal_res,
-					   insert->k.p.inode);
-
-		if (trans->journal_seq)
-			*trans->journal_seq = seq;
-
-		btree_bset_last(b)->journal_seq = cpu_to_le64(seq);
+		__btree_journal_key(trans, iter->btree_id, insert);
+		btree_bset_last(b)->journal_seq =
+			cpu_to_le64(trans->journal_res.seq);
	}
 
	if (unlikely(!journal_pin_active(&w->journal))) {
@@ -227,8 +237,109 @@ btree_insert_key_leaf(struct btree_insert *trans,
	return ret;
 }
 
-#define trans_for_each_entry(trans, i)					\
-	for ((i) = (trans)->entries; (i) < (trans)->entries + (trans)->nr; (i)++)
+/* Deferred btree updates: */
+
+static void deferred_update_flush(struct journal *j,
+				  struct journal_entry_pin *pin,
+				  u64 seq)
+{
+	struct bch_fs *c = container_of(j, struct bch_fs, journal);
+	struct deferred_update *d =
+		container_of(pin, struct deferred_update, journal);
+	u64 tmp[32];
+	struct bkey_i *k = (void *) tmp;
+	unsigned gen;
+	int ret;
+
+	if (d->allocated_u64s > ARRAY_SIZE(tmp)) {
+		k = kmalloc(d->allocated_u64s * sizeof(u64), GFP_NOFS);
+		BUG_ON(!k); /* XXX */
+	}
+
+	spin_lock(&d->lock);
+	gen = d->gen;
+
+	if (journal_pin_active(&d->journal)) {
+		BUG_ON(d->k.k.u64s > d->allocated_u64s);
+
+		bkey_copy(k, &d->k);
+		spin_unlock(&d->lock);
+
+		ret = bch2_btree_insert(c, d->btree_id, k, NULL, NULL,
+					BTREE_INSERT_NOFAIL);
+		bch2_fs_fatal_err_on(ret && !bch2_journal_error(j),
+			c, "error flushing deferred btree update: %i", ret);
+
+		spin_lock(&d->lock);
+	}
+
+	if (gen == d->gen)
+		bch2_journal_pin_drop(j, &d->journal);
+	spin_unlock(&d->lock);
+
+	if (k != (void *) tmp)
+		kfree(k);
+}
+
+static enum btree_insert_ret
+btree_insert_key_deferred(struct btree_insert *trans,
+			  struct btree_insert_entry *insert)
+{
+	struct bch_fs *c = trans->c;
+	struct journal *j = &c->journal;
+	struct deferred_update *d = insert->d;
+
+	BUG_ON(trans->flags & BTREE_INSERT_JOURNAL_REPLAY);
+	BUG_ON(insert->k->u64s > d->allocated_u64s);
+
+	__btree_journal_key(trans, d->btree_id, insert->k);
+
+	spin_lock(&d->lock);
+	d->gen++;
+	bkey_copy(&d->k, insert->k);
+	spin_unlock(&d->lock);
+
+	bch2_journal_pin_update(j, trans->journal_res.seq, &d->journal,
+				deferred_update_flush);
+
+	return BTREE_INSERT_OK;
+}
+
+void bch2_deferred_update_free(struct bch_fs *c,
+			       struct deferred_update *d)
+{
+	deferred_update_flush(&c->journal, &d->journal, 0);
+
+	BUG_ON(journal_pin_active(&d->journal));
+
+	bch2_journal_pin_flush(&c->journal, &d->journal);
+	kfree(d);
+}
+
+struct deferred_update *
+bch2_deferred_update_alloc(struct bch_fs *c,
+			   enum btree_id btree_id,
+			   unsigned u64s)
+{
+	struct deferred_update *d;
+
+	BUG_ON(u64s > U8_MAX);
+
+	d = kmalloc(offsetof(struct deferred_update, k) +
+		    u64s * sizeof(u64), GFP_NOFS);
+	BUG_ON(!d);
+	memset(d, 0, offsetof(struct deferred_update, k));
+
+	spin_lock_init(&d->lock);
+	d->allocated_u64s	= u64s;
+	d->btree_id		= btree_id;
+
+	return d;
+}
+
+/* struct btree_insert operations: */
+
 /*
  * We sort transaction entries so that if multiple iterators point to the same
@@ -238,25 +349,32 @@ static bool same_leaf_as_prev(struct btree_insert *trans,
			      struct btree_insert_entry *i)
 {
	return i != trans->entries &&
+		!i->deferred &&
		i[0].iter->l[0].b == i[-1].iter->l[0].b;
 }
 
-static inline struct btree_insert_entry *trans_next_leaf(struct btree_insert *trans,
-							 struct btree_insert_entry *i)
-{
-	struct btree *b = i->iter->l[0].b;
-
-	do {
-		i++;
-	} while (i < trans->entries + trans->nr && b == i->iter->l[0].b);
-
-	return i;
-}
+#define __trans_next_entry(_trans, _i, _filter)				\
+({									\
+	while ((_i) < (_trans)->entries + (_trans->nr) && !(_filter))	\
+		(_i)++;							\
+									\
+	(_i) < (_trans)->entries + (_trans->nr);			\
+})
+
+#define __trans_for_each_entry(_trans, _i, _filter)			\
+	for ((_i) = (_trans)->entries;					\
+	     __trans_next_entry(_trans, _i, _filter);			\
+	     (_i)++)
+
+#define trans_for_each_entry(trans, i)					\
+	__trans_for_each_entry(trans, i, true)
+
+#define trans_for_each_iter(trans, i)					\
+	__trans_for_each_entry(trans, i, !(i)->deferred)
 
 #define trans_for_each_leaf(trans, i)					\
-	for ((i) = (trans)->entries;					\
-	     (i) < (trans)->entries + (trans)->nr;			\
-	     (i) = trans_next_leaf(trans, i))
+	__trans_for_each_entry(trans, i, !(i)->deferred &&		\
+			       !same_leaf_as_prev(trans, i))
 
 inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b,
					    struct btree_iter *iter)
@@ -294,7 +412,8 @@ static void multi_unlock_write(struct btree_insert *trans)
 static inline int btree_trans_cmp(struct btree_insert_entry l,
				  struct btree_insert_entry r)
 {
-	return btree_iter_cmp(l.iter, r.iter);
+	return (l.deferred > r.deferred) - (l.deferred < r.deferred) ?:
+		btree_iter_cmp(l.iter, r.iter);
 }
 
 /* Normal update interface: */
@@ -328,6 +447,15 @@ btree_key_can_insert(struct btree_insert *trans,
	return BTREE_INSERT_OK;
 }
 
+static inline enum btree_insert_ret
+do_btree_insert_one(struct btree_insert *trans,
+		    struct btree_insert_entry *insert)
+{
+	return likely(!insert->deferred)
+		? btree_insert_key_leaf(trans, insert)
+		: btree_insert_key_deferred(trans, insert);
+}
+
 /*
  * Get journal reservation, take write locks, and attempt to do btree update(s):
  */
@@ -340,9 +468,14 @@ static inline int do_btree_insert_at(struct btree_insert *trans,
	unsigned u64s;
	int ret;
 
-	trans_for_each_entry(trans, i)
+	trans_for_each_iter(trans, i)
		BUG_ON(i->iter->uptodate >= BTREE_ITER_NEED_RELOCK);
 
+	/* reserve space for deferred updates */
+	__trans_for_each_entry(trans, i, i->deferred) {
+	}
+
	memset(&trans->journal_res, 0, sizeof(trans->journal_res));
 
	if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) {
@@ -353,9 +486,13 @@ static inline int do_btree_insert_at(struct btree_insert *trans,
		while ((ret = bch2_journal_res_get(&c->journal,
					&trans->journal_res, u64s,
					JOURNAL_RES_GET_NONBLOCK)) == -EAGAIN) {
-			struct btree_iter *iter = trans->entries[0].iter;
+			struct btree_iter *iter = NULL;
 
-			bch2_btree_iter_unlock(iter);
+			trans_for_each_iter(trans, i)
+				iter = i->iter;
+
+			if (iter)
+				bch2_btree_iter_unlock(iter);
 
			ret = bch2_journal_res_get(&c->journal,
					&trans->journal_res, u64s,
@@ -363,7 +500,7 @@ static inline int do_btree_insert_at(struct btree_insert *trans,
			if (ret)
				return ret;
 
-			if (!bch2_btree_iter_relock(iter)) {
+			if (iter && !bch2_btree_iter_relock(iter)) {
				trans_restart(" (iter relock after journal res get blocked)");
				return -EINTR;
			}
@@ -387,7 +524,7 @@ static inline int do_btree_insert_at(struct btree_insert *trans,
	 * amount of space available:
	 */
	u64s = 0;
-	trans_for_each_entry(trans, i) {
+	trans_for_each_iter(trans, i) {
		/* Multiple inserts might go to same leaf: */
		if (!same_leaf_as_prev(trans, i))
			u64s = 0;
@@ -415,14 +552,17 @@ static inline int do_btree_insert_at(struct btree_insert *trans,
		 * have been traversed/locked, depending on what the caller was
		 * doing:
		 */
-		for_each_btree_iter(trans->entries[0].iter, linked)
-			if (linked->uptodate < BTREE_ITER_NEED_RELOCK)
-				linked->flags |= BTREE_ITER_NOUNLOCK;
+		trans_for_each_iter(trans, i) {
+			for_each_btree_iter(i->iter, linked)
+				if (linked->uptodate < BTREE_ITER_NEED_RELOCK)
+					linked->flags |= BTREE_ITER_NOUNLOCK;
+			break;
+		}
	}
	trans->did_work = true;
 
	trans_for_each_entry(trans, i) {
-		switch (btree_insert_key_leaf(trans, i)) {
+		switch (do_btree_insert_one(trans, i)) {
		case BTREE_INSERT_OK:
			break;
		case BTREE_INSERT_NEED_TRAVERSE:
@@ -444,12 +584,20 @@ static inline int do_btree_insert_at(struct btree_insert *trans,
 static inline void btree_insert_entry_checks(struct bch_fs *c,
					     struct btree_insert_entry *i)
 {
-	BUG_ON(i->iter->level);
-	BUG_ON(bkey_cmp(bkey_start_pos(&i->k->k), i->iter->pos));
+	enum btree_id btree_id = !i->deferred
+		? i->iter->btree_id
+		: i->d->btree_id;
+
+	if (!i->deferred) {
+		BUG_ON(i->iter->level);
+		BUG_ON(bkey_cmp(bkey_start_pos(&i->k->k), i->iter->pos));
+
+		bch2_btree_iter_verify_locks(i->iter);
+	}
+
	BUG_ON(debug_check_bkeys(c) &&
	       !bkey_deleted(&i->k->k) &&
-	       bch2_bkey_invalid(c, bkey_i_to_s_c(i->k),
-				 i->iter->btree_id));
+	       bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), btree_id));
 }
 
 /**
@@ -473,20 +621,18 @@ int __bch2_btree_insert_at(struct btree_insert *trans)
	BUG_ON(!trans->nr);
 
-	bch2_btree_iter_verify_locks(trans->entries[0].iter);
-
	/* for the sake of sanity: */
	BUG_ON(trans->nr > 1 && !(trans->flags & BTREE_INSERT_ATOMIC));
 
+	bubble_sort(trans->entries, trans->nr, btree_trans_cmp);
+
	trans_for_each_entry(trans, i)
		btree_insert_entry_checks(c, i);
 
-	bubble_sort(trans->entries, trans->nr, btree_trans_cmp);
-
	if (unlikely(!percpu_ref_tryget(&c->writes)))
		return -EROFS;
 retry:
-	trans_for_each_entry(trans, i) {
+	trans_for_each_iter(trans, i) {
		unsigned old_locks_want = i->iter->locks_want;
		unsigned old_uptodate = i->iter->uptodate;
@@ -510,16 +656,22 @@ int __bch2_btree_insert_at(struct btree_insert *trans)
	trans_for_each_leaf(trans, i)
		bch2_foreground_maybe_merge(c, i->iter, 0, trans->flags);
 
-	trans_for_each_entry(trans, i)
+	trans_for_each_iter(trans, i)
		bch2_btree_iter_downgrade(i->iter);
 out:
	percpu_ref_put(&c->writes);
 
	/* make sure we didn't drop or screw up locks: */
-	bch2_btree_iter_verify_locks(trans->entries[0].iter);
+	trans_for_each_iter(trans, i) {
+		bch2_btree_iter_verify_locks(i->iter);
+		break;
+	}
 
-	for_each_btree_iter(trans->entries[0].iter, linked)
-		linked->flags &= ~BTREE_ITER_NOUNLOCK;
+	trans_for_each_iter(trans, i) {
+		for_each_btree_iter(i->iter, linked)
+			linked->flags &= ~BTREE_ITER_NOUNLOCK;
+		break;
+	}
 
	BUG_ON(!(trans->flags & BTREE_INSERT_ATOMIC) && ret == -EINTR);
@@ -598,7 +750,7 @@ int __bch2_btree_insert_at(struct btree_insert *trans)
		goto out;
	}
 
-	trans_for_each_entry(trans, i) {
+	trans_for_each_iter(trans, i) {
		int ret2 = bch2_btree_iter_traverse(i->iter);
 
		if (ret2) {
			ret = ret2;
...
@@ -75,6 +75,25 @@ void bch2_journal_pin_drop(struct journal *j,
	spin_unlock(&j->lock);
 }
 
+void bch2_journal_pin_update(struct journal *j, u64 seq,
+			     struct journal_entry_pin *pin,
+			     journal_pin_flush_fn flush_fn)
+{
+	spin_lock(&j->lock);
+
+	if (pin->seq != seq) {
+		__journal_pin_drop(j, pin);
+		__journal_pin_add(j, seq, pin, flush_fn);
+	} else {
+		struct journal_entry_pin_list *pin_list =
+			journal_seq_pin(j, seq);
+
+		list_move(&pin->list, &pin_list->list);
+	}
+
+	spin_unlock(&j->lock);
+}
+
 void bch2_journal_pin_add_if_older(struct journal *j,
				   struct journal_entry_pin *src_pin,
				   struct journal_entry_pin *pin,
...
@@ -19,6 +19,8 @@ journal_seq_pin(struct journal *j, u64 seq)
 void bch2_journal_pin_add(struct journal *, u64, struct journal_entry_pin *,
			  journal_pin_flush_fn);
+void bch2_journal_pin_update(struct journal *, u64, struct journal_entry_pin *,
+			     journal_pin_flush_fn);
 void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *);
 void bch2_journal_pin_add_if_older(struct journal *,
				   struct journal_entry_pin *,
...