Commit 2ca88e5a, authored by Kent Overstreet, committed by Kent Overstreet

bcachefs: Btree key cache

This introduces a new kind of btree iterator, cached iterators, which
point to keys cached in a hash table. The cache also acts as a write
cache - in the update path, we journal the update but defer updating the
btree until the cached entry is flushed by journal reclaim.

Cache coherency is for now up to the users to handle, which isn't ideal
but should be good enough for now.

These new iterators will be used for updating inodes and alloc info (the
alloc and stripes btrees).
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
parent 451570a5
......@@ -13,6 +13,7 @@ bcachefs-y := \
btree_gc.o \
btree_io.o \
btree_iter.o \
btree_key_cache.o \
btree_update_interior.o \
btree_update_leaf.o \
buckets.o \
......
......@@ -483,6 +483,7 @@ enum {
BCH_FS_ALLOCATOR_RUNNING,
BCH_FS_ALLOCATOR_STOPPING,
BCH_FS_INITIAL_GC_DONE,
BCH_FS_BTREE_INTERIOR_REPLAY_DONE,
BCH_FS_FSCK_DONE,
BCH_FS_STARTED,
BCH_FS_RW,
......@@ -632,6 +633,8 @@ struct bch_fs {
struct list_head btree_trans_list;
mempool_t btree_iters_pool;
struct btree_key_cache btree_key_cache;
struct workqueue_struct *wq;
/* copygc needs its own workqueue for index updates.. */
struct workqueue_struct *copygc_wq;
......
......@@ -4,22 +4,16 @@
#include "bkey_methods.h"
#include "btree_cache.h"
#include "btree_iter.h"
#include "btree_key_cache.h"
#include "btree_locking.h"
#include "btree_update.h"
#include "debug.h"
#include "extents.h"
#include "journal.h"
#include "trace.h"
#include <linux/prefetch.h>
#define BTREE_ITER_NO_NODE_GET_LOCKS ((struct btree *) 1)
#define BTREE_ITER_NO_NODE_DROP ((struct btree *) 2)
#define BTREE_ITER_NO_NODE_LOCK_ROOT ((struct btree *) 3)
#define BTREE_ITER_NO_NODE_UP ((struct btree *) 4)
#define BTREE_ITER_NO_NODE_DOWN ((struct btree *) 5)
#define BTREE_ITER_NO_NODE_INIT ((struct btree *) 6)
#define BTREE_ITER_NO_NODE_ERROR ((struct btree *) 7)
static inline bool is_btree_node(struct btree_iter *iter, unsigned l)
{
return l < BTREE_MAX_DEPTH &&
......@@ -253,7 +247,8 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
}
/* Must lock btree nodes in key order: */
if (iter->btree_id < linked->btree_id)
if ((cmp_int(iter->btree_id, linked->btree_id) ?:
-cmp_int(btree_iter_type(iter), btree_iter_type(linked))) < 0)
ret = false;
if (iter->btree_id == linked->btree_id &&
......@@ -301,7 +296,7 @@ static void bch2_btree_iter_verify_locks(struct btree_iter *iter)
return;
}
for (l = 0; btree_iter_node(iter, l); l++) {
for (l = 0; is_btree_node(iter, l); l++) {
if (iter->uptodate >= BTREE_ITER_NEED_RELOCK &&
!btree_node_locked(iter, l))
continue;
......@@ -323,7 +318,7 @@ static inline void bch2_btree_iter_verify_locks(struct btree_iter *iter) {}
#endif
__flatten
static bool bch2_btree_iter_relock(struct btree_iter *iter, bool trace)
bool bch2_btree_iter_relock(struct btree_iter *iter, bool trace)
{
return btree_iter_get_locks(iter, false, trace);
}
......@@ -845,6 +840,8 @@ static inline void __btree_iter_init(struct btree_iter *iter,
static inline void btree_iter_node_set(struct btree_iter *iter,
struct btree *b)
{
BUG_ON(btree_iter_type(iter) == BTREE_ITER_CACHED);
btree_iter_verify_new_node(iter, b);
EBUG_ON(!btree_iter_pos_in_node(iter, b));
......@@ -865,7 +862,8 @@ void bch2_btree_iter_node_replace(struct btree_iter *iter, struct btree *b)
struct btree_iter *linked;
trans_for_each_iter(iter->trans, linked)
if (btree_iter_pos_in_node(linked, b)) {
if (btree_iter_type(linked) != BTREE_ITER_CACHED &&
btree_iter_pos_in_node(linked, b)) {
/*
* bch2_btree_iter_node_drop() has already been called -
* the old node we're replacing has already been
......@@ -1057,24 +1055,28 @@ static void btree_iter_up(struct btree_iter *iter)
static int btree_iter_traverse_one(struct btree_iter *);
static int __btree_iter_traverse_all(struct btree_trans *trans,
struct btree_iter *orig_iter, int ret)
static int __btree_iter_traverse_all(struct btree_trans *trans, int ret)
{
struct bch_fs *c = trans->c;
struct btree_iter *iter;
u8 sorted[BTREE_ITER_MAX];
unsigned i, nr_sorted = 0;
if (trans->in_traverse_all)
return -EINTR;
trans->in_traverse_all = true;
retry_all:
nr_sorted = 0;
trans_for_each_iter(trans, iter)
sorted[nr_sorted++] = iter - trans->iters;
sorted[nr_sorted++] = iter->idx;
#define btree_iter_cmp_by_idx(_l, _r) \
btree_iter_cmp(&trans->iters[_l], &trans->iters[_r])
bubble_sort(sorted, nr_sorted, btree_iter_cmp_by_idx);
#undef btree_iter_cmp_by_idx
retry_all:
bch2_trans_unlock(trans);
if (unlikely(ret == -ENOMEM)) {
......@@ -1090,11 +1092,6 @@ static int __btree_iter_traverse_all(struct btree_trans *trans,
if (unlikely(ret == -EIO)) {
trans->error = true;
if (orig_iter) {
orig_iter->flags |= BTREE_ITER_ERROR;
orig_iter->l[orig_iter->level].b =
BTREE_ITER_NO_NODE_ERROR;
}
goto out;
}
......@@ -1102,9 +1099,16 @@ static int __btree_iter_traverse_all(struct btree_trans *trans,
/* Now, redo traversals in correct order: */
for (i = 0; i < nr_sorted; i++) {
iter = &trans->iters[sorted[i]];
unsigned idx = sorted[i];
ret = btree_iter_traverse_one(iter);
/*
* successfully traversing one iterator can cause another to be
* unlinked, in btree_key_cache_fill()
*/
if (!(trans->iters_linked & (1ULL << idx)))
continue;
ret = btree_iter_traverse_one(&trans->iters[idx]);
if (ret)
goto retry_all;
}
......@@ -1119,12 +1123,14 @@ static int __btree_iter_traverse_all(struct btree_trans *trans,
}
out:
bch2_btree_cache_cannibalize_unlock(c);
trans->in_traverse_all = false;
return ret;
}
int bch2_btree_iter_traverse_all(struct btree_trans *trans)
{
return __btree_iter_traverse_all(trans, NULL, 0);
return __btree_iter_traverse_all(trans, 0);
}
static inline bool btree_iter_good_node(struct btree_iter *iter,
......@@ -1169,9 +1175,6 @@ static int btree_iter_traverse_one(struct btree_iter *iter)
{
unsigned depth_want = iter->level;
if (unlikely(iter->level >= BTREE_MAX_DEPTH))
return 0;
/*
* if we need interior nodes locked, call btree_iter_relock() to make
* sure we walk back up enough that we lock them:
......@@ -1180,9 +1183,15 @@ static int btree_iter_traverse_one(struct btree_iter *iter)
iter->locks_want > 1)
bch2_btree_iter_relock(iter, false);
if (btree_iter_type(iter) == BTREE_ITER_CACHED)
return bch2_btree_iter_traverse_cached(iter);
if (iter->uptodate < BTREE_ITER_NEED_RELOCK)
return 0;
if (unlikely(iter->level >= BTREE_MAX_DEPTH))
return 0;
/*
* XXX: correctly using BTREE_ITER_UPTODATE should make using check_pos
* here unnecessary
......@@ -1216,7 +1225,15 @@ static int btree_iter_traverse_one(struct btree_iter *iter)
return 0;
iter->level = depth_want;
iter->l[iter->level].b = BTREE_ITER_NO_NODE_DOWN;
if (ret == -EIO) {
iter->flags |= BTREE_ITER_ERROR;
iter->l[iter->level].b =
BTREE_ITER_NO_NODE_ERROR;
} else {
iter->l[iter->level].b =
BTREE_ITER_NO_NODE_DOWN;
}
return ret;
}
}
......@@ -1229,12 +1246,13 @@ static int btree_iter_traverse_one(struct btree_iter *iter)
int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter)
{
struct btree_trans *trans = iter->trans;
int ret;
ret = bch2_trans_cond_resched(iter->trans) ?:
ret = bch2_trans_cond_resched(trans) ?:
btree_iter_traverse_one(iter);
if (unlikely(ret))
ret = __btree_iter_traverse_all(iter->trans, iter, ret);
ret = __btree_iter_traverse_all(trans, ret);
return ret;
}
......@@ -1383,6 +1401,13 @@ static void btree_iter_pos_changed(struct btree_iter *iter, int cmp)
if (!cmp)
goto out;
if (unlikely(btree_iter_type(iter) == BTREE_ITER_CACHED)) {
btree_node_unlock(iter, 0);
iter->l[0].b = BTREE_ITER_NO_NODE_UP;
btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
return;
}
l = btree_iter_up_until_good_node(iter, cmp);
if (btree_iter_node(iter, l)) {
......@@ -1814,6 +1839,26 @@ struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter)
return bch2_btree_iter_peek_slot(iter);
}
/*
 * Peek at the key a cached iterator points to.
 *
 * Traverses @iter and returns the key stored in the key cache entry that the
 * traversal left in iter->l[0].b.  A traversal failure is returned as an error
 * key (see bkey_s_c_err()).  The entry is expected to be valid at this point.
 */
struct bkey_s_c bch2_btree_iter_peek_cached(struct btree_iter *iter)
{
	struct bkey_cached *cached;
	int err;

	bch2_btree_iter_checks(iter, BTREE_ITER_CACHED);

	err = bch2_btree_iter_traverse(iter);
	if (unlikely(err))
		return bkey_s_c_err(err);

	/* For cached iterators, level 0 holds a struct bkey_cached. */
	cached = (void *) iter->l[0].b;

	EBUG_ON(iter->btree_id != cached->key.btree_id ||
		bkey_cmp(iter->pos, cached->key.pos));
	BUG_ON(!cached->valid);

	return bkey_i_to_s_c(cached->k);
}
static inline void bch2_btree_iter_init(struct btree_trans *trans,
struct btree_iter *iter, enum btree_id btree_id,
struct bpos pos, unsigned flags)
......@@ -1999,6 +2044,7 @@ static inline void btree_iter_copy(struct btree_iter *dst,
*dst = *src;
dst->idx = idx;
dst->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT;
for (i = 0; i < BTREE_MAX_DEPTH; i++)
if (btree_node_locked(dst, i))
......@@ -2057,8 +2103,9 @@ static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans,
iter = best;
}
iter->flags &= ~(BTREE_ITER_SLOTS|BTREE_ITER_INTENT|BTREE_ITER_PREFETCH);
iter->flags |= flags & (BTREE_ITER_SLOTS|BTREE_ITER_INTENT|BTREE_ITER_PREFETCH);
iter->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT;
iter->flags &= ~BTREE_ITER_USER_FLAGS;
iter->flags |= flags & BTREE_ITER_USER_FLAGS;
if (iter->flags & BTREE_ITER_INTENT)
bch2_btree_iter_upgrade(iter, 1);
......@@ -2263,6 +2310,8 @@ int bch2_trans_exit(struct btree_trans *trans)
mutex_unlock(&trans->c->btree_trans_lock);
#endif
bch2_journal_preres_put(&trans->c->journal, &trans->journal_preres);
kfree(trans->fs_usage_deltas);
kfree(trans->mem);
if (trans->used_mempool)
......
......@@ -110,6 +110,7 @@ void bch2_btree_node_iter_fix(struct btree_iter *, struct btree *,
struct btree_node_iter *, struct bkey_packed *,
unsigned, unsigned);
bool bch2_btree_iter_relock(struct btree_iter *, bool);
bool bch2_trans_relock(struct btree_trans *);
void bch2_trans_unlock(struct btree_trans *);
......@@ -170,6 +171,8 @@ struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *);
struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *);
struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *);
struct bkey_s_c bch2_btree_iter_peek_cached(struct btree_iter *);
void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *, struct bpos);
void __bch2_btree_iter_set_pos(struct btree_iter *, struct bpos, bool);
void bch2_btree_iter_set_pos(struct btree_iter *, struct bpos);
......@@ -177,7 +180,9 @@ void bch2_btree_iter_set_pos(struct btree_iter *, struct bpos);
static inline int btree_iter_cmp(const struct btree_iter *l,
const struct btree_iter *r)
{
return cmp_int(l->btree_id, r->btree_id) ?: bkey_cmp(l->pos, r->pos);
return cmp_int(l->btree_id, r->btree_id) ?:
-cmp_int(btree_iter_type(l), btree_iter_type(r)) ?:
bkey_cmp(l->pos, r->pos);
}
/*
......@@ -211,6 +216,9 @@ static inline int bch2_trans_cond_resched(struct btree_trans *trans)
static inline struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter,
unsigned flags)
{
if ((flags & BTREE_ITER_TYPE) == BTREE_ITER_CACHED)
return bch2_btree_iter_peek_cached(iter);
else
return flags & BTREE_ITER_SLOTS
? bch2_btree_iter_peek_slot(iter)
: bch2_btree_iter_peek(iter);
......
#include "bcachefs.h"
#include "btree_iter.h"
#include "btree_key_cache.h"
#include "btree_locking.h"
#include "btree_update.h"
#include "error.h"
#include "journal.h"
#include "journal_reclaim.h"
#include "trace.h"
/*
 * rhashtable comparison callback for the key cache.
 *
 * Compares the key embedded in the hashed object @obj against the lookup key
 * in @arg, first by btree ID and then by position.  Returns 0 on a match and
 * nonzero otherwise.
 */
static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg,
				       const void *obj)
{
	const struct bkey_cached_key *want = arg->key;
	const struct bkey_cached *cached = obj;
	int diff = cmp_int(cached->key.btree_id, want->btree_id);

	if (diff)
		return diff;

	return bkey_cmp(cached->key.pos, want->pos);
}
/*
 * rhashtable parameters for the key cache: objects are struct bkey_cached,
 * linked into the table through ->hash and keyed by the embedded
 * struct bkey_cached_key (btree ID + position).  Key equality is decided by
 * bch2_btree_key_cache_cmp_fn().
 */
static const struct rhashtable_params bch2_btree_key_cache_params = {
.head_offset = offsetof(struct bkey_cached, hash),
.key_offset = offsetof(struct bkey_cached, key),
.key_len = sizeof(struct bkey_cached_key),
.obj_cmpfn = bch2_btree_key_cache_cmp_fn,
};
__flatten
static inline struct bkey_cached *
btree_key_cache_find(struct bch_fs *c, enum btree_id btree_id, struct bpos pos)
{
struct bkey_cached_key key = {
.btree_id = btree_id,
.pos = pos,
};
return rhashtable_lookup_fast(&c->btree_key_cache.table, &key,
bch2_btree_key_cache_params);
}
/*
 * Try to take the locks needed to evict or reuse @ck, without blocking.
 *
 * Succeeds only if both the intent and write locks can be taken with trylock
 * and the entry is not dirty.  On success returns true with intent and write
 * locks held; on failure returns false with no locks held.
 */
static bool bkey_cached_lock_for_evict(struct bkey_cached *ck)
{
	struct six_lock *lock = &ck->c.lock;

	if (!six_trylock_intent(lock))
		return false;

	if (!six_trylock_write(lock))
		goto drop_intent;

	/* Dirty entries still carry an unflushed update; leave them alone. */
	if (test_bit(BKEY_CACHED_DIRTY, &ck->flags))
		goto drop_write;

	return true;

drop_write:
	six_unlock_write(lock);
drop_intent:
	six_unlock_intent(lock);
	return false;
}
/*
 * Remove @ck from the key cache hash table.
 *
 * The entry must currently be hashed: a failed removal is treated as a bug.
 * The entry's key is then overwritten with all-one bits, so that code which
 * re-checks ck->key against its own position after taking the lock (see
 * bch2_btree_iter_traverse_cached()) notices the entry has gone away.
 */
static void bkey_cached_evict(struct btree_key_cache *c,
			      struct bkey_cached *ck)
{
	int err = rhashtable_remove_fast(&c->table, &ck->hash,
					 bch2_btree_key_cache_params);

	BUG_ON(err);

	memset(&ck->key, ~0, sizeof(ck->key));
}
/*
 * Retire @ck: release its key buffer, park the entry itself on the cache's
 * freed list for later reuse, and drop the write and intent locks the caller
 * holds on it.
 */
static void bkey_cached_free(struct btree_key_cache *c,
			     struct bkey_cached *ck)
{
	/* Only the key buffer is released; the entry struct is recycled. */
	kfree(ck->k);
	ck->k = NULL;
	ck->u64s = 0;

	list_move(&ck->list, &c->freed);

	six_unlock_write(&ck->c.lock);
	six_unlock_intent(&ck->c.lock);
}
static struct bkey_cached *
bkey_cached_alloc(struct btree_key_cache *c)
{
struct bkey_cached *ck;
list_for_each_entry(ck, &c->freed, list)
if (bkey_cached_lock_for_evict(ck))
return ck;
list_for_each_entry(ck, &c->clean, list)
if (bkey_cached_lock_for_evict(ck)) {
bkey_cached_evict(c, ck);
return ck;
}
ck = kzalloc(sizeof(*ck), GFP_NOFS);
if (!ck)
return NULL;
INIT_LIST_HEAD(&ck->list);
six_lock_init(&ck->c.lock);
lockdep_set_novalidate_class(&ck->c.lock);
BUG_ON(!six_trylock_intent(&ck->c.lock));
BUG_ON(!six_trylock_write(&ck->c.lock));
return ck;
}
/*
 * Create a not-yet-valid key cache entry for (@btree_id, @pos) and insert it
 * into the hash table.
 *
 * Returns:
 *  - the new entry, on the clean list, with the intent lock still held (the
 *    write lock is dropped before returning);
 *  - NULL if the hash table insert failed, which is treated as having raced
 *    with another fill of the same position;
 *  - ERR_PTR(-ENOMEM) if no entry could be obtained.
 */
static struct bkey_cached *
btree_key_cache_create(struct btree_key_cache *c,
		       enum btree_id btree_id,
		       struct bpos pos)
{
	struct bkey_cached *ck = bkey_cached_alloc(c);
	int err;

	if (!ck)
		return ERR_PTR(-ENOMEM);

	ck->c.level		= 0;
	ck->c.btree_id		= btree_id;
	ck->key.btree_id	= btree_id;
	ck->key.pos		= pos;
	ck->valid		= false;

	BUG_ON(ck->flags);

	err = rhashtable_lookup_insert_fast(&c->table, &ck->hash,
					    bch2_btree_key_cache_params);
	if (err) {
		/* We raced with another fill: */
		bkey_cached_free(c, ck);
		return NULL;
	}

	list_move(&ck->list, &c->clean);
	six_unlock_write(&ck->c.lock);

	return ck;
}
/*
 * Populate key cache entry @ck from the btree.
 *
 * Reads the key at ck->key.pos with a temporary BTREE_ITER_SLOTS iterator,
 * relocks @ck_iter (the cached iterator that points at @ck), grows the entry's
 * key buffer if the key does not fit, and copies the key in under the write
 * lock, marking the entry valid.
 *
 * Returns 0 on success, -EINTR if @ck_iter could not be relocked, -ENOMEM if
 * the buffer could not be grown, or the error from looking up the key.
 */
static int btree_key_cache_fill(struct btree_trans *trans,
				struct btree_iter *ck_iter,
				struct bkey_cached *ck)
{
	struct btree_iter *iter;
	struct bkey_s_c k;
	struct bkey_i *new_k = NULL;
	unsigned new_u64s = 0;
	int ret;

	iter = bch2_trans_get_iter(trans, ck->key.btree_id,
				   ck->key.pos, BTREE_ITER_SLOTS);
	if (IS_ERR(iter))
		return PTR_ERR(iter);

	k = bch2_btree_iter_peek_slot(iter);
	ret = bkey_err(k);
	if (ret)
		goto put_iter;

	if (!bch2_btree_node_relock(ck_iter, 0)) {
		trace_transaction_restart_ip(trans->ip, _THIS_IP_);
		ret = -EINTR;
		goto put_iter;
	}

	/* Allocate any larger buffer before taking the write lock. */
	if (k.k->u64s > ck->u64s) {
		new_u64s = roundup_pow_of_two(k.k->u64s);
		new_k = kmalloc(new_u64s * sizeof(u64), GFP_NOFS);
		if (!new_k) {
			ret = -ENOMEM;
			goto put_iter;
		}
	}

	bch2_btree_node_lock_write(ck_iter->l[0].b, ck_iter);

	if (new_k) {
		kfree(ck->k);
		ck->u64s = new_u64s;
		ck->k = new_k;
	}

	bkey_reassemble(ck->k, k);
	ck->valid = true;

	bch2_btree_node_unlock_write(ck_iter->l[0].b, ck_iter);

	/* We're not likely to need this iterator again: */
	bch2_trans_iter_free(trans, iter);
	return 0;

put_iter:
	bch2_trans_iter_put(trans, iter);
	return ret;
}
/*
 * Lock callback passed to btree_node_lock() for key cache entries.
 *
 * @lock is the lock embedded in a struct bkey_cached and @p is the iterator
 * trying to lock it.  Returns 0 if the entry still describes the iterator's
 * btree and position, -1 if it no longer does.
 */
static int bkey_cached_check_fn(struct six_lock *lock, void *p)
{
	struct bkey_cached *ck = container_of(lock, struct bkey_cached, c.lock);
	const struct btree_iter *iter = p;

	if (ck->key.btree_id != iter->btree_id)
		return -1;

	if (bkey_cmp(ck->key.pos, iter->pos))
		return -1;

	return 0;
}
/*
 * Traverse a cached iterator: point iter->l[0].b at the key cache entry for
 * (iter->btree_id, iter->pos), locked, creating and filling the entry when
 * necessary.
 *
 * Flags that change behaviour:
 *  - BTREE_ITER_CACHED_NOCREATE: if no entry exists, leave iter->l[0].b NULL
 *    and return 0 instead of creating one.
 *  - BTREE_ITER_CACHED_NOFILL: do not read the key from the btree when the
 *    entry is not valid.
 *
 * Returns 0 on success.  On -EINTR the iterator is left as is; on any other
 * error the level 0 lock is dropped and the iterator is flagged with
 * BTREE_ITER_ERROR.
 */
int bch2_btree_iter_traverse_cached(struct btree_iter *iter)
{
struct btree_trans *trans = iter->trans;
struct bch_fs *c = trans->c;
struct bkey_cached *ck;
int ret = 0;
/* Cached iterators only ever use level 0. */
BUG_ON(iter->level);
/* Already holding a lock on an entry: only the fill step may remain. */
if (btree_node_locked(iter, 0)) {
ck = (void *) iter->l[0].b;
goto fill;
}
retry:
ck = btree_key_cache_find(c, iter->btree_id, iter->pos);
if (!ck) {
if (iter->flags & BTREE_ITER_CACHED_NOCREATE) {
iter->l[0].b = NULL;
return 0;
}
mutex_lock(&c->btree_key_cache.lock);
ck = btree_key_cache_create(&c->btree_key_cache,
iter->btree_id, iter->pos);
mutex_unlock(&c->btree_key_cache.lock);
ret = PTR_ERR_OR_ZERO(ck);
if (ret)
goto err;
/* NULL means the insert lost a race with another fill; look up again. */
if (!ck)
goto retry;
/* btree_key_cache_create() returns with the intent lock held. */
mark_btree_node_locked(iter, 0, SIX_LOCK_intent);
iter->locks_want = 1;
} else {
enum six_lock_type lock_want = __btree_lock_want(iter, 0);
if (!btree_node_lock((void *) ck, iter->pos, 0, iter, lock_want,
bkey_cached_check_fn, iter)) {
/*
 * Locking failed: if the entry's key no longer matches our position
 * just look it up again, otherwise restart the transaction.
 */
if (ck->key.btree_id != iter->btree_id ||
bkey_cmp(ck->key.pos, iter->pos)) {
goto retry;
}
trace_transaction_restart_ip(trans->ip, _THIS_IP_);
ret = -EINTR;
goto err;
}
/* Re-check the key now that we hold the lock. */
if (ck->key.btree_id != iter->btree_id ||
bkey_cmp(ck->key.pos, iter->pos)) {
six_unlock_type(&ck->c.lock, lock_want);
goto retry;
}
mark_btree_node_locked(iter, 0, lock_want);
}
iter->l[0].lock_seq = ck->c.lock.state.seq;
iter->l[0].b = (void *) ck;
fill:
if (!ck->valid && !(iter->flags & BTREE_ITER_CACHED_NOFILL)) {
/* Filling requires an intent lock; try to upgrade, else restart. */
if (!btree_node_intent_locked(iter, 0))
bch2_btree_iter_upgrade(iter, 1);
if (!btree_node_intent_locked(iter, 0)) {
trace_transaction_restart_ip(trans->ip, _THIS_IP_);
ret = -EINTR;
goto err;
}
ret = btree_key_cache_fill(trans, iter, ck);
if (ret)
goto err;
}
iter->uptodate = BTREE_ITER_NEED_PEEK;
bch2_btree_iter_downgrade(iter);
return ret;
err:
/* -EINTR is a restart, not a failure: keep the iterator state intact. */
if (ret != -EINTR) {
btree_node_unlock(iter, 0);
iter->flags |= BTREE_ITER_ERROR;
iter->l[0].b = BTREE_ITER_NO_NODE_ERROR;
}
return ret;
}
/*
 * Write the cached key at @key back to the btree and optionally evict it.
 *
 * @journal_seq: if nonzero, only act when the entry's journal pin is at this
 *               sequence number; otherwise act on whatever is cached.
 * @evict:       if true, also remove the entry from the cache (even if it was
 *               already clean); if false, a flushed entry is moved to the tail
 *               of the clean list.
 *
 * Does nothing and returns 0 when no entry exists (the cached iterator is
 * opened with NOCREATE|NOFILL).  -EINTR from traversal or commit is retried
 * internally.  Any other failure is only tolerated when the journal is in an
 * error state (BUG otherwise) and is returned to the caller.
 */
static int btree_key_cache_flush_pos(struct btree_trans *trans,
struct bkey_cached_key key,
u64 journal_seq,
bool evict)
{
struct bch_fs *c = trans->c;
struct journal *j = &c->journal;
struct btree_iter *c_iter = NULL, *b_iter = NULL;
struct bkey_cached *ck;
int ret;
/* b_iter: the btree position the cached key is written back to. */
b_iter = bch2_trans_get_iter(trans, key.btree_id, key.pos,
BTREE_ITER_SLOTS|
BTREE_ITER_INTENT);
ret = PTR_ERR_OR_ZERO(b_iter);
if (ret)
goto out;
/* c_iter: the key cache entry itself; never created or filled here. */
c_iter = bch2_trans_get_iter(trans, key.btree_id, key.pos,
BTREE_ITER_CACHED|
BTREE_ITER_CACHED_NOFILL|
BTREE_ITER_CACHED_NOCREATE|
BTREE_ITER_INTENT);
ret = PTR_ERR_OR_ZERO(c_iter);
if (ret)
goto out;
retry:
ret = bch2_btree_iter_traverse(c_iter);
if (ret)
goto err;
ck = (void *) c_iter->l[0].b;
/* No entry, or the entry is pinned at a different journal sequence. */
if (!ck ||
(journal_seq && ck->journal.seq != journal_seq))
goto out;
/* Clean entry: nothing to write; evict it if that was requested. */
if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
if (!evict)
goto out;
goto evict;
}
ret = bch2_btree_iter_traverse(b_iter) ?:
bch2_trans_update(trans, b_iter, ck->k, BTREE_TRIGGER_NORUN) ?:
bch2_trans_commit(trans, NULL, NULL,
BTREE_INSERT_NOUNLOCK|
BTREE_INSERT_NOCHECK_RW|
BTREE_INSERT_NOFAIL|
BTREE_INSERT_USE_RESERVE|
BTREE_INSERT_USE_ALLOC_RESERVE|
BTREE_INSERT_JOURNAL_RESERVED|
BTREE_INSERT_JOURNAL_RECLAIM);
err:
if (ret == -EINTR)
goto retry;
BUG_ON(ret && !bch2_journal_error(j));
if (ret)
goto out;
/* Written back: release the journal pin and reservation, mark clean. */
bch2_journal_pin_drop(j, &ck->journal);
bch2_journal_preres_put(j, &ck->res);
clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
if (!evict) {
mutex_lock(&c->btree_key_cache.lock);
list_move_tail(&ck->list, &c->btree_key_cache.clean);
mutex_unlock(&c->btree_key_cache.lock);
} else {
evict:
/*
 * Take lock ownership away from the iterator (it must not unlock an
 * entry that is about to be recycled), then upgrade to a write lock;
 * bkey_cached_free() drops both the write and intent locks.
 */
BUG_ON(!btree_node_intent_locked(c_iter, 0));
mark_btree_node_unlocked(c_iter, 0);
c_iter->l[0].b = NULL;
six_lock_write(&ck->c.lock, NULL, NULL);
mutex_lock(&c->btree_key_cache.lock);
bkey_cached_evict(&c->btree_key_cache, ck);
bkey_cached_free(&c->btree_key_cache, ck);
mutex_unlock(&c->btree_key_cache.lock);
}
out:
bch2_trans_iter_put(trans, b_iter);
bch2_trans_iter_put(trans, c_iter);
return ret;
}
/*
 * Journal pin flush callback for key cache entries.
 *
 * @pin is the journal pin embedded in a struct bkey_cached.  If that entry is
 * still dirty and still pinned at @seq, its key is flushed to the btree
 * (without evicting the entry); otherwise there is nothing to do.  The result
 * of the flush is not reported to the caller.
 */
static void btree_key_cache_journal_flush(struct journal *j,
					  struct journal_entry_pin *pin,
					  u64 seq)
{
	struct bkey_cached *ck =
		container_of(pin, struct bkey_cached, journal);
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct bkey_cached_key key;
	struct btree_trans trans;
	bool stale;

	/* Snapshot the key and state under the read lock. */
	six_lock_read(&ck->c.lock, NULL, NULL);
	key = ck->key;
	stale = ck->journal.seq != seq ||
		!test_bit(BKEY_CACHED_DIRTY, &ck->flags);
	six_unlock_read(&ck->c.lock);

	if (stale)
		return;

	bch2_trans_init(&trans, c, 0, 0);
	btree_key_cache_flush_pos(&trans, key, seq, false);
	bch2_trans_exit(&trans);
}
/*
* Flush and evict a key from the key cache:
*/
int bch2_btree_key_cache_flush(struct btree_trans *trans,
enum btree_id id, struct bpos pos)
{
struct bch_fs *c = trans->c;
struct bkey_cached_key key = { id, pos };
/* Fastpath - assume it won't be found: */
if (!btree_key_cache_find(c, id, pos))
return 0;
return btree_key_cache_flush_pos(trans, key, 0, true);
}
bool bch2_btree_insert_key_cached(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_i *insert)
{
struct bch_fs *c = trans->c;
struct bkey_cached *ck = (void *) iter->l[0].b;
BUG_ON(insert->u64s > ck->u64s);
if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) {
int difference;
BUG_ON(jset_u64s(insert->u64s) > trans->journal_preres.u64s);
difference = jset_u64s(insert->u64s) - ck->res.u64s;
if (difference > 0) {
trans->journal_preres.u64s -= difference;
ck->res.u64s += difference;
}
}
bkey_copy(ck->k, insert);
ck->valid = true;
if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
mutex_lock(&c->btree_key_cache.lock);
list_del_init(&ck->list);
set_bit(BKEY_CACHED_DIRTY, &ck->flags);
mutex_unlock(&c->btree_key_cache.lock);
}
bch2_journal_pin_update(&c->journal, trans->journal_res.seq,
&ck->journal, btree_key_cache_journal_flush);
return true;
}
#ifdef CONFIG_BCACHEFS_DEBUG
/*
 * Debug-only assertion: BUG if the key cache holds any entry for (@id, @pos).
 */
void bch2_btree_key_cache_verify_clean(struct btree_trans *trans,
				       enum btree_id id, struct bpos pos)
{
	struct bkey_cached *ck = btree_key_cache_find(trans->c, id, pos);

	BUG_ON(ck);
}
#endif
/*
 * Tear down the key cache: free every entry on the clean and freed lists,
 * then destroy the hash table.
 *
 * NOTE(review): dirty entries are taken off both lists in
 * bch2_btree_insert_key_cached(), so they are not visited here - presumably
 * everything has been flushed before this is called; confirm against callers.
 */
void bch2_fs_btree_key_cache_exit(struct btree_key_cache *c)
{
	struct bkey_cached *ck, *next;

	mutex_lock(&c->lock);

	/* Clean entries may still own a key buffer. */
	list_for_each_entry_safe(ck, next, &c->clean, list) {
		kfree(ck->k);
		kfree(ck);
	}

	/* Freed entries already had their buffer released. */
	list_for_each_entry_safe(ck, next, &c->freed, list) {
		kfree(ck);
	}

	mutex_unlock(&c->lock);

	rhashtable_destroy(&c->table);
}
/*
 * First-stage initialisation: set up the list heads and the mutex.  The hash
 * table is initialised separately by bch2_fs_btree_key_cache_init().
 */
void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c)
{
	INIT_LIST_HEAD(&c->clean);
	INIT_LIST_HEAD(&c->freed);
	mutex_init(&c->lock);
}
/*
 * Second-stage initialisation: set up the key cache hash table.
 *
 * Returns the result of rhashtable_init().
 */
int bch2_fs_btree_key_cache_init(struct btree_key_cache *c)
{
	int ret = rhashtable_init(&c->table, &bch2_btree_key_cache_params);

	return ret;
}
/* Public interface of the btree key cache (btree_key_cache.c). */
#ifndef _BCACHEFS_BTREE_KEY_CACHE_H
#define _BCACHEFS_BTREE_KEY_CACHE_H
/*
 * Traverse a BTREE_ITER_CACHED iterator to its key cache entry; returns 0 or a
 * negative error code.
 */
int bch2_btree_iter_traverse_cached(struct btree_iter *);
/*
 * Copy an update into the key cache entry a cached iterator points at, marking
 * it dirty and pinning the journal.
 */
bool bch2_btree_insert_key_cached(struct btree_trans *,
struct btree_iter *, struct bkey_i *);
/* Flush the cached key at the given btree/position to the btree and evict it. */
int bch2_btree_key_cache_flush(struct btree_trans *,
enum btree_id, struct bpos);
/*
 * Debug assertion that nothing is cached at the given btree/position; compiles
 * to an empty inline without CONFIG_BCACHEFS_DEBUG.
 */
#ifdef CONFIG_BCACHEFS_DEBUG
void bch2_btree_key_cache_verify_clean(struct btree_trans *,
enum btree_id, struct bpos);
#else
static inline void
bch2_btree_key_cache_verify_clean(struct btree_trans *trans,
enum btree_id id, struct bpos pos) {}
#endif
/* Filesystem setup/teardown hooks for the key cache. */
void bch2_fs_btree_key_cache_exit(struct btree_key_cache *);
void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *);
int bch2_fs_btree_key_cache_init(struct btree_key_cache *);
#endif /* _BCACHEFS_BTREE_KEY_CACHE_H */
......@@ -183,6 +183,7 @@ struct btree_node_iter {
enum btree_iter_type {
BTREE_ITER_KEYS,
BTREE_ITER_NODES,
BTREE_ITER_CACHED,
};
#define BTREE_ITER_TYPE ((1 << 2) - 1)
......@@ -214,6 +215,15 @@ enum btree_iter_type {
#define BTREE_ITER_IS_EXTENTS (1 << 6)
#define BTREE_ITER_ERROR (1 << 7)
#define BTREE_ITER_SET_POS_AFTER_COMMIT (1 << 8)
#define BTREE_ITER_CACHED_NOFILL (1 << 9)
#define BTREE_ITER_CACHED_NOCREATE (1 << 10)
#define BTREE_ITER_USER_FLAGS \
(BTREE_ITER_SLOTS \
|BTREE_ITER_INTENT \
|BTREE_ITER_PREFETCH \
|BTREE_ITER_CACHED_NOFILL \
|BTREE_ITER_CACHED_NOCREATE)
enum btree_iter_uptodate {
BTREE_ITER_UPTODATE = 0,
......@@ -222,6 +232,14 @@ enum btree_iter_uptodate {
BTREE_ITER_NEED_TRAVERSE = 3,
};
#define BTREE_ITER_NO_NODE_GET_LOCKS ((struct btree *) 1)
#define BTREE_ITER_NO_NODE_DROP ((struct btree *) 2)
#define BTREE_ITER_NO_NODE_LOCK_ROOT ((struct btree *) 3)
#define BTREE_ITER_NO_NODE_UP ((struct btree *) 4)
#define BTREE_ITER_NO_NODE_DOWN ((struct btree *) 5)
#define BTREE_ITER_NO_NODE_INIT ((struct btree *) 6)
#define BTREE_ITER_NO_NODE_ERROR ((struct btree *) 7)
/*
* @pos - iterator's current position
* @level - current btree depth
......@@ -259,7 +277,8 @@ struct btree_iter {
unsigned long ip_allocated;
};
static inline enum btree_iter_type btree_iter_type(struct btree_iter *iter)
static inline enum btree_iter_type
btree_iter_type(const struct btree_iter *iter)
{
return iter->flags & BTREE_ITER_TYPE;
}
......@@ -269,6 +288,37 @@ static inline struct btree_iter_level *iter_l(struct btree_iter *iter)
return iter->l + iter->level;
}
/*
 * Per-filesystem btree key cache state.
 */
struct btree_key_cache {
/* Taken around entry creation/eviction and moves between the lists below */
struct mutex lock;
/* Hash table of struct bkey_cached, keyed by struct bkey_cached_key */
struct rhashtable table;
/* Unhashed entries whose key buffer was released, kept for reuse */
struct list_head freed;
/* Hashed entries that are not dirty; candidates for reuse */
struct list_head clean;
};
/*
 * Hash key identifying a key cache entry: which btree, and the position in it.
 * Declared __packed; the hash table is configured with key_len equal to
 * sizeof() of this struct.
 */
struct bkey_cached_key {
u32 btree_id;
struct bpos pos;
} __packed;
#define BKEY_CACHED_DIRTY 0
struct bkey_cached {
struct btree_bkey_cached_common c;
unsigned long flags;
u8 u64s;
bool valid;
struct bkey_cached_key key;
struct rhash_head hash;
struct list_head list;
struct journal_preres res;
struct journal_entry_pin journal;
struct bkey_i *k;
};
struct btree_insert_entry {
unsigned trigger_flags;
unsigned trans_triggers_run:1;
......@@ -307,6 +357,7 @@ struct btree_trans {
unsigned error:1;
unsigned nounlock:1;
unsigned need_reset:1;
unsigned in_traverse_all:1;
unsigned mem_top;
unsigned mem_bytes;
......
......@@ -23,6 +23,7 @@ enum btree_insert_flags {
__BTREE_INSERT_USE_ALLOC_RESERVE,
__BTREE_INSERT_JOURNAL_REPLAY,
__BTREE_INSERT_JOURNAL_RESERVED,
__BTREE_INSERT_JOURNAL_RECLAIM,
__BTREE_INSERT_NOWAIT,
__BTREE_INSERT_GC_LOCK_HELD,
__BCH_HASH_SET_MUST_CREATE,
......@@ -47,8 +48,12 @@ enum btree_insert_flags {
/* Insert is for journal replay - don't get journal reservations: */
#define BTREE_INSERT_JOURNAL_REPLAY (1 << __BTREE_INSERT_JOURNAL_REPLAY)
/* Indicates that we have pre-reserved space in the journal: */
#define BTREE_INSERT_JOURNAL_RESERVED (1 << __BTREE_INSERT_JOURNAL_RESERVED)
/* Insert is being called from journal reclaim path: */
#define BTREE_INSERT_JOURNAL_RECLAIM (1 << __BTREE_INSERT_JOURNAL_RECLAIM)
/* Don't block on allocation failure (for new btree nodes): */
#define BTREE_INSERT_NOWAIT (1 << __BTREE_INSERT_NOWAIT)
#define BTREE_INSERT_GC_LOCK_HELD (1 << __BTREE_INSERT_GC_LOCK_HELD)
......
......@@ -529,11 +529,20 @@ static void btree_update_nodes_written(struct btree_update *as)
* to child nodes that weren't written yet: now, the child nodes have
* been written so we can write out the update to the interior node.
*/
/*
* We can't call into journal reclaim here: we'd block on the journal
* reclaim lock, but we may need to release the open buckets we have
* pinned in order for other btree updates to make forward progress, and
* journal reclaim does btree updates when flushing bkey_cached entries,
* which may require allocations as well.
*/
ret = bch2_trans_do(c, &as->disk_res, &journal_seq,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_USE_RESERVE|
BTREE_INSERT_USE_ALLOC_RESERVE|
BTREE_INSERT_NOCHECK_RW|
BTREE_INSERT_JOURNAL_RECLAIM|
BTREE_INSERT_JOURNAL_RESERVED,
btree_update_nodes_written_trans(&trans, as));
BUG_ON(ret && !bch2_journal_error(&c->journal));
......
......@@ -6,6 +6,7 @@
#include "btree_gc.h"
#include "btree_io.h"
#include "btree_iter.h"
#include "btree_key_cache.h"
#include "btree_locking.h"
#include "buckets.h"
#include "debug.h"
......@@ -32,6 +33,9 @@ inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b,
{
bch2_btree_node_lock_write(b, iter);
if (btree_iter_type(iter) == BTREE_ITER_CACHED)
return;
if (unlikely(btree_node_just_written(b)) &&
bch2_btree_post_write_cleanup(c, b))
bch2_btree_iter_reinit_node(iter, b);
......@@ -202,6 +206,8 @@ static bool btree_insert_key_leaf(struct btree_trans *trans,
return true;
}
/* Cached btree updates: */
/* Normal update interface: */
static inline void btree_insert_entry_checks(struct btree_trans *trans,
......@@ -284,6 +290,31 @@ btree_key_can_insert(struct btree_trans *trans,
return BTREE_INSERT_OK;
}
/*
 * Make sure the key cache entry that cached iterator @iter points at has room
 * for an update of *@u64s u64s, growing its key buffer (to the next power of
 * two) if it is too small.
 *
 * Returns BTREE_INSERT_OK, or -ENOMEM if the buffer could not be grown; in
 * that case the existing buffer is left untouched.
 */
static enum btree_insert_ret
btree_key_can_insert_cached(struct btree_trans *trans,
			    struct btree_iter *iter,
			    struct bkey_i *insert,
			    unsigned *u64s)
{
	struct bkey_cached *ck = (void *) iter->l[0].b;
	struct bkey_i *resized;
	unsigned want;

	BUG_ON(iter->level);

	if (*u64s > ck->u64s) {
		want = roundup_pow_of_two(*u64s);
		resized = krealloc(ck->k, want * sizeof(u64), GFP_NOFS);
		if (!resized)
			return -ENOMEM;

		ck->u64s = want;
		ck->k = resized;
	}

	return BTREE_INSERT_OK;
}
static inline void do_btree_insert_one(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_i *insert)
......@@ -297,7 +328,9 @@ static inline void do_btree_insert_one(struct btree_trans *trans,
insert->k.needs_whiteout = false;
did_work = btree_insert_key_leaf(trans, iter, insert);
did_work = (btree_iter_type(iter) != BTREE_ITER_CACHED)
? btree_insert_key_leaf(trans, iter, insert)
: bch2_btree_insert_key_cached(trans, iter, insert);
if (!did_work)
return;
......@@ -335,10 +368,16 @@ static noinline void bch2_trans_mark_gc(struct btree_trans *trans)
struct bch_fs *c = trans->c;
struct btree_insert_entry *i;
trans_for_each_update(trans, i)
if (gc_visited(c, gc_pos_btree_node(iter_l(i->iter)->b)))
trans_for_each_update(trans, i) {
/*
* XXX: synchronization of cached update triggers with gc
*/
BUG_ON(btree_iter_type(i->iter) == BTREE_ITER_CACHED);
if (gc_visited(c, gc_pos_btree_node(i->iter->l[0].b)))
bch2_mark_update(trans, i->iter, i->k, NULL,
i->trigger_flags|BTREE_TRIGGER_GC);
}
}
static inline int
......@@ -371,7 +410,9 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
u64s = 0;
u64s += i->k->k.u64s;
ret = btree_key_can_insert(trans, i->iter, i->k, &u64s);
ret = btree_iter_type(i->iter) != BTREE_ITER_CACHED
? btree_key_can_insert(trans, i->iter, i->k, &u64s)
: btree_key_can_insert_cached(trans, i->iter, i->k, &u64s);
if (ret) {
*stopped_at = i;
return ret;
......@@ -467,7 +508,9 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
ret = bch2_journal_preres_get(&trans->c->journal,
&trans->journal_preres, trans->journal_preres_u64s,
JOURNAL_RES_GET_NONBLOCK);
JOURNAL_RES_GET_NONBLOCK|
((trans->flags & BTREE_INSERT_JOURNAL_RECLAIM)
? JOURNAL_RES_GET_RECLAIM : 0));
if (unlikely(ret == -EAGAIN))
ret = bch2_trans_journal_preres_get_cold(trans,
trans->journal_preres_u64s);
......@@ -523,7 +566,8 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
trans->nounlock = true;
trans_for_each_update2(trans, i)
if (!same_leaf_as_prev(trans, i))
if (btree_iter_type(i->iter) != BTREE_ITER_CACHED &&
!same_leaf_as_prev(trans, i))
bch2_foreground_maybe_merge(trans->c, i->iter,
0, trans->flags);
......@@ -808,6 +852,14 @@ int __bch2_trans_commit(struct btree_trans *trans)
return ret;
}
#ifdef CONFIG_BCACHEFS_DEBUG
trans_for_each_update(trans, i)
if (btree_iter_type(i->iter) != BTREE_ITER_CACHED &&
!(i->trigger_flags & BTREE_TRIGGER_NORUN))
bch2_btree_key_cache_verify_clean(trans,
i->iter->btree_id, i->iter->pos);
#endif
/*
* Running triggers will append more updates to the list of updates as
* we're walking it:
......@@ -880,7 +932,8 @@ int __bch2_trans_commit(struct btree_trans *trans)
BUG_ON(i->iter->locks_want < 1);
u64s = jset_u64s(i->k->k.u64s);
if (0)
if (btree_iter_type(i->iter) == BTREE_ITER_CACHED &&
likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)))
trans->journal_preres_u64s += u64s;
trans->journal_u64s += u64s;
}
......
......@@ -1816,6 +1816,13 @@ int bch2_trans_mark_update(struct btree_trans *trans,
if (unlikely(flags & BTREE_TRIGGER_NOOVERWRITES))
return 0;
if (btree_iter_type(iter) == BTREE_ITER_CACHED) {
struct bkey_cached *ck = (void *) iter->l[0].b;
return bch2_trans_mark_key(trans, bkey_i_to_s_c(ck->k),
0, 0, BTREE_TRIGGER_OVERWRITE);
}
while ((_k = bch2_btree_node_iter_peek(&node_iter, b))) {
struct bkey unpacked;
struct bkey_s_c k;
......
......@@ -346,6 +346,37 @@ void __bch2_journal_pin_add(struct journal *j, u64 seq,
journal_wake(j);
}
/*
 * Make @pin pin journal sequence number @seq, unless it is already active and
 * pinning an older sequence number (in which case it is left alone).
 *
 * If the pin is at a different sequence number it is (re)added at @seq with
 * @flush_fn; if it is already at @seq it is moved back onto that entry's pin
 * list.  The journal is woken afterwards.
 */
void bch2_journal_pin_update(struct journal *j, u64 seq,
			     struct journal_entry_pin *pin,
			     journal_pin_flush_fn flush_fn)
{
	if (journal_pin_active(pin) && pin->seq < seq)
		return;

	spin_lock(&j->lock);

	if (pin->seq == seq) {
		struct journal_entry_pin_list *pin_list =
			journal_seq_pin(j, seq);

		/*
		 * If the pin is already pinning the right sequence number, it
		 * still might've already been flushed:
		 */
		list_move(&pin->list, &pin_list->list);
	} else {
		bch2_journal_pin_add_locked(j, seq, pin, flush_fn);
	}

	spin_unlock(&j->lock);

	/*
	 * If the journal is currently full, we might want to call flush_fn
	 * immediately:
	 */
	journal_wake(j);
}
void bch2_journal_pin_copy(struct journal *j,
struct journal_entry_pin *dst,
struct journal_entry_pin *src,
......
......@@ -42,6 +42,10 @@ static inline void bch2_journal_pin_add(struct journal *j, u64 seq,
__bch2_journal_pin_add(j, seq, pin, flush_fn);
}
void bch2_journal_pin_update(struct journal *, u64,
struct journal_entry_pin *,
journal_pin_flush_fn);
void bch2_journal_pin_copy(struct journal *,
struct journal_entry_pin *,
struct journal_entry_pin *,
......
......@@ -13,6 +13,7 @@
#include "bkey_sort.h"
#include "btree_cache.h"
#include "btree_gc.h"
#include "btree_key_cache.h"
#include "btree_update_interior.h"
#include "btree_io.h"
#include "chardev.h"
......@@ -479,6 +480,7 @@ static void bch2_fs_free(struct bch_fs *c)
bch2_fs_io_exit(c);
bch2_fs_btree_interior_update_exit(c);
bch2_fs_btree_iter_exit(c);
bch2_fs_btree_key_cache_exit(&c->btree_key_cache);
bch2_fs_btree_cache_exit(c);
bch2_fs_journal_exit(&c->journal);
bch2_io_clock_exit(&c->io_clock[WRITE]);
......@@ -650,6 +652,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
for (i = 0; i < BCH_TIME_STAT_NR; i++)
bch2_time_stats_init(&c->times[i]);
bch2_fs_btree_key_cache_init_early(&c->btree_key_cache);
bch2_fs_allocator_background_init(c);
bch2_fs_allocator_foreground_init(c);
bch2_fs_rebalance_init(c);
......@@ -746,6 +749,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
bch2_fs_journal_init(&c->journal) ||
bch2_fs_replicas_init(c) ||
bch2_fs_btree_cache_init(c) ||
bch2_fs_btree_key_cache_init(&c->btree_key_cache) ||
bch2_fs_btree_iter_init(c) ||
bch2_fs_btree_interior_update_init(c) ||
bch2_fs_io_init(c) ||
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment