Commit 12ce5b7d authored by Kent Overstreet, committed by Kent Overstreet

bcachefs: Btree key cache coherency

 - Updates to non-key-cache iterators will now be transparently
   redirected to the key cache for cached btrees.

 - Except when creating new keys: then the update goes to the underlying
   btree.

For iterating over a cached btree to work, we need to ensure that if a
key exists in the key cache, it also exists in the btree - otherwise
the iterator code will skip past it and not check the key cache.

Other than that, for consistency, all updates should go to the same
place - the key cache.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
parent f7b6ca23
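
The diff below implements this in bch2_trans_update() and
bch2_trans_update_by_path(). As a rough, standalone illustration of the
routing rule only - this is not bcachefs code; route_update(), enum
update_target and the struct fields are hypothetical stand-ins, and the
sketch glosses over the key_cache_already_flushed /
bch2_btree_key_cache_drop() bookkeeping the patch does at commit time:

/*
 * Minimal sketch of where an update lands under the new scheme.
 * Hypothetical model only; the real checks live in bch2_trans_update()
 * and bch2_trans_update_by_path() in the diff below.
 */
#include <stdbool.h>
#include <stdio.h>

enum update_target { TARGET_BTREE, TARGET_KEY_CACHE };

struct update {
	bool btree_is_cached;	/* btree_id_cached(): btree uses the key cache */
	bool creating_new_key;	/* no existing key in the btree (bkey_deleted(old_k)) */
	bool key_cache_reclaim;	/* BTREE_UPDATE_KEY_CACHE_RECLAIM: flushing cache -> btree */
};

static enum update_target route_update(const struct update *u)
{
	/* Reclaim is the key cache writing itself back; don't redirect it: */
	if (u->key_cache_reclaim)
		return TARGET_BTREE;

	/*
	 * New keys go to the underlying btree, so a key present in the key
	 * cache always also exists in the btree and the iterator knows to
	 * check the cache:
	 */
	if (u->creating_new_key)
		return TARGET_BTREE;

	/* Everything else on a cached btree is redirected to the key cache: */
	if (u->btree_is_cached)
		return TARGET_KEY_CACHE;

	return TARGET_BTREE;
}

int main(void)
{
	struct update overwrite	= { .btree_is_cached = true };
	struct update create	= { .btree_is_cached = true, .creating_new_key = true };
	struct update reclaim	= { .btree_is_cached = true, .key_cache_reclaim = true };

	printf("overwrite on cached btree -> %s\n",
	       route_update(&overwrite) == TARGET_KEY_CACHE ? "key cache" : "btree");
	printf("create on cached btree    -> %s\n",
	       route_update(&create) == TARGET_KEY_CACHE ? "key cache" : "btree");
	printf("key cache reclaim         -> %s\n",
	       route_update(&reclaim) == TARGET_KEY_CACHE ? "key cache" : "btree");
	return 0;
}
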
@@ -271,7 +271,8 @@ int bch2_alloc_write(struct btree_trans *trans, struct btree_iter *iter,
 		return PTR_ERR(a);
 
 	bch2_alloc_pack(trans->c, a, *u);
-	return bch2_trans_update(trans, iter, &a->k, trigger_flags);
+	return bch2_trans_update(trans, iter, &a->k, trigger_flags|
+				 BTREE_UPDATE_NO_KEY_CACHE_COHERENCY);
 }
 
 static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a)
@@ -2188,6 +2188,8 @@ struct bkey_i *__bch2_btree_trans_peek_updates(struct btree_iter *iter)
 			break;
 		if (bpos_cmp(i->k->k.p, iter->path->pos) < 0)
 			continue;
+		if (i->key_cache_already_flushed)
+			continue;
 		if (!ret || bpos_cmp(i->k->k.p, ret->k.p) < 0)
 			ret = i->k;
 	}
@@ -414,6 +414,7 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
 	 */
 	ret = bch2_btree_iter_traverse(&b_iter) ?:
 		bch2_trans_update(trans, &b_iter, ck->k,
+				  BTREE_UPDATE_KEY_CACHE_RECLAIM|
 				  BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
 				  BTREE_TRIGGER_NORUN) ?:
 		bch2_trans_commit(trans, NULL, NULL,
@@ -555,13 +556,15 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans,
 	return true;
 }
 
-#ifdef CONFIG_BCACHEFS_DEBUG
-void bch2_btree_key_cache_verify_clean(struct btree_trans *trans,
-				       enum btree_id id, struct bpos pos)
+void bch2_btree_key_cache_drop(struct btree_trans *trans,
+			       struct btree_path *path)
 {
-	BUG_ON(bch2_btree_key_cache_find(trans->c, id, pos));
+	struct bkey_cached *ck = (void *) path->l[0].b;
+
+	ck->valid = false;
+
+	BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags));
 }
-#endif
 
 static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
 					       struct shrink_control *sc)
@@ -32,14 +32,8 @@ bool bch2_btree_insert_key_cached(struct btree_trans *,
 			struct btree_path *, struct bkey_i *);
 int bch2_btree_key_cache_flush(struct btree_trans *,
 			       enum btree_id, struct bpos);
-#ifdef CONFIG_BCACHEFS_DEBUG
-void bch2_btree_key_cache_verify_clean(struct btree_trans *,
-				enum btree_id, struct bpos);
-#else
-static inline void
-bch2_btree_key_cache_verify_clean(struct btree_trans *trans,
-				enum btree_id id, struct bpos pos) {}
-#endif
+void bch2_btree_key_cache_drop(struct btree_trans *,
+			       struct btree_path *);
 
 void bch2_fs_btree_key_cache_exit(struct btree_key_cache *);
 void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *);
@@ -344,6 +344,7 @@ struct btree_insert_entry {
 	bool			cached:1;
 	bool			insert_trigger_run:1;
 	bool			overwrite_trigger_run:1;
+	bool			key_cache_already_flushed:1;
 	/*
 	 * @old_k may be a key from the journal; @old_btree_u64s always refers
 	 * to the size of the key being overwritten in the btree:
@@ -645,6 +646,8 @@ static inline bool btree_type_has_snapshots(enum btree_id id)
 enum btree_update_flags {
 	__BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE,
 	__BTREE_UPDATE_NOJOURNAL,
+	__BTREE_UPDATE_KEY_CACHE_RECLAIM,
+	__BTREE_UPDATE_NO_KEY_CACHE_COHERENCY,
 
 	__BTREE_TRIGGER_NORUN,		/* Don't run triggers at all */
@@ -658,6 +661,9 @@ enum btree_update_flags {
 #define BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE (1U << __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE)
 #define BTREE_UPDATE_NOJOURNAL		(1U << __BTREE_UPDATE_NOJOURNAL)
+#define BTREE_UPDATE_KEY_CACHE_RECLAIM	(1U << __BTREE_UPDATE_KEY_CACHE_RECLAIM)
+#define BTREE_UPDATE_NO_KEY_CACHE_COHERENCY \
+	(1U << __BTREE_UPDATE_NO_KEY_CACHE_COHERENCY)
 
 #define BTREE_TRIGGER_NORUN		(1U << __BTREE_TRIGGER_NORUN)
@@ -76,8 +76,6 @@ int bch2_btree_node_update_key_get_iter(struct btree_trans *,
 int bch2_trans_update_extent(struct btree_trans *, struct btree_iter *,
 			     struct bkey_i *, enum btree_update_flags);
 
-int __must_check bch2_trans_update_by_path(struct btree_trans *, struct btree_path *,
-					   struct bkey_i *, enum btree_update_flags);
 int __must_check bch2_trans_update(struct btree_trans *, struct btree_iter *,
 				   struct bkey_i *, enum btree_update_flags);
@@ -23,10 +23,15 @@
 #include <linux/prefetch.h>
 #include <linux/sort.h>
 
+static int __must_check
+bch2_trans_update_by_path(struct btree_trans *, struct btree_path *,
+			  struct bkey_i *, enum btree_update_flags);
+
 static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l,
 					 const struct btree_insert_entry *r)
 {
 	return cmp_int(l->btree_id, r->btree_id) ?:
+	       cmp_int(l->cached, r->cached) ?:
 	       -cmp_int(l->level, r->level) ?:
 	       bpos_cmp(l->k->k.p, r->k->k.p);
 }
@@ -378,9 +383,14 @@ static inline void do_btree_insert_one(struct btree_trans *trans,
 
 	i->k->k.needs_whiteout = false;
 
-	did_work = !i->cached
-		? btree_insert_key_leaf(trans, i)
-		: bch2_btree_insert_key_cached(trans, i->path, i->k);
+	if (!i->cached)
+		did_work = btree_insert_key_leaf(trans, i);
+	else if (!i->key_cache_already_flushed)
+		did_work = bch2_btree_insert_key_cached(trans, i->path, i->k);
+	else {
+		bch2_btree_key_cache_drop(trans, i->path);
+		did_work = false;
+	}
 
 	if (!did_work)
 		return;
@@ -987,18 +997,6 @@ int __bch2_trans_commit(struct btree_trans *trans)
 		goto out_reset;
 	}
 
-#ifdef CONFIG_BCACHEFS_DEBUG
-	/*
-	 * if BTREE_TRIGGER_NORUN is set, it means we're probably being called
-	 * from the key cache flush code:
-	 */
-	trans_for_each_update(trans, i)
-		if (!i->cached &&
-		    !(i->flags & BTREE_TRIGGER_NORUN))
-			bch2_btree_key_cache_verify_clean(trans,
-					i->btree_id, i->k->k.p);
-#endif
-
 	ret = bch2_trans_commit_run_triggers(trans);
 	if (ret)
 		goto out;
@@ -1369,11 +1367,14 @@ static int need_whiteout_for_snapshot(struct btree_trans *trans,
 	return ret;
 }
 
-int __must_check bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path,
-					   struct bkey_i *k, enum btree_update_flags flags)
+static int __must_check
+bch2_trans_update_by_path_trace(struct btree_trans *trans, struct btree_path *path,
+				struct bkey_i *k, enum btree_update_flags flags,
+				unsigned long ip)
 {
 	struct bch_fs *c = trans->c;
 	struct btree_insert_entry *i, n;
+	int ret = 0;
 
 	BUG_ON(!path->should_be_locked);
@@ -1388,7 +1389,7 @@ int __must_check bch2_trans_update_by_path(struct btree_trans *trans, struct btr
 		.cached		= path->cached,
 		.path		= path,
 		.k		= k,
-		.ip_allocated	= _RET_IP_,
+		.ip_allocated	= ip,
 	};
 
 #ifdef CONFIG_BCACHEFS_DEBUG
@@ -1409,17 +1410,6 @@ int __must_check bch2_trans_update_by_path(struct btree_trans *trans, struct btr
 		    !btree_insert_entry_cmp(&n, i)) {
 			BUG_ON(i->insert_trigger_run || i->overwrite_trigger_run);
 
-			/*
-			 * This is a hack to ensure that inode creates update the btree,
-			 * not the key cache, which helps with cache coherency issues in
-			 * other areas:
-			 */
-			if (n.cached && !i->cached) {
-				i->k = n.k;
-				i->flags = n.flags;
-				return 0;
-			}
-
 			bch2_path_put(trans, i->path, true);
 			i->flags	= n.flags;
 			i->cached	= n.cached;
@@ -1444,19 +1434,60 @@ int __must_check bch2_trans_update_by_path(struct btree_trans *trans, struct btr
 		}
 	}
 
-	__btree_path_get(n.path, true);
-	return 0;
+	__btree_path_get(i->path, true);
+
+	/*
+	 * If a key is present in the key cache, it must also exist in the
+	 * btree - this is necessary for cache coherency. When iterating over
+	 * a btree that's cached in the key cache, the btree iter code checks
+	 * the key cache - but the key has to exist in the btree for that to
+	 * work:
+	 */
+	if (path->cached &&
+	    bkey_deleted(&i->old_k) &&
+	    !(flags & BTREE_UPDATE_NO_KEY_CACHE_COHERENCY)) {
+		struct btree_path *btree_path;
+
+		i->key_cache_already_flushed = true;
+		i->flags |= BTREE_TRIGGER_NORUN;
+
+		btree_path = bch2_path_get(trans, path->btree_id, path->pos,
+					   1, 0, BTREE_ITER_INTENT);
+
+		ret = bch2_btree_path_traverse(trans, btree_path, 0);
+		if (ret)
+			goto err;
+
+		btree_path->should_be_locked = true;
+
+		ret = bch2_trans_update_by_path_trace(trans, btree_path, k, flags, ip);
+err:
+		bch2_path_put(trans, btree_path, true);
+	}
+
+	return ret;
+}
+
+static int __must_check
+bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path,
+			  struct bkey_i *k, enum btree_update_flags flags)
+{
+	return bch2_trans_update_by_path_trace(trans, path, k, flags, _RET_IP_);
 }
 
 int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
 				   struct bkey_i *k, enum btree_update_flags flags)
 {
+	struct btree_path *path = iter->update_path ?: iter->path;
+	struct bkey_cached *ck;
+	int ret;
+
 	if (iter->flags & BTREE_ITER_IS_EXTENTS)
 		return bch2_trans_update_extent(trans, iter, k, flags);
 
 	if (bkey_deleted(&k->k) &&
+	    !(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) &&
 	    (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) {
-		int ret = need_whiteout_for_snapshot(trans, iter->btree_id, k->k.p);
+		ret = need_whiteout_for_snapshot(trans, iter->btree_id, k->k.p);
 
 		if (unlikely(ret < 0))
 			return ret;
@@ -1464,8 +1495,45 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter
 			k->k.type = KEY_TYPE_whiteout;
 	}
 
-	return bch2_trans_update_by_path(trans, iter->update_path ?: iter->path,
-					 k, flags);
+	/*
+	 * Ensure that updates to cached btrees go to the key cache:
+	 */
+	if (!(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) &&
+	    !path->cached &&
+	    !path->level &&
+	    btree_id_cached(trans->c, path->btree_id)) {
+		if (!iter->key_cache_path ||
+		    !iter->key_cache_path->should_be_locked ||
+		    bpos_cmp(iter->key_cache_path->pos, k->k.p)) {
+			if (!iter->key_cache_path)
+				iter->key_cache_path =
+					bch2_path_get(trans, path->btree_id, path->pos, 1, 0,
+						      BTREE_ITER_INTENT|BTREE_ITER_CACHED);
+
+			iter->key_cache_path =
+				bch2_btree_path_set_pos(trans, iter->key_cache_path, path->pos,
+							iter->flags & BTREE_ITER_INTENT);
+
+			ret = bch2_btree_path_traverse(trans, iter->key_cache_path,
+						       BTREE_ITER_CACHED);
+			if (unlikely(ret))
+				return ret;
+
+			ck = (void *) iter->key_cache_path->l[0].b;
+
+			if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
+				trace_trans_restart_key_cache_raced(trans->fn, _RET_IP_);
+				btree_trans_restart(trans);
+				return -EINTR;
+			}
+
+			iter->key_cache_path->should_be_locked = true;
+		}
+
+		path = iter->key_cache_path;
+	}
+
+	return bch2_trans_update_by_path(trans, path, k, flags);
 }
 
 void bch2_trans_commit_hook(struct btree_trans *trans,
@@ -658,6 +658,12 @@ DEFINE_EVENT(transaction_restart, trans_restart_mark_replicas,
 	TP_ARGS(trans_fn, caller_ip)
 );
 
+DEFINE_EVENT(transaction_restart, trans_restart_key_cache_raced,
+	TP_PROTO(const char *trans_fn,
+		 unsigned long caller_ip),
+	TP_ARGS(trans_fn, caller_ip)
+);
+
 DECLARE_EVENT_CLASS(transaction_restart_iter,
 	TP_PROTO(const char *trans_fn,
 		 unsigned long caller_ip,