Commit 9f1833ca authored by Kent Overstreet, committed by Kent Overstreet

bcachefs: Update btree ptrs after every write

This closes a significant hole (and last known hole) in our ability to
verify metadata. Previously, since btree nodes are log structured, we
couldn't detect lost btree writes that weren't the first write to a
given node. Additionally, this seems to have led to some significant
metadata corruption on multi-device filesystems with metadata
replication: since a write may have made it to one device and not
another, if we read that btree node back from the replica that did have
that write and started appending after that point, the other replica
would be left with a gap in its bset entries, and reading from that
replica wouldn't find the rest of the bsets.

But, since updates to interior btree nodes are now journalled, we can
close this hole by updating pointers to btree nodes after every write
with the currently written number of sectors, without negatively
affecting performance. This means we will always detect lost or corrupt
metadata - it also means that our btree is now a curious hybrid of COW
and non-COW btrees, with all the benefits of both (excluding
complexity).
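
To make the failure mode and the fix concrete, here is a minimal,
self-contained sketch (illustration only, not kernel code; all names
are invented):

  #include <stdio.h>

  /* Toy model: a btree node is an append-only log of bsets; the parent
   * pointer now records how many sectors have been written so far. */
  struct toy_replica {
          unsigned sectors_written;       /* what this device actually has */
  };

  /* Returns 1 if the replica can serve the node, 0 if a write was lost: */
  static int toy_replica_ok(unsigned ptr_sectors_written,
                            const struct toy_replica *r)
  {
          /* If the parent pointer records more sectors than this replica
           * contains, an append made it to another device but not here: */
          return r->sectors_written >= ptr_sectors_written;
  }

  int main(void)
  {
          struct toy_replica good  = { .sectors_written = 24 };
          struct toy_replica stale = { .sectors_written = 16 }; /* lost write */

          /* Updated in the parent after every write, per this patch: */
          unsigned ptr_sectors_written = 24;

          printf("replica 0 ok: %d\n", toy_replica_ok(ptr_sectors_written, &good));
          printf("replica 1 ok: %d\n", toy_replica_ok(ptr_sectors_written, &stale));
          return 0;
  }

Previously the pointer carried no such record, so reading from the stale
replica and appending at sector 16 would silently orphan the bsets the
other replica had at sectors 16..24.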
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
parent f8f86c6a
@@ -676,7 +676,7 @@ struct bch_fs {
struct btree_key_cache btree_key_cache;
struct workqueue_struct *btree_update_wq;
struct workqueue_struct *btree_error_wq;
struct workqueue_struct *btree_io_complete_wq;
/* copygc needs its own workqueue for index updates.. */
struct workqueue_struct *copygc_wq;
@@ -827,8 +827,6 @@ mempool_t bio_bounce_pages;
atomic64_t btree_writes_nr;
atomic64_t btree_writes_sectors;
struct bio_list btree_write_error_list;
struct work_struct btree_write_error_work;
spinlock_t btree_write_error_lock;
/* ERRORS */
......
@@ -1214,7 +1214,8 @@ enum bcachefs_metadata_version {
bcachefs_metadata_version_inode_btree_change = 11,
bcachefs_metadata_version_snapshot = 12,
bcachefs_metadata_version_inode_backpointers = 13,
bcachefs_metadata_version_max = 14,
bcachefs_metadata_version_btree_ptr_sectors_written = 14,
bcachefs_metadata_version_max = 15,
};
#define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1)
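(For reference: with bcachefs_metadata_version_max = 15,
bcachefs_metadata_version_current evaluates to 15 - 1 = 14, i.e.
bcachefs_metadata_version_btree_ptr_sectors_written.)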
......
This diff is collapsed.
@@ -32,6 +32,13 @@ static inline void clear_btree_node_dirty(struct bch_fs *c, struct btree *b)
atomic_dec(&c->btree_cache.dirty);
}
static inline unsigned btree_ptr_sectors_written(struct bkey_i *k)
{
return k->k.type == KEY_TYPE_btree_ptr_v2
? le16_to_cpu(bkey_i_to_btree_ptr_v2(k)->v.sectors_written)
: 0;
}
struct btree_read_bio {
struct bch_fs *c;
struct btree *b;
@@ -48,7 +55,8 @@ struct btree_write_bio {
struct work_struct work;
__BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
void *data;
unsigned bytes;
unsigned data_bytes;
unsigned sector_offset;
struct bch_write_bio wbio;
};
@@ -137,7 +145,6 @@ int bch2_btree_root_read(struct bch_fs *, enum btree_id,
void bch2_btree_complete_write(struct bch_fs *, struct btree *,
struct btree_write *);
void bch2_btree_write_error_work(struct work_struct *);
void __bch2_btree_node_write(struct bch_fs *, struct btree *, bool);
bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *);
......
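A hedged sketch of how a read path could use the new
btree_ptr_sectors_written() helper above (assumed usage for
illustration, not part of this diff; bset_offset_plausible is an
invented name):

  /* With btree_ptr_v2 keys, any bset starting at or beyond the recorded
   * sectors_written was never acknowledged and must not be trusted;
   * 0 means an old-style btree_ptr with nothing recorded: */
  static bool bset_offset_plausible(struct bkey_i *ptr, unsigned offset)
  {
          unsigned sectors_written = btree_ptr_sectors_written(ptr);

          return !sectors_written || offset < sectors_written;
  }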
@@ -132,7 +132,7 @@ void __bch2_btree_iter_downgrade(struct btree_iter *, unsigned);
static inline void bch2_btree_iter_downgrade(struct btree_iter *iter)
{
unsigned new_locks_want = (iter->flags & BTREE_ITER_INTENT ? 1 : 0);
unsigned new_locks_want = iter->level + !!(iter->flags & BTREE_ITER_INTENT);
if (iter->locks_want > new_locks_want)
__bch2_btree_iter_downgrade(iter, new_locks_want);
......
@@ -435,6 +435,7 @@ enum btree_flags {
BTREE_NODE_write_idx,
BTREE_NODE_accessed,
BTREE_NODE_write_in_flight,
BTREE_NODE_write_in_flight_inner,
BTREE_NODE_just_written,
BTREE_NODE_dying,
BTREE_NODE_fake,
@@ -449,6 +450,7 @@ BTREE_FLAG(noevict);
BTREE_FLAG(write_idx);
BTREE_FLAG(accessed);
BTREE_FLAG(write_in_flight);
BTREE_FLAG(write_in_flight_inner);
BTREE_FLAG(just_written);
BTREE_FLAG(dying);
BTREE_FLAG(fake);
......
@@ -74,7 +74,9 @@ int bch2_btree_node_rewrite(struct btree_trans *, struct btree_iter *,
__le64, unsigned);
void bch2_btree_node_rewrite_async(struct bch_fs *, struct btree *);
int bch2_btree_node_update_key(struct btree_trans *, struct btree_iter *,
struct btree *, struct bkey_i *);
struct btree *, struct bkey_i *, bool);
int bch2_btree_node_update_key_get_iter(struct btree_trans *,
struct btree *, struct bkey_i *, bool);
int bch2_trans_update(struct btree_trans *, struct btree_iter *,
struct bkey_i *, enum btree_update_flags);
......
@@ -246,11 +246,7 @@ static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
goto retry;
}
if (c->sb.features & (1ULL << BCH_FEATURE_btree_ptr_v2))
bkey_btree_ptr_v2_init(&tmp.k);
else
bkey_btree_ptr_init(&tmp.k);
bkey_btree_ptr_v2_init(&tmp.k);
bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, c->opts.btree_node_size);
bch2_open_bucket_get(c, wp, &ob);
@@ -567,7 +563,8 @@ static void btree_update_nodes_written(struct btree_update *as)
six_unlock_read(&old->c.lock);
if (seq == as->old_nodes_seq[i])
bch2_btree_node_wait_on_write(old);
wait_on_bit_io(&old->flags, BTREE_NODE_write_in_flight_inner,
TASK_UNINTERRUPTIBLE);
}
/*
@@ -1153,6 +1150,9 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b
struct bkey_packed *k;
const char *invalid;
BUG_ON(insert->k.type == KEY_TYPE_btree_ptr_v2 &&
!btree_ptr_sectors_written(insert));
invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(insert), btree_node_type(b)) ?:
bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert));
if (invalid) {
@@ -1395,6 +1395,7 @@ static void btree_split(struct btree_update *as,
six_unlock_write(&n2->c.lock);
six_unlock_write(&n1->c.lock);
bch2_btree_node_write(c, n1, SIX_LOCK_intent);
bch2_btree_node_write(c, n2, SIX_LOCK_intent);
/*
@@ -1422,12 +1423,12 @@ static void btree_split(struct btree_update *as,
bch2_btree_build_aux_trees(n1);
six_unlock_write(&n1->c.lock);
bch2_btree_node_write(c, n1, SIX_LOCK_intent);
if (parent)
bch2_keylist_add(&as->parent_keys, &n1->key);
}
bch2_btree_node_write(c, n1, SIX_LOCK_intent);
/* New nodes all written, now make them visible: */
if (parent) {
@@ -1703,13 +1704,13 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
bch2_btree_build_aux_trees(n);
six_unlock_write(&n->c.lock);
bch2_btree_node_write(c, n, SIX_LOCK_intent);
bkey_init(&delete.k);
delete.k.p = prev->key.k.p;
bch2_keylist_add(&as->parent_keys, &delete);
bch2_keylist_add(&as->parent_keys, &n->key);
bch2_btree_node_write(c, n, SIX_LOCK_intent);
bch2_btree_insert_node(as, trans, iter, parent, &as->parent_keys, flags);
bch2_btree_update_get_open_buckets(as, n);
@@ -1883,74 +1884,109 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b)
queue_work(c->btree_interior_update_worker, &a->work);
}
static void __bch2_btree_node_update_key(struct btree_update *as,
struct btree_trans *trans,
struct btree_iter *iter,
struct btree *b, struct btree *new_hash,
struct bkey_i *new_key)
static int __bch2_btree_node_update_key(struct btree_trans *trans,
struct btree_iter *iter,
struct btree *b, struct btree *new_hash,
struct bkey_i *new_key,
bool skip_triggers)
{
struct bch_fs *c = as->c;
struct bch_fs *c = trans->c;
struct btree_iter *iter2 = NULL;
struct btree *parent;
u64 journal_entries[BKEY_BTREE_PTR_U64s_MAX];
int ret;
btree_update_will_delete_key(as, &b->key);
btree_update_will_add_key(as, new_key);
if (!skip_triggers) {
ret = bch2_trans_mark_key(trans,
bkey_s_c_null,
bkey_i_to_s_c(new_key),
BTREE_TRIGGER_INSERT);
if (ret)
return ret;
ret = bch2_trans_mark_key(trans,
bkey_i_to_s_c(&b->key),
bkey_s_c_null,
BTREE_TRIGGER_OVERWRITE);
if (ret)
return ret;
}
if (new_hash) {
bkey_copy(&new_hash->key, new_key);
ret = bch2_btree_node_hash_insert(&c->btree_cache,
new_hash, b->c.level, b->c.btree_id);
BUG_ON(ret);
}
parent = btree_node_parent(iter, b);
if (parent) {
if (new_hash) {
bkey_copy(&new_hash->key, new_key);
ret = bch2_btree_node_hash_insert(&c->btree_cache,
new_hash, b->c.level, b->c.btree_id);
BUG_ON(ret);
}
iter2 = bch2_trans_copy_iter(trans, iter);
bch2_keylist_add(&as->parent_keys, new_key);
bch2_btree_insert_node(as, trans, iter, parent, &as->parent_keys, 0);
BUG_ON(iter2->level != b->c.level);
BUG_ON(bpos_cmp(iter2->pos, new_key->k.p));
if (new_hash) {
mutex_lock(&c->btree_cache.lock);
bch2_btree_node_hash_remove(&c->btree_cache, new_hash);
btree_node_unlock(iter2, iter2->level);
iter2->l[iter2->level].b = BTREE_ITER_NO_NODE_UP;
iter2->level++;
bch2_btree_node_hash_remove(&c->btree_cache, b);
bkey_copy(&b->key, new_key);
ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
BUG_ON(ret);
mutex_unlock(&c->btree_cache.lock);
} else {
bkey_copy(&b->key, new_key);
}
ret = bch2_btree_iter_traverse(iter2) ?:
bch2_trans_update(trans, iter2, new_key, BTREE_TRIGGER_NORUN);
if (ret)
goto err;
} else {
BUG_ON(btree_node_root(c, b) != b);
bch2_btree_node_lock_write(b, iter);
bkey_copy(&b->key, new_key);
trans->extra_journal_entries = (void *) &journal_entries[0];
trans->extra_journal_entry_u64s =
journal_entry_set((void *) &journal_entries[0],
BCH_JSET_ENTRY_btree_root,
b->c.btree_id, b->c.level,
new_key, new_key->k.u64s);
}
if (btree_ptr_hash_val(&b->key) != b->hash_val) {
mutex_lock(&c->btree_cache.lock);
bch2_btree_node_hash_remove(&c->btree_cache, b);
ret = bch2_trans_commit(trans, NULL, NULL,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_NOCHECK_RW|
BTREE_INSERT_JOURNAL_RECLAIM|
BTREE_INSERT_JOURNAL_RESERVED|
BTREE_INSERT_NOUNLOCK);
if (ret)
goto err;
ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
BUG_ON(ret);
mutex_unlock(&c->btree_cache.lock);
}
bch2_btree_node_lock_write(b, iter);
btree_update_updated_root(as, b);
bch2_btree_node_unlock_write(b, iter);
if (new_hash) {
mutex_lock(&c->btree_cache.lock);
bch2_btree_node_hash_remove(&c->btree_cache, new_hash);
bch2_btree_node_hash_remove(&c->btree_cache, b);
bkey_copy(&b->key, new_key);
ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
BUG_ON(ret);
mutex_unlock(&c->btree_cache.lock);
} else {
bkey_copy(&b->key, new_key);
}
bch2_btree_update_done(as);
bch2_btree_node_unlock_write(b, iter);
out:
bch2_trans_iter_put(trans, iter2);
return ret;
err:
if (new_hash) {
mutex_lock(&c->btree_cache.lock);
bch2_btree_node_hash_remove(&c->btree_cache, b);
mutex_unlock(&c->btree_cache.lock);
}
goto out;
}
int bch2_btree_node_update_key(struct btree_trans *trans,
struct btree_iter *iter,
struct btree *b,
struct bkey_i *new_key)
int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *iter,
struct btree *b, struct bkey_i *new_key,
bool skip_triggers)
{
struct bch_fs *c = trans->c;
struct btree *parent = btree_node_parent(iter, b);
struct btree_update *as = NULL;
struct btree *new_hash = NULL;
struct closure cl;
int ret = 0;
@@ -1964,27 +2000,18 @@ int bch2_btree_node_update_key(struct btree_trans *trans,
if (btree_ptr_hash_val(new_key) != b->hash_val) {
ret = bch2_btree_cache_cannibalize_lock(c, &cl);
if (ret) {
bch2_trans_unlock(iter->trans);
bch2_trans_unlock(trans);
closure_sync(&cl);
if (!bch2_trans_relock(iter->trans))
if (!bch2_trans_relock(trans))
return -EINTR;
}
new_hash = bch2_btree_node_mem_alloc(c);
}
as = bch2_btree_update_start(iter, b->c.level,
parent ? btree_update_reserve_required(c, parent) : 0,
BTREE_INSERT_NOFAIL);
if (IS_ERR(as)) {
ret = PTR_ERR(as);
goto err;
}
__bch2_btree_node_update_key(as, trans, iter, b, new_hash, new_key);
ret = __bch2_btree_node_update_key(trans, iter, b, new_hash,
new_key, skip_triggers);
bch2_btree_iter_downgrade(iter);
err:
if (new_hash) {
mutex_lock(&c->btree_cache.lock);
list_move(&new_hash->list, &c->btree_cache.freeable);
@@ -1998,6 +2025,35 @@ int bch2_btree_node_update_key(struct btree_trans *trans,
return ret;
}
int bch2_btree_node_update_key_get_iter(struct btree_trans *trans,
struct btree *b, struct bkey_i *new_key,
bool skip_triggers)
{
struct btree_iter *iter;
int ret;
iter = bch2_trans_get_node_iter(trans, b->c.btree_id, b->key.k.p,
BTREE_MAX_DEPTH, b->c.level,
BTREE_ITER_INTENT);
ret = bch2_btree_iter_traverse(iter);
if (ret)
goto out;
/* has node been freed? */
if (iter->l[b->c.level].b != b) {
/* node has been freed: */
BUG_ON(!btree_node_dying(b));
goto out;
}
BUG_ON(!btree_node_hashed(b));
ret = bch2_btree_node_update_key(trans, iter, b, new_key, skip_triggers);
out:
bch2_trans_iter_put(trans, iter);
return ret;
}
/* Init code: */
/*
......
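For illustration, a hedged sketch of how the new _get_iter variant
might be called from a write completion path that holds only a node
reference (invented wrapper, not part of this diff):

  static int update_ptr_after_write(struct btree_trans *trans,
                                    struct btree *b, struct bkey_i *new_key)
  {
          /*
           * skip_triggers = true: only sectors_written changed, the pointer
           * itself is the same, so alloc triggers need not rerun; the helper
           * constructs and traverses a node iterator internally.
           */
          return bch2_btree_node_update_key_get_iter(trans, b, new_key, true);
  }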
@@ -910,7 +910,8 @@ int __bch2_trans_commit(struct btree_trans *trans)
unsigned u64s, reset_flags = 0;
int ret = 0;
if (!trans->nr_updates)
if (!trans->nr_updates &&
!trans->extra_journal_entry_u64s)
goto out_reset;
if (trans->flags & BTREE_INSERT_GC_LOCK_HELD)
......
@@ -95,7 +95,8 @@ struct bch_write_bio {
bounce:1,
put_bio:1,
have_ioref:1,
used_mempool:1;
used_mempool:1,
first_btree_write:1;
);
struct bio bio;
......
@@ -139,7 +139,7 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
break;
}
ret = bch2_btree_node_update_key(&trans, iter, b, k.k);
ret = bch2_btree_node_update_key(&trans, iter, b, k.k, false);
if (ret == -EINTR) {
b = bch2_btree_iter_peek_node(iter);
ret = 0;
......
@@ -1005,6 +1005,11 @@ int bch2_fs_recovery(struct bch_fs *c)
c->opts.fix_errors = FSCK_OPT_YES;
}
if (c->sb.version < bcachefs_metadata_version_btree_ptr_sectors_written) {
bch_info(c, "version prior to btree_ptr_sectors_written, upgrade required");
c->opts.version_upgrade = true;
}
ret = bch2_blacklist_table_initialize(c);
if (ret) {
bch_err(c, "error initializing blacklist table");
......
@@ -514,8 +514,8 @@ static void __bch2_fs_free(struct bch_fs *c)
destroy_workqueue(c->io_complete_wq);
if (c->copygc_wq)
destroy_workqueue(c->copygc_wq);
if (c->btree_error_wq)
destroy_workqueue(c->btree_error_wq);
if (c->btree_io_complete_wq)
destroy_workqueue(c->btree_io_complete_wq);
if (c->btree_update_wq)
destroy_workqueue(c->btree_update_wq);
@@ -567,7 +567,6 @@ void __bch2_fs_stop(struct bch_fs *c)
for_each_member_device(ca, c, i)
cancel_work_sync(&ca->io_error_work);
cancel_work_sync(&c->btree_write_error_work);
cancel_work_sync(&c->read_only_work);
}
@@ -696,9 +695,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
mutex_init(&c->bio_bounce_pages_lock);
bio_list_init(&c->btree_write_error_list);
spin_lock_init(&c->btree_write_error_lock);
INIT_WORK(&c->btree_write_error_work, bch2_btree_write_error_work);
INIT_WORK(&c->journal_seq_blacklist_gc_work,
bch2_blacklist_entries_gc);
@@ -768,7 +765,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
if (!(c->btree_update_wq = alloc_workqueue("bcachefs",
WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) ||
!(c->btree_error_wq = alloc_workqueue("bcachefs_error",
!(c->btree_io_complete_wq = alloc_workqueue("bcachefs_btree_io",
WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) ||
!(c->copygc_wq = alloc_workqueue("bcachefs_copygc",
WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) ||
......