Commit ec25bd8d authored by Linus Torvalds

Merge tag 'bcachefs-2024-04-03' of https://evilpiepirate.org/git/bcachefs

Pull bcachefs repair code from Kent Overstreet:
 "A couple more small fixes, and new repair code.

  We can now automatically recover from arbitrary corrupted interior
  btree nodes by scanning, and we can reconstruct metadata as needed to
  bring a filesystem back into a working, consistent, read-write state
  and preserve access to whatever wasn't corrupted.

  Meaning - you can blow away all metadata except for extents and
  dirents leaf nodes, and repair will reconstruct everything else and
  give you your data, and under the correct paths. If inodes are missing
  i_size will be slightly off and permissions/ownership/timestamps will
  be gone, and we do still need the snapshots btree if snapshots were in
  use - in the future we'll be able to guess the snapshot tree structure
  in some situations.

  IOW - aside from shaking out remaining bugs (fuzz testing is still
  coming), repair code should be complete and if repair ever doesn't
  work that's the highest priority bug that I want to know about
  immediately.

  This patchset was kindly tested by a user from India who accidentally
  wiped one drive out of a three drive filesystem with no replication on
  the family computer - it took a couple weeks but we got everything
  important back"

* tag 'bcachefs-2024-04-03' of https://evilpiepirate.org/git/bcachefs:
  bcachefs: reconstruct_inode()
  bcachefs: Subvolume reconstruction
  bcachefs: Check for extents that point to same space
  bcachefs: Reconstruct missing snapshot nodes
  bcachefs: Flag btrees with missing data
  bcachefs: Topology repair now uses nodes found by scanning to fill holes
  bcachefs: Repair pass for scanning for btree nodes
  bcachefs: Don't skip fake btree roots in fsck
  bcachefs: bch2_btree_root_alloc() -> bch2_btree_root_alloc_fake()
  bcachefs: Eytzinger cleanups
  bcachefs: bch2_shoot_down_journal_keys()
  bcachefs: Clear recovery_passes_required as they complete without errors
  bcachefs: ratelimit informational fsck errors
  bcachefs: Check for bad needs_discard before doing discard
  bcachefs: Improve bch2_btree_update_to_text()
  mean_and_variance: Drop always failing tests
  bcachefs: fix nocow lock deadlock
  bcachefs: BCH_WATERMARK_interior_updates
  bcachefs: Fix btree node reserve
parents c85af715 09d4c2ac
...@@ -17,6 +17,7 @@ bcachefs-y := \ ...@@ -17,6 +17,7 @@ bcachefs-y := \
btree_journal_iter.o \ btree_journal_iter.o \
btree_key_cache.o \ btree_key_cache.o \
btree_locking.o \ btree_locking.o \
btree_node_scan.o \
btree_trans_commit.o \ btree_trans_commit.o \
btree_update.o \ btree_update.o \
btree_update_interior.o \ btree_update_interior.o \
...@@ -37,6 +38,7 @@ bcachefs-y := \ ...@@ -37,6 +38,7 @@ bcachefs-y := \
error.o \ error.o \
extents.o \ extents.o \
extent_update.o \ extent_update.o \
eytzinger.o \
fs.o \ fs.o \
fs-common.o \ fs-common.o \
fs-ioctl.o \ fs-ioctl.o \
......
...@@ -1713,34 +1713,37 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, ...@@ -1713,34 +1713,37 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
if (ret) if (ret)
goto out; goto out;
if (BCH_ALLOC_V4_NEED_INC_GEN(&a->v)) { if (a->v.dirty_sectors) {
if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info,
trans, "attempting to discard bucket with dirty data\n%s",
(bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
ret = -EIO;
goto out;
}
if (a->v.data_type != BCH_DATA_need_discard) {
if (data_type_is_empty(a->v.data_type) &&
BCH_ALLOC_V4_NEED_INC_GEN(&a->v)) {
a->v.gen++; a->v.gen++;
SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false); SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false);
goto write; goto write;
} }
if (a->v.journal_seq > c->journal.flushed_seq_ondisk) { if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info,
if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) { trans, "bucket incorrectly set in need_discard btree\n"
bch2_trans_inconsistent(trans,
"clearing need_discard but journal_seq %llu > flushed_seq %llu\n"
"%s", "%s",
a->v.journal_seq, (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
c->journal.flushed_seq_ondisk,
(bch2_bkey_val_to_text(&buf, c, k), buf.buf));
ret = -EIO; ret = -EIO;
}
goto out; goto out;
} }
if (a->v.data_type != BCH_DATA_need_discard) { if (a->v.journal_seq > c->journal.flushed_seq_ondisk) {
if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) { if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info,
bch2_trans_inconsistent(trans, trans, "clearing need_discard but journal_seq %llu > flushed_seq %llu\n%s",
"bucket incorrectly set in need_discard btree\n" a->v.journal_seq,
"%s", c->journal.flushed_seq_ondisk,
(bch2_bkey_val_to_text(&buf, c, k), buf.buf)); (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
ret = -EIO; ret = -EIO;
}
goto out; goto out;
} }
...@@ -1835,6 +1838,7 @@ static int bch2_clear_bucket_needs_discard(struct btree_trans *trans, struct bpo ...@@ -1835,6 +1838,7 @@ static int bch2_clear_bucket_needs_discard(struct btree_trans *trans, struct bpo
if (ret) if (ret)
goto err; goto err;
BUG_ON(a->v.dirty_sectors);
SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false); SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false);
a->v.data_type = alloc_data_type(a->v, a->v.data_type); a->v.data_type = alloc_data_type(a->v, a->v.data_type);
...@@ -1942,6 +1946,7 @@ static int invalidate_one_bucket(struct btree_trans *trans, ...@@ -1942,6 +1946,7 @@ static int invalidate_one_bucket(struct btree_trans *trans,
goto out; goto out;
BUG_ON(a->v.data_type != BCH_DATA_cached); BUG_ON(a->v.data_type != BCH_DATA_cached);
BUG_ON(a->v.dirty_sectors);
if (!a->v.cached_sectors) if (!a->v.cached_sectors)
bch_err(c, "invalidating empty bucket, confused"); bch_err(c, "invalidating empty bucket, confused");
......
...@@ -188,8 +188,10 @@ long bch2_bucket_alloc_new_fs(struct bch_dev *ca) ...@@ -188,8 +188,10 @@ long bch2_bucket_alloc_new_fs(struct bch_dev *ca)
static inline unsigned open_buckets_reserved(enum bch_watermark watermark) static inline unsigned open_buckets_reserved(enum bch_watermark watermark)
{ {
switch (watermark) { switch (watermark) {
case BCH_WATERMARK_reclaim: case BCH_WATERMARK_interior_updates:
return 0; return 0;
case BCH_WATERMARK_reclaim:
return OPEN_BUCKETS_COUNT / 6;
case BCH_WATERMARK_btree: case BCH_WATERMARK_btree:
case BCH_WATERMARK_btree_copygc: case BCH_WATERMARK_btree_copygc:
return OPEN_BUCKETS_COUNT / 4; return OPEN_BUCKETS_COUNT / 4;
......
...@@ -22,7 +22,8 @@ struct bucket_alloc_state { ...@@ -22,7 +22,8 @@ struct bucket_alloc_state {
x(copygc) \ x(copygc) \
x(btree) \ x(btree) \
x(btree_copygc) \ x(btree_copygc) \
x(reclaim) x(reclaim) \
x(interior_updates)
enum bch_watermark { enum bch_watermark {
#define x(name) BCH_WATERMARK_##name, #define x(name) BCH_WATERMARK_##name,
......
...@@ -8,6 +8,7 @@ ...@@ -8,6 +8,7 @@
#include "btree_update.h" #include "btree_update.h"
#include "btree_update_interior.h" #include "btree_update_interior.h"
#include "btree_write_buffer.h" #include "btree_write_buffer.h"
#include "checksum.h"
#include "error.h" #include "error.h"
#include <linux/mm.h> #include <linux/mm.h>
...@@ -418,6 +419,84 @@ struct extents_to_bp_state { ...@@ -418,6 +419,84 @@ struct extents_to_bp_state {
struct bkey_buf last_flushed; struct bkey_buf last_flushed;
}; };
/*
 * Queue an update that rewrites @extent without its pointer(s) to @dev.
 *
 * A mutable copy of @extent is obtained from bch2_bkey_make_mut_noupdate(),
 * bch2_bkey_drop_device() is applied to that copy for @dev, and the copy is
 * then handed to bch2_btree_insert_trans() for @btree.
 *
 * Returns the error encoded in the copy pointer if making the copy failed,
 * otherwise whatever bch2_btree_insert_trans() returns.
 */
static int drop_dev_and_update(struct btree_trans *trans, enum btree_id btree,
			       struct bkey_s_c extent, unsigned dev)
{
	struct bkey_i *updated;
	int err;

	updated = bch2_bkey_make_mut_noupdate(trans, extent);
	err = PTR_ERR_OR_ZERO(updated);
	if (!err) {
		bch2_bkey_drop_device(bkey_i_to_s(updated), dev);
		err = bch2_btree_insert_trans(trans, btree, updated, 0);
	}

	return err;
}
static int check_extent_checksum(struct btree_trans *trans,
enum btree_id btree, struct bkey_s_c extent,
enum btree_id o_btree, struct bkey_s_c extent2, unsigned dev)
{
struct bch_fs *c = trans->c;
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(extent);
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
struct printbuf buf = PRINTBUF;
void *data_buf = NULL;
struct bio *bio = NULL;
size_t bytes;
int ret = 0;
if (bkey_is_btree_ptr(extent.k))
return false;
bkey_for_each_ptr_decode(extent.k, ptrs, p, entry)
if (p.ptr.dev == dev)
goto found;
BUG();
found:
if (!p.crc.csum_type)
return false;
bytes = p.crc.compressed_size << 9;
struct bch_dev *ca = bch_dev_bkey_exists(c, dev);
if (!bch2_dev_get_ioref(ca, READ))
return false;
data_buf = kvmalloc(bytes, GFP_KERNEL);
if (!data_buf) {
ret = -ENOMEM;
goto err;
}
bio = bio_alloc(ca->disk_sb.bdev, 1, REQ_OP_READ, GFP_KERNEL);
bio->bi_iter.bi_sector = p.ptr.offset;
bch2_bio_map(bio, data_buf, bytes);
ret = submit_bio_wait(bio);
if (ret)
goto err;
prt_str(&buf, "extents pointing to same space, but first extent checksum bad:");
prt_printf(&buf, "\n %s ", bch2_btree_id_str(btree));
bch2_bkey_val_to_text(&buf, c, extent);
prt_printf(&buf, "\n %s ", bch2_btree_id_str(o_btree));
bch2_bkey_val_to_text(&buf, c, extent2);
struct nonce nonce = extent_nonce(extent.k->version, p.crc);
struct bch_csum csum = bch2_checksum(c, p.crc.csum_type, nonce, data_buf, bytes);
if (fsck_err_on(bch2_crc_cmp(csum, p.crc.csum),
c, dup_backpointer_to_bad_csum_extent,
"%s", buf.buf))
ret = drop_dev_and_update(trans, btree, extent, dev) ?: 1;
fsck_err:
err:
if (bio)
bio_put(bio);
kvfree(data_buf);
percpu_ref_put(&ca->io_ref);
printbuf_exit(&buf);
return ret;
}
static int check_bp_exists(struct btree_trans *trans, static int check_bp_exists(struct btree_trans *trans,
struct extents_to_bp_state *s, struct extents_to_bp_state *s,
struct bpos bucket, struct bpos bucket,
...@@ -425,7 +504,8 @@ static int check_bp_exists(struct btree_trans *trans, ...@@ -425,7 +504,8 @@ static int check_bp_exists(struct btree_trans *trans,
struct bkey_s_c orig_k) struct bkey_s_c orig_k)
{ {
struct bch_fs *c = trans->c; struct bch_fs *c = trans->c;
struct btree_iter bp_iter = { NULL }; struct btree_iter bp_iter = {};
struct btree_iter other_extent_iter = {};
struct printbuf buf = PRINTBUF; struct printbuf buf = PRINTBUF;
struct bkey_s_c bp_k; struct bkey_s_c bp_k;
struct bkey_buf tmp; struct bkey_buf tmp;
...@@ -433,13 +513,19 @@ static int check_bp_exists(struct btree_trans *trans, ...@@ -433,13 +513,19 @@ static int check_bp_exists(struct btree_trans *trans,
bch2_bkey_buf_init(&tmp); bch2_bkey_buf_init(&tmp);
if (!bch2_dev_bucket_exists(c, bucket)) {
prt_str(&buf, "extent for nonexistent device:bucket ");
bch2_bpos_to_text(&buf, bucket);
prt_str(&buf, "\n ");
bch2_bkey_val_to_text(&buf, c, orig_k);
bch_err(c, "%s", buf.buf);
return -BCH_ERR_fsck_repair_unimplemented;
}
if (bpos_lt(bucket, s->bucket_start) || if (bpos_lt(bucket, s->bucket_start) ||
bpos_gt(bucket, s->bucket_end)) bpos_gt(bucket, s->bucket_end))
return 0; return 0;
if (!bch2_dev_bucket_exists(c, bucket))
goto missing;
bp_k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers, bp_k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers,
bucket_pos_to_bp(c, bucket, bp.bucket_offset), bucket_pos_to_bp(c, bucket, bp.bucket_offset),
0); 0);
...@@ -465,21 +551,94 @@ static int check_bp_exists(struct btree_trans *trans, ...@@ -465,21 +551,94 @@ static int check_bp_exists(struct btree_trans *trans,
ret = -BCH_ERR_transaction_restart_write_buffer_flush; ret = -BCH_ERR_transaction_restart_write_buffer_flush;
goto out; goto out;
} }
goto missing;
goto check_existing_bp;
} }
out: out:
err: err:
fsck_err: fsck_err:
bch2_trans_iter_exit(trans, &other_extent_iter);
bch2_trans_iter_exit(trans, &bp_iter); bch2_trans_iter_exit(trans, &bp_iter);
bch2_bkey_buf_exit(&tmp, c); bch2_bkey_buf_exit(&tmp, c);
printbuf_exit(&buf); printbuf_exit(&buf);
return ret; return ret;
check_existing_bp:
/* Do we have a backpointer for a different extent? */
if (bp_k.k->type != KEY_TYPE_backpointer)
goto missing;
struct bch_backpointer other_bp = *bkey_s_c_to_backpointer(bp_k).v;
struct bkey_s_c other_extent =
bch2_backpointer_get_key(trans, &other_extent_iter, bp_k.k->p, other_bp, 0);
ret = bkey_err(other_extent);
if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
ret = 0;
if (ret)
goto err;
if (!other_extent.k)
goto missing;
if (bch2_extents_match(orig_k, other_extent)) {
printbuf_reset(&buf);
prt_printf(&buf, "duplicate versions of same extent, deleting smaller\n ");
bch2_bkey_val_to_text(&buf, c, orig_k);
prt_str(&buf, "\n ");
bch2_bkey_val_to_text(&buf, c, other_extent);
bch_err(c, "%s", buf.buf);
if (other_extent.k->size <= orig_k.k->size) {
ret = drop_dev_and_update(trans, other_bp.btree_id, other_extent, bucket.inode);
if (ret)
goto err;
goto out;
} else {
ret = drop_dev_and_update(trans, bp.btree_id, orig_k, bucket.inode);
if (ret)
goto err;
goto missing;
}
}
ret = check_extent_checksum(trans, other_bp.btree_id, other_extent, bp.btree_id, orig_k, bucket.inode);
if (ret < 0)
goto err;
if (ret) {
ret = 0;
goto missing;
}
ret = check_extent_checksum(trans, bp.btree_id, orig_k, other_bp.btree_id, other_extent, bucket.inode);
if (ret < 0)
goto err;
if (ret) {
ret = 0;
goto out;
}
printbuf_reset(&buf);
prt_printf(&buf, "duplicate extents pointing to same space on dev %llu\n ", bucket.inode);
bch2_bkey_val_to_text(&buf, c, orig_k);
prt_str(&buf, "\n ");
bch2_bkey_val_to_text(&buf, c, other_extent);
bch_err(c, "%s", buf.buf);
ret = -BCH_ERR_fsck_repair_unimplemented;
goto err;
missing: missing:
printbuf_reset(&buf);
prt_printf(&buf, "missing backpointer for btree=%s l=%u ", prt_printf(&buf, "missing backpointer for btree=%s l=%u ",
bch2_btree_id_str(bp.btree_id), bp.level); bch2_btree_id_str(bp.btree_id), bp.level);
bch2_bkey_val_to_text(&buf, c, orig_k); bch2_bkey_val_to_text(&buf, c, orig_k);
prt_printf(&buf, "\nbp pos "); prt_printf(&buf, "\n got: ");
bch2_bpos_to_text(&buf, bp_iter.pos); bch2_bkey_val_to_text(&buf, c, bp_k);
struct bkey_i_backpointer n_bp_k;
bkey_backpointer_init(&n_bp_k.k_i);
n_bp_k.k.p = bucket_pos_to_bp(trans->c, bucket, bp.bucket_offset);
n_bp_k.v = bp;
prt_printf(&buf, "\n want: ");
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&n_bp_k.k_i));
if (fsck_err(c, ptr_to_missing_backpointer, "%s", buf.buf)) if (fsck_err(c, ptr_to_missing_backpointer, "%s", buf.buf))
ret = bch2_bucket_backpointer_mod(trans, bucket, bp, orig_k, true); ret = bch2_bucket_backpointer_mod(trans, bucket, bp, orig_k, true);
......
...@@ -456,6 +456,7 @@ enum bch_time_stats { ...@@ -456,6 +456,7 @@ enum bch_time_stats {
#include "alloc_types.h" #include "alloc_types.h"
#include "btree_types.h" #include "btree_types.h"
#include "btree_node_scan_types.h"
#include "btree_write_buffer_types.h" #include "btree_write_buffer_types.h"
#include "buckets_types.h" #include "buckets_types.h"
#include "buckets_waiting_for_journal_types.h" #include "buckets_waiting_for_journal_types.h"
...@@ -614,6 +615,7 @@ struct bch_dev { ...@@ -614,6 +615,7 @@ struct bch_dev {
*/ */
#define BCH_FS_FLAGS() \ #define BCH_FS_FLAGS() \
x(new_fs) \
x(started) \ x(started) \
x(may_go_rw) \ x(may_go_rw) \
x(rw) \ x(rw) \
...@@ -796,6 +798,7 @@ struct bch_fs { ...@@ -796,6 +798,7 @@ struct bch_fs {
u64 features; u64 features;
u64 compat; u64 compat;
unsigned long errors_silent[BITS_TO_LONGS(BCH_SB_ERR_MAX)]; unsigned long errors_silent[BITS_TO_LONGS(BCH_SB_ERR_MAX)];
u64 btrees_lost_data;
} sb; } sb;
...@@ -1103,6 +1106,8 @@ struct bch_fs { ...@@ -1103,6 +1106,8 @@ struct bch_fs {
struct journal_keys journal_keys; struct journal_keys journal_keys;
struct list_head journal_iters; struct list_head journal_iters;
struct find_btree_nodes found_btree_nodes;
u64 last_bucket_seq_cleanup; u64 last_bucket_seq_cleanup;
u64 counters_on_mount[BCH_COUNTER_NR]; u64 counters_on_mount[BCH_COUNTER_NR];
......
...@@ -818,6 +818,7 @@ struct bch_sb_field_ext { ...@@ -818,6 +818,7 @@ struct bch_sb_field_ext {
struct bch_sb_field field; struct bch_sb_field field;
__le64 recovery_passes_required[2]; __le64 recovery_passes_required[2];
__le64 errors_silent[8]; __le64 errors_silent[8];
__le64 btrees_lost_data;
}; };
struct bch_sb_field_downgrade_entry { struct bch_sb_field_downgrade_entry {
......
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
#include "btree_journal_iter.h" #include "btree_journal_iter.h"
#include "btree_key_cache.h" #include "btree_key_cache.h"
#include "btree_locking.h" #include "btree_locking.h"
#include "btree_node_scan.h"
#include "btree_update_interior.h" #include "btree_update_interior.h"
#include "btree_io.h" #include "btree_io.h"
#include "btree_gc.h" #include "btree_gc.h"
...@@ -41,6 +42,7 @@ ...@@ -41,6 +42,7 @@
#define DROP_THIS_NODE 10 #define DROP_THIS_NODE 10
#define DROP_PREV_NODE 11 #define DROP_PREV_NODE 11
#define DID_FILL_FROM_SCAN 12
static struct bkey_s unsafe_bkey_s_c_to_s(struct bkey_s_c k) static struct bkey_s unsafe_bkey_s_c_to_s(struct bkey_s_c k)
{ {
...@@ -129,6 +131,17 @@ static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min) ...@@ -129,6 +131,17 @@ static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min)
struct bkey_i_btree_ptr_v2 *new; struct bkey_i_btree_ptr_v2 *new;
int ret; int ret;
if (c->opts.verbose) {
struct printbuf buf = PRINTBUF;
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
prt_str(&buf, " -> ");
bch2_bpos_to_text(&buf, new_min);
bch_info(c, "%s(): %s", __func__, buf.buf);
printbuf_exit(&buf);
}
new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL); new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL);
if (!new) if (!new)
return -BCH_ERR_ENOMEM_gc_repair_key; return -BCH_ERR_ENOMEM_gc_repair_key;
...@@ -154,6 +167,17 @@ static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max) ...@@ -154,6 +167,17 @@ static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max)
struct bkey_i_btree_ptr_v2 *new; struct bkey_i_btree_ptr_v2 *new;
int ret; int ret;
if (c->opts.verbose) {
struct printbuf buf = PRINTBUF;
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
prt_str(&buf, " -> ");
bch2_bpos_to_text(&buf, new_max);
bch_info(c, "%s(): %s", __func__, buf.buf);
printbuf_exit(&buf);
}
ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level + 1, b->key.k.p); ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level + 1, b->key.k.p);
if (ret) if (ret)
return ret; return ret;
...@@ -185,127 +209,138 @@ static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max) ...@@ -185,127 +209,138 @@ static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max)
return 0; return 0;
} }
static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b, static int btree_check_node_boundaries(struct bch_fs *c, struct btree *b,
struct btree *prev, struct btree *cur) struct btree *prev, struct btree *cur,
struct bpos *pulled_from_scan)
{ {
struct bpos expected_start = !prev struct bpos expected_start = !prev
? b->data->min_key ? b->data->min_key
: bpos_successor(prev->key.k.p); : bpos_successor(prev->key.k.p);
struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; struct printbuf buf = PRINTBUF;
int ret = 0; int ret = 0;
if (!prev) { BUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
prt_printf(&buf1, "start of node: "); !bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key,
bch2_bpos_to_text(&buf1, b->data->min_key); b->data->min_key));
} else {
bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&prev->key));
}
bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(&cur->key)); if (bpos_eq(expected_start, cur->data->min_key))
return 0;
if (prev && prt_printf(&buf, " at btree %s level %u:\n parent: ",
bpos_gt(expected_start, cur->data->min_key) && bch2_btree_id_str(b->c.btree_id), b->c.level);
BTREE_NODE_SEQ(cur->data) > BTREE_NODE_SEQ(prev->data)) { bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
/* cur overwrites prev: */
if (mustfix_fsck_err_on(bpos_ge(prev->data->min_key, if (prev) {
cur->data->min_key), c, prt_printf(&buf, "\n prev: ");
btree_node_topology_overwritten_by_next_node, bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&prev->key));
"btree node overwritten by next node at btree %s level %u:\n"
" node %s\n"
" next %s",
bch2_btree_id_str(b->c.btree_id), b->c.level,
buf1.buf, buf2.buf)) {
ret = DROP_PREV_NODE;
goto out;
} }
if (mustfix_fsck_err_on(!bpos_eq(prev->key.k.p, prt_str(&buf, "\n next: ");
bpos_predecessor(cur->data->min_key)), c, bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&cur->key));
btree_node_topology_bad_max_key,
"btree node with incorrect max_key at btree %s level %u:\n" if (bpos_lt(expected_start, cur->data->min_key)) { /* gap */
" node %s\n" if (b->c.level == 1 &&
" next %s", bpos_lt(*pulled_from_scan, cur->data->min_key)) {
bch2_btree_id_str(b->c.btree_id), b->c.level, ret = bch2_get_scanned_nodes(c, b->c.btree_id, 0,
buf1.buf, buf2.buf)) expected_start,
bpos_predecessor(cur->data->min_key));
if (ret)
goto err;
*pulled_from_scan = cur->data->min_key;
ret = DID_FILL_FROM_SCAN;
} else {
if (mustfix_fsck_err(c, btree_node_topology_bad_min_key,
"btree node with incorrect min_key%s", buf.buf))
ret = set_node_min(c, cur, expected_start);
}
} else { /* overlap */
if (prev && BTREE_NODE_SEQ(cur->data) > BTREE_NODE_SEQ(prev->data)) { /* cur overwrites prev */
if (bpos_ge(prev->data->min_key, cur->data->min_key)) { /* fully? */
if (mustfix_fsck_err(c, btree_node_topology_overwritten_by_next_node,
"btree node overwritten by next node%s", buf.buf))
ret = DROP_PREV_NODE;
} else {
if (mustfix_fsck_err(c, btree_node_topology_bad_max_key,
"btree node with incorrect max_key%s", buf.buf))
ret = set_node_max(c, prev, ret = set_node_max(c, prev,
bpos_predecessor(cur->data->min_key)); bpos_predecessor(cur->data->min_key));
}
} else { } else {
/* prev overwrites cur: */ if (bpos_ge(expected_start, cur->data->max_key)) { /* fully? */
if (mustfix_fsck_err(c, btree_node_topology_overwritten_by_prev_node,
if (mustfix_fsck_err_on(bpos_ge(expected_start, "btree node overwritten by prev node%s", buf.buf))
cur->data->max_key), c,
btree_node_topology_overwritten_by_prev_node,
"btree node overwritten by prev node at btree %s level %u:\n"
" prev %s\n"
" node %s",
bch2_btree_id_str(b->c.btree_id), b->c.level,
buf1.buf, buf2.buf)) {
ret = DROP_THIS_NODE; ret = DROP_THIS_NODE;
goto out; } else {
} if (mustfix_fsck_err(c, btree_node_topology_bad_min_key,
"btree node with incorrect min_key%s", buf.buf))
if (mustfix_fsck_err_on(!bpos_eq(expected_start, cur->data->min_key), c,
btree_node_topology_bad_min_key,
"btree node with incorrect min_key at btree %s level %u:\n"
" prev %s\n"
" node %s",
bch2_btree_id_str(b->c.btree_id), b->c.level,
buf1.buf, buf2.buf))
ret = set_node_min(c, cur, expected_start); ret = set_node_min(c, cur, expected_start);
} }
out: }
}
err:
fsck_err: fsck_err:
printbuf_exit(&buf2); printbuf_exit(&buf);
printbuf_exit(&buf1);
return ret; return ret;
} }
static int btree_repair_node_end(struct bch_fs *c, struct btree *b, static int btree_repair_node_end(struct bch_fs *c, struct btree *b,
struct btree *child) struct btree *child, struct bpos *pulled_from_scan)
{ {
struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; struct printbuf buf = PRINTBUF;
int ret = 0; int ret = 0;
bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&child->key)); if (bpos_eq(child->key.k.p, b->key.k.p))
bch2_bpos_to_text(&buf2, b->key.k.p); return 0;
if (mustfix_fsck_err_on(!bpos_eq(child->key.k.p, b->key.k.p), c, prt_printf(&buf, "at btree %s level %u:\n parent: ",
btree_node_topology_bad_max_key, bch2_btree_id_str(b->c.btree_id), b->c.level);
"btree node with incorrect max_key at btree %s level %u:\n" bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
" %s\n"
" expected %s", prt_str(&buf, "\n child: ");
bch2_btree_id_str(b->c.btree_id), b->c.level, bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&child->key));
buf1.buf, buf2.buf)) {
ret = set_node_max(c, child, b->key.k.p); if (mustfix_fsck_err(c, btree_node_topology_bad_max_key,
"btree node with incorrect max_key%s", buf.buf)) {
if (b->c.level == 1 &&
bpos_lt(*pulled_from_scan, b->key.k.p)) {
ret = bch2_get_scanned_nodes(c, b->c.btree_id, 0,
bpos_successor(child->key.k.p), b->key.k.p);
if (ret) if (ret)
goto err; goto err;
*pulled_from_scan = b->key.k.p;
ret = DID_FILL_FROM_SCAN;
} else {
ret = set_node_max(c, child, b->key.k.p);
}
} }
err: err:
fsck_err: fsck_err:
printbuf_exit(&buf2); printbuf_exit(&buf);
printbuf_exit(&buf1);
return ret; return ret;
} }
static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct btree *b) static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct btree *b,
struct bpos *pulled_from_scan)
{ {
struct bch_fs *c = trans->c; struct bch_fs *c = trans->c;
struct btree_and_journal_iter iter; struct btree_and_journal_iter iter;
struct bkey_s_c k; struct bkey_s_c k;
struct bkey_buf prev_k, cur_k; struct bkey_buf prev_k, cur_k;
struct btree *prev = NULL, *cur = NULL; struct btree *prev = NULL, *cur = NULL;
bool have_child, dropped_children = false; bool have_child, new_pass = false;
struct printbuf buf = PRINTBUF; struct printbuf buf = PRINTBUF;
int ret = 0; int ret = 0;
if (!b->c.level) if (!b->c.level)
return 0; return 0;
again:
prev = NULL;
have_child = dropped_children = false;
bch2_bkey_buf_init(&prev_k); bch2_bkey_buf_init(&prev_k);
bch2_bkey_buf_init(&cur_k); bch2_bkey_buf_init(&cur_k);
again:
cur = prev = NULL;
have_child = new_pass = false;
bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
iter.prefetch = true; iter.prefetch = true;
...@@ -332,9 +367,10 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct ...@@ -332,9 +367,10 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct
b->c.level - 1, b->c.level - 1,
buf.buf)) { buf.buf)) {
bch2_btree_node_evict(trans, cur_k.k); bch2_btree_node_evict(trans, cur_k.k);
ret = bch2_journal_key_delete(c, b->c.btree_id,
b->c.level, cur_k.k->k.p);
cur = NULL; cur = NULL;
ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes) ?:
bch2_journal_key_delete(c, b->c.btree_id,
b->c.level, cur_k.k->k.p);
if (ret) if (ret)
break; break;
continue; continue;
...@@ -344,7 +380,23 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct ...@@ -344,7 +380,23 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct
if (ret) if (ret)
break; break;
ret = btree_repair_node_boundaries(c, b, prev, cur); if (bch2_btree_node_is_stale(c, cur)) {
bch_info(c, "btree node %s older than nodes found by scanning", buf.buf);
six_unlock_read(&cur->c.lock);
bch2_btree_node_evict(trans, cur_k.k);
ret = bch2_journal_key_delete(c, b->c.btree_id,
b->c.level, cur_k.k->k.p);
cur = NULL;
if (ret)
break;
continue;
}
ret = btree_check_node_boundaries(c, b, prev, cur, pulled_from_scan);
if (ret == DID_FILL_FROM_SCAN) {
new_pass = true;
ret = 0;
}
if (ret == DROP_THIS_NODE) { if (ret == DROP_THIS_NODE) {
six_unlock_read(&cur->c.lock); six_unlock_read(&cur->c.lock);
...@@ -370,8 +422,6 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct ...@@ -370,8 +422,6 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct
break; break;
bch2_btree_and_journal_iter_exit(&iter); bch2_btree_and_journal_iter_exit(&iter);
bch2_bkey_buf_exit(&prev_k, c);
bch2_bkey_buf_exit(&cur_k, c);
goto again; goto again;
} else if (ret) } else if (ret)
break; break;
...@@ -383,7 +433,11 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct ...@@ -383,7 +433,11 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct
if (!ret && !IS_ERR_OR_NULL(prev)) { if (!ret && !IS_ERR_OR_NULL(prev)) {
BUG_ON(cur); BUG_ON(cur);
ret = btree_repair_node_end(c, b, prev); ret = btree_repair_node_end(c, b, prev, pulled_from_scan);
if (ret == DID_FILL_FROM_SCAN) {
new_pass = true;
ret = 0;
}
} }
if (!IS_ERR_OR_NULL(prev)) if (!IS_ERR_OR_NULL(prev))
...@@ -397,6 +451,10 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct ...@@ -397,6 +451,10 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct
goto err; goto err;
bch2_btree_and_journal_iter_exit(&iter); bch2_btree_and_journal_iter_exit(&iter);
if (new_pass)
goto again;
bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
iter.prefetch = true; iter.prefetch = true;
...@@ -413,7 +471,7 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct ...@@ -413,7 +471,7 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct
if (ret) if (ret)
goto err; goto err;
ret = bch2_btree_repair_topology_recurse(trans, cur); ret = bch2_btree_repair_topology_recurse(trans, cur, pulled_from_scan);
six_unlock_read(&cur->c.lock); six_unlock_read(&cur->c.lock);
cur = NULL; cur = NULL;
...@@ -421,7 +479,7 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct ...@@ -421,7 +479,7 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct
bch2_btree_node_evict(trans, cur_k.k); bch2_btree_node_evict(trans, cur_k.k);
ret = bch2_journal_key_delete(c, b->c.btree_id, ret = bch2_journal_key_delete(c, b->c.btree_id,
b->c.level, cur_k.k->k.p); b->c.level, cur_k.k->k.p);
dropped_children = true; new_pass = true;
} }
if (ret) if (ret)
...@@ -448,12 +506,14 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct ...@@ -448,12 +506,14 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct
six_unlock_read(&cur->c.lock); six_unlock_read(&cur->c.lock);
bch2_btree_and_journal_iter_exit(&iter); bch2_btree_and_journal_iter_exit(&iter);
bch2_bkey_buf_exit(&prev_k, c);
bch2_bkey_buf_exit(&cur_k, c);
if (!ret && dropped_children) if (!ret && new_pass)
goto again; goto again;
BUG_ON(!ret && bch2_btree_node_check_topology(trans, b));
bch2_bkey_buf_exit(&prev_k, c);
bch2_bkey_buf_exit(&cur_k, c);
printbuf_exit(&buf); printbuf_exit(&buf);
return ret; return ret;
} }
...@@ -461,32 +521,63 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct ...@@ -461,32 +521,63 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct
int bch2_check_topology(struct bch_fs *c) int bch2_check_topology(struct bch_fs *c)
{ {
struct btree_trans *trans = bch2_trans_get(c); struct btree_trans *trans = bch2_trans_get(c);
struct btree *b; struct bpos pulled_from_scan = POS_MIN;
unsigned i;
int ret = 0; int ret = 0;
for (i = 0; i < btree_id_nr_alive(c) && !ret; i++) { for (unsigned i = 0; i < btree_id_nr_alive(c) && !ret; i++) {
struct btree_root *r = bch2_btree_id_root(c, i); struct btree_root *r = bch2_btree_id_root(c, i);
bool reconstructed_root = false;
if (!r->alive) if (r->error) {
continue; ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes);
if (ret)
break;
reconstruct_root:
bch_info(c, "btree root %s unreadable, must recover from scan", bch2_btree_id_str(i));
b = r->b; r->alive = false;
if (btree_node_fake(b)) r->error = 0;
continue;
if (!bch2_btree_has_scanned_nodes(c, i)) {
mustfix_fsck_err(c, btree_root_unreadable_and_scan_found_nothing,
"no nodes found for btree %s, continue?", bch2_btree_id_str(i));
bch2_btree_root_alloc_fake(c, i, 0);
} else {
bch2_btree_root_alloc_fake(c, i, 1);
ret = bch2_get_scanned_nodes(c, i, 0, POS_MIN, SPOS_MAX);
if (ret)
break;
}
bch2_shoot_down_journal_keys(c, i, 1, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
reconstructed_root = true;
}
struct btree *b = r->b;
btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
ret = bch2_btree_repair_topology_recurse(trans, b); ret = bch2_btree_repair_topology_recurse(trans, b, &pulled_from_scan);
six_unlock_read(&b->c.lock); six_unlock_read(&b->c.lock);
if (ret == DROP_THIS_NODE) { if (ret == DROP_THIS_NODE) {
bch_err(c, "empty btree root - repair unimplemented"); bch2_btree_node_hash_remove(&c->btree_cache, b);
ret = -BCH_ERR_fsck_repair_unimplemented; mutex_lock(&c->btree_cache.lock);
list_move(&b->list, &c->btree_cache.freeable);
mutex_unlock(&c->btree_cache.lock);
r->b = NULL;
if (!reconstructed_root)
goto reconstruct_root;
bch_err(c, "empty btree root %s", bch2_btree_id_str(i));
bch2_btree_root_alloc_fake(c, i, 0);
r->alive = false;
ret = 0;
} }
} }
fsck_err:
bch2_trans_put(trans); bch2_trans_put(trans);
return ret; return ret;
} }
...@@ -931,9 +1022,6 @@ static int bch2_gc_btree_init(struct btree_trans *trans, ...@@ -931,9 +1022,6 @@ static int bch2_gc_btree_init(struct btree_trans *trans,
b = bch2_btree_id_root(c, btree_id)->b; b = bch2_btree_id_root(c, btree_id)->b;
if (btree_node_fake(b))
return 0;
six_lock_read(&b->c.lock, NULL, NULL); six_lock_read(&b->c.lock, NULL, NULL);
printbuf_reset(&buf); printbuf_reset(&buf);
bch2_bpos_to_text(&buf, b->data->min_key); bch2_bpos_to_text(&buf, b->data->min_key);
......
...@@ -1264,10 +1264,12 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, ...@@ -1264,10 +1264,12 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
return retry_read; return retry_read;
fsck_err: fsck_err:
if (ret == -BCH_ERR_btree_node_read_err_want_retry || if (ret == -BCH_ERR_btree_node_read_err_want_retry ||
ret == -BCH_ERR_btree_node_read_err_must_retry) ret == -BCH_ERR_btree_node_read_err_must_retry) {
retry_read = 1; retry_read = 1;
else } else {
set_btree_node_read_error(b); set_btree_node_read_error(b);
bch2_btree_lost_data(c, b->c.btree_id);
}
goto out; goto out;
} }
...@@ -1328,6 +1330,7 @@ static void btree_node_read_work(struct work_struct *work) ...@@ -1328,6 +1330,7 @@ static void btree_node_read_work(struct work_struct *work)
if (!can_retry) { if (!can_retry) {
set_btree_node_read_error(b); set_btree_node_read_error(b);
bch2_btree_lost_data(c, b->c.btree_id);
break; break;
} }
} }
...@@ -1527,9 +1530,10 @@ static CLOSURE_CALLBACK(btree_node_read_all_replicas_done) ...@@ -1527,9 +1530,10 @@ static CLOSURE_CALLBACK(btree_node_read_all_replicas_done)
ret = -1; ret = -1;
} }
if (ret) if (ret) {
set_btree_node_read_error(b); set_btree_node_read_error(b);
else if (*saw_error) bch2_btree_lost_data(c, b->c.btree_id);
} else if (*saw_error)
bch2_btree_node_rewrite_async(c, b); bch2_btree_node_rewrite_async(c, b);
for (i = 0; i < ra->nr; i++) { for (i = 0; i < ra->nr; i++) {
...@@ -1665,6 +1669,7 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b, ...@@ -1665,6 +1669,7 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b,
bch2_fatal_error(c); bch2_fatal_error(c);
set_btree_node_read_error(b); set_btree_node_read_error(b);
bch2_btree_lost_data(c, b->c.btree_id);
clear_btree_node_read_in_flight(b); clear_btree_node_read_in_flight(b);
wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
printbuf_exit(&buf); printbuf_exit(&buf);
...@@ -1861,7 +1866,7 @@ static void btree_node_write_work(struct work_struct *work) ...@@ -1861,7 +1866,7 @@ static void btree_node_write_work(struct work_struct *work)
} else { } else {
ret = bch2_trans_do(c, NULL, NULL, 0, ret = bch2_trans_do(c, NULL, NULL, 0,
bch2_btree_node_update_key_get_iter(trans, b, &wbio->key, bch2_btree_node_update_key_get_iter(trans, b, &wbio->key,
BCH_WATERMARK_reclaim| BCH_WATERMARK_interior_updates|
BCH_TRANS_COMMIT_journal_reclaim| BCH_TRANS_COMMIT_journal_reclaim|
BCH_TRANS_COMMIT_no_enospc| BCH_TRANS_COMMIT_no_enospc|
BCH_TRANS_COMMIT_no_check_rw, BCH_TRANS_COMMIT_no_check_rw,
......
...@@ -567,3 +567,22 @@ int bch2_journal_keys_sort(struct bch_fs *c) ...@@ -567,3 +567,22 @@ int bch2_journal_keys_sort(struct bch_fs *c)
bch_verbose(c, "Journal keys: %zu read, %zu after sorting and compacting", nr_read, keys->nr); bch_verbose(c, "Journal keys: %zu read, %zu after sorting and compacting", nr_read, keys->nr);
return 0; return 0;
} }
void bch2_shoot_down_journal_keys(struct bch_fs *c, enum btree_id btree,
unsigned level_min, unsigned level_max,
struct bpos start, struct bpos end)
{
struct journal_keys *keys = &c->journal_keys;
size_t dst = 0;
move_gap(keys, keys->nr);
darray_for_each(*keys, i)
if (!(i->btree_id == btree &&
i->level >= level_min &&
i->level <= level_max &&
bpos_ge(i->k->k.p, start) &&
bpos_le(i->k->k.p, end)))
keys->data[dst++] = *i;
keys->nr = keys->gap = dst;
}
...@@ -66,4 +66,8 @@ void bch2_journal_entries_free(struct bch_fs *); ...@@ -66,4 +66,8 @@ void bch2_journal_entries_free(struct bch_fs *);
int bch2_journal_keys_sort(struct bch_fs *); int bch2_journal_keys_sort(struct bch_fs *);
void bch2_shoot_down_journal_keys(struct bch_fs *, enum btree_id,
unsigned, unsigned,
struct bpos, struct bpos);
#endif /* _BCACHEFS_BTREE_JOURNAL_ITER_H */ #endif /* _BCACHEFS_BTREE_JOURNAL_ITER_H */
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "btree_cache.h"
#include "btree_io.h"
#include "btree_journal_iter.h"
#include "btree_node_scan.h"
#include "btree_update_interior.h"
#include "buckets.h"
#include "error.h"
#include "journal_io.h"
#include "recovery_passes.h"
#include <linux/kthread.h>
#include <linux/sort.h>
/* Arguments handed to one read_btree_nodes_worker() kthread; freed by the worker */
struct find_btree_nodes_worker {
	/* closure the worker drops with closure_put() once it is done */
	struct closure *cl;
	/* shared scan state (result list, lock, error code) */
	struct find_btree_nodes *f;
	/* device scanned by this worker */
	struct bch_dev *ca;
};
static void found_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct found_btree_node *n)
{
prt_printf(out, "%s l=%u seq=%u cookie=%llx ", bch2_btree_id_str(n->btree_id), n->level, n->seq, n->cookie);
bch2_bpos_to_text(out, n->min_key);
prt_str(out, "-");
bch2_bpos_to_text(out, n->max_key);
if (n->range_updated)
prt_str(out, " range updated");
if (n->overwritten)
prt_str(out, " overwritten");
for (unsigned i = 0; i < n->nr_ptrs; i++) {
prt_char(out, ' ');
bch2_extent_ptr_to_text(out, c, n->ptrs + i);
}
}
/* Print every node in @nodes, one per line, indented by two columns */
static void found_btree_nodes_to_text(struct printbuf *out, struct bch_fs *c, found_btree_nodes nodes)
{
	printbuf_indent_add(out, 2);

	for (size_t idx = 0; idx < nodes.nr; idx++) {
		found_btree_node_to_text(out, c, &nodes.data[idx]);
		prt_newline(out);
	}

	printbuf_indent_sub(out, 2);
}
/*
 * Build a btree_ptr_v2 key in @k that points at the scanned node @f.
 *
 * The key position is the node's max_key, the value carries min_key, the
 * node's cookie as its seq, the range_updated flag and a copy of all of the
 * node's pointers. sectors_written is left at zero.
 */
static void found_btree_node_to_key(struct bkey_i *k, const struct found_btree_node *f)
{
	struct bkey_i_btree_ptr_v2 *bp = bkey_btree_ptr_v2_init(k);
	/* fixed part of the value plus one u64 counted per pointer */
	unsigned val_u64s = sizeof(struct bch_btree_ptr_v2) / sizeof(u64) + f->nr_ptrs;

	set_bkey_val_u64s(&bp->k, val_u64s);

	bp->k.p = f->max_key;
	bp->v.min_key = f->min_key;
	bp->v.seq = cpu_to_le64(f->cookie);
	bp->v.sectors_written = 0;

	/* clear the flags word first, then set the one flag we track */
	bp->v.flags = 0;
	SET_BTREE_PTR_RANGE_UPDATED(&bp->v, f->range_updated);

	memcpy(bp->v.start, f->ptrs, f->nr_ptrs * sizeof(struct bch_extent_ptr));
}
static bool found_btree_node_is_readable(struct btree_trans *trans,
const struct found_btree_node *f)
{
struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } k;
found_btree_node_to_key(&k.k, f);
struct btree *b = bch2_btree_node_get_noiter(trans, &k.k, f->btree_id, f->level, false);
bool ret = !IS_ERR_OR_NULL(b);
if (ret)
six_unlock_read(&b->c.lock);
/*
* We might update this node's range; if that happens, we need the node
* to be re-read so the read path can trim keys that are no longer in
* this node
*/
if (b != btree_node_root(trans->c, b))
bch2_btree_node_evict(trans, &k.k);
return ret;
}
/*
 * sort() comparison: order by btree id, then level, then cookie (all
 * ascending), so that entries sharing a cookie end up adjacent.
 */
static int found_btree_node_cmp_cookie(const void *_l, const void *_r)
{
	const struct found_btree_node *lhs = _l;
	const struct found_btree_node *rhs = _r;
	int cmp;

	cmp = cmp_int(lhs->btree_id, rhs->btree_id);
	if (cmp)
		return cmp;

	cmp = cmp_int(lhs->level, rhs->level);
	if (cmp)
		return cmp;

	return cmp_int(lhs->cookie, rhs->cookie);
}
/*
 * Order two found btree nodes by their sequence number only: the result is
 * negative, zero or positive when @l's seq is lower than, equal to or higher
 * than @r's. Callers treat the node with the higher seq as the newer one.
 *
 * NOTE(review): an earlier version of this comment said that nodes with equal
 * sequence numbers are told apart by readability; nothing of the sort is
 * implemented here - equal seqs simply compare equal.
 */
static int found_btree_node_cmp_time(const struct found_btree_node *l,
	const struct found_btree_node *r)
{
	return cmp_int(l->seq, r->seq);
}
/*
 * Positional ordering of found nodes: btree id ascending, level descending,
 * min_key ascending and, for nodes starting at the same key, higher seq
 * first.
 */
static int found_btree_node_cmp_pos(const void *_l, const void *_r)
{
	const struct found_btree_node *lhs = _l;
	const struct found_btree_node *rhs = _r;
	int cmp;

	cmp = cmp_int(lhs->btree_id, rhs->btree_id);
	if (cmp)
		return cmp;

	/* negated: higher levels sort first */
	cmp = -cmp_int(lhs->level, rhs->level);
	if (cmp)
		return cmp;

	cmp = bpos_cmp(lhs->min_key, rhs->min_key);
	if (cmp)
		return cmp;

	/* negated: higher seq sorts first */
	return -found_btree_node_cmp_time(lhs, rhs);
}
/*
 * Probe sector @offset of device @ca for a btree node.
 *
 * PAGE_SIZE bytes are read into @bn through @bio. If the read succeeds and
 * the magic matches this filesystem, a found_btree_node describing the header
 * (with a single pointer to @ca/@offset) is built, and - provided the node is
 * readable - appended to f->nodes under f->lock.
 *
 * Nothing is returned; allocation failure and an unsupported endianness are
 * reported through f->ret.
 */
static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca,
	struct bio *bio, struct btree_node *bn, u64 offset)
{
	struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes);
	/* the caller's bio is reused for every probe */
	bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ);
	bio->bi_iter.bi_sector = offset;
	bch2_bio_map(bio, bn, PAGE_SIZE);
	submit_bio_wait(bio);
	/* an IO error is accounted against the device; the location is skipped */
	if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read,
		"IO error in try_read_btree_node() at %llu: %s",
		offset, bch2_blk_status_to_str(bio->bi_status)))
		return;
	/* not a btree node of this filesystem */
	if (le64_to_cpu(bn->magic) != bset_magic(c))
		return;
	/* bucket_gen() is looked up inside the RCU read-side section */
	rcu_read_lock();
	struct found_btree_node n = {
		.btree_id = BTREE_NODE_ID(bn),
		.level = BTREE_NODE_LEVEL(bn),
		.seq = BTREE_NODE_SEQ(bn),
		.cookie = le64_to_cpu(bn->keys.seq),
		.min_key = bn->min_key,
		.max_key = bn->max_key,
		.nr_ptrs = 1,
		.ptrs[0].type = 1 << BCH_EXTENT_ENTRY_ptr,
		.ptrs[0].offset = offset,
		.ptrs[0].dev = ca->dev_idx,
		.ptrs[0].gen = *bucket_gen(ca, sector_to_bucket(ca, offset)),
	};
	rcu_read_unlock();
	/* only nodes that can be read through the normal read path are recorded */
	if (bch2_trans_run(c, found_btree_node_is_readable(trans, &n))) {
		/* f->nodes and f->ret are shared by the workers of all devices */
		mutex_lock(&f->lock);
		if (BSET_BIG_ENDIAN(&bn->keys) != CPU_BIG_ENDIAN) {
			bch_err(c, "try_read_btree_node() can't handle endian conversion");
			f->ret = -EINVAL;
			goto unlock;
		}
		if (darray_push(&f->nodes, n))
			f->ret = -ENOMEM;
unlock:
		mutex_unlock(&f->lock);
	}
}
/*
 * kthread body: scan one device for btree nodes.
 *
 * @p is a struct find_btree_nodes_worker allocated by read_btree_nodes().
 * Every btree-node-sized slot of every bucket from first_bucket up to
 * nbuckets is probed with try_read_btree_node(); progress is logged at most
 * once every 30 seconds.
 *
 * On exit the worker releases everything that was set up for it: its bio and
 * page, the device io_ref and closure ref taken by read_btree_nodes(), and
 * the worker struct itself.
 *
 * Always returns 0; an allocation failure is reported through w->f->ret.
 */
static int read_btree_nodes_worker(void *p)
{
	struct find_btree_nodes_worker *w = p;
	struct bch_fs *c = container_of(w->f, struct bch_fs, found_btree_nodes);
	struct bch_dev *ca = w->ca;
	void *buf = (void *) __get_free_page(GFP_KERNEL);
	struct bio *bio = bio_alloc(NULL, 1, 0, GFP_KERNEL);
	unsigned long last_print = jiffies;

	if (!buf || !bio) {
		bch_err(c, "read_btree_nodes_worker: error allocating bio/buf");
		w->f->ret = -ENOMEM;
		goto err;
	}

	for (u64 bucket = ca->mi.first_bucket; bucket < ca->mi.nbuckets; bucket++) {
		for (unsigned bucket_offset = 0;
		     bucket_offset + btree_sectors(c) <= ca->mi.bucket_size;
		     bucket_offset += btree_sectors(c)) {
			if (time_after(jiffies, last_print + HZ * 30)) {
				u64 cur_sector = bucket * ca->mi.bucket_size + bucket_offset;
				u64 end_sector = ca->mi.nbuckets * ca->mi.bucket_size;

				bch_info(ca, "%s: %2u%% done", __func__,
					 (unsigned) div64_u64(cur_sector * 100, end_sector));
				last_print = jiffies;
			}

			try_read_btree_node(w->f, ca, bio, buf,
					    bucket * ca->mi.bucket_size + bucket_offset);
		}
	}
err:
	/* bio may be NULL when we got here through the allocation failure path */
	if (bio)
		bio_put(bio);
	free_page((unsigned long) buf);
	/*
	 * Drop (not take!) the device reference that read_btree_nodes()
	 * acquired on our behalf before starting this thread.
	 */
	percpu_ref_put(&ca->io_ref);
	closure_put(w->cl);
	kfree(w);
	return 0;
}
/*
 * Scan all online devices for btree nodes, one kthread per device, and wait
 * for every started thread to finish.
 *
 * For each worker a closure ref and an extra device io_ref are taken; the
 * worker drops both when it exits.
 *
 * Returns 0 on success, otherwise the error recorded in f->ret by a worker
 * or, failing that, the negative errno from setting up a worker.
 */
static int read_btree_nodes(struct find_btree_nodes *f)
{
	struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes);
	struct closure cl;
	int ret = 0;

	closure_init_stack(&cl);

	for_each_online_member(c, ca) {
		struct find_btree_nodes_worker *w = kmalloc(sizeof(*w), GFP_KERNEL);
		struct task_struct *t;

		if (!w) {
			/* leaving the loop early: drop the iterator's ref on @ca */
			percpu_ref_put(&ca->io_ref);
			ret = -ENOMEM;
			goto err;
		}

		/* references owned by the worker, released when it exits */
		percpu_ref_get(&ca->io_ref);
		closure_get(&cl);
		w->cl = &cl;
		w->f = f;
		w->ca = ca;

		t = kthread_run(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name);
		/* kthread_run() reports failure as an ERR_PTR: keep the real errno */
		ret = PTR_ERR_OR_ZERO(t);
		if (ret) {
			/* the worker never ran, so undo everything set up for it */
			kfree(w);
			percpu_ref_put(&ca->io_ref);
			closure_put(&cl);
			/* and, as in the !w case, drop the iterator's ref before breaking out */
			percpu_ref_put(&ca->io_ref);
			f->ret = ret;
			bch_err(c, "error starting kthread: %i", ret);
			break;
		}
	}
err:
	/* wait for every worker that was started */
	closure_sync(&cl);
	return f->ret ?: ret;
}
/*
 * Restore positional order after @n's key range was changed: swap @n with
 * its successor for as long as it compares greater (per
 * found_btree_node_cmp_pos()), never going past @end.
 */
static void bubble_up(struct found_btree_node *n, struct found_btree_node *end)
{
	for (; n + 1 < end; n++) {
		if (found_btree_node_cmp_pos(n, n + 1) <= 0)
			break;

		swap(n[0], n[1]);
	}
}
/*
 * Resolve overlaps between @start and the nodes that follow it.
 *
 * [@start, @end) is expected to be sorted by found_btree_node_cmp_pos(). Each
 * following node @n of the same btree and level whose min_key is below
 * start->max_key is compared with @start by seq:
 *
 *  - @start has the higher seq: @n is marked overwritten if @start covers it
 *    up to its max_key; otherwise @n is trimmed to begin right after @start,
 *    moved to its new sorted position and the scan restarts.
 *  - @n has the higher seq: @start is trimmed to end right before @n.
 *  - equal seq: both nodes are logged and the scan is aborted.
 *
 * Returns 0 on success, -1 if two overlapping nodes have the same seq.
 */
static int handle_overwrites(struct bch_fs *c,
	struct found_btree_node *start,
	struct found_btree_node *end)
{
	struct found_btree_node *n;
again:
	for (n = start + 1;
		n < end &&
		n->btree_id == start->btree_id &&
		n->level == start->level &&
		bpos_lt(n->min_key, start->max_key);
		n++) {
		int cmp = found_btree_node_cmp_time(start, n);
		if (cmp > 0) {
			/* @start wins */
			if (bpos_cmp(start->max_key, n->max_key) >= 0)
				n->overwritten = true;
			else {
				/* NOTE(review): range_updated is assigned twice here; the second store is redundant */
				n->range_updated = true;
				n->min_key = bpos_successor(start->max_key);
				n->range_updated = true;
				/* @n's min_key changed, so the array order may be stale: re-sort and rescan */
				bubble_up(n, end);
				goto again;
			}
		} else if (cmp < 0) {
			/* @n wins; it must start strictly after @start for the trim below to make sense */
			BUG_ON(bpos_cmp(n->min_key, start->min_key) <= 0);
			start->max_key = bpos_predecessor(n->min_key);
			start->range_updated = true;
		} else {
			struct printbuf buf = PRINTBUF;
			prt_str(&buf, "overlapping btree nodes with same seq! halting\n ");
			found_btree_node_to_text(&buf, c, start);
			prt_str(&buf, "\n ");
			found_btree_node_to_text(&buf, c, n);
			bch_err(c, "%s", buf.buf);
			printbuf_exit(&buf);
			return -1;
		}
	}
	return 0;
}
/*
 * Scan all devices for btree nodes and build the lookup table in
 * c->found_btree_nodes.
 *
 * Steps:
 *  1. read_btree_nodes() collects one entry per node replica found on disk;
 *  2. entries are sorted by cookie and entries with the same cookie are
 *     merged into a single node carrying several pointers;
 *  3. nodes are sorted by position and handle_overwrites() trims or drops
 *     nodes that overlap a node with a higher seq;
 *  4. the survivors are put in eytzinger order for the range lookups.
 *
 * Does nothing if f->nodes is already populated.
 *
 * Returns 0 on success, -EINVAL if no node was found or a node has more
 * replicas than fit in ptrs[], or the error from read_btree_nodes() /
 * handle_overwrites().
 */
int bch2_scan_for_btree_nodes(struct bch_fs *c)
{
	struct find_btree_nodes *f = &c->found_btree_nodes;
	struct printbuf buf = PRINTBUF;
	size_t dst;
	int ret = 0;
	/* a previous scan already filled in the table */
	if (f->nodes.nr)
		return 0;
	mutex_init(&f->lock);
	ret = read_btree_nodes(f);
	if (ret)
		return ret;
	if (!f->nodes.nr) {
		bch_err(c, "%s: no btree nodes found", __func__);
		ret = -EINVAL;
		goto err;
	}
	/* debug dump, compiled out by the constant 0 */
	if (0 && c->opts.verbose) {
		printbuf_reset(&buf);
		prt_printf(&buf, "%s: nodes found:\n", __func__);
		found_btree_nodes_to_text(&buf, c, f->nodes);
		bch2_print_string_as_lines(KERN_INFO, buf.buf);
	}
	/* bring entries with the same cookie next to each other */
	sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_cookie, NULL);
	/* merge them in place: dst is the number of nodes kept so far */
	dst = 0;
	darray_for_each(f->nodes, i) {
		struct found_btree_node *prev = dst ? f->nodes.data + dst - 1 : NULL;
		if (prev &&
			prev->cookie == i->cookie) {
			if (prev->nr_ptrs == ARRAY_SIZE(prev->ptrs)) {
				bch_err(c, "%s: found too many replicas for btree node", __func__);
				ret = -EINVAL;
				goto err;
			}
			/* same node found at another location: just record the extra pointer */
			prev->ptrs[prev->nr_ptrs++] = i->ptrs[0];
		} else {
			f->nodes.data[dst++] = *i;
		}
	}
	f->nodes.nr = dst;
	sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL);
	/* debug dump, compiled out by the constant 0 */
	if (0 && c->opts.verbose) {
		printbuf_reset(&buf);
		prt_printf(&buf, "%s: nodes after merging replicas:\n", __func__);
		found_btree_nodes_to_text(&buf, c, f->nodes);
		bch2_print_string_as_lines(KERN_INFO, buf.buf);
	}
	/* resolve overlaps, dropping nodes flagged as overwritten along the way */
	dst = 0;
	darray_for_each(f->nodes, i) {
		if (i->overwritten)
			continue;
		/* &darray_top() is used as the end of the array handed to handle_overwrites() */
		ret = handle_overwrites(c, i, &darray_top(f->nodes));
		if (ret)
			goto err;
		BUG_ON(i->overwritten);
		f->nodes.data[dst++] = *i;
	}
	f->nodes.nr = dst;
	if (c->opts.verbose) {
		printbuf_reset(&buf);
		prt_printf(&buf, "%s: nodes found after overwrites:\n", __func__);
		found_btree_nodes_to_text(&buf, c, f->nodes);
		bch2_print_string_as_lines(KERN_INFO, buf.buf);
	}
	/* layout expected by the eytzinger0_find_gt() based range iteration */
	eytzinger0_sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL);
err:
	printbuf_exit(&buf);
	return ret;
}
/*
 * Search comparison used to find where a key range starts: btree id
 * ascending, level descending, then @_l's max_key against @_r's min_key.
 */
static int found_btree_node_range_start_cmp(const void *_l, const void *_r)
{
	const struct found_btree_node *lhs = _l;
	const struct found_btree_node *rhs = _r;
	int cmp;

	cmp = cmp_int(lhs->btree_id, rhs->btree_id);
	if (cmp)
		return cmp;

	/* negated: higher levels sort first */
	cmp = -cmp_int(lhs->level, rhs->level);
	if (cmp)
		return cmp;

	return bpos_cmp(lhs->max_key, rhs->min_key);
}
/*
 * Iterate over the found btree nodes of (_f) that match (_search):
 *
 * @_f:		pointer to a struct find_btree_nodes whose nodes are in
 *		eytzinger order
 * @_search:	struct found_btree_node lvalue giving btree_id, level and the
 *		key range [min_key, max_key] to look for
 * @_idx:	name of the size_t index variable declared by the loop
 *
 * The loop starts at the entry returned by eytzinger0_find_gt() for
 * (_search) and continues while entries have the same btree and level and
 * begin before (_search).max_key.
 *
 * The search key is referenced through the macro parameter, so the caller's
 * variable may have any name.
 */
#define for_each_found_btree_node_in_range(_f, _search, _idx) \
	for (size_t _idx = eytzinger0_find_gt((_f)->nodes.data, (_f)->nodes.nr, \
			sizeof((_f)->nodes.data[0]), \
			found_btree_node_range_start_cmp, &(_search)); \
	     _idx < (_f)->nodes.nr && \
	     (_f)->nodes.data[_idx].btree_id == (_search).btree_id && \
	     (_f)->nodes.data[_idx].level == (_search).level && \
	     bpos_lt((_f)->nodes.data[_idx].min_key, (_search).max_key); \
	     _idx = eytzinger0_next(_idx, (_f)->nodes.nr))
bool bch2_btree_node_is_stale(struct bch_fs *c, struct btree *b)
{
struct find_btree_nodes *f = &c->found_btree_nodes;
struct found_btree_node search = {
.btree_id = b->c.btree_id,
.level = b->c.level,
.min_key = b->data->min_key,
.max_key = b->key.k.p,
};
for_each_found_btree_node_in_range(f, search, idx)
if (f->nodes.data[idx].seq > BTREE_NODE_SEQ(b->data))
return true;
return false;
}
bool bch2_btree_has_scanned_nodes(struct bch_fs *c, enum btree_id btree)
{
struct found_btree_node search = {
.btree_id = btree,
.level = 0,
.min_key = POS_MIN,
.max_key = SPOS_MAX,
};
for_each_found_btree_node_in_range(&c->found_btree_nodes, search, idx)
return true;
return false;
}
/*
 * Recover the nodes of @btree at @level that fall in [@node_min, @node_max]
 * from the scan results.
 *
 * The scan pass is requested first. Each matching found node is clamped to
 * the requested range (range_updated is set when clamping changed it),
 * converted to a btree pointer key and inserted into the journal keys at
 * level @level + 1.
 *
 * Returns 0 on success or the error from requesting the scan pass or from
 * bch2_journal_key_insert().
 */
int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree,
	unsigned level, struct bpos node_min, struct bpos node_max)
{
	struct find_btree_nodes *f = &c->found_btree_nodes;
	int ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes);
	if (ret)
		return ret;
	if (c->opts.verbose) {
		struct printbuf buf = PRINTBUF;
		prt_printf(&buf, "recovering %s l=%u ", bch2_btree_id_str(btree), level);
		bch2_bpos_to_text(&buf, node_min);
		prt_str(&buf, " - ");
		bch2_bpos_to_text(&buf, node_max);
		bch_info(c, "%s(): %s", __func__, buf.buf);
		printbuf_exit(&buf);
	}
	/* must be named "search": the iteration macro below refers to it by that name */
	struct found_btree_node search = {
		.btree_id = btree,
		.level = level,
		.min_key = node_min,
		.max_key = node_max,
	};
	for_each_found_btree_node_in_range(f, search, idx) {
		/* work on a copy: the clamping must not modify the scan results */
		struct found_btree_node n = f->nodes.data[idx];
		n.range_updated |= bpos_lt(n.min_key, node_min);
		n.min_key = bpos_max(n.min_key, node_min);
		n.range_updated |= bpos_gt(n.max_key, node_max);
		n.max_key = bpos_min(n.max_key, node_max);
		struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } tmp;
		found_btree_node_to_key(&tmp.k, &n);
		struct printbuf buf = PRINTBUF;
		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&tmp.k));
		bch_verbose(c, "%s(): recovering %s", __func__, buf.buf);
		printbuf_exit(&buf);
		/* a key we built ourselves failing validation is a bug, not corruption */
		BUG_ON(bch2_bkey_invalid(c, bkey_i_to_s_c(&tmp.k), BKEY_TYPE_btree, 0, NULL));
		/* the pointer to a node at @level lives one level above it */
		ret = bch2_journal_key_insert(c, btree, level + 1, &tmp.k);
		if (ret)
			return ret;
	}
	return 0;
}
/* Release the list of found nodes held in @f */
void bch2_find_btree_nodes_exit(struct find_btree_nodes *f)
{
	darray_exit(&f->nodes);
}
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_BTREE_NODE_SCAN_H
#define _BCACHEFS_BTREE_NODE_SCAN_H
int bch2_scan_for_btree_nodes(struct bch_fs *);
bool bch2_btree_node_is_stale(struct bch_fs *, struct btree *);
bool bch2_btree_has_scanned_nodes(struct bch_fs *, enum btree_id);
int bch2_get_scanned_nodes(struct bch_fs *, enum btree_id, unsigned, struct bpos, struct bpos);
void bch2_find_btree_nodes_exit(struct find_btree_nodes *);
#endif /* _BCACHEFS_BTREE_NODE_SCAN_H */
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_BTREE_NODE_SCAN_TYPES_H
#define _BCACHEFS_BTREE_NODE_SCAN_TYPES_H
#include "darray.h"
/* One btree node discovered by scanning the devices */
struct found_btree_node {
	/* min_key/max_key were changed after the node was read from disk */
	bool range_updated:1;
	/* fully covered by an overlapping node with a higher seq */
	bool overwritten:1;
	/* btree and level the node belongs to, from the on disk node header */
	u8 btree_id;
	u8 level;
	/* BTREE_NODE_SEQ() of the node; a higher seq wins when nodes overlap */
	u32 seq;
	/* keys.seq of the node; entries with the same cookie are merged as replicas */
	u64 cookie;
	/* key range covered by the node */
	struct bpos min_key;
	struct bpos max_key;
	/* number of valid entries in ptrs[] */
	unsigned nr_ptrs;
	/* one pointer per location the node was found at */
	struct bch_extent_ptr ptrs[BCH_REPLICAS_MAX];
};
/* Dynamic array of scan results */
typedef DARRAY(struct found_btree_node) found_btree_nodes;
/* State of the btree node scan, embedded in struct bch_fs as found_btree_nodes */
struct find_btree_nodes {
	/* error reported by a scan worker, 0 if none */
	int ret;
	/* taken by the scan workers around their updates of nodes and ret */
	struct mutex lock;
	/* the nodes that were found */
	found_btree_nodes nodes;
};
#endif /* _BCACHEFS_BTREE_NODE_SCAN_TYPES_H */
...@@ -887,6 +887,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, ...@@ -887,6 +887,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
int ret, unsigned long trace_ip) int ret, unsigned long trace_ip)
{ {
struct bch_fs *c = trans->c; struct bch_fs *c = trans->c;
enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
switch (ret) { switch (ret) {
case -BCH_ERR_btree_insert_btree_node_full: case -BCH_ERR_btree_insert_btree_node_full:
...@@ -905,7 +906,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, ...@@ -905,7 +906,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
* flag * flag
*/ */
if ((flags & BCH_TRANS_COMMIT_journal_reclaim) && if ((flags & BCH_TRANS_COMMIT_journal_reclaim) &&
(flags & BCH_WATERMARK_MASK) != BCH_WATERMARK_reclaim) { watermark < BCH_WATERMARK_reclaim) {
ret = -BCH_ERR_journal_reclaim_would_deadlock; ret = -BCH_ERR_journal_reclaim_would_deadlock;
break; break;
} }
......
...@@ -26,6 +26,13 @@ ...@@ -26,6 +26,13 @@
#include <linux/random.h> #include <linux/random.h>
/*
 * Printable names for enum btree_update_mode, indexed by mode and
 * NULL terminated.
 *
 * Generated from BTREE_UPDATE_MODES() - the same list the enum is built
 * from - so that the names always line up with the enum values.
 */
const char * const bch2_btree_update_modes[] = {
#define x(t) #t,
	BTREE_UPDATE_MODES()
#undef x
	NULL
};
static int bch2_btree_insert_node(struct btree_update *, struct btree_trans *, static int bch2_btree_insert_node(struct btree_update *, struct btree_trans *,
btree_path_idx_t, struct btree *, struct keylist *); btree_path_idx_t, struct btree *, struct keylist *);
static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *); static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *);
...@@ -303,7 +310,7 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, ...@@ -303,7 +310,7 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,
struct open_buckets obs = { .nr = 0 }; struct open_buckets obs = { .nr = 0 };
struct bch_devs_list devs_have = (struct bch_devs_list) { 0 }; struct bch_devs_list devs_have = (struct bch_devs_list) { 0 };
enum bch_watermark watermark = flags & BCH_WATERMARK_MASK; enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
unsigned nr_reserve = watermark > BCH_WATERMARK_reclaim unsigned nr_reserve = watermark < BCH_WATERMARK_reclaim
? BTREE_NODE_RESERVE ? BTREE_NODE_RESERVE
: 0; : 0;
int ret; int ret;
...@@ -687,7 +694,7 @@ static void btree_update_nodes_written(struct btree_update *as) ...@@ -687,7 +694,7 @@ static void btree_update_nodes_written(struct btree_update *as)
* which may require allocations as well. * which may require allocations as well.
*/ */
ret = commit_do(trans, &as->disk_res, &journal_seq, ret = commit_do(trans, &as->disk_res, &journal_seq,
BCH_WATERMARK_reclaim| BCH_WATERMARK_interior_updates|
BCH_TRANS_COMMIT_no_enospc| BCH_TRANS_COMMIT_no_enospc|
BCH_TRANS_COMMIT_no_check_rw| BCH_TRANS_COMMIT_no_check_rw|
BCH_TRANS_COMMIT_journal_reclaim, BCH_TRANS_COMMIT_journal_reclaim,
...@@ -846,11 +853,11 @@ static void btree_update_updated_node(struct btree_update *as, struct btree *b) ...@@ -846,11 +853,11 @@ static void btree_update_updated_node(struct btree_update *as, struct btree *b)
mutex_lock(&c->btree_interior_update_lock); mutex_lock(&c->btree_interior_update_lock);
list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten); list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);
BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE); BUG_ON(as->mode != BTREE_UPDATE_none);
BUG_ON(!btree_node_dirty(b)); BUG_ON(!btree_node_dirty(b));
BUG_ON(!b->c.level); BUG_ON(!b->c.level);
as->mode = BTREE_INTERIOR_UPDATING_NODE; as->mode = BTREE_UPDATE_node;
as->b = b; as->b = b;
set_btree_node_write_blocked(b); set_btree_node_write_blocked(b);
...@@ -873,7 +880,7 @@ static void btree_update_reparent(struct btree_update *as, ...@@ -873,7 +880,7 @@ static void btree_update_reparent(struct btree_update *as,
lockdep_assert_held(&c->btree_interior_update_lock); lockdep_assert_held(&c->btree_interior_update_lock);
child->b = NULL; child->b = NULL;
child->mode = BTREE_INTERIOR_UPDATING_AS; child->mode = BTREE_UPDATE_update;
bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal, bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal,
bch2_update_reparent_journal_pin_flush); bch2_update_reparent_journal_pin_flush);
...@@ -884,7 +891,7 @@ static void btree_update_updated_root(struct btree_update *as, struct btree *b) ...@@ -884,7 +891,7 @@ static void btree_update_updated_root(struct btree_update *as, struct btree *b)
struct bkey_i *insert = &b->key; struct bkey_i *insert = &b->key;
struct bch_fs *c = as->c; struct bch_fs *c = as->c;
BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE); BUG_ON(as->mode != BTREE_UPDATE_none);
BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) > BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) >
ARRAY_SIZE(as->journal_entries)); ARRAY_SIZE(as->journal_entries));
...@@ -898,7 +905,7 @@ static void btree_update_updated_root(struct btree_update *as, struct btree *b) ...@@ -898,7 +905,7 @@ static void btree_update_updated_root(struct btree_update *as, struct btree *b)
mutex_lock(&c->btree_interior_update_lock); mutex_lock(&c->btree_interior_update_lock);
list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten); list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);
as->mode = BTREE_INTERIOR_UPDATING_ROOT; as->mode = BTREE_UPDATE_root;
mutex_unlock(&c->btree_interior_update_lock); mutex_unlock(&c->btree_interior_update_lock);
} }
...@@ -1076,7 +1083,7 @@ static void bch2_btree_update_done(struct btree_update *as, struct btree_trans * ...@@ -1076,7 +1083,7 @@ static void bch2_btree_update_done(struct btree_update *as, struct btree_trans *
struct bch_fs *c = as->c; struct bch_fs *c = as->c;
u64 start_time = as->start_time; u64 start_time = as->start_time;
BUG_ON(as->mode == BTREE_INTERIOR_NO_UPDATE); BUG_ON(as->mode == BTREE_UPDATE_none);
if (as->took_gc_lock) if (as->took_gc_lock)
up_read(&as->c->gc_lock); up_read(&as->c->gc_lock);
...@@ -1121,7 +1128,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, ...@@ -1121,7 +1128,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
unsigned journal_flags = watermark|JOURNAL_RES_GET_CHECK; unsigned journal_flags = watermark|JOURNAL_RES_GET_CHECK;
if ((flags & BCH_TRANS_COMMIT_journal_reclaim) && if ((flags & BCH_TRANS_COMMIT_journal_reclaim) &&
watermark != BCH_WATERMARK_reclaim) watermark < BCH_WATERMARK_reclaim)
journal_flags |= JOURNAL_RES_GET_NONBLOCK; journal_flags |= JOURNAL_RES_GET_NONBLOCK;
ret = drop_locks_do(trans, ret = drop_locks_do(trans,
...@@ -1172,7 +1179,8 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, ...@@ -1172,7 +1179,8 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
as->c = c; as->c = c;
as->start_time = start_time; as->start_time = start_time;
as->ip_started = _RET_IP_; as->ip_started = _RET_IP_;
as->mode = BTREE_INTERIOR_NO_UPDATE; as->mode = BTREE_UPDATE_none;
as->watermark = watermark;
as->took_gc_lock = true; as->took_gc_lock = true;
as->btree_id = path->btree_id; as->btree_id = path->btree_id;
as->update_level = update_level; as->update_level = update_level;
...@@ -1217,7 +1225,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, ...@@ -1217,7 +1225,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
*/ */
if (bch2_err_matches(ret, ENOSPC) && if (bch2_err_matches(ret, ENOSPC) &&
(flags & BCH_TRANS_COMMIT_journal_reclaim) && (flags & BCH_TRANS_COMMIT_journal_reclaim) &&
watermark != BCH_WATERMARK_reclaim) { watermark < BCH_WATERMARK_reclaim) {
ret = -BCH_ERR_journal_reclaim_would_deadlock; ret = -BCH_ERR_journal_reclaim_would_deadlock;
goto err; goto err;
} }
...@@ -2458,7 +2466,7 @@ void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b) ...@@ -2458,7 +2466,7 @@ void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b)
bch2_btree_set_root_inmem(c, b); bch2_btree_set_root_inmem(c, b);
} }
static int __bch2_btree_root_alloc(struct btree_trans *trans, enum btree_id id) static int __bch2_btree_root_alloc_fake(struct btree_trans *trans, enum btree_id id, unsigned level)
{ {
struct bch_fs *c = trans->c; struct bch_fs *c = trans->c;
struct closure cl; struct closure cl;
...@@ -2477,7 +2485,7 @@ static int __bch2_btree_root_alloc(struct btree_trans *trans, enum btree_id id) ...@@ -2477,7 +2485,7 @@ static int __bch2_btree_root_alloc(struct btree_trans *trans, enum btree_id id)
set_btree_node_fake(b); set_btree_node_fake(b);
set_btree_node_need_rewrite(b); set_btree_node_need_rewrite(b);
b->c.level = 0; b->c.level = level;
b->c.btree_id = id; b->c.btree_id = id;
bkey_btree_ptr_init(&b->key); bkey_btree_ptr_init(&b->key);
...@@ -2504,9 +2512,21 @@ static int __bch2_btree_root_alloc(struct btree_trans *trans, enum btree_id id) ...@@ -2504,9 +2512,21 @@ static int __bch2_btree_root_alloc(struct btree_trans *trans, enum btree_id id)
return 0; return 0;
} }
void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id) void bch2_btree_root_alloc_fake(struct bch_fs *c, enum btree_id id, unsigned level)
{ {
bch2_trans_run(c, __bch2_btree_root_alloc(trans, id)); bch2_trans_run(c, __bch2_btree_root_alloc_fake(trans, id, level));
}
static void bch2_btree_update_to_text(struct printbuf *out, struct btree_update *as)
{
prt_printf(out, "%ps: btree=%s watermark=%s mode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n",
(void *) as->ip_started,
bch2_btree_id_str(as->btree_id),
bch2_watermarks[as->watermark],
bch2_btree_update_modes[as->mode],
as->nodes_written,
closure_nr_remaining(&as->cl),
as->journal.seq);
} }
void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c) void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c)
...@@ -2515,12 +2535,7 @@ void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c) ...@@ -2515,12 +2535,7 @@ void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c)
mutex_lock(&c->btree_interior_update_lock); mutex_lock(&c->btree_interior_update_lock);
list_for_each_entry(as, &c->btree_interior_update_list, list) list_for_each_entry(as, &c->btree_interior_update_list, list)
prt_printf(out, "%ps: mode=%u nodes_written=%u cl.remaining=%u journal_seq=%llu\n", bch2_btree_update_to_text(out, as);
(void *) as->ip_started,
as->mode,
as->nodes_written,
closure_nr_remaining(&as->cl),
as->journal.seq);
mutex_unlock(&c->btree_interior_update_lock); mutex_unlock(&c->btree_interior_update_lock);
} }
......
...@@ -12,6 +12,18 @@ ...@@ -12,6 +12,18 @@
int bch2_btree_node_check_topology(struct btree_trans *, struct btree *); int bch2_btree_node_check_topology(struct btree_trans *, struct btree *);
/*
 * What kind of update a struct btree_update is doing (x-macro list):
 *
 *  none   - initial mode, set in bch2_btree_update_start()
 *  node   - set by btree_update_updated_node()
 *  root   - set by btree_update_updated_root()
 *  update - set on the child update by btree_update_reparent()
 */
#define BTREE_UPDATE_MODES() \
	x(none) \
	x(node) \
	x(root) \
	x(update)
/* BTREE_UPDATE_none, BTREE_UPDATE_node, ... - one value per list entry above */
enum btree_update_mode {
#define x(n) BTREE_UPDATE_##n,
	BTREE_UPDATE_MODES()
#undef x
};
/* /*
* Tracks an in progress split/rewrite of a btree node and the update to the * Tracks an in progress split/rewrite of a btree node and the update to the
* parent node: * parent node:
...@@ -39,14 +51,8 @@ struct btree_update { ...@@ -39,14 +51,8 @@ struct btree_update {
struct list_head list; struct list_head list;
struct list_head unwritten_list; struct list_head unwritten_list;
/* What kind of update are we doing? */ enum btree_update_mode mode;
enum { enum bch_watermark watermark;
BTREE_INTERIOR_NO_UPDATE,
BTREE_INTERIOR_UPDATING_NODE,
BTREE_INTERIOR_UPDATING_ROOT,
BTREE_INTERIOR_UPDATING_AS,
} mode;
unsigned nodes_written:1; unsigned nodes_written:1;
unsigned took_gc_lock:1; unsigned took_gc_lock:1;
...@@ -56,7 +62,7 @@ struct btree_update { ...@@ -56,7 +62,7 @@ struct btree_update {
struct disk_reservation disk_res; struct disk_reservation disk_res;
/* /*
* BTREE_INTERIOR_UPDATING_NODE: * BTREE_UPDATE_node:
* The update that made the new nodes visible was a regular update to an * The update that made the new nodes visible was a regular update to an
* existing interior node - @b. We can't write out the update to @b * existing interior node - @b. We can't write out the update to @b
* until the new nodes we created are finished writing, so we block @b * until the new nodes we created are finished writing, so we block @b
...@@ -165,7 +171,7 @@ int bch2_btree_node_update_key_get_iter(struct btree_trans *, struct btree *, ...@@ -165,7 +171,7 @@ int bch2_btree_node_update_key_get_iter(struct btree_trans *, struct btree *,
struct bkey_i *, unsigned, bool); struct bkey_i *, unsigned, bool);
void bch2_btree_set_root_for_read(struct bch_fs *, struct btree *); void bch2_btree_set_root_for_read(struct bch_fs *, struct btree *);
void bch2_btree_root_alloc(struct bch_fs *, enum btree_id); void bch2_btree_root_alloc_fake(struct bch_fs *, enum btree_id, unsigned);
static inline unsigned btree_update_reserve_required(struct bch_fs *c, static inline unsigned btree_update_reserve_required(struct bch_fs *c,
struct btree *b) struct btree *b)
......
...@@ -226,6 +226,7 @@ static inline u64 bch2_dev_buckets_reserved(struct bch_dev *ca, enum bch_waterma ...@@ -226,6 +226,7 @@ static inline u64 bch2_dev_buckets_reserved(struct bch_dev *ca, enum bch_waterma
fallthrough; fallthrough;
case BCH_WATERMARK_btree_copygc: case BCH_WATERMARK_btree_copygc:
case BCH_WATERMARK_reclaim: case BCH_WATERMARK_reclaim:
case BCH_WATERMARK_interior_updates:
break; break;
} }
......
...@@ -580,8 +580,7 @@ int bch2_data_update_init(struct btree_trans *trans, ...@@ -580,8 +580,7 @@ int bch2_data_update_init(struct btree_trans *trans,
move_ctxt_wait_event(ctxt, move_ctxt_wait_event(ctxt,
(locked = bch2_bucket_nocow_trylock(&c->nocow_locks, (locked = bch2_bucket_nocow_trylock(&c->nocow_locks,
PTR_BUCKET_POS(c, &p.ptr), 0)) || PTR_BUCKET_POS(c, &p.ptr), 0)) ||
(!atomic_read(&ctxt->read_sectors) && list_empty(&ctxt->ios));
!atomic_read(&ctxt->write_sectors)));
if (!locked) if (!locked)
bch2_bucket_nocow_lock(&c->nocow_locks, bch2_bucket_nocow_lock(&c->nocow_locks,
......
...@@ -978,23 +978,8 @@ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k) ...@@ -978,23 +978,8 @@ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k)
return bkey_deleted(k.k); return bkey_deleted(k.k);
} }
void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struct bch_extent_ptr *ptr)
struct bkey_s_c k)
{ {
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
bool first = true;
if (c)
prt_printf(out, "durability: %u ", bch2_bkey_durability_safe(c, k));
bkey_extent_entry_for_each(ptrs, entry) {
if (!first)
prt_printf(out, " ");
switch (__extent_entry_type(entry)) {
case BCH_EXTENT_ENTRY_ptr: {
const struct bch_extent_ptr *ptr = entry_to_ptr(entry);
struct bch_dev *ca = c && ptr->dev < c->sb.nr_devices && c->devs[ptr->dev] struct bch_dev *ca = c && ptr->dev < c->sb.nr_devices && c->devs[ptr->dev]
? bch_dev_bkey_exists(c, ptr->dev) ? bch_dev_bkey_exists(c, ptr->dev)
: NULL; : NULL;
...@@ -1016,8 +1001,27 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, ...@@ -1016,8 +1001,27 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
if (ca && ptr_stale(ca, ptr)) if (ca && ptr_stale(ca, ptr))
prt_printf(out, " stale"); prt_printf(out, " stale");
} }
}
void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
struct bkey_s_c k)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
bool first = true;
if (c)
prt_printf(out, "durability: %u ", bch2_bkey_durability_safe(c, k));
bkey_extent_entry_for_each(ptrs, entry) {
if (!first)
prt_printf(out, " ");
switch (__extent_entry_type(entry)) {
case BCH_EXTENT_ENTRY_ptr:
bch2_extent_ptr_to_text(out, c, entry_to_ptr(entry));
break; break;
}
case BCH_EXTENT_ENTRY_crc32: case BCH_EXTENT_ENTRY_crc32:
case BCH_EXTENT_ENTRY_crc64: case BCH_EXTENT_ENTRY_crc64:
case BCH_EXTENT_ENTRY_crc128: { case BCH_EXTENT_ENTRY_crc128: {
......
...@@ -676,6 +676,7 @@ bch2_extent_has_ptr(struct bkey_s_c, struct extent_ptr_decoded, struct bkey_s); ...@@ -676,6 +676,7 @@ bch2_extent_has_ptr(struct bkey_s_c, struct extent_ptr_decoded, struct bkey_s);
void bch2_extent_ptr_set_cached(struct bkey_s, struct bch_extent_ptr *); void bch2_extent_ptr_set_cached(struct bkey_s, struct bch_extent_ptr *);
bool bch2_extent_normalize(struct bch_fs *, struct bkey_s); bool bch2_extent_normalize(struct bch_fs *, struct bkey_s);
void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *, const struct bch_extent_ptr *);
void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *, void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *,
struct bkey_s_c); struct bkey_s_c);
int bch2_bkey_ptrs_invalid(struct bch_fs *, struct bkey_s_c, int bch2_bkey_ptrs_invalid(struct bch_fs *, struct bkey_s_c,
......
// SPDX-License-Identifier: GPL-2.0
#include "eytzinger.h"
/**
 * is_aligned - can elements be copied with word-wide loads and stores?
 * @base: pointer to data
 * @size: size of each element
 * @align: required alignment (typically 4 or 8)
 *
 * Returns true when the low bits of @size selected by (@align - 1) are all
 * clear and, unless CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS is defined, the
 * same low bits of @base are clear too.
 *
 * The low bits of @size and @base are OR-ed together so one mask test covers
 * both; this is done by hand because gcc does not turn
 * "if (a & mask || b & mask)" into "if ((a | b) & mask)" on its own.
 */
__attribute_const__ __always_inline
static bool is_aligned(const void *base, size_t size, unsigned char align)
{
	unsigned char low = (unsigned char)size;

#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
	low |= (unsigned char)(uintptr_t)base;
#endif
	/* keeps @base "used" when the block above is compiled out */
	(void)base;

	return !(low & (align - 1));
}
/**
* swap_words_32 - swap two elements in 32-bit chunks
* @a: pointer to the first element to swap
* @b: pointer to the second element to swap
* @n: element size (must be a multiple of 4)
*
* Exchange the two objects in memory. This exploits base+index addressing,
* which basically all CPUs have, to minimize loop overhead computations.
*
* For some reason, on x86 gcc 7.3.0 adds a redundant test of n at the
* bottom of the loop, even though the zero flag is still valid from the
* subtract (since the intervening mov instructions don't alter the flags).
* Gcc 8.1.0 doesn't have that problem.
*/
static void swap_words_32(void *a, void *b, size_t n)
{
do {
u32 t = *(u32 *)(a + (n -= 4));
*(u32 *)(a + n) = *(u32 *)(b + n);
*(u32 *)(b + n) = t;
} while (n);
}
/**
 * swap_words_64 - exchange two elements 64 bits at a time
 * @a: pointer to the first element to swap
 * @b: pointer to the second element to swap
 * @n: element size (must be a multiple of 8)
 *
 * Walks both objects from the end down to offset 0, using the same byte
 * offset into @a and @b on every step. The loop body runs before @n is
 * tested, so @n must be non-zero.
 *
 * With CONFIG_64BIT each step moves one u64. Otherwise each step moves two
 * u32 halves one after the other, which avoids needing base+index+4
 * addressing.
 */
static void swap_words_64(void *a, void *b, size_t n)
{
	do {
#ifdef CONFIG_64BIT
		n -= 8;

		u64 *qa = (u64 *)(a + n);
		u64 *qb = (u64 *)(b + n);
		u64 tmp = *qa;

		*qa = *qb;
		*qb = tmp;
#else
		/* Upper half of the 64-bit chunk first, then the lower half */
		u32 *wa, *wb;
		u32 tmp;

		n -= 4;
		wa = (u32 *)(a + n);
		wb = (u32 *)(b + n);
		tmp = *wa;
		*wa = *wb;
		*wb = tmp;

		n -= 4;
		wa = (u32 *)(a + n);
		wb = (u32 *)(b + n);
		tmp = *wa;
		*wa = *wb;
		*wb = tmp;
#endif
	} while (n);
}
/**
 * swap_bytes - exchange two elements one byte at a time
 * @a: pointer to the first element to swap
 * @b: pointer to the second element to swap
 * @n: element size
 *
 * Fallback used when alignment rules out the word-sized variants. Bytes are
 * exchanged from the last one down to offset 0; the loop body runs before @n
 * is tested, so @n must be non-zero.
 */
static void swap_bytes(void *a, void *b, size_t n)
{
	char *pa = a;
	char *pb = b;

	do {
		--n;

		char tmp = pa[n];

		pa[n] = pb[n];
		pb[n] = tmp;
	} while (n);
}
/*
* The values are arbitrary as long as they can't be confused with
* a pointer, but small integers make for the smallest compare
* instructions.
*/
#define SWAP_WORDS_64 (swap_r_func_t)0
#define SWAP_WORDS_32 (swap_r_func_t)1
#define SWAP_BYTES (swap_r_func_t)2
#define SWAP_WRAPPER (swap_r_func_t)3
/*
 * Bundles the priv-less callbacks given to eytzinger0_sort() so that they can
 * be handed to eytzinger0_sort_r() through its @priv pointer; do_cmp() and
 * do_swap() unpack them when they see _CMP_WRAPPER / SWAP_WRAPPER.
 */
struct wrapper {
/* comparison callback, invoked as cmp(a, b) */
cmp_func_t cmp;
/* swap callback, invoked as swap(a, b, size); NULL selects a built-in swap */
swap_func_t swap;
};
/*
 * Exchange the two @size-byte elements at @a and @b.
 *
 * @swap_func is either one of the small-integer sentinels (SWAP_WRAPPER,
 * SWAP_WORDS_64, SWAP_WORDS_32, SWAP_BYTES) or a real callback:
 *  - SWAP_WRAPPER: @priv points to a struct wrapper whose ->swap is called
 *  - SWAP_WORDS_64 / SWAP_WORDS_32 / SWAP_BYTES: the matching built-in helper
 *  - anything else: called directly, with @priv passed through
 */
static void do_swap(void *a, void *b, size_t size, swap_r_func_t swap_func, const void *priv)
{
	if (swap_func == SWAP_WRAPPER) {
		const struct wrapper *w = priv;

		w->swap(a, b, (int)size);
	} else if (swap_func == SWAP_WORDS_64) {
		swap_words_64(a, b, size);
	} else if (swap_func == SWAP_WORDS_32) {
		swap_words_32(a, b, size);
	} else if (swap_func == SWAP_BYTES) {
		swap_bytes(a, b, size);
	} else {
		swap_func(a, b, (int)size, priv);
	}
}
#define _CMP_WRAPPER ((cmp_r_func_t)0L)
/*
 * Compare the elements at @a and @b.
 *
 * When @cmp is the _CMP_WRAPPER sentinel, @priv points to a struct wrapper
 * and its priv-less ->cmp is used; otherwise @cmp is called directly with
 * @priv passed through.
 */
static int do_cmp(const void *a, const void *b, cmp_r_func_t cmp, const void *priv)
{
	if (cmp != _CMP_WRAPPER)
		return cmp(a, b, priv);

	const struct wrapper *w = priv;

	return w->cmp(a, b);
}
/*
 * Compare the elements at in-order positions @l and @r of the @n-element
 * array at @base; each position is translated to its array slot with
 * inorder_to_eytzinger0() before the elements are handed to do_cmp().
 */
static inline int eytzinger0_do_cmp(void *base, size_t n, size_t size,
				    cmp_r_func_t cmp_func, const void *priv,
				    size_t l, size_t r)
{
	const void *lhs = base + inorder_to_eytzinger0(l, n) * size;
	const void *rhs = base + inorder_to_eytzinger0(r, n) * size;

	return do_cmp(lhs, rhs, cmp_func, priv);
}
/*
 * Exchange the elements at in-order positions @l and @r of the @n-element
 * array at @base; each position is translated to its array slot with
 * inorder_to_eytzinger0() before the elements are handed to do_swap().
 */
static inline void eytzinger0_do_swap(void *base, size_t n, size_t size,
				      swap_r_func_t swap_func, const void *priv,
				      size_t l, size_t r)
{
	void *lhs = base + inorder_to_eytzinger0(l, n) * size;
	void *rhs = base + inorder_to_eytzinger0(r, n) * size;

	do_swap(lhs, rhs, size, swap_func, priv);
}
/*
 * eytzinger0_sort_r - heapsort an array into eytzinger (zero based) order
 * @base: start of the array
 * @n: number of elements
 * @size: size of each element in bytes
 * @cmp_func: three-way comparison callback (or the _CMP_WRAPPER sentinel)
 * @swap_func: swap callback, a SWAP_* sentinel, or NULL to pick a built-in
 * @priv: opaque pointer handed to the callbacks
 *
 * The heapsort below works on in-order positions 0..n-1; every comparison and
 * swap goes through eytzinger0_do_cmp()/eytzinger0_do_swap(), which translate
 * those positions to array slots with inorder_to_eytzinger0().
 */
void eytzinger0_sort_r(void *base, size_t n, size_t size,
cmp_r_func_t cmp_func,
swap_r_func_t swap_func,
const void *priv)
{
/* signed, so the "i >= 0" loop below can terminate by going negative */
int i, c, r;
/* called from 'sort' without swap function, let's pick the default */
if (swap_func == SWAP_WRAPPER && !((struct wrapper *)priv)->swap)
swap_func = NULL;
/* no swap callback: choose the widest built-in swap the alignment allows */
if (!swap_func) {
if (is_aligned(base, size, 8))
swap_func = SWAP_WORDS_64;
else if (is_aligned(base, size, 4))
swap_func = SWAP_WORDS_32;
else
swap_func = SWAP_BYTES;
}
/* heapify */
/* sift each parent down, starting from the last position that has a child */
for (i = n / 2 - 1; i >= 0; --i) {
for (r = i; r * 2 + 1 < n; r = c) {
c = r * 2 + 1;
/* pick the larger of the two children */
if (c + 1 < n &&
eytzinger0_do_cmp(base, n, size, cmp_func, priv, c, c + 1) < 0)
c++;
/* parent already >= larger child: heap property holds here */
if (eytzinger0_do_cmp(base, n, size, cmp_func, priv, r, c) >= 0)
break;
eytzinger0_do_swap(base, n, size, swap_func, priv, r, c);
}
}
/* sort */
/* move the current maximum to position i, then re-heapify positions [0, i) */
for (i = n - 1; i > 0; --i) {
eytzinger0_do_swap(base, n, size, swap_func, priv, 0, i);
for (r = 0; r * 2 + 1 < i; r = c) {
c = r * 2 + 1;
if (c + 1 < i &&
eytzinger0_do_cmp(base, n, size, cmp_func, priv, c, c + 1) < 0)
c++;
if (eytzinger0_do_cmp(base, n, size, cmp_func, priv, r, c) >= 0)
break;
eytzinger0_do_swap(base, n, size, swap_func, priv, r, c);
}
}
}
/*
 * eytzinger0_sort - sort an array into eytzinger (zero based) order
 * @base: start of the array
 * @n: number of elements
 * @size: size of each element in bytes
 * @cmp_func: comparison callback taking only the two elements
 * @swap_func: swap callback taking the two elements and the size; may be NULL
 *
 * Convenience front end for eytzinger0_sort_r(): the two callbacks are packed
 * into a struct wrapper that is passed as the priv pointer, and the
 * _CMP_WRAPPER / SWAP_WRAPPER sentinels tell the _r variant to unpack them.
 */
void eytzinger0_sort(void *base, size_t n, size_t size,
		     cmp_func_t cmp_func,
		     swap_func_t swap_func)
{
	struct wrapper w;

	w.cmp = cmp_func;
	w.swap = swap_func;

	eytzinger0_sort_r(base, n, size, _CMP_WRAPPER, SWAP_WRAPPER, &w);
}
...@@ -5,23 +5,33 @@ ...@@ -5,23 +5,33 @@
#include <linux/bitops.h> #include <linux/bitops.h>
#include <linux/log2.h> #include <linux/log2.h>
#include "util.h" #ifdef EYTZINGER_DEBUG
#define EYTZINGER_BUG_ON(cond) BUG_ON(cond)
#else
#define EYTZINGER_BUG_ON(cond)
#endif
/* /*
* Traversal for trees in eytzinger layout - a full binary tree layed out in an * Traversal for trees in eytzinger layout - a full binary tree layed out in an
* array * array.
*/
/*
* One based indexing version:
* *
* With one based indexing each level of the tree starts at a power of two - * Consider using an eytzinger tree any time you would otherwise be doing binary
* good for cacheline alignment: * search over an array. Binary search is a worst case scenario for branch
* prediction and prefetching, but in an eytzinger tree every node's children
* are adjacent in memory, thus we can prefetch children before knowing the
* result of the comparison, assuming multiple nodes fit on a cacheline.
*
* Two variants are provided, for one based indexing and zero based indexing.
*
* Zero based indexing is more convenient, but one based indexing has better
* alignment and thus better performance because each new level of the tree
* starts at a power of two, and thus if element 0 was cacheline aligned, each
* new level will be as well.
*/ */
static inline unsigned eytzinger1_child(unsigned i, unsigned child) static inline unsigned eytzinger1_child(unsigned i, unsigned child)
{ {
EBUG_ON(child > 1); EYTZINGER_BUG_ON(child > 1);
return (i << 1) + child; return (i << 1) + child;
} }
...@@ -58,7 +68,7 @@ static inline unsigned eytzinger1_last(unsigned size) ...@@ -58,7 +68,7 @@ static inline unsigned eytzinger1_last(unsigned size)
static inline unsigned eytzinger1_next(unsigned i, unsigned size) static inline unsigned eytzinger1_next(unsigned i, unsigned size)
{ {
EBUG_ON(i > size); EYTZINGER_BUG_ON(i > size);
if (eytzinger1_right_child(i) <= size) { if (eytzinger1_right_child(i) <= size) {
i = eytzinger1_right_child(i); i = eytzinger1_right_child(i);
...@@ -74,7 +84,7 @@ static inline unsigned eytzinger1_next(unsigned i, unsigned size) ...@@ -74,7 +84,7 @@ static inline unsigned eytzinger1_next(unsigned i, unsigned size)
static inline unsigned eytzinger1_prev(unsigned i, unsigned size) static inline unsigned eytzinger1_prev(unsigned i, unsigned size)
{ {
EBUG_ON(i > size); EYTZINGER_BUG_ON(i > size);
if (eytzinger1_left_child(i) <= size) { if (eytzinger1_left_child(i) <= size) {
i = eytzinger1_left_child(i) + 1; i = eytzinger1_left_child(i) + 1;
...@@ -101,7 +111,7 @@ static inline unsigned __eytzinger1_to_inorder(unsigned i, unsigned size, ...@@ -101,7 +111,7 @@ static inline unsigned __eytzinger1_to_inorder(unsigned i, unsigned size,
unsigned shift = __fls(size) - b; unsigned shift = __fls(size) - b;
int s; int s;
EBUG_ON(!i || i > size); EYTZINGER_BUG_ON(!i || i > size);
i ^= 1U << b; i ^= 1U << b;
i <<= 1; i <<= 1;
...@@ -126,7 +136,7 @@ static inline unsigned __inorder_to_eytzinger1(unsigned i, unsigned size, ...@@ -126,7 +136,7 @@ static inline unsigned __inorder_to_eytzinger1(unsigned i, unsigned size,
unsigned shift; unsigned shift;
int s; int s;
EBUG_ON(!i || i > size); EYTZINGER_BUG_ON(!i || i > size);
/* /*
* sign bit trick: * sign bit trick:
...@@ -164,7 +174,7 @@ static inline unsigned inorder_to_eytzinger1(unsigned i, unsigned size) ...@@ -164,7 +174,7 @@ static inline unsigned inorder_to_eytzinger1(unsigned i, unsigned size)
static inline unsigned eytzinger0_child(unsigned i, unsigned child) static inline unsigned eytzinger0_child(unsigned i, unsigned child)
{ {
EBUG_ON(child > 1); EYTZINGER_BUG_ON(child > 1);
return (i << 1) + 1 + child; return (i << 1) + 1 + child;
} }
...@@ -231,11 +241,9 @@ static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size) ...@@ -231,11 +241,9 @@ static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size)
(_i) != -1; \ (_i) != -1; \
(_i) = eytzinger0_next((_i), (_size))) (_i) = eytzinger0_next((_i), (_size)))
typedef int (*eytzinger_cmp_fn)(const void *l, const void *r, size_t size);
/* return greatest node <= @search, or -1 if not found */ /* return greatest node <= @search, or -1 if not found */
static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size, static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size,
eytzinger_cmp_fn cmp, const void *search) cmp_func_t cmp, const void *search)
{ {
unsigned i, n = 0; unsigned i, n = 0;
...@@ -244,21 +252,24 @@ static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size, ...@@ -244,21 +252,24 @@ static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size,
do { do {
i = n; i = n;
n = eytzinger0_child(i, cmp(search, base + i * size, size) >= 0); n = eytzinger0_child(i, cmp(base + i * size, search) <= 0);
} while (n < nr); } while (n < nr);
if (n & 1) { if (n & 1) {
/* @i was greater than @search, return previous node: */ /* @i was greater than @search, return previous node: */
if (i == eytzinger0_first(nr))
return -1;
return eytzinger0_prev(i, nr); return eytzinger0_prev(i, nr);
} else { } else {
return i; return i;
} }
} }
/*
 * Return the node that follows, in sort order, the one eytzinger0_find_le()
 * returns for @search - i.e. the smallest node greater than @search.
 *
 * @base: start of the eytzinger0 array
 * @nr: number of elements in the array
 * @size: size of each element in bytes
 * @cmp: comparison callback
 * @search: key to look for
 *
 * NOTE(review): the result when eytzinger0_find_le() returns -1 depends on
 * how eytzinger0_next() treats that index - confirm against its definition.
 */
static inline ssize_t eytzinger0_find_gt(void *base, size_t nr, size_t size,
					 cmp_func_t cmp, const void *search)
{
	ssize_t idx = eytzinger0_find_le(base, nr, size, cmp, search);

	/*
	 * The successor has to be computed over the @nr nodes of the tree;
	 * @size is the element size in bytes and says nothing about the
	 * tree's shape.
	 */
	return eytzinger0_next(idx, nr);
}
#define eytzinger0_find(base, nr, size, _cmp, search) \ #define eytzinger0_find(base, nr, size, _cmp, search) \
({ \ ({ \
void *_base = (base); \ void *_base = (base); \
...@@ -269,13 +280,13 @@ static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size, ...@@ -269,13 +280,13 @@ static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size,
int _res; \ int _res; \
\ \
while (_i < _nr && \ while (_i < _nr && \
(_res = _cmp(_search, _base + _i * _size, _size))) \ (_res = _cmp(_search, _base + _i * _size))) \
_i = eytzinger0_child(_i, _res > 0); \ _i = eytzinger0_child(_i, _res > 0); \
_i; \ _i; \
}) })
void eytzinger0_sort(void *, size_t, size_t, void eytzinger0_sort_r(void *, size_t, size_t,
int (*cmp_func)(const void *, const void *, size_t), cmp_r_func_t, swap_r_func_t, const void *);
void (*swap_func)(void *, void *, size_t)); void eytzinger0_sort(void *, size_t, size_t, cmp_func_t, swap_func_t);
#endif /* _EYTZINGER_H */ #endif /* _EYTZINGER_H */
...@@ -63,9 +63,7 @@ static int subvol_lookup(struct btree_trans *trans, u32 subvol, ...@@ -63,9 +63,7 @@ static int subvol_lookup(struct btree_trans *trans, u32 subvol,
u32 *snapshot, u64 *inum) u32 *snapshot, u64 *inum)
{ {
struct bch_subvolume s; struct bch_subvolume s;
int ret; int ret = bch2_subvolume_get(trans, subvol, false, 0, &s);
ret = bch2_subvolume_get(trans, subvol, false, 0, &s);
*snapshot = le32_to_cpu(s.snapshot); *snapshot = le32_to_cpu(s.snapshot);
*inum = le64_to_cpu(s.inode); *inum = le64_to_cpu(s.inode);
...@@ -170,7 +168,8 @@ static int __remove_dirent(struct btree_trans *trans, struct bpos pos) ...@@ -170,7 +168,8 @@ static int __remove_dirent(struct btree_trans *trans, struct bpos pos)
/* Get lost+found, create if it doesn't exist: */ /* Get lost+found, create if it doesn't exist: */
static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
struct bch_inode_unpacked *lostfound) struct bch_inode_unpacked *lostfound,
u64 reattaching_inum)
{ {
struct bch_fs *c = trans->c; struct bch_fs *c = trans->c;
struct qstr lostfound_str = QSTR("lost+found"); struct qstr lostfound_str = QSTR("lost+found");
...@@ -185,19 +184,36 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, ...@@ -185,19 +184,36 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
return ret; return ret;
subvol_inum root_inum = { .subvol = le32_to_cpu(st.master_subvol) }; subvol_inum root_inum = { .subvol = le32_to_cpu(st.master_subvol) };
u32 subvol_snapshot;
ret = subvol_lookup(trans, le32_to_cpu(st.master_subvol), struct bch_subvolume subvol;
&subvol_snapshot, &root_inum.inum); ret = bch2_subvolume_get(trans, le32_to_cpu(st.master_subvol),
bch_err_msg(c, ret, "looking up root subvol"); false, 0, &subvol);
bch_err_msg(c, ret, "looking up root subvol %u for snapshot %u",
le32_to_cpu(st.master_subvol), snapshot);
if (ret)
return ret;
if (!subvol.inode) {
struct btree_iter iter;
struct bkey_i_subvolume *subvol = bch2_bkey_get_mut_typed(trans, &iter,
BTREE_ID_subvolumes, POS(0, le32_to_cpu(st.master_subvol)),
0, subvolume);
ret = PTR_ERR_OR_ZERO(subvol);
if (ret) if (ret)
return ret; return ret;
subvol->v.inode = cpu_to_le64(reattaching_inum);
bch2_trans_iter_exit(trans, &iter);
}
root_inum.inum = le64_to_cpu(subvol.inode);
struct bch_inode_unpacked root_inode; struct bch_inode_unpacked root_inode;
struct bch_hash_info root_hash_info; struct bch_hash_info root_hash_info;
u32 root_inode_snapshot = snapshot; u32 root_inode_snapshot = snapshot;
ret = lookup_inode(trans, root_inum.inum, &root_inode, &root_inode_snapshot); ret = lookup_inode(trans, root_inum.inum, &root_inode, &root_inode_snapshot);
bch_err_msg(c, ret, "looking up root inode"); bch_err_msg(c, ret, "looking up root inode %llu for subvol %u",
root_inum.inum, le32_to_cpu(st.master_subvol));
if (ret) if (ret)
return ret; return ret;
...@@ -293,7 +309,7 @@ static int reattach_inode(struct btree_trans *trans, ...@@ -293,7 +309,7 @@ static int reattach_inode(struct btree_trans *trans,
snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum); snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum);
} }
ret = lookup_lostfound(trans, dirent_snapshot, &lostfound); ret = lookup_lostfound(trans, dirent_snapshot, &lostfound, inode->bi_inum);
if (ret) if (ret)
return ret; return ret;
...@@ -364,6 +380,112 @@ static int reattach_subvol(struct btree_trans *trans, struct bkey_s_c_subvolume ...@@ -364,6 +380,112 @@ static int reattach_subvol(struct btree_trans *trans, struct bkey_s_c_subvolume
return ret; return ret;
} }
/*
 * Recreate the subvolume key for @subvolid, pointing it at snapshot
 * @snapshotid and root inode @inum, then record the subvolume in the snapshot
 * key and - if the snapshot tree has no master subvolume yet - in the
 * snapshot tree key.
 *
 * When @inum is 0 a new directory inode is created first and used as the
 * subvolume root.
 *
 * Returns 0 on success, -BCH_ERR_fsck_repair_unimplemented when @snapshotid
 * is not a leaf snapshot, or the error returned by one of the btree helpers.
 */
static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 subvolid, u64 inum)
{
struct bch_fs *c = trans->c;
/* only leaf snapshots are handled; interior nodes are reported and refused */
if (!bch2_snapshot_is_leaf(c, snapshotid)) {
bch_err(c, "need to reconstruct subvol, but have interior node snapshot");
return -BCH_ERR_fsck_repair_unimplemented;
}
/*
 * If inum isn't set, that means we're being called from check_dirents,
 * not check_inodes - the root of this subvolume doesn't exist or we
 * would have found it there:
 */
if (!inum) {
struct btree_iter inode_iter = {};
struct bch_inode_unpacked new_inode;
u64 cpu = raw_smp_processor_id();
/* new directory inode (S_IFDIR|0755) tagged with the subvolume id */
bch2_inode_init_early(c, &new_inode);
bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, S_IFDIR|0755, 0, NULL);
new_inode.bi_subvol = subvolid;
/* create -> traverse -> write; the ?: chain stops at the first error */
int ret = bch2_inode_create(trans, &inode_iter, &new_inode, snapshotid, cpu) ?:
bch2_btree_iter_traverse(&inode_iter) ?:
bch2_inode_write(trans, &inode_iter, &new_inode);
bch2_trans_iter_exit(trans, &inode_iter);
if (ret)
return ret;
/* the freshly created inode becomes the subvolume root */
inum = new_inode.bi_inum;
}
bch_info(c, "reconstructing subvol %u with root inode %llu", subvolid, inum);
/* build the new subvolume key in transaction-allocated memory */
struct bkey_i_subvolume *new_subvol = bch2_trans_kmalloc(trans, sizeof(*new_subvol));
int ret = PTR_ERR_OR_ZERO(new_subvol);
if (ret)
return ret;
bkey_subvolume_init(&new_subvol->k_i);
new_subvol->k.p.offset = subvolid;
new_subvol->v.snapshot = cpu_to_le32(snapshotid);
new_subvol->v.inode = cpu_to_le64(inum);
ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &new_subvol->k_i, 0);
if (ret)
return ret;
/* point the snapshot key back at the reconstructed subvolume */
struct btree_iter iter;
struct bkey_i_snapshot *s = bch2_bkey_get_mut_typed(trans, &iter,
BTREE_ID_snapshots, POS(0, snapshotid),
0, snapshot);
ret = PTR_ERR_OR_ZERO(s);
bch_err_msg(c, ret, "getting snapshot %u", snapshotid);
if (ret)
return ret;
/* remember the tree id before the iterator is released */
u32 snapshot_tree = le32_to_cpu(s->v.tree);
s->v.subvol = cpu_to_le32(subvolid);
SET_BCH_SNAPSHOT_SUBVOL(&s->v, true);
bch2_trans_iter_exit(trans, &iter);
/* make this subvolume the tree's master subvolume if none is recorded */
struct bkey_i_snapshot_tree *st = bch2_bkey_get_mut_typed(trans, &iter,
BTREE_ID_snapshot_trees, POS(0, snapshot_tree),
0, snapshot_tree);
ret = PTR_ERR_OR_ZERO(st);
bch_err_msg(c, ret, "getting snapshot tree %u", snapshot_tree);
if (ret)
return ret;
if (!st->v.master_subvol)
st->v.master_subvol = cpu_to_le32(subvolid);
bch2_trans_iter_exit(trans, &iter);
return 0;
}
/*
 * Write a freshly initialised inode numbered @inum into @snapshot.
 *
 * The inode gets mode @mode|0755 and size @size; every other field is
 * whatever bch2_inode_init_early()/bch2_inode_init_late() set up.
 *
 * Returns the result of __bch2_fsck_write_inode().
 */
static int reconstruct_inode(struct btree_trans *trans, u32 snapshot, u64 inum, u64 size, unsigned mode)
{
	struct bch_inode_unpacked inode;

	bch2_inode_init_early(trans->c, &inode);
	bch2_inode_init_late(&inode, bch2_current_time(trans->c), 0, 0, mode|0755, 0, NULL);

	inode.bi_inum = inum;
	inode.bi_size = size;

	return __bch2_fsck_write_inode(trans, &inode, snapshot);
}
/*
 * Recreate a missing regular-file inode @inum in @snapshot.
 *
 * The file size is derived from the extents btree: the iterator starts at the
 * highest possible offset for @inum and looks backwards, and the end offset
 * (in 512-byte sectors, hence the << 9) of the key it finds becomes the size.
 *
 * Returns an error from the lookup, or the result of reconstruct_inode().
 */
static int reconstruct_reg_inode(struct btree_trans *trans, u32 snapshot, u64 inum)
{
	struct btree_iter iter = {};

	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum, U64_MAX, snapshot), 0);
	struct bkey_s_c k = bch2_btree_iter_peek_prev(&iter);
	bch2_trans_iter_exit(trans, &iter);
	int ret = bkey_err(k);
	if (ret)
		return ret;

	/*
	 * The backwards peek is not bounded to @inum: it may find nothing at
	 * all, or a key that belongs to a lower-numbered inode. Neither says
	 * anything about this file, so fall back to a size of 0 rather than
	 * dereferencing a null key or borrowing another inode's offset.
	 */
	u64 size = 0;
	if (k.k && k.k->p.inode == inum)
		size = k.k->p.offset << 9;

	return reconstruct_inode(trans, snapshot, inum, size, S_IFREG);
}
struct snapshots_seen_entry { struct snapshots_seen_entry {
u32 id; u32 id;
u32 equiv; u32 equiv;
...@@ -1065,6 +1187,11 @@ static int check_inode(struct btree_trans *trans, ...@@ -1065,6 +1187,11 @@ static int check_inode(struct btree_trans *trans,
if (ret && !bch2_err_matches(ret, ENOENT)) if (ret && !bch2_err_matches(ret, ENOENT))
goto err; goto err;
if (ret && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) {
ret = reconstruct_subvol(trans, k.k->p.snapshot, u.bi_subvol, u.bi_inum);
goto do_update;
}
if (fsck_err_on(ret, if (fsck_err_on(ret,
c, inode_bi_subvol_missing, c, inode_bi_subvol_missing,
"inode %llu:%u bi_subvol points to missing subvolume %u", "inode %llu:%u bi_subvol points to missing subvolume %u",
...@@ -1082,7 +1209,7 @@ static int check_inode(struct btree_trans *trans, ...@@ -1082,7 +1209,7 @@ static int check_inode(struct btree_trans *trans,
do_update = true; do_update = true;
} }
} }
do_update:
if (do_update) { if (do_update) {
ret = __bch2_fsck_write_inode(trans, &u, iter->pos.snapshot); ret = __bch2_fsck_write_inode(trans, &u, iter->pos.snapshot);
bch_err_msg(c, ret, "in fsck updating inode"); bch_err_msg(c, ret, "in fsck updating inode");
...@@ -1131,7 +1258,7 @@ static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_wal ...@@ -1131,7 +1258,7 @@ static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_wal
i->count = count2; i->count = count2;
if (i->count != count2) { if (i->count != count2) {
bch_err(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu", bch_err_ratelimited(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu",
w->last_pos.inode, i->snapshot, i->count, count2); w->last_pos.inode, i->snapshot, i->count, count2);
return -BCH_ERR_internal_fsck_err; return -BCH_ERR_internal_fsck_err;
} }
...@@ -1435,6 +1562,17 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, ...@@ -1435,6 +1562,17 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
goto err; goto err;
if (k.k->type != KEY_TYPE_whiteout) { if (k.k->type != KEY_TYPE_whiteout) {
if (!i && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) {
ret = reconstruct_reg_inode(trans, k.k->p.snapshot, k.k->p.inode) ?:
bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
if (ret)
goto err;
inode->last_pos.inode--;
ret = -BCH_ERR_transaction_restart_nested;
goto err;
}
if (fsck_err_on(!i, c, extent_in_missing_inode, if (fsck_err_on(!i, c, extent_in_missing_inode,
"extent in missing inode:\n %s", "extent in missing inode:\n %s",
(printbuf_reset(&buf), (printbuf_reset(&buf),
...@@ -1587,8 +1725,8 @@ static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_ ...@@ -1587,8 +1725,8 @@ static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_
return count2; return count2;
if (i->count != count2) { if (i->count != count2) {
bch_err(c, "fsck counted subdirectories wrong: got %llu should be %llu", bch_err_ratelimited(c, "fsck counted subdirectories wrong for inum %llu:%u: got %llu should be %llu",
i->count, count2); w->last_pos.inode, i->snapshot, i->count, count2);
i->count = count2; i->count = count2;
if (i->inode.bi_nlink == i->count) if (i->inode.bi_nlink == i->count)
continue; continue;
...@@ -1785,6 +1923,7 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * ...@@ -1785,6 +1923,7 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *
u32 parent_subvol = le32_to_cpu(d.v->d_parent_subvol); u32 parent_subvol = le32_to_cpu(d.v->d_parent_subvol);
u32 target_subvol = le32_to_cpu(d.v->d_child_subvol); u32 target_subvol = le32_to_cpu(d.v->d_child_subvol);
u32 parent_snapshot; u32 parent_snapshot;
u32 new_parent_subvol = 0;
u64 parent_inum; u64 parent_inum;
struct printbuf buf = PRINTBUF; struct printbuf buf = PRINTBUF;
int ret = 0; int ret = 0;
...@@ -1793,6 +1932,27 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * ...@@ -1793,6 +1932,27 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *
if (ret && !bch2_err_matches(ret, ENOENT)) if (ret && !bch2_err_matches(ret, ENOENT))
return ret; return ret;
if (ret ||
    (!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot))) {
	/*
	 * Parent subvol is missing or not an ancestor: look for a subvol
	 * that owns the dirent's snapshot. Not finding one (ENOENT) is
	 * handled below via new_parent_subvol == 0; any other failure of
	 * this lookup must be propagated, so the filter is on ret2 - the
	 * lookup's own result - not on the earlier ret.
	 */
	int ret2 = find_snapshot_subvol(trans, d.k->p.snapshot, &new_parent_subvol);
	if (ret2 && !bch2_err_matches(ret2, ENOENT))
		return ret2;
}
if (ret &&
!new_parent_subvol &&
(c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) {
/*
* Couldn't find a subvol for dirent's snapshot - but we lost
* subvols, so we need to reconstruct:
*/
ret = reconstruct_subvol(trans, d.k->p.snapshot, parent_subvol, 0);
if (ret)
return ret;
parent_snapshot = d.k->p.snapshot;
}
if (fsck_err_on(ret, c, dirent_to_missing_parent_subvol, if (fsck_err_on(ret, c, dirent_to_missing_parent_subvol,
"dirent parent_subvol points to missing subvolume\n%s", "dirent parent_subvol points to missing subvolume\n%s",
(bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)) || (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)) ||
...@@ -1801,10 +1961,10 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * ...@@ -1801,10 +1961,10 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *
"dirent not visible in parent_subvol (not an ancestor of subvol snap %u)\n%s", "dirent not visible in parent_subvol (not an ancestor of subvol snap %u)\n%s",
parent_snapshot, parent_snapshot,
(bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) { (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
u32 new_parent_subvol; if (!new_parent_subvol) {
ret = find_snapshot_subvol(trans, d.k->p.snapshot, &new_parent_subvol); bch_err(c, "could not find a subvol for snapshot %u", d.k->p.snapshot);
if (ret) return -BCH_ERR_fsck_repair_unimplemented;
goto err; }
struct bkey_i_dirent *new_dirent = bch2_bkey_make_mut_typed(trans, iter, &d.s_c, 0, dirent); struct bkey_i_dirent *new_dirent = bch2_bkey_make_mut_typed(trans, iter, &d.s_c, 0, dirent);
ret = PTR_ERR_OR_ZERO(new_dirent); ret = PTR_ERR_OR_ZERO(new_dirent);
...@@ -1850,9 +2010,16 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * ...@@ -1850,9 +2010,16 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *
ret = lookup_inode(trans, target_inum, &subvol_root, &target_snapshot); ret = lookup_inode(trans, target_inum, &subvol_root, &target_snapshot);
if (ret && !bch2_err_matches(ret, ENOENT)) if (ret && !bch2_err_matches(ret, ENOENT))
return ret; goto err;
if (ret) {
bch_err(c, "subvol %u points to missing inode root %llu", target_subvol, target_inum);
ret = -BCH_ERR_fsck_repair_unimplemented;
ret = 0;
goto err;
}
if (fsck_err_on(parent_subvol != subvol_root.bi_parent_subvol, if (fsck_err_on(!ret && parent_subvol != subvol_root.bi_parent_subvol,
c, inode_bi_parent_wrong, c, inode_bi_parent_wrong,
"subvol root %llu has wrong bi_parent_subvol: got %u, should be %u", "subvol root %llu has wrong bi_parent_subvol: got %u, should be %u",
target_inum, target_inum,
...@@ -1860,13 +2027,13 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * ...@@ -1860,13 +2027,13 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *
subvol_root.bi_parent_subvol = parent_subvol; subvol_root.bi_parent_subvol = parent_subvol;
ret = __bch2_fsck_write_inode(trans, &subvol_root, target_snapshot); ret = __bch2_fsck_write_inode(trans, &subvol_root, target_snapshot);
if (ret) if (ret)
return ret; goto err;
} }
ret = check_dirent_target(trans, iter, d, &subvol_root, ret = check_dirent_target(trans, iter, d, &subvol_root,
target_snapshot); target_snapshot);
if (ret) if (ret)
return ret; goto err;
out: out:
err: err:
fsck_err: fsck_err:
...@@ -1883,7 +2050,6 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, ...@@ -1883,7 +2050,6 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
struct snapshots_seen *s) struct snapshots_seen *s)
{ {
struct bch_fs *c = trans->c; struct bch_fs *c = trans->c;
struct bkey_s_c_dirent d;
struct inode_walker_entry *i; struct inode_walker_entry *i;
struct printbuf buf = PRINTBUF; struct printbuf buf = PRINTBUF;
struct bpos equiv; struct bpos equiv;
...@@ -1922,6 +2088,17 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, ...@@ -1922,6 +2088,17 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
*hash_info = bch2_hash_info_init(c, &dir->inodes.data[0].inode); *hash_info = bch2_hash_info_init(c, &dir->inodes.data[0].inode);
dir->first_this_inode = false; dir->first_this_inode = false;
if (!i && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) {
ret = reconstruct_inode(trans, k.k->p.snapshot, k.k->p.inode, 0, S_IFDIR) ?:
bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
if (ret)
goto err;
dir->last_pos.inode--;
ret = -BCH_ERR_transaction_restart_nested;
goto err;
}
if (fsck_err_on(!i, c, dirent_in_missing_dir_inode, if (fsck_err_on(!i, c, dirent_in_missing_dir_inode,
"dirent in nonexisting directory:\n%s", "dirent in nonexisting directory:\n%s",
(printbuf_reset(&buf), (printbuf_reset(&buf),
...@@ -1956,7 +2133,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, ...@@ -1956,7 +2133,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
if (k.k->type != KEY_TYPE_dirent) if (k.k->type != KEY_TYPE_dirent)
goto out; goto out;
d = bkey_s_c_to_dirent(k); struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
if (d.v->d_type == DT_SUBVOL) { if (d.v->d_type == DT_SUBVOL) {
ret = check_dirent_to_subvol(trans, iter, d); ret = check_dirent_to_subvol(trans, iter, d);
......
...@@ -95,8 +95,7 @@ int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end) ...@@ -95,8 +95,7 @@ int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end)
return ret ?: bch2_blacklist_table_initialize(c); return ret ?: bch2_blacklist_table_initialize(c);
} }
static int journal_seq_blacklist_table_cmp(const void *_l, static int journal_seq_blacklist_table_cmp(const void *_l, const void *_r)
const void *_r, size_t size)
{ {
const struct journal_seq_blacklist_table_entry *l = _l; const struct journal_seq_blacklist_table_entry *l = _l;
const struct journal_seq_blacklist_table_entry *r = _r; const struct journal_seq_blacklist_table_entry *r = _r;
......
...@@ -136,20 +136,8 @@ static void mean_and_variance_test_1(struct kunit *test) ...@@ -136,20 +136,8 @@ static void mean_and_variance_test_1(struct kunit *test)
d, mean, stddev, weighted_mean, weighted_stddev); d, mean, stddev, weighted_mean, weighted_stddev);
} }
static void mean_and_variance_test_2(struct kunit *test)
{
s64 d[] = { 100, 10, 10, 10, 10, 10, 10 };
s64 mean[] = { 10, 10, 10, 10, 10, 10, 10 };
s64 stddev[] = { 9, 9, 9, 9, 9, 9, 9 };
s64 weighted_mean[] = { 32, 27, 22, 19, 17, 15, 14 };
s64 weighted_stddev[] = { 38, 35, 31, 27, 24, 21, 18 };
do_mean_and_variance_test(test, 10, 6, ARRAY_SIZE(d), 2,
d, mean, stddev, weighted_mean, weighted_stddev);
}
/* Test behaviour where we switch from one steady state to another: */ /* Test behaviour where we switch from one steady state to another: */
static void mean_and_variance_test_3(struct kunit *test) static void mean_and_variance_test_2(struct kunit *test)
{ {
s64 d[] = { 100, 100, 100, 100, 100 }; s64 d[] = { 100, 100, 100, 100, 100 };
s64 mean[] = { 22, 32, 40, 46, 50 }; s64 mean[] = { 22, 32, 40, 46, 50 };
...@@ -161,18 +149,6 @@ static void mean_and_variance_test_3(struct kunit *test) ...@@ -161,18 +149,6 @@ static void mean_and_variance_test_3(struct kunit *test)
d, mean, stddev, weighted_mean, weighted_stddev); d, mean, stddev, weighted_mean, weighted_stddev);
} }
static void mean_and_variance_test_4(struct kunit *test)
{
s64 d[] = { 100, 100, 100, 100, 100 };
s64 mean[] = { 10, 11, 12, 13, 14 };
s64 stddev[] = { 9, 13, 15, 17, 19 };
s64 weighted_mean[] = { 32, 49, 61, 71, 78 };
s64 weighted_stddev[] = { 38, 44, 44, 41, 38 };
do_mean_and_variance_test(test, 10, 6, ARRAY_SIZE(d), 2,
d, mean, stddev, weighted_mean, weighted_stddev);
}
static void mean_and_variance_fast_divpow2(struct kunit *test) static void mean_and_variance_fast_divpow2(struct kunit *test)
{ {
s64 i; s64 i;
...@@ -230,8 +206,6 @@ static struct kunit_case mean_and_variance_test_cases[] = { ...@@ -230,8 +206,6 @@ static struct kunit_case mean_and_variance_test_cases[] = {
KUNIT_CASE(mean_and_variance_weighted_advanced_test), KUNIT_CASE(mean_and_variance_weighted_advanced_test),
KUNIT_CASE(mean_and_variance_test_1), KUNIT_CASE(mean_and_variance_test_1),
KUNIT_CASE(mean_and_variance_test_2), KUNIT_CASE(mean_and_variance_test_2),
KUNIT_CASE(mean_and_variance_test_3),
KUNIT_CASE(mean_and_variance_test_4),
{} {}
}; };
......
...@@ -368,11 +368,11 @@ enum fsck_err_opts { ...@@ -368,11 +368,11 @@ enum fsck_err_opts {
OPT_STR_NOLIMIT(bch2_recovery_passes), \ OPT_STR_NOLIMIT(bch2_recovery_passes), \
BCH2_NO_SB_OPT, 0, \ BCH2_NO_SB_OPT, 0, \
NULL, "Exit recovery after specified pass") \ NULL, "Exit recovery after specified pass") \
x(keep_journal, u8, \ x(retain_recovery_info, u8, \
0, \ 0, \
OPT_BOOL(), \ OPT_BOOL(), \
BCH2_NO_SB_OPT, false, \ BCH2_NO_SB_OPT, false, \
NULL, "Don't free journal entries/keys after startup")\ NULL, "Don't free journal entries/keys, scanned btree nodes after startup")\
x(read_entire_journal, u8, \ x(read_entire_journal, u8, \
0, \ 0, \
OPT_BOOL(), \ OPT_BOOL(), \
......
...@@ -4,6 +4,7 @@ ...@@ -4,6 +4,7 @@
#include "alloc_background.h" #include "alloc_background.h"
#include "bkey_buf.h" #include "bkey_buf.h"
#include "btree_journal_iter.h" #include "btree_journal_iter.h"
#include "btree_node_scan.h"
#include "btree_update.h" #include "btree_update.h"
#include "btree_update_interior.h" #include "btree_update_interior.h"
#include "btree_io.h" #include "btree_io.h"
...@@ -32,6 +33,20 @@ ...@@ -32,6 +33,20 @@
#define QSTR(n) { { { .len = strlen(n) } }, .name = n } #define QSTR(n) { { { .len = strlen(n) } }, .name = n }
void bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree)
{
u64 b = BIT_ULL(btree);
if (!(c->sb.btrees_lost_data & b)) {
bch_err(c, "flagging btree %s lost data", bch2_btree_id_str(btree));
mutex_lock(&c->sb_lock);
bch2_sb_field_get(c->disk_sb.sb, ext)->btrees_lost_data |= cpu_to_le64(b);
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
}
}
static bool btree_id_is_alloc(enum btree_id id) static bool btree_id_is_alloc(enum btree_id id)
{ {
switch (id) { switch (id) {
...@@ -47,7 +62,7 @@ static bool btree_id_is_alloc(enum btree_id id) ...@@ -47,7 +62,7 @@ static bool btree_id_is_alloc(enum btree_id id)
} }
/* for -o reconstruct_alloc: */ /* for -o reconstruct_alloc: */
static void do_reconstruct_alloc(struct bch_fs *c) static void bch2_reconstruct_alloc(struct bch_fs *c)
{ {
bch2_journal_log_msg(c, "dropping alloc info"); bch2_journal_log_msg(c, "dropping alloc info");
bch_info(c, "dropping and reconstructing all alloc info"); bch_info(c, "dropping and reconstructing all alloc info");
...@@ -82,15 +97,17 @@ static void do_reconstruct_alloc(struct bch_fs *c) ...@@ -82,15 +97,17 @@ static void do_reconstruct_alloc(struct bch_fs *c)
c->recovery_passes_explicit |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); c->recovery_passes_explicit |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
struct journal_keys *keys = &c->journal_keys;
size_t src, dst;
move_gap(keys, keys->nr);
for (src = 0, dst = 0; src < keys->nr; src++) bch2_shoot_down_journal_keys(c, BTREE_ID_alloc,
if (!btree_id_is_alloc(keys->data[src].btree_id)) 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
keys->data[dst++] = keys->data[src]; bch2_shoot_down_journal_keys(c, BTREE_ID_backpointers,
keys->nr = keys->gap = dst; 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
bch2_shoot_down_journal_keys(c, BTREE_ID_need_discard,
0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
bch2_shoot_down_journal_keys(c, BTREE_ID_freespace,
0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
bch2_shoot_down_journal_keys(c, BTREE_ID_bucket_gens,
0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
} }
/* /*
...@@ -269,7 +286,7 @@ int bch2_journal_replay(struct bch_fs *c) ...@@ -269,7 +286,7 @@ int bch2_journal_replay(struct bch_fs *c)
bch2_trans_put(trans); bch2_trans_put(trans);
trans = NULL; trans = NULL;
if (!c->opts.keep_journal && if (!c->opts.retain_recovery_info &&
c->recovery_pass_done >= BCH_RECOVERY_PASS_journal_replay) c->recovery_pass_done >= BCH_RECOVERY_PASS_journal_replay)
bch2_journal_keys_put_initial(c); bch2_journal_keys_put_initial(c);
...@@ -433,10 +450,9 @@ static int journal_replay_early(struct bch_fs *c, ...@@ -433,10 +450,9 @@ static int journal_replay_early(struct bch_fs *c,
static int read_btree_roots(struct bch_fs *c) static int read_btree_roots(struct bch_fs *c)
{ {
unsigned i;
int ret = 0; int ret = 0;
for (i = 0; i < btree_id_nr_alive(c); i++) { for (unsigned i = 0; i < btree_id_nr_alive(c); i++) {
struct btree_root *r = bch2_btree_id_root(c, i); struct btree_root *r = bch2_btree_id_root(c, i);
if (!r->alive) if (!r->alive)
...@@ -445,36 +461,40 @@ static int read_btree_roots(struct bch_fs *c) ...@@ -445,36 +461,40 @@ static int read_btree_roots(struct bch_fs *c)
if (btree_id_is_alloc(i) && c->opts.reconstruct_alloc) if (btree_id_is_alloc(i) && c->opts.reconstruct_alloc)
continue; continue;
if (r->error) { if (mustfix_fsck_err_on((ret = r->error),
__fsck_err(c, c, btree_root_bkey_invalid,
btree_id_is_alloc(i)
? FSCK_CAN_IGNORE : 0,
btree_root_bkey_invalid,
"invalid btree root %s", "invalid btree root %s",
bch2_btree_id_str(i)); bch2_btree_id_str(i)) ||
if (i == BTREE_ID_alloc) mustfix_fsck_err_on((ret = r->error = bch2_btree_root_read(c, i, &r->key, r->level)),
c, btree_root_read_error,
"error reading btree root %s l=%u: %s",
bch2_btree_id_str(i), r->level, bch2_err_str(ret))) {
if (btree_id_is_alloc(i)) {
c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_allocations);
c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_alloc_info);
c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_lrus);
c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers);
c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_alloc_to_lru_refs);
c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
r->error = 0;
} else if (!(c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes))) {
bch_info(c, "will run btree node scan");
c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes);
c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_topology);
} }
ret = bch2_btree_root_read(c, i, &r->key, r->level);
if (ret) {
fsck_err(c,
btree_root_read_error,
"error reading btree root %s",
bch2_btree_id_str(i));
if (btree_id_is_alloc(i))
c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
ret = 0; ret = 0;
bch2_btree_lost_data(c, i);
} }
} }
for (i = 0; i < BTREE_ID_NR; i++) { for (unsigned i = 0; i < BTREE_ID_NR; i++) {
struct btree_root *r = bch2_btree_id_root(c, i); struct btree_root *r = bch2_btree_id_root(c, i);
if (!r->b) { if (!r->b && !r->error) {
r->alive = false; r->alive = false;
r->level = 0; r->level = 0;
bch2_btree_root_alloc(c, i); bch2_btree_root_alloc_fake(c, i, 0);
} }
} }
fsck_err: fsck_err:
...@@ -651,7 +671,7 @@ int bch2_fs_recovery(struct bch_fs *c) ...@@ -651,7 +671,7 @@ int bch2_fs_recovery(struct bch_fs *c)
goto err; goto err;
} }
if (!c->sb.clean || c->opts.fsck || c->opts.keep_journal) { if (!c->sb.clean || c->opts.fsck || c->opts.retain_recovery_info) {
struct genradix_iter iter; struct genradix_iter iter;
struct journal_replay **i; struct journal_replay **i;
...@@ -731,7 +751,7 @@ int bch2_fs_recovery(struct bch_fs *c) ...@@ -731,7 +751,7 @@ int bch2_fs_recovery(struct bch_fs *c)
c->journal_replay_seq_end = blacklist_seq - 1; c->journal_replay_seq_end = blacklist_seq - 1;
if (c->opts.reconstruct_alloc) if (c->opts.reconstruct_alloc)
do_reconstruct_alloc(c); bch2_reconstruct_alloc(c);
zero_out_btree_mem_ptr(&c->journal_keys); zero_out_btree_mem_ptr(&c->journal_keys);
...@@ -838,13 +858,19 @@ int bch2_fs_recovery(struct bch_fs *c) ...@@ -838,13 +858,19 @@ int bch2_fs_recovery(struct bch_fs *c)
} }
if (!test_bit(BCH_FS_error, &c->flags) && if (!test_bit(BCH_FS_error, &c->flags) &&
(!bch2_is_zero(ext->recovery_passes_required, sizeof(ext->recovery_passes_required)) || !bch2_is_zero(ext->errors_silent, sizeof(ext->errors_silent))) {
!bch2_is_zero(ext->errors_silent, sizeof(ext->errors_silent)))) {
memset(ext->recovery_passes_required, 0, sizeof(ext->recovery_passes_required));
memset(ext->errors_silent, 0, sizeof(ext->errors_silent)); memset(ext->errors_silent, 0, sizeof(ext->errors_silent));
write_sb = true; write_sb = true;
} }
if (c->opts.fsck &&
!test_bit(BCH_FS_error, &c->flags) &&
c->recovery_pass_done == BCH_RECOVERY_PASS_NR - 1 &&
ext->btrees_lost_data) {
ext->btrees_lost_data = 0;
write_sb = true;
}
if (c->opts.fsck && if (c->opts.fsck &&
!test_bit(BCH_FS_error, &c->flags) && !test_bit(BCH_FS_error, &c->flags) &&
!test_bit(BCH_FS_errors_not_fixed, &c->flags)) { !test_bit(BCH_FS_errors_not_fixed, &c->flags)) {
...@@ -883,9 +909,10 @@ int bch2_fs_recovery(struct bch_fs *c) ...@@ -883,9 +909,10 @@ int bch2_fs_recovery(struct bch_fs *c)
out: out:
bch2_flush_fsck_errs(c); bch2_flush_fsck_errs(c);
if (!c->opts.keep_journal && if (!c->opts.retain_recovery_info) {
test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))
bch2_journal_keys_put_initial(c); bch2_journal_keys_put_initial(c);
bch2_find_btree_nodes_exit(&c->found_btree_nodes);
}
kfree(clean); kfree(clean);
if (!ret && if (!ret &&
...@@ -911,6 +938,7 @@ int bch2_fs_initialize(struct bch_fs *c) ...@@ -911,6 +938,7 @@ int bch2_fs_initialize(struct bch_fs *c)
int ret; int ret;
bch_notice(c, "initializing new filesystem"); bch_notice(c, "initializing new filesystem");
set_bit(BCH_FS_new_fs, &c->flags);
mutex_lock(&c->sb_lock); mutex_lock(&c->sb_lock);
c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done); c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done);
...@@ -929,7 +957,7 @@ int bch2_fs_initialize(struct bch_fs *c) ...@@ -929,7 +957,7 @@ int bch2_fs_initialize(struct bch_fs *c)
set_bit(BCH_FS_may_go_rw, &c->flags); set_bit(BCH_FS_may_go_rw, &c->flags);
for (unsigned i = 0; i < BTREE_ID_NR; i++) for (unsigned i = 0; i < BTREE_ID_NR; i++)
bch2_btree_root_alloc(c, i); bch2_btree_root_alloc_fake(c, i, 0);
for_each_member_device(c, ca) for_each_member_device(c, ca)
bch2_dev_usage_init(ca); bch2_dev_usage_init(ca);
......
...@@ -2,6 +2,8 @@ ...@@ -2,6 +2,8 @@
#ifndef _BCACHEFS_RECOVERY_H #ifndef _BCACHEFS_RECOVERY_H
#define _BCACHEFS_RECOVERY_H #define _BCACHEFS_RECOVERY_H
void bch2_btree_lost_data(struct bch_fs *, enum btree_id);
int bch2_journal_replay(struct bch_fs *); int bch2_journal_replay(struct bch_fs *);
int bch2_fs_recovery(struct bch_fs *); int bch2_fs_recovery(struct bch_fs *);
......
...@@ -4,6 +4,7 @@ ...@@ -4,6 +4,7 @@
#include "alloc_background.h" #include "alloc_background.h"
#include "backpointers.h" #include "backpointers.h"
#include "btree_gc.h" #include "btree_gc.h"
#include "btree_node_scan.h"
#include "ec.h" #include "ec.h"
#include "fsck.h" #include "fsck.h"
#include "inode.h" #include "inode.h"
...@@ -59,18 +60,23 @@ static struct recovery_pass_fn recovery_pass_fns[] = { ...@@ -59,18 +60,23 @@ static struct recovery_pass_fn recovery_pass_fns[] = {
#undef x #undef x
}; };
u64 bch2_recovery_passes_to_stable(u64 v) static const u8 passes_to_stable_map[] = {
{
static const u8 map[] = {
#define x(n, id, ...) [BCH_RECOVERY_PASS_##n] = BCH_RECOVERY_PASS_STABLE_##n, #define x(n, id, ...) [BCH_RECOVERY_PASS_##n] = BCH_RECOVERY_PASS_STABLE_##n,
BCH_RECOVERY_PASSES() BCH_RECOVERY_PASSES()
#undef x #undef x
}; };
static enum bch_recovery_pass_stable bch2_recovery_pass_to_stable(enum bch_recovery_pass pass)
{
return passes_to_stable_map[pass];
}
u64 bch2_recovery_passes_to_stable(u64 v)
{
u64 ret = 0; u64 ret = 0;
for (unsigned i = 0; i < ARRAY_SIZE(map); i++) for (unsigned i = 0; i < ARRAY_SIZE(passes_to_stable_map); i++)
if (v & BIT_ULL(i)) if (v & BIT_ULL(i))
ret |= BIT_ULL(map[i]); ret |= BIT_ULL(passes_to_stable_map[i]);
return ret; return ret;
} }
...@@ -116,13 +122,13 @@ int bch2_run_explicit_recovery_pass(struct bch_fs *c, ...@@ -116,13 +122,13 @@ int bch2_run_explicit_recovery_pass(struct bch_fs *c,
int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *c, int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *c,
enum bch_recovery_pass pass) enum bch_recovery_pass pass)
{ {
__le64 s = cpu_to_le64(bch2_recovery_passes_to_stable(BIT_ULL(pass))); enum bch_recovery_pass_stable s = bch2_recovery_pass_to_stable(pass);
mutex_lock(&c->sb_lock); mutex_lock(&c->sb_lock);
struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
if (!(ext->recovery_passes_required[0] & s)) { if (!test_bit_le64(s, ext->recovery_passes_required)) {
ext->recovery_passes_required[0] |= s; __set_bit_le64(s, ext->recovery_passes_required);
bch2_write_super(c); bch2_write_super(c);
} }
mutex_unlock(&c->sb_lock); mutex_unlock(&c->sb_lock);
...@@ -130,6 +136,21 @@ int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *c, ...@@ -130,6 +136,21 @@ int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *c,
return bch2_run_explicit_recovery_pass(c, pass); return bch2_run_explicit_recovery_pass(c, pass);
} }
static void bch2_clear_recovery_pass_required(struct bch_fs *c,
enum bch_recovery_pass pass)
{
enum bch_recovery_pass_stable s = bch2_recovery_pass_to_stable(pass);
mutex_lock(&c->sb_lock);
struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
if (test_bit_le64(s, ext->recovery_passes_required)) {
__clear_bit_le64(s, ext->recovery_passes_required);
bch2_write_super(c);
}
mutex_unlock(&c->sb_lock);
}
u64 bch2_fsck_recovery_passes(void) u64 bch2_fsck_recovery_passes(void)
{ {
u64 ret = 0; u64 ret = 0;
...@@ -218,6 +239,9 @@ int bch2_run_recovery_passes(struct bch_fs *c) ...@@ -218,6 +239,9 @@ int bch2_run_recovery_passes(struct bch_fs *c)
c->recovery_pass_done = max(c->recovery_pass_done, c->curr_recovery_pass); c->recovery_pass_done = max(c->recovery_pass_done, c->curr_recovery_pass);
if (!test_bit(BCH_FS_error, &c->flags))
bch2_clear_recovery_pass_required(c, c->curr_recovery_pass);
c->curr_recovery_pass++; c->curr_recovery_pass++;
} }
......
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
* must never change: * must never change:
*/ */
#define BCH_RECOVERY_PASSES() \ #define BCH_RECOVERY_PASSES() \
x(scan_for_btree_nodes, 37, 0) \
x(check_topology, 4, 0) \ x(check_topology, 4, 0) \
x(alloc_read, 0, PASS_ALWAYS) \ x(alloc_read, 0, PASS_ALWAYS) \
x(stripes_read, 1, PASS_ALWAYS) \ x(stripes_read, 1, PASS_ALWAYS) \
...@@ -31,6 +32,7 @@ ...@@ -31,6 +32,7 @@
x(check_alloc_to_lru_refs, 15, PASS_ONLINE|PASS_FSCK) \ x(check_alloc_to_lru_refs, 15, PASS_ONLINE|PASS_FSCK) \
x(fs_freespace_init, 16, PASS_ALWAYS|PASS_SILENT) \ x(fs_freespace_init, 16, PASS_ALWAYS|PASS_SILENT) \
x(bucket_gens_init, 17, 0) \ x(bucket_gens_init, 17, 0) \
x(reconstruct_snapshots, 38, 0) \
x(check_snapshot_trees, 18, PASS_ONLINE|PASS_FSCK) \ x(check_snapshot_trees, 18, PASS_ONLINE|PASS_FSCK) \
x(check_snapshots, 19, PASS_ONLINE|PASS_FSCK) \ x(check_snapshots, 19, PASS_ONLINE|PASS_FSCK) \
x(check_subvols, 20, PASS_ONLINE|PASS_FSCK) \ x(check_subvols, 20, PASS_ONLINE|PASS_FSCK) \
......
...@@ -6,12 +6,15 @@ ...@@ -6,12 +6,15 @@
#include "replicas.h" #include "replicas.h"
#include "super-io.h" #include "super-io.h"
#include <linux/sort.h>
static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *, static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *,
struct bch_replicas_cpu *); struct bch_replicas_cpu *);
/* Some (buggy!) compilers don't allow memcmp to be passed as a pointer */ /* Some (buggy!) compilers don't allow memcmp to be passed as a pointer */
static int bch2_memcmp(const void *l, const void *r, size_t size) static int bch2_memcmp(const void *l, const void *r, const void *priv)
{ {
size_t size = (size_t) priv;
return memcmp(l, r, size); return memcmp(l, r, size);
} }
...@@ -39,7 +42,8 @@ void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *e) ...@@ -39,7 +42,8 @@ void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *e)
static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r) static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
{ {
eytzinger0_sort(r->entries, r->nr, r->entry_size, bch2_memcmp, NULL); eytzinger0_sort_r(r->entries, r->nr, r->entry_size,
bch2_memcmp, NULL, (void *)(size_t)r->entry_size);
} }
static void bch2_replicas_entry_v0_to_text(struct printbuf *out, static void bch2_replicas_entry_v0_to_text(struct printbuf *out,
...@@ -228,7 +232,7 @@ static inline int __replicas_entry_idx(struct bch_replicas_cpu *r, ...@@ -228,7 +232,7 @@ static inline int __replicas_entry_idx(struct bch_replicas_cpu *r,
verify_replicas_entry(search); verify_replicas_entry(search);
#define entry_cmp(_l, _r, size) memcmp(_l, _r, entry_size) #define entry_cmp(_l, _r) memcmp(_l, _r, entry_size)
idx = eytzinger0_find(r->entries, r->nr, r->entry_size, idx = eytzinger0_find(r->entries, r->nr, r->entry_size,
entry_cmp, search); entry_cmp, search);
#undef entry_cmp #undef entry_cmp
...@@ -824,10 +828,11 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r, ...@@ -824,10 +828,11 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r,
{ {
unsigned i; unsigned i;
sort_cmp_size(cpu_r->entries, sort_r(cpu_r->entries,
cpu_r->nr, cpu_r->nr,
cpu_r->entry_size, cpu_r->entry_size,
bch2_memcmp, NULL); bch2_memcmp, NULL,
(void *)(size_t)cpu_r->entry_size);
for (i = 0; i < cpu_r->nr; i++) { for (i = 0; i < cpu_r->nr; i++) {
struct bch_replicas_entry_v1 *e = struct bch_replicas_entry_v1 *e =
......
...@@ -267,7 +267,10 @@ ...@@ -267,7 +267,10 @@
x(subvol_unreachable, 259) \ x(subvol_unreachable, 259) \
x(btree_node_bkey_bad_u64s, 260) \ x(btree_node_bkey_bad_u64s, 260) \
x(btree_node_topology_empty_interior_node, 261) \ x(btree_node_topology_empty_interior_node, 261) \
x(btree_ptr_v2_min_key_bad, 262) x(btree_ptr_v2_min_key_bad, 262) \
x(btree_root_unreadable_and_scan_found_nothing, 263) \
x(snapshot_node_missing, 264) \
x(dup_backpointer_to_bad_csum_extent, 265)
enum bch_sb_error_id { enum bch_sb_error_id {
#define x(t, n) BCH_FSCK_ERR_##t = n, #define x(t, n) BCH_FSCK_ERR_##t = n,
......
...@@ -8,6 +8,7 @@ ...@@ -8,6 +8,7 @@
#include "errcode.h" #include "errcode.h"
#include "error.h" #include "error.h"
#include "fs.h" #include "fs.h"
#include "recovery_passes.h"
#include "snapshot.h" #include "snapshot.h"
#include <linux/random.h> #include <linux/random.h>
...@@ -574,6 +575,13 @@ static int check_snapshot_tree(struct btree_trans *trans, ...@@ -574,6 +575,13 @@ static int check_snapshot_tree(struct btree_trans *trans,
u32 subvol_id; u32 subvol_id;
ret = bch2_snapshot_tree_master_subvol(trans, root_id, &subvol_id); ret = bch2_snapshot_tree_master_subvol(trans, root_id, &subvol_id);
bch_err_fn(c, ret);
if (bch2_err_matches(ret, ENOENT)) { /* nothing to be done here */
ret = 0;
goto err;
}
if (ret) if (ret)
goto err; goto err;
...@@ -731,7 +739,6 @@ static int check_snapshot(struct btree_trans *trans, ...@@ -731,7 +739,6 @@ static int check_snapshot(struct btree_trans *trans,
u32 parent_id = bch2_snapshot_parent_early(c, k.k->p.offset); u32 parent_id = bch2_snapshot_parent_early(c, k.k->p.offset);
u32 real_depth; u32 real_depth;
struct printbuf buf = PRINTBUF; struct printbuf buf = PRINTBUF;
bool should_have_subvol;
u32 i, id; u32 i, id;
int ret = 0; int ret = 0;
...@@ -777,7 +784,7 @@ static int check_snapshot(struct btree_trans *trans, ...@@ -777,7 +784,7 @@ static int check_snapshot(struct btree_trans *trans,
} }
} }
should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) && bool should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) &&
!BCH_SNAPSHOT_DELETED(&s); !BCH_SNAPSHOT_DELETED(&s);
if (should_have_subvol) { if (should_have_subvol) {
...@@ -879,6 +886,154 @@ int bch2_check_snapshots(struct bch_fs *c) ...@@ -879,6 +886,154 @@ int bch2_check_snapshots(struct bch_fs *c)
return ret; return ret;
} }
/*
 * Make sure a snapshot node with the given @id exists, creating one if not.
 *
 * If bch2_snapshot_equiv() already returns nonzero for @id nothing is done.
 * Otherwise a snapshot tree is created via bch2_snapshot_tree_create(), and a
 * new snapshot key at POS(0, @id) that points at that tree (btime.lo taken
 * from bch2_current_time()) is inserted into the snapshots btree, marked, and
 * has its equivalence class set.
 *
 * Returns 0 on success or the first error encountered.
 */
static int check_snapshot_exists(struct btree_trans *trans, u32 id)
{
	struct bch_fs *c = trans->c;
	struct bkey_i_snapshot *new_node;
	u32 new_tree;
	int ret;

	if (bch2_snapshot_equiv(c, id))
		return 0;

	ret = bch2_snapshot_tree_create(trans, id, 0, &new_tree);
	if (ret)
		return ret;

	new_node = bch2_trans_kmalloc(trans, sizeof(*new_node));
	ret = PTR_ERR_OR_ZERO(new_node);
	if (ret)
		return ret;

	bkey_snapshot_init(&new_node->k_i);
	new_node->k.p		= POS(0, id);
	new_node->v.tree	= cpu_to_le32(new_tree);
	new_node->v.btime.lo	= cpu_to_le64(bch2_current_time(c));

	/* Each step only runs if the previous one succeeded */
	ret = bch2_btree_insert_trans(trans, BTREE_ID_snapshots, &new_node->k_i, 0);
	if (ret)
		return ret;

	ret = bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0,
				 bkey_s_c_null, bkey_i_to_s(&new_node->k_i), 0);
	if (ret)
		return ret;

	return bch2_snapshot_set_equiv(trans, bkey_i_to_s_c(&new_node->k_i));
}
/*
 * Figure out which snapshot nodes belong in the same tree.
 *
 * State carried across keys while walking the btrees in
 * bch2_reconstruct_snapshots():
 */
struct snapshot_tree_reconstruct {
/* btree currently being walked; selects the comparison in same_snapshot() */
enum btree_id btree;
/* position of the most recent key passed to get_snapshot_trees() */
struct bpos cur_pos;
/* snapshot IDs collected for the current run of keys */
snapshot_id_list cur_ids;
/* finished groups: each entry is one list of snapshot IDs found together */
DARRAY(snapshot_id_list) trees;
};
static void snapshot_tree_reconstruct_exit(struct snapshot_tree_reconstruct *r)
{
darray_for_each(r->trees, i)
darray_exit(i);
darray_exit(&r->trees);
darray_exit(&r->cur_ids);
}
/*
 * Does @pos belong to the same run of keys as r->cur_pos?
 *
 * For the inodes btree the positions are compared by their offset field; for
 * every other btree they are compared by their inode field.
 */
static inline bool same_snapshot(struct snapshot_tree_reconstruct *r, struct bpos pos)
{
	if (r->btree == BTREE_ID_inodes)
		return r->cur_pos.offset == pos.offset;

	return r->cur_pos.inode == pos.inode;
}
/* Returns true if at least one snapshot ID appears in both @l and @r. */
static inline bool snapshot_id_lists_have_common(snapshot_id_list *l, snapshot_id_list *r)
{
	bool found = false;

	darray_for_each(*l, id) {
		if (snapshot_list_has_id(r, *id)) {
			found = true;
			break;
		}
	}

	return found;
}
/* Print the IDs in @s to @out as unsigned decimals separated by single spaces. */
static void snapshot_id_list_to_text(struct printbuf *out, snapshot_id_list *s)
{
	darray_for_each(*s, id) {
		/* Separator goes before every element except the first */
		if (id != s->data)
			prt_char(out, ' ');

		prt_printf(out, "%u", *id);
	}
}
/*
 * Finish the current group of snapshot IDs (r->cur_ids) and start a new one.
 *
 * If the group is non-empty it is merged into the first existing tree that
 * shares an ID with it; if no tree shares an ID, the list is appended to
 * r->trees as a new tree and r->cur_ids is reinitialized, since its buffer is
 * now owned by the new r->trees entry.
 *
 * Returns 0 on success, or an error from snapshot_list_merge() or
 * darray_push().  A darray_push() failure used to be ignored: r->cur_ids was
 * reinitialized anyway, leaking its buffer and silently dropping the group.
 * Now the buffer stays owned by r->cur_ids (so snapshot_tree_reconstruct_exit()
 * frees it), the list is emptied, and the error is reported to the caller.
 */
static int snapshot_tree_reconstruct_next(struct bch_fs *c, struct snapshot_tree_reconstruct *r)
{
	int ret = 0;

	if (!r->cur_ids.nr)
		goto out;

	darray_for_each(r->trees, i)
		if (snapshot_id_lists_have_common(i, &r->cur_ids)) {
			ret = snapshot_list_merge(c, i, &r->cur_ids);
			if (ret)
				return ret;
			goto out;
		}

	ret = darray_push(&r->trees, r->cur_ids);
	if (!ret) {
		/* Ownership of the buffer moved into r->trees */
		darray_init(&r->cur_ids);
	}
out:
	r->cur_ids.nr = 0;
	return ret;
}
/*
 * Per-key callback for bch2_reconstruct_snapshots(): account for the snapshot
 * ID of the key at @pos.
 *
 * When @pos starts a new run of keys (see same_snapshot()), the IDs collected
 * so far are flushed with snapshot_tree_reconstruct_next() first.  An error
 * from that flush is returned rather than ignored, so that IDs belonging to
 * the previous run are never mixed with the IDs of the new one.
 *
 * Returns 0 on success or a negative error code.
 */
static int get_snapshot_trees(struct bch_fs *c, struct snapshot_tree_reconstruct *r, struct bpos pos)
{
	if (!same_snapshot(r, pos)) {
		int ret = snapshot_tree_reconstruct_next(c, r);

		if (ret)
			return ret;
	}

	r->cur_pos = pos;

	return snapshot_list_add_nodup(c, &r->cur_ids, pos.snapshot);
}
/*
 * Recovery pass: recreate snapshot nodes that keys refer to but that are
 * missing.
 *
 * Every btree for which btree_type_has_snapshots() is true is walked across
 * all snapshots, and the snapshot IDs seen on keys are grouped into trees.
 * Then, for each collected ID for which bch2_snapshot_equiv() returns 0, a
 * snapshot_node_missing fsck error is raised; if the error is to be fixed,
 * the node is recreated with check_snapshot_exists().  Groups with more than
 * one ID cannot be reconstructed and fail with
 * -BCH_ERR_fsck_repair_unimplemented.
 *
 * Returns 0 on success or a negative error code.
 */
int bch2_reconstruct_snapshots(struct bch_fs *c)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct printbuf buf = PRINTBUF;
	struct snapshot_tree_reconstruct r = {};
	int ret = 0;

	for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) {
		if (!btree_type_has_snapshots(btree))
			continue;

		r.btree = btree;

		ret = for_each_btree_key(trans, iter, btree, POS_MIN,
				BTREE_ITER_ALL_SNAPSHOTS|BTREE_ITER_PREFETCH, k, ({
			get_snapshot_trees(c, &r, k.k->p);
		}));
		if (ret)
			goto err;

		/*
		 * Flush the IDs collected for the last run of keys in this
		 * btree; a failure here must not be silently dropped.
		 */
		ret = snapshot_tree_reconstruct_next(c, &r);
		if (ret)
			goto err;
	}

	darray_for_each(r.trees, t) {
		/* Text form of the whole group, used in the error message */
		printbuf_reset(&buf);
		snapshot_id_list_to_text(&buf, t);

		darray_for_each(*t, id) {
			if (fsck_err_on(!bch2_snapshot_equiv(c, *id),
					c, snapshot_node_missing,
					"snapshot node %u from tree %s missing", *id, buf.buf)) {
				if (t->nr > 1) {
					bch_err(c, "cannot reconstruct snapshot trees with multiple nodes");
					ret = -BCH_ERR_fsck_repair_unimplemented;
					goto err;
				}

				ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
						check_snapshot_exists(trans, *id));
				if (ret)
					goto err;
			}
		}
	}
fsck_err:
err:
	bch2_trans_put(trans);
	snapshot_tree_reconstruct_exit(&r);
	printbuf_exit(&buf);
	bch_err_fn(c, ret);
	return ret;
}
/* /*
* Mark a snapshot as deleted, for future cleanup: * Mark a snapshot as deleted, for future cleanup:
*/ */
...@@ -1689,6 +1844,20 @@ int bch2_snapshots_read(struct bch_fs *c) ...@@ -1689,6 +1844,20 @@ int bch2_snapshots_read(struct bch_fs *c)
POS_MIN, 0, k, POS_MIN, 0, k,
(set_is_ancestor_bitmap(c, k.k->p.offset), 0))); (set_is_ancestor_bitmap(c, k.k->p.offset), 0)));
bch_err_fn(c, ret); bch_err_fn(c, ret);
/*
* It's important that we check if we need to reconstruct snapshots
* before going RW, so we mark that pass as required in the superblock -
* otherwise, we could end up deleting keys with missing snapshot nodes
* instead
*/
BUG_ON(!test_bit(BCH_FS_new_fs, &c->flags) &&
test_bit(BCH_FS_may_go_rw, &c->flags));
if (bch2_err_matches(ret, EIO) ||
(c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_snapshots)))
ret = bch2_run_explicit_recovery_pass_persistent(c, BCH_RECOVERY_PASS_reconstruct_snapshots);
return ret; return ret;
} }
......
...@@ -209,15 +209,34 @@ static inline bool snapshot_list_has_ancestor(struct bch_fs *c, snapshot_id_list ...@@ -209,15 +209,34 @@ static inline bool snapshot_list_has_ancestor(struct bch_fs *c, snapshot_id_list
static inline int snapshot_list_add(struct bch_fs *c, snapshot_id_list *s, u32 id) static inline int snapshot_list_add(struct bch_fs *c, snapshot_id_list *s, u32 id)
{ {
int ret;
BUG_ON(snapshot_list_has_id(s, id)); BUG_ON(snapshot_list_has_id(s, id));
ret = darray_push(s, id); int ret = darray_push(s, id);
if (ret)
bch_err(c, "error reallocating snapshot_id_list (size %zu)", s->size);
return ret;
}
static inline int snapshot_list_add_nodup(struct bch_fs *c, snapshot_id_list *s, u32 id)
{
int ret = snapshot_list_has_id(s, id)
? 0
: darray_push(s, id);
if (ret) if (ret)
bch_err(c, "error reallocating snapshot_id_list (size %zu)", s->size); bch_err(c, "error reallocating snapshot_id_list (size %zu)", s->size);
return ret; return ret;
} }
/*
 * Add every ID in @src to @dst, skipping IDs @dst already contains.
 * Stops at, and returns, the first error from snapshot_list_add_nodup();
 * returns 0 if all IDs were handled.
 */
static inline int snapshot_list_merge(struct bch_fs *c, snapshot_id_list *dst, snapshot_id_list *src)
{
	int ret = 0;

	darray_for_each(*src, id) {
		ret = snapshot_list_add_nodup(c, dst, *id);
		if (ret)
			break;
	}

	return ret;
}
int bch2_snapshot_lookup(struct btree_trans *trans, u32 id, int bch2_snapshot_lookup(struct btree_trans *trans, u32 id,
struct bch_snapshot *s); struct bch_snapshot *s);
int bch2_snapshot_get_subvol(struct btree_trans *, u32, int bch2_snapshot_get_subvol(struct btree_trans *, u32,
...@@ -229,6 +248,7 @@ int bch2_snapshot_node_create(struct btree_trans *, u32, ...@@ -229,6 +248,7 @@ int bch2_snapshot_node_create(struct btree_trans *, u32,
int bch2_check_snapshot_trees(struct bch_fs *); int bch2_check_snapshot_trees(struct bch_fs *);
int bch2_check_snapshots(struct bch_fs *); int bch2_check_snapshots(struct bch_fs *);
int bch2_reconstruct_snapshots(struct bch_fs *);
int bch2_snapshot_node_set_deleted(struct btree_trans *, u32); int bch2_snapshot_node_set_deleted(struct btree_trans *, u32);
void bch2_delete_dead_snapshots_work(struct work_struct *); void bch2_delete_dead_snapshots_work(struct work_struct *);
......
...@@ -527,9 +527,11 @@ static void bch2_sb_update(struct bch_fs *c) ...@@ -527,9 +527,11 @@ static void bch2_sb_update(struct bch_fs *c)
memset(c->sb.errors_silent, 0, sizeof(c->sb.errors_silent)); memset(c->sb.errors_silent, 0, sizeof(c->sb.errors_silent));
struct bch_sb_field_ext *ext = bch2_sb_field_get(src, ext); struct bch_sb_field_ext *ext = bch2_sb_field_get(src, ext);
if (ext) if (ext) {
le_bitvector_to_cpu(c->sb.errors_silent, (void *) ext->errors_silent, le_bitvector_to_cpu(c->sb.errors_silent, (void *) ext->errors_silent,
sizeof(c->sb.errors_silent) * 8); sizeof(c->sb.errors_silent) * 8);
c->sb.btrees_lost_data = le64_to_cpu(ext->btrees_lost_data);
}
for_each_member_device(c, ca) { for_each_member_device(c, ca) {
struct bch_member m = bch2_sb_member_get(src, ca->dev_idx); struct bch_member m = bch2_sb_member_get(src, ca->dev_idx);
...@@ -1162,6 +1164,11 @@ static void bch2_sb_ext_to_text(struct printbuf *out, struct bch_sb *sb, ...@@ -1162,6 +1164,11 @@ static void bch2_sb_ext_to_text(struct printbuf *out, struct bch_sb *sb,
kfree(errors_silent); kfree(errors_silent);
} }
prt_printf(out, "Btrees with missing data:");
prt_tab(out);
prt_bitflags(out, __bch2_btree_ids, le64_to_cpu(e->btrees_lost_data));
prt_newline(out);
} }
static const struct bch_sb_field_ops bch_sb_field_ops_ext = { static const struct bch_sb_field_ops bch_sb_field_ops_ext = {
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#include "btree_gc.h" #include "btree_gc.h"
#include "btree_journal_iter.h" #include "btree_journal_iter.h"
#include "btree_key_cache.h" #include "btree_key_cache.h"
#include "btree_node_scan.h"
#include "btree_update_interior.h" #include "btree_update_interior.h"
#include "btree_io.h" #include "btree_io.h"
#include "btree_write_buffer.h" #include "btree_write_buffer.h"
...@@ -536,6 +537,7 @@ static void __bch2_fs_free(struct bch_fs *c) ...@@ -536,6 +537,7 @@ static void __bch2_fs_free(struct bch_fs *c)
for (i = 0; i < BCH_TIME_STAT_NR; i++) for (i = 0; i < BCH_TIME_STAT_NR; i++)
bch2_time_stats_exit(&c->times[i]); bch2_time_stats_exit(&c->times[i]);
bch2_find_btree_nodes_exit(&c->found_btree_nodes);
bch2_free_pending_node_rewrites(c); bch2_free_pending_node_rewrites(c);
bch2_fs_sb_errors_exit(c); bch2_fs_sb_errors_exit(c);
bch2_fs_counters_exit(c); bch2_fs_counters_exit(c);
...@@ -560,6 +562,7 @@ static void __bch2_fs_free(struct bch_fs *c) ...@@ -560,6 +562,7 @@ static void __bch2_fs_free(struct bch_fs *c)
bch2_io_clock_exit(&c->io_clock[READ]); bch2_io_clock_exit(&c->io_clock[READ]);
bch2_fs_compress_exit(c); bch2_fs_compress_exit(c);
bch2_journal_keys_put_initial(c); bch2_journal_keys_put_initial(c);
bch2_find_btree_nodes_exit(&c->found_btree_nodes);
BUG_ON(atomic_read(&c->journal_keys.ref)); BUG_ON(atomic_read(&c->journal_keys.ref));
bch2_fs_btree_write_buffer_exit(c); bch2_fs_btree_write_buffer_exit(c);
percpu_free_rwsem(&c->mark_lock); percpu_free_rwsem(&c->mark_lock);
......
...@@ -707,149 +707,6 @@ void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter) ...@@ -707,149 +707,6 @@ void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter)
} }
} }
/*
 * Decide whether @base may be accessed in @align-byte units.
 *
 * Always true when the architecture is configured with efficient unaligned
 * access; otherwise true only when the address bits selected by (@align - 1)
 * are all zero (a meaningful alignment test when @align is a power of two).
 *
 * Returns 1 when such accesses are acceptable, 0 otherwise.
 */
static int alignment_ok(const void *base, size_t align)
{
	if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
		return 1;

	return !((unsigned long)base & (align - 1));
}
/*
 * Exchange the 32-bit values stored at @a and @b.
 *
 * @size is ignored; the parameter exists so this function matches the
 * three-argument swap callback signature taken by the sort routines.
 */
static void u32_swap(void *a, void *b, size_t size)
{
	u32 *lhs = a;
	u32 *rhs = b;
	u32 tmp = *lhs;

	*lhs = *rhs;
	*rhs = tmp;
}
/*
 * Exchange the 64-bit values stored at @a and @b.
 *
 * @size is ignored; the parameter exists so this function matches the
 * three-argument swap callback signature taken by the sort routines.
 */
static void u64_swap(void *a, void *b, size_t size)
{
	u64 *lhs = a;
	u64 *rhs = b;
	u64 tmp = *lhs;

	*lhs = *rhs;
	*rhs = tmp;
}
/*
 * Exchange @size bytes between the buffers at @a and @b, one byte at a time.
 *
 * Works for any element size and alignment. A @size of zero is a no-op: the
 * loop tests the count before touching memory, so it cannot wrap around and
 * run off the end of either buffer.
 */
static void generic_swap(void *a, void *b, size_t size)
{
	char *lhs = a;
	char *rhs = b;

	while (size--) {
		char tmp = *lhs;

		*lhs++ = *rhs;
		*rhs++ = tmp;
	}
}
/*
 * Compare two elements of the @n-element array at @base, each @size bytes.
 *
 * @l and @r are not used as raw array positions: each is first mapped
 * through inorder_to_eytzinger0() and the resulting slot is what gets
 * handed to @cmp_func. Returns whatever @cmp_func returns.
 */
static inline int do_cmp(void *base, size_t n, size_t size,
			 int (*cmp_func)(const void *, const void *, size_t),
			 size_t l, size_t r)
{
	const void *lhs = base + inorder_to_eytzinger0(l, n) * size;
	const void *rhs = base + inorder_to_eytzinger0(r, n) * size;

	return cmp_func(lhs, rhs, size);
}
/*
 * Swap two elements of the @n-element array at @base, each @size bytes.
 *
 * As with do_cmp(), @l and @r are mapped through inorder_to_eytzinger0()
 * before being turned into element addresses for @swap_func.
 */
static inline void do_swap(void *base, size_t n, size_t size,
			   void (*swap_func)(void *, void *, size_t),
			   size_t l, size_t r)
{
	void *lhs = base + inorder_to_eytzinger0(l, n) * size;
	void *rhs = base + inorder_to_eytzinger0(r, n) * size;

	swap_func(lhs, rhs, size);
}
/*
 * Sift the element at heap position @root down until neither child compares
 * greater than it, considering only heap positions below @end.
 */
static void eytzinger0_sift_down(void *base, size_t n, size_t size,
				 int (*cmp_func)(const void *, const void *, size_t),
				 void (*swap_func)(void *, void *, size_t),
				 int root, size_t end)
{
	int child;

	for (; root * 2 + 1 < end; root = child) {
		child = root * 2 + 1;

		/* prefer the right child when it compares greater than the left */
		if (child + 1 < end &&
		    do_cmp(base, n, size, cmp_func, child, child + 1) < 0)
			child++;

		if (do_cmp(base, n, size, cmp_func, root, child) >= 0)
			break;

		do_swap(base, n, size, swap_func, root, child);
	}
}

/*
 * Heapsort the @n elements of @size bytes at @base.
 *
 * Every comparison and swap goes through do_cmp()/do_swap(), which map the
 * heap indices through inorder_to_eytzinger0() before touching the array, so
 * the sorted order is expressed in terms of those translated positions.
 *
 * @cmp_func: three-way comparison; receives two element pointers and @size.
 * @swap_func: element swap; when NULL, a 32-bit or 64-bit word swap is used
 *             if @size is 4 or 8 and alignment_ok() accepts @base, and a
 *             bytewise swap otherwise.
 */
void eytzinger0_sort(void *base, size_t n, size_t size,
		     int (*cmp_func)(const void *, const void *, size_t),
		     void (*swap_func)(void *, void *, size_t))
{
	int pos;

	if (!swap_func) {
		swap_func = generic_swap;

		if (size == 4 && alignment_ok(base, 4))
			swap_func = u32_swap;
		else if (size == 8 && alignment_ok(base, 8))
			swap_func = u64_swap;
	}

	/* heapify: sift every non-leaf position, last one first */
	for (pos = n / 2 - 1; pos >= 0; --pos)
		eytzinger0_sift_down(base, n, size, cmp_func, swap_func, pos, n);

	/* sort: move the current maximum to the end, then restore the heap */
	for (pos = n - 1; pos > 0; --pos) {
		do_swap(base, n, size, swap_func, 0, pos);
		eytzinger0_sift_down(base, n, size, cmp_func, swap_func, 0, pos);
	}
}
/*
 * Sift the element at byte offset @root down a max-heap that occupies the
 * byte range [0, @end) of @base, with elements of @size bytes.
 */
static void sort_cmp_size_sift_down(void *base, size_t size,
				    int (*cmp_func)(const void *, const void *, size_t),
				    void (*swap_func)(void *, void *, size_t),
				    size_t root, size_t end)
{
	size_t child;

	for (; root * 2 + size < end; root = child) {
		child = root * 2 + size;

		/* pick the larger of the two children when both exist */
		if (child + size < end &&
		    cmp_func(base + child, base + child + size, size) < 0)
			child += size;

		if (cmp_func(base + root, base + child, size) >= 0)
			break;

		swap_func(base + root, base + child, size);
	}
}

/*
 * sort_cmp_size - in-place heapsort of an array of fixed-size elements
 * @base: start of the array
 * @num: number of elements
 * @size: size of each element in bytes
 * @cmp_func: three-way comparison; receives two element pointers and @size
 * @swap_func: element swap, or NULL to have one chosen: a 32-bit or 64-bit
 *             word swap when @size is 4 or 8 and alignment_ok() accepts
 *             @base, a bytewise swap otherwise
 *
 * On return the elements are in ascending order according to @cmp_func.
 *
 * Offsets are pre-scaled to bytes and held in size_t, so arrays whose total
 * size does not fit in an int are handled correctly. Arrays with fewer than
 * two elements, or with zero-sized elements, are returned untouched.
 */
void sort_cmp_size(void *base, size_t num, size_t size,
		   int (*cmp_func)(const void *, const void *, size_t),
		   void (*swap_func)(void *, void *, size_t size))
{
	const size_t n = num * size;
	size_t i;

	if (num < 2 || !size)
		return;

	if (!swap_func) {
		if (size == 4 && alignment_ok(base, 4))
			swap_func = u32_swap;
		else if (size == 8 && alignment_ok(base, 8))
			swap_func = u64_swap;
		else
			swap_func = generic_swap;
	}

	/* heapify: sift every non-leaf element, last one first */
	for (i = (num / 2) * size; i > 0; ) {
		i -= size;
		sort_cmp_size_sift_down(base, size, cmp_func, swap_func, i, n);
	}

	/* sort: move the current maximum to the end, then restore the heap */
	for (i = n - size; i > 0; i -= size) {
		swap_func(base, base + i, size);
		sort_cmp_size_sift_down(base, size, cmp_func, swap_func, 0, i);
	}
}
#if 0 #if 0
void eytzinger1_test(void) void eytzinger1_test(void)
{ {
......
...@@ -631,10 +631,6 @@ static inline void memset_u64s_tail(void *s, int c, unsigned bytes) ...@@ -631,10 +631,6 @@ static inline void memset_u64s_tail(void *s, int c, unsigned bytes)
memset(s + bytes, c, rem); memset(s + bytes, c, rem);
} }
/*
 * Sort the @num elements of @size bytes at @base in place, ordering them
 * with @cmp_func (which is also passed @size). @swap_func exchanges two
 * elements; the definition selects a default swap routine when it is NULL.
 */
void sort_cmp_size(void *base, size_t num, size_t size,
int (*cmp_func)(const void *, const void *, size_t),
void (*swap_func)(void *, void *, size_t));
/* just the memmove, doesn't update @_nr */ /* just the memmove, doesn't update @_nr */
#define __array_insert_item(_array, _nr, _pos) \ #define __array_insert_item(_array, _nr, _pos) \
memmove(&(_array)[(_pos) + 1], \ memmove(&(_array)[(_pos) + 1], \
...@@ -797,4 +793,14 @@ static inline void __set_bit_le64(size_t bit, __le64 *addr) ...@@ -797,4 +793,14 @@ static inline void __set_bit_le64(size_t bit, __le64 *addr)
addr[bit / 64] |= cpu_to_le64(BIT_ULL(bit % 64)); addr[bit / 64] |= cpu_to_le64(BIT_ULL(bit % 64));
} }
/*
 * Clear bit number @bit in the bitmap at @addr, which is stored as an array
 * of little-endian 64-bit words. Non-atomic.
 *
 * The mask must be inverted with bitwise '~': a logical '!' of the non-zero
 * mask evaluates to 0 and would wipe the whole 64-bit word instead of the
 * single requested bit.
 */
static inline void __clear_bit_le64(size_t bit, __le64 *addr)
{
	addr[bit / 64] &= ~cpu_to_le64(BIT_ULL(bit % 64));
}
/*
 * Report whether bit number @bit is set in the bitmap at @addr, which is
 * stored as an array of little-endian 64-bit words.
 *
 * Returns true when the bit is set, false otherwise.
 */
static inline bool test_bit_le64(size_t bit, __le64 *addr)
{
	const __le64 mask = cpu_to_le64(BIT_ULL(bit % 64));

	return !!(addr[bit / 64] & mask);
}
#endif /* _BCACHEFS_UTIL_H */ #endif /* _BCACHEFS_UTIL_H */
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment