Commit ef1669ff, authored and committed by Kent Overstreet

bcachefs: Update fsck for snapshots

This updates the fsck algorithms to handle snapshots - meaning there
will be multiple versions of the same key (extents, inodes, dirents,
xattrs) in different snapshots, and we have to carefully consider which
keys are visible in which snapshot.
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
parent 6fed42bb
......@@ -61,7 +61,7 @@ int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *,
struct disk_reservation *, u64 *, int flags);
int bch2_btree_delete_range_trans(struct btree_trans *, enum btree_id,
struct bpos, struct bpos, u64 *);
struct bpos, struct bpos, unsigned, u64 *);
int bch2_btree_delete_range(struct bch_fs *, enum btree_id,
struct bpos, struct bpos, u64 *);
......
......@@ -1204,13 +1204,14 @@ int bch2_btree_delete_at(struct btree_trans *trans,
int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id,
struct bpos start, struct bpos end,
unsigned iter_flags,
u64 *journal_seq)
{
struct btree_iter iter;
struct bkey_s_c k;
int ret = 0;
bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_INTENT);
bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_INTENT|iter_flags);
retry:
while ((bch2_trans_begin(trans),
(k = bch2_btree_iter_peek(&iter)).k) &&
......@@ -1277,5 +1278,5 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id,
u64 *journal_seq)
{
return bch2_trans_do(c, NULL, journal_seq, 0,
bch2_btree_delete_range_trans(&trans, id, start, end, journal_seq));
bch2_btree_delete_range_trans(&trans, id, start, end, 0, journal_seq));
}
......@@ -18,7 +18,8 @@
#define QSTR(n) { { { .len = strlen(n) } }, .name = n }
static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum)
static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum,
u32 snapshot)
{
struct btree_iter iter;
struct bkey_s_c k;
......@@ -26,7 +27,7 @@ static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum)
int ret;
for_each_btree_key(trans, iter, BTREE_ID_extents,
POS(inum, 0), 0, k, ret) {
SPOS(inum, 0, snapshot), 0, k, ret) {
if (k.k->p.inode != inum)
break;
......@@ -39,6 +40,33 @@ static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum)
return ret ?: sectors;
}
/*
 * Count the DT_DIR dirents keyed under directory @inum, starting the walk of
 * the dirents btree at SPOS(@inum, 0, @snapshot).
 *
 * Returns the number of subdirectory dirents seen, or the nonzero value left
 * in the iteration's error variable.
 */
static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum,
			      u32 snapshot)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	u64 nr_dirs = 0;
	int ret;

	for_each_btree_key(trans, iter, BTREE_ID_dirents,
			   SPOS(inum, 0, snapshot), 0, k, ret) {
		/* Ran past the last key belonging to this directory */
		if (k.k->p.inode != inum)
			break;

		/* Only real dirents that point at a directory are counted */
		if (k.k->type == KEY_TYPE_dirent &&
		    bkey_s_c_to_dirent(k).v->d_type == DT_DIR)
			nr_dirs++;
	}
	bch2_trans_iter_exit(trans, &iter);

	if (ret)
		return ret;

	return nr_dirs;
}
static int __snapshot_lookup_subvol(struct btree_trans *trans, u32 snapshot,
u32 *subvol)
{
......@@ -72,8 +100,8 @@ static int snapshot_lookup_subvol(struct btree_trans *trans, u32 snapshot,
return lockrestart_do(trans, __snapshot_lookup_subvol(trans, snapshot, subvol));
}
static int __subvol_lookup_root(struct btree_trans *trans, u32 subvol,
u64 *inum)
static int __subvol_lookup(struct btree_trans *trans, u32 subvol,
u32 *snapshot, u64 *inum)
{
struct btree_iter iter;
struct bkey_s_c k;
......@@ -92,6 +120,7 @@ static int __subvol_lookup_root(struct btree_trans *trans, u32 subvol,
goto err;
}
*snapshot = le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot);
*inum = le64_to_cpu(bkey_s_c_to_subvolume(k).v->inode);
err:
bch2_trans_iter_exit(trans, &iter);
......@@ -99,9 +128,10 @@ static int __subvol_lookup_root(struct btree_trans *trans, u32 subvol,
}
static int subvol_lookup_root(struct btree_trans *trans, u32 subvol, u64 *inum)
static int subvol_lookup(struct btree_trans *trans, u32 subvol,
u32 *snapshot, u64 *inum)
{
return lockrestart_do(trans, __subvol_lookup_root(trans, subvol, inum));
return lockrestart_do(trans, __subvol_lookup(trans, subvol, snapshot, inum));
}
static int __lookup_inode(struct btree_trans *trans, u64 inode_nr,
......@@ -113,14 +143,13 @@ static int __lookup_inode(struct btree_trans *trans, u64 inode_nr,
int ret;
bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes,
POS(0, inode_nr), 0);
SPOS(0, inode_nr, *snapshot), 0);
k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
if (ret)
goto err;
if (snapshot)
*snapshot = iter.pos.snapshot;
*snapshot = iter.pos.snapshot;
ret = k.k->type == KEY_TYPE_inode
? bch2_inode_unpack(bkey_s_c_to_inode(k), inode)
: -ENOENT;
......@@ -136,6 +165,36 @@ static int lookup_inode(struct btree_trans *trans, u64 inode_nr,
return lockrestart_do(trans, __lookup_inode(trans, inode_nr, inode, snapshot));
}
/*
 * Look up @name in directory @dir via the dirent hash and, on success, report
 * the inode number and d_type stored in the dirent through @target and @type.
 *
 * Returns 0 on success or the nonzero result of bch2_hash_lookup(); the
 * outputs are only written on success.
 */
static int __lookup_dirent(struct btree_trans *trans,
			   struct bch_hash_info hash_info,
			   subvol_inum dir, struct qstr *name,
			   u64 *target, unsigned *type)
{
	struct btree_iter iter;
	struct bkey_s_c_dirent dirent;
	int ret;

	ret = bch2_hash_lookup(trans, &iter, bch2_dirent_hash_desc,
			       &hash_info, dir, name, 0);
	if (!ret) {
		/* The iterator is only released on the path where the lookup succeeded */
		dirent	= bkey_s_c_to_dirent(bch2_btree_iter_peek_slot(&iter));
		*target	= le64_to_cpu(dirent.v->d_inum);
		*type	= dirent.v->d_type;
		bch2_trans_iter_exit(trans, &iter);
	}

	return ret;
}
/*
 * Wrapper that runs __lookup_dirent() under lockrestart_do(); arguments and
 * return value are those of __lookup_dirent().
 */
static int lookup_dirent(struct btree_trans *trans,
			 struct bch_hash_info hash_info,
			 subvol_inum dir, struct qstr *name,
			 u64 *target, unsigned *type)
{
	int ret;

	ret = lockrestart_do(trans,
		__lookup_dirent(trans, hash_info, dir, name, target, type));

	return ret;
}
static int __write_inode(struct btree_trans *trans,
struct bch_inode_unpacked *inode,
u32 snapshot)
......@@ -166,6 +225,71 @@ static int write_inode(struct btree_trans *trans,
return ret;
}
/*
 * Remove inode @inum in snapshot @snapshot on behalf of fsck: delete its
 * extents, dirents and xattrs, then replace the inode key itself with an
 * inode_generation key carrying bi_generation + 1.
 *
 * Returns 0 on success, -EIO if no inode key is found at that position, or
 * another error from the btree operations. -EINTR from the inode-key phase is
 * retried internally.
 */
static int fsck_inode_rm(struct btree_trans *trans, u64 inum, u32 snapshot)
{
/* Zero-initialised so the shared err: path can exit it before it is set up */
struct btree_iter iter = { NULL };
struct bkey_i_inode_generation delete;
struct bch_inode_unpacked inode_u;
struct bkey_s_c k;
int ret;
/* Drop everything keyed by this inode in this snapshot, one btree at a time */
ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
SPOS(inum, 0, snapshot),
SPOS(inum, U64_MAX, snapshot),
0, NULL) ?:
bch2_btree_delete_range_trans(trans, BTREE_ID_dirents,
SPOS(inum, 0, snapshot),
SPOS(inum, U64_MAX, snapshot),
0, NULL) ?:
bch2_btree_delete_range_trans(trans, BTREE_ID_xattrs,
SPOS(inum, 0, snapshot),
SPOS(inum, U64_MAX, snapshot),
0, NULL);
if (ret)
goto err;
retry:
bch2_trans_begin(trans);
bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes,
SPOS(0, inum, snapshot), BTREE_ITER_INTENT);
k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
if (ret)
goto err;
if (k.k->type != KEY_TYPE_inode) {
bch2_fs_inconsistent(trans->c,
"inode %llu:%u not found when deleting",
inum, snapshot);
ret = -EIO;
goto err;
}
/* NOTE(review): the return value of bch2_inode_unpack() is not checked here */
bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u);
/* Subvolume root? */
if (inode_u.bi_subvol) {
ret = bch2_subvolume_delete(trans, inode_u.bi_subvol, -1);
if (ret)
goto err;
}
/* Overwrite the inode slot with a generation key one past the old generation */
bkey_inode_generation_init(&delete.k_i);
delete.k.p = iter.pos;
delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1);
ret = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?:
bch2_trans_commit(trans, NULL, NULL,
BTREE_INSERT_NOFAIL);
err:
bch2_trans_iter_exit(trans, &iter);
/*
 * Any -EINTR reaching this point restarts at retry:, which redoes only the
 * inode-key phase, not the range deletions above.
 */
if (ret == -EINTR)
goto retry;
return ret;
}
static int __remove_dirent(struct btree_trans *trans, struct bpos pos)
{
struct bch_fs *c = trans->c;
......@@ -200,32 +324,49 @@ static int remove_dirent(struct btree_trans *trans, struct bpos pos)
}
/* Get lost+found, create if it doesn't exist: */
static int lookup_lostfound(struct btree_trans *trans,
u32 subvol,
static int lookup_lostfound(struct btree_trans *trans, u32 subvol,
struct bch_inode_unpacked *lostfound)
{
struct bch_fs *c = trans->c;
struct bch_inode_unpacked root;
struct bch_hash_info root_hash_info;
struct qstr lostfound_str = QSTR("lost+found");
u64 inum;
subvol_inum root_inum = { .subvol = subvol };
u64 inum = 0;
unsigned d_type = 0;
u32 snapshot;
int ret;
ret = subvol_lookup_root(trans, subvol, &inum);
ret = subvol_lookup(trans, subvol, &snapshot, &root_inum.inum);
if (ret)
return ret;
ret = lookup_inode(trans, inum, &root, &snapshot);
if (ret && ret != -ENOENT)
ret = lookup_inode(trans, root_inum.inum, &root, &snapshot);
if (ret) {
bch_err(c, "error fetching subvol root: %i", ret);
return ret;
}
root_hash_info = bch2_hash_info_init(c, &root);
inum = bch2_dirent_lookup(c, root.bi_inum, &root_hash_info,
&lostfound_str);
if (!inum) {
ret = lookup_dirent(trans, root_hash_info, root_inum,
&lostfound_str, &inum, &d_type);
if (ret == -ENOENT) {
bch_notice(c, "creating lost+found");
goto create_lostfound;
}
if (ret) {
bch_err(c, "error looking up lost+found: %i", ret);
return ret;
}
if (d_type != DT_DIR) {
bch_err(c, "error looking up lost+found: not a directory");
return ret;
}
ret = lookup_inode(trans, inum, lostfound, &snapshot);
if (ret && ret != -ENOENT) {
/*
......@@ -243,11 +384,9 @@ static int lookup_lostfound(struct btree_trans *trans,
ret = __bch2_trans_do(trans, NULL, NULL,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_LAZY_RW,
bch2_create_trans(trans,
BCACHEFS_ROOT_INO, &root,
lostfound,
&lostfound_str,
0, 0, S_IFDIR|0700, 0, NULL, NULL));
bch2_create_trans(trans, root_inum, &root,
lostfound, &lostfound_str,
0, 0, S_IFDIR|0700, 0, NULL, NULL, 0));
if (ret)
bch_err(c, "error creating lost+found: %i", ret);
}
......@@ -257,7 +396,7 @@ static int lookup_lostfound(struct btree_trans *trans,
static int reattach_inode(struct btree_trans *trans,
struct bch_inode_unpacked *inode,
u32 snapshot)
u32 inode_snapshot)
{
struct bch_hash_info dir_hash;
struct bch_inode_unpacked lostfound;
......@@ -267,7 +406,7 @@ static int reattach_inode(struct btree_trans *trans,
u32 subvol;
int ret;
ret = snapshot_lookup_subvol(trans, snapshot, &subvol);
ret = snapshot_lookup_subvol(trans, inode_snapshot, &subvol);
if (ret)
return ret;
......@@ -289,10 +428,15 @@ static int reattach_inode(struct btree_trans *trans,
name = (struct qstr) QSTR(name_buf);
ret = __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_LAZY_RW,
bch2_dirent_create(trans, lostfound.bi_inum, &dir_hash,
mode_to_type(inode->bi_mode),
&name, inode->bi_inum, &dir_offset,
BCH_HASH_SET_MUST_CREATE));
bch2_dirent_create(trans,
(subvol_inum) {
.subvol = subvol,
.inum = lostfound.bi_inum,
},
&dir_hash,
mode_to_type(inode->bi_mode),
&name, inode->bi_inum, &dir_offset,
BCH_HASH_SET_MUST_CREATE));
if (ret) {
bch_err(trans->c, "error %i reattaching inode %llu",
ret, inode->bi_inum);
......@@ -302,7 +446,7 @@ static int reattach_inode(struct btree_trans *trans,
inode->bi_dir = lostfound.bi_inum;
inode->bi_dir_offset = dir_offset;
return write_inode(trans, inode, U32_MAX);
return write_inode(trans, inode, inode_snapshot);
}
static int remove_backpointer(struct btree_trans *trans,
......@@ -329,45 +473,287 @@ static int remove_backpointer(struct btree_trans *trans,
return ret;
}
/*
 * Growable list of snapshot IDs recorded by snapshots_seen_update() for one
 * key position.
 */
struct snapshots_seen {
/* Position most recently passed to snapshots_seen_update(), snapshot field converted to its equiv ID */
struct bpos pos;
/* Number of valid entries in @d */
size_t nr;
/* Number of entries @d has room for */
size_t size;
/* krealloc()ed array of snapshot IDs, released by snapshots_seen_exit() */
u32 *d;
};
static void snapshots_seen_exit(struct snapshots_seen *s)
{
kfree(s->d);
s->d = NULL;
}
/* Reset @s to the all-zero state: no table allocated, no entries recorded. */
static void snapshots_seen_init(struct snapshots_seen *s)
{
	memset(s, 0, sizeof(struct snapshots_seen));
}
static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, struct bpos pos)
{
pos.snapshot = snapshot_t(c, pos.snapshot)->equiv;
if (bkey_cmp(s->pos, pos))
s->nr = 0;
s->pos = pos;
if (s->nr == s->size) {
size_t new_size = max(s->size, 128UL) * 2;
u32 *d = krealloc(s->d, new_size * sizeof(s->d[0]), GFP_KERNEL);
if (!d) {
bch_err(c, "error reallocating snapshots_seen table (new size %zu)",
new_size);
return -ENOMEM;
}
s->size = new_size;
s->d = d;
}
/* Might get called multiple times due to lock restarts */
if (s->nr && s->d[s->nr - 1] == pos.snapshot)
return 0;
s->d[s->nr++] = pos.snapshot;
return 0;
}
/**
 * key_visible_in_snapshot - test whether a key in snapshot @ancestor is
 * visible in snapshot @id
 *
 * True when @id equals @ancestor or is a descendent of it, and no snapshot
 * already recorded in @seen lies between the two (such a snapshot would have
 * overwritten the key as far as @id is concerned).
 *
 * @ancestor must be the snapshot most recently added to @seen, and @id must
 * not be greater than @ancestor; both are enforced with BUG_ON().
 */
static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen,
				    u32 id, u32 ancestor)
{
	ssize_t idx;

	BUG_ON(id > ancestor);

	id		= snapshot_t(c, id)->equiv;
	ancestor	= snapshot_t(c, ancestor)->equiv;

	/* @ancestor should be the snapshot most recently added to @seen */
	BUG_ON(!seen->nr || seen->d[seen->nr - 1] != ancestor);
	BUG_ON(seen->pos.snapshot != ancestor);

	if (id == ancestor)
		return true;

	if (!bch2_snapshot_is_ancestor(c, id, ancestor))
		return false;

	/* Walk the earlier entries, newest first, while they are still >= @id */
	idx = seen->nr - 2;
	while (idx >= 0 && seen->d[idx] >= id) {
		u32 between = seen->d[idx];

		if (bch2_snapshot_is_ancestor(c, id, between) &&
		    bch2_snapshot_is_ancestor(c, between, ancestor))
			return false;

		--idx;
	}

	return true;
}
/**
 * ref_visible - given a key with snapshot id @src that points to a key with
 * snapshot id @dst, test whether there is some snapshot in which @dst is
 * visible.
 *
 * This assumes we're visiting @src keys in natural key order.
 *
 * @s	- list of snapshot IDs already seen at @src
 * @src	- snapshot ID of src key
 * @dst	- snapshot ID of dst key
 *
 * When @dst > @src the answer is whether @dst is an ancestor of @src;
 * otherwise it is key_visible_in_snapshot() of @src as seen from @dst.
 */
static int ref_visible(struct bch_fs *c, struct snapshots_seen *s,
		       u32 src, u32 dst)
{
	if (dst > src)
		return bch2_snapshot_is_ancestor(c, src, dst);

	return key_visible_in_snapshot(c, s, dst, src);
}
/*
 * Iterate @_i over the entries of inode walker @_w, stopping at the first
 * entry whose snapshot ID is greater than @_snapshot, and run the loop body
 * only for entries where key_visible_in_snapshot(@_c, @_s, entry snapshot,
 * @_snapshot) is true.
 *
 * @_i may be any lvalue expression of entry-pointer type; every use is
 * parenthesized. The macro expands to a for/if pair, so the statement that
 * follows is the body of the if.
 */
#define for_each_visible_inode(_c, _s, _w, _snapshot, _i)			\
	for ((_i) = (_w)->d;							\
	     (_i) < (_w)->d + (_w)->nr && (_i)->snapshot <= (_snapshot);	\
	     (_i)++)								\
		if (key_visible_in_snapshot(_c, _s, (_i)->snapshot, _snapshot))
struct inode_walker {
bool first_this_inode;
bool have_inode;
u64 cur_inum;
u32 snapshot;
struct bch_inode_unpacked inode;
bool first_this_inode;
u64 cur_inum;
size_t nr;
size_t size;
struct inode_walker_entry {
struct bch_inode_unpacked inode;
u32 snapshot;
u64 count;
} *d;
};
/* Free the entry array of @w and leave its pointer cleared. */
static void inode_walker_exit(struct inode_walker *w)
{
	void *entries = w->d;

	w->d = NULL;
	kfree(entries);
}
static struct inode_walker inode_walker_init(void)
{
return (struct inode_walker) {
.cur_inum = -1,
.have_inode = false,
return (struct inode_walker) { 0, };
}
/*
 * Make sure @w has room for one more entry: when the array is full, grow it
 * to the larger of 8 entries and twice its current size.
 *
 * Returns 0 on success (including when no growth was needed), -ENOMEM if the
 * reallocation failed; on failure the existing array is left in place.
 */
static int inode_walker_realloc(struct inode_walker *w)
{
	size_t grown;
	void *entries;

	if (w->nr != w->size)
		return 0;

	grown	= max_t(size_t, 8UL, w->size * 2);
	entries	= krealloc(w->d, grown * sizeof(w->d[0]), GFP_KERNEL);
	if (!entries)
		return -ENOMEM;

	w->d	= entries;
	w->size	= grown;
	return 0;
}
static int add_inode(struct bch_fs *c, struct inode_walker *w,
struct bkey_s_c_inode inode)
{
struct bch_inode_unpacked u;
int ret;
ret = inode_walker_realloc(w);
if (ret)
return ret;
BUG_ON(bch2_inode_unpack(inode, &u));
w->d[w->nr++] = (struct inode_walker_entry) {
.inode = u,
.snapshot = snapshot_t(c, inode.k->p.snapshot)->equiv,
};
return 0;
}
static int __walk_inode(struct btree_trans *trans,
struct inode_walker *w, u64 inum)
struct inode_walker *w, struct bpos pos)
{
if (inum != w->cur_inum) {
int ret = __lookup_inode(trans, inum, &w->inode, &w->snapshot);
struct bch_fs *c = trans->c;
struct btree_iter iter;
struct bkey_s_c k;
unsigned i, ancestor_pos;
int ret;
if (ret && ret != -ENOENT)
return ret;
pos.snapshot = snapshot_t(c, pos.snapshot)->equiv;
w->have_inode = !ret;
w->cur_inum = inum;
w->first_this_inode = true;
} else {
if (pos.inode == w->cur_inum) {
w->first_this_inode = false;
goto lookup_snapshot;
}
return 0;
w->nr = 0;
for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, pos.inode),
BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
if (k.k->p.offset != pos.inode)
break;
if (k.k->type == KEY_TYPE_inode)
add_inode(c, w, bkey_s_c_to_inode(k));
}
bch2_trans_iter_exit(trans, &iter);
if (ret)
return ret;
w->cur_inum = pos.inode;
w->first_this_inode = true;
lookup_snapshot:
for (i = 0; i < w->nr; i++)
if (bch2_snapshot_is_ancestor(c, pos.snapshot, w->d[i].snapshot))
goto found;
return INT_MAX;
found:
BUG_ON(pos.snapshot > w->d[i].snapshot);
if (pos.snapshot != w->d[i].snapshot) {
ancestor_pos = i;
while (i && w->d[i - 1].snapshot > pos.snapshot)
--i;
ret = inode_walker_realloc(w);
if (ret)
return ret;
array_insert_item(w->d, w->nr, i, w->d[ancestor_pos]);
w->d[i].snapshot = pos.snapshot;
w->d[i].count = 0;
}
return i;
}
static int walk_inode(struct btree_trans *trans,
struct inode_walker *w, u64 inum)
struct inode_walker *w, struct bpos pos)
{
return lockrestart_do(trans, __walk_inode(trans, w, inum));
return lockrestart_do(trans, __walk_inode(trans, w, pos));
}
/*
 * Rebuild @w with the versions of inode @inum, across all snapshots, for
 * which ref_visible() is true relative to the snapshot of the position
 * currently recorded in @s.
 *
 * The walk stops after adding a version whose snapshot ID is >= that
 * snapshot.
 *
 * Returns 0 on success, the btree iteration error, or the error from
 * add_inode() (e.g. -ENOMEM) if an entry could not be added.
 */
static int __get_visible_inodes(struct btree_trans *trans,
				struct inode_walker *w,
				struct snapshots_seen *s,
				u64 inum)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	w->nr = 0;

	for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, inum),
			   BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
		if (k.k->p.offset != inum)
			break;

		if (k.k->type != KEY_TYPE_inode)
			continue;

		if (!ref_visible(c, s, s->pos.snapshot, k.k->p.snapshot))
			continue;

		/* Don't silently lose an inode version on allocation failure */
		ret = add_inode(c, w, bkey_s_c_to_inode(k));
		if (ret)
			break;

		if (k.k->p.snapshot >= s->pos.snapshot)
			break;
	}
	bch2_trans_iter_exit(trans, &iter);

	return ret;
}
/*
 * Check that the snapshot ID of key @k has a nonzero equiv entry in the
 * snapshot table; if it does not, report "key in missing snapshot" and, when
 * fsck_err_on() evaluates true, delete the key at @iter.
 *
 * Returns 0 when the key is fine. After a deletion, returns the deletion's
 * error, or -EINTR if it succeeded.
 * NOTE(review): fsck_err_on() is defined elsewhere; it presumably can also
 * jump to the fsck_err label with ret set - confirm.
 */
static int check_key_has_snapshot(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c k)
{
struct bch_fs *c = trans->c;
char buf[200];
int ret = 0;
if (fsck_err_on(!snapshot_t(c, k.k->p.snapshot)->equiv, c,
"key in missing snapshot: %s",
(bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) {
ret = __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_LAZY_RW,
bch2_btree_delete_at(trans, iter,
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE));
/* A successful delete still reports -EINTR rather than 0 */
return ret ?: -EINTR;
}
fsck_err:
return ret;
}
static int hash_redo_key(struct btree_trans *trans,
......@@ -375,6 +761,9 @@ static int hash_redo_key(struct btree_trans *trans,
struct bch_hash_info *hash_info,
struct btree_iter *k_iter, struct bkey_s_c k)
{
bch_err(trans->c, "hash_redo_key() not implemented yet");
return -EINVAL;
#if 0
struct bkey_i *delete;
struct bkey_i *tmp;
......@@ -393,6 +782,7 @@ static int hash_redo_key(struct btree_trans *trans,
return bch2_btree_iter_traverse(k_iter) ?:
bch2_trans_update(trans, k_iter, delete, 0) ?:
bch2_hash_set(trans, desc, hash_info, k_iter->pos.inode, tmp, 0);
#endif
}
static int fsck_hash_delete_at(struct btree_trans *trans,
......@@ -484,30 +874,29 @@ static int hash_check_key(struct btree_trans *trans,
static int check_inode(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c_inode inode)
struct bch_inode_unpacked *prev,
struct bch_inode_unpacked u)
{
struct bch_fs *c = trans->c;
struct bch_inode_unpacked u;
bool do_update = false;
int ret = 0;
ret = bch2_inode_unpack(inode, &u);
if (bch2_fs_inconsistent_on(ret, c,
"error unpacking inode %llu in fsck",
inode.k->p.inode))
return ret;
if (fsck_err_on(prev &&
(prev->bi_hash_seed != u.bi_hash_seed ||
mode_to_type(prev->bi_mode) != mode_to_type(u.bi_mode)), c,
"inodes in different snapshots don't match")) {
bch_err(c, "repair not implemented yet");
return -EINVAL;
}
if (u.bi_flags & BCH_INODE_UNLINKED &&
(!c->sb.clean ||
fsck_err(c, "filesystem marked clean, but inode %llu unlinked",
u.bi_inum))) {
bch_verbose(c, "deleting inode %llu", u.bi_inum);
bch2_trans_unlock(trans);
bch2_fs_lazy_rw(c);
ret = bch2_inode_rm(c, u.bi_inum, false);
ret = fsck_inode_rm(trans, u.bi_inum, iter->pos.snapshot);
if (ret)
bch_err(c, "error in fsck: error %i while deleting inode", ret);
return ret;
......@@ -527,9 +916,10 @@ static int check_inode(struct btree_trans *trans,
* just switch units to bytes and that issue goes away
*/
ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
POS(u.bi_inum, round_up(u.bi_size, block_bytes(c)) >> 9),
SPOS(u.bi_inum, round_up(u.bi_size, block_bytes(c)) >> 9,
iter->pos.snapshot),
POS(u.bi_inum, U64_MAX),
NULL);
0, NULL);
if (ret) {
bch_err(c, "error in fsck: error %i truncating inode", ret);
return ret;
......@@ -554,7 +944,7 @@ static int check_inode(struct btree_trans *trans,
bch_verbose(c, "recounting sectors for inode %llu",
u.bi_inum);
sectors = bch2_count_inode_sectors(trans, u.bi_inum);
sectors = bch2_count_inode_sectors(trans, u.bi_inum, iter->pos.snapshot);
if (sectors < 0) {
bch_err(c, "error in fsck: error %i recounting inode sectors",
(int) sectors);
......@@ -574,11 +964,7 @@ static int check_inode(struct btree_trans *trans,
}
if (do_update) {
ret = __bch2_trans_do(trans, NULL, NULL,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_LAZY_RW,
bch2_btree_iter_traverse(iter) ?:
bch2_inode_write(trans, iter, &u));
ret = write_inode(trans, &u, iter->pos.snapshot);
if (ret)
bch_err(c, "error in fsck: error %i "
"updating inode", ret);
......@@ -594,26 +980,49 @@ static int check_inodes(struct bch_fs *c, bool full)
struct btree_iter iter;
struct bkey_s_c k;
struct bkey_s_c_inode inode;
struct bch_inode_unpacked prev, u;
int ret;
memset(&prev, 0, sizeof(prev));
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN,
BTREE_ITER_INTENT|
BTREE_ITER_PREFETCH, k, ret) {
BTREE_ITER_PREFETCH|
BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
ret = check_key_has_snapshot(&trans, &iter, k);
if (ret)
break;
/*
* if snapshot id isn't a leaf node, skip it - deletion in
* particular is not atomic, so on the internal snapshot nodes
* we can see inodes marked for deletion after a clean shutdown
*/
if (bch2_snapshot_internal_node(c, k.k->p.snapshot))
continue;
if (k.k->type != KEY_TYPE_inode)
continue;
inode = bkey_s_c_to_inode(k);
if (full ||
(inode.v->bi_flags & (BCH_INODE_I_SIZE_DIRTY|
BCH_INODE_I_SECTORS_DIRTY|
BCH_INODE_UNLINKED))) {
ret = check_inode(&trans, &iter, inode);
if (ret)
break;
}
if (!full &&
!(inode.v->bi_flags & (BCH_INODE_I_SIZE_DIRTY|
BCH_INODE_I_SECTORS_DIRTY|
BCH_INODE_UNLINKED)))
continue;
BUG_ON(bch2_inode_unpack(inode, &u));
ret = check_inode(&trans, &iter,
full && prev.bi_inum == u.bi_inum
? &prev : NULL, u);
if (ret)
break;
prev = u;
}
bch2_trans_iter_exit(&trans, &iter);
......@@ -622,6 +1031,29 @@ static int check_inodes(struct bch_fs *c, bool full)
return bch2_trans_exit(&trans) ?: ret;
}
/*
 * Walk every key in the subvolumes btree. The loop body is empty, so no
 * per-key checks are performed; the function only returns whatever the
 * iteration left in ret.
 */
noinline_for_stack
static int check_subvols(struct bch_fs *c)
{
struct btree_trans trans;
struct btree_iter iter;
struct bkey_s_c k;
int ret;
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
for_each_btree_key(&trans, iter, BTREE_ID_subvolumes, POS_MIN,
0, k, ret) {
}
bch2_trans_iter_exit(&trans, &iter);
/* The return value of bch2_trans_exit() is not folded into the result here */
bch2_trans_exit(&trans);
return ret;
}
/*
* Checking for overlapping extents needs to be reimplemented
*/
#if 0
static int fix_overlapping_extent(struct btree_trans *trans,
struct bkey_s_c k, struct bpos cut_at)
{
......@@ -638,55 +1070,195 @@ static int fix_overlapping_extent(struct btree_trans *trans,
bch2_cut_front(cut_at, u);
/*
* We don't want to go through the extent_handle_overwrites path:
*
* XXX: this is going to screw up disk accounting, extent triggers
* assume things about extent overwrites - we should be running the
* triggers manually here
*/
bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, u->k.p,
BTREE_ITER_INTENT|BTREE_ITER_NOT_EXTENTS);
/*
* We don't want to go through the extent_handle_overwrites path:
*
* XXX: this is going to screw up disk accounting, extent triggers
* assume things about extent overwrites - we should be running the
* triggers manually here
*/
bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, u->k.p,
BTREE_ITER_INTENT|BTREE_ITER_NOT_EXTENTS);
BUG_ON(iter.flags & BTREE_ITER_IS_EXTENTS);
ret = bch2_btree_iter_traverse(&iter) ?:
bch2_trans_update(trans, &iter, u, BTREE_TRIGGER_NORUN) ?:
bch2_trans_commit(trans, NULL, NULL,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_LAZY_RW);
bch2_trans_iter_exit(trans, &iter);
return ret;
}
#endif
/*
 * Check whether the dirent slot named by @inode's backpointer (bi_dir,
 * bi_dir_offset) in @snapshot holds a dirent that points back at @inode.
 *
 * Returns 1 if it does, 0 if the slot holds no dirent or the dirent points at
 * another inode, or the error from reading the slot.
 */
static int inode_backpointer_exists(struct btree_trans *trans,
				    struct bch_inode_unpacked *inode,
				    u32 snapshot)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents,
			     SPOS(inode->bi_dir, inode->bi_dir_offset, snapshot), 0);
	k = bch2_btree_iter_peek_slot(&iter);

	ret = bkey_err(k);
	if (!ret && k.k->type == KEY_TYPE_dirent)
		ret = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum) == inode->bi_inum;

	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
static bool inode_backpointer_matches(struct bkey_s_c_dirent d,
struct bch_inode_unpacked *inode)
{
return d.k->p.inode == inode->bi_dir &&
d.k->p.offset == inode->bi_dir_offset;
}
/*
 * For each inode version held in @w, compare the stored bi_sectors with the
 * count accumulated during the extent walk and repair mismatches.
 *
 * On a mismatch the extents are recounted with bch2_count_inode_sectors(); if
 * the recount disagrees with the walk's count, that is logged and the recount
 * is used. Inodes that are still wrong are rewritten with the corrected
 * value, unless the fsck error is ignored.
 *
 * Returns 0 if nothing was rewritten, a negative error from the recount or
 * from write_inode(), or -EINTR once at least one inode has been rewritten.
 */
static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
{
	struct bch_fs *c = trans->c;
	struct inode_walker_entry *i;
	int ret = 0, ret2 = 0;
	s64 count2;

	for (i = w->d; i < w->d + w->nr; i++) {
		if (i->inode.bi_sectors == i->count)
			continue;

		count2 = lockrestart_do(trans,
			bch2_count_inode_sectors(trans, w->cur_inum, i->snapshot));

		/* A negative value is an error code, never a sector count */
		if (count2 < 0) {
			ret = count2;
			break;
		}

		if (i->count != count2) {
			bch_err(c, "fsck counted i_sectors wrong: got %llu should be %llu",
				i->count, count2);
			i->count = count2;
			if (i->inode.bi_sectors == i->count)
				continue;
		}

		if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY), c,
				"inode %llu:%u has incorrect i_sectors: got %llu, should be %llu",
				w->cur_inum, i->snapshot,
				i->inode.bi_sectors, i->count) == FSCK_ERR_IGNORE)
			continue;

		i->inode.bi_sectors = i->count;
		ret = write_inode(trans, &i->inode, i->snapshot);
		if (ret)
			break;

		/* Remember that an inode was rewritten */
		ret2 = -EINTR;
	}
fsck_err:
	return ret ?: ret2;
}
static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
struct inode_walker *inode,
struct snapshots_seen *s)
{
struct bch_fs *c = trans->c;
struct bkey_s_c k;
struct inode_walker_entry *i;
char buf[200];
int ret = 0;
k = bch2_btree_iter_peek(iter);
if (!k.k)
return 0;
ret = bkey_err(k);
if (ret)
return ret;
ret = check_key_has_snapshot(trans, iter, k);
if (ret)
return ret;
ret = snapshots_seen_update(c, s, k.k->p);
if (ret)
return ret;
if (k.k->type == KEY_TYPE_whiteout)
return 0;
if (inode->cur_inum != k.k->p.inode) {
ret = check_i_sectors(trans, inode);
if (ret)
return ret;
}
#if 0
if (bkey_cmp(prev.k->k.p, bkey_start_pos(k.k)) > 0) {
char buf1[200];
char buf2[200];
bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev.k));
bch2_bkey_val_to_text(&PBUF(buf2), c, k);
if (fsck_err(c, "overlapping extents:\n%s\n%s", buf1, buf2))
return fix_overlapping_extent(trans, k, prev.k->k.p) ?: -EINTR;
}
#endif
ret = __walk_inode(trans, inode, k.k->p);
if (ret < 0)
return ret;
if (fsck_err_on(ret == INT_MAX, c,
"extent in missing inode:\n %s",
(bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)))
return __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_LAZY_RW,
bch2_btree_delete_at(trans, iter,
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE));
if (ret == INT_MAX)
return 0;
BUG_ON(iter.flags & BTREE_ITER_IS_EXTENTS);
ret = bch2_btree_iter_traverse(&iter) ?:
bch2_trans_update(trans, &iter, u, BTREE_TRIGGER_NORUN) ?:
bch2_trans_commit(trans, NULL, NULL,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_LAZY_RW);
bch2_trans_iter_exit(trans, &iter);
return ret;
}
i = inode->d + ret;
ret = 0;
static int inode_backpointer_exists(struct btree_trans *trans,
struct bch_inode_unpacked *inode)
{
struct btree_iter iter;
struct bkey_s_c k;
int ret;
if (fsck_err_on(!S_ISREG(i->inode.bi_mode) &&
!S_ISLNK(i->inode.bi_mode), c,
"extent in non regular inode mode %o:\n %s",
i->inode.bi_mode,
(bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)))
return __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_LAZY_RW,
bch2_btree_delete_at(trans, iter,
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE));
if (!bch2_snapshot_internal_node(c, k.k->p.snapshot)) {
for_each_visible_inode(c, s, inode, k.k->p.snapshot, i) {
if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) &&
k.k->type != KEY_TYPE_reservation &&
k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9, c,
"extent type %u offset %llu past end of inode %llu, i_size %llu",
k.k->type, k.k->p.offset, k.k->p.inode, i->inode.bi_size)) {
bch2_fs_lazy_rw(c);
return bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
SPOS(k.k->p.inode, round_up(i->inode.bi_size, block_bytes(c)) >> 9,
k.k->p.snapshot),
POS(k.k->p.inode, U64_MAX),
0, NULL) ?: -EINTR;
}
}
}
bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents,
POS(inode->bi_dir, inode->bi_dir_offset), 0);
k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
if (ret)
goto out;
if (k.k->type != KEY_TYPE_dirent)
goto out;
if (bkey_extent_is_allocation(k.k))
for_each_visible_inode(c, s, inode, k.k->p.snapshot, i)
i->count += k.k->size;
#if 0
bch2_bkey_buf_reassemble(&prev, c, k);
#endif
ret = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum) == inode->bi_inum;
out:
bch2_trans_iter_exit(trans, &iter);
fsck_err:
return ret;
}
static bool inode_backpointer_matches(struct bkey_s_c_dirent d,
struct bch_inode_unpacked *inode)
{
return d.k->p.inode == inode->bi_dir &&
d.k->p.offset == inode->bi_dir_offset;
}
/*
* Walk extents: verify that extents have a corresponding S_ISREG inode, and
* that i_size and i_sectors are consistent
......@@ -695,15 +1267,17 @@ noinline_for_stack
static int check_extents(struct bch_fs *c)
{
struct inode_walker w = inode_walker_init();
struct snapshots_seen s;
struct btree_trans trans;
struct btree_iter iter;
struct bkey_s_c k;
struct bkey_buf prev;
u64 i_sectors = 0;
int ret = 0;
#if 0
struct bkey_buf prev;
bch2_bkey_buf_init(&prev);
prev.k->k = KEY(0, 0, 0);
#endif
snapshots_seen_init(&s);
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
bch_verbose(c, "checking extents");
......@@ -711,96 +1285,172 @@ static int check_extents(struct bch_fs *c)
bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
POS(BCACHEFS_ROOT_INO, 0),
BTREE_ITER_INTENT|
BTREE_ITER_PREFETCH);
retry:
while ((k = bch2_btree_iter_peek(&iter)).k &&
!(ret = bkey_err(k))) {
if (w.have_inode &&
w.cur_inum != k.k->p.inode &&
!(w.inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY) &&
fsck_err_on(w.inode.bi_sectors != i_sectors, c,
"inode %llu has incorrect i_sectors: got %llu, should be %llu",
w.inode.bi_inum,
w.inode.bi_sectors, i_sectors)) {
w.inode.bi_sectors = i_sectors;
ret = write_inode(&trans, &w.inode, w.snapshot);
BTREE_ITER_PREFETCH|
BTREE_ITER_ALL_SNAPSHOTS);
do {
ret = lockrestart_do(&trans,
check_extent(&trans, &iter, &w, &s));
if (ret)
break;
} while (bch2_btree_iter_advance(&iter));
bch2_trans_iter_exit(&trans, &iter);
#if 0
bch2_bkey_buf_exit(&prev, c);
#endif
inode_walker_exit(&w);
bch2_trans_exit(&trans);
snapshots_seen_exit(&s);
return ret;
}
/*
 * For each directory inode version held in @w, compare the stored bi_nlink
 * with the subdirectory count accumulated during the dirent walk and repair
 * mismatches.
 *
 * On a mismatch the subdirectories are recounted with bch2_count_subdirs();
 * if the recount disagrees with the walk's count, that is logged and the
 * recount is used. Inodes that are still wrong are rewritten when the fsck
 * error is to be fixed.
 *
 * Returns 0 if nothing was rewritten, a negative error from the recount or
 * from write_inode(), or -EINTR once at least one inode has been rewritten.
 */
static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
{
	struct bch_fs *c = trans->c;
	struct inode_walker_entry *i;
	int ret = 0, ret2 = 0;
	s64 count2;

	for (i = w->d; i < w->d + w->nr; i++) {
		if (i->inode.bi_nlink == i->count)
			continue;

		count2 = lockrestart_do(trans,
			bch2_count_subdirs(trans, w->cur_inum, i->snapshot));

		/* A negative value is an error code, never a subdirectory count */
		if (count2 < 0) {
			ret = count2;
			break;
		}

		if (i->count != count2) {
			bch_err(c, "fsck counted subdirectories wrong: got %llu should be %llu",
				i->count, count2);
			i->count = count2;
			if (i->inode.bi_nlink == i->count)
				continue;
		}

		if (fsck_err_on(i->inode.bi_nlink != i->count, c,
				"directory %llu:%u with wrong i_nlink: got %u, should be %llu",
				w->cur_inum, i->snapshot, i->inode.bi_nlink, i->count)) {
			i->inode.bi_nlink = i->count;
			ret = write_inode(trans, &i->inode, i->snapshot);
			if (ret)
				break;

			/* Remember that an inode was rewritten */
			ret2 = -EINTR;
		}
	}
fsck_err:
	return ret ?: ret2;
}
if (bkey_cmp(prev.k->k.p, bkey_start_pos(k.k)) > 0) {
char buf1[200];
char buf2[200];
static int check_dirent_target(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c_dirent d,
struct bch_inode_unpacked *target,
u32 target_snapshot)
{
struct bch_fs *c = trans->c;
bool backpointer_exists = true;
char buf[200];
int ret = 0;
if (!target->bi_dir &&
!target->bi_dir_offset) {
target->bi_dir = d.k->p.inode;
target->bi_dir_offset = d.k->p.offset;
ret = write_inode(trans, target, target_snapshot);
if (ret)
goto err;
}
if (!inode_backpointer_matches(d, target)) {
ret = inode_backpointer_exists(trans, target, d.k->p.snapshot);
if (ret < 0)
goto err;
bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev.k));
bch2_bkey_val_to_text(&PBUF(buf2), c, k);
backpointer_exists = ret;
ret = 0;
if (fsck_err(c, "overlapping extents:\n%s\n%s", buf1, buf2))
return fix_overlapping_extent(&trans, k, prev.k->k.p) ?: -EINTR;
if (fsck_err_on(S_ISDIR(target->bi_mode) &&
backpointer_exists, c,
"directory %llu with multiple links",
target->bi_inum)) {
ret = remove_dirent(trans, d.k->p);
if (ret)
goto err;
return 0;
}
ret = walk_inode(&trans, &w, k.k->p.inode);
if (ret)
break;
if (fsck_err_on(backpointer_exists &&
!target->bi_nlink, c,
"inode %llu has multiple links but i_nlink 0",
target->bi_inum)) {
target->bi_nlink++;
target->bi_flags &= ~BCH_INODE_UNLINKED;
if (w.first_this_inode)
i_sectors = 0;
if (fsck_err_on(!w.have_inode, c,
"extent type %u for missing inode %llu",
k.k->type, k.k->p.inode) ||
fsck_err_on(w.have_inode &&
!S_ISREG(w.inode.bi_mode) && !S_ISLNK(w.inode.bi_mode), c,
"extent type %u for non regular file, inode %llu mode %o",
k.k->type, k.k->p.inode, w.inode.bi_mode)) {
bch2_fs_lazy_rw(c);
return bch2_btree_delete_range_trans(&trans, BTREE_ID_extents,
POS(k.k->p.inode, 0),
POS(k.k->p.inode, U64_MAX),
NULL) ?: -EINTR;
ret = write_inode(trans, target, target_snapshot);
if (ret)
goto err;
}
if (fsck_err_on(w.have_inode &&
!(w.inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) &&
k.k->type != KEY_TYPE_reservation &&
k.k->p.offset > round_up(w.inode.bi_size, block_bytes(c)) >> 9, c,
"extent type %u offset %llu past end of inode %llu, i_size %llu",
k.k->type, k.k->p.offset, k.k->p.inode, w.inode.bi_size)) {
bch2_fs_lazy_rw(c);
return bch2_btree_delete_range_trans(&trans, BTREE_ID_extents,
POS(k.k->p.inode, round_up(w.inode.bi_size, block_bytes(c)) >> 9),
POS(k.k->p.inode, U64_MAX),
NULL) ?: -EINTR;
if (fsck_err_on(!backpointer_exists, c,
"inode %llu has wrong backpointer:\n"
"got %llu:%llu\n"
"should be %llu:%llu",
target->bi_inum,
target->bi_dir,
target->bi_dir_offset,
d.k->p.inode,
d.k->p.offset)) {
target->bi_dir = d.k->p.inode;
target->bi_dir_offset = d.k->p.offset;
ret = write_inode(trans, target, target_snapshot);
if (ret)
goto err;
}
}
if (bkey_extent_is_allocation(k.k))
i_sectors += k.k->size;
bch2_bkey_buf_reassemble(&prev, c, k);
if (fsck_err_on(vfs_d_type(d.v->d_type) != mode_to_type(target->bi_mode), c,
"incorrect d_type: should be %u:\n%s",
mode_to_type(target->bi_mode),
(bch2_bkey_val_to_text(&PBUF(buf), c, d.s_c), buf))) {
struct bkey_i_dirent *n;
bch2_btree_iter_advance(&iter);
n = kmalloc(bkey_bytes(d.k), GFP_KERNEL);
if (!n) {
ret = -ENOMEM;
goto err;
}
bkey_reassemble(&n->k_i, d.s_c);
n->v.d_type = mode_to_type(target->bi_mode);
ret = __bch2_trans_do(trans, NULL, NULL,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_LAZY_RW,
bch2_trans_update(trans, iter, &n->k_i, 0));
kfree(n);
if (ret)
goto err;
}
err:
fsck_err:
if (ret == -EINTR)
goto retry;
bch2_trans_iter_exit(&trans, &iter);
bch2_bkey_buf_exit(&prev, c);
return bch2_trans_exit(&trans) ?: ret;
return ret;
}
static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
struct bch_hash_info *hash_info,
struct inode_walker *w, unsigned *nr_subdirs)
struct inode_walker *dir,
struct inode_walker *target,
struct snapshots_seen *s)
{
struct bch_fs *c = trans->c;
struct bkey_s_c k;
struct bkey_s_c_dirent d;
struct bch_inode_unpacked target;
struct inode_walker_entry *i;
u32 target_snapshot;
u32 target_subvol;
bool have_target;
bool backpointer_exists = true;
u64 d_inum;
u64 target_inum;
char buf[200];
int ret;
......@@ -812,38 +1462,49 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
if (ret)
return ret;
if (w->have_inode &&
w->cur_inum != k.k->p.inode &&
fsck_err_on(w->inode.bi_nlink != *nr_subdirs, c,
"directory %llu with wrong i_nlink: got %u, should be %u",
w->inode.bi_inum, w->inode.bi_nlink, *nr_subdirs)) {
w->inode.bi_nlink = *nr_subdirs;
ret = write_inode(trans, &w->inode, w->snapshot);
return ret ?: -EINTR;
}
ret = check_key_has_snapshot(trans, iter, k);
if (ret)
return ret;
ret = __walk_inode(trans, w, k.k->p.inode);
ret = snapshots_seen_update(c, s, k.k->p);
if (ret)
return ret;
if (w->first_this_inode)
*nr_subdirs = 0;
if (k.k->type == KEY_TYPE_whiteout)
return 0;
if (dir->cur_inum != k.k->p.inode) {
ret = check_subdir_count(trans, dir);
if (ret)
return ret;
}
ret = __walk_inode(trans, dir, k.k->p);
if (ret < 0)
return ret;
if (fsck_err_on(!w->have_inode, c,
if (fsck_err_on(ret == INT_MAX, c,
"dirent in nonexisting directory:\n%s",
(bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)) ||
fsck_err_on(!S_ISDIR(w->inode.bi_mode), c,
(bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)))
return __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_LAZY_RW,
bch2_btree_delete_at(trans, iter,
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE));
if (ret == INT_MAX)
return 0;
i = dir->d + ret;
ret = 0;
if (fsck_err_on(!S_ISDIR(i->inode.bi_mode), c,
"dirent in non directory inode type %u:\n%s",
mode_to_type(w->inode.bi_mode),
mode_to_type(i->inode.bi_mode),
(bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)))
return __bch2_trans_do(trans, NULL, NULL, 0,
bch2_btree_delete_at(trans, iter, 0));
if (!w->have_inode)
return 0;
if (w->first_this_inode)
*hash_info = bch2_hash_info_init(c, &w->inode);
if (dir->first_this_inode)
*hash_info = bch2_hash_info_init(c, &dir->d[0].inode);
ret = hash_check_key(trans, bch2_dirent_hash_desc,
hash_info, iter, k);
......@@ -856,128 +1517,76 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
return 0;
d = bkey_s_c_to_dirent(k);
d_inum = le64_to_cpu(d.v->d_inum);
ret = __bch2_dirent_read_target(&trans, d,
ret = __bch2_dirent_read_target(trans, d,
&target_subvol,
&target_snapshot,
&target_inum);
&target_inum,
true);
if (ret && ret != -ENOENT)
return ret;
ret = __lookup_inode(trans, d_inum, &target, &target_snapshot);
if (ret && ret != -ENOENT)
return ret;
if (fsck_err_on(ret, c,
"dirent points to missing subvolume %llu",
le64_to_cpu(d.v->d_inum)))
return remove_dirent(trans, d.k->p);
have_target = !ret;
ret = 0;
if (target_subvol) {
struct bch_inode_unpacked subvol_root;
if (fsck_err_on(!have_target, c,
"dirent points to missing inode:\n%s",
(bch2_bkey_val_to_text(&PBUF(buf), c,
k), buf)))
return remove_dirent(trans, d.k->p);
ret = __lookup_inode(trans, target_inum,
&subvol_root, &target_snapshot);
if (ret && ret != -ENOENT)
return ret;
if (!have_target)
return 0;
if (fsck_err_on(ret, c,
"subvolume %u points to missing subvolume root %llu",
target_subvol,
target_inum)) {
bch_err(c, "repair not implemented yet");
return -EINVAL;
}
if (!target.bi_dir &&
!target.bi_dir_offset) {
target.bi_dir = k.k->p.inode;
target.bi_dir_offset = k.k->p.offset;
if (fsck_err_on(subvol_root.bi_subvol != target_subvol, c,
"subvol root %llu has wrong bi_subvol field: got %u, should be %u",
target_inum,
subvol_root.bi_subvol, target_subvol)) {
subvol_root.bi_subvol = target_subvol;
ret = write_inode(trans, &subvol_root, target_snapshot);
if (ret)
return ret;
}
ret = __write_inode(trans, &target, target_snapshot) ?:
bch2_trans_commit(trans, NULL, NULL,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_LAZY_RW);
ret = check_dirent_target(trans, iter, d, &subvol_root,
target_snapshot);
if (ret)
return ret;
return -EINTR;
}
if (!inode_backpointer_matches(d, &target)) {
ret = inode_backpointer_exists(trans, &target);
if (ret < 0)
} else {
ret = __get_visible_inodes(trans, target, s, target_inum);
if (ret)
return ret;
backpointer_exists = ret;
ret = 0;
if (fsck_err_on(S_ISDIR(target.bi_mode) &&
backpointer_exists, c,
"directory %llu with multiple links",
target.bi_inum))
return remove_dirent(trans, d.k->p);
if (fsck_err_on(backpointer_exists &&
!target.bi_nlink, c,
"inode %llu has multiple links but i_nlink 0",
d_inum)) {
target.bi_nlink++;
target.bi_flags &= ~BCH_INODE_UNLINKED;
ret = write_inode(trans, &target, target_snapshot);
return ret ?: -EINTR;
if (fsck_err_on(!target->nr, c,
"dirent points to missing inode:\n%s",
(bch2_bkey_val_to_text(&PBUF(buf), c,
k), buf))) {
ret = remove_dirent(trans, d.k->p);
if (ret)
return ret;
}
if (fsck_err_on(!backpointer_exists, c,
"inode %llu has wrong backpointer:\n"
"got %llu:%llu\n"
"should be %llu:%llu",
d_inum,
target.bi_dir,
target.bi_dir_offset,
k.k->p.inode,
k.k->p.offset)) {
target.bi_dir = k.k->p.inode;
target.bi_dir_offset = k.k->p.offset;
ret = write_inode(trans, &target, target_snapshot);
return ret ?: -EINTR;
for (i = target->d; i < target->d + target->nr; i++) {
ret = check_dirent_target(trans, iter, d,
&i->inode, i->snapshot);
if (ret)
return ret;
}
}
target_subvol = d.v->d_type == DT_SUBVOL
? le64_to_cpu(d.v->d_inum) : 0;
if (fsck_err_on(target.bi_subvol != target_subvol, c,
"subvol root %llu has wrong subvol field:\n"
"got %u\n"
"should be %u",
target.bi_inum,
target.bi_subvol,
target_subvol)) {
target.bi_subvol = target_subvol;
ret = write_inode(trans, &target, target_snapshot);
return ret ?: -EINTR;
}
if (fsck_err_on(vfs_d_type(d.v->d_type) != mode_to_type(target.bi_mode), c,
"incorrect d_type: should be %u:\n%s",
mode_to_type(target.bi_mode),
(bch2_bkey_val_to_text(&PBUF(buf), c,
k), buf))) {
struct bkey_i_dirent *n;
n = kmalloc(bkey_bytes(d.k), GFP_KERNEL);
if (!n)
return -ENOMEM;
bkey_reassemble(&n->k_i, d.s_c);
n->v.d_type = mode_to_type(target.bi_mode);
ret = __bch2_trans_do(trans, NULL, NULL,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_LAZY_RW,
bch2_btree_iter_traverse(iter) ?:
bch2_trans_update(trans, iter, &n->k_i, 0));
kfree(n);
return ret ?: -EINTR;
}
if (d.v->d_type == DT_DIR)
for_each_visible_inode(c, s, dir, d.k->p.snapshot, i)
i->count++;
*nr_subdirs += d.v->d_type == DT_DIR;
return 0;
fsck_err:
return ret;
}
......@@ -989,31 +1598,39 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
/*
 * Walk the dirents btree starting at POS(BCACHEFS_ROOT_INO, 0): run
 * check_dirent() on the iterator under lockrestart_do(), stop as soon as it
 * yields a nonzero ret, and otherwise keep going while
 * bch2_btree_iter_advance() returns true.
 *
 * NOTE(review): this listing is a diff rendering whose +/- markers were lost,
 * so replaced and replacement lines sit next to each other (inode_walker
 * setup, the end of the iterator flags, the check_dirent() call, and the
 * return path each appear in two variants). Presumably the variants using
 * dir/target/s and BTREE_ITER_ALL_SNAPSHOTS are the post-change ones, since
 * they match the snapshots_seen/inode_walker teardown at the end - confirm
 * against the real tree before relying on this text.
 */
noinline_for_stack
static int check_dirents(struct bch_fs *c)
{
struct inode_walker w = inode_walker_init();
struct inode_walker dir = inode_walker_init();
struct inode_walker target = inode_walker_init();
struct snapshots_seen s;
struct bch_hash_info hash_info;
struct btree_trans trans;
struct btree_iter iter;
unsigned nr_subdirs = 0;
int ret = 0;
bch_verbose(c, "checking dirents");
snapshots_seen_init(&s);
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
bch2_trans_iter_init(&trans, &iter, BTREE_ID_dirents,
POS(BCACHEFS_ROOT_INO, 0),
BTREE_ITER_INTENT|
BTREE_ITER_PREFETCH);
BTREE_ITER_PREFETCH|
BTREE_ITER_ALL_SNAPSHOTS);
/* One check_dirent() call per iterator position; any nonzero ret ends the walk. */
do {
ret = lockrestart_do(&trans,
check_dirent(&trans, &iter, &hash_info, &w, &nr_subdirs));
check_dirent(&trans, &iter, &hash_info,
&dir, &target, &s));
if (ret)
break;
} while (bch2_btree_iter_advance(&iter));
bch2_trans_iter_exit(&trans, &iter);
/* NOTE(review): as listed, this return makes the teardown calls below unreachable. */
return bch2_trans_exit(&trans) ?: ret;
bch2_trans_exit(&trans);
snapshots_seen_exit(&s);
inode_walker_exit(&dir);
inode_walker_exit(&target);
return ret;
}
/*
......@@ -1036,15 +1653,22 @@ static int check_xattrs(struct bch_fs *c)
bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs,
POS(BCACHEFS_ROOT_INO, 0),
BTREE_ITER_INTENT|
BTREE_ITER_PREFETCH);
BTREE_ITER_PREFETCH|
BTREE_ITER_ALL_SNAPSHOTS);
retry:
bch2_trans_begin(&trans);
while ((k = bch2_btree_iter_peek(&iter)).k &&
!(ret = bkey_err(k))) {
ret = walk_inode(&trans, &w, k.k->p.inode);
ret = check_key_has_snapshot(&trans, &iter, k);
if (ret)
break;
if (fsck_err_on(!w.have_inode, c,
ret = walk_inode(&trans, &w, k.k->p);
if (ret < 0)
break;
if (fsck_err_on(ret == INT_MAX, c,
"xattr for missing inode %llu",
k.k->p.inode)) {
ret = bch2_btree_delete_at(&trans, &iter, 0);
......@@ -1053,14 +1677,18 @@ static int check_xattrs(struct bch_fs *c)
continue;
}
if (w.first_this_inode && w.have_inode)
hash_info = bch2_hash_info_init(c, &w.inode);
if (ret == INT_MAX)
goto next;
ret = 0;
if (w.first_this_inode)
hash_info = bch2_hash_info_init(c, &w.d[0].inode);
ret = hash_check_key(&trans, bch2_xattr_hash_desc,
&hash_info, &iter, k);
if (ret)
break;
next:
bch2_btree_iter_advance(&iter);
}
fsck_err:
......@@ -1072,40 +1700,63 @@ static int check_xattrs(struct bch_fs *c)
}
/* Get root directory, create if it doesn't exist: */
static int check_root(struct bch_fs *c, struct bch_inode_unpacked *root_inode)
static int check_root(struct bch_fs *c)
{
struct bkey_inode_buf packed;
struct btree_trans trans;
struct bch_inode_unpacked root_inode;
u32 snapshot;
u64 inum;
int ret;
bch2_trans_init(&trans, c, 0, 0);
bch_verbose(c, "checking root directory");
ret = bch2_trans_do(c, NULL, NULL, 0,
lookup_inode(&trans, BCACHEFS_ROOT_INO, root_inode, &snapshot));
ret = subvol_lookup(&trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum);
if (ret && ret != -ENOENT)
return ret;
if (fsck_err_on(ret, c, "root directory missing"))
goto create_root;
if (mustfix_fsck_err_on(ret, c, "root subvol missing")) {
struct bkey_i_subvolume root_subvol;
if (fsck_err_on(!S_ISDIR(root_inode->bi_mode), c,
"root inode not a directory"))
goto create_root;
snapshot = U32_MAX;
inum = BCACHEFS_ROOT_INO;
return 0;
fsck_err:
return ret;
create_root:
bch2_inode_init(c, root_inode, 0, 0, S_IFDIR|0755,
0, NULL);
root_inode->bi_inum = BCACHEFS_ROOT_INO;
bkey_subvolume_init(&root_subvol.k_i);
root_subvol.k.p.offset = BCACHEFS_ROOT_SUBVOL;
root_subvol.v.flags = 0;
root_subvol.v.snapshot = cpu_to_le32(snapshot);
root_subvol.v.inode = cpu_to_le64(inum);
ret = __bch2_trans_do(&trans, NULL, NULL,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_LAZY_RW,
__bch2_btree_insert(&trans, BTREE_ID_subvolumes, &root_subvol.k_i));
if (ret) {
bch_err(c, "error writing root subvol: %i", ret);
goto err;
}
}
ret = lookup_inode(&trans, BCACHEFS_ROOT_INO, &root_inode, &snapshot);
if (ret && ret != -ENOENT)
return ret;
bch2_inode_pack(c, &packed, root_inode);
if (mustfix_fsck_err_on(ret, c, "root directory missing") ||
mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode), c,
"root inode not a directory")) {
bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755,
0, NULL);
root_inode.bi_inum = inum;
return bch2_btree_insert(c, BTREE_ID_inodes, &packed.inode.k_i,
NULL, NULL,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_LAZY_RW);
ret = write_inode(&trans, &root_inode, snapshot);
if (ret)
bch_err(c, "error writing root inode: %i", ret);
}
err:
fsck_err:
bch2_trans_exit(&trans);
return ret;
}
struct pathbuf {
......@@ -1147,17 +1798,18 @@ static int check_path(struct btree_trans *trans,
size_t i;
int ret = 0;
snapshot = snapshot_t(c, snapshot)->equiv;
p->nr = 0;
while (inode->bi_inum != BCACHEFS_ROOT_INO) {
ret = lockrestart_do(trans,
inode_backpointer_exists(trans, inode));
inode_backpointer_exists(trans, inode, snapshot));
if (ret < 0)
break;
if (!ret) {
if (fsck_err(c, "unreachable inode %llu, type %u nlink %u backptr %llu:%llu",
inode->bi_inum,
if (fsck_err(c, "unreachable inode %llu:%u, type %u nlink %u backptr %llu:%llu",
inode->bi_inum, snapshot,
mode_to_type(inode->bi_mode),
inode->bi_nlink,
inode->bi_dir,
......@@ -1226,7 +1878,8 @@ static int check_directory_structure(struct bch_fs *c)
for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN,
BTREE_ITER_INTENT|
BTREE_ITER_PREFETCH, k, ret) {
BTREE_ITER_PREFETCH|
BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
if (k.k->type != KEY_TYPE_inode)
continue;
......@@ -1237,6 +1890,9 @@ static int check_directory_structure(struct bch_fs *c)
break;
}
if (u.bi_flags & BCH_INODE_UNLINKED)
continue;
ret = check_path(&trans, &path, &u, iter.pos.snapshot);
if (ret)
break;
......@@ -1295,8 +1951,9 @@ static int nlink_cmp(const void *_l, const void *_r)
return cmp_int(l->inum, r->inum) ?: cmp_int(l->snapshot, r->snapshot);
}
static void inc_link(struct bch_fs *c, struct nlink_table *links,
u64 range_start, u64 range_end, u64 inum)
static void inc_link(struct bch_fs *c, struct snapshots_seen *s,
struct nlink_table *links,
u64 range_start, u64 range_end, u64 inum, u32 snapshot)
{
struct nlink *link, key = {
.inum = inum, .snapshot = U32_MAX,
......@@ -1307,8 +1964,18 @@ static void inc_link(struct bch_fs *c, struct nlink_table *links,
link = __inline_bsearch(&key, links->d, links->nr,
sizeof(links->d[0]), nlink_cmp);
if (link)
link->count++;
if (!link)
return;
while (link > links->d && link[0].inum == link[-1].inum)
--link;
for (; link < links->d + links->nr && link->inum == inum; link++)
if (ref_visible(c, s, snapshot, link->snapshot)) {
link->count++;
if (link->snapshot >= snapshot)
break;
}
}
noinline_for_stack
......@@ -1328,7 +1995,8 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c,
for_each_btree_key(&trans, iter, BTREE_ID_inodes,
POS(0, start),
BTREE_ITER_INTENT|
BTREE_ITER_PREFETCH, k, ret) {
BTREE_ITER_PREFETCH|
BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
if (k.k->type != KEY_TYPE_inode)
continue;
......@@ -1369,23 +2037,33 @@ static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links
u64 range_start, u64 range_end)
{
struct btree_trans trans;
struct snapshots_seen s;
struct btree_iter iter;
struct bkey_s_c k;
struct bkey_s_c_dirent d;
int ret;
snapshots_seen_init(&s);
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
for_each_btree_key(&trans, iter, BTREE_ID_dirents, POS_MIN,
BTREE_ITER_INTENT|
BTREE_ITER_PREFETCH, k, ret) {
BTREE_ITER_PREFETCH|
BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
ret = snapshots_seen_update(c, &s, k.k->p);
if (ret)
break;
switch (k.k->type) {
case KEY_TYPE_dirent:
d = bkey_s_c_to_dirent(k);
if (d.v->d_type != DT_DIR)
inc_link(c, links, range_start, range_end,
le64_to_cpu(d.v->d_inum));
if (d.v->d_type != DT_DIR &&
d.v->d_type != DT_SUBVOL)
inc_link(c, &s, links, range_start, range_end,
le64_to_cpu(d.v->d_inum),
d.k->p.snapshot);
break;
}
......@@ -1393,10 +2071,11 @@ static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links
}
bch2_trans_iter_exit(&trans, &iter);
ret = bch2_trans_exit(&trans) ?: ret;
if (ret)
bch_err(c, "error in fsck: btree error %i while walking dirents", ret);
bch2_trans_exit(&trans);
snapshots_seen_exit(&s);
return ret;
}
......@@ -1418,7 +2097,8 @@ static int check_nlinks_update_hardlinks(struct bch_fs *c,
for_each_btree_key(&trans, iter, BTREE_ID_inodes,
POS(0, range_start),
BTREE_ITER_INTENT|
BTREE_ITER_PREFETCH, k, ret) {
BTREE_ITER_PREFETCH|
BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
if (k.k->p.offset >= range_end)
break;
......@@ -1434,7 +2114,8 @@ static int check_nlinks_update_hardlinks(struct bch_fs *c,
if (!u.bi_nlink)
continue;
while (link->inum < k.k->p.offset) {
while ((cmp_int(link->inum, k.k->p.offset) ?:
cmp_int(link->snapshot, k.k->p.snapshot)) < 0) {
link++;
BUG_ON(link >= links->d + links->nr);
}
......@@ -1507,14 +2188,13 @@ static int check_nlinks(struct bch_fs *c)
*/
int bch2_fsck_full(struct bch_fs *c)
{
/*
 * The passes are chained with the GNU `a ?: b` operator: a pass is only
 * evaluated if every pass before it returned 0, and the first nonzero
 * return value becomes the result of the whole chain.
 */
struct bch_inode_unpacked root_inode;
return bch2_fs_snapshots_check(c) ?:
check_inodes(c, true) ?:
check_subvols(c) ?:
check_extents(c) ?:
check_dirents(c) ?:
check_xattrs(c) ?:
/*
 * NOTE(review): check_root is listed with two different argument lists
 * because this text is a diff rendering with removed and added lines
 * merged; only one of the two calls can match the check_root definition.
 * Presumably root_inode is only needed by the two-argument form - confirm.
 */
check_root(c, &root_inode) ?:
check_root(c) ?:
check_directory_structure(c) ?:
check_nlinks(c);
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment