Commit a2de733c authored by Arne Jansen

btrfs: scrub

This adds an initial implementation for scrub. It works in a quite
straightforward way. Userspace issues an ioctl for each device in the
fs. For each device, it enumerates the allocated device chunks. For
each chunk, the contained extents are enumerated and the data checksums
fetched. The extents are read sequentially and the checksums verified.
If an error occurs (checksum or EIO), a good copy is searched for. If
one is found, the bad copy will be rewritten.
All enumerations happen from the commit roots. During a transaction
commit, the scrubs get paused and afterwards continue from the new
roots.

This commit is based on the series originally posted to linux-btrfs
with some improvements that resulted from comments from David Sterba,
Ilya Dryomov and Jan Schmidt.
Signed-off-by: Arne Jansen <sensille@gmx.net>
parent 7cf96da3
...@@ -7,4 +7,4 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ ...@@ -7,4 +7,4 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
export.o tree-log.o acl.o free-space-cache.o zlib.o lzo.o \ export.o tree-log.o acl.o free-space-cache.o zlib.o lzo.o \
compression.o delayed-ref.o relocation.o compression.o delayed-ref.o relocation.o scrub.o
...@@ -23,6 +23,7 @@ ...@@ -23,6 +23,7 @@
#include <linux/mm.h> #include <linux/mm.h>
#include <linux/highmem.h> #include <linux/highmem.h>
#include <linux/fs.h> #include <linux/fs.h>
#include <linux/rwsem.h>
#include <linux/completion.h> #include <linux/completion.h>
#include <linux/backing-dev.h> #include <linux/backing-dev.h>
#include <linux/wait.h> #include <linux/wait.h>
...@@ -33,6 +34,7 @@ ...@@ -33,6 +34,7 @@
#include "extent_io.h" #include "extent_io.h"
#include "extent_map.h" #include "extent_map.h"
#include "async-thread.h" #include "async-thread.h"
#include "ioctl.h"
struct btrfs_trans_handle; struct btrfs_trans_handle;
struct btrfs_transaction; struct btrfs_transaction;
...@@ -510,6 +512,12 @@ struct btrfs_extent_item_v0 { ...@@ -510,6 +512,12 @@ struct btrfs_extent_item_v0 {
/* use full backrefs for extent pointers in the block */ /* use full backrefs for extent pointers in the block */
#define BTRFS_BLOCK_FLAG_FULL_BACKREF (1ULL << 8) #define BTRFS_BLOCK_FLAG_FULL_BACKREF (1ULL << 8)
/*
 * This flag is only used internally by scrub and may be changed at any time.
 * It is only declared here to avoid collisions with other flag bits
 * (presumably the extent/block flags defined alongside it -- confirm).
 */
#define BTRFS_EXTENT_FLAG_SUPER (1ULL << 48)
struct btrfs_tree_block_info { struct btrfs_tree_block_info {
struct btrfs_disk_key key; struct btrfs_disk_key key;
u8 level; u8 level;
...@@ -1077,6 +1085,17 @@ struct btrfs_fs_info { ...@@ -1077,6 +1085,17 @@ struct btrfs_fs_info {
void *bdev_holder; void *bdev_holder;
/* private scrub information */
struct mutex scrub_lock;
atomic_t scrubs_running;
atomic_t scrub_pause_req;
atomic_t scrubs_paused;
atomic_t scrub_cancel_req;
wait_queue_head_t scrub_pause_wait;
struct rw_semaphore scrub_super_lock;
int scrub_workers_refcnt;
struct btrfs_workers scrub_workers;
/* filesystem state */ /* filesystem state */
u64 fs_state; u64 fs_state;
}; };
...@@ -2472,8 +2491,8 @@ struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans, ...@@ -2472,8 +2491,8 @@ struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
int btrfs_csum_truncate(struct btrfs_trans_handle *trans, int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct btrfs_path *path, struct btrfs_root *root, struct btrfs_path *path,
u64 isize); u64 isize);
int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
u64 end, struct list_head *list); struct list_head *list, int search_commit);
/* inode.c */ /* inode.c */
/* RHEL and EL kernels have a patch that renames PG_checked to FsMisc */ /* RHEL and EL kernels have a patch that renames PG_checked to FsMisc */
...@@ -2637,4 +2656,18 @@ void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans, ...@@ -2637,4 +2656,18 @@ void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans,
u64 *bytes_to_reserve); u64 *bytes_to_reserve);
void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
struct btrfs_pending_snapshot *pending); struct btrfs_pending_snapshot *pending);
/* scrub.c */
/*
 * Scrub entry point for a single device, identified by devid.
 * NOTE(review): start/end presumably bound the range to scrub and progress
 * presumably receives the resulting statistics -- confirm in scrub.c.
 */
int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
struct btrfs_scrub_progress *progress);
/*
 * Pause/continue pairs. btrfs_commit_transaction() calls btrfs_scrub_pause()
 * during the commit and btrfs_scrub_continue() after dropping trans_mutex;
 * btrfs_sync_log() wraps its write_ctree_super() call in the _super variants.
 */
int btrfs_scrub_pause(struct btrfs_root *root);
int btrfs_scrub_pause_super(struct btrfs_root *root);
int btrfs_scrub_continue(struct btrfs_root *root);
int btrfs_scrub_continue_super(struct btrfs_root *root);
/*
 * Cancellation. btrfs_scrub_cancel() is called from close_ctree();
 * btrfs_scrub_cancel_dev() is called from btrfs_rm_device() for the device
 * being removed. NOTE(review): btrfs_scrub_cancel_devid() presumably does
 * the same for a device looked up by devid -- confirm in scrub.c.
 */
int btrfs_scrub_cancel(struct btrfs_root *root);
int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev);
int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid);
/*
 * NOTE(review): presumably serves a progress inquiry by filling *progress
 * for the scrub on devid -- confirm in scrub.c.
 */
int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
struct btrfs_scrub_progress *progress);
#endif #endif
...@@ -1773,6 +1773,17 @@ struct btrfs_root *open_ctree(struct super_block *sb, ...@@ -1773,6 +1773,17 @@ struct btrfs_root *open_ctree(struct super_block *sb,
INIT_LIST_HEAD(&fs_info->ordered_extents); INIT_LIST_HEAD(&fs_info->ordered_extents);
spin_lock_init(&fs_info->ordered_extent_lock); spin_lock_init(&fs_info->ordered_extent_lock);
mutex_init(&fs_info->scrub_lock);
atomic_set(&fs_info->scrubs_running, 0);
atomic_set(&fs_info->scrub_pause_req, 0);
atomic_set(&fs_info->scrubs_paused, 0);
atomic_set(&fs_info->scrub_cancel_req, 0);
init_waitqueue_head(&fs_info->scrub_pause_wait);
init_rwsem(&fs_info->scrub_super_lock);
fs_info->scrub_workers_refcnt = 0;
btrfs_init_workers(&fs_info->scrub_workers, "scrub",
fs_info->thread_pool_size, &fs_info->generic_worker);
sb->s_blocksize = 4096; sb->s_blocksize = 4096;
sb->s_blocksize_bits = blksize_bits(4096); sb->s_blocksize_bits = blksize_bits(4096);
sb->s_bdi = &fs_info->bdi; sb->s_bdi = &fs_info->bdi;
...@@ -2599,6 +2610,7 @@ int close_ctree(struct btrfs_root *root) ...@@ -2599,6 +2610,7 @@ int close_ctree(struct btrfs_root *root)
fs_info->closing = 1; fs_info->closing = 1;
smp_mb(); smp_mb();
btrfs_scrub_cancel(root);
btrfs_put_block_group_cache(fs_info); btrfs_put_block_group_cache(fs_info);
/* /*
......
...@@ -266,7 +266,7 @@ int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode, ...@@ -266,7 +266,7 @@ int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
} }
int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
struct list_head *list) struct list_head *list, int search_commit)
{ {
struct btrfs_key key; struct btrfs_key key;
struct btrfs_path *path; struct btrfs_path *path;
...@@ -283,6 +283,12 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, ...@@ -283,6 +283,12 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
path = btrfs_alloc_path(); path = btrfs_alloc_path();
BUG_ON(!path); BUG_ON(!path);
if (search_commit) {
path->skip_locking = 1;
path->reada = 2;
path->search_commit_root = 1;
}
key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
key.offset = start; key.offset = start;
key.type = BTRFS_EXTENT_CSUM_KEY; key.type = BTRFS_EXTENT_CSUM_KEY;
......
...@@ -1007,7 +1007,7 @@ static noinline int csum_exist_in_range(struct btrfs_root *root, ...@@ -1007,7 +1007,7 @@ static noinline int csum_exist_in_range(struct btrfs_root *root,
LIST_HEAD(list); LIST_HEAD(list);
ret = btrfs_lookup_csums_range(root->fs_info->csum_root, bytenr, ret = btrfs_lookup_csums_range(root->fs_info->csum_root, bytenr,
bytenr + num_bytes - 1, &list); bytenr + num_bytes - 1, &list, 0);
if (ret == 0 && list_empty(&list)) if (ret == 0 && list_empty(&list))
return 0; return 0;
......
...@@ -42,6 +42,43 @@ struct btrfs_ioctl_vol_args_v2 { ...@@ -42,6 +42,43 @@ struct btrfs_ioctl_vol_args_v2 {
char name[BTRFS_SUBVOL_NAME_MAX + 1]; char name[BTRFS_SUBVOL_NAME_MAX + 1];
}; };
/*
 * Structure to report errors and progress to userspace, either as the
 * result of a finished scrub, a canceled scrub or a progress inquiry.
 *
 * NOTE(review): since this is handed to userspace, the field order and
 * sizes should be treated as ABI -- confirm before changing the layout.
 */
struct btrfs_scrub_progress {
__u64 data_extents_scrubbed; /* # of data extents scrubbed */
__u64 tree_extents_scrubbed; /* # of tree extents scrubbed */
__u64 data_bytes_scrubbed; /* # of data bytes scrubbed */
__u64 tree_bytes_scrubbed; /* # of tree bytes scrubbed */
__u64 read_errors; /* # of read errors encountered (EIO) */
__u64 csum_errors; /* # of failed csum checks */
__u64 verify_errors; /* # of occurrences where the metadata
 * of a tree block did not match the
 * expected values, like generation or
 * logical */
__u64 no_csum; /* # of 4k data blocks for which no csum
 * is present, probably the result of
 * data written with nodatasum */
__u64 csum_discards; /* # of csums for which no data was found
 * in the extent tree */
__u64 super_errors; /* # of bad super blocks encountered */
__u64 malloc_errors; /* # of internal kmalloc errors. These
 * will likely cause an incomplete
 * scrub */
__u64 uncorrectable_errors; /* # of errors where either no intact
 * copy was found or the writeback
 * failed */
__u64 corrected_errors; /* # of errors corrected */
__u64 last_physical; /* last physical address scrubbed. In
 * case a scrub was aborted, this can
 * be used to restart the scrub */
__u64 unverified_errors; /* # of occurrences where a read for a
 * full (64k) bio failed, but the
 * re-check succeeded for each 4k piece.
 * Intermittent error. */
};
#define BTRFS_INO_LOOKUP_PATH_MAX 4080 #define BTRFS_INO_LOOKUP_PATH_MAX 4080
struct btrfs_ioctl_ino_lookup_args { struct btrfs_ioctl_ino_lookup_args {
__u64 treeid; __u64 treeid;
......
...@@ -4242,7 +4242,7 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len) ...@@ -4242,7 +4242,7 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt; disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr, ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr,
disk_bytenr + len - 1, &list); disk_bytenr + len - 1, &list, 0);
while (!list_empty(&list)) { while (!list_empty(&list)) {
sums = list_entry(list.next, struct btrfs_ordered_sum, list); sums = list_entry(list.next, struct btrfs_ordered_sum, list);
......
This diff is collapsed.
...@@ -1321,6 +1321,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, ...@@ -1321,6 +1321,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
WARN_ON(cur_trans != trans->transaction); WARN_ON(cur_trans != trans->transaction);
btrfs_scrub_pause(root);
/* btrfs_commit_tree_roots is responsible for getting the /* btrfs_commit_tree_roots is responsible for getting the
* various roots consistent with each other. Every pointer * various roots consistent with each other. Every pointer
* in the tree of tree roots has to point to the most up to date * in the tree of tree roots has to point to the most up to date
...@@ -1405,6 +1406,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, ...@@ -1405,6 +1406,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
mutex_unlock(&root->fs_info->trans_mutex); mutex_unlock(&root->fs_info->trans_mutex);
btrfs_scrub_continue(root);
if (current->journal_info == trans) if (current->journal_info == trans)
current->journal_info = NULL; current->journal_info = NULL;
......
...@@ -614,7 +614,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, ...@@ -614,7 +614,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
ret = btrfs_lookup_csums_range(root->log_root, ret = btrfs_lookup_csums_range(root->log_root,
csum_start, csum_end - 1, csum_start, csum_end - 1,
&ordered_sums); &ordered_sums, 0);
BUG_ON(ret); BUG_ON(ret);
while (!list_empty(&ordered_sums)) { while (!list_empty(&ordered_sums)) {
struct btrfs_ordered_sum *sums; struct btrfs_ordered_sum *sums;
...@@ -2093,7 +2093,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, ...@@ -2093,7 +2093,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
* the running transaction open, so a full commit can't hop * the running transaction open, so a full commit can't hop
* in and cause problems either. * in and cause problems either.
*/ */
btrfs_scrub_pause_super(root);
write_ctree_super(trans, root->fs_info->tree_root, 1); write_ctree_super(trans, root->fs_info->tree_root, 1);
btrfs_scrub_continue_super(root);
ret = 0; ret = 0;
mutex_lock(&root->log_mutex); mutex_lock(&root->log_mutex);
...@@ -2689,7 +2691,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, ...@@ -2689,7 +2691,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
ret = btrfs_lookup_csums_range( ret = btrfs_lookup_csums_range(
log->fs_info->csum_root, log->fs_info->csum_root,
ds + cs, ds + cs + cl - 1, ds + cs, ds + cs + cl - 1,
&ordered_sums); &ordered_sums, 0);
BUG_ON(ret); BUG_ON(ret);
} }
} }
......
...@@ -38,9 +38,6 @@ static int init_first_rw_device(struct btrfs_trans_handle *trans, ...@@ -38,9 +38,6 @@ static int init_first_rw_device(struct btrfs_trans_handle *trans,
struct btrfs_device *device); struct btrfs_device *device);
static int btrfs_relocate_sys_chunks(struct btrfs_root *root); static int btrfs_relocate_sys_chunks(struct btrfs_root *root);
#define map_lookup_size(n) (sizeof(struct map_lookup) + \
(sizeof(struct btrfs_bio_stripe) * (n)))
static DEFINE_MUTEX(uuid_mutex); static DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids); static LIST_HEAD(fs_uuids);
...@@ -1334,6 +1331,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) ...@@ -1334,6 +1331,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
goto error_undo; goto error_undo;
device->in_fs_metadata = 0; device->in_fs_metadata = 0;
btrfs_scrub_cancel_dev(root, device);
/* /*
* the device list mutex makes sure that we don't change * the device list mutex makes sure that we don't change
......
...@@ -85,6 +85,9 @@ struct btrfs_device { ...@@ -85,6 +85,9 @@ struct btrfs_device {
/* physical drive uuid (or lvm uuid) */ /* physical drive uuid (or lvm uuid) */
u8 uuid[BTRFS_UUID_SIZE]; u8 uuid[BTRFS_UUID_SIZE];
/* per-device scrub information */
struct scrub_dev *scrub_device;
struct btrfs_work work; struct btrfs_work work;
}; };
...@@ -157,6 +160,9 @@ struct map_lookup { ...@@ -157,6 +160,9 @@ struct map_lookup {
struct btrfs_bio_stripe stripes[]; struct btrfs_bio_stripe stripes[];
}; };
/*
 * Size in bytes of a struct map_lookup together with n entries of its
 * trailing stripes[] array.
 */
#define map_lookup_size(n) (sizeof(struct map_lookup) + \
(sizeof(struct btrfs_bio_stripe) * (n)))
/* Used to sort the devices by max_avail(descending sort) */ /* Used to sort the devices by max_avail(descending sort) */
int btrfs_cmp_device_free_bytes(const void *dev_info1, const void *dev_info2); int btrfs_cmp_device_free_bytes(const void *dev_info1, const void *dev_info2);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment