Commit 32a7627c, authored by NeilBrown, committed by Linus Torvalds

[PATCH] md: optimised resync using Bitmap based intent logging

With this patch, the intent to write to some block in the array can be logged
to a bitmap file.  Each bit represents some number of sectors and is set
before any update happens, and only cleared when all writes relating to all
sectors are complete.

After an unclean shutdown, information in this bitmap can be used to optimise
resync - only sectors which could be out-of-sync need to be updated.

Also if a drive is removed and then added back into an array, the recovery can
make use of the bitmap to optimise reconstruction.  This is not implemented in
this patch.

Currently the bitmap is stored in a file which must (obviously) be stored on a
separate device.

This patch only provides infrastructure.  It does not update any personalities
to use bitmap intent logging.

Md arrays can still be used with no bitmap file.  This patch has minimal
impact on such arrays.
Signed-off-by: Neil Brown <neilb@cse.unsw.edu.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
parent 57afd89f
...@@ -7,6 +7,7 @@ dm-mod-objs := dm.o dm-table.o dm-target.o dm-linear.o dm-stripe.o \ ...@@ -7,6 +7,7 @@ dm-mod-objs := dm.o dm-table.o dm-target.o dm-linear.o dm-stripe.o \
dm-multipath-objs := dm-hw-handler.o dm-path-selector.o dm-mpath.o dm-multipath-objs := dm-hw-handler.o dm-path-selector.o dm-mpath.o
dm-snapshot-objs := dm-snap.o dm-exception-store.o dm-snapshot-objs := dm-snap.o dm-exception-store.o
dm-mirror-objs := dm-log.o dm-raid1.o dm-mirror-objs := dm-log.o dm-raid1.o
md-mod-objs := md.o bitmap.o
raid6-objs := raid6main.o raid6algos.o raid6recov.o raid6tables.o \ raid6-objs := raid6main.o raid6algos.o raid6recov.o raid6tables.o \
raid6int1.o raid6int2.o raid6int4.o \ raid6int1.o raid6int2.o raid6int4.o \
raid6int8.o raid6int16.o raid6int32.o \ raid6int8.o raid6int16.o raid6int32.o \
...@@ -28,7 +29,7 @@ obj-$(CONFIG_MD_RAID5) += raid5.o xor.o ...@@ -28,7 +29,7 @@ obj-$(CONFIG_MD_RAID5) += raid5.o xor.o
obj-$(CONFIG_MD_RAID6) += raid6.o xor.o obj-$(CONFIG_MD_RAID6) += raid6.o xor.o
obj-$(CONFIG_MD_MULTIPATH) += multipath.o obj-$(CONFIG_MD_MULTIPATH) += multipath.o
obj-$(CONFIG_MD_FAULTY) += faulty.o obj-$(CONFIG_MD_FAULTY) += faulty.o
obj-$(CONFIG_BLK_DEV_MD) += md.o obj-$(CONFIG_BLK_DEV_MD) += md-mod.o
obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o
obj-$(CONFIG_DM_CRYPT) += dm-crypt.o obj-$(CONFIG_DM_CRYPT) += dm-crypt.o
obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o
......
This diff is collapsed.
...@@ -19,6 +19,9 @@ ...@@ -19,6 +19,9 @@
Neil Brown <neilb@cse.unsw.edu.au>. Neil Brown <neilb@cse.unsw.edu.au>.
- persistent bitmap code
Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.
This program is free software; you can redistribute it and/or modify This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2, or (at your option) the Free Software Foundation; either version 2, or (at your option)
...@@ -33,6 +36,7 @@ ...@@ -33,6 +36,7 @@
#include <linux/config.h> #include <linux/config.h>
#include <linux/linkage.h> #include <linux/linkage.h>
#include <linux/raid/md.h> #include <linux/raid/md.h>
#include <linux/raid/bitmap.h>
#include <linux/sysctl.h> #include <linux/sysctl.h>
#include <linux/devfs_fs_kernel.h> #include <linux/devfs_fs_kernel.h>
#include <linux/buffer_head.h> /* for invalidate_bdev */ #include <linux/buffer_head.h> /* for invalidate_bdev */
...@@ -40,6 +44,8 @@ ...@@ -40,6 +44,8 @@
#include <linux/init.h> #include <linux/init.h>
#include <linux/file.h>
#ifdef CONFIG_KMOD #ifdef CONFIG_KMOD
#include <linux/kmod.h> #include <linux/kmod.h>
#endif #endif
...@@ -1198,8 +1204,11 @@ void md_print_devices(void) ...@@ -1198,8 +1204,11 @@ void md_print_devices(void)
printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n"); printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n");
printk("md: **********************************\n"); printk("md: **********************************\n");
ITERATE_MDDEV(mddev,tmp) { ITERATE_MDDEV(mddev,tmp) {
printk("%s: ", mdname(mddev));
if (mddev->bitmap)
bitmap_print_sb(mddev->bitmap);
else
printk("%s: ", mdname(mddev));
ITERATE_RDEV(mddev,rdev,tmp2) ITERATE_RDEV(mddev,rdev,tmp2)
printk("<%s>", bdevname(rdev->bdev,b)); printk("<%s>", bdevname(rdev->bdev,b));
printk("\n"); printk("\n");
...@@ -1287,7 +1296,7 @@ static void md_update_sb(mddev_t * mddev) ...@@ -1287,7 +1296,7 @@ static void md_update_sb(mddev_t * mddev)
"md: updating %s RAID superblock on device (in sync %d)\n", "md: updating %s RAID superblock on device (in sync %d)\n",
mdname(mddev),mddev->in_sync); mdname(mddev),mddev->in_sync);
err = 0; err = bitmap_update_sb(mddev->bitmap);
ITERATE_RDEV(mddev,rdev,tmp) { ITERATE_RDEV(mddev,rdev,tmp) {
char b[BDEVNAME_SIZE]; char b[BDEVNAME_SIZE];
dprintk(KERN_INFO "md: "); dprintk(KERN_INFO "md: ");
...@@ -1624,12 +1633,19 @@ static int do_md_run(mddev_t * mddev) ...@@ -1624,12 +1633,19 @@ static int do_md_run(mddev_t * mddev)
mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */ mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */
err = mddev->pers->run(mddev); /* before we start the array running, initialise the bitmap */
err = bitmap_create(mddev);
if (err)
printk(KERN_ERR "%s: failed to create bitmap (%d)\n",
mdname(mddev), err);
else
err = mddev->pers->run(mddev);
if (err) { if (err) {
printk(KERN_ERR "md: pers->run() failed ...\n"); printk(KERN_ERR "md: pers->run() failed ...\n");
module_put(mddev->pers->owner); module_put(mddev->pers->owner);
mddev->pers = NULL; mddev->pers = NULL;
return -EINVAL; bitmap_destroy(mddev);
return err;
} }
atomic_set(&mddev->writes_pending,0); atomic_set(&mddev->writes_pending,0);
mddev->safemode = 0; mddev->safemode = 0;
...@@ -1742,6 +1758,14 @@ static int do_md_stop(mddev_t * mddev, int ro) ...@@ -1742,6 +1758,14 @@ static int do_md_stop(mddev_t * mddev, int ro)
if (ro) if (ro)
set_disk_ro(disk, 1); set_disk_ro(disk, 1);
} }
bitmap_destroy(mddev);
if (mddev->bitmap_file) {
atomic_set(&mddev->bitmap_file->f_dentry->d_inode->i_writecount, 1);
fput(mddev->bitmap_file);
mddev->bitmap_file = NULL;
}
/* /*
* Free resources if final stop * Free resources if final stop
*/ */
...@@ -2000,6 +2024,42 @@ static int get_array_info(mddev_t * mddev, void __user * arg) ...@@ -2000,6 +2024,42 @@ static int get_array_info(mddev_t * mddev, void __user * arg)
return 0; return 0;
} }
/*
 * get_bitmap_file - copy the pathname of the array's bitmap file to user space.
 * @mddev: array being queried
 * @arg:   user-space address of an mdu_bitmap_file_t to fill in
 *
 * When the array has no bitmap, or the bitmap has no backing file, an empty
 * pathname is reported.
 *
 * Returns 0 on success, -ENOMEM if a buffer cannot be allocated or
 * file_path() yields no path, and -EFAULT if the copy to user space fails.
 */
static int get_bitmap_file(mddev_t * mddev, void * arg)
{
	mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */
	char *ptr, *buf = NULL;
	int err = -ENOMEM;

	file = kmalloc(sizeof(*file), GFP_KERNEL);
	if (!file)
		goto out;

	/*
	 * The whole structure is copied out below, not just the string, so
	 * clear it first: otherwise every byte after the terminating NUL
	 * would expose stale kernel heap contents to user space.
	 */
	memset(file, 0, sizeof(*file));

	/* bitmap disabled: the zeroed pathname already reads as "" */
	if (!mddev->bitmap || !mddev->bitmap->file)
		goto copy_out;

	buf = kmalloc(sizeof(file->pathname), GFP_KERNEL);
	if (!buf)
		goto out;

	/* scratch buffer is the same size as pathname, so the strcpy fits */
	ptr = file_path(mddev->bitmap->file, buf, sizeof(file->pathname));
	if (!ptr)
		goto out;

	strcpy(file->pathname, ptr);

copy_out:
	err = 0;
	if (copy_to_user(arg, file, sizeof(*file)))
		err = -EFAULT;
out:
	/* both pointers may still be NULL here */
	kfree(buf);
	kfree(file);
	return err;
}
static int get_disk_info(mddev_t * mddev, void __user * arg) static int get_disk_info(mddev_t * mddev, void __user * arg)
{ {
mdu_disk_info_t info; mdu_disk_info_t info;
...@@ -2275,6 +2335,48 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev) ...@@ -2275,6 +2335,48 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev)
return err; return err;
} }
/*
 * Like deny_write_access(), except that one writer is tolerated because
 * we hold a reference to the file ourselves.
 *
 * Under the inode's i_lock: if i_writecount is greater than 1 the file is
 * left untouched and -ETXTBSY is returned; otherwise i_writecount is set
 * to -1 and 0 is returned.
 */
static int deny_bitmap_write_access(struct file * file)
{
	struct inode *host = file->f_mapping->host;
	int rv = 0;

	spin_lock(&host->i_lock);
	if (atomic_read(&host->i_writecount) <= 1)
		atomic_set(&host->i_writecount, -1);
	else
		rv = -ETXTBSY;
	spin_unlock(&host->i_lock);

	return rv;
}
/*
 * set_bitmap_file - attach the file behind descriptor @fd as the array's
 * bitmap file.
 * @mddev: array to configure; must not have a personality running yet
 * @fd:    file descriptor of the bitmap file in the calling process
 *
 * On success mddev->bitmap_file holds a reference obtained with fget() and
 * the inode's i_writecount has been set to -1 by deny_bitmap_write_access().
 *
 * Returns 0 on success, -EBUSY if the array is already running, -EEXIST if
 * a bitmap file is already attached, -EBADF if @fd is not a valid
 * descriptor, or the error from deny_bitmap_write_access().
 */
static int set_bitmap_file(mddev_t *mddev, int fd)
{
	int err;

	if (mddev->pers)
		return -EBUSY;

	/*
	 * Refuse to replace an attached file: overwriting the pointer would
	 * drop our only handle on the old reference (never fput) and leave
	 * that inode's i_writecount stuck at -1.
	 */
	if (mddev->bitmap_file)
		return -EEXIST;

	mddev->bitmap_file = fget(fd);

	if (mddev->bitmap_file == NULL) {
		printk(KERN_ERR "%s: error: failed to get bitmap file\n",
			mdname(mddev));
		return -EBADF;
	}

	err = deny_bitmap_write_access(mddev->bitmap_file);
	if (err) {
		printk(KERN_ERR "%s: error: bitmap file is already in use\n",
			mdname(mddev));
		/* undo the fget() so no stale pointer or reference remains */
		fput(mddev->bitmap_file);
		mddev->bitmap_file = NULL;
	}
	return err;
}
/* /*
* set_array_info is used two different ways * set_array_info is used two different ways
* The original usage is when creating a new array. * The original usage is when creating a new array.
...@@ -2586,8 +2688,10 @@ static int md_ioctl(struct inode *inode, struct file *file, ...@@ -2586,8 +2688,10 @@ static int md_ioctl(struct inode *inode, struct file *file,
/* /*
* Commands querying/configuring an existing array: * Commands querying/configuring an existing array:
*/ */
/* if we are initialised yet, only ADD_NEW_DISK or STOP_ARRAY is allowed */ /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY,
if (!mddev->raid_disks && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY && cmd != RUN_ARRAY) { * RUN_ARRAY, and SET_BITMAP_FILE are allowed */
if (!mddev->raid_disks && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY
&& cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE) {
err = -ENODEV; err = -ENODEV;
goto abort_unlock; goto abort_unlock;
} }
...@@ -2601,6 +2705,10 @@ static int md_ioctl(struct inode *inode, struct file *file, ...@@ -2601,6 +2705,10 @@ static int md_ioctl(struct inode *inode, struct file *file,
err = get_array_info(mddev, argp); err = get_array_info(mddev, argp);
goto done_unlock; goto done_unlock;
case GET_BITMAP_FILE:
err = get_bitmap_file(mddev, (void *)arg);
goto done_unlock;
case GET_DISK_INFO: case GET_DISK_INFO:
err = get_disk_info(mddev, argp); err = get_disk_info(mddev, argp);
goto done_unlock; goto done_unlock;
...@@ -2681,6 +2789,10 @@ static int md_ioctl(struct inode *inode, struct file *file, ...@@ -2681,6 +2789,10 @@ static int md_ioctl(struct inode *inode, struct file *file,
err = do_md_run (mddev); err = do_md_run (mddev);
goto done_unlock; goto done_unlock;
case SET_BITMAP_FILE:
err = set_bitmap_file(mddev, (int)arg);
goto done_unlock;
default: default:
if (_IOC_TYPE(cmd) == MD_MAJOR) if (_IOC_TYPE(cmd) == MD_MAJOR)
printk(KERN_WARNING "md: %s(pid %d) used" printk(KERN_WARNING "md: %s(pid %d) used"
...@@ -2792,8 +2904,9 @@ static int md_thread(void * arg) ...@@ -2792,8 +2904,9 @@ static int md_thread(void * arg)
while (thread->run) { while (thread->run) {
void (*run)(mddev_t *); void (*run)(mddev_t *);
wait_event_interruptible(thread->wqueue, wait_event_interruptible_timeout(thread->wqueue,
test_bit(THREAD_WAKEUP, &thread->flags)); test_bit(THREAD_WAKEUP, &thread->flags),
thread->timeout);
if (current->flags & PF_FREEZE) if (current->flags & PF_FREEZE)
refrigerator(PF_FREEZE); refrigerator(PF_FREEZE);
...@@ -2839,6 +2952,7 @@ mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev, ...@@ -2839,6 +2952,7 @@ mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev,
thread->run = run; thread->run = run;
thread->mddev = mddev; thread->mddev = mddev;
thread->name = name; thread->name = name;
thread->timeout = MAX_SCHEDULE_TIMEOUT;
ret = kernel_thread(md_thread, thread, 0); ret = kernel_thread(md_thread, thread, 0);
if (ret < 0) { if (ret < 0) {
kfree(thread); kfree(thread);
...@@ -2877,13 +2991,13 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev) ...@@ -2877,13 +2991,13 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
if (!rdev || rdev->faulty) if (!rdev || rdev->faulty)
return; return;
/*
dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n", dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",
mdname(mddev), mdname(mddev),
MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev), MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev),
__builtin_return_address(0),__builtin_return_address(1), __builtin_return_address(0),__builtin_return_address(1),
__builtin_return_address(2),__builtin_return_address(3)); __builtin_return_address(2),__builtin_return_address(3));
*/
if (!mddev->pers->error_handler) if (!mddev->pers->error_handler)
return; return;
mddev->pers->error_handler(mddev,rdev); mddev->pers->error_handler(mddev,rdev);
...@@ -3037,6 +3151,7 @@ static int md_seq_show(struct seq_file *seq, void *v) ...@@ -3037,6 +3151,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
struct list_head *tmp2; struct list_head *tmp2;
mdk_rdev_t *rdev; mdk_rdev_t *rdev;
int i; int i;
struct bitmap *bitmap;
if (v == (void*)1) { if (v == (void*)1) {
seq_printf(seq, "Personalities : "); seq_printf(seq, "Personalities : ");
...@@ -3089,10 +3204,36 @@ static int md_seq_show(struct seq_file *seq, void *v) ...@@ -3089,10 +3204,36 @@ static int md_seq_show(struct seq_file *seq, void *v)
if (mddev->pers) { if (mddev->pers) {
mddev->pers->status (seq, mddev); mddev->pers->status (seq, mddev);
seq_printf(seq, "\n "); seq_printf(seq, "\n ");
if (mddev->curr_resync > 2) if (mddev->curr_resync > 2) {
status_resync (seq, mddev); status_resync (seq, mddev);
else if (mddev->curr_resync == 1 || mddev->curr_resync == 2) seq_printf(seq, "\n ");
seq_printf(seq, " resync=DELAYED"); } else if (mddev->curr_resync == 1 || mddev->curr_resync == 2)
seq_printf(seq, " resync=DELAYED\n ");
} else
seq_printf(seq, "\n ");
if ((bitmap = mddev->bitmap)) {
char *buf, *path;
unsigned long chunk_kb;
unsigned long flags;
buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
spin_lock_irqsave(&bitmap->lock, flags);
chunk_kb = bitmap->chunksize >> 10;
seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], "
"%lu%s chunk",
bitmap->pages - bitmap->missing_pages,
bitmap->pages,
(bitmap->pages - bitmap->missing_pages)
<< (PAGE_SHIFT - 10),
chunk_kb ? chunk_kb : bitmap->chunksize,
chunk_kb ? "KB" : "B");
if (bitmap->file && buf) {
path = file_path(bitmap->file, buf, PAGE_SIZE);
seq_printf(seq, ", file: %s", path ? path : "");
}
seq_printf(seq, "\n");
spin_unlock_irqrestore(&bitmap->lock, flags);
kfree(buf);
} }
seq_printf(seq, "\n"); seq_printf(seq, "\n");
...@@ -3328,7 +3469,8 @@ static void md_do_sync(mddev_t *mddev) ...@@ -3328,7 +3469,8 @@ static void md_do_sync(mddev_t *mddev)
sysctl_speed_limit_max); sysctl_speed_limit_max);
is_mddev_idle(mddev); /* this also initializes IO event counters */ is_mddev_idle(mddev); /* this also initializes IO event counters */
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) /* we don't use the checkpoint if there's a bitmap */
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && !mddev->bitmap)
j = mddev->recovery_cp; j = mddev->recovery_cp;
else else
j = 0; j = 0;
...@@ -3673,6 +3815,8 @@ static int __init md_init(void) ...@@ -3673,6 +3815,8 @@ static int __init md_init(void)
" MD_SB_DISKS=%d\n", " MD_SB_DISKS=%d\n",
MD_MAJOR_VERSION, MD_MINOR_VERSION, MD_MAJOR_VERSION, MD_MINOR_VERSION,
MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS); MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS);
printk(KERN_INFO "md: bitmap version %d.%d\n", BITMAP_MAJOR,
BITMAP_MINOR);
if (register_blkdev(MAJOR_NR, "md")) if (register_blkdev(MAJOR_NR, "md"))
return -1; return -1;
......
This diff is collapsed.
...@@ -267,6 +267,9 @@ struct mddev_s ...@@ -267,6 +267,9 @@ struct mddev_s
atomic_t writes_pending; atomic_t writes_pending;
request_queue_t *queue; /* for plugging ... */ request_queue_t *queue; /* for plugging ... */
struct bitmap *bitmap; /* the bitmap for the device */
struct file *bitmap_file; /* the bitmap file */
struct list_head all_mddevs; struct list_head all_mddevs;
}; };
...@@ -341,6 +344,7 @@ typedef struct mdk_thread_s { ...@@ -341,6 +344,7 @@ typedef struct mdk_thread_s {
unsigned long flags; unsigned long flags;
struct completion *event; struct completion *event;
struct task_struct *tsk; struct task_struct *tsk;
unsigned long timeout;
const char *name; const char *name;
} mdk_thread_t; } mdk_thread_t;
......
...@@ -23,6 +23,7 @@ ...@@ -23,6 +23,7 @@
#define GET_DISK_INFO _IOR (MD_MAJOR, 0x12, mdu_disk_info_t) #define GET_DISK_INFO _IOR (MD_MAJOR, 0x12, mdu_disk_info_t)
#define PRINT_RAID_DEBUG _IO (MD_MAJOR, 0x13) #define PRINT_RAID_DEBUG _IO (MD_MAJOR, 0x13)
#define RAID_AUTORUN _IO (MD_MAJOR, 0x14) #define RAID_AUTORUN _IO (MD_MAJOR, 0x14)
#define GET_BITMAP_FILE _IOR (MD_MAJOR, 0x15, mdu_bitmap_file_t)
/* configuration */ /* configuration */
#define CLEAR_ARRAY _IO (MD_MAJOR, 0x20) #define CLEAR_ARRAY _IO (MD_MAJOR, 0x20)
...@@ -36,6 +37,7 @@ ...@@ -36,6 +37,7 @@
#define HOT_ADD_DISK _IO (MD_MAJOR, 0x28) #define HOT_ADD_DISK _IO (MD_MAJOR, 0x28)
#define SET_DISK_FAULTY _IO (MD_MAJOR, 0x29) #define SET_DISK_FAULTY _IO (MD_MAJOR, 0x29)
#define HOT_GENERATE_ERROR _IO (MD_MAJOR, 0x2a) #define HOT_GENERATE_ERROR _IO (MD_MAJOR, 0x2a)
#define SET_BITMAP_FILE _IOW (MD_MAJOR, 0x2b, int)
/* usage */ /* usage */
#define RUN_ARRAY _IOW (MD_MAJOR, 0x30, mdu_param_t) #define RUN_ARRAY _IOW (MD_MAJOR, 0x30, mdu_param_t)
...@@ -106,6 +108,11 @@ typedef struct mdu_start_info_s { ...@@ -106,6 +108,11 @@ typedef struct mdu_start_info_s {
} mdu_start_info_t; } mdu_start_info_t;
/*
 * Argument structure of the GET_BITMAP_FILE ioctl: carries the pathname of
 * the array's bitmap file as a NUL-terminated string.
 */
typedef struct mdu_bitmap_file_s {
	char pathname[4096];	/* bitmap file path; "" when there is none */
} mdu_bitmap_file_t;
typedef struct mdu_param_s typedef struct mdu_param_s
{ {
int personality; /* 1,2,3,4 */ int personality; /* 1,2,3,4 */
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment