Commit 4b382d06 authored by Linus Torvalds

Merge branch 'for-linus' of git://neil.brown.name/md

* 'for-linus' of git://neil.brown.name/md:
  md: allow resync_start to be set while an array is active.
  md/raid10:  reformat some loops with less indenting.
  md/raid10: remove unused variable.
  md/raid10: make more use of 'slot' in raid10d.
  md/raid10: some tidying up in fix_read_error
  md/raid1: improve handling of pages allocated for write-behind.
  md/raid1: try fix_sync_read_error before process_checks.
  md/raid1: tidy up new functions: process_checks and fix_sync_read_error.
  md/raid1: split out two sub-functions from sync_request_write
  md: make error_handler functions more uniform and correct.
  md/multipath: discard ->working_disks in favour of ->degraded
  md/raid1: clean up read_balance.
  md: simplify raid10 read_balance
  md/bitmap: fix saving of events_cleared and other state.
  md: reject a re-add request that cannot be honoured.
  md: Fix race when creating a new md device.
parents bdfbe804 b098636c
drivers/md/bitmap.c
@@ -493,11 +493,11 @@ void bitmap_update_sb(struct bitmap *bitmap)
         spin_unlock_irqrestore(&bitmap->lock, flags);
         sb = kmap_atomic(bitmap->sb_page, KM_USER0);
         sb->events = cpu_to_le64(bitmap->mddev->events);
-        if (bitmap->mddev->events < bitmap->events_cleared) {
+        if (bitmap->mddev->events < bitmap->events_cleared)
                 /* rocking back to read-only */
                 bitmap->events_cleared = bitmap->mddev->events;
-                sb->events_cleared = cpu_to_le64(bitmap->events_cleared);
-        }
+        sb->events_cleared = cpu_to_le64(bitmap->events_cleared);
+        sb->state = cpu_to_le32(bitmap->flags);
         /* Just in case these have been changed via sysfs: */
         sb->daemon_sleep = cpu_to_le32(bitmap->mddev->bitmap_info.daemon_sleep/HZ);
         sb->write_behind = cpu_to_le32(bitmap->mddev->bitmap_info.max_write_behind);
@@ -618,7 +618,7 @@ static int bitmap_read_sb(struct bitmap *bitmap)
         if (le32_to_cpu(sb->version) == BITMAP_MAJOR_HOSTENDIAN)
                 bitmap->flags |= BITMAP_HOSTENDIAN;
         bitmap->events_cleared = le64_to_cpu(sb->events_cleared);
-        if (sb->state & cpu_to_le32(BITMAP_STALE))
+        if (bitmap->flags & BITMAP_STALE)
                 bitmap->events_cleared = bitmap->mddev->events;
         err = 0;
 out:
@@ -652,9 +652,11 @@ static int bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits,
         switch (op) {
         case MASK_SET:
                 sb->state |= cpu_to_le32(bits);
+                bitmap->flags |= bits;
                 break;
         case MASK_UNSET:
                 sb->state &= cpu_to_le32(~bits);
+                bitmap->flags &= ~bits;
                 break;
         default:
                 BUG();
drivers/md/md.c
@@ -3324,7 +3324,7 @@ resync_start_store(mddev_t *mddev, const char *buf, size_t len)
         char *e;
         unsigned long long n = simple_strtoull(buf, &e, 10);
 
-        if (mddev->pers)
+        if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
                 return -EBUSY;
         if (cmd_match(buf, "none"))
                 n = MaxSector;
@@ -4347,13 +4347,19 @@ static int md_alloc(dev_t dev, char *name)
         disk->fops = &md_fops;
         disk->private_data = mddev;
         disk->queue = mddev->queue;
+        blk_queue_flush(mddev->queue, REQ_FLUSH | REQ_FUA);
         /* Allow extended partitions. This makes the
          * 'mdp' device redundant, but we can't really
          * remove it now.
          */
         disk->flags |= GENHD_FL_EXT_DEVT;
-        add_disk(disk);
         mddev->gendisk = disk;
+        /* As soon as we call add_disk(), another thread could get
+         * through to md_open, so make sure it doesn't get too far
+         */
+        mutex_lock(&mddev->open_mutex);
+        add_disk(disk);
+
         error = kobject_init_and_add(&mddev->kobj, &md_ktype,
                                      &disk_to_dev(disk)->kobj, "%s", "md");
         if (error) {
@@ -4367,8 +4373,7 @@ static int md_alloc(dev_t dev, char *name)
         if (mddev->kobj.sd &&
             sysfs_create_group(&mddev->kobj, &md_bitmap_group))
                 printk(KERN_DEBUG "pointless warning\n");
-
-        blk_queue_flush(mddev->queue, REQ_FLUSH | REQ_FUA);
+        mutex_unlock(&mddev->open_mutex);
 abort:
         mutex_unlock(&disks_mutex);
         if (!error && mddev->kobj.sd) {
@@ -5211,6 +5216,16 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
                 } else
                         super_types[mddev->major_version].
                                 validate_super(mddev, rdev);
+                if ((info->state & (1<<MD_DISK_SYNC)) &&
+                     (!test_bit(In_sync, &rdev->flags) ||
+                      rdev->raid_disk != info->raid_disk)) {
+                        /* This was a hot-add request, but events doesn't
+                         * match, so reject it.
+                         */
+                        export_rdev(rdev);
+                        return -EINVAL;
+                }
+
                 if (test_bit(In_sync, &rdev->flags))
                         rdev->saved_raid_disk = rdev->raid_disk;
                 else
drivers/md/multipath.c
@@ -146,7 +146,7 @@ static void multipath_status (struct seq_file *seq, mddev_t *mddev)
         int i;
 
         seq_printf (seq, " [%d/%d] [", conf->raid_disks,
-                    conf->working_disks);
+                    conf->raid_disks - mddev->degraded);
         for (i = 0; i < conf->raid_disks; i++)
                 seq_printf (seq, "%s",
                             conf->multipaths[i].rdev &&
@@ -186,35 +186,36 @@ static int multipath_congested(void *data, int bits)
 static void multipath_error (mddev_t *mddev, mdk_rdev_t *rdev)
 {
         multipath_conf_t *conf = mddev->private;
+        char b[BDEVNAME_SIZE];
 
-        if (conf->working_disks <= 1) {
+        if (conf->raid_disks - mddev->degraded <= 1) {
                 /*
                  * Uh oh, we can do nothing if this is our last path, but
                  * first check if this is a queued request for a device
                  * which has just failed.
                  */
                 printk(KERN_ALERT
                        "multipath: only one IO path left and IO error.\n");
                 /* leave it active... it's all we have */
-        } else {
-                /*
-                 * Mark disk as unusable
-                 */
-                if (!test_bit(Faulty, &rdev->flags)) {
-                        char b[BDEVNAME_SIZE];
-                        clear_bit(In_sync, &rdev->flags);
-                        set_bit(Faulty, &rdev->flags);
-                        set_bit(MD_CHANGE_DEVS, &mddev->flags);
-                        conf->working_disks--;
-                        mddev->degraded++;
-                        printk(KERN_ALERT "multipath: IO failure on %s,"
-                               " disabling IO path.\n"
-                               "multipath: Operation continuing"
-                               " on %d IO paths.\n",
-                               bdevname (rdev->bdev,b),
-                               conf->working_disks);
-                }
+                return;
         }
+        /*
+         * Mark disk as unusable
+         */
+        if (test_and_clear_bit(In_sync, &rdev->flags)) {
+                unsigned long flags;
+                spin_lock_irqsave(&conf->device_lock, flags);
+                mddev->degraded++;
+                spin_unlock_irqrestore(&conf->device_lock, flags);
+        }
+        set_bit(Faulty, &rdev->flags);
+        set_bit(MD_CHANGE_DEVS, &mddev->flags);
+        printk(KERN_ALERT "multipath: IO failure on %s,"
+               " disabling IO path.\n"
+               "multipath: Operation continuing"
+               " on %d IO paths.\n",
+               bdevname(rdev->bdev, b),
+               conf->raid_disks - mddev->degraded);
 }
 
 static void print_multipath_conf (multipath_conf_t *conf)
@@ -227,7 +228,7 @@ static void print_multipath_conf (multipath_conf_t *conf)
                 printk("(conf==NULL)\n");
                 return;
         }
-        printk(" --- wd:%d rd:%d\n", conf->working_disks,
+        printk(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,
                conf->raid_disks);
 
         for (i = 0; i < conf->raid_disks; i++) {
@@ -274,10 +275,11 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
                                                    PAGE_CACHE_SIZE - 1);
                 }
 
-                conf->working_disks++;
+                spin_lock_irq(&conf->device_lock);
                 mddev->degraded--;
                 rdev->raid_disk = path;
                 set_bit(In_sync, &rdev->flags);
+                spin_unlock_irq(&conf->device_lock);
                 rcu_assign_pointer(p->rdev, rdev);
                 err = 0;
                 md_integrity_add_rdev(rdev, mddev);
@@ -391,6 +393,7 @@ static int multipath_run (mddev_t *mddev)
         int disk_idx;
         struct multipath_info *disk;
         mdk_rdev_t *rdev;
+        int working_disks;
 
         if (md_check_no_bitmap(mddev))
                 return -EINVAL;
@@ -424,7 +427,7 @@ static int multipath_run (mddev_t *mddev)
                 goto out_free_conf;
         }
 
-        conf->working_disks = 0;
+        working_disks = 0;
         list_for_each_entry(rdev, &mddev->disks, same_set) {
                 disk_idx = rdev->raid_disk;
                 if (disk_idx < 0 ||
@@ -446,7 +449,7 @@ static int multipath_run (mddev_t *mddev)
                 }
 
                 if (!test_bit(Faulty, &rdev->flags))
-                        conf->working_disks++;
+                        working_disks++;
         }
 
         conf->raid_disks = mddev->raid_disks;
@@ -454,12 +457,12 @@ static int multipath_run (mddev_t *mddev)
         spin_lock_init(&conf->device_lock);
         INIT_LIST_HEAD(&conf->retry_list);
 
-        if (!conf->working_disks) {
+        if (!working_disks) {
                 printk(KERN_ERR "multipath: no operational IO paths for %s\n",
                        mdname(mddev));
                 goto out_free_conf;
         }
-        mddev->degraded = conf->raid_disks - conf->working_disks;
+        mddev->degraded = conf->raid_disks - working_disks;
 
         conf->pool = mempool_create_kmalloc_pool(NR_RESERVED_BUFS,
                                                  sizeof(struct multipath_bh));
@@ -481,7 +484,8 @@ static int multipath_run (mddev_t *mddev)
         printk(KERN_INFO
                "multipath: array %s active with %d out of %d IO paths\n",
-               mdname(mddev), conf->working_disks, mddev->raid_disks);
+               mdname(mddev), conf->raid_disks - mddev->degraded,
+               mddev->raid_disks);
 
         /*
          * Ok, everything is just fine now
          */
drivers/md/multipath.h
@@ -9,7 +9,6 @@ struct multipath_private_data {
         mddev_t *mddev;
         struct multipath_info *multipaths;
         int raid_disks;
-        int working_disks;
         spinlock_t device_lock;
         struct list_head retry_list;
[diff collapsed in this view: drivers/md/raid1.c]
drivers/md/raid1.h
@@ -94,7 +94,9 @@ struct r1bio_s {
         int read_disk;
 
         struct list_head retry_list;
-        struct bitmap_update *bitmap_update;
+        /* Next two are only valid when R1BIO_BehindIO is set */
+        struct page **behind_pages;
+        int behind_page_count;
         /*
          * if the IO is in WRITE direction, then multiple bios are used.
          * We choose the number when they are allocated.
[diff collapsed in this view: drivers/md/raid10.c]
drivers/md/raid5.c
@@ -1700,27 +1700,25 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
         raid5_conf_t *conf = mddev->private;
         pr_debug("raid456: error called\n");
 
-        if (!test_bit(Faulty, &rdev->flags)) {
-                set_bit(MD_CHANGE_DEVS, &mddev->flags);
-                if (test_and_clear_bit(In_sync, &rdev->flags)) {
-                        unsigned long flags;
-                        spin_lock_irqsave(&conf->device_lock, flags);
-                        mddev->degraded++;
-                        spin_unlock_irqrestore(&conf->device_lock, flags);
-                        /*
-                         * if recovery was running, make sure it aborts.
-                         */
-                        set_bit(MD_RECOVERY_INTR, &mddev->recovery);
-                }
-                set_bit(Faulty, &rdev->flags);
-                printk(KERN_ALERT
-                       "md/raid:%s: Disk failure on %s, disabling device.\n"
-                       "md/raid:%s: Operation continuing on %d devices.\n",
-                       mdname(mddev),
-                       bdevname(rdev->bdev, b),
-                       mdname(mddev),
-                       conf->raid_disks - mddev->degraded);
+        if (test_and_clear_bit(In_sync, &rdev->flags)) {
+                unsigned long flags;
+                spin_lock_irqsave(&conf->device_lock, flags);
+                mddev->degraded++;
+                spin_unlock_irqrestore(&conf->device_lock, flags);
+                /*
+                 * if recovery was running, make sure it aborts.
+                 */
+                set_bit(MD_RECOVERY_INTR, &mddev->recovery);
         }
+        set_bit(Faulty, &rdev->flags);
+        set_bit(MD_CHANGE_DEVS, &mddev->flags);
+        printk(KERN_ALERT
+               "md/raid:%s: Disk failure on %s, disabling device.\n"
+               "md/raid:%s: Operation continuing on %d devices.\n",
+               mdname(mddev),
+               bdevname(rdev->bdev, b),
+               mdname(mddev),
+               conf->raid_disks - mddev->degraded);
 }
 
 /*
@@ -5391,7 +5389,8 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors)
                 return -EINVAL;
         set_capacity(mddev->gendisk, mddev->array_sectors);
         revalidate_disk(mddev->gendisk);
-        if (sectors > mddev->dev_sectors && mddev->recovery_cp == MaxSector) {
+        if (sectors > mddev->dev_sectors &&
+            mddev->recovery_cp > mddev->dev_sectors) {
                 mddev->recovery_cp = mddev->dev_sectors;
                 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
         }