Merge branch 'for-linus' of git://neil.brown.name/md

* 'for-linus' of git://neil.brown.name/md: (27 commits) md: add 'recovery_start' per-device sysfs attribute md: rcu_read_lock() walk of mddev->disks in md_do_sync() md: integrate spares into array at earliest opportunity. md: move compat_ioctl handling into md.c md: revise Kconfig help for MD_MULTIPATH md: add MODULE_DESCRIPTION for all md related modules. raid: improve MD/raid10 handling of correctable read errors. md/raid10: print more useful messages on device failure. md/bitmap: update dirty flag when bitmap bits are explicitly set. md: Support write-intent bitmaps with externally managed metadata. md/bitmap: move setting of daemon_lastrun out of bitmap_read_sb md: support updating bitmap parameters via sysfs. md: factor out parsing of fixed-point numbers md: support bitmap offset appropriate for external-metadata arrays. md: remove needless setting of thread->timeout in raid10_quiesce md: change daemon_sleep to be in 'jiffies' rather than 'seconds'. md: move offset, daemon_sleep and chunksize out of bitmap structure md: collect bitmap-specific fields into one structure. md/raid1: add takeover support for raid5->raid1 md: add honouring of suspend_{lo,hi} to raid1. ...

Merge branch 'for-linus' of git://neil.brown.name/md
* 'for-linus' of git://neil.brown.name/md: (27 commits) md: add 'recovery_start' per-device sysfs attribute md: rcu_read_lock() walk of mddev->disks in md_do_sync() md: integrate spares into array at earliest opportunity. md: move compat_ioctl handling into md.c md: revise Kconfig help for MD_MULTIPATH md: add MODULE_DESCRIPTION for all md related modules. raid: improve MD/raid10 handling of correctable read errors. md/raid10: print more useful messages on device failure. md/bitmap: update dirty flag when bitmap bits are explicitly set. md: Support write-intent bitmaps with externally managed metadata. md/bitmap: move setting of daemon_lastrun out of bitmap_read_sb md: support updating bitmap parameters via sysfs. md: factor out parsing of fixed-point numbers md: support bitmap offset appropriate for external-metadata arrays. md: remove needless setting of thread->timeout in raid10_quiesce md: change daemon_sleep to be in 'jiffies' rather than 'seconds'. md: move offset, daemon_sleep and chunksize out of bitmap structure md: collect bitmap-specific fields into one structure. md/raid1: add takeover support for raid5->raid1 md: add honouring of suspend_{lo,hi} to raid1. ...
37222e1c · Linus Torvalds · 76b8f82c · 06e3c817 · 37222e1c · 37222e1c
Commit 37222e1c authored Dec 14, 2009 by Linus Torvalds
17 changed files
--- a/Documentation/md.txt
+++ b/Documentation/md.txt
@@ -233,9 +233,9 @@ All md devices contain:

  resync_start
     The point at which resync should start.  If no resync is needed,
-     this will be a very large number.  At array creation it will
-     default to 0, though starting the array as 'clean' will
-     set it much larger.
+     this will be a very large number (or 'none' since 2.6.30-rc1).  At
+     array creation it will default to 0, though starting the array as
+     'clean' will set it much larger.

   new_dev
     This file can be written but not read.  The value written should
@@ -296,6 +296,51 @@ All md devices contain:
     active-idle
         like active, but no writes have been seen for a while (safe_mode_delay).

+  bitmap/location
+     This indicates where the write-intent bitmap for the array is
+     stored.
+     It can be one of "none", "file" or "[+-]N".
+     "file" may later be extended to "file:/file/name"
+     "[+-]N" means that many sectors from the start of the metadata.
+       This is replicated on all devices.  For arrays with externally
+       managed metadata, the offset is from the beginning of the
+       device.
+  bitmap/chunksize
+     The size, in bytes, of the chunk which will be represented by a
+     single bit.  For RAID456, it is a portion of an individual
+     device. For RAID10, it is a portion of the array.  For RAID1, it
+     is both (they come to the same thing).
+  bitmap/time_base
+     The time, in seconds, between looking for bits in the bitmap to
+     be cleared. In the current implementation, a bit will be cleared
+     between 2 and 3 times "time_base" after all the covered blocks
+     are known to be in-sync.
+  bitmap/backlog
+     When write-mostly devices are active in a RAID1, write requests
+     to those devices proceed in the background - the filesystem (or
+     other user of the device) does not have to wait for them.
+     'backlog' sets a limit on the number of concurrent background
+     writes.  If there are more than this, new writes will be
+     synchronous.
+  bitmap/metadata
+     This can be either 'internal' or 'external'.
+     'internal' is the default and means the metadata for the bitmap
+     is stored in the first 256 bytes of the allocated space and is
+     managed by the md module.
+     'external' means that bitmap metadata is managed externally to
+     the kernel (i.e. by some userspace program)
+  bitmap/can_clear
+     This is either 'true' or 'false'.  If 'true', then bits in the
+     bitmap will be cleared when the corresponding blocks are thought
+     to be in-sync.  If 'false', bits will never be cleared.
+     This is automatically set to 'false' if a write happens on a
+     degraded array, or if the array becomes degraded during a write.
+     When metadata is managed externally, it should be set to true
+     once the array becomes non-degraded, and this fact has been
+     recorded in the metadata.
+     
+     
+     

 As component devices are added to an md array, they appear in the 'md'
 directory as new directories named
@@ -334,8 +379,9 @@ Each directory contains:
 	Writing "writemostly" sets the writemostly flag.
 	Writing "-writemostly" clears the writemostly flag.
 	Writing "blocked" sets the "blocked" flag.
-	Writing "-blocked" clear the "blocked" flag and allows writes
+	Writing "-blocked" clears the "blocked" flag and allows writes
 		to complete.
+	Writing "in_sync" sets the in_sync flag.

 	This file responds to select/poll. Any change to 'faulty'
 	or 'blocked' causes an event.
@@ -372,6 +418,24 @@ Each directory contains:
        array.  If a value less than the current component_size is
        written, it will be rejected.

+      recovery_start
+
+        When the device is not 'in_sync', this records the number of
+	sectors from the start of the device which are known to be
+	correct.  This is normally zero, but during a recovery
+	operation it will steadily increase, and if the recovery is
+	interrupted, restoring this value can cause recovery to
+	avoid repeating the earlier blocks.  With v1.x metadata, this
+	value is saved and restored automatically.
+
+	This can be set whenever the device is not an active member of
+	the array, either before the array is activated, or before
+	the 'slot' is set.
+
+	Setting this to 'none' is equivalent to setting 'in_sync'.
+	Setting to any other value also clears the 'in_sync' flag.
+	
+

 An active md device will also contain and entry for each active device
 in the array.  These are named

--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -185,11 +185,10 @@ config MD_MULTIPATH
 	tristate "Multipath I/O support"
 	depends on BLK_DEV_MD
 	help
-	  Multipath-IO is the ability of certain devices to address the same
-	  physical disk over multiple 'IO paths'. The code ensures that such
-	  paths can be defined and handled at runtime, and ensures that a
-	  transparent failover to the backup path(s) happens if a IO errors
-	  arrives on the primary path.
+	  MD_MULTIPATH provides a simple multi-path personality for use
+	  with the MD framework.  It is not under active development.  New
+	  projects should consider using DM_MULTIPATH which has more
+	  features and more testing.

 	  If unsure, say N.


--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
--- a/drivers/md/bitmap.h
+++ b/drivers/md/bitmap.h
@@ -106,7 +106,7 @@ typedef __u16 bitmap_counter_t;
 #define BITMAP_BLOCK_SHIFT 9

 /* how many blocks per chunk? (this is variable) */
-#define CHUNK_BLOCK_RATIO(bitmap) ((bitmap)->chunksize >> BITMAP_BLOCK_SHIFT)
+#define CHUNK_BLOCK_RATIO(bitmap) ((bitmap)->mddev->bitmap_info.chunksize >> BITMAP_BLOCK_SHIFT)
 #define CHUNK_BLOCK_SHIFT(bitmap) ((bitmap)->chunkshift - BITMAP_BLOCK_SHIFT)
 #define CHUNK_BLOCK_MASK(bitmap) (CHUNK_BLOCK_RATIO(bitmap) - 1)

@@ -118,16 +118,6 @@ typedef __u16 bitmap_counter_t;
 			(CHUNK_BLOCK_SHIFT(bitmap) + PAGE_COUNTER_SHIFT - 1)
 #define PAGEPTR_BLOCK_MASK(bitmap) (PAGEPTR_BLOCK_RATIO(bitmap) - 1)

-/*
- * on-disk bitmap:
- *
- * Use one bit per "chunk" (block set). We do the disk I/O on the bitmap
- * file a page at a time. There's a superblock at the start of the file.
- */
-
-/* map chunks (bits) to file pages - offset by the size of the superblock */
-#define CHUNK_BIT_OFFSET(chunk) ((chunk) + (sizeof(bitmap_super_t) << 3))
-
 #endif

 /*
@@ -209,7 +199,6 @@ struct bitmap {
 	int counter_bits; /* how many bits per block counter */

 	/* bitmap chunksize -- how much data does each bit represent? */
-	unsigned long chunksize;
 	unsigned long chunkshift; /* chunksize = 2^chunkshift (for bitops) */
 	unsigned long chunks; /* total number of data chunks for the array */

@@ -226,7 +215,6 @@ struct bitmap {
 	/* bitmap spinlock */
 	spinlock_t lock;

-	long offset; /* offset from superblock if file is NULL */
 	struct file *file; /* backing disk file */
 	struct page *sb_page; /* cached copy of the bitmap file superblock */
 	struct page **filemap; /* list of cache pages for the file */
@@ -238,7 +226,6 @@ struct bitmap {

 	int allclean;

-	unsigned long max_write_behind; /* write-behind mode */
 	atomic_t behind_writes;

 	/*
@@ -246,7 +233,6 @@ struct bitmap {
 	 * file, cleaning up bits and flushing out pages to disk as necessary
 	 */
 	unsigned long daemon_lastrun; /* jiffies of last run */
-	unsigned long daemon_sleep; /* how many seconds between updates? */
 	unsigned long last_end_sync; /* when we lasted called end_sync to
 				      * update bitmap with resync progress */

@@ -254,6 +240,7 @@ struct bitmap {
 	wait_queue_head_t write_wait;
 	wait_queue_head_t overflow_wait;

+	struct sysfs_dirent *sysfs_can_clear;
 };

 /* the bitmap API */
@@ -282,7 +269,7 @@ void bitmap_close_sync(struct bitmap *bitmap);
 void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector);

 void bitmap_unplug(struct bitmap *bitmap);
-void bitmap_daemon_work(struct bitmap *bitmap);
+void bitmap_daemon_work(mddev_t *mddev);
 #endif

 #endif
--- a/drivers/md/faulty.c
+++ b/drivers/md/faulty.c
@@ -360,6 +360,7 @@ static void raid_exit(void)
 module_init(raid_init);
 module_exit(raid_exit);
 MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Fault injection personality for MD");
 MODULE_ALIAS("md-personality-10"); /* faulty */
 MODULE_ALIAS("md-faulty");
 MODULE_ALIAS("md-level--5");
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -292,7 +292,7 @@ static int linear_make_request (struct request_queue *q, struct bio *bio)
 	int cpu;

 	if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
-		bio_endio(bio, -EOPNOTSUPP);
+		md_barrier_request(mddev, bio);
 		return 0;
 	}

@@ -383,6 +383,7 @@ static void linear_exit (void)
 module_init(linear_init);
 module_exit(linear_exit);
 MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Linear device concatenation personality for MD");
 MODULE_ALIAS("md-personality-1"); /* LINEAR - deprecated*/
 MODULE_ALIAS("md-linear");
 MODULE_ALIAS("md-level--1");
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -97,6 +97,9 @@ struct mdk_rdev_s
 	atomic_t	read_errors;	/* number of consecutive read errors that
 					 * we have tried to ignore.
 					 */
+	struct timespec last_read_error;	/* monotonic timestamp of our
+						 * last read error
+						 */
 	atomic_t	corrected_errors; /* number of corrected read errors,
 					   * for reporting to userspace and storing
 					   * in superblock.
@@ -280,17 +283,38 @@ struct mddev_s
 	unsigned int                    max_write_behind; /* 0 = sync */

 	struct bitmap                   *bitmap; /* the bitmap for the device */
-	struct file			*bitmap_file; /* the bitmap file */
-	long				bitmap_offset; /* offset from superblock of
-							* start of bitmap. May be
-							* negative, but not '0'
-							*/
-	long				default_bitmap_offset; /* this is the offset to use when
-								* hot-adding a bitmap.  It should
-								* eventually be settable by sysfs.
-								*/
-
+	struct {
+		struct file		*file; /* the bitmap file */
+		loff_t			offset; /* offset from superblock of
+						 * start of bitmap. May be
+						 * negative, but not '0'
+						 * For external metadata, offset
+						 * from start of device. 
+						 */
+		loff_t			default_offset; /* this is the offset to use when
+							 * hot-adding a bitmap.  It should
+							 * eventually be settable by sysfs.
+							 */
+		struct mutex		mutex;
+		unsigned long		chunksize;
+		unsigned long		daemon_sleep; /* how many jiffies between updates? */
+		unsigned long		max_write_behind; /* write-behind mode */
+		int			external;
+	} bitmap_info;
+
+	atomic_t 			max_corr_read_errors; /* max read retries */
 	struct list_head		all_mddevs;
+
+	/* Generic barrier handling.
+	 * If there is a pending barrier request, all other
+	 * writes are blocked while the devices are flushed.
+	 * The last to finish a flush schedules a worker to
+	 * submit the barrier request (without the barrier flag),
+	 * then submit more flush requests.
+	 */
+	struct bio *barrier;
+	atomic_t flush_pending;
+	struct work_struct barrier_work;
 };


@@ -353,7 +377,7 @@ struct md_sysfs_entry {
 	ssize_t (*show)(mddev_t *, char *);
 	ssize_t (*store)(mddev_t *, const char *, size_t);
 };
-
+extern struct attribute_group md_bitmap_group;

 static inline char * mdname (mddev_t * mddev)
 {
@@ -431,6 +455,7 @@ extern void md_done_sync(mddev_t *mddev, int blocks, int ok);
 extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev);

 extern int mddev_congested(mddev_t *mddev, int bits);
+extern void md_barrier_request(mddev_t *mddev, struct bio *bio);
 extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
 			   sector_t sector, int size, struct page *page);
 extern void md_super_wait(mddev_t *mddev);
@@ -443,6 +468,8 @@ extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev);
 extern void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors);
 extern int md_check_no_bitmap(mddev_t *mddev);
 extern int md_integrity_register(mddev_t *mddev);
-void md_integrity_add_rdev(mdk_rdev_t *rdev, mddev_t *mddev);
+extern void md_integrity_add_rdev(mdk_rdev_t *rdev, mddev_t *mddev);
+extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale);
+extern void restore_bitmap_write_access(struct file *file);

 #endif /* _MD_MD_H */
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -145,7 +145,7 @@ static int multipath_make_request (struct request_queue *q, struct bio * bio)
 	int cpu;

 	if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
-		bio_endio(bio, -EOPNOTSUPP);
+		md_barrier_request(mddev, bio);
 		return 0;
 	}

@@ -581,6 +581,7 @@ static void __exit multipath_exit (void)
 module_init(multipath_init);
 module_exit(multipath_exit);
 MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("simple multi-path personality for MD");
 MODULE_ALIAS("md-personality-7"); /* MULTIPATH */
 MODULE_ALIAS("md-multipath");
 MODULE_ALIAS("md-level--4");
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -453,7 +453,7 @@ static int raid0_make_request(struct request_queue *q, struct bio *bio)
 	int cpu;

 	if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
-		bio_endio(bio, -EOPNOTSUPP);
+		md_barrier_request(mddev, bio);
 		return 0;
 	}

@@ -567,6 +567,7 @@ static void raid0_exit (void)
 module_init(raid0_init);
 module_exit(raid0_exit);
 MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("RAID0 (striping) personality for MD");
 MODULE_ALIAS("md-personality-2"); /* RAID0 */
 MODULE_ALIAS("md-raid0");
 MODULE_ALIAS("md-level-0");
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -677,6 +677,7 @@ static void raise_barrier(conf_t *conf)
 static void lower_barrier(conf_t *conf)
 {
 	unsigned long flags;
+	BUG_ON(conf->barrier <= 0);
 	spin_lock_irqsave(&conf->resync_lock, flags);
 	conf->barrier--;
 	spin_unlock_irqrestore(&conf->resync_lock, flags);
@@ -801,6 +802,25 @@ static int make_request(struct request_queue *q, struct bio * bio)

 	md_write_start(mddev, bio); /* wait on superblock update early */

+	if (bio_data_dir(bio) == WRITE &&
+	    bio->bi_sector + bio->bi_size/512 > mddev->suspend_lo &&
+	    bio->bi_sector < mddev->suspend_hi) {
+		/* As the suspend_* range is controlled by
+		 * userspace, we want an interruptible
+		 * wait.
+		 */
+		DEFINE_WAIT(w);
+		for (;;) {
+			flush_signals(current);
+			prepare_to_wait(&conf->wait_barrier,
+					&w, TASK_INTERRUPTIBLE);
+			if (bio->bi_sector + bio->bi_size/512 <= mddev->suspend_lo ||
+			    bio->bi_sector >= mddev->suspend_hi)
+				break;
+			schedule();
+		}
+		finish_wait(&conf->wait_barrier, &w);
+	}
 	if (unlikely(!mddev->barriers_work &&
 		     bio_rw_flagged(bio, BIO_RW_BARRIER))) {
 		if (rw == WRITE)
@@ -923,7 +943,8 @@ static int make_request(struct request_queue *q, struct bio * bio)

 	/* do behind I/O ? */
 	if (bitmap &&
-	    atomic_read(&bitmap->behind_writes) < bitmap->max_write_behind &&
+	    (atomic_read(&bitmap->behind_writes)
+	     < mddev->bitmap_info.max_write_behind) &&
 	    (behind_pages = alloc_behind_pages(bio)) != NULL)
 		set_bit(R1BIO_BehindIO, &r1_bio->state);

@@ -1941,74 +1962,48 @@ static sector_t raid1_size(mddev_t *mddev, sector_t sectors, int raid_disks)
 	return mddev->dev_sectors;
 }

-static int run(mddev_t *mddev)
+static conf_t *setup_conf(mddev_t *mddev)
 {
 	conf_t *conf;
-	int i, j, disk_idx;
+	int i;
 	mirror_info_t *disk;
 	mdk_rdev_t *rdev;
+	int err = -ENOMEM;

-	if (mddev->level != 1) {
-		printk("raid1: %s: raid level not set to mirroring (%d)\n",
-		       mdname(mddev), mddev->level);
-		goto out;
-	}
-	if (mddev->reshape_position != MaxSector) {
-		printk("raid1: %s: reshape_position set but not supported\n",
-		       mdname(mddev));
-		goto out;
-	}
-	/*
-	 * copy the already verified devices into our private RAID1
-	 * bookkeeping area. [whatever we allocate in run(),
-	 * should be freed in stop()]
-	 */
 	conf = kzalloc(sizeof(conf_t), GFP_KERNEL);
-	mddev->private = conf;
 	if (!conf)
-		goto out_no_mem;
+		goto abort;

 	conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks,
 				 GFP_KERNEL);
 	if (!conf->mirrors)
-		goto out_no_mem;
+		goto abort;

 	conf->tmppage = alloc_page(GFP_KERNEL);
 	if (!conf->tmppage)
-		goto out_no_mem;
+		goto abort;

-	conf->poolinfo = kmalloc(sizeof(*conf->poolinfo), GFP_KERNEL);
+	conf->poolinfo = kzalloc(sizeof(*conf->poolinfo), GFP_KERNEL);
 	if (!conf->poolinfo)
-		goto out_no_mem;
-	conf->poolinfo->mddev = NULL;
+		goto abort;
 	conf->poolinfo->raid_disks = mddev->raid_disks;
 	conf->r1bio_pool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc,
 					  r1bio_pool_free,
 					  conf->poolinfo);
 	if (!conf->r1bio_pool)
-		goto out_no_mem;
+		goto abort;
+
 	conf->poolinfo->mddev = mddev;

 	spin_lock_init(&conf->device_lock);
-	mddev->queue->queue_lock = &conf->device_lock;
-
 	list_for_each_entry(rdev, &mddev->disks, same_set) {
-		disk_idx = rdev->raid_disk;
+		int disk_idx = rdev->raid_disk;
 		if (disk_idx >= mddev->raid_disks
 		    || disk_idx < 0)
 			continue;
 		disk = conf->mirrors + disk_idx;

 		disk->rdev = rdev;
-		disk_stack_limits(mddev->gendisk, rdev->bdev,
-				  rdev->data_offset << 9);
-		/* as we don't honour merge_bvec_fn, we must never risk
-		 * violating it, so limit ->max_sector to one PAGE, as
-		 * a one page request is never in violation.
-		 */
-		if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
-		    queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9))
-			blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);

 		disk->head_position = 0;
 	}
@@ -2022,8 +2017,7 @@ static int run(mddev_t *mddev)
 	bio_list_init(&conf->pending_bio_list);
 	bio_list_init(&conf->flushing_bio_list);

-
-	mddev->degraded = 0;
+	conf->last_used = -1;
 	for (i = 0; i < conf->raid_disks; i++) {

 		disk = conf->mirrors + i;
@@ -2031,38 +2025,97 @@ static int run(mddev_t *mddev)
 		if (!disk->rdev ||
 		    !test_bit(In_sync, &disk->rdev->flags)) {
 			disk->head_position = 0;
-			mddev->degraded++;
 			if (disk->rdev)
 				conf->fullsync = 1;
-		}
+		} else if (conf->last_used < 0)
+			/*
+			 * The first working device is used as a
+			 * starting point to read balancing.
+			 */
+			conf->last_used = i;
 	}
-	if (mddev->degraded == conf->raid_disks) {
+
+	err = -EIO;
+	if (conf->last_used < 0) {
 		printk(KERN_ERR "raid1: no operational mirrors for %s\n",
-			mdname(mddev));
-		goto out_free_conf;
+		       mdname(mddev));
+		goto abort;
 	}
-	if (conf->raid_disks - mddev->degraded == 1)
-		mddev->recovery_cp = MaxSector;
+	err = -ENOMEM;
+	conf->thread = md_register_thread(raid1d, mddev, NULL);
+	if (!conf->thread) {
+		printk(KERN_ERR
+		       "raid1: couldn't allocate thread for %s\n",
+		       mdname(mddev));
+		goto abort;
+	}
+
+	return conf;
+
+ abort:
+	if (conf) {
+		if (conf->r1bio_pool)
+			mempool_destroy(conf->r1bio_pool);
+		kfree(conf->mirrors);
+		safe_put_page(conf->tmppage);
+		kfree(conf->poolinfo);
+		kfree(conf);
+	}
+	return ERR_PTR(err);
+}

+static int run(mddev_t *mddev)
+{
+	conf_t *conf;
+	int i;
+	mdk_rdev_t *rdev;
+
+	if (mddev->level != 1) {
+		printk("raid1: %s: raid level not set to mirroring (%d)\n",
+		       mdname(mddev), mddev->level);
+		return -EIO;
+	}
+	if (mddev->reshape_position != MaxSector) {
+		printk("raid1: %s: reshape_position set but not supported\n",
+		       mdname(mddev));
+		return -EIO;
+	}
 	/*
-	 * find the first working one and use it as a starting point
-	 * to read balancing.
+	 * copy the already verified devices into our private RAID1
+	 * bookkeeping area. [whatever we allocate in run(),
+	 * should be freed in stop()]
 	 */
-	for (j = 0; j < conf->raid_disks &&
-		     (!conf->mirrors[j].rdev ||
-		      !test_bit(In_sync, &conf->mirrors[j].rdev->flags)) ; j++)
-		/* nothing */;
-	conf->last_used = j;
+	if (mddev->private == NULL)
+		conf = setup_conf(mddev);
+	else
+		conf = mddev->private;

+	if (IS_ERR(conf))
+		return PTR_ERR(conf);

-	mddev->thread = md_register_thread(raid1d, mddev, NULL);
-	if (!mddev->thread) {
-		printk(KERN_ERR
-		       "raid1: couldn't allocate thread for %s\n",
-		       mdname(mddev));
-		goto out_free_conf;
+	mddev->queue->queue_lock = &conf->device_lock;
+	list_for_each_entry(rdev, &mddev->disks, same_set) {
+		disk_stack_limits(mddev->gendisk, rdev->bdev,
+				  rdev->data_offset << 9);
+		/* as we don't honour merge_bvec_fn, we must never risk
+		 * violating it, so limit ->max_sector to one PAGE, as
+		 * a one page request is never in violation.
+		 */
+		if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
+		    queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9))
+			blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
 	}

+	mddev->degraded = 0;
+	for (i=0; i < conf->raid_disks; i++)
+		if (conf->mirrors[i].rdev == NULL ||
+		    !test_bit(In_sync, &conf->mirrors[i].rdev->flags) ||
+		    test_bit(Faulty, &conf->mirrors[i].rdev->flags))
+			mddev->degraded++;
+
+	if (conf->raid_disks - mddev->degraded == 1)
+		mddev->recovery_cp = MaxSector;
+
 	if (mddev->recovery_cp != MaxSector)
 		printk(KERN_NOTICE "raid1: %s is not clean"
 		       " -- starting background reconstruction\n",
@@ -2071,9 +2124,14 @@ static int run(mddev_t *mddev)
 		"raid1: raid set %s active with %d out of %d mirrors\n",
 		mdname(mddev), mddev->raid_disks - mddev->degraded, 
 		mddev->raid_disks);
+
 	/*
 	 * Ok, everything is just fine now
 	 */
+	mddev->thread = conf->thread;
+	conf->thread = NULL;
+	mddev->private = conf;
+
 	md_set_array_sectors(mddev, raid1_size(mddev, 0, 0));

 	mddev->queue->unplug_fn = raid1_unplug;
@@ -2081,23 +2139,6 @@ static int run(mddev_t *mddev)
 	mddev->queue->backing_dev_info.congested_data = mddev;
 	md_integrity_register(mddev);
 	return 0;
-
-out_no_mem:
-	printk(KERN_ERR "raid1: couldn't allocate memory for %s\n",
-	       mdname(mddev));
-
-out_free_conf:
-	if (conf) {
-		if (conf->r1bio_pool)
-			mempool_destroy(conf->r1bio_pool);
-		kfree(conf->mirrors);
-		safe_put_page(conf->tmppage);
-		kfree(conf->poolinfo);
-		kfree(conf);
-		mddev->private = NULL;
-	}
-out:
-	return -EIO;
 }

 static int stop(mddev_t *mddev)
@@ -2271,6 +2312,9 @@ static void raid1_quiesce(mddev_t *mddev, int state)
 	conf_t *conf = mddev->private;

 	switch(state) {
+	case 2: /* wake for suspend */
+		wake_up(&conf->wait_barrier);
+		break;
 	case 1:
 		raise_barrier(conf);
 		break;
@@ -2280,6 +2324,23 @@ static void raid1_quiesce(mddev_t *mddev, int state)
 	}
 }

+static void *raid1_takeover(mddev_t *mddev)
+{
+	/* raid1 can take over:
+	 *  raid5 with 2 devices, any layout or chunk size
+	 */
+	if (mddev->level == 5 && mddev->raid_disks == 2) {
+		conf_t *conf;
+		mddev->new_level = 1;
+		mddev->new_layout = 0;
+		mddev->new_chunk_sectors = 0;
+		conf = setup_conf(mddev);
+		if (!IS_ERR(conf))
+			conf->barrier = 1;
+		return conf;
+	}
+	return ERR_PTR(-EINVAL);
+}

 static struct mdk_personality raid1_personality =
 {
@@ -2299,6 +2360,7 @@ static struct mdk_personality raid1_personality =
 	.size		= raid1_size,
 	.check_reshape	= raid1_reshape,
 	.quiesce	= raid1_quiesce,
+	.takeover	= raid1_takeover,
 };

 static int __init raid_init(void)
@@ -2314,6 +2376,7 @@ static void raid_exit(void)
 module_init(raid_init);
 module_exit(raid_exit);
 MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("RAID1 (mirroring) personality for MD");
 MODULE_ALIAS("md-personality-3"); /* RAID1 */
 MODULE_ALIAS("md-raid1");
 MODULE_ALIAS("md-level-1");
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -59,6 +59,11 @@ struct r1_private_data_s {

 	mempool_t *r1bio_pool;
 	mempool_t *r1buf_pool;
+
+	/* When taking over an array from a different personality, we store
+	 * the new thread here until we fully activate the array.
+	 */
+	struct mdk_thread_s	*thread;
 };

 typedef struct r1_private_data_s conf_t;

--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -804,7 +804,7 @@ static int make_request(struct request_queue *q, struct bio * bio)
 	mdk_rdev_t *blocked_rdev;

 	if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
-		bio_endio(bio, -EOPNOTSUPP);
+		md_barrier_request(mddev, bio);
 		return 0;
 	}

@@ -1431,6 +1431,43 @@ static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio)
 }


+/*
+ * Used by fix_read_error() to decay the per rdev read_errors.
+ * We halve the read error count for every hour that has elapsed
+ * since the last recorded read error.
+ *
+ */
+static void check_decay_read_errors(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+	struct timespec cur_time_mon;
+	unsigned long hours_since_last;
+	unsigned int read_errors = atomic_read(&rdev->read_errors);
+
+	ktime_get_ts(&cur_time_mon);
+
+	if (rdev->last_read_error.tv_sec == 0 &&
+	    rdev->last_read_error.tv_nsec == 0) {
+		/* first time we've seen a read error */
+		rdev->last_read_error = cur_time_mon;
+		return;
+	}
+
+	hours_since_last = (cur_time_mon.tv_sec -
+			    rdev->last_read_error.tv_sec) / 3600;
+
+	rdev->last_read_error = cur_time_mon;
+
+	/*
+	 * if hours_since_last is >= the number of bits in read_errors
+	 * just set read errors to 0. We do this to avoid
+	 * overflowing the shift of read_errors by hours_since_last.
+	 */
+	if (hours_since_last >= 8 * sizeof(read_errors))
+		atomic_set(&rdev->read_errors, 0);
+	else
+		atomic_set(&rdev->read_errors, read_errors >> hours_since_last);
+}
+
 /*
 * This is a kernel thread which:
 *
@@ -1444,6 +1481,43 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
 	int sect = 0; /* Offset from r10_bio->sector */
 	int sectors = r10_bio->sectors;
 	mdk_rdev_t*rdev;
+	int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
+
+	rcu_read_lock();
+	{
+		int d = r10_bio->devs[r10_bio->read_slot].devnum;
+		char b[BDEVNAME_SIZE];
+		int cur_read_error_count = 0;
+
+		rdev = rcu_dereference(conf->mirrors[d].rdev);
+		bdevname(rdev->bdev, b);
+
+		if (test_bit(Faulty, &rdev->flags)) {
+			rcu_read_unlock();
+			/* drive has already been failed, just ignore any
+			   more fix_read_error() attempts */
+			return;
+		}
+
+		check_decay_read_errors(mddev, rdev);
+		atomic_inc(&rdev->read_errors);
+		cur_read_error_count = atomic_read(&rdev->read_errors);
+		if (cur_read_error_count > max_read_errors) {
+			rcu_read_unlock();
+			printk(KERN_NOTICE
+			       "raid10: %s: Raid device exceeded "
+			       "read_error threshold "
+			       "[cur %d:max %d]\n",
+			       b, cur_read_error_count, max_read_errors);
+			printk(KERN_NOTICE
+			       "raid10: %s: Failing raid "
+			       "device\n", b);
+			md_error(mddev, conf->mirrors[d].rdev);
+			return;
+		}
+	}
+	rcu_read_unlock();
+
 	while(sectors) {
 		int s = sectors;
 		int sl = r10_bio->read_slot;
@@ -1488,6 +1562,7 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
 		/* write it back and re-read */
 		rcu_read_lock();
 		while (sl != r10_bio->read_slot) {
+			char b[BDEVNAME_SIZE];
 			int d;
 			if (sl==0)
 				sl = conf->copies;
@@ -1503,9 +1578,21 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
 						 r10_bio->devs[sl].addr +
 						 sect + rdev->data_offset,
 						 s<<9, conf->tmppage, WRITE)
-				    == 0)
+				    == 0) {
 					/* Well, this device is dead */
+					printk(KERN_NOTICE
+					       "raid10:%s: read correction "
+					       "write failed"
+					       " (%d sectors at %llu on %s)\n",
+					       mdname(mddev), s,
+					       (unsigned long long)(sect+
+					       rdev->data_offset),
+					       bdevname(rdev->bdev, b));
+					printk(KERN_NOTICE "raid10:%s: failing "
+					       "drive\n",
+					       bdevname(rdev->bdev, b));
 					md_error(mddev, rdev);
+				}
 				rdev_dec_pending(rdev, mddev);
 				rcu_read_lock();
 			}
@@ -1526,10 +1613,22 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
 				if (sync_page_io(rdev->bdev,
 						 r10_bio->devs[sl].addr +
 						 sect + rdev->data_offset,
-						 s<<9, conf->tmppage, READ) == 0)
+						 s<<9, conf->tmppage,
+						 READ) == 0) {
 					/* Well, this device is dead */
+					printk(KERN_NOTICE
+					       "raid10:%s: unable to read back "
+					       "corrected sectors"
+					       " (%d sectors at %llu on %s)\n",
+					       mdname(mddev), s,
+					       (unsigned long long)(sect+
+						    rdev->data_offset),
+					       bdevname(rdev->bdev, b));
+					printk(KERN_NOTICE "raid10:%s: failing drive\n",
+					       bdevname(rdev->bdev, b));
+
 					md_error(mddev, rdev);
-				else
+				} else {
 					printk(KERN_INFO
 					       "raid10:%s: read error corrected"
 					       " (%d sectors at %llu on %s)\n",
@@ -1537,6 +1636,7 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
 					       (unsigned long long)(sect+
 					            rdev->data_offset),
 					       bdevname(rdev->bdev, b));
+				}

 				rdev_dec_pending(rdev, mddev);
 				rcu_read_lock();
@@ -2275,13 +2375,6 @@ static void raid10_quiesce(mddev_t *mddev, int state)
 		lower_barrier(conf);
 		break;
 	}
-	if (mddev->thread) {
-		if (mddev->bitmap)
-			mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ;
-		else
-			mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
-		md_wakeup_thread(mddev->thread);
-	}
 }

 static struct mdk_personality raid10_personality =
@@ -2315,6 +2408,7 @@ static void raid_exit(void)
 module_init(raid_init);
 module_exit(raid_exit);
 MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("RAID10 (striped mirror) personality for MD");
 MODULE_ALIAS("md-personality-9"); /* RAID10 */
 MODULE_ALIAS("md-raid10");
 MODULE_ALIAS("md-level-10");
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2947,6 +2947,7 @@ static void handle_stripe5(struct stripe_head *sh)
 	struct r5dev *dev;
 	mdk_rdev_t *blocked_rdev = NULL;
 	int prexor;
+	int dec_preread_active = 0;

 	memset(&s, 0, sizeof(s));
 	pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d check:%d "
@@ -3096,12 +3097,8 @@ static void handle_stripe5(struct stripe_head *sh)
 					set_bit(STRIPE_INSYNC, &sh->state);
 			}
 		}
-		if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
-			atomic_dec(&conf->preread_active_stripes);
-			if (atomic_read(&conf->preread_active_stripes) <
-				IO_THRESHOLD)
-				md_wakeup_thread(conf->mddev->thread);
-		}
+		if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+			dec_preread_active = 1;
 	}

 	/* Now to consider new write requests and what else, if anything
@@ -3208,6 +3205,16 @@ static void handle_stripe5(struct stripe_head *sh)

 	ops_run_io(sh, &s);

+	if (dec_preread_active) {
+		/* We delay this until after ops_run_io so that if make_request
+		 * is waiting on a barrier, it won't continue until the writes
+		 * have actually been submitted.
+		 */
+		atomic_dec(&conf->preread_active_stripes);
+		if (atomic_read(&conf->preread_active_stripes) <
+		    IO_THRESHOLD)
+			md_wakeup_thread(conf->mddev->thread);
+	}
 	return_io(return_bi);
 }

@@ -3221,6 +3228,7 @@ static void handle_stripe6(struct stripe_head *sh)
 	struct r6_state r6s;
 	struct r5dev *dev, *pdev, *qdev;
 	mdk_rdev_t *blocked_rdev = NULL;
+	int dec_preread_active = 0;

 	pr_debug("handling stripe %llu, state=%#lx cnt=%d, "
 		"pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n",
@@ -3358,7 +3366,6 @@ static void handle_stripe6(struct stripe_head *sh)
 	 * completed
 	 */
 	if (sh->reconstruct_state == reconstruct_state_drain_result) {
-		int qd_idx = sh->qd_idx;

 		sh->reconstruct_state = reconstruct_state_idle;
 		/* All the 'written' buffers and the parity blocks are ready to
@@ -3380,12 +3387,8 @@ static void handle_stripe6(struct stripe_head *sh)
 					set_bit(STRIPE_INSYNC, &sh->state);
 			}
 		}
-		if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
-			atomic_dec(&conf->preread_active_stripes);
-			if (atomic_read(&conf->preread_active_stripes) <
-				IO_THRESHOLD)
-				md_wakeup_thread(conf->mddev->thread);
-		}
+		if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+			dec_preread_active = 1;
 	}

 	/* Now to consider new write requests and what else, if anything
@@ -3494,6 +3497,18 @@ static void handle_stripe6(struct stripe_head *sh)

 	ops_run_io(sh, &s);

+
+	if (dec_preread_active) {
+		/* We delay this until after ops_run_io so that if make_request
+		 * is waiting on a barrier, it won't continue until the writes
+		 * have actually been submitted.
+		 */
+		atomic_dec(&conf->preread_active_stripes);
+		if (atomic_read(&conf->preread_active_stripes) <
+		    IO_THRESHOLD)
+			md_wakeup_thread(conf->mddev->thread);
+	}
+
 	return_io(return_bi);
 }

@@ -3741,7 +3756,7 @@ static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio)
 {
 	mddev_t *mddev = q->queuedata;
 	raid5_conf_t *conf = mddev->private;
-	unsigned int dd_idx;
+	int dd_idx;
 	struct bio* align_bi;
 	mdk_rdev_t *rdev;

@@ -3866,7 +3881,13 @@ static int make_request(struct request_queue *q, struct bio * bi)
 	int cpu, remaining;

 	if (unlikely(bio_rw_flagged(bi, BIO_RW_BARRIER))) {
-		bio_endio(bi, -EOPNOTSUPP);
+		/* Drain all pending writes.  We only really need
+		 * to ensure they have been submitted, but this is
+		 * easier.
+		 */
+		mddev->pers->quiesce(mddev, 1);
+		mddev->pers->quiesce(mddev, 0);
+		md_barrier_request(mddev, bi);
 		return 0;
 	}

@@ -3990,6 +4011,9 @@ static int make_request(struct request_queue *q, struct bio * bi)
 			finish_wait(&conf->wait_for_overlap, &w);
 			set_bit(STRIPE_HANDLE, &sh->state);
 			clear_bit(STRIPE_DELAYED, &sh->state);
+			if (mddev->barrier &&
+			    !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+				atomic_inc(&conf->preread_active_stripes);
 			release_stripe(sh);
 		} else {
 			/* cannot get stripe for read-ahead, just give-up */
@@ -4009,6 +4033,14 @@ static int make_request(struct request_queue *q, struct bio * bi)

 		bio_endio(bi, 0);
 	}
+
+	if (mddev->barrier) {
+		/* We need to wait for the stripes to all be handled.
+		 * So: wait for preread_active_stripes to drop to 0.
+		 */
+		wait_event(mddev->thread->wqueue,
+			   atomic_read(&conf->preread_active_stripes) == 0);
+	}
 	return 0;
 }

@@ -5860,6 +5892,7 @@ static void raid5_exit(void)
 module_init(raid5_init);
 module_exit(raid5_exit);
 MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("RAID4/5/6 (striping with parity) personality for MD");
 MODULE_ALIAS("md-personality-4"); /* RAID5 */
 MODULE_ALIAS("md-raid5");
 MODULE_ALIAS("md-raid4");

--- a/drivers/md/raid6algos.c
+++ b/drivers/md/raid6algos.c
@@ -31,25 +31,6 @@ EXPORT_SYMBOL(raid6_empty_zero_page);
 struct raid6_calls raid6_call;
 EXPORT_SYMBOL_GPL(raid6_call);

-/* Various routine sets */
-extern const struct raid6_calls raid6_intx1;
-extern const struct raid6_calls raid6_intx2;
-extern const struct raid6_calls raid6_intx4;
-extern const struct raid6_calls raid6_intx8;
-extern const struct raid6_calls raid6_intx16;
-extern const struct raid6_calls raid6_intx32;
-extern const struct raid6_calls raid6_mmxx1;
-extern const struct raid6_calls raid6_mmxx2;
-extern const struct raid6_calls raid6_sse1x1;
-extern const struct raid6_calls raid6_sse1x2;
-extern const struct raid6_calls raid6_sse2x1;
-extern const struct raid6_calls raid6_sse2x2;
-extern const struct raid6_calls raid6_sse2x4;
-extern const struct raid6_calls raid6_altivec1;
-extern const struct raid6_calls raid6_altivec2;
-extern const struct raid6_calls raid6_altivec4;
-extern const struct raid6_calls raid6_altivec8;
-
 const struct raid6_calls * const raid6_algos[] = {
 	&raid6_intx1,
 	&raid6_intx2,
@@ -169,3 +150,4 @@ static void raid6_exit(void)
 subsys_initcall(raid6_select_algo);
 module_exit(raid6_exit);
 MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("RAID6 Q-syndrome calculations");
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -979,24 +979,6 @@ COMPATIBLE_IOCTL(FIGETBSZ)
 /* 'X' - originally XFS but some now in the VFS */
 COMPATIBLE_IOCTL(FIFREEZE)
 COMPATIBLE_IOCTL(FITHAW)
-/* RAID */
-COMPATIBLE_IOCTL(RAID_VERSION)
-COMPATIBLE_IOCTL(GET_ARRAY_INFO)
-COMPATIBLE_IOCTL(GET_DISK_INFO)
-COMPATIBLE_IOCTL(PRINT_RAID_DEBUG)
-COMPATIBLE_IOCTL(RAID_AUTORUN)
-COMPATIBLE_IOCTL(CLEAR_ARRAY)
-COMPATIBLE_IOCTL(ADD_NEW_DISK)
-COMPATIBLE_IOCTL(SET_ARRAY_INFO)
-COMPATIBLE_IOCTL(SET_DISK_INFO)
-COMPATIBLE_IOCTL(WRITE_RAID_INFO)
-COMPATIBLE_IOCTL(UNPROTECT_ARRAY)
-COMPATIBLE_IOCTL(PROTECT_ARRAY)
-COMPATIBLE_IOCTL(RUN_ARRAY)
-COMPATIBLE_IOCTL(STOP_ARRAY)
-COMPATIBLE_IOCTL(STOP_ARRAY_RO)
-COMPATIBLE_IOCTL(RESTART_ARRAY_RW)
-COMPATIBLE_IOCTL(GET_BITMAP_FILE)
 COMPATIBLE_IOCTL(KDGETKEYCODE)
 COMPATIBLE_IOCTL(KDSETKEYCODE)
 COMPATIBLE_IOCTL(KDGKBTYPE)

--- a/include/linux/raid/pq.h
+++ b/include/linux/raid/pq.h
@@ -78,6 +78,25 @@ struct raid6_calls {
 /* Selected algorithm */
 extern struct raid6_calls raid6_call;

+/* Various routine sets: implementations that raid6_algos[] may reference */
+extern const struct raid6_calls raid6_intx1;
+extern const struct raid6_calls raid6_intx2;
+extern const struct raid6_calls raid6_intx4;
+extern const struct raid6_calls raid6_intx8;
+extern const struct raid6_calls raid6_intx16;
+extern const struct raid6_calls raid6_intx32;
+extern const struct raid6_calls raid6_mmxx1;
+extern const struct raid6_calls raid6_mmxx2;
+extern const struct raid6_calls raid6_sse1x1;
+extern const struct raid6_calls raid6_sse1x2;
+extern const struct raid6_calls raid6_sse2x1;
+extern const struct raid6_calls raid6_sse2x2;
+extern const struct raid6_calls raid6_sse2x4;
+extern const struct raid6_calls raid6_altivec1;
+extern const struct raid6_calls raid6_altivec2;
+extern const struct raid6_calls raid6_altivec4;
+extern const struct raid6_calls raid6_altivec8;
+
 /* Algorithm list */
 extern const struct raid6_calls * const raid6_algos[];
 int raid6_select_algo(void);