Commit 17999be4 authored by NeilBrown, committed by Linus Torvalds

[PATCH] md: improve raid1 "IO Barrier" concept

raid1 needs to put up a barrier to new requests while it does resync or other
background recovery.  The code for this is currently open-coded, slightly
obscure by its use of two waitqueues, and not documented.

This patch gathers all the related code into 4 functions, and includes a
comment which (hopefully) explains what is happening.
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
parent ac81b2ee
...@@ -51,6 +51,8 @@ static mdk_personality_t raid1_personality; ...@@ -51,6 +51,8 @@ static mdk_personality_t raid1_personality;
static void unplug_slaves(mddev_t *mddev); static void unplug_slaves(mddev_t *mddev);
static void allow_barrier(conf_t *conf);
static void lower_barrier(conf_t *conf);
static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data) static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data)
{ {
...@@ -160,20 +162,13 @@ static void put_all_bios(conf_t *conf, r1bio_t *r1_bio) ...@@ -160,20 +162,13 @@ static void put_all_bios(conf_t *conf, r1bio_t *r1_bio)
static inline void free_r1bio(r1bio_t *r1_bio) static inline void free_r1bio(r1bio_t *r1_bio)
{ {
unsigned long flags;
conf_t *conf = mddev_to_conf(r1_bio->mddev); conf_t *conf = mddev_to_conf(r1_bio->mddev);
/* /*
* Wake up any possible resync thread that waits for the device * Wake up any possible resync thread that waits for the device
* to go idle. * to go idle.
*/ */
spin_lock_irqsave(&conf->resync_lock, flags); allow_barrier(conf);
if (!--conf->nr_pending) {
wake_up(&conf->wait_idle);
wake_up(&conf->wait_resume);
}
spin_unlock_irqrestore(&conf->resync_lock, flags);
put_all_bios(conf, r1_bio); put_all_bios(conf, r1_bio);
mempool_free(r1_bio, conf->r1bio_pool); mempool_free(r1_bio, conf->r1bio_pool);
...@@ -182,22 +177,10 @@ static inline void free_r1bio(r1bio_t *r1_bio) ...@@ -182,22 +177,10 @@ static inline void free_r1bio(r1bio_t *r1_bio)
static inline void put_buf(r1bio_t *r1_bio) static inline void put_buf(r1bio_t *r1_bio)
{ {
conf_t *conf = mddev_to_conf(r1_bio->mddev); conf_t *conf = mddev_to_conf(r1_bio->mddev);
unsigned long flags;
mempool_free(r1_bio, conf->r1buf_pool); mempool_free(r1_bio, conf->r1buf_pool);
spin_lock_irqsave(&conf->resync_lock, flags); lower_barrier(conf);
if (!conf->barrier)
BUG();
--conf->barrier;
wake_up(&conf->wait_resume);
wake_up(&conf->wait_idle);
if (!--conf->nr_pending) {
wake_up(&conf->wait_idle);
wake_up(&conf->wait_resume);
}
spin_unlock_irqrestore(&conf->resync_lock, flags);
} }
static void reschedule_retry(r1bio_t *r1_bio) static void reschedule_retry(r1bio_t *r1_bio)
...@@ -210,6 +193,7 @@ static void reschedule_retry(r1bio_t *r1_bio) ...@@ -210,6 +193,7 @@ static void reschedule_retry(r1bio_t *r1_bio)
list_add(&r1_bio->retry_list, &conf->retry_list); list_add(&r1_bio->retry_list, &conf->retry_list);
spin_unlock_irqrestore(&conf->device_lock, flags); spin_unlock_irqrestore(&conf->device_lock, flags);
wake_up(&conf->wait_barrier);
md_wakeup_thread(mddev->thread); md_wakeup_thread(mddev->thread);
} }
...@@ -593,30 +577,83 @@ static int raid1_issue_flush(request_queue_t *q, struct gendisk *disk, ...@@ -593,30 +577,83 @@ static int raid1_issue_flush(request_queue_t *q, struct gendisk *disk,
return ret; return ret;
} }
/* /* Barriers....
* Throttle resync depth, so that we can both get proper overlapping of * Sometimes we need to suspend IO while we do something else,
* requests, but are still able to handle normal requests quickly. * either some resync/recovery, or reconfigure the array.
* To do this we raise a 'barrier'.
* The 'barrier' is a counter that can be raised multiple times
* to count how many activities are happening which preclude
* normal IO.
* We can only raise the barrier if there is no pending IO.
* i.e. if nr_pending == 0.
* We choose only to raise the barrier if no-one is waiting for the
* barrier to go down. This means that as soon as an IO request
* is ready, no other operations which require a barrier will start
* until the IO request has had a chance.
*
* So: regular IO calls 'wait_barrier'. When that returns there
* is no background IO happening. It must arrange to call
* allow_barrier when it has finished its IO.
* Background IO must call raise_barrier. Once that returns
* there is no normal IO happening. It must arrange to call
* lower_barrier when the particular background IO completes.
*/ */
#define RESYNC_DEPTH 32 #define RESYNC_DEPTH 32
static void device_barrier(conf_t *conf, sector_t sect) static void raise_barrier(conf_t *conf)
{ {
spin_lock_irq(&conf->resync_lock); spin_lock_irq(&conf->resync_lock);
wait_event_lock_irq(conf->wait_idle, !waitqueue_active(&conf->wait_resume),
conf->resync_lock, raid1_unplug(conf->mddev->queue)); /* Wait until no block IO is waiting */
wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting,
if (!conf->barrier++) { conf->resync_lock,
wait_event_lock_irq(conf->wait_idle, !conf->nr_pending, raid1_unplug(conf->mddev->queue));
conf->resync_lock, raid1_unplug(conf->mddev->queue));
if (conf->nr_pending) /* block any new IO from starting */
BUG(); conf->barrier++;
/* Now wait for all pending IO to complete */
wait_event_lock_irq(conf->wait_barrier,
!conf->nr_pending && conf->barrier < RESYNC_DEPTH,
conf->resync_lock,
raid1_unplug(conf->mddev->queue));
spin_unlock_irq(&conf->resync_lock);
}
static void lower_barrier(conf_t *conf)
{
unsigned long flags;
spin_lock_irqsave(&conf->resync_lock, flags);
conf->barrier--;
spin_unlock_irqrestore(&conf->resync_lock, flags);
wake_up(&conf->wait_barrier);
}
static void wait_barrier(conf_t *conf)
{
spin_lock_irq(&conf->resync_lock);
if (conf->barrier) {
conf->nr_waiting++;
wait_event_lock_irq(conf->wait_barrier, !conf->barrier,
conf->resync_lock,
raid1_unplug(conf->mddev->queue));
conf->nr_waiting--;
} }
wait_event_lock_irq(conf->wait_resume, conf->barrier < RESYNC_DEPTH, conf->nr_pending++;
conf->resync_lock, raid1_unplug(conf->mddev->queue));
conf->next_resync = sect;
spin_unlock_irq(&conf->resync_lock); spin_unlock_irq(&conf->resync_lock);
} }
static void allow_barrier(conf_t *conf)
{
unsigned long flags;
spin_lock_irqsave(&conf->resync_lock, flags);
conf->nr_pending--;
spin_unlock_irqrestore(&conf->resync_lock, flags);
wake_up(&conf->wait_barrier);
}
/* duplicate the data pages for behind I/O */ /* duplicate the data pages for behind I/O */
static struct page **alloc_behind_pages(struct bio *bio) static struct page **alloc_behind_pages(struct bio *bio)
{ {
...@@ -678,10 +715,7 @@ static int make_request(request_queue_t *q, struct bio * bio) ...@@ -678,10 +715,7 @@ static int make_request(request_queue_t *q, struct bio * bio)
*/ */
md_write_start(mddev, bio); /* wait on superblock update early */ md_write_start(mddev, bio); /* wait on superblock update early */
spin_lock_irq(&conf->resync_lock); wait_barrier(conf);
wait_event_lock_irq(conf->wait_resume, !conf->barrier, conf->resync_lock, );
conf->nr_pending++;
spin_unlock_irq(&conf->resync_lock);
disk_stat_inc(mddev->gendisk, ios[rw]); disk_stat_inc(mddev->gendisk, ios[rw]);
disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bio)); disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bio));
...@@ -909,13 +943,8 @@ static void print_conf(conf_t *conf) ...@@ -909,13 +943,8 @@ static void print_conf(conf_t *conf)
static void close_sync(conf_t *conf) static void close_sync(conf_t *conf)
{ {
spin_lock_irq(&conf->resync_lock); wait_barrier(conf);
wait_event_lock_irq(conf->wait_resume, !conf->barrier, allow_barrier(conf);
conf->resync_lock, raid1_unplug(conf->mddev->queue));
spin_unlock_irq(&conf->resync_lock);
if (conf->barrier) BUG();
if (waitqueue_active(&conf->wait_idle)) BUG();
mempool_destroy(conf->r1buf_pool); mempool_destroy(conf->r1buf_pool);
conf->r1buf_pool = NULL; conf->r1buf_pool = NULL;
...@@ -1317,12 +1346,16 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i ...@@ -1317,12 +1346,16 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
return sync_blocks; return sync_blocks;
} }
/* /*
* If there is non-resync activity waiting for us then * If there is non-resync activity waiting for a turn,
* put in a delay to throttle resync. * and resync is going fast enough,
* then let it through before starting on this new sync request.
*/ */
if (!go_faster && waitqueue_active(&conf->wait_resume)) if (!go_faster && conf->nr_waiting)
msleep_interruptible(1000); msleep_interruptible(1000);
device_barrier(conf, sector_nr + RESYNC_SECTORS);
raise_barrier(conf);
conf->next_resync = sector_nr;
/* /*
* If reconstructing, and >1 working disc, * If reconstructing, and >1 working disc,
...@@ -1355,10 +1388,6 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i ...@@ -1355,10 +1388,6 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO); r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO);
spin_lock_irq(&conf->resync_lock);
conf->nr_pending++;
spin_unlock_irq(&conf->resync_lock);
r1_bio->mddev = mddev; r1_bio->mddev = mddev;
r1_bio->sector = sector_nr; r1_bio->sector = sector_nr;
r1_bio->state = 0; r1_bio->state = 0;
...@@ -1542,8 +1571,7 @@ static int run(mddev_t *mddev) ...@@ -1542,8 +1571,7 @@ static int run(mddev_t *mddev)
mddev->recovery_cp = MaxSector; mddev->recovery_cp = MaxSector;
spin_lock_init(&conf->resync_lock); spin_lock_init(&conf->resync_lock);
init_waitqueue_head(&conf->wait_idle); init_waitqueue_head(&conf->wait_barrier);
init_waitqueue_head(&conf->wait_resume);
bio_list_init(&conf->pending_bio_list); bio_list_init(&conf->pending_bio_list);
bio_list_init(&conf->flushing_bio_list); bio_list_init(&conf->flushing_bio_list);
...@@ -1714,11 +1742,7 @@ static int raid1_reshape(mddev_t *mddev, int raid_disks) ...@@ -1714,11 +1742,7 @@ static int raid1_reshape(mddev_t *mddev, int raid_disks)
} }
memset(newmirrors, 0, sizeof(struct mirror_info)*raid_disks); memset(newmirrors, 0, sizeof(struct mirror_info)*raid_disks);
spin_lock_irq(&conf->resync_lock); raise_barrier(conf);
conf->barrier++;
wait_event_lock_irq(conf->wait_idle, !conf->nr_pending,
conf->resync_lock, raid1_unplug(mddev->queue));
spin_unlock_irq(&conf->resync_lock);
/* ok, everything is stopped */ /* ok, everything is stopped */
oldpool = conf->r1bio_pool; oldpool = conf->r1bio_pool;
...@@ -1738,12 +1762,7 @@ static int raid1_reshape(mddev_t *mddev, int raid_disks) ...@@ -1738,12 +1762,7 @@ static int raid1_reshape(mddev_t *mddev, int raid_disks)
conf->raid_disks = mddev->raid_disks = raid_disks; conf->raid_disks = mddev->raid_disks = raid_disks;
conf->last_used = 0; /* just make sure it is in-range */ conf->last_used = 0; /* just make sure it is in-range */
spin_lock_irq(&conf->resync_lock); lower_barrier(conf);
conf->barrier--;
spin_unlock_irq(&conf->resync_lock);
wake_up(&conf->wait_resume);
wake_up(&conf->wait_idle);
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread); md_wakeup_thread(mddev->thread);
...@@ -1758,18 +1777,10 @@ static void raid1_quiesce(mddev_t *mddev, int state) ...@@ -1758,18 +1777,10 @@ static void raid1_quiesce(mddev_t *mddev, int state)
switch(state) { switch(state) {
case 1: case 1:
spin_lock_irq(&conf->resync_lock); raise_barrier(conf);
conf->barrier++;
wait_event_lock_irq(conf->wait_idle, !conf->nr_pending,
conf->resync_lock, raid1_unplug(mddev->queue));
spin_unlock_irq(&conf->resync_lock);
break; break;
case 0: case 0:
spin_lock_irq(&conf->resync_lock); lower_barrier(conf);
conf->barrier--;
spin_unlock_irq(&conf->resync_lock);
wake_up(&conf->wait_resume);
wake_up(&conf->wait_idle);
break; break;
} }
if (mddev->thread) { if (mddev->thread) {
......
...@@ -45,6 +45,7 @@ struct r1_private_data_s { ...@@ -45,6 +45,7 @@ struct r1_private_data_s {
spinlock_t resync_lock; spinlock_t resync_lock;
int nr_pending; int nr_pending;
int nr_waiting;
int barrier; int barrier;
sector_t next_resync; sector_t next_resync;
int fullsync; /* set to 1 if a full sync is needed, int fullsync; /* set to 1 if a full sync is needed,
...@@ -52,8 +53,7 @@ struct r1_private_data_s { ...@@ -52,8 +53,7 @@ struct r1_private_data_s {
* Cleared when a sync completes. * Cleared when a sync completes.
*/ */
wait_queue_head_t wait_idle; wait_queue_head_t wait_barrier;
wait_queue_head_t wait_resume;
struct pool_info *poolinfo; struct pool_info *poolinfo;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment