Commit 80af89ca authored by Andrew Morton, committed by Linus Torvalds

[PATCH] block batching fairness

From: Nick Piggin <piggin@cyberone.com.au>

This patch fixes the request batching fairness/starvation issue.  It's not
clear what is going on with 2.4, but it seems there is a problem around this
area.

Anyway, previously:

	* request queue fills up
	* process 1 calls get_request, sleeps
	* a couple of requests are freed
	* process 2 calls get_request, proceeds
	* a couple of requests are freed
	* process 2 calls get_request...

Now, as unlikely as it seems, it could be a problem.  It is a fairness problem
either way: process 2 can keep skipping ahead of process 1.

With the patch:

	* request queue fills up
	* any process calling get_request will sleep
	* once the queue gets below the batch watermark, processes
	  start being woken, and may allocate.


This patch includes Chris Mason's fix to only clear queue_full when all tasks
have been woken.  Without that fix, I think starvation and unfairness could
still occur.

With this change to the blk-fair-batches patch, Chris is showing much improved
numbers for 2.4 - a 170ms max wait versus 2700ms without blk-fair-batches for a
dbench 90 run.  He didn't indicate how much difference his fix alone made, but
I think it is an important one.
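
The policy described above can be illustrated with a small user-space sketch.
This is not the kernel code: it models one request direction only, stands a
plain counter in for the waitqueue, and the names (struct pool,
batch_watermark, and so on) are invented for the illustration; the real logic
lives in get_request(), get_request_wait() and __blk_put_request() in the diff
below.

/*
 * Minimal sketch of the batching policy, for illustration only.
 * One direction, waiters modelled as a plain counter.
 */
#include <stdbool.h>
#include <stdio.h>

#define NR_REQUESTS 128

struct pool {
	int count;	/* requests currently allocated */
	int sleepers;	/* tasks blocked in get_request_wait() */
	bool full;	/* models QUEUE_FLAG_READFULL/WRITEFULL */
};

/* mirrors batch_requests(): the watermark the pool must drain down to */
static int batch_watermark(void)
{
	int eighth = NR_REQUESTS / 8;

	return NR_REQUESTS - (eighth < 8 ? eighth : 8) - 1;
}

/* models get_request(); 'force' is set by a task that has already slept */
static bool get_request(struct pool *p, bool force)
{
	if (p->count == NR_REQUESTS)
		p->full = true;
	if (p->full && !force)
		return false;	/* the caller would go to sleep */
	p->count++;
	return true;
}

/* models the request-freeing path in __blk_put_request() */
static void put_request(struct pool *p)
{
	p->count--;
	if (p->count <= batch_watermark()) {
		if (p->sleepers > 0) {
			/* wake_up(): the woken task retries with force set */
			p->sleepers--;
			get_request(p, true);
		} else {
			/* Chris Mason's fix: clear 'full' only once
			 * nobody is left waiting */
			p->full = false;
		}
	}
}

int main(void)
{
	struct pool p = { .count = NR_REQUESTS, .sleepers = 2, .full = true };
	int drained;

	/* a latecomer cannot jump the queue while the pool is marked full */
	printf("pool full, latecomer allocates: %d\n", get_request(&p, false));

	/* drain; sleepers are served first, then 'full' is cleared */
	while (p.sleepers > 0 || p.full)
		put_request(&p);

	drained = p.count;
	printf("drained to %d, latecomer allocates: %d\n",
	       drained, get_request(&p, false));
	return 0;
}

The point to notice is the hysteresis: the full flag is latched the moment the
last request is taken, and it is only dropped again once the pool has drained
to the batch watermark and every sleeper has been handed a request, so a
latecomer cannot overtake a task that is already waiting.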
parent f67198fb
@@ -53,7 +53,7 @@ unsigned long blk_max_low_pfn, blk_max_pfn;
 static inline int batch_requests(struct request_queue *q)
 {
-	return q->nr_requests - min(q->nr_requests / 8, 8UL);
+	return q->nr_requests - min(q->nr_requests / 8, 8UL) - 1;
 }
 /*
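
For a sense of the numbers (assuming the common default of 128 requests per
queue, which is not shown in this hunk): batch_requests() now returns
128 - min(128/8, 8) - 1 = 128 - 8 - 1 = 119, so a full queue has to give back
nine requests before waiters start to be batched back in.  The extra "- 1"
pairs with the switch from "count < batch_requests()" to
"count <= batch_requests()" in the callers below, so the effective wakeup
threshold is unchanged.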
@@ -1309,13 +1309,16 @@ static inline struct request *blk_alloc_request(request_queue_t *q,int gfp_mask)
 /*
  * Get a free request, queue_lock must not be held
  */
-static struct request *get_request(request_queue_t *q, int rw, int gfp_mask)
+static struct request *
+get_request(request_queue_t *q, int rw, int gfp_mask, int force)
 {
 	struct request *rq = NULL;
 	struct request_list *rl = &q->rq;
 	spin_lock_irq(q->queue_lock);
-	if (rl->count[rw] >= q->nr_requests && !elv_may_queue(q, rw)) {
+	if (rl->count[rw] == q->nr_requests)
+		blk_set_queue_full(q, rw);
+	if (blk_queue_full(q, rw) && !force && !elv_may_queue(q, rw)) {
 		spin_unlock_irq(q->queue_lock);
 		goto out;
 	}
@@ -1330,6 +1333,14 @@ static struct request *get_request(request_queue_t *q, int rw, int gfp_mask)
 		rl->count[rw]--;
 		if (rl->count[rw] < queue_congestion_off_threshold(q))
 			clear_queue_congested(q, rw);
+		if (rl->count[rw] <= batch_requests(q)) {
+			if (waitqueue_active(&rl->wait[rw]))
+				wake_up(&rl->wait[rw]);
+			else
+				blk_clear_queue_full(q, rw);
+		}
 		spin_unlock_irq(q->queue_lock);
 		goto out;
 	}
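
Two things change in get_request() itself: the queue-full flag is latched as
soon as the last request is taken (count == nr_requests), and the new 'force'
argument lets a caller bypass the full check.  The hunk above appears to be
the path where blk_alloc_request() failed after the slot had been accounted;
since the count is dropped again, it is treated just like a completed
request - wake a sleeper if there is one, otherwise clear the full flag.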
@@ -1366,26 +1377,22 @@ static struct request *get_request_wait(request_queue_t *q, int rw)
 {
 	DEFINE_WAIT(wait);
 	struct request *rq;
+	int waited = 0;
 	generic_unplug_device(q);
 	do {
-		rq = get_request(q, rw, GFP_NOIO);
+		struct request_list *rl = &q->rq;
-		if (!rq) {
-			struct request_list *rl = &q->rq;
-			prepare_to_wait_exclusive(&rl->wait[rw], &wait,
-					TASK_UNINTERRUPTIBLE);
+		prepare_to_wait_exclusive(&rl->wait[rw], &wait,
+				TASK_UNINTERRUPTIBLE);
-			/*
-			 * If _all_ the requests were suddenly returned then
-			 * no wakeup will be delivered. So now we're on the
-			 * waitqueue, go check for that.
-			 */
-			rq = get_request(q, rw, GFP_NOIO);
-			if (!rq)
-				io_schedule();
-			finish_wait(&rl->wait[rw], &wait);
+		rq = get_request(q, rw, GFP_NOIO, waited);
+		if (!rq) {
+			io_schedule();
+			waited = 1;
+		}
+		finish_wait(&rl->wait[rw], &wait);
 	} while (!rq);
 	return rq;
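
The rewrite of get_request_wait() is where the 'force' argument earns its
keep: the task now puts itself on the waitqueue before trying to allocate, and
passes 'waited' as 'force'.  A task that has been through io_schedule() at
least once therefore ignores the full flag on its next attempt - it was woken
precisely because a request was freed for it - while a first-time caller still
respects the flag, so a newcomer cannot steal the request out from under the
sleeper it was meant for.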
@@ -1397,10 +1404,10 @@ struct request *blk_get_request(request_queue_t *q, int rw, int gfp_mask)
 	BUG_ON(rw != READ && rw != WRITE);
-	rq = get_request(q, rw, gfp_mask);
-	if (!rq && (gfp_mask & __GFP_WAIT))
+	if (gfp_mask & __GFP_WAIT)
 		rq = get_request_wait(q, rw);
+	else
+		rq = get_request(q, rw, gfp_mask, 0);
 	return rq;
 }
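
blk_get_request() is adjusted to match: a __GFP_WAIT caller no longer takes a
fast-path attempt first, it goes straight into get_request_wait() and queues
up behind any tasks already sleeping, which is the fairness point of the whole
patch.  Non-blocking callers use the ordinary get_request() with force = 0.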
@@ -1551,9 +1558,13 @@ void __blk_put_request(request_queue_t *q, struct request *req)
 		rl->count[rw]--;
 		if (rl->count[rw] < queue_congestion_off_threshold(q))
 			clear_queue_congested(q, rw);
-		if (rl->count[rw] < batch_requests(q) &&
-				waitqueue_active(&rl->wait[rw]))
-			wake_up(&rl->wait[rw]);
+		if (rl->count[rw] <= batch_requests(q)) {
+			if (waitqueue_active(&rl->wait[rw]))
+				wake_up(&rl->wait[rw]);
+			else
+				blk_clear_queue_full(q, rw);
+		}
 	}
 }
@@ -1796,7 +1807,7 @@ static int __make_request(request_queue_t *q, struct bio *bio)
 		freereq = NULL;
 	} else {
 		spin_unlock_irq(q->queue_lock);
-		if ((freereq = get_request(q, rw, GFP_ATOMIC)) == NULL) {
+		if ((freereq = get_request(q, rw, GFP_ATOMIC, 0)) == NULL) {
 			/*
 			 * READA bit set
 			 */
@@ -1904,8 +1915,7 @@ static inline void blk_partition_remap(struct bio *bio)
  * bio happens to be merged with someone else, and may change bi_dev and
  * bi_sector for remaps as it sees fit. So the values of these fields
  * should NOT be depended on after the call to generic_make_request.
- *
- * */
+ */
 void generic_make_request(struct bio *bio)
 {
 	request_queue_t *q;
@@ -2415,6 +2425,19 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count)
 	else if (rl->count[WRITE] < queue_congestion_off_threshold(q))
 		clear_queue_congested(q, WRITE);
+	if (rl->count[READ] >= q->nr_requests) {
+		blk_set_queue_full(q, READ);
+	} else if (rl->count[READ] <= batch_requests(q)) {
+		blk_clear_queue_full(q, READ);
+		wake_up_all(&rl->wait[READ]);
+	}
+	if (rl->count[WRITE] >= q->nr_requests) {
+		blk_set_queue_full(q, WRITE);
+	} else if (rl->count[WRITE] <= batch_requests(q)) {
+		blk_clear_queue_full(q, WRITE);
+		wake_up_all(&rl->wait[WRITE]);
+	}
 	return ret;
 }
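
The sysfs store handler for nr_requests needs the same treatment because
changing the queue depth moves both thresholds: each direction's full flag is
recomputed against the new limit, and wake_up_all() is used rather than
wake_up(), presumably because after raising nr_requests the count may be far
enough below the new watermark that every sleeper can be served at once.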
@@ -307,6 +307,8 @@ struct request_queue
 #define QUEUE_FLAG_CLUSTER	0	/* cluster several segments into 1 */
 #define QUEUE_FLAG_QUEUED	1	/* uses generic tag queueing */
 #define QUEUE_FLAG_STOPPED	2	/* queue is stopped */
+#define QUEUE_FLAG_READFULL	3	/* write queue has been filled */
+#define QUEUE_FLAG_WRITEFULL	4	/* read queue has been filled */
 #define blk_queue_plugged(q)	!list_empty(&(q)->plug_list)
 #define blk_queue_tagged(q)	test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags)
@@ -322,6 +324,30 @@ struct request_queue
 #define rq_data_dir(rq)	((rq)->flags & 1)
+static inline int blk_queue_full(struct request_queue *q, int rw)
+{
+	if (rw == READ)
+		return test_bit(QUEUE_FLAG_READFULL, &q->queue_flags);
+	return test_bit(QUEUE_FLAG_WRITEFULL, &q->queue_flags);
+}
+
+static inline void blk_set_queue_full(struct request_queue *q, int rw)
+{
+	if (rw == READ)
+		set_bit(QUEUE_FLAG_READFULL, &q->queue_flags);
+	else
+		set_bit(QUEUE_FLAG_WRITEFULL, &q->queue_flags);
+}
+
+static inline void blk_clear_queue_full(struct request_queue *q, int rw)
+{
+	if (rw == READ)
+		clear_bit(QUEUE_FLAG_READFULL, &q->queue_flags);
+	else
+		clear_bit(QUEUE_FLAG_WRITEFULL, &q->queue_flags);
+}
+
 /*
  * mergeable request must not have _NOMERGE or _BARRIER bit set, nor may
  * it already be started by driver.
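
The new helpers are straightforward per-direction wrappers around atomic bit
operations on queue_flags (note, in passing, that the comments on the two flag
definitions above are transposed: READFULL tracks the read queue and WRITEFULL
the write queue, as the helpers themselves show).  A hedged user-space
rendition, with plain C bit operations standing in for the kernel's
test_bit()/set_bit()/clear_bit() and invented names for everything else:

#include <stdbool.h>
#include <stdio.h>

#define READ			0
#define WRITE			1

/* stand-ins for the two new queue flags */
#define QUEUE_FLAG_READFULL	3
#define QUEUE_FLAG_WRITEFULL	4

struct request_queue {
	unsigned long queue_flags;
};

/* same shape as the kernel helpers, minus the atomicity of set_bit() etc. */
static bool blk_queue_full(struct request_queue *q, int rw)
{
	if (rw == READ)
		return q->queue_flags & (1UL << QUEUE_FLAG_READFULL);
	return q->queue_flags & (1UL << QUEUE_FLAG_WRITEFULL);
}

static void blk_set_queue_full(struct request_queue *q, int rw)
{
	q->queue_flags |= 1UL << (rw == READ ?
			QUEUE_FLAG_READFULL : QUEUE_FLAG_WRITEFULL);
}

static void blk_clear_queue_full(struct request_queue *q, int rw)
{
	q->queue_flags &= ~(1UL << (rw == READ ?
			QUEUE_FLAG_READFULL : QUEUE_FLAG_WRITEFULL));
}

int main(void)
{
	struct request_queue q = { 0 };

	blk_set_queue_full(&q, READ);
	printf("read full %d, write full %d\n",
	       blk_queue_full(&q, READ), blk_queue_full(&q, WRITE));
	blk_clear_queue_full(&q, READ);
	printf("read full %d after clear\n", blk_queue_full(&q, READ));
	return 0;
}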