Commit 5fa9d488 authored by Andrew Morton, committed by Linus Torvalds

[PATCH] Fix busy-wait with writeback to large queues

blk_congestion_wait() is a utility function which various callers use
to throttle themselves to the rate at which the IO system can retire
writes.

The current implementation refuses to wait if no queues are "congested"
(>75% of requests are in flight).

That doesn't work if the queue is so huge that it can hold more than
40% (dirty_ratio) of memory.  The queue simply cannot enter congestion
because the VM refuses to allow more than 40% of memory to be dirtied.
(This spin could happen with a lot of normal-sized queues too.)
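To make the arithmetic concrete, here is a small standalone sketch with made-up numbers (machine size, queue depth and request size are all hypothetical, not taken from the patch): once the amount of IO the queue must hold to reach its congestion point exceeds the VM's dirty limit, the queue can never report congestion, so the old blk_congestion_wait() returns immediately and the caller spins.

/* busy-wait-arithmetic.c - illustrative only; all numbers are hypothetical */
#include <stdio.h>

int main(void)
{
	long mem_mb = 256;		/* machine memory */
	long dirty_ratio = 40;		/* VM allows 40% of memory to be dirty */
	long nr_requests = 4096;	/* a "large" request queue */
	long req_kb = 128;		/* average size of one write request */

	long dirty_limit_mb = mem_mb * dirty_ratio / 100;
	long congest_mb = (nr_requests * req_kb / 1024) * 75 / 100;

	printf("dirty limit: %ld MB, congestion point: ~%ld MB in flight\n",
	       dirty_limit_mb, congest_mb);
	if (dirty_limit_mb < congest_mb)
		printf("queue can never congest -> old blk_congestion_wait() "
		       "returns without sleeping\n");
	return 0;
}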

So this patch simply changes blk_congestion_wait() to throttle even if
there are no congested queues.  It will cause the caller to sleep until
someone puts back a write request against any queue.  (Nobody uses
blk_congestion_wait for read congestion).
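For context, a minimal sketch of the caller pattern this change assumes (the loop and the two helpers below are hypothetical placeholders, not the real mm/page-writeback.c code): the writer keeps pushing IO and now reliably sleeps in blk_congestion_wait() until some queue hands back a write request or the timeout expires.

/* Sketch only: dirty_memory_over_limit() and start_background_writeback()
 * are hypothetical stand-ins for the VM's real throttling machinery. */
static void writeback_throttle_sketch(void)
{
	while (dirty_memory_over_limit()) {
		start_background_writeback();
		/* Sleep until any queue returns a write request, or ~50ms. */
		blk_congestion_wait(WRITE, HZ / 20);
	}
}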

The patch adds new state to backing_dev_info->state: a couple of flags
which indicate whether there are _any_ reads or writes in flight
against that queue.  This was added to prevent blk_congestion_wait()
from taking a nap when there are no writes at all in flight.

But the "are there any reads" info could be used to defer background
writeout from pdflush, to reduce read-vs-write competition.  We'll see.
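If that experiment were tried, it might look something like the sketch below (purely hypothetical, not part of this patch), built on the bdi_read_active() and bdi_write_congested() helpers provided by the header:

/* Hypothetical policy helper: hold off background writeout while the
 * device is busy with reads and its write queue still has room. */
static int defer_background_writeout(struct backing_dev_info *bdi)
{
	return bdi_read_active(bdi) && !bdi_write_congested(bdi);
}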

This matters because large request queues have made a fundamental
change: blocking in get_request_wait() has been the main form of VM
throttling for years, but with large queues it doesn't work any more -
all throttling happens in blk_congestion_wait().

Also, change io_schedule_timeout() to propagate the schedule_timeout()
return value.  I was using that in some debug code, but it should have
been like that from day one.
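As an illustration of why the propagated return value is useful (the snippet is hypothetical debug code in the spirit the message mentions, not taken from the patch): schedule_timeout() returns the jiffies remaining, so a caller can tell whether it was woken early or slept the full period.

/* Hypothetical debug wrapper around the new io_schedule_timeout(). */
static void debug_io_wait(long timeout)
{
	long remaining = io_schedule_timeout(timeout);

	if (remaining)
		printk(KERN_DEBUG "woken early, %ld jiffies left\n", remaining);
	else
		printk(KERN_DEBUG "slept the full %ld jiffies\n", timeout);
}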
parent 40a7fe2f
@@ -56,6 +56,7 @@ int blk_nohighio = 0;
 static struct congestion_state {
 	wait_queue_head_t wqh;
 	atomic_t nr_congested_queues;
+	atomic_t nr_active_queues;
 } congestion_states[2];
 
 /*
@@ -86,6 +87,11 @@ static inline int queue_congestion_off_threshold(void)
 	return ret;
 }
 
+/*
+ * A queue has just exited congestion. Note this in the global counter of
+ * congested queues, and wake up anyone who was waiting for requests to be
+ * put back.
+ */
 static void clear_queue_congested(request_queue_t *q, int rw)
 {
 	enum bdi_state bit;
@@ -99,6 +105,10 @@ static void clear_queue_congested(request_queue_t *q, int rw)
 	wake_up(&cs->wqh);
 }
 
+/*
+ * A queue has just entered congestion. Flag that in the queue's VM-visible
+ * state flags and increment the global counter of congested queues.
+ */
 static void set_queue_congested(request_queue_t *q, int rw)
 {
 	enum bdi_state bit;
@@ -109,6 +119,34 @@ static void set_queue_congested(request_queue_t *q, int rw)
 	atomic_inc(&congestion_states[rw].nr_congested_queues);
 }
 
+/*
+ * A queue has just put back its last read or write request and has fallen
+ * idle.
+ */
+static void clear_queue_active(request_queue_t *q, int rw)
+{
+	enum bdi_state bit;
+
+	bit = (rw == WRITE) ? BDI_write_active : BDI_read_active;
+
+	if (test_and_clear_bit(bit, &q->backing_dev_info.state))
+		atomic_dec(&congestion_states[rw].nr_active_queues);
+}
+
+/*
+ * A queue has just taken its first read or write request and has become
+ * active.
+ */
+static void set_queue_active(request_queue_t *q, int rw)
+{
+	enum bdi_state bit;
+
+	bit = (rw == WRITE) ? BDI_write_active : BDI_read_active;
+
+	if (!test_and_set_bit(bit, &q->backing_dev_info.state))
+		atomic_inc(&congestion_states[rw].nr_active_queues);
+}
+
 /**
  * blk_get_backing_dev_info - get the address of a queue's backing_dev_info
  * @dev: device
@@ -1252,6 +1290,8 @@ static struct request *get_request(request_queue_t *q, int rw)
 		rq = blkdev_free_rq(&rl->free);
 		list_del_init(&rq->queuelist);
 		rq->ref_count = 1;
+		if (rl->count == queue_nr_requests)
+			set_queue_active(q, rw);
 		rl->count--;
 		if (rl->count < queue_congestion_on_threshold())
 			set_queue_congested(q, rw);
@@ -1484,6 +1524,8 @@ void __blk_put_request(request_queue_t *q, struct request *req)
 		rl->count++;
 		if (rl->count >= queue_congestion_off_threshold())
 			clear_queue_congested(q, rw);
+		if (rl->count == queue_nr_requests)
+			clear_queue_active(q, rw);
 		if (rl->count >= batch_requests && waitqueue_active(&rl->wait))
 			wake_up(&rl->wait);
 	}
@@ -1512,19 +1554,20 @@ void blk_put_request(struct request *req)
  * @timeout: timeout in jiffies
  *
  * Waits for up to @timeout jiffies for a queue (any queue) to exit congestion.
- * If no queues are congested then just return, in the hope that the caller
- * will submit some more IO.
+ * If no queues are congested then just wait for the next request to be
+ * returned.
  */
 void blk_congestion_wait(int rw, long timeout)
 {
 	DEFINE_WAIT(wait);
 	struct congestion_state *cs = &congestion_states[rw];
 
-	if (atomic_read(&cs->nr_congested_queues) == 0)
+	if (!atomic_read(&cs->nr_active_queues))
 		return;
 	blk_run_queues();
 	prepare_to_wait(&cs->wqh, &wait, TASK_UNINTERRUPTIBLE);
-	if (atomic_read(&cs->nr_congested_queues) != 0)
+	if (atomic_read(&cs->nr_active_queues))
 		io_schedule_timeout(timeout);
 	finish_wait(&cs->wqh, &wait);
 }
@@ -2157,6 +2200,7 @@ int __init blk_dev_init(void)
 	for (i = 0; i < ARRAY_SIZE(congestion_states); i++) {
 		init_waitqueue_head(&congestion_states[i].wqh);
 		atomic_set(&congestion_states[i].nr_congested_queues, 0);
+		atomic_set(&congestion_states[i].nr_active_queues, 0);
 	}
 	return 0;
 };
@@ -17,6 +17,8 @@ enum bdi_state {
 	BDI_pdflush,		/* A pdflush thread is working this device */
 	BDI_write_congested,	/* The write queue is getting full */
 	BDI_read_congested,	/* The read queue is getting full */
+	BDI_write_active,	/* There are one or more queued writes */
+	BDI_read_active,	/* There are one or more queued reads */
 	BDI_unused,		/* Available bits start here */
 };
@@ -42,4 +44,14 @@ static inline int bdi_write_congested(struct backing_dev_info *bdi)
 	return test_bit(BDI_write_congested, &bdi->state);
 }
 
+static inline int bdi_read_active(struct backing_dev_info *bdi)
+{
+	return test_bit(BDI_read_active, &bdi->state);
+}
+
+static inline int bdi_write_active(struct backing_dev_info *bdi)
+{
+	return test_bit(BDI_write_active, &bdi->state);
+}
+
 #endif	/* _LINUX_BACKING_DEV_H */
@@ -150,7 +150,7 @@ extern void show_stack(unsigned long *stack);
 extern void show_regs(struct pt_regs *);
 
 void io_schedule(void);
-void io_schedule_timeout(long timeout);
+long io_schedule_timeout(long timeout);
 
 extern void cpu_init (void);
 extern void trap_init(void);
@@ -1745,13 +1745,15 @@ void io_schedule(void)
 	atomic_dec(&rq->nr_iowait);
 }
 
-void io_schedule_timeout(long timeout)
+long io_schedule_timeout(long timeout)
 {
 	struct runqueue *rq = this_rq();
+	long ret;
 
 	atomic_inc(&rq->nr_iowait);
-	schedule_timeout(timeout);
+	ret = schedule_timeout(timeout);
 	atomic_dec(&rq->nr_iowait);
+	return ret;
 }
 
 /**