Commit 5fa9d488 authored by Andrew Morton, committed by Linus Torvalds

[PATCH] Fix busy-wait with writeback to large queues

blk_congestion_wait() is a utility function which various callers use
to throttle themselves to the rate at which the IO system can retire
writes.

The current implementation refuses to wait if no queues are "congested"
(>75% of requests are in flight).

That doesn't work if the queue is so huge that it can hold more than
40% (dirty_ratio) of memory.  The queue simply cannot enter congestion
because the VM refuses to allow more than 40% of memory to be dirtied.
(This spin could happen with a lot of normal-sized queues too.)
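To make the arithmetic concrete, here is a small standalone sketch with made-up numbers (machine size, queue depth and request size are all hypothetical, not taken from the patch): once the amount of IO the queue must hold to reach its congestion point exceeds the VM's dirty limit, the queue can never report congestion, so the old blk_congestion_wait() returns immediately and the caller spins.

/* busy-wait-arithmetic.c - illustrative only; all numbers are hypothetical */
#include <stdio.h>

int main(void)
{
	long mem_mb = 256;		/* machine memory */
	long dirty_ratio = 40;		/* VM allows 40% of memory to be dirty */
	long nr_requests = 4096;	/* a "large" request queue */
	long req_kb = 128;		/* average size of one write request */

	long dirty_limit_mb = mem_mb * dirty_ratio / 100;
	long congest_mb = (nr_requests * req_kb / 1024) * 75 / 100;

	printf("dirty limit: %ld MB, congestion point: ~%ld MB in flight\n",
	       dirty_limit_mb, congest_mb);
	if (dirty_limit_mb < congest_mb)
		printf("queue can never congest -> old blk_congestion_wait() "
		       "returns without sleeping\n");
	return 0;
}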

So this patch simply changes blk_congestion_wait() to throttle even if
there are no congested queues.  It will cause the caller to sleep until
someone puts back a write request against any queue.  (Nobody uses
blk_congestion_wait for read congestion).
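For context, a minimal sketch of the caller pattern this change assumes (the loop and the two helpers below are hypothetical placeholders, not the real mm/page-writeback.c code): the writer keeps pushing IO and now reliably sleeps in blk_congestion_wait() until some queue hands back a write request or the timeout expires.

/* Sketch only: dirty_memory_over_limit() and start_background_writeback()
 * are hypothetical stand-ins for the VM's real throttling machinery. */
static void writeback_throttle_sketch(void)
{
	while (dirty_memory_over_limit()) {
		start_background_writeback();
		/* Sleep until any queue returns a write request, or ~50ms. */
		blk_congestion_wait(WRITE, HZ / 20);
	}
}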

The patch adds new state to backing_dev_info->state: a couple of flags
which indicate whether there are _any_ reads or writes in flight
against that queue.  This was added to prevent blk_congestion_wait()
from taking a nap when there are no writes at all in flight.

But the "are there any reads" info could be used to defer background
writeout from pdflush, to reduce read-vs-write competition.  We'll see.
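If that experiment were tried, it might look something like the sketch below (purely hypothetical, not part of this patch), built on the bdi_read_active() and bdi_write_congested() helpers provided by the header:

/* Hypothetical policy helper: hold off background writeout while the
 * device is busy with reads and its write queue still has room. */
static int defer_background_writeout(struct backing_dev_info *bdi)
{
	return bdi_read_active(bdi) && !bdi_write_congested(bdi);
}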

This matters because large request queues have made a fundamental
change: blocking in get_request_wait() has been the main form of VM
throttling for years, but with large queues it doesn't work any more -
all throttling happens in blk_congestion_wait().

Also, change io_schedule_timeout() to propagate the schedule_timeout()
return value.  I was using that in some debug code, but it should have
been like that from day one.
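As an illustration of why the propagated return value is useful (the snippet is hypothetical debug code in the spirit the message mentions, not taken from the patch): schedule_timeout() returns the jiffies remaining, so a caller can tell whether it was woken early or slept the full period.

/* Hypothetical debug wrapper around the new io_schedule_timeout(). */
static void debug_io_wait(long timeout)
{
	long remaining = io_schedule_timeout(timeout);

	if (remaining)
		printk(KERN_DEBUG "woken early, %ld jiffies left\n", remaining);
	else
		printk(KERN_DEBUG "slept the full %ld jiffies\n", timeout);
}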
parent 40a7fe2f
@@ -56,6 +56,7 @@ int blk_nohighio = 0;
 static struct congestion_state {
 	wait_queue_head_t wqh;
 	atomic_t nr_congested_queues;
+	atomic_t nr_active_queues;
 } congestion_states[2];
 
 /*
@@ -86,6 +87,11 @@ static inline int queue_congestion_off_threshold(void)
 	return ret;
 }
 
+/*
+ * A queue has just exited congestion. Note this in the global counter of
+ * congested queues, and wake up anyone who was waiting for requests to be
+ * put back.
+ */
 static void clear_queue_congested(request_queue_t *q, int rw)
 {
 	enum bdi_state bit;
@@ -99,6 +105,10 @@ static void clear_queue_congested(request_queue_t *q, int rw)
 	wake_up(&cs->wqh);
 }
 
+/*
+ * A queue has just entered congestion. Flag that in the queue's VM-visible
+ * state flags and increment the global counter of congested queues.
+ */
 static void set_queue_congested(request_queue_t *q, int rw)
 {
 	enum bdi_state bit;
@@ -109,6 +119,34 @@ static void set_queue_congested(request_queue_t *q, int rw)
 	atomic_inc(&congestion_states[rw].nr_congested_queues);
 }
 
+/*
+ * A queue has just put back its last read or write request and has fallen
+ * idle.
+ */
+static void clear_queue_active(request_queue_t *q, int rw)
+{
+	enum bdi_state bit;
+
+	bit = (rw == WRITE) ? BDI_write_active : BDI_read_active;
+
+	if (test_and_clear_bit(bit, &q->backing_dev_info.state))
+		atomic_dec(&congestion_states[rw].nr_active_queues);
+}
+
+/*
+ * A queue has just taken its first read or write request and has become
+ * active.
+ */
+static void set_queue_active(request_queue_t *q, int rw)
+{
+	enum bdi_state bit;
+
+	bit = (rw == WRITE) ? BDI_write_active : BDI_read_active;
+
+	if (!test_and_set_bit(bit, &q->backing_dev_info.state))
+		atomic_inc(&congestion_states[rw].nr_active_queues);
+}
+
 /**
  * blk_get_backing_dev_info - get the address of a queue's backing_dev_info
  * @dev: device
@@ -1252,6 +1290,8 @@ static struct request *get_request(request_queue_t *q, int rw)
 		rq = blkdev_free_rq(&rl->free);
 		list_del_init(&rq->queuelist);
 		rq->ref_count = 1;
+		if (rl->count == queue_nr_requests)
+			set_queue_active(q, rw);
 		rl->count--;
 		if (rl->count < queue_congestion_on_threshold())
 			set_queue_congested(q, rw);
@@ -1484,6 +1524,8 @@ void __blk_put_request(request_queue_t *q, struct request *req)
 		rl->count++;
 		if (rl->count >= queue_congestion_off_threshold())
 			clear_queue_congested(q, rw);
+		if (rl->count == queue_nr_requests)
+			clear_queue_active(q, rw);
 		if (rl->count >= batch_requests && waitqueue_active(&rl->wait))
 			wake_up(&rl->wait);
 	}
@@ -1512,19 +1554,20 @@ void blk_put_request(struct request *req)
  * @timeout: timeout in jiffies
  *
  * Waits for up to @timeout jiffies for a queue (any queue) to exit congestion.
- * If no queues are congested then just return, in the hope that the caller
- * will submit some more IO.
+ * If no queues are congested then just wait for the next request to be
+ * returned.
  */
 void blk_congestion_wait(int rw, long timeout)
 {
 	DEFINE_WAIT(wait);
 	struct congestion_state *cs = &congestion_states[rw];
 
-	if (atomic_read(&cs->nr_congested_queues) == 0)
+	if (!atomic_read(&cs->nr_active_queues))
 		return;
 	blk_run_queues();
 	prepare_to_wait(&cs->wqh, &wait, TASK_UNINTERRUPTIBLE);
-	if (atomic_read(&cs->nr_congested_queues) != 0)
+	if (atomic_read(&cs->nr_active_queues))
 		io_schedule_timeout(timeout);
 	finish_wait(&cs->wqh, &wait);
 }
@@ -2157,6 +2200,7 @@ int __init blk_dev_init(void)
 	for (i = 0; i < ARRAY_SIZE(congestion_states); i++) {
 		init_waitqueue_head(&congestion_states[i].wqh);
 		atomic_set(&congestion_states[i].nr_congested_queues, 0);
+		atomic_set(&congestion_states[i].nr_active_queues, 0);
 	}
 	return 0;
 };
@@ -17,6 +17,8 @@ enum bdi_state {
 	BDI_pdflush,		/* A pdflush thread is working this device */
 	BDI_write_congested,	/* The write queue is getting full */
 	BDI_read_congested,	/* The read queue is getting full */
+	BDI_write_active,	/* There are one or more queued writes */
+	BDI_read_active,	/* There are one or more queued reads */
 	BDI_unused,		/* Available bits start here */
 };
@@ -42,4 +44,14 @@ static inline int bdi_write_congested(struct backing_dev_info *bdi)
 	return test_bit(BDI_write_congested, &bdi->state);
 }
 
+static inline int bdi_read_active(struct backing_dev_info *bdi)
+{
+	return test_bit(BDI_read_active, &bdi->state);
+}
+
+static inline int bdi_write_active(struct backing_dev_info *bdi)
+{
+	return test_bit(BDI_write_active, &bdi->state);
+}
+
 #endif	/* _LINUX_BACKING_DEV_H */
@@ -150,7 +150,7 @@ extern void show_stack(unsigned long *stack);
 extern void show_regs(struct pt_regs *);
 
 void io_schedule(void);
-void io_schedule_timeout(long timeout);
+long io_schedule_timeout(long timeout);
 
 extern void cpu_init (void);
 extern void trap_init(void);
@@ -1745,13 +1745,15 @@ void io_schedule(void)
 	atomic_dec(&rq->nr_iowait);
 }
 
-void io_schedule_timeout(long timeout)
+long io_schedule_timeout(long timeout)
 {
 	struct runqueue *rq = this_rq();
+	long ret;
 
 	atomic_inc(&rq->nr_iowait);
-	schedule_timeout(timeout);
+	ret = schedule_timeout(timeout);
 	atomic_dec(&rq->nr_iowait);
+	return ret;
 }
 
 /**