Commit 6d27f67b authored by Andrew Morton's avatar Andrew Morton Committed by Linus Torvalds

[PATCH] per-backing dev unplugging

From: Jens Axboe <axboe@suse.de>,
      Chris Mason,
      me, others.

The global unplug list causes horrid spinlock contention on many-disk
many-CPU setups - throughput is worse than halved.

The other problem with the global unplugging is of course that it will cause
the unplugging of queues which are unrelated to the I/O upon which the caller
is about to wait.

So what we do to solve these problems is to remove the global unplug and set
up the infrastructure under which the VFS can tell the block layer to unplug
only those queues which are relevant to the page or buffer_head which is
about to be waited upon.

We do this via the very appropriate address_space->backing_dev_info structure.

Most of the complexity is in devicemapper, MD and swapper_space, because for
these backing devices, multiple queues may need to be unplugged to complete a
page/buffer I/O.  In each case we ensure that data structures are in place to
permit us to identify all the lower-level queues which contribute to the
higher-level backing_dev_info.  Each contributing queue is told to unplug in
response to a higher-level unplug.

To simplify things in various places we also introduce the concept of a
"synchronous BIO": it is tagged with BIO_RW_SYNC.  The block layer will
perform an immediate unplug when it sees one of these go past.
parent 3749bf2c
......@@ -42,12 +42,6 @@ static void blk_unplug_timeout(unsigned long data);
*/
static kmem_cache_t *request_cachep;
/*
* plug management
*/
static LIST_HEAD(blk_plug_list);
static spinlock_t blk_plug_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
static wait_queue_head_t congestion_wqh[2] = {
__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
......@@ -251,8 +245,6 @@ void blk_queue_make_request(request_queue_t * q, make_request_fn * mfn)
*/
blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH);
INIT_LIST_HEAD(&q->plug_list);
blk_queue_activity_fn(q, NULL, NULL);
}
......@@ -1104,13 +1096,11 @@ void blk_plug_device(request_queue_t *q)
* don't plug a stopped queue, it must be paired with blk_start_queue()
* which will restart the queueing
*/
if (!blk_queue_plugged(q)
&& !test_bit(QUEUE_FLAG_STOPPED, &q->queue_flags)) {
spin_lock(&blk_plug_lock);
list_add_tail(&q->plug_list, &blk_plug_list);
if (test_bit(QUEUE_FLAG_STOPPED, &q->queue_flags))
return;
if (!test_and_set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags))
mod_timer(&q->unplug_timer, jiffies + q->unplug_delay);
spin_unlock(&blk_plug_lock);
}
}
EXPORT_SYMBOL(blk_plug_device);
......@@ -1122,15 +1112,12 @@ EXPORT_SYMBOL(blk_plug_device);
int blk_remove_plug(request_queue_t *q)
{
WARN_ON(!irqs_disabled());
if (blk_queue_plugged(q)) {
spin_lock(&blk_plug_lock);
list_del_init(&q->plug_list);
del_timer(&q->unplug_timer);
spin_unlock(&blk_plug_lock);
return 1;
}
if (!test_and_clear_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags))
return 0;
del_timer(&q->unplug_timer);
return 1;
}
EXPORT_SYMBOL(blk_remove_plug);
......@@ -1161,24 +1148,32 @@ static inline void __generic_unplug_device(request_queue_t *q)
* Linux uses plugging to build bigger request queues before letting
* the device have at them. If a queue is plugged, the I/O scheduler
* is still adding and merging requests on the queue. Once the queue
* gets unplugged (either by manually calling this function, or by
* calling blk_run_queues()), the request_fn defined for the
* queue is invoked and transfers started.
* gets unplugged, the request_fn defined for the queue is invoked and
* transfers started.
**/
void generic_unplug_device(void *data)
void generic_unplug_device(request_queue_t *q)
{
request_queue_t *q = data;
spin_lock_irq(q->queue_lock);
__generic_unplug_device(q);
spin_unlock_irq(q->queue_lock);
}
EXPORT_SYMBOL(generic_unplug_device);
/*
 * backing_dev_info unplug hook: recover the request queue stored in
 * bdi->unplug_io_data and run that queue's unplug_fn, if it has one.
 */
static void blk_backing_dev_unplug(struct backing_dev_info *bdi)
{
	request_queue_t *queue = bdi->unplug_io_data;

	/* not every device defines an ->unplug_fn; nothing to do in that case */
	if (!queue->unplug_fn)
		return;

	queue->unplug_fn(queue);
}
/*
 * Run the unplug_fn of the request queue passed in through the opaque
 * @data pointer.  Unlike blk_backing_dev_unplug(), unplug_fn is called
 * unconditionally here.
 */
static void blk_unplug_work(void *data)
{
	request_queue_t *queue = data;

	queue->unplug_fn(queue);
}
......@@ -1255,42 +1250,6 @@ void blk_run_queue(struct request_queue *q)
EXPORT_SYMBOL(blk_run_queue);
/**
* blk_run_queues - fire all plugged queues
*
* Description:
* Start I/O on all plugged queues known to the block layer. Queues that
* are currently stopped are ignored. This is equivalent to the older
* tq_disk task queue run.
**/
#define blk_plug_entry(entry) list_entry((entry), request_queue_t, plug_list)
/*
* Unplug every queue currently on the global blk_plug_list.
*
* The global list is spliced onto a private list while blk_plug_lock is
* held, so that the lock can be dropped around each unplug_fn call.
*/
void blk_run_queues(void)
{
LIST_HEAD(local_plug_list);
spin_lock_irq(&blk_plug_lock);
/*
* this will happen fairly often
*/
if (list_empty(&blk_plug_list))
goto out;
/* take over the whole global list; blk_plug_list is left empty */
list_splice_init(&blk_plug_list, &local_plug_list);
while (!list_empty(&local_plug_list)) {
request_queue_t *q = blk_plug_entry(local_plug_list.next);
/* unplug_fn is called without blk_plug_lock held */
spin_unlock_irq(&blk_plug_lock);
/*
* NOTE(review): the loop only terminates if unplug_fn takes q off
* the list (presumably via blk_remove_plug) - confirm for every
* unplug_fn implementation.
*/
q->unplug_fn(q);
spin_lock_irq(&blk_plug_lock);
}
out:
spin_unlock_irq(&blk_plug_lock);
}
EXPORT_SYMBOL(blk_run_queues);
/**
* blk_cleanup_queue: - release a &request_queue_t when it is no longer needed
* @q: the request queue to be released
......@@ -1390,6 +1349,10 @@ request_queue_t *blk_alloc_queue(int gfp_mask)
memset(q, 0, sizeof(*q));
init_timer(&q->unplug_timer);
atomic_set(&q->refcnt, 1);
q->backing_dev_info.unplug_io_fn = blk_backing_dev_unplug;
q->backing_dev_info.unplug_io_data = q;
return q;
}
......@@ -2050,7 +2013,6 @@ long blk_congestion_wait(int rw, long timeout)
DEFINE_WAIT(wait);
wait_queue_head_t *wqh = &congestion_wqh[rw];
blk_run_queues();
prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
ret = io_schedule_timeout(timeout);
finish_wait(wqh, &wait);
......@@ -2315,7 +2277,7 @@ static int __make_request(request_queue_t *q, struct bio *bio)
if (blk_queue_plugged(q)) {
int nr_queued = q->rq.count[READ] + q->rq.count[WRITE];
if (nr_queued == q->unplug_thresh)
if (nr_queued == q->unplug_thresh || bio_sync(bio))
__generic_unplug_device(q);
}
spin_unlock_irq(q->queue_lock);
......
......@@ -434,6 +434,17 @@ static int loop_make_request(request_queue_t *q, struct bio *old_bio)
goto out;
}
/*
* kick off io on the underlying address space
*/
static void loop_unplug(request_queue_t *q)
{
	struct loop_device *loop_dev = q->queuedata;

	/* mark this queue as no longer plugged ... */
	clear_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags);

	/* ... and pass the unplug on to the mapping of the backing file */
	blk_run_address_space(loop_dev->lo_backing_file->f_mapping);
}
struct switch_request {
struct file *file;
struct completion wait;
......@@ -614,7 +625,6 @@ static int loop_set_fd(struct loop_device *lo, struct file *lo_file,
{
struct file *file;
struct inode *inode;
struct block_device *lo_device = NULL;
struct address_space *mapping;
unsigned lo_blocksize;
int lo_flags = 0;
......@@ -671,7 +681,7 @@ static int loop_set_fd(struct loop_device *lo, struct file *lo_file,
set_device_ro(bdev, (lo_flags & LO_FLAGS_READ_ONLY) != 0);
lo->lo_blocksize = lo_blocksize;
lo->lo_device = lo_device;
lo->lo_device = bdev;
lo->lo_flags = lo_flags;
lo->lo_backing_file = file;
lo->transfer = NULL;
......@@ -688,6 +698,7 @@ static int loop_set_fd(struct loop_device *lo, struct file *lo_file,
*/
blk_queue_make_request(lo->lo_queue, loop_make_request);
lo->lo_queue->queuedata = lo;
lo->lo_queue->unplug_fn = loop_unplug;
set_capacity(disks[lo->lo_number], size);
bd_set_size(bdev, size << 9);
......
......@@ -271,6 +271,7 @@ static int rd_ioctl(struct inode *inode, struct file *file,
static struct backing_dev_info rd_backing_dev_info = {
.ra_pages = 0, /* No readahead */
.memory_backed = 1, /* Does not contribute to dirty memory */
.unplug_io_fn = default_unplug_io_fn,
};
static int rd_open(struct inode *inode, struct file *filp)
......
......@@ -368,9 +368,8 @@ static inline void reset_page(struct mm_page *page)
page->biotail = & page->bio;
}
static void mm_unplug_device(void *data)
static void mm_unplug_device(request_queue_t *q)
{
request_queue_t *q = data;
struct cardinfo *card = q->queuedata;
unsigned long flags;
......
......@@ -668,7 +668,7 @@ static int crypt_map(struct dm_target *ti, struct bio *bio,
/* out of memory -> run queues */
if (remaining)
blk_run_queues();
blk_congestion_wait(bio_data_dir(clone), HZ/100);
}
/* drop reference, clones could have returned before we reach this */
......
......@@ -885,8 +885,24 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits)
return r;
}
/*
 * Walk the device list of table @t and call the unplug_fn of each
 * device's request queue, skipping queues that do not define one.
 */
void dm_table_unplug_all(struct dm_table *t)
{
	struct list_head *head = dm_table_get_devices(t);
	struct list_head *pos = head->next;

	while (pos != head) {
		struct dm_dev *dev = list_entry(pos, struct dm_dev, list);
		request_queue_t *queue = bdev_get_queue(dev->bdev);

		if (queue->unplug_fn)
			queue->unplug_fn(queue);

		pos = pos->next;
	}
}
EXPORT_SYMBOL(dm_vcalloc);
EXPORT_SYMBOL(dm_get_device);
EXPORT_SYMBOL(dm_put_device);
EXPORT_SYMBOL(dm_table_event);
EXPORT_SYMBOL(dm_table_get_mode);
EXPORT_SYMBOL(dm_table_put);
EXPORT_SYMBOL(dm_table_get);
EXPORT_SYMBOL(dm_table_unplug_all);
......@@ -575,6 +575,17 @@ static int dm_request(request_queue_t *q, struct bio *bio)
return 0;
}
/*
 * unplug_fn for a mapped device: take a reference on the current table,
 * unplug every device in it, then drop the reference.
 */
static void dm_unplug_all(request_queue_t *q)
{
	struct mapped_device *md = q->queuedata;
	struct dm_table *table = dm_get_table(md);

	/* no table, nothing to unplug */
	if (!table)
		return;

	dm_table_unplug_all(table);
	dm_table_put(table);
}
static int dm_any_congested(void *congested_data, int bdi_bits)
{
int r;
......@@ -672,6 +683,7 @@ static struct mapped_device *alloc_dev(unsigned int minor, int persistent)
md->queue->backing_dev_info.congested_fn = dm_any_congested;
md->queue->backing_dev_info.congested_data = md;
blk_queue_make_request(md->queue, dm_request);
md->queue->unplug_fn = dm_unplug_all;
md->io_pool = mempool_create(MIN_IOS, mempool_alloc_slab,
mempool_free_slab, _io_cache);
......@@ -896,11 +908,17 @@ int dm_suspend(struct mapped_device *md)
add_wait_queue(&md->wait, &wait);
up_write(&md->lock);
/* unplug */
map = dm_get_table(md);
if (map) {
dm_table_unplug_all(map);
dm_table_put(map);
}
/*
* Then we wait for the already mapped ios to
* complete.
*/
blk_run_queues();
while (1) {
set_current_state(TASK_INTERRUPTIBLE);
......@@ -945,10 +963,9 @@ int dm_resume(struct mapped_device *md)
def = bio_list_get(&md->deferred);
__flush_deferred_io(md, def);
up_write(&md->lock);
dm_table_unplug_all(map);
dm_table_put(map);
blk_run_queues();
return 0;
}
......
......@@ -116,6 +116,7 @@ int dm_table_get_mode(struct dm_table *t);
void dm_table_suspend_targets(struct dm_table *t);
void dm_table_resume_targets(struct dm_table *t);
int dm_table_any_congested(struct dm_table *t, int bdi_bits);
void dm_table_unplug_all(struct dm_table *t);
/*-----------------------------------------------------------------
* A registry of target types.
......
......@@ -160,6 +160,30 @@ static int md_fail_request (request_queue_t *q, struct bio *bio)
return 0;
}
/*
 * Call the unplug_fn of the request queue behind every rdev of @mddev,
 * skipping queues that do not define one.
 */
void md_unplug_mddev(mddev_t *mddev)
{
	mdk_rdev_t *rdev;
	struct list_head *pos;

	/*
	 * this list iteration is done without any locking in md?!
	 */
	ITERATE_RDEV(mddev, rdev, pos) {
		request_queue_t *queue = bdev_get_queue(rdev->bdev);

		if (queue->unplug_fn)
			queue->unplug_fn(queue);
	}
}
EXPORT_SYMBOL(md_unplug_mddev);
/* unplug_fn for an md queue: q->queuedata is the mddev to unplug. */
static void md_unplug_all(request_queue_t *q)
{
	md_unplug_mddev(q->queuedata);
}
static inline mddev_t *mddev_get(mddev_t *mddev)
{
atomic_inc(&mddev->active);
......@@ -335,6 +359,8 @@ static int sync_page_io(struct block_device *bdev, sector_t sector, int size,
struct bio_vec vec;
struct completion event;
rw |= (1 << BIO_RW_SYNC);
bio_init(&bio);
bio.bi_io_vec = &vec;
vec.bv_page = page;
......@@ -349,7 +375,6 @@ static int sync_page_io(struct block_device *bdev, sector_t sector, int size,
bio.bi_private = &event;
bio.bi_end_io = bi_complete;
submit_bio(rw, &bio);
blk_run_queues();
wait_for_completion(&event);
return test_bit(BIO_UPTODATE, &bio.bi_flags);
......@@ -1644,6 +1669,7 @@ static int do_md_run(mddev_t * mddev)
*/
mddev->queue->queuedata = mddev;
mddev->queue->make_request_fn = mddev->pers->make_request;
mddev->queue->unplug_fn = md_unplug_all;
mddev->changed = 1;
return 0;
......@@ -2718,7 +2744,7 @@ int md_thread(void * arg)
run = thread->run;
if (run) {
run(thread->mddev);
blk_run_queues();
md_unplug_mddev(thread->mddev);
}
if (signal_pending(current))
flush_signals(current);
......@@ -3287,7 +3313,7 @@ static void md_do_sync(mddev_t *mddev)
test_bit(MD_RECOVERY_ERR, &mddev->recovery))
break;
blk_run_queues();
md_unplug_mddev(mddev);
repeat:
if (jiffies >= mark[last_mark] + SYNC_MARK_STEP ) {
......
......@@ -451,6 +451,7 @@ static int read_balance(conf_t *conf, struct bio *bio, r1bio_t *r1_bio)
static void device_barrier(conf_t *conf, sector_t sect)
{
md_unplug_mddev(conf->mddev);
spin_lock_irq(&conf->resync_lock);
wait_event_lock_irq(conf->wait_idle, !waitqueue_active(&conf->wait_resume), conf->resync_lock);
......@@ -478,6 +479,7 @@ static int make_request(request_queue_t *q, struct bio * bio)
* thread has put up a bar for new requests.
* Continue immediately if no resync is active currently.
*/
md_unplug_mddev(conf->mddev);
spin_lock_irq(&conf->resync_lock);
wait_event_lock_irq(conf->wait_resume, !conf->barrier, conf->resync_lock);
conf->nr_pending++;
......@@ -644,6 +646,7 @@ static void print_conf(conf_t *conf)
static void close_sync(conf_t *conf)
{
md_unplug_mddev(conf->mddev);
spin_lock_irq(&conf->resync_lock);
wait_event_lock_irq(conf->wait_resume, !conf->barrier, conf->resync_lock);
spin_unlock_irq(&conf->resync_lock);
......
......@@ -249,6 +249,7 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector
break;
if (!sh) {
conf->inactive_blocked = 1;
md_unplug_mddev(conf->mddev);
wait_event_lock_irq(conf->wait_for_stripe,
!list_empty(&conf->inactive_list) &&
(atomic_read(&conf->active_stripes) < (NR_STRIPES *3/4)
......@@ -1292,9 +1293,8 @@ static inline void raid5_activate_delayed(raid5_conf_t *conf)
}
}
}
static void raid5_unplug_device(void *data)
static void raid5_unplug_device(request_queue_t *q)
{
request_queue_t *q = data;
mddev_t *mddev = q->queuedata;
raid5_conf_t *conf = mddev_to_conf(mddev);
unsigned long flags;
......
......@@ -1454,9 +1454,8 @@ static inline void raid6_activate_delayed(raid6_conf_t *conf)
}
}
}
static void raid6_unplug_device(void *data)
static void raid6_unplug_device(request_queue_t *q)
{
request_queue_t *q = data;
mddev_t *mddev = q->queuedata;
raid6_conf_t *conf = mddev_to_conf(mddev);
unsigned long flags;
......
......@@ -147,8 +147,7 @@ static int blkmtd_readpage(struct blkmtd_dev *dev, struct page *page)
bio->bi_private = &event;
bio->bi_end_io = bi_read_complete;
if(bio_add_page(bio, page, PAGE_SIZE, 0) == PAGE_SIZE) {
submit_bio(READ, bio);
blk_run_queues();
submit_bio(READ_SYNC, bio);
wait_for_completion(&event);
err = test_bit(BIO_UPTODATE, &bio->bi_flags) ? 0 : -EIO;
bio_put(bio);
......@@ -179,8 +178,7 @@ static int blkmtd_write_out(struct bio *bio)
init_completion(&event);
bio->bi_private = &event;
bio->bi_end_io = bi_write_complete;
submit_bio(WRITE, bio);
blk_run_queues();
submit_bio(WRITE_SYNC, bio);
wait_for_completion(&event);
DEBUG(3, "submit_bio completed, bi_vcnt = %d\n", bio->bi_vcnt);
err = test_bit(BIO_UPTODATE, &bio->bi_flags) ? 0 : -EIO;
......
......@@ -132,7 +132,11 @@ void __wait_on_buffer(struct buffer_head * bh)
do {
prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
if (buffer_locked(bh)) {
blk_run_queues();
struct block_device *bd;
smp_mb();
bd = bh->b_bdev;
if (bd)
blk_run_address_space(bd->bd_inode->i_mapping);
io_schedule();
}
} while (buffer_locked(bh));
......@@ -492,7 +496,6 @@ static void free_more_memory(void)
pg_data_t *pgdat;
wakeup_bdflush(1024);
blk_run_queues();
yield();
for_each_pgdat(pgdat) {
......@@ -2927,7 +2930,10 @@ EXPORT_SYMBOL(try_to_free_buffers);
int block_sync_page(struct page *page)
{
blk_run_queues();
struct address_space *mapping;
smp_mb();
mapping = page->mapping;
blk_run_address_space(mapping);
return 0;
}
......
......@@ -364,7 +364,7 @@ static struct bio *dio_await_one(struct dio *dio)
if (dio->bio_list == NULL) {
dio->waiter = current;
spin_unlock_irqrestore(&dio->bio_lock, flags);
blk_run_queues();
blk_run_address_space(dio->inode->i_mapping);
io_schedule();
spin_lock_irqsave(&dio->bio_lock, flags);
dio->waiter = NULL;
......@@ -1035,7 +1035,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
if (ret == 0)
ret = dio->result;
finished_one_bio(dio); /* This can free the dio */
blk_run_queues();
blk_run_address_space(inode->i_mapping);
if (should_wait) {
unsigned long flags;
/*
......
......@@ -1975,8 +1975,7 @@ static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp)
bio->bi_end_io = lbmIODone;
bio->bi_private = bp;
submit_bio(READ, bio);
blk_run_queues();
submit_bio(READ_SYNC, bio);
wait_event(bp->l_ioevent, (bp->l_flag != lbmREAD));
......@@ -2120,9 +2119,8 @@ static void lbmStartIO(struct lbuf * bp)
/* check if journaling to disk has been disabled */
if (!log->no_integrity) {
submit_bio(WRITE, bio);
submit_bio(WRITE_SYNC, bio);
INCREMENT(lmStat.submitted);
blk_run_queues();
}
else {
bio->bi_size = 0;
......
......@@ -23,6 +23,7 @@
#include <linux/fs.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include "ntfs.h"
......@@ -668,7 +669,7 @@ int ntfs_read_compressed_block(struct page *page)
"uptodate! Unplugging the disk queue "
"and rescheduling.");
get_bh(tbh);
blk_run_queues();
blk_run_address_space(mapping);
schedule();
put_bh(tbh);
if (unlikely(!buffer_uptodate(tbh)))
......
......@@ -38,6 +38,7 @@
#include <linux/string.h>
#include <linux/smp_lock.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/sched.h>
#include "swab.h"
......@@ -456,7 +457,7 @@ void ufs_truncate (struct inode * inode)
break;
if (IS_SYNC(inode) && (inode->i_state & I_DIRTY))
ufs_sync_inode (inode);
blk_run_queues();
blk_run_address_space(inode->i_mapping);
yield();
}
offset = inode->i_size & uspi->s_fshift;
......
......@@ -1013,7 +1013,7 @@ pagebuf_lock(
{
PB_TRACE(pb, "lock", 0);
if (atomic_read(&pb->pb_io_remaining))
blk_run_queues();
blk_run_address_space(pb->pb_target->pbr_mapping);
down(&pb->pb_sema);
PB_SET_OWNER(pb);
PB_TRACE(pb, "locked", 0);
......@@ -1109,7 +1109,7 @@ _pagebuf_wait_unpin(
if (atomic_read(&pb->pb_pin_count) == 0)
break;
if (atomic_read(&pb->pb_io_remaining))
blk_run_queues();
blk_run_address_space(pb->pb_target->pbr_mapping);
schedule();
}
remove_wait_queue(&pb->pb_waiters, &wait);
......@@ -1407,7 +1407,7 @@ _pagebuf_ioapply(
if (pb->pb_flags & PBF_RUN_QUEUES) {
pb->pb_flags &= ~PBF_RUN_QUEUES;
if (atomic_read(&pb->pb_io_remaining) > 1)
blk_run_queues();
blk_run_address_space(pb->pb_target->pbr_mapping);
}
}
......@@ -1471,7 +1471,7 @@ pagebuf_iowait(
{
PB_TRACE(pb, "iowait", 0);
if (atomic_read(&pb->pb_io_remaining))
blk_run_queues();
blk_run_address_space(pb->pb_target->pbr_mapping);
down(&pb->pb_iodonesema);
PB_TRACE(pb, "iowaited", (long)pb->pb_error);
return pb->pb_error;
......@@ -1617,7 +1617,6 @@ STATIC int
pagebuf_daemon(
void *data)
{
int count;
page_buf_t *pb;
struct list_head *curr, *next, tmp;
......@@ -1640,7 +1639,6 @@ pagebuf_daemon(
spin_lock(&pbd_delwrite_lock);
count = 0;
list_for_each_safe(curr, next, &pbd_delwrite_queue) {
pb = list_entry(curr, page_buf_t, pb_list);
......@@ -1657,7 +1655,6 @@ pagebuf_daemon(
pb->pb_flags &= ~PBF_DELWRI;
pb->pb_flags |= PBF_WRITE;
list_move(&pb->pb_list, &tmp);
count++;
}
}
......@@ -1667,12 +1664,11 @@ pagebuf_daemon(
list_del_init(&pb->pb_list);
pagebuf_iostrategy(pb);
blk_run_address_space(pb->pb_target->pbr_mapping);
}
if (as_list_len > 0)
purge_addresses();
if (count)
blk_run_queues();
force_flush = 0;
} while (pagebuf_daemon_active);
......@@ -1689,7 +1685,6 @@ pagebuf_delwri_flush(
page_buf_t *pb;
struct list_head *curr, *next, tmp;
int pincount = 0;
int flush_cnt = 0;
pagebuf_runall_queues(pagebuf_dataio_workqueue);
pagebuf_runall_queues(pagebuf_logio_workqueue);
......@@ -1733,13 +1728,7 @@ pagebuf_delwri_flush(
pagebuf_lock(pb);
pagebuf_iostrategy(pb);
if (++flush_cnt > 32) {
blk_run_queues();
flush_cnt = 0;
}
}
blk_run_queues();
while (!list_empty(&tmp)) {
pb = list_entry(tmp.next, page_buf_t, pb_list);
......@@ -1751,6 +1740,9 @@ pagebuf_delwri_flush(
pagebuf_rele(pb);
}
if (flags & PBDF_WAIT)
blk_run_address_space(target->pbr_mapping);
if (pinptr)
*pinptr = pincount;
}
......
......@@ -28,9 +28,12 @@ struct backing_dev_info {
int memory_backed; /* Cannot clean pages with writepage */
congested_fn *congested_fn; /* Function pointer if device is md/dm */
void *congested_data; /* Pointer to aux data for congested func */
void (*unplug_io_fn)(struct backing_dev_info *);
void *unplug_io_data;
};
extern struct backing_dev_info default_backing_dev_info;
void default_unplug_io_fn(struct backing_dev_info *bdi);
int writeback_acquire(struct backing_dev_info *bdi);
int writeback_in_progress(struct backing_dev_info *bdi);
......
......@@ -119,11 +119,13 @@ struct bio {
* bit 1 -- rw-ahead when set
* bit 2 -- barrier
* bit 3 -- fail fast, don't want low level driver retries
* bit 4 -- synchronous I/O hint: the block layer will unplug immediately
*/
#define BIO_RW 0
#define BIO_RW_AHEAD 1
#define BIO_RW_BARRIER 2
#define BIO_RW_FAILFAST 3
#define BIO_RW_SYNC 4
/*
* various member access, note that bio_data should of course not be used
......@@ -138,6 +140,7 @@ struct bio {
#define bio_cur_sectors(bio) (bio_iovec(bio)->bv_len >> 9)
#define bio_data(bio) (page_address(bio_page((bio))) + bio_offset((bio)))
#define bio_barrier(bio) ((bio)->bi_rw & (1 << BIO_RW_BARRIER))
#define bio_sync(bio) ((bio)->bi_rw & (1 << BIO_RW_SYNC))
/*
* will die
......
......@@ -243,7 +243,7 @@ typedef int (merge_requests_fn) (request_queue_t *, struct request *,
typedef void (request_fn_proc) (request_queue_t *q);
typedef int (make_request_fn) (request_queue_t *q, struct bio *bio);
typedef int (prep_rq_fn) (request_queue_t *, struct request *);
typedef void (unplug_fn) (void *q);
typedef void (unplug_fn) (request_queue_t *);
struct bio_vec;
typedef int (merge_bvec_fn) (request_queue_t *, struct bio *, struct bio_vec *);
......@@ -315,8 +315,6 @@ struct request_queue
unsigned long bounce_pfn;
int bounce_gfp;
struct list_head plug_list;
/*
* various queue flags, see QUEUE_* below
*/
......@@ -370,8 +368,9 @@ struct request_queue
#define QUEUE_FLAG_WRITEFULL 4 /* read queue has been filled */
#define QUEUE_FLAG_DEAD 5 /* queue being torn down */
#define QUEUE_FLAG_REENTER 6 /* Re-entrancy avoidance */
#define QUEUE_FLAG_PLUGGED 7 /* queue is plugged */
#define blk_queue_plugged(q) !list_empty(&(q)->plug_list)
#define blk_queue_plugged(q) test_bit(QUEUE_FLAG_PLUGGED, &(q)->queue_flags)
#define blk_queue_tagged(q) test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags)
#define blk_queue_stopped(q) test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags)
......@@ -515,7 +514,7 @@ extern int scsi_cmd_ioctl(struct gendisk *, unsigned int, unsigned long);
extern void blk_start_queue(request_queue_t *q);
extern void blk_stop_queue(request_queue_t *q);
extern void __blk_stop_queue(request_queue_t *q);
extern void blk_run_queue(request_queue_t *q);
extern void blk_run_queue(request_queue_t *);
extern void blk_queue_activity_fn(request_queue_t *, activity_fn *, void *);
extern struct request *blk_rq_map_user(request_queue_t *, int, void __user *, unsigned int);
extern int blk_rq_unmap_user(struct request *, void __user *, unsigned int);
......@@ -526,6 +525,18 @@ static inline request_queue_t *bdev_get_queue(struct block_device *bdev)
return bdev->bd_disk->queue;
}
/*
 * Invoke the unplug hook of @bdi.  A NULL @bdi or a bdi without an
 * unplug_io_fn is silently ignored.
 */
static inline void blk_run_backing_dev(struct backing_dev_info *bdi)
{
	if (!bdi || !bdi->unplug_io_fn)
		return;

	bdi->unplug_io_fn(bdi);
}
/*
 * Unplug the backing device of @mapping.  A NULL @mapping is a no-op.
 */
static inline void blk_run_address_space(struct address_space *mapping)
{
	if (!mapping)
		return;

	blk_run_backing_dev(mapping->backing_dev_info);
}
/*
* end_request() and friends. Must be called with the request queue spinlock
* acquired. All functions called within end_request() _must_be_ atomic.
......@@ -572,7 +583,7 @@ extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bd
extern int blk_rq_map_sg(request_queue_t *, struct request *, struct scatterlist *);
extern void blk_dump_rq_flags(struct request *, char *);
extern void generic_unplug_device(void *);
extern void generic_unplug_device(request_queue_t *);
extern long nr_blockdev_pages(void);
int blk_get_queue(request_queue_t *);
......
......@@ -83,6 +83,8 @@ extern int leases_enable, dir_notify_enable, lease_break_time;
#define WRITE 1
#define READA 2 /* read-ahead - don't block if no resources */
#define SPECIAL 4 /* For non-blockdevice requests in request queue */
#define READ_SYNC (READ | (1 << BIO_RW_SYNC))
#define WRITE_SYNC (WRITE | (1 << BIO_RW_SYNC))
#define SEL_IN 1
#define SEL_OUT 2
......
......@@ -76,6 +76,7 @@ extern void md_handle_safemode(mddev_t *mddev);
extern void md_done_sync(mddev_t *mddev, int blocks, int ok);
extern void md_sync_acct(mdk_rdev_t *rdev, unsigned long nr_sectors);
extern void md_error (mddev_t *mddev, mdk_rdev_t *rdev);
extern void md_unplug_mddev(mddev_t *mddev);
extern void md_print_devices (void);
......
......@@ -326,7 +326,6 @@ do { \
if (condition) \
break; \
spin_unlock_irq(&lock); \
blk_run_queues(); \
schedule(); \
spin_lock_irq(&lock); \
} \
......@@ -341,30 +340,5 @@ do { \
__wait_event_lock_irq(wq, condition, lock); \
} while (0)
#define __wait_disk_event(wq, condition) \
do { \
wait_queue_t __wait; \
init_waitqueue_entry(&__wait, current); \
\
add_wait_queue(&wq, &__wait); \
for (;;) { \
set_current_state(TASK_UNINTERRUPTIBLE); \
if (condition) \
break; \
blk_run_queues(); \
schedule(); \
} \
current->state = TASK_RUNNING; \
remove_wait_queue(&wq, &__wait); \
} while (0)
#define wait_disk_event(wq, condition) \
do { \
if (condition) \
break; \
__wait_disk_event(wq, condition); \
} while (0)
#endif
......@@ -197,6 +197,8 @@ extern int shmem_unuse(swp_entry_t entry, struct page *page);
#define SWAP_AGAIN 1
#define SWAP_FAIL 2
extern void swap_unplug_io_fn(struct backing_dev_info *);
#ifdef CONFIG_SWAP
/* linux/mm/page_io.c */
extern int swap_readpage(struct file *, struct page *);
......@@ -232,6 +234,7 @@ extern sector_t map_swap_page(struct swap_info_struct *, pgoff_t);
extern struct swap_info_struct *get_swap_info_struct(unsigned);
extern int can_share_swap_page(struct page *);
extern int remove_exclusive_swap_page(struct page *);
struct backing_dev_info;
extern struct swap_list_t swap_list;
extern spinlock_t swaplock;
......
......@@ -84,7 +84,6 @@ static void free_some_memory(void)
while (shrink_all_memory(10000))
printk(".");
printk("|\n");
blk_run_queues();
}
......
......@@ -859,7 +859,6 @@ static int end_io(struct bio * bio, unsigned int num, int err)
static void wait_io(void)
{
blk_run_queues();
while(atomic_read(&io_done))
io_schedule();
}
......@@ -898,7 +897,7 @@ static int submit(int rw, pgoff_t page_off, void * page)
if (rw == WRITE)
bio_set_pages_dirty(bio);
start_io();
submit_bio(rw,bio);
submit_bio(rw | (1 << BIO_RW_SYNC), bio);
wait_io();
Done:
bio_put(bio);
......
......@@ -707,11 +707,6 @@ int software_suspend(void)
free_some_memory();
/* No need to invalidate any vfsmnt list --
* they will be valid after resume, anyway.
*/
blk_run_queues();
/* Save state of all device drivers, and stop them. */
if ((res = device_suspend(4))==0)
/* If stopping device drivers worked, we proceed basically into
......
......@@ -119,8 +119,10 @@ void remove_from_page_cache(struct page *page)
static inline int sync_page(struct page *page)
{
struct address_space *mapping = page->mapping;
struct address_space *mapping;
smp_mb();
mapping = page->mapping;
if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
return mapping->a_ops->sync_page(page);
return 0;
......
......@@ -234,8 +234,6 @@ void * mempool_alloc(mempool_t *pool, int gfp_mask)
if (!(gfp_mask & __GFP_WAIT))
return NULL;
blk_run_queues();
prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);
mb();
if (!pool->curr_nr)
......
......@@ -18,6 +18,7 @@
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <asm/pgalloc.h>
#include <asm/uaccess.h>
......@@ -571,3 +572,7 @@ unsigned long get_unmapped_area(struct file *file, unsigned long addr,
void pte_chain_init(void)
{
}
/*
 * Empty stub: does nothing.  The parameter is named (and unused) because
 * a function definition may not omit parameter names before C23.
 */
void swap_unplug_io_fn(struct backing_dev_info *unused_bdi)
{
}
......@@ -15,11 +15,16 @@
#include <linux/backing-dev.h>
#include <linux/pagevec.h>
/*
* No-op unplug hook: the body is empty, so calling it has no effect.
* @bdi is not used.
*/
void default_unplug_io_fn(struct backing_dev_info *bdi)
{
}
EXPORT_SYMBOL(default_unplug_io_fn);
struct backing_dev_info default_backing_dev_info = {
.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE,
.state = 0,
.unplug_io_fn = default_unplug_io_fn,
};
EXPORT_SYMBOL_GPL(default_backing_dev_info);
/*
......@@ -32,7 +37,6 @@ file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping)
ra->ra_pages = mapping->backing_dev_info->ra_pages;
ra->average = ra->ra_pages / 2;
}
EXPORT_SYMBOL(file_ra_state_init);
/*
......
......@@ -169,6 +169,7 @@ static struct vm_operations_struct shmem_vm_ops;
static struct backing_dev_info shmem_backing_dev_info = {
.ra_pages = 0, /* No readahead */
.memory_backed = 1, /* Does not contribute to dirty memory */
.unplug_io_fn = default_unplug_io_fn,
};
LIST_HEAD(shmem_inodes);
......
......@@ -19,6 +19,7 @@
static struct backing_dev_info swap_backing_dev_info = {
.ra_pages = 0, /* No readahead */
.memory_backed = 1, /* Does not contribute to dirty memory */
.unplug_io_fn = swap_unplug_io_fn,
};
extern struct address_space_operations swap_aops;
......
......@@ -23,6 +23,7 @@
#include <linux/module.h>
#include <linux/rmap-locking.h>
#include <linux/security.h>
#include <linux/backing-dev.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
......@@ -44,8 +45,64 @@ struct swap_list_t swap_list = {-1, -1};
struct swap_info_struct swap_info[MAX_SWAPFILES];
/*
* Array of backing blockdevs, for swap_unplug_io_fn.  We need this because
* a backing device's unplug_io_fn can sleep and we cannot hold swap_list_lock
* while calling it.  And swap_list_lock cannot be turned into a semaphore.
*/
static DECLARE_MUTEX(swap_bdevs_sem);
static struct block_device *swap_bdevs[MAX_SWAPFILES];
#define SWAPFILE_CLUSTER 256
/*
* Caller holds swap_bdevs_sem
*/
/*
 * Store @bdev in the first free (NULL) slot of swap_bdevs[].
 * BUG()s when every slot is already occupied.
 */
static void install_swap_bdev(struct block_device *bdev)
{
	struct block_device **slot;

	for (slot = swap_bdevs; slot < swap_bdevs + MAX_SWAPFILES; slot++) {
		if (*slot != NULL)
			continue;
		*slot = bdev;
		return;
	}

	BUG();
}
/*
 * Remove @bdev from swap_bdevs[], closing the gap so the used entries
 * stay contiguous at the front of the array (swap_unplug_io_fn stops at
 * the first NULL entry).  BUG()s if @bdev is not in the array.
 *
 * Caller holds swap_bdevs_sem.
 */
static void remove_swap_bdev(struct block_device *bdev)
{
	int i;

	for (i = 0; i < MAX_SWAPFILES; i++) {
		if (swap_bdevs[i] != bdev)
			continue;

		/*
		 * Shift the tail down by one slot.  Source and destination
		 * overlap, so this must be memmove(); memcpy() on
		 * overlapping ranges is undefined.
		 */
		memmove(&swap_bdevs[i], &swap_bdevs[i + 1],
			(MAX_SWAPFILES - i - 1) * sizeof(*swap_bdevs));
		/* the last slot is now stale; clear it */
		swap_bdevs[MAX_SWAPFILES - 1] = NULL;
		return;
	}

	BUG();
}
/*
 * Unplug hook for swap: call the unplug_io_fn of the backing_dev_info
 * behind every block device recorded in swap_bdevs[], stopping at the
 * first empty slot.  The argument is ignored.  swap_bdevs_sem is held
 * for the whole walk.
 */
void swap_unplug_io_fn(struct backing_dev_info *unused_bdi)
{
	int slot = 0;

	down(&swap_bdevs_sem);
	while (slot < MAX_SWAPFILES) {
		struct block_device *dev = swap_bdevs[slot];
		struct backing_dev_info *bdi;

		/* used entries are packed at the front; NULL ends the list */
		if (!dev)
			break;

		bdi = dev->bd_inode->i_mapping->backing_dev_info;
		bdi->unplug_io_fn(bdi);
		slot++;
	}
	up(&swap_bdevs_sem);
}
static inline int scan_swap_map(struct swap_info_struct *si)
{
unsigned long offset;
......@@ -1088,6 +1145,7 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
swap_list_unlock();
goto out_dput;
}
down(&swap_bdevs_sem);
swap_list_lock();
swap_device_lock(p);
swap_file = p->swap_file;
......@@ -1099,6 +1157,8 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
destroy_swap_extents(p);
swap_device_unlock(p);
swap_list_unlock();
remove_swap_bdev(p->bdev);
up(&swap_bdevs_sem);
vfree(swap_map);
if (S_ISBLK(mapping->host->i_mode)) {
struct block_device *bdev = I_BDEV(mapping->host);
......@@ -1440,6 +1500,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
if (error)
goto bad_swap;
down(&swap_bdevs_sem);
swap_list_lock();
swap_device_lock(p);
p->flags = SWP_ACTIVE;
......@@ -1465,6 +1526,8 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
}
swap_device_unlock(p);
swap_list_unlock();
install_swap_bdev(p->bdev);
up(&swap_bdevs_sem);
error = 0;
goto out;
bad_swap:
......@@ -1484,7 +1547,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
destroy_swap_extents(p);
if (swap_map)
vfree(swap_map);
if (swap_file && !IS_ERR(swap_file))
if (swap_file)
filp_close(swap_file, NULL);
out:
if (page && !IS_ERR(page)) {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment