ll_rw_blk.c 72.9 KB
Newer Older
Linus Torvalds's avatar
Linus Torvalds committed
1 2 3 4 5 6 7 8
/*
 *  linux/drivers/block/ll_rw_blk.c
 *
 * Copyright (C) 1991, 1992 Linus Torvalds
 * Copyright (C) 1994,      Karl Keyte: Added support for disk statistics
 * Elevator latency, (C) 2000  Andrea Arcangeli <andrea@suse.de> SuSE
 * Queue request tables / lock, selectable elevator, Jens Axboe <axboe@suse.de>
 * kernel-doc documentation started by NeilBrown <neilb@cse.unsw.edu.au> -  July2000
Linus Torvalds's avatar
Linus Torvalds committed
9
 * bio rewrite, highmem i/o, etc, Jens Axboe <axboe@suse.de> - may 2001
Linus Torvalds's avatar
Linus Torvalds committed
10 11 12 13 14 15
 */

/*
 * This handles all read/write requests to block devices
 */
#include <linux/config.h>
16 17 18
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/backing-dev.h>
19
#include <linux/bio.h>
20
#include <linux/blkdev.h>
21
#include <linux/highmem.h>
Linus Torvalds's avatar
Linus Torvalds committed
22
#include <linux/mm.h>
23 24
#include <linux/kernel_stat.h>
#include <linux/string.h>
Linus Torvalds's avatar
Linus Torvalds committed
25
#include <linux/init.h>
26
#include <linux/bootmem.h>	/* for max_pfn/max_low_pfn */
Linus Torvalds's avatar
Linus Torvalds committed
27
#include <linux/completion.h>
Linus Torvalds's avatar
Linus Torvalds committed
28
#include <linux/slab.h>
29
#include <linux/swap.h>
Linus Torvalds's avatar
Linus Torvalds committed
30

31
static void blk_unplug_work(void *data);
32
static void blk_unplug_timeout(unsigned long data);
33

Linus Torvalds's avatar
Linus Torvalds committed
34 35 36 37 38
/*
 * For the allocated request tables
 */
static kmem_cache_t *request_cachep;

Jens Axboe's avatar
Jens Axboe committed
39 40 41
/*
 * plug management
 */
Andrew Morton's avatar
Andrew Morton committed
42
static LIST_HEAD(blk_plug_list);
43
static spinlock_t blk_plug_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
Linus Torvalds's avatar
Linus Torvalds committed
44

45
static wait_queue_head_t congestion_wqh[2];
46

47 48 49 50 51 52 53
/*
 * Controlling structure to kblockd
 */
static struct workqueue_struct *kblockd_workqueue; 

unsigned long blk_max_low_pfn, blk_max_pfn;

54 55 56 57 58
/* Amount of time in which a process may batch requests */
#define BLK_BATCH_TIME	(HZ/50UL)

/* Number of requests a "batching" process may submit */
#define BLK_BATCH_REQ	32
59

60
/*
61
 * Return the threshold (number of used requests) at which the queue is
62 63 64
 * considered to be congested.  It include a little hysteresis to keep the
 * context switch rate down.
 */
65
static inline int queue_congestion_on_threshold(struct request_queue *q)
66 67 68
{
	int ret;

69 70 71 72 73
	ret = q->nr_requests - (q->nr_requests / 8) + 1;

	if (ret > q->nr_requests)
		ret = q->nr_requests;

74 75 76 77 78 79
	return ret;
}

/*
 * The threshold at which a queue is considered to be uncongested
 */
80
static inline int queue_congestion_off_threshold(struct request_queue *q)
81 82 83
{
	int ret;

84 85 86 87 88
	ret = q->nr_requests - (q->nr_requests / 8) - 1;

	if (ret < 1)
		ret = 1;

89 90 91
	return ret;
}

92 93 94 95 96
/*
 * A queue has just exitted congestion.  Note this in the global counter of
 * congested queues, and wake up anyone who was waiting for requests to be
 * put back.
 */
97 98 99
static void clear_queue_congested(request_queue_t *q, int rw)
{
	enum bdi_state bit;
100
	wait_queue_head_t *wqh = &congestion_wqh[rw];
101 102

	bit = (rw == WRITE) ? BDI_write_congested : BDI_read_congested;
103 104 105
	clear_bit(bit, &q->backing_dev_info.state);
	if (waitqueue_active(wqh))
		wake_up(wqh);
106 107
}

108 109 110 111
/*
 * A queue has just entered congestion.  Flag that in the queue's VM-visible
 * state flags and increment the global gounter of congested queues.
 */
112 113 114 115 116
static void set_queue_congested(request_queue_t *q, int rw)
{
	enum bdi_state bit;

	bit = (rw == WRITE) ? BDI_write_congested : BDI_read_congested;
117
	set_bit(bit, &q->backing_dev_info.state);
118 119
}

Andrew Morton's avatar
Andrew Morton committed
120
/**
121
 * blk_get_backing_dev_info - get the address of a queue's backing_dev_info
Andrew Morton's avatar
Andrew Morton committed
122 123
 * @dev:	device
 *
Andrew Morton's avatar
Andrew Morton committed
124
 * Locates the passed device's request queue and returns the address of its
125
 * backing_dev_info
Andrew Morton's avatar
Andrew Morton committed
126
 *
Andrew Morton's avatar
Andrew Morton committed
127
 * Will return NULL if the request queue cannot be located.
Andrew Morton's avatar
Andrew Morton committed
128
 */
129
struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev)
Andrew Morton's avatar
Andrew Morton committed
130
{
131
	struct backing_dev_info *ret = NULL;
132
	request_queue_t *q = bdev_get_queue(bdev);
Andrew Morton's avatar
Andrew Morton committed
133

Andrew Morton's avatar
Andrew Morton committed
134
	if (q)
135
		ret = &q->backing_dev_info;
Andrew Morton's avatar
Andrew Morton committed
136 137 138
	return ret;
}

Jens Axboe's avatar
Jens Axboe committed
139 140 141 142 143 144
void blk_queue_activity_fn(request_queue_t *q, activity_fn *fn, void *data)
{
	q->activity_fn = fn;
	q->activity_data = data;
}

145 146 147 148 149 150 151 152 153 154 155
/**
 * blk_queue_prep_rq - set a prepare_request function for queue
 * @q:		queue
 * @pfn:	prepare_request function
 *
 * It's possible for a queue to register a prepare_request callback which
 * is invoked before the request is handed to the request_fn. The goal of
 * the function is to prepare a request for I/O, it can be used to build a
 * cdb from the request data for instance.
 *
 */
Linus Torvalds's avatar
Linus Torvalds committed
156 157 158 159 160
void blk_queue_prep_rq(request_queue_t *q, prep_rq_fn *pfn)
{
	q->prep_rq_fn = pfn;
}

161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179
/**
 * blk_queue_merge_bvec - set a merge_bvec function for queue
 * @q:		queue
 * @mbfn:	merge_bvec_fn
 *
 * Usually queues have static limitations on the max sectors or segments that
 * we can put in a request. Stacking drivers may have some settings that
 * are dynamic, and thus we have to query the queue whether it is ok to
 * add a new bio_vec to a bio at a given offset or not. If the block device
 * has such limitations, it needs to register a merge_bvec_fn to control
 * the size of bio's sent to it. Per default now merge_bvec_fn is defined for
 * a queue, and only the fixed limits are honored.
 *
 */
void blk_queue_merge_bvec(request_queue_t *q, merge_bvec_fn *mbfn)
{
	q->merge_bvec_fn = mbfn;
}

Linus Torvalds's avatar
Linus Torvalds committed
180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198
/**
 * blk_queue_make_request - define an alternate make_request function for a device
 * @q:  the request queue for the device to be affected
 * @mfn: the alternate make_request function
 *
 * Description:
 *    The normal way for &struct bios to be passed to a device
 *    driver is for them to be collected into requests on a request
 *    queue, and then to allow the device driver to select requests
 *    off that queue when it is ready.  This works well for many block
 *    devices. However some block devices (typically virtual devices
 *    such as md or lvm) do not benefit from the processing on the
 *    request queue, and are served best by having the requests passed
 *    directly to them.  This can be achieved by providing a function
 *    to blk_queue_make_request().
 *
 * Caveat:
 *    The driver that does this *must* be able to deal appropriately
 *    with buffers in "highmemory". This can be accomplished by either calling
Andrew Morton's avatar
Andrew Morton committed
199
 *    __bio_kmap_atomic() to get a temporary kernel mapping, or by calling
Linus Torvalds's avatar
Linus Torvalds committed
200 201 202
 *    blk_queue_bounce() to create a buffer in normal memory.
 **/
void blk_queue_make_request(request_queue_t * q, make_request_fn * mfn)
Linus Torvalds's avatar
Linus Torvalds committed
203
{
Linus Torvalds's avatar
Linus Torvalds committed
204 205 206
	/*
	 * set defaults
	 */
207
	q->nr_requests = BLKDEV_MAX_RQ;
Linus Torvalds's avatar
Linus Torvalds committed
208 209
	q->max_phys_segments = MAX_PHYS_SEGMENTS;
	q->max_hw_segments = MAX_HW_SEGMENTS;
Linus Torvalds's avatar
Linus Torvalds committed
210
	q->make_request_fn = mfn;
211 212
	q->backing_dev_info.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
	q->backing_dev_info.state = 0;
213
	q->backing_dev_info.memory_backed = 0;
Linus Torvalds's avatar
Linus Torvalds committed
214 215
	blk_queue_max_sectors(q, MAX_SECTORS);
	blk_queue_hardsect_size(q, 512);
Jens Axboe's avatar
Jens Axboe committed
216
	blk_queue_dma_alignment(q, 511);
Linus Torvalds's avatar
Linus Torvalds committed
217

218 219 220 221 222 223 224
	q->unplug_thresh = 4;		/* hmm */
	q->unplug_delay = (3 * HZ) / 1000;	/* 3 milliseconds */
	if (q->unplug_delay == 0)
		q->unplug_delay = 1;

	INIT_WORK(&q->unplug_work, blk_unplug_work, q);

225 226 227
	q->unplug_timer.function = blk_unplug_timeout;
	q->unplug_timer.data = (unsigned long)q;

Linus Torvalds's avatar
Linus Torvalds committed
228 229 230 231 232
	/*
	 * by default assume old behaviour and bounce for any highmem page
	 */
	blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH);

Neil Brown's avatar
Neil Brown committed
233
	INIT_LIST_HEAD(&q->plug_list);
Jens Axboe's avatar
Jens Axboe committed
234 235

	blk_queue_activity_fn(q, NULL, NULL);
Linus Torvalds's avatar
Linus Torvalds committed
236 237 238
}

/**
Linus Torvalds's avatar
Linus Torvalds committed
239 240 241
 * blk_queue_bounce_limit - set bounce buffer limit for queue
 * @q:  the request queue for the device
 * @dma_addr:   bus address limit
Linus Torvalds's avatar
Linus Torvalds committed
242 243
 *
 * Description:
Linus Torvalds's avatar
Linus Torvalds committed
244 245 246 247 248
 *    Different hardware can have different requirements as to what pages
 *    it can do I/O directly to. A low level driver can call
 *    blk_queue_bounce_limit to have lower memory pages allocated as bounce
 *    buffers for doing I/O to pages residing above @page. By default
 *    the block layer sets this to the highest numbered "low" memory page.
Linus Torvalds's avatar
Linus Torvalds committed
249
 **/
Linus Torvalds's avatar
Linus Torvalds committed
250
void blk_queue_bounce_limit(request_queue_t *q, u64 dma_addr)
Linus Torvalds's avatar
Linus Torvalds committed
251
{
Linus Torvalds's avatar
Linus Torvalds committed
252 253 254
	unsigned long bounce_pfn = dma_addr >> PAGE_SHIFT;
	unsigned long mb = dma_addr >> 20;
	static request_queue_t *last_q;
Linus Torvalds's avatar
Linus Torvalds committed
255

Linus Torvalds's avatar
Linus Torvalds committed
256 257 258 259 260
	/*
	 * set appropriate bounce gfp mask -- unfortunately we don't have a
	 * full 4GB zone, so we have to resort to low memory for any bounces.
	 * ISA has its own < 16MB zone.
	 */
Linus Torvalds's avatar
Linus Torvalds committed
261 262
	if (bounce_pfn < blk_max_low_pfn) {
		BUG_ON(dma_addr < BLK_BOUNCE_ISA);
Linus Torvalds's avatar
Linus Torvalds committed
263 264 265
		init_emergency_isa_pool();
		q->bounce_gfp = GFP_NOIO | GFP_DMA;
	} else
Andrew Morton's avatar
Andrew Morton committed
266
		q->bounce_gfp = GFP_NOIO;
Linus Torvalds's avatar
Linus Torvalds committed
267

Linus Torvalds's avatar
Linus Torvalds committed
268 269 270 271 272 273 274 275
	/*
	 * keep this for debugging for now...
	 */
	if (dma_addr != BLK_BOUNCE_HIGH && q != last_q) {
		printk("blk: queue %p, ", q);
		if (dma_addr == BLK_BOUNCE_ANY)
			printk("no I/O memory limit\n");
		else
Linus Torvalds's avatar
Linus Torvalds committed
276
			printk("I/O limit %luMb (mask 0x%Lx)\n", mb, (long long) dma_addr);
Linus Torvalds's avatar
Linus Torvalds committed
277
	}
Linus Torvalds's avatar
Linus Torvalds committed
278

Linus Torvalds's avatar
Linus Torvalds committed
279 280
	q->bounce_pfn = bounce_pfn;
	last_q = q;
Linus Torvalds's avatar
Linus Torvalds committed
281 282
}

Linus Torvalds's avatar
Linus Torvalds committed
283

Linus Torvalds's avatar
Linus Torvalds committed
284
/**
Linus Torvalds's avatar
Linus Torvalds committed
285 286 287
 * blk_queue_max_sectors - set max sectors for a request for this queue
 * @q:  the request queue for the device
 * @max_sectors:  max sectors in the usual 512b unit
Linus Torvalds's avatar
Linus Torvalds committed
288 289
 *
 * Description:
Linus Torvalds's avatar
Linus Torvalds committed
290 291 292 293 294
 *    Enables a low level driver to set an upper limit on the size of
 *    received requests.
 **/
void blk_queue_max_sectors(request_queue_t *q, unsigned short max_sectors)
{
295 296 297 298 299
	if ((max_sectors << 9) < PAGE_CACHE_SIZE) {
		max_sectors = 1 << (PAGE_CACHE_SHIFT - 9);
		printk("%s: set to minimum %d\n", __FUNCTION__, max_sectors);
	}

Linus Torvalds's avatar
Linus Torvalds committed
300 301 302 303
	q->max_sectors = max_sectors;
}

/**
Linus Torvalds's avatar
Linus Torvalds committed
304
 * blk_queue_max_phys_segments - set max phys segments for a request for this queue
Linus Torvalds's avatar
Linus Torvalds committed
305 306
 * @q:  the request queue for the device
 * @max_segments:  max number of segments
Linus Torvalds's avatar
Linus Torvalds committed
307
 *
Linus Torvalds's avatar
Linus Torvalds committed
308 309
 * Description:
 *    Enables a low level driver to set an upper limit on the number of
Linus Torvalds's avatar
Linus Torvalds committed
310 311
 *    physical data segments in a request.  This would be the largest sized
 *    scatter list the driver could handle.
Linus Torvalds's avatar
Linus Torvalds committed
312
 **/
Linus Torvalds's avatar
Linus Torvalds committed
313
void blk_queue_max_phys_segments(request_queue_t *q, unsigned short max_segments)
Linus Torvalds's avatar
Linus Torvalds committed
314
{
315 316 317 318 319
	if (!max_segments) {
		max_segments = 1;
		printk("%s: set to minimum %d\n", __FUNCTION__, max_segments);
	}

Linus Torvalds's avatar
Linus Torvalds committed
320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335
	q->max_phys_segments = max_segments;
}

/**
 * blk_queue_max_hw_segments - set max hw segments for a request for this queue
 * @q:  the request queue for the device
 * @max_segments:  max number of segments
 *
 * Description:
 *    Enables a low level driver to set an upper limit on the number of
 *    hw data segments in a request.  This would be the largest number of
 *    address/length pairs the host adapter can actually give as once
 *    to the device.
 **/
void blk_queue_max_hw_segments(request_queue_t *q, unsigned short max_segments)
{
336 337 338 339 340
	if (!max_segments) {
		max_segments = 1;
		printk("%s: set to minimum %d\n", __FUNCTION__, max_segments);
	}

Linus Torvalds's avatar
Linus Torvalds committed
341
	q->max_hw_segments = max_segments;
Linus Torvalds's avatar
Linus Torvalds committed
342 343 344
}

/**
Linus Torvalds's avatar
Linus Torvalds committed
345 346 347
 * blk_queue_max_segment_size - set max segment size for blk_rq_map_sg
 * @q:  the request queue for the device
 * @max_size:  max size of segment in bytes
Linus Torvalds's avatar
Linus Torvalds committed
348 349
 *
 * Description:
Linus Torvalds's avatar
Linus Torvalds committed
350 351 352 353 354
 *    Enables a low level driver to set an upper limit on the size of a
 *    coalesced segment
 **/
void blk_queue_max_segment_size(request_queue_t *q, unsigned int max_size)
{
355 356 357 358 359
	if (max_size < PAGE_CACHE_SIZE) {
		max_size = PAGE_CACHE_SIZE;
		printk("%s: set to minimum %d\n", __FUNCTION__, max_size);
	}

Linus Torvalds's avatar
Linus Torvalds committed
360 361 362 363 364 365 366
	q->max_segment_size = max_size;
}

/**
 * blk_queue_hardsect_size - set hardware sector size for the queue
 * @q:  the request queue for the device
 * @size:  the hardware sector size, in bytes
Linus Torvalds's avatar
Linus Torvalds committed
367
 *
Linus Torvalds's avatar
Linus Torvalds committed
368 369 370 371 372
 * Description:
 *   This should typically be set to the lowest possible sector size
 *   that the hardware can operate on (possible without reverting to
 *   even internal read-modify-write operations). Usually the default
 *   of 512 covers most hardware.
Linus Torvalds's avatar
Linus Torvalds committed
373
 **/
Linus Torvalds's avatar
Linus Torvalds committed
374 375 376 377
void blk_queue_hardsect_size(request_queue_t *q, unsigned short size)
{
	q->hardsect_size = size;
}
Linus Torvalds's avatar
Linus Torvalds committed
378

379 380 381 382 383
/*
 * Returns the minimum that is _not_ zero, unless both are zero.
 */
#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r))

384 385 386 387 388 389 390
/**
 * blk_queue_stack_limits - inherit underlying queue limits for stacked drivers
 * @t:	the stacking driver (top)
 * @b:  the underlying device (bottom)
 **/
void blk_queue_stack_limits(request_queue_t *t, request_queue_t *b)
{
391 392 393
	/* zero is "infinity" */
	t->max_sectors = min_not_zero(t->max_sectors,b->max_sectors);

394 395 396 397 398 399
	t->max_phys_segments = min(t->max_phys_segments,b->max_phys_segments);
	t->max_hw_segments = min(t->max_hw_segments,b->max_hw_segments);
	t->max_segment_size = min(t->max_segment_size,b->max_segment_size);
	t->hardsect_size = max(t->hardsect_size,b->hardsect_size);
}

Linus Torvalds's avatar
Linus Torvalds committed
400 401 402 403 404 405 406
/**
 * blk_queue_segment_boundary - set boundary rules for segment merging
 * @q:  the request queue for the device
 * @mask:  the memory boundary mask
 **/
void blk_queue_segment_boundary(request_queue_t *q, unsigned long mask)
{
407 408 409 410 411
	if (mask < PAGE_CACHE_SIZE - 1) {
		mask = PAGE_CACHE_SIZE - 1;
		printk("%s: set to minimum %lx\n", __FUNCTION__, mask);
	}

Linus Torvalds's avatar
Linus Torvalds committed
412 413 414
	q->seg_boundary_mask = mask;
}

Jens Axboe's avatar
Jens Axboe committed
415 416 417 418 419 420 421 422 423 424 425 426 427 428 429
/**
 * blk_queue_dma_alignment - set dma length and memory alignment
 * @q:  the request queue for the device
 * @dma_mask:  alignment mask
 *
 * description:
 *    set required memory and length aligment for direct dma transactions.
 *    this is used when buiding direct io requests for the queue.
 *
 **/
void blk_queue_dma_alignment(request_queue_t *q, int mask)
{
	q->dma_alignment = mask;
}

Jens Axboe's avatar
Jens Axboe committed
430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445
/**
 * blk_queue_find_tag - find a request by its tag and queue
 *
 * @q:	 The request queue for the device
 * @tag: The tag of the request
 *
 * Notes:
 *    Should be used when a device returns a tag and you want to match
 *    it with a request.
 *
 *    no locks need be held.
 **/
struct request *blk_queue_find_tag(request_queue_t *q, int tag)
{
	struct blk_queue_tag *bqt = q->queue_tags;

Jens Axboe's avatar
Jens Axboe committed
446
	if (unlikely(bqt == NULL || tag >= bqt->real_max_depth))
Jens Axboe's avatar
Jens Axboe committed
447 448 449 450
		return NULL;

	return bqt->tag_index[tag];
}
Jens Axboe's avatar
Jens Axboe committed
451

452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467
/**
 * blk_queue_free_tags - release tag maintenance info
 * @q:  the request queue for the device
 *
 *  Notes:
 *    blk_cleanup_queue() will take care of calling this function, if tagging
 *    has been used. So there's usually no need to call this directly, unless
 *    tagging is just being disabled but the queue remains in function.
 **/
void blk_queue_free_tags(request_queue_t *q)
{
	struct blk_queue_tag *bqt = q->queue_tags;

	if (!bqt)
		return;

468 469 470
	if (atomic_dec_and_test(&bqt->refcnt)) {
		BUG_ON(bqt->busy);
		BUG_ON(!list_empty(&bqt->busy_list));
471

472 473
		kfree(bqt->tag_index);
		bqt->tag_index = NULL;
474

475 476 477 478 479
		kfree(bqt->tag_map);
		bqt->tag_map = NULL;

		kfree(bqt);
	}
480 481 482 483 484

	q->queue_tags = NULL;
	q->queue_flags &= ~(1 << QUEUE_FLAG_QUEUED);
}

485 486
static int
init_tag_map(request_queue_t *q, struct blk_queue_tag *tags, int depth)
487 488 489
{
	int bits, i;

490 491 492 493
	if (depth > q->nr_requests * 2) {
		depth = q->nr_requests * 2;
		printk(KERN_ERR "%s: adjusted depth to %d\n",
				__FUNCTION__, depth);
494
	}
495 496 497

	tags->tag_index = kmalloc(depth * sizeof(struct request *), GFP_ATOMIC);
	if (!tags->tag_index)
Jens Axboe's avatar
Jens Axboe committed
498
		goto fail;
499 500 501 502

	bits = (depth / BLK_TAGS_PER_LONG) + 1;
	tags->tag_map = kmalloc(bits * sizeof(unsigned long), GFP_ATOMIC);
	if (!tags->tag_map)
Jens Axboe's avatar
Jens Axboe committed
503
		goto fail;
504

Jens Axboe's avatar
Jens Axboe committed
505 506
	memset(tags->tag_index, 0, depth * sizeof(struct request *));
	memset(tags->tag_map, 0, bits * sizeof(unsigned long));
507
	tags->max_depth = depth;
Jens Axboe's avatar
Jens Axboe committed
508
	tags->real_max_depth = bits * BITS_PER_LONG;
509 510 511 512 513

	/*
	 * set the upper bits if the depth isn't a multiple of the word size
	 */
	for (i = depth; i < bits * BLK_TAGS_PER_LONG; i++)
514
		__set_bit(i, tags->tag_map);
515

516 517 518
	INIT_LIST_HEAD(&tags->busy_list);
	tags->busy = 0;
	atomic_set(&tags->refcnt, 1);
Jens Axboe's avatar
Jens Axboe committed
519 520 521 522 523 524 525 526 527 528 529
	return 0;
fail:
	kfree(tags->tag_index);
	return -ENOMEM;
}

/**
 * blk_queue_init_tags - initialize the queue tag info
 * @q:  the request queue for the device
 * @depth:  the maximum queue depth supported
 **/
530 531
int blk_queue_init_tags(request_queue_t *q, int depth,
			struct blk_queue_tag *tags)
Jens Axboe's avatar
Jens Axboe committed
532
{
533 534 535 536
	if (!tags) {
		tags = kmalloc(sizeof(struct blk_queue_tag), GFP_ATOMIC);
		if (!tags)
			goto fail;
Jens Axboe's avatar
Jens Axboe committed
537

538 539 540 541
		if (init_tag_map(q, tags, depth))
			goto fail;
	} else
		atomic_inc(&tags->refcnt);
Jens Axboe's avatar
Jens Axboe committed
542

543 544 545 546 547 548 549
	/*
	 * assign it, all done
	 */
	q->queue_tags = tags;
	q->queue_flags |= (1 << QUEUE_FLAG_QUEUED);
	return 0;
fail:
Jens Axboe's avatar
Jens Axboe committed
550
	kfree(tags);
551 552 553
	return -ENOMEM;
}

Jens Axboe's avatar
Jens Axboe committed
554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586
/**
 * blk_queue_resize_tags - change the queueing depth
 * @q:  the request queue for the device
 * @new_depth: the new max command queueing depth
 *
 *  Notes:
 *    Must be called with the queue lock held.
 **/
int blk_queue_resize_tags(request_queue_t *q, int new_depth)
{
	struct blk_queue_tag *bqt = q->queue_tags;
	struct request **tag_index;
	unsigned long *tag_map;
	int bits, max_depth;

	if (!bqt)
		return -ENXIO;

	/*
	 * don't bother sizing down
	 */
	if (new_depth <= bqt->real_max_depth) {
		bqt->max_depth = new_depth;
		return 0;
	}

	/*
	 * save the old state info, so we can copy it back
	 */
	tag_index = bqt->tag_index;
	tag_map = bqt->tag_map;
	max_depth = bqt->real_max_depth;

587
	if (init_tag_map(q, bqt, new_depth))
Jens Axboe's avatar
Jens Axboe committed
588 589 590 591
		return -ENOMEM;

	memcpy(bqt->tag_index, tag_index, max_depth * sizeof(struct request *));
	bits = max_depth / BLK_TAGS_PER_LONG;
Jens Axboe's avatar
Jens Axboe committed
592
	memcpy(bqt->tag_map, tag_map, bits * sizeof(unsigned long));
Jens Axboe's avatar
Jens Axboe committed
593 594 595 596 597 598

	kfree(tag_index);
	kfree(tag_map);
	return 0;
}

599 600 601
/**
 * blk_queue_end_tag - end tag operations for a request
 * @q:  the request queue for the device
602
 * @rq: the request that has completed
603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619
 *
 *  Description:
 *    Typically called when end_that_request_first() returns 0, meaning
 *    all transfers have been done for a request. It's important to call
 *    this function before end_that_request_last(), as that will put the
 *    request back on the free list thus corrupting the internal tag list.
 *
 *  Notes:
 *   queue lock must be held.
 **/
void blk_queue_end_tag(request_queue_t *q, struct request *rq)
{
	struct blk_queue_tag *bqt = q->queue_tags;
	int tag = rq->tag;

	BUG_ON(tag == -1);

Jens Axboe's avatar
Jens Axboe committed
620
	if (unlikely(tag >= bqt->real_max_depth))
621 622 623 624 625 626 627
		return;

	if (unlikely(!__test_and_clear_bit(tag, bqt->tag_map))) {
		printk("attempt to clear non-busy tag (%d)\n", tag);
		return;
	}

628
	list_del_init(&rq->queuelist);
629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646
	rq->flags &= ~REQ_QUEUED;
	rq->tag = -1;

	if (unlikely(bqt->tag_index[tag] == NULL))
		printk("tag %d is missing\n", tag);

	bqt->tag_index[tag] = NULL;
	bqt->busy--;
}

/**
 * blk_queue_start_tag - find a free tag and assign it
 * @q:  the request queue for the device
 * @rq:  the block request that needs tagging
 *
 *  Description:
 *    This can either be used as a stand-alone helper, or possibly be
 *    assigned as the queue &prep_rq_fn (in which case &struct request
Jens Axboe's avatar
Jens Axboe committed
647 648 649 650 651 652
 *    automagically gets a tag assigned). Note that this function
 *    assumes that any type of request can be queued! if this is not
 *    true for your device, you must check the request type before
 *    calling this function.  The request will also be removed from
 *    the request queue, so it's the drivers responsibility to readd
 *    it if it should need to be restarted for some reason.
653 654 655 656 657 658 659 660 661 662
 *
 *  Notes:
 *   queue lock must be held.
 **/
int blk_queue_start_tag(request_queue_t *q, struct request *rq)
{
	struct blk_queue_tag *bqt = q->queue_tags;
	unsigned long *map = bqt->tag_map;
	int tag = 0;

Jens Axboe's avatar
Jens Axboe committed
663 664
	if (unlikely((rq->flags & REQ_QUEUED))) {
		printk(KERN_ERR 
665 666
		       "request %p for device [%s] already tagged %d",
		       rq, rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->tag);
Jens Axboe's avatar
Jens Axboe committed
667 668
		BUG();
	}
669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703

	for (map = bqt->tag_map; *map == -1UL; map++) {
		tag += BLK_TAGS_PER_LONG;

		if (tag >= bqt->max_depth)
			return 1;
	}

	tag += ffz(*map);
	__set_bit(tag, bqt->tag_map);

	rq->flags |= REQ_QUEUED;
	rq->tag = tag;
	bqt->tag_index[tag] = rq;
	blkdev_dequeue_request(rq);
	list_add(&rq->queuelist, &bqt->busy_list);
	bqt->busy++;
	return 0;
}

/**
 * blk_queue_invalidate_tags - invalidate all pending tags
 * @q:  the request queue for the device
 *
 *  Description:
 *   Hardware conditions may dictate a need to stop all pending requests.
 *   In this case, we will safely clear the block side of the tag queue and
 *   readd all requests to the request queue in the right order.
 *
 *  Notes:
 *   queue lock must be held.
 **/
void blk_queue_invalidate_tags(request_queue_t *q)
{
	struct blk_queue_tag *bqt = q->queue_tags;
704
	struct list_head *tmp, *n;
705 706
	struct request *rq;

707
	list_for_each_safe(tmp, n, &bqt->busy_list) {
708 709
		rq = list_entry_rq(tmp);

710 711
		if (rq->tag == -1) {
			printk("bad tag found on list\n");
712
			list_del_init(&rq->queuelist);
Jens Axboe's avatar
Jens Axboe committed
713
			rq->flags &= ~REQ_QUEUED;
714 715 716
		} else
			blk_queue_end_tag(q, rq);

717
		rq->flags &= ~REQ_STARTED;
Jens Axboe's avatar
Jens Axboe committed
718
		__elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 0);
719 720 721
	}
}

Martin Dalecki's avatar
Martin Dalecki committed
722 723
static char *rq_flags[] = {
	"REQ_RW",
Jens Axboe's avatar
Jens Axboe committed
724
	"REQ_FAILFAST",
Jens Axboe's avatar
Jens Axboe committed
725 726
	"REQ_SOFTBARRIER",
	"REQ_HARDBARRIER",
Martin Dalecki's avatar
Martin Dalecki committed
727 728 729 730 731 732 733 734
	"REQ_CMD",
	"REQ_NOMERGE",
	"REQ_STARTED",
	"REQ_DONTPREP",
	"REQ_QUEUED",
	"REQ_PC",
	"REQ_BLOCK_PC",
	"REQ_SENSE",
735 736
	"REQ_FAILED",
	"REQ_QUIET",
737
	"REQ_SPECIAL",
738 739 740
	"REQ_DRIVE_CMD",
	"REQ_DRIVE_TASK",
	"REQ_DRIVE_TASKFILE",
Jens Axboe's avatar
Jens Axboe committed
741 742 743 744
	"REQ_PREEMPT",
	"REQ_PM_SUSPEND",
	"REQ_PM_RESUME",
	"REQ_PM_SHUTDOWN",
Martin Dalecki's avatar
Martin Dalecki committed
745
};
Linus Torvalds's avatar
Linus Torvalds committed
746 747 748 749 750

void blk_dump_rq_flags(struct request *rq, char *msg)
{
	int bit;

751 752
	printk("%s: dev %s: flags = ", msg,
		rq->rq_disk ? rq->rq_disk->disk_name : "?");
Linus Torvalds's avatar
Linus Torvalds committed
753 754 755 756 757 758 759
	bit = 0;
	do {
		if (rq->flags & (1 << bit))
			printk("%s ", rq_flags[bit]);
		bit++;
	} while (bit < __REQ_NR_BITS);

760
	printk("\nsector %llu, nr/cnr %lu/%u\n", (unsigned long long)rq->sector,
Linus Torvalds's avatar
Linus Torvalds committed
761 762
						       rq->nr_sectors,
						       rq->current_nr_sectors);
763 764 765 766 767 768 769 770
	printk("bio %p, biotail %p, buffer %p, data %p, len %u\n", rq->bio, rq->biotail, rq->buffer, rq->data, rq->data_len);

	if (rq->flags & (REQ_BLOCK_PC | REQ_PC)) {
		printk("cdb: ");
		for (bit = 0; bit < sizeof(rq->cmd); bit++)
			printk("%02x ", rq->cmd[bit]);
		printk("\n");
	}
Linus Torvalds's avatar
Linus Torvalds committed
771 772
}

Linus Torvalds's avatar
Linus Torvalds committed
773 774 775
void blk_recount_segments(request_queue_t *q, struct bio *bio)
{
	struct bio_vec *bv, *bvprv = NULL;
Linus Torvalds's avatar
Linus Torvalds committed
776
	int i, nr_phys_segs, nr_hw_segs, seg_size, cluster;
Linus Torvalds's avatar
Linus Torvalds committed
777 778 779 780 781

	if (unlikely(!bio->bi_io_vec))
		return;

	cluster = q->queue_flags & (1 << QUEUE_FLAG_CLUSTER);
Linus Torvalds's avatar
Linus Torvalds committed
782
	seg_size = nr_phys_segs = nr_hw_segs = 0;
Linus Torvalds's avatar
Linus Torvalds committed
783 784
	bio_for_each_segment(bv, bio, i) {
		if (bvprv && cluster) {
785
			if (seg_size + bv->bv_len > q->max_segment_size)
Linus Torvalds's avatar
Linus Torvalds committed
786
				goto new_segment;
787
			if (!BIOVEC_PHYS_MERGEABLE(bvprv, bv))
Linus Torvalds's avatar
Linus Torvalds committed
788
				goto new_segment;
789
			if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bv))
Linus Torvalds's avatar
Linus Torvalds committed
790 791 792 793 794 795 796
				goto new_segment;

			seg_size += bv->bv_len;
			bvprv = bv;
			continue;
		}
new_segment:
797 798 799 800
		if (!bvprv || !BIOVEC_VIRT_MERGEABLE(bvprv, bv))
			nr_hw_segs++;

		nr_phys_segs++;
Linus Torvalds's avatar
Linus Torvalds committed
801
		bvprv = bv;
Linus Torvalds's avatar
Linus Torvalds committed
802
		seg_size = bv->bv_len;
Linus Torvalds's avatar
Linus Torvalds committed
803 804
	}

Linus Torvalds's avatar
Linus Torvalds committed
805 806
	bio->bi_phys_segments = nr_phys_segs;
	bio->bi_hw_segments = nr_hw_segs;
Linus Torvalds's avatar
Linus Torvalds committed
807 808 809 810
	bio->bi_flags |= (1 << BIO_SEG_VALID);
}


811
int blk_phys_contig_segment(request_queue_t *q, struct bio *bio,
Linus Torvalds's avatar
Linus Torvalds committed
812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831
				   struct bio *nxt)
{
	if (!(q->queue_flags & (1 << QUEUE_FLAG_CLUSTER)))
		return 0;

	if (!BIOVEC_PHYS_MERGEABLE(__BVEC_END(bio), __BVEC_START(nxt)))
		return 0;
	if (bio->bi_size + nxt->bi_size > q->max_segment_size)
		return 0;

	/*
	 * bio and nxt are contigous in memory, check if the queue allows
	 * these two to be merged into one
	 */
	if (BIO_SEG_BOUNDARY(q, bio, nxt))
		return 1;

	return 0;
}

832
int blk_hw_contig_segment(request_queue_t *q, struct bio *bio,
Linus Torvalds's avatar
Linus Torvalds committed
833
				 struct bio *nxt)
Linus Torvalds's avatar
Linus Torvalds committed
834
{
Linus Torvalds's avatar
Linus Torvalds committed
835 836 837
	if (!(q->queue_flags & (1 << QUEUE_FLAG_CLUSTER)))
		return 0;

Linus Torvalds's avatar
Linus Torvalds committed
838
	if (!BIOVEC_VIRT_MERGEABLE(__BVEC_END(bio), __BVEC_START(nxt)))
Linus Torvalds's avatar
Linus Torvalds committed
839
		return 0;
Linus Torvalds's avatar
Linus Torvalds committed
840 841
	if (bio->bi_size + nxt->bi_size > q->max_segment_size)
		return 0;
Linus Torvalds's avatar
Linus Torvalds committed
842 843

	/*
Linus Torvalds's avatar
Linus Torvalds committed
844 845
	 * bio and nxt are contigous in memory, check if the queue allows
	 * these two to be merged into one
Linus Torvalds's avatar
Linus Torvalds committed
846
	 */
Linus Torvalds's avatar
Linus Torvalds committed
847
	if (BIO_SEG_BOUNDARY(q, bio, nxt))
Linus Torvalds's avatar
Linus Torvalds committed
848 849 850
		return 1;

	return 0;
Linus Torvalds's avatar
Linus Torvalds committed
851 852
}

Linus Torvalds's avatar
Linus Torvalds committed
853 854
/*
 * map a request to scatterlist, return number of sg entries setup. Caller
Linus Torvalds's avatar
Linus Torvalds committed
855
 * must make sure sg can hold rq->nr_phys_segments entries
Linus Torvalds's avatar
Linus Torvalds committed
856 857
 */
int blk_rq_map_sg(request_queue_t *q, struct request *rq, struct scatterlist *sg)
Linus Torvalds's avatar
Linus Torvalds committed
858
{
Linus Torvalds's avatar
Linus Torvalds committed
859
	struct bio_vec *bvec, *bvprv;
Linus Torvalds's avatar
Linus Torvalds committed
860
	struct bio *bio;
Linus Torvalds's avatar
Linus Torvalds committed
861
	int nsegs, i, cluster;
Linus Torvalds's avatar
Linus Torvalds committed
862 863

	nsegs = 0;
Linus Torvalds's avatar
Linus Torvalds committed
864
	cluster = q->queue_flags & (1 << QUEUE_FLAG_CLUSTER);
Linus Torvalds's avatar
Linus Torvalds committed
865 866 867 868

	/*
	 * for each bio in rq
	 */
Linus Torvalds's avatar
Linus Torvalds committed
869
	bvprv = NULL;
Linus Torvalds's avatar
Linus Torvalds committed
870 871 872 873 874 875 876
	rq_for_each_bio(bio, rq) {
		/*
		 * for each segment in bio
		 */
		bio_for_each_segment(bvec, bio, i) {
			int nbytes = bvec->bv_len;

Linus Torvalds's avatar
Linus Torvalds committed
877
			if (bvprv && cluster) {
Linus Torvalds's avatar
Linus Torvalds committed
878
				if (sg[nsegs - 1].length + nbytes > q->max_segment_size)
Linus Torvalds's avatar
Linus Torvalds committed
879 880
					goto new_segment;

Linus Torvalds's avatar
Linus Torvalds committed
881
				if (!BIOVEC_PHYS_MERGEABLE(bvprv, bvec))
Linus Torvalds's avatar
Linus Torvalds committed
882 883 884
					goto new_segment;
				if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bvec))
					goto new_segment;
Linus Torvalds's avatar
Linus Torvalds committed
885 886 887 888

				sg[nsegs - 1].length += nbytes;
			} else {
new_segment:
Linus Torvalds's avatar
Linus Torvalds committed
889
				memset(&sg[nsegs],0,sizeof(struct scatterlist));
Linus Torvalds's avatar
Linus Torvalds committed
890 891 892 893 894 895
				sg[nsegs].page = bvec->bv_page;
				sg[nsegs].length = nbytes;
				sg[nsegs].offset = bvec->bv_offset;

				nsegs++;
			}
Linus Torvalds's avatar
Linus Torvalds committed
896
			bvprv = bvec;
Linus Torvalds's avatar
Linus Torvalds committed
897 898 899 900 901 902 903 904 905 906
		} /* segments in bio */
	} /* bios in rq */

	return nsegs;
}

/*
 * the standard queue merge functions, can be overridden with device
 * specific ones if so desired
 */
Linus Torvalds's avatar
Linus Torvalds committed
907 908 909 910

static inline int ll_new_mergeable(request_queue_t *q,
				   struct request *req,
				   struct bio *bio)
Linus Torvalds's avatar
Linus Torvalds committed
911
{
Linus Torvalds's avatar
Linus Torvalds committed
912
	int nr_phys_segs = bio_phys_segments(q, bio);
Linus Torvalds's avatar
Linus Torvalds committed
913

Linus Torvalds's avatar
Linus Torvalds committed
914 915
	if (req->nr_phys_segments + nr_phys_segs > q->max_phys_segments) {
		req->flags |= REQ_NOMERGE;
Linus Torvalds's avatar
Linus Torvalds committed
916
		q->last_merge = NULL;
Linus Torvalds's avatar
Linus Torvalds committed
917
		return 0;
Linus Torvalds's avatar
Linus Torvalds committed
918
	}
Linus Torvalds's avatar
Linus Torvalds committed
919

Linus Torvalds's avatar
Linus Torvalds committed
920 921 922 923 924 925 926 927 928 929 930 931 932
	/*
	 * A hw segment is just getting larger, bump just the phys
	 * counter.
	 */
	req->nr_phys_segments += nr_phys_segs;
	return 1;
}

static inline int ll_new_hw_segment(request_queue_t *q,
				    struct request *req,
				    struct bio *bio)
{
	int nr_hw_segs = bio_hw_segments(q, bio);
Linus Torvalds's avatar
Linus Torvalds committed
933
	int nr_phys_segs = bio_phys_segments(q, bio);
Linus Torvalds's avatar
Linus Torvalds committed
934

Linus Torvalds's avatar
Linus Torvalds committed
935 936
	if (req->nr_hw_segments + nr_hw_segs > q->max_hw_segments
	    || req->nr_phys_segments + nr_phys_segs > q->max_phys_segments) {
Linus Torvalds's avatar
Linus Torvalds committed
937
		req->flags |= REQ_NOMERGE;
Linus Torvalds's avatar
Linus Torvalds committed
938
		q->last_merge = NULL;
Linus Torvalds's avatar
Linus Torvalds committed
939 940 941 942 943 944 945 946
		return 0;
	}

	/*
	 * This will form the start of a new hw segment.  Bump both
	 * counters.
	 */
	req->nr_hw_segments += nr_hw_segs;
Linus Torvalds's avatar
Linus Torvalds committed
947
	req->nr_phys_segments += nr_phys_segs;
Linus Torvalds's avatar
Linus Torvalds committed
948
	return 1;
Linus Torvalds's avatar
Linus Torvalds committed
949 950 951
}

static int ll_back_merge_fn(request_queue_t *q, struct request *req, 
Linus Torvalds's avatar
Linus Torvalds committed
952
			    struct bio *bio)
Linus Torvalds's avatar
Linus Torvalds committed
953
{
Linus Torvalds's avatar
Linus Torvalds committed
954 955
	if (req->nr_sectors + bio_sectors(bio) > q->max_sectors) {
		req->flags |= REQ_NOMERGE;
Linus Torvalds's avatar
Linus Torvalds committed
956
		q->last_merge = NULL;
Linus Torvalds's avatar
Linus Torvalds committed
957
		return 0;
Linus Torvalds's avatar
Linus Torvalds committed
958 959
	}

Linus Torvalds's avatar
Linus Torvalds committed
960
	if (BIOVEC_VIRT_MERGEABLE(__BVEC_END(req->biotail), __BVEC_START(bio)))
Linus Torvalds's avatar
Linus Torvalds committed
961 962 963
		return ll_new_mergeable(q, req, bio);

	return ll_new_hw_segment(q, req, bio);
Linus Torvalds's avatar
Linus Torvalds committed
964 965 966
}

static int ll_front_merge_fn(request_queue_t *q, struct request *req, 
Linus Torvalds's avatar
Linus Torvalds committed
967
			     struct bio *bio)
Linus Torvalds's avatar
Linus Torvalds committed
968
{
Linus Torvalds's avatar
Linus Torvalds committed
969 970
	if (req->nr_sectors + bio_sectors(bio) > q->max_sectors) {
		req->flags |= REQ_NOMERGE;
Linus Torvalds's avatar
Linus Torvalds committed
971
		q->last_merge = NULL;
Linus Torvalds's avatar
Linus Torvalds committed
972
		return 0;
Linus Torvalds's avatar
Linus Torvalds committed
973 974
	}

Linus Torvalds's avatar
Linus Torvalds committed
975
	if (BIOVEC_VIRT_MERGEABLE(__BVEC_END(bio), __BVEC_START(req->bio)))
Linus Torvalds's avatar
Linus Torvalds committed
976 977 978
		return ll_new_mergeable(q, req, bio);

	return ll_new_hw_segment(q, req, bio);
Linus Torvalds's avatar
Linus Torvalds committed
979 980 981
}

static int ll_merge_requests_fn(request_queue_t *q, struct request *req,
Linus Torvalds's avatar
Linus Torvalds committed
982
				struct request *next)
Linus Torvalds's avatar
Linus Torvalds committed
983
{
Linus Torvalds's avatar
Linus Torvalds committed
984
	int total_phys_segments = req->nr_phys_segments +next->nr_phys_segments;
Linus Torvalds's avatar
Linus Torvalds committed
985
	int total_hw_segments = req->nr_hw_segments + next->nr_hw_segments;
Linus Torvalds's avatar
Linus Torvalds committed
986

Linus Torvalds's avatar
Linus Torvalds committed
987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009
	/*
	 * First check if the either of the requests are re-queued
	 * requests.  Can't merge them if they are.
	 */
	if (req->special || next->special)
		return 0;

	/*
	 * Will it become to large?
	 */
	if ((req->nr_sectors + next->nr_sectors) > q->max_sectors)
		return 0;

	total_phys_segments = req->nr_phys_segments + next->nr_phys_segments;
	if (blk_phys_contig_segment(q, req->biotail, next->bio))
		total_phys_segments--;

	if (total_phys_segments > q->max_phys_segments)
		return 0;

	total_hw_segments = req->nr_hw_segments + next->nr_hw_segments;
	if (blk_hw_contig_segment(q, req->biotail, next->bio))
		total_hw_segments--;
1010

Linus Torvalds's avatar
Linus Torvalds committed
1011
	if (total_hw_segments > q->max_hw_segments)
Linus Torvalds's avatar
Linus Torvalds committed
1012 1013
		return 0;

Linus Torvalds's avatar
Linus Torvalds committed
1014 1015 1016
	/* Merge is OK... */
	req->nr_phys_segments = total_phys_segments;
	req->nr_hw_segments = total_hw_segments;
Linus Torvalds's avatar
Linus Torvalds committed
1017 1018 1019 1020 1021 1022 1023 1024
	return 1;
}

/*
 * "plug" the device if there are no outstanding requests: this will
 * force the transfer to start only after we have put all the requests
 * on the list.
 *
Jens Axboe's avatar
Jens Axboe committed
1025 1026
 * This is called with interrupts off and no requests on the queue and
 * with the queue lock held.
Linus Torvalds's avatar
Linus Torvalds committed
1027
 */
Linus Torvalds's avatar
Linus Torvalds committed
1028
void blk_plug_device(request_queue_t *q)
Linus Torvalds's avatar
Linus Torvalds committed
1029
{
1030
	WARN_ON(!irqs_disabled());
Jens Axboe's avatar
Jens Axboe committed
1031 1032 1033 1034 1035 1036 1037

	/*
	 * don't plug a stopped queue, it must be paired with blk_start_queue()
	 * which will restart the queueing
	 */
	if (!blk_queue_plugged(q)
	    && !test_bit(QUEUE_FLAG_STOPPED, &q->queue_flags)) {
1038 1039
		spin_lock(&blk_plug_lock);
		list_add_tail(&q->plug_list, &blk_plug_list);
1040
		mod_timer(&q->unplug_timer, jiffies + q->unplug_delay);
1041 1042
		spin_unlock(&blk_plug_lock);
	}
Linus Torvalds's avatar
Linus Torvalds committed
1043 1044
}

Jens Axboe's avatar
Jens Axboe committed
1045 1046 1047 1048
/*
 * remove the queue from the plugged list, if present. called with
 * queue lock held and interrupts disabled.
 */
1049
int blk_remove_plug(request_queue_t *q)
Jens Axboe's avatar
Jens Axboe committed
1050
{
1051
	WARN_ON(!irqs_disabled());
Jens Axboe's avatar
Jens Axboe committed
1052 1053 1054
	if (blk_queue_plugged(q)) {
		spin_lock(&blk_plug_lock);
		list_del_init(&q->plug_list);
1055
		del_timer(&q->unplug_timer);
Jens Axboe's avatar
Jens Axboe committed
1056 1057 1058 1059 1060 1061 1062
		spin_unlock(&blk_plug_lock);
		return 1;
	}

	return 0;
}

Linus Torvalds's avatar
Linus Torvalds committed
1063 1064 1065 1066 1067
/*
 * remove the plug and let it rip..
 */
static inline void __generic_unplug_device(request_queue_t *q)
{
1068
	if (test_bit(QUEUE_FLAG_STOPPED, &q->queue_flags))
Jens Axboe's avatar
Jens Axboe committed
1069 1070
		return;

1071
	if (!blk_remove_plug(q))
Linus Torvalds's avatar
Linus Torvalds committed
1072 1073
		return;

1074 1075
	del_timer(&q->unplug_timer);

Linus Torvalds's avatar
Linus Torvalds committed
1076 1077 1078
	/*
	 * was plugged, fire request_fn if queue has stuff to do
	 */
1079
	if (elv_next_request(q))
Linus Torvalds's avatar
Linus Torvalds committed
1080
		q->request_fn(q);
Linus Torvalds's avatar
Linus Torvalds committed
1081 1082
}

Linus Torvalds's avatar
Linus Torvalds committed
1083 1084
/**
 * generic_unplug_device - fire a request queue
Jens Axboe's avatar
Jens Axboe committed
1085
 * @data:    The &request_queue_t in question
Linus Torvalds's avatar
Linus Torvalds committed
1086 1087 1088 1089 1090 1091
 *
 * Description:
 *   Linux uses plugging to build bigger requests queues before letting
 *   the device have at them. If a queue is plugged, the I/O scheduler
 *   is still adding and merging requests on the queue. Once the queue
 *   gets unplugged (either by manually calling this function, or by
1092
 *   calling blk_run_queues()), the request_fn defined for the
Linus Torvalds's avatar
Linus Torvalds committed
1093 1094
 *   queue is invoked and transfers started.
 **/
Linus Torvalds's avatar
Linus Torvalds committed
1095
void generic_unplug_device(void *data)
Jens Axboe's avatar
Jens Axboe committed
1096 1097
{
	request_queue_t *q = data;
Linus Torvalds's avatar
Linus Torvalds committed
1098

Jens Axboe's avatar
Jens Axboe committed
1099
	spin_lock_irq(q->queue_lock);
Linus Torvalds's avatar
Linus Torvalds committed
1100
	__generic_unplug_device(q);
Jens Axboe's avatar
Jens Axboe committed
1101
	spin_unlock_irq(q->queue_lock);
Linus Torvalds's avatar
Linus Torvalds committed
1102 1103
}

1104 1105
static void blk_unplug_work(void *data)
{
Andrew Morton's avatar
Andrew Morton committed
1106 1107
	request_queue_t *q = data;
	q->unplug_fn(q);
1108 1109 1110 1111 1112 1113
}

static void blk_unplug_timeout(unsigned long data)
{
	request_queue_t *q = (request_queue_t *)data;

1114
	kblockd_schedule_work(&q->unplug_work);
1115 1116
}

1117 1118 1119 1120 1121 1122 1123
/**
 * blk_start_queue - restart a previously stopped queue
 * @q:    The &request_queue_t in question
 *
 * Description:
 *   blk_start_queue() will clear the stop flag on the queue, and call
 *   the request_fn for the queue if it was in a stopped state when
1124
 *   entered. Also see blk_stop_queue(). Must not be called from driver
Jens Axboe's avatar
Jens Axboe committed
1125
 *   request function due to recursion issues. Queue lock must be held.
1126
 **/
Jens Axboe's avatar
Jens Axboe committed
1127 1128
void blk_start_queue(request_queue_t *q)
{
Andrew Morton's avatar
Andrew Morton committed
1129 1130
	clear_bit(QUEUE_FLAG_STOPPED, &q->queue_flags);
	schedule_work(&q->unplug_work);
1131 1132
}

1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144
/**
 * blk_stop_queue - stop a queue
 * @q:    The &request_queue_t in question
 *
 * Description:
 *   The Linux block layer assumes that a block driver will consume all
 *   entries on the request queue when the request_fn strategy is called.
 *   Often this will not happen, because of hardware limitations (queue
 *   depth settings). If a device driver gets a 'queue full' response,
 *   or if it simply chooses not to queue more I/O at one point, it can
 *   call this function to prevent the request_fn from being called until
 *   the driver has signalled it's ready to go again. This happens by calling
Jens Axboe's avatar
Jens Axboe committed
1145
 *   blk_start_queue() to restart queue operations. Queue lock must be held.
1146
 **/
Jens Axboe's avatar
Jens Axboe committed
1147 1148
void blk_stop_queue(request_queue_t *q)
{
Jens Axboe's avatar
Jens Axboe committed
1149 1150
	blk_remove_plug(q);
	set_bit(QUEUE_FLAG_STOPPED, &q->queue_flags);
Jens Axboe's avatar
Jens Axboe committed
1151 1152
}

1153 1154 1155 1156
/**
 * blk_run_queue - run a single device queue
 * @q	The queue to run
 */
1157
void blk_run_queue(struct request_queue *q)
1158
{
1159 1160 1161
	unsigned long flags;

	spin_lock_irqsave(q->queue_lock, flags);
1162 1163
	blk_remove_plug(q);
	q->request_fn(q);
1164
	spin_unlock_irqrestore(q->queue_lock, flags);
1165 1166
}

1167 1168 1169 1170 1171 1172 1173 1174
/**
 * blk_run_queues - fire all plugged queues
 *
 * Description:
 *   Start I/O on all plugged queues known to the block layer. Queues that
 *   are currently stopped are ignored. This is equivalent to the older
 *   tq_disk task queue run.
 **/
Jens Axboe's avatar
Jens Axboe committed
1175
#define blk_plug_entry(entry) list_entry((entry), request_queue_t, plug_list)
Jens Axboe's avatar
Jens Axboe committed
1176 1177
void blk_run_queues(void)
{
1178
	LIST_HEAD(local_plug_list);
1179

Jens Axboe's avatar
Jens Axboe committed
1180 1181
	spin_lock_irq(&blk_plug_lock);

Jens Axboe's avatar
Jens Axboe committed
1182
	/*
1183
	 * this will happen fairly often
Jens Axboe's avatar
Jens Axboe committed
1184
	 */
1185 1186
	if (list_empty(&blk_plug_list))
		goto out;
1187

1188
	list_splice_init(&blk_plug_list, &local_plug_list);
Jens Axboe's avatar
Jens Axboe committed
1189 1190 1191
	
	while (!list_empty(&local_plug_list)) {
		request_queue_t *q = blk_plug_entry(local_plug_list.next);
Jens Axboe's avatar
Jens Axboe committed
1192

1193
		spin_unlock_irq(&blk_plug_lock);
Jens Axboe's avatar
Jens Axboe committed
1194
		q->unplug_fn(q);
1195
		spin_lock_irq(&blk_plug_lock);
1196
	}
1197 1198
out:
	spin_unlock_irq(&blk_plug_lock);
Jens Axboe's avatar
Jens Axboe committed
1199 1200
}

Linus Torvalds's avatar
Linus Torvalds committed
1201 1202 1203 1204 1205
/**
 * blk_cleanup_queue: - release a &request_queue_t when it is no longer needed
 * @q:    the request queue to be released
 *
 * Description:
1206 1207 1208 1209 1210 1211
 *     blk_cleanup_queue is the pair to blk_init_queue() or
 *     blk_queue_make_request().  It should be called when a request queue is
 *     being released; typically when a block device is being de-registered.
 *     Currently, its primary task it to free all the &struct request
 *     structures that were allocated to the queue and the queue itself.
 *
1212
 * Caveat:
Linus Torvalds's avatar
Linus Torvalds committed
1213 1214 1215 1216 1217
 *     Hopefully the low level driver will have finished any
 *     outstanding requests first...
 **/
void blk_cleanup_queue(request_queue_t * q)
{
1218
	struct request_list *rl = &q->rq;
Linus Torvalds's avatar
Linus Torvalds committed
1219

1220 1221 1222
	if (!atomic_dec_and_test(&q->refcnt))
		return;

1223 1224
	elevator_exit(q);

1225
	del_timer_sync(&q->unplug_timer);
1226
	kblockd_flush();
1227

1228 1229
	if (rl->rq_pool)
		mempool_destroy(rl->rq_pool);
Linus Torvalds's avatar
Linus Torvalds committed
1230

1231 1232 1233
	if (blk_queue_tagged(q))
		blk_queue_free_tags(q);

1234
	kfree(q);
Linus Torvalds's avatar
Linus Torvalds committed
1235 1236 1237
}

static int blk_init_free_list(request_queue_t *q)
Linus Torvalds's avatar
Linus Torvalds committed
1238
{
1239
	struct request_list *rl = &q->rq;
Linus Torvalds's avatar
Linus Torvalds committed
1240

1241
	rl->count[READ] = rl->count[WRITE] = 0;
1242 1243
	init_waitqueue_head(&rl->wait[READ]);
	init_waitqueue_head(&rl->wait[WRITE]);
Linus Torvalds's avatar
Linus Torvalds committed
1244

1245
	rl->rq_pool = mempool_create(BLKDEV_MIN_RQ, mempool_alloc_slab, mempool_free_slab, request_cachep);
Linus Torvalds's avatar
Linus Torvalds committed
1246

1247 1248
	if (!rl->rq_pool)
		return -ENOMEM;
Linus Torvalds's avatar
Linus Torvalds committed
1249

Linus Torvalds's avatar
Linus Torvalds committed
1250
	return 0;
Linus Torvalds's avatar
Linus Torvalds committed
1251 1252
}

Linus Torvalds's avatar
Linus Torvalds committed
1253
static int __make_request(request_queue_t *, struct bio *);
Linus Torvalds's avatar
Linus Torvalds committed
1254

1255 1256 1257 1258 1259
static elevator_t *chosen_elevator =
#if defined(CONFIG_IOSCHED_AS)
	&iosched_as;
#elif defined(CONFIG_IOSCHED_DEADLINE)
	&iosched_deadline;
1260
#elif defined(CONFIG_IOSCHED_NOOP)
1261
	&elevator_noop;
1262 1263 1264
#else
	NULL;
#error "You must have at least 1 I/O scheduler selected"
1265 1266
#endif

1267
#if defined(CONFIG_IOSCHED_AS) || defined(CONFIG_IOSCHED_DEADLINE) || defined (CONFIG_IOSCHED_NOOP)
1268 1269
static int __init elevator_setup(char *str)
{
1270
#ifdef CONFIG_IOSCHED_DEADLINE
1271 1272
	if (!strcmp(str, "deadline"))
		chosen_elevator = &iosched_deadline;
1273 1274
#endif
#ifdef CONFIG_IOSCHED_AS
1275 1276
	if (!strcmp(str, "as"))
		chosen_elevator = &iosched_as;
1277 1278 1279 1280
#endif
#ifdef CONFIG_IOSCHED_NOOP
	if (!strcmp(str, "noop"))
		chosen_elevator = &elevator_noop;
1281
#endif
1282 1283
	return 1;
}
1284

1285
__setup("elevator=", elevator_setup);
1286
#endif /* CONFIG_IOSCHED_AS || CONFIG_IOSCHED_DEADLINE || CONFIG_IOSCHED_NOOP */
1287

1288 1289 1290 1291 1292 1293 1294 1295
request_queue_t *blk_alloc_queue(int gfp_mask)
{
	request_queue_t *q = kmalloc(sizeof(*q), gfp_mask);

	if (!q)
		return NULL;

	memset(q, 0, sizeof(*q));
1296
	init_timer(&q->unplug_timer);
1297 1298 1299 1300
	atomic_set(&q->refcnt, 1);
	return q;
}

Linus Torvalds's avatar
Linus Torvalds committed
1301 1302 1303 1304
/**
 * blk_init_queue  - prepare a request queue for use with a block device
 * @rfn:  The function to be called to process requests that have been
 *        placed on the queue.
1305
 * @lock: Request queue spin lock
Linus Torvalds's avatar
Linus Torvalds committed
1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321
 *
 * Description:
 *    If a block device wishes to use the standard request handling procedures,
 *    which sorts requests and coalesces adjacent requests, then it must
 *    call blk_init_queue().  The function @rfn will be called when there
 *    are requests on the queue that need to be processed.  If the device
 *    supports plugging, then @rfn may not be called immediately when requests
 *    are available on the queue, but may be called at some time later instead.
 *    Plugged queues are generally unplugged when a buffer belonging to one
 *    of the requests on the queue is needed, or due to memory pressure.
 *
 *    @rfn is not required, or even expected, to remove all requests off the
 *    queue, but only as many as it can handle at a time.  If it does leave
 *    requests on the queue, it is responsible for arranging that the requests
 *    get dealt with eventually.
 *
Linus Torvalds's avatar
Linus Torvalds committed
1322 1323
 *    The queue spin lock must be held while manipulating the requests on the
 *    request queue.
Linus Torvalds's avatar
Linus Torvalds committed
1324
 *
1325 1326 1327
 *    Function returns a pointer to the initialized request queue, or NULL if
 *    it didn't succeed.
 *
Linus Torvalds's avatar
Linus Torvalds committed
1328
 * Note:
Linus Torvalds's avatar
Linus Torvalds committed
1329
 *    blk_init_queue() must be paired with a blk_cleanup_queue() call
Linus Torvalds's avatar
Linus Torvalds committed
1330 1331
 *    when the block device is deactivated (such as at module unload).
 **/
1332
request_queue_t *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock)
Linus Torvalds's avatar
Linus Torvalds committed
1333
{
1334
	request_queue_t *q;
1335
	static int printed;
Linus Torvalds's avatar
Linus Torvalds committed
1336

1337 1338 1339 1340
	q = blk_alloc_queue(GFP_KERNEL);
	if (!q)
		return NULL;

Linus Torvalds's avatar
Linus Torvalds committed
1341
	if (blk_init_free_list(q))
1342
		goto out_init;
Linus Torvalds's avatar
Linus Torvalds committed
1343

1344 1345
	if (!printed) {
		printed = 1;
1346
		printk("Using %s io scheduler\n", chosen_elevator->elevator_name);
1347 1348
	}

1349 1350
	if (elevator_init(q, chosen_elevator))
		goto out_elv;
Linus Torvalds's avatar
Linus Torvalds committed
1351

Linus Torvalds's avatar
Linus Torvalds committed
1352
	q->request_fn		= rfn;
Linus Torvalds's avatar
Linus Torvalds committed
1353 1354 1355
	q->back_merge_fn       	= ll_back_merge_fn;
	q->front_merge_fn      	= ll_front_merge_fn;
	q->merge_requests_fn	= ll_merge_requests_fn;
Linus Torvalds's avatar
Linus Torvalds committed
1356
	q->prep_rq_fn		= NULL;
Jens Axboe's avatar
Jens Axboe committed
1357
	q->unplug_fn		= generic_unplug_device;
Linus Torvalds's avatar
Linus Torvalds committed
1358
	q->queue_flags		= (1 << QUEUE_FLAG_CLUSTER);
Linus Torvalds's avatar
Linus Torvalds committed
1359
	q->queue_lock		= lock;
Andrew Morton's avatar
Andrew Morton committed
1360

Linus Torvalds's avatar
Linus Torvalds committed
1361 1362
	blk_queue_segment_boundary(q, 0xffffffff);

Linus Torvalds's avatar
Linus Torvalds committed
1363 1364
	blk_queue_make_request(q, __make_request);
	blk_queue_max_segment_size(q, MAX_SEGMENT_SIZE);
Linus Torvalds's avatar
Linus Torvalds committed
1365 1366 1367

	blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS);
	blk_queue_max_phys_segments(q, MAX_PHYS_SEGMENTS);
Jens Axboe's avatar
Jens Axboe committed
1368

1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385
	return q;
out_elv:
	blk_cleanup_queue(q);
out_init:
	kfree(q);
	return NULL;
	
}

int blk_get_queue(request_queue_t *q)
{
	if (!test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)) {
		atomic_inc(&q->refcnt);
		return 0;
	}

	return 1;
Linus Torvalds's avatar
Linus Torvalds committed
1386 1387
}

1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407
static inline void blk_free_request(request_queue_t *q, struct request *rq)
{
	elv_put_request(q, rq);
	mempool_free(rq, q->rq.rq_pool);
}

static inline struct request *blk_alloc_request(request_queue_t *q,int gfp_mask)
{
	struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask);

	if (!rq)
		return NULL;

	if (!elv_set_request(q, rq, gfp_mask))
		return rq;

	mempool_free(rq, q->rq.rq_pool);
	return NULL;
}

/*
 * ioc_batching returns true if the ioc is a valid batching request and
 * should be given priority access to a request.
 */
static inline int ioc_batching(struct io_context *ioc)
{
	if (!ioc)
		return 0;

	/*
	 * Make sure the process is able to allocate at least 1 request
	 * even if the batch times out, otherwise we could theoretically
	 * lose wakeups.
	 */
	return ioc->nr_batch_requests == BLK_BATCH_REQ ||
		(ioc->nr_batch_requests > 0
		&& time_before(jiffies, ioc->last_waited + BLK_BATCH_TIME));
}

/*
 * ioc_set_batching sets ioc to be a new "batcher" if it is not one. This
 * will cause the process to be a "batcher" on all queues in the system. This
 * is the behaviour we want though - once it gets a wakeup it should be given
 * a nice run.
 */
void ioc_set_batching(struct io_context *ioc)
{
	if (!ioc || ioc_batching(ioc))
		return;

	ioc->nr_batch_requests = BLK_BATCH_REQ;
	ioc->last_waited = jiffies;
}

/*
 * A request has just been released.  Account for it, update the full and
 * congestion status, wake up any waiters.   Called under q->queue_lock.
 */
static void freed_request(request_queue_t *q, int rw)
{
	struct request_list *rl = &q->rq;

	rl->count[rw]--;
	if (rl->count[rw] < queue_congestion_off_threshold(q))
		clear_queue_congested(q, rw);
	if (rl->count[rw]+1 <= q->nr_requests) {
		smp_mb();
		if (waitqueue_active(&rl->wait[rw]))
			wake_up(&rl->wait[rw]);
		if (!waitqueue_active(&rl->wait[rw]))
			blk_clear_queue_full(q, rw);
	}
}

#define blkdev_free_rq(list) list_entry((list)->next, struct request, queuelist)
/*
 * Get a free request, queue_lock must not be held
 */
static struct request *get_request(request_queue_t *q, int rw, int gfp_mask)
{
	struct request *rq = NULL;
	struct request_list *rl = &q->rq;
	struct io_context *ioc = get_io_context(gfp_mask);

	spin_lock_irq(q->queue_lock);
	if (rl->count[rw]+1 >= q->nr_requests) {
		/*
		 * The queue will fill after this allocation, so set it as
		 * full, and mark this process as "batching". This process
		 * will be allowed to complete a batch of requests, others
		 * will be blocked.
		 */
		if (!blk_queue_full(q, rw)) {
			ioc_set_batching(ioc);
			blk_set_queue_full(q, rw);
		}
	}

	if (blk_queue_full(q, rw)
			&& !ioc_batching(ioc) && !elv_may_queue(q, rw)) {
		/*
		 * The queue is full and the allocating process is not a
		 * "batcher", and not exempted by the IO scheduler
		 */
		spin_unlock_irq(q->queue_lock);
		goto out;
	}

	rl->count[rw]++;
	if (rl->count[rw] >= queue_congestion_on_threshold(q))
		set_queue_congested(q, rw);
	spin_unlock_irq(q->queue_lock);

	rq = blk_alloc_request(q, gfp_mask);
	if (!rq) {
		/*
		 * Allocation failed presumably due to memory. Undo anything
		 * we might have messed up.
		 *
		 * Allocating task should really be put onto the front of the
		 * wait queue, but this is pretty rare.
		 */
		spin_lock_irq(q->queue_lock);
		freed_request(q, rw);
		spin_unlock_irq(q->queue_lock);
		goto out;
	}

	if (ioc_batching(ioc))
		ioc->nr_batch_requests--;

	INIT_LIST_HEAD(&rq->queuelist);

	/*
	 * first three bits are identical in rq->flags and bio->bi_rw,
	 * see bio.h and blkdev.h
	 */
	rq->flags = rw;

	rq->errors = 0;
	rq->rq_status = RQ_ACTIVE;
	rq->bio = rq->biotail = NULL;
	rq->buffer = NULL;
	rq->ref_count = 1;
	rq->q = q;
	rq->rl = rl;
	rq->waiting = NULL;
	rq->special = NULL;
	rq->data = NULL;
	rq->sense = NULL;

out:
	put_io_context(ioc);
	return rq;
}

/*
 * No available requests for this queue, unplug the device and wait for some
 * requests to become available.
 */
static struct request *get_request_wait(request_queue_t *q, int rw)
{
	DEFINE_WAIT(wait);
	struct request *rq;

	generic_unplug_device(q);
	do {
		struct request_list *rl = &q->rq;

		prepare_to_wait_exclusive(&rl->wait[rw], &wait,
				TASK_UNINTERRUPTIBLE);

		rq = get_request(q, rw, GFP_NOIO);

		if (!rq) {
			struct io_context *ioc;

			io_schedule();

			/*
			 * After sleeping, we become a "batching" process and
			 * will be able to allocate at least one request, and
			 * up to a big batch of them for a small period of time.
			 * See ioc_batching, ioc_set_batching
			 */
			ioc = get_io_context(GFP_NOIO);
			ioc_set_batching(ioc);
			put_io_context(ioc);
		}
		finish_wait(&rl->wait[rw], &wait);
	} while (!rq);

	return rq;
}

struct request *blk_get_request(request_queue_t *q, int rw, int gfp_mask)
{
	struct request *rq;

	BUG_ON(rw != READ && rw != WRITE);

	if (gfp_mask & __GFP_WAIT)
		rq = get_request_wait(q, rw);
	else
		rq = get_request(q, rw, gfp_mask);

	return rq;
}
/**
 * blk_requeue_request - put a request back on queue
 * @q:		request queue where request should be inserted
 * @rq:		request to be inserted
 *
 * Description:
 *    Drivers often keep queueing requests until the hardware cannot accept
 *    more, when that condition happens we need to put the request back
 *    on the queue. Must be called with queue lock held.
 */
void blk_requeue_request(request_queue_t *q, struct request *rq)
{
	if (blk_rq_tagged(rq))
		blk_queue_end_tag(q, rq);

	elv_requeue_request(q, rq);
}
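
/*
 * Illustrative sketch (hypothetical driver, not from this file): a request
 * function typically requeues when the hardware refuses another command.
 * Everything except the block layer calls is invented.
 *
 *	static void mydev_request_fn(request_queue_t *q)
 *	{
 *		struct request *rq;
 *
 *		while ((rq = elv_next_request(q)) != NULL) {
 *			blkdev_dequeue_request(rq);
 *			if (mydev_hw_full()) {
 *				// q->queue_lock is already held here
 *				blk_requeue_request(q, rq);
 *				blk_stop_queue(q);
 *				break;
 *			}
 *			mydev_issue(rq);
 *		}
 *	}
 */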

/**
 * blk_insert_request - insert a special request into a request queue
 * @q:		request queue where request should be inserted
 * @rq:		request to be inserted
 * @at_head:	insert request at head or tail of queue
 * @data:	private data
 * @reinsert:	true if request is a reinsertion of a previously processed one
 *
 * Description:
 *    Many block devices need to execute commands asynchronously, so they don't
 *    block the whole kernel from preemption during request execution.  This is
 *    accomplished normally by inserting artificial requests tagged as
 *    REQ_SPECIAL into the corresponding request queue, and letting them be
 *    scheduled for actual execution by the request queue.
 *
 *    We have the option of inserting at the head or the tail of the queue.
 *    Typically we use the tail for new ioctls and so forth.  We use the head
 *    of the queue for things like a QUEUE_FULL message from a device, or a
 *    host that is unable to accept a particular command.
 */
void blk_insert_request(request_queue_t *q, struct request *rq,
			int at_head, void *data, int reinsert)
{
	unsigned long flags;

	/*
	 * tell I/O scheduler that this isn't a regular read/write (ie it
	 * must not attempt merges on this) and that it acts as a soft
	 * barrier
	 */
	rq->flags |= REQ_SPECIAL | REQ_SOFTBARRIER;

	rq->special = data;

	spin_lock_irqsave(q->queue_lock, flags);

	/*
	 * If command is tagged, release the tag
	 */
	if (reinsert) {
		blk_requeue_request(q, rq);
	} else {
		int where = ELEVATOR_INSERT_BACK;

		if (at_head)
			where = ELEVATOR_INSERT_FRONT;

		if (blk_rq_tagged(rq))
			blk_queue_end_tag(q, rq);

		drive_stat_acct(rq, rq->nr_sectors, 1);
		__elv_add_request(q, rq, where, 0);
	}
	q->request_fn(q);
	spin_unlock_irqrestore(q->queue_lock, flags);
}
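
/*
 * Illustrative sketch (hypothetical, not from this file): a driver ioctl
 * path can push a special command through the queue and sleep until the
 * driver's completion path (end_that_request_last) finishes and frees it.
 * The command pointer and function name are invented.
 *
 *	static int mydev_do_command(request_queue_t *q, void *cmd)
 *	{
 *		DECLARE_COMPLETION(done);
 *		struct request *rq;
 *
 *		rq = blk_get_request(q, WRITE, __GFP_WAIT);
 *		rq->waiting = &done;
 *		blk_insert_request(q, rq, 0, cmd, 0);	// tail, not a reinsert
 *		wait_for_completion(&done);		// rq is gone after this
 *		return 0;
 *	}
 */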

void drive_stat_acct(struct request *rq, int nr_sectors, int new_io)
{
	int rw = rq_data_dir(rq);

	if (!blk_fs_request(rq) || !rq->rq_disk)
		return;

	if (rw == READ) {
		disk_stat_add(rq->rq_disk, read_sectors, nr_sectors);
		if (!new_io)
			disk_stat_inc(rq->rq_disk, read_merges);
	} else if (rw == WRITE) {
		disk_stat_add(rq->rq_disk, write_sectors, nr_sectors);
		if (!new_io)
			disk_stat_inc(rq->rq_disk, write_merges);
	}
	if (new_io) {
		disk_round_stats(rq->rq_disk);
		rq->rq_disk->in_flight++;
	}
}

/*
 * add-request adds a request to the linked list.
 * queue lock is held and interrupts disabled, as we muck with the
 * request queue list.
 */
static inline void add_request(request_queue_t * q, struct request * req)
{
	drive_stat_acct(req, req->nr_sectors, 1);

	if (q->activity_fn)
		q->activity_fn(q->activity_data, rq_data_dir(req));

	/*
	 * elevator indicated where it wants this request to be
	 * inserted at elevator_merge time
	 */
	__elv_add_request(q, req, ELEVATOR_INSERT_SORT, 0);
}

/*
 * disk_round_stats()	- Round off the performance stats on a struct
 * disk_stats.
 *
 * The average IO queue length and utilisation statistics are maintained
 * by observing the current state of the queue length and the amount of
 * time it has been in this state for.
 *
 * Normally, that accounting is done on IO completion, but that can result
 * in more than a second's worth of IO being accounted for within any one
 * second, leading to >100% utilisation.  To deal with that, we call this
 * function to do a round-off before returning the results when reading
 * /proc/diskstats.  This accounts immediately for all queue usage up to
 * the current jiffies and restarts the counters again.
 */
void disk_round_stats(struct gendisk *disk)
{
	unsigned long now = jiffies;

	disk_stat_add(disk, time_in_queue,
			disk->in_flight * (now - disk->stamp));
	disk->stamp = now;

	if (disk->in_flight)
		disk_stat_add(disk, io_ticks, (now - disk->stamp_idle));
	disk->stamp_idle = now;
}
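
/*
 * Worked example (illustrative numbers only): with 3 requests in flight and
 * 50 jiffies since the last stamp, the round-off above adds 3 * 50 = 150 to
 * time_in_queue and 50 to io_ticks, then resets both stamps so the same
 * interval is never counted twice.
 */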

/*
 * queue lock must be held
 */
void __blk_put_request(request_queue_t *q, struct request *req)
{
	struct request_list *rl = req->rl;

	if (unlikely(!q))
		return;
	if (unlikely(--req->ref_count))
		return;

	elv_completed_request(req->q, req);

	req->rq_status = RQ_INACTIVE;
	req->q = NULL;
	req->rl = NULL;

	/*
	 * Request may not have originated from ll_rw_blk. If not,
	 * it didn't come out of our reserved rq pools
	 */
	if (rl) {
		int rw = rq_data_dir(req);

		BUG_ON(!list_empty(&req->queuelist));

		blk_free_request(q, req);
		freed_request(q, rw);
	}
}

void blk_put_request(struct request *req)
{
	request_queue_t *q = req->q;
	
	/*
	 * if req->q isn't set, this request didn't originate from the
	 * block layer, so it's safe to just disregard it
	 */
	if (q) {
		unsigned long flags;

		spin_lock_irqsave(q->queue_lock, flags);
		__blk_put_request(q, req);
		spin_unlock_irqrestore(q->queue_lock, flags);
	}
}

/**
 * blk_congestion_wait - wait for a queue to become uncongested
 * @rw: READ or WRITE
 * @timeout: timeout in jiffies
 *
 * Waits for up to @timeout jiffies for a queue (any queue) to exit congestion.
 * If no queues are congested then just wait for the next request to be
 * returned.
 */
void blk_congestion_wait(int rw, long timeout)
{
	DEFINE_WAIT(wait);
	wait_queue_head_t *wqh = &congestion_wqh[rw];

	blk_run_queues();
	prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
	io_schedule_timeout(timeout);
	finish_wait(wqh, &wait);
}
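
/*
 * Illustrative sketch (hypothetical caller, not from this file): writeback
 * style loops use this to throttle themselves when queues fill up, e.g.
 *
 *	while (pages_left_to_write()) {
 *		submit_some_writes();
 *		blk_congestion_wait(WRITE, HZ / 10);	// back off briefly
 *	}
 */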

/*
 * Has to be called with the request spinlock acquired
 */
static int attempt_merge(request_queue_t *q, struct request *req,
			  struct request *next)
{
	if (!rq_mergeable(req) || !rq_mergeable(next))
		return 0;

	/*
	 * not contiguous
	 */
	if (req->sector + req->nr_sectors != next->sector)
		return 0;

	if (rq_data_dir(req) != rq_data_dir(next)
	    || req->rq_disk != next->rq_disk
	    || next->waiting || next->special)
		return 0;

	/*
	 * If we are allowed to merge, then append bio list
	 * from next to rq and release next. merge_requests_fn
	 * will have updated segment counts, update sector
	 * counts here.
	 */
	if (!q->merge_requests_fn(q, req, next))
		return 0;

	req->biotail->bi_next = next->bio;
	req->biotail = next->biotail;

	req->nr_sectors = req->hard_nr_sectors += next->hard_nr_sectors;

	elv_merge_requests(q, req, next);

	if (req->rq_disk) {
		disk_round_stats(req->rq_disk);
		req->rq_disk->in_flight--;
	}

	__blk_put_request(q, next);
	return 1;
}

static inline int attempt_back_merge(request_queue_t *q, struct request *rq)
{
	struct request *next = elv_latter_request(q, rq);

	if (next)
		return attempt_merge(q, rq, next);

	return 0;
}

static inline int attempt_front_merge(request_queue_t *q, struct request *rq)
{
	struct request *prev = elv_former_request(q, rq);

	if (prev)
		return attempt_merge(q, prev, rq);

	return 0;
}

/**
 * blk_attempt_remerge  - attempt to remerge active head with next request
 * @q:    The &request_queue_t belonging to the device
 * @rq:   The head request (usually)
 *
 * Description:
 *    For head-active devices, the queue can easily be unplugged so quickly
 *    that proper merging is not done on the front request. This may hurt
 *    performance greatly for some devices. The block layer cannot safely
 *    do merging on that first request for these queues, but the driver can
 *    call this function and make it happen anyway. Only the driver knows
 *    when it is safe to do so.
 **/
void blk_attempt_remerge(request_queue_t *q, struct request *rq)
{
	unsigned long flags;

	spin_lock_irqsave(q->queue_lock, flags);
	attempt_back_merge(q, rq);
	spin_unlock_irqrestore(q->queue_lock, flags);
}
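
/*
 * Illustrative note (hypothetical use, not from this file): a head-active
 * driver would call this at a point it chooses itself, once the hardware is
 * no longer looking at the head request:
 *
 *	blk_attempt_remerge(q, rq);	// takes q->queue_lock internally
 */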

/*
 * Non-locking blk_attempt_remerge variant.
 */
void __blk_attempt_remerge(request_queue_t *q, struct request *rq)
{
	attempt_back_merge(q, rq);
}

static int __make_request(request_queue_t *q, struct bio *bio)
{
	struct request *req, *freereq = NULL;
	int el_ret, rw, nr_sectors, cur_nr_sectors, barrier, ra;
	sector_t sector;

	sector = bio->bi_sector;
	nr_sectors = bio_sectors(bio);
	cur_nr_sectors = bio_cur_sectors(bio);

	rw = bio_data_dir(bio);

	/*
	 * low level driver can indicate that it wants pages above a
	 * certain limit bounced to low memory (ie for highmem, or even
	 * ISA dma in theory)
	 */
	blk_queue_bounce(q, &bio);

	spin_lock_prefetch(q->queue_lock);

	barrier = test_bit(BIO_RW_BARRIER, &bio->bi_rw);

	ra = bio->bi_rw & (1 << BIO_RW_AHEAD);

again:
	spin_lock_irq(q->queue_lock);

	if (elv_queue_empty(q)) {
		blk_plug_device(q);
		goto get_rq;
	}
	if (barrier)
		goto get_rq;

	el_ret = elv_merge(q, &req, bio);
	switch (el_ret) {
		case ELEVATOR_BACK_MERGE:
			BUG_ON(!rq_mergeable(req));

			if (!q->back_merge_fn(q, req, bio))
				break;

			req->biotail->bi_next = bio;
			req->biotail = bio;
			req->nr_sectors = req->hard_nr_sectors += nr_sectors;
			drive_stat_acct(req, nr_sectors, 0);
			if (!attempt_back_merge(q, req))
				elv_merged_request(q, req);
			goto out;

		case ELEVATOR_FRONT_MERGE:
			BUG_ON(!rq_mergeable(req));

			if (!q->front_merge_fn(q, req, bio))
				break;

			bio->bi_next = req->bio;
			req->cbio = req->bio = bio;
			req->nr_cbio_segments = bio_segments(bio);
			req->nr_cbio_sectors = bio_sectors(bio);

			/*
			 * may not be valid. if the low level driver said
			 * it didn't need a bounce buffer then it better
			 * not touch req->buffer either...
			 */
			req->buffer = bio_data(bio);
			req->current_nr_sectors = cur_nr_sectors;
			req->hard_cur_sectors = cur_nr_sectors;
			req->sector = req->hard_sector = sector;
			req->nr_sectors = req->hard_nr_sectors += nr_sectors;
			drive_stat_acct(req, nr_sectors, 0);
			if (!attempt_front_merge(q, req))
				elv_merged_request(q, req);
			goto out;

		/*
		 * elevator says don't/can't merge. get new request
		 */
		case ELEVATOR_NO_MERGE:
			break;

		default:
			printk("elevator returned crap (%d)\n", el_ret);
			BUG();
	}

	/*
	 * Grab a free request from the freelist - if that is empty, check
	 * if we are doing read ahead and abort instead of blocking for
	 * a free slot.
	 */
get_rq:
	if (freereq) {
		req = freereq;
		freereq = NULL;
	} else {
		spin_unlock_irq(q->queue_lock);
		if ((freereq = get_request(q, rw, GFP_ATOMIC)) == NULL) {
			/*
			 * READA bit set
			 */
			if (ra)
				goto end_io;

			freereq = get_request_wait(q, rw);
		}
		goto again;
	}

	/*
	 * first three bits are identical in rq->flags and bio->bi_rw,
	 * see bio.h and blkdev.h
	 */
	req->flags = (bio->bi_rw & 7) | REQ_CMD;

	/*
	 * REQ_BARRIER implies no merging, but let's make it explicit
	 */
	if (barrier)
		req->flags |= (REQ_HARDBARRIER | REQ_NOMERGE);

	/*
	 * don't stack up retries for read ahead
	 */
	if (ra)
		req->flags |= REQ_FAILFAST;

	req->errors = 0;
	req->hard_sector = req->sector = sector;
	req->hard_nr_sectors = req->nr_sectors = nr_sectors;
	req->current_nr_sectors = req->hard_cur_sectors = cur_nr_sectors;
	req->nr_phys_segments = bio_phys_segments(q, bio);
	req->nr_hw_segments = bio_hw_segments(q, bio);
	req->nr_cbio_segments = bio_segments(bio);
	req->nr_cbio_sectors = bio_sectors(bio);
	req->buffer = bio_data(bio);	/* see ->buffer comment above */
	req->waiting = NULL;
	req->cbio = req->bio = req->biotail = bio;
	req->rq_disk = bio->bi_bdev->bd_disk;
	req->start_time = jiffies;

	add_request(q, req);
out:
	if (freereq)
		__blk_put_request(q, freereq);

	if (blk_queue_plugged(q)) {
		int nr_queued = q->rq.count[READ] + q->rq.count[WRITE];

		if (nr_queued == q->unplug_thresh)
			__generic_unplug_device(q);
	}
	spin_unlock_irq(q->queue_lock);
	return 0;

end_io:
	bio_endio(bio, nr_sectors << 9, -EWOULDBLOCK);
	return 0;
}

/*
 * If bio->bi_dev is a partition, remap the location
 */
static inline void blk_partition_remap(struct bio *bio)
{
	struct block_device *bdev = bio->bi_bdev;

	if (bdev != bdev->bd_contains) {
		struct hd_struct *p = bdev->bd_part;

		switch (bio->bi_rw) {
		case READ:
			p->read_sectors += bio_sectors(bio);
			p->reads++;
			break;
		case WRITE:
			p->write_sectors += bio_sectors(bio);
			p->writes++;
			break;
		}
		bio->bi_sector += p->start_sect;
		bio->bi_bdev = bdev->bd_contains;
	}
}

/**
 * generic_make_request: hand a buffer to its device driver for I/O
 * @bio:  The bio describing the location in memory and on the device.
 *
 * generic_make_request() is used to make I/O requests of block
 * devices. It is passed a &struct bio, which describes the I/O that needs
 * to be done.
 *
 * generic_make_request() does not return any status.  The
 * success/failure status of the request, along with notification of
 * completion, is delivered asynchronously through the bio->bi_end_io
 * function described (one day) elsewhere.
 *
 * The caller of generic_make_request must make sure that bi_io_vec
 * are set to describe the memory buffer, and that bi_dev and bi_sector are
 * set to describe the device address, and the
 * bi_end_io and optionally bi_private are set to describe how
 * completion notification should be signaled.
 *
 * generic_make_request and the drivers it calls may use bi_next if this
 * bio happens to be merged with someone else, and may change bi_dev and
 * bi_sector for remaps as it sees fit.  So the values of these fields
 * should NOT be depended on after the call to generic_make_request.
 */
void generic_make_request(struct bio *bio)
{
	request_queue_t *q;
	sector_t maxsector;
	int ret, nr_sectors = bio_sectors(bio);

	/* Test device or partition size, when known. */
	maxsector = bio->bi_bdev->bd_inode->i_size >> 9;
	if (maxsector) {
		sector_t sector = bio->bi_sector;

		if (maxsector < nr_sectors ||
		    maxsector - nr_sectors < sector) {
			char b[BDEVNAME_SIZE];
			/* This may well happen - the kernel calls
			 * bread() without checking the size of the
			 * device, e.g., when mounting a device. */
			printk(KERN_INFO
			       "attempt to access beyond end of device\n");
			printk(KERN_INFO "%s: rw=%ld, want=%Lu, limit=%Lu\n",
			       bdevname(bio->bi_bdev, b),
			       bio->bi_rw,
			       (unsigned long long) sector + nr_sectors,
			       (long long) maxsector);

			set_bit(BIO_EOF, &bio->bi_flags);
			goto end_io;
		}
	}

	/*
	 * Resolve the mapping until finished. (drivers are
	 * still free to implement/resolve their own stacking
	 * by explicitly returning 0)
	 *
	 * NOTE: we don't repeat the blk_size check for each new device.
	 * Stacking drivers are expected to know what they are doing.
	 */
	do {
		char b[BDEVNAME_SIZE];

		q = bdev_get_queue(bio->bi_bdev);
		if (!q) {
			printk(KERN_ERR
			       "generic_make_request: Trying to access "
				"nonexistent block-device %s (%Lu)\n",
				bdevname(bio->bi_bdev, b),
				(long long) bio->bi_sector);
end_io:
			bio_endio(bio, bio->bi_size, -EIO);
			break;
		}

		if (unlikely(bio_sectors(bio) > q->max_sectors)) {
			printk("bio too big device %s (%u > %u)\n",
				bdevname(bio->bi_bdev, b),
				bio_sectors(bio),
				q->max_sectors);
			goto end_io;
		}

		if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))
			goto end_io;

		/*
		 * If this device has partitions, remap block n
		 * of partition p to block n+start(p) of the disk.
		 */
		blk_partition_remap(bio);

		ret = q->make_request_fn(q, bio);
	} while (ret);
}
/**
Linus Torvalds's avatar
Linus Torvalds committed
2190
 * submit_bio: submit a bio to the block device layer for I/O
Linus Torvalds's avatar
Linus Torvalds committed
2191
 * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead)
Linus Torvalds's avatar
Linus Torvalds committed
2192
 * @bio: The &struct bio which describes the I/O
Linus Torvalds's avatar
Linus Torvalds committed
2193
 *
Linus Torvalds's avatar
Linus Torvalds committed
2194 2195 2196
 * submit_bio() is very similar in purpose to generic_make_request(), and
 * uses that function to do most of the work. Both are fairly rough
 * interfaces, @bio must be presetup and ready for I/O.
Linus Torvalds's avatar
Linus Torvalds committed
2197 2198
 *
 */
Linus Torvalds's avatar
Linus Torvalds committed
2199
int submit_bio(int rw, struct bio *bio)
Linus Torvalds's avatar
Linus Torvalds committed
2200
{
2201
	int count = bio_sectors(bio);
Linus Torvalds's avatar
Linus Torvalds committed
2202

Linus Torvalds's avatar
Linus Torvalds committed
2203
	BIO_BUG_ON(!bio->bi_size);
Linus Torvalds's avatar
Linus Torvalds committed
2204 2205 2206
	BIO_BUG_ON(!bio->bi_io_vec);
	bio->bi_rw = rw;
	if (rw & WRITE)
2207
		mod_page_state(pgpgout, count);
Linus Torvalds's avatar
Linus Torvalds committed
2208
	else
2209
		mod_page_state(pgpgin, count);
Linus Torvalds's avatar
Linus Torvalds committed
2210 2211 2212 2213
	generic_make_request(bio);
	return 1;
}

Jens Axboe's avatar
Jens Axboe committed
2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288
/**
 * blk_rq_next_segment
 * @rq:		the request being processed
 *
 * Description:
 *	Points to the next segment in the request if the current segment
 *	is complete. Leaves things unchanged if this segment is not over
 *	or if no more segments are left in this request.
 *
 *	Meant to be used for bio traversal during I/O submission
 *	Does not affect any I/O completions or update completion state
 *	in the request, and does not modify any bio fields.
 *
 *	Decrementing rq->nr_sectors, rq->current_nr_sectors and
 *	rq->nr_cbio_sectors as data is transferred is the caller's
 *	responsibility and should be done before calling this routine.
 **/
void blk_rq_next_segment(struct request *rq)
{
	if (rq->current_nr_sectors > 0)
		return;

	if (rq->nr_cbio_sectors > 0) {
		--rq->nr_cbio_segments;
		rq->current_nr_sectors = blk_rq_vec(rq)->bv_len >> 9;
	} else {
		if ((rq->cbio = rq->cbio->bi_next)) {
			rq->nr_cbio_segments = bio_segments(rq->cbio);
			rq->nr_cbio_sectors = bio_sectors(rq->cbio);
 			rq->current_nr_sectors = bio_cur_sectors(rq->cbio);
		}
 	}

	/* remember the size of this segment before we start I/O */
	rq->hard_cur_sectors = rq->current_nr_sectors;
}

/**
 * process_that_request_first	-	process partial request submission
 * @req:	the request being processed
 * @nr_sectors:	number of sectors I/O has been submitted on
 *
 * Description:
 *	May be used for processing bio's while submitting I/O without
 *	signalling completion. Fails if more data is requested than is
 *	available in the request in which case it doesn't advance any
 *	pointers.
 *
 *	Assumes a request is correctly set up. No sanity checks.
 *
 * Return:
 *	0 - no more data left to submit (not processed)
 *	1 - data available to submit for this request (processed)
 **/
int process_that_request_first(struct request *req, unsigned int nr_sectors)
{
	unsigned int nsect;

	if (req->nr_sectors < nr_sectors)
		return 0;

	req->nr_sectors -= nr_sectors;
	req->sector += nr_sectors;
	while (nr_sectors) {
		nsect = min_t(unsigned, req->current_nr_sectors, nr_sectors);
		req->current_nr_sectors -= nsect;
		nr_sectors -= nsect;
		if (req->cbio) {
			req->nr_cbio_sectors -= nsect;
			blk_rq_next_segment(req);
		}
	}
	return 1;
}
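
/*
 * Illustrative sketch (hypothetical driver, not from this file): a driver
 * that feeds the hardware chunk by chunk can advance the submission
 * pointers without completing anything yet:
 *
 *	nsect = mydev_program_next_chunk(rq);	// invented helper
 *	if (!process_that_request_first(rq, nsect))
 *		printk("mydev: submitted past end of request\n");
 */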

void blk_recalc_rq_segments(struct request *rq)
Linus Torvalds's avatar
Linus Torvalds committed
{
	struct bio *bio;
	int nr_phys_segs, nr_hw_segs;

	if (!rq->bio)
		return;

Linus Torvalds's avatar
Linus Torvalds committed
	nr_phys_segs = nr_hw_segs = 0;
	rq_for_each_bio(bio, rq) {
		/* Force bio hw/phys segs to be recalculated. */
		bio->bi_flags &= ~(1 << BIO_SEG_VALID);

		nr_phys_segs += bio_phys_segments(rq->q, bio);
		nr_hw_segs += bio_hw_segments(rq->q, bio);
	}

	rq->nr_phys_segments = nr_phys_segs;
	rq->nr_hw_segments = nr_hw_segs;
}

2310
void blk_recalc_rq_sectors(struct request *rq, int nsect)
Linus Torvalds's avatar
Linus Torvalds committed
2311
{
2312
	if (blk_fs_request(rq)) {
Linus Torvalds's avatar
Linus Torvalds committed
2313
		rq->hard_sector += nsect;
Jens Axboe's avatar
Jens Axboe committed
2314
		rq->hard_nr_sectors -= nsect;
Linus Torvalds's avatar
Linus Torvalds committed
2315

Jens Axboe's avatar
Jens Axboe committed
		/*
		 * Move the I/O submission pointers ahead if required,
		 * i.e. for drivers not aware of rq->cbio.
		 */
		if ((rq->nr_sectors >= rq->hard_nr_sectors) &&
		    (rq->sector <= rq->hard_sector)) {
			rq->sector = rq->hard_sector;
			rq->nr_sectors = rq->hard_nr_sectors;
			rq->hard_cur_sectors = bio_cur_sectors(rq->bio);
			rq->current_nr_sectors = rq->hard_cur_sectors;
			rq->nr_cbio_segments = bio_segments(rq->bio);
			rq->nr_cbio_sectors = bio_sectors(rq->bio);
			rq->buffer = bio_data(rq->bio);

			rq->cbio = rq->bio;
		}
Linus Torvalds's avatar
Linus Torvalds committed
2332

Linus Torvalds's avatar
Linus Torvalds committed
		/*
		 * if total number of sectors is less than the first segment
		 * size, something has gone terribly wrong
		 */
		if (rq->nr_sectors < rq->current_nr_sectors) {
			printk("blk: request botched\n");
			rq->nr_sectors = rq->current_nr_sectors;
		}
Linus Torvalds's avatar
Linus Torvalds committed
2341 2342 2343
	}
}

2344 2345
static int __end_that_request_first(struct request *req, int uptodate,
				    int nr_bytes)
Linus Torvalds's avatar
Linus Torvalds committed
2346
{
2347
	int total_bytes, bio_nbytes, error = 0, next_idx = 0;
Linus Torvalds's avatar
Linus Torvalds committed
2348
	struct bio *bio;
Linus Torvalds's avatar
Linus Torvalds committed
2349

	/*
	 * for a REQ_BLOCK_PC request, we want to carry any eventual
	 * sense key with us all the way through
	 */
	if (!blk_pc_request(req))
		req->errors = 0;

Jens Axboe's avatar
Jens Axboe committed
2357 2358
	if (!uptodate) {
		error = -EIO;
Jens Axboe's avatar
Jens Axboe committed
2359 2360
		if (!(req->flags & REQ_QUIET))
			printk("end_request: I/O error, dev %s, sector %llu\n",
2361
				req->rq_disk ? req->rq_disk->disk_name : "?",
Jens Axboe's avatar
Jens Axboe committed
2362
				(unsigned long long)req->sector);
Jens Axboe's avatar
Jens Axboe committed
2363
	}
Linus Torvalds's avatar
Linus Torvalds committed
2364

2365
	total_bytes = bio_nbytes = 0;
Linus Torvalds's avatar
Linus Torvalds committed
2366
	while ((bio = req->bio)) {
2367
		int nbytes;
Jens Axboe's avatar
Jens Axboe committed
2368

		if (nr_bytes >= bio->bi_size) {
			req->bio = bio->bi_next;
			nbytes = bio->bi_size;
			bio_endio(bio, nbytes, error);
			next_idx = 0;
			bio_nbytes = 0;
		} else {
			int idx = bio->bi_idx + next_idx;

			if (unlikely(bio->bi_idx >= bio->bi_vcnt)) {
				blk_dump_rq_flags(req, "__end_that");
				printk("%s: bio idx %d >= vcnt %d\n",
						__FUNCTION__,
Jens Axboe's avatar
Jens Axboe committed
2382
						bio->bi_idx, bio->bi_vcnt);
2383 2384
				break;
			}
Linus Torvalds's avatar
Linus Torvalds committed
2385

2386 2387
			nbytes = bio_iovec_idx(bio, idx)->bv_len;
			BIO_BUG_ON(nbytes > bio->bi_size);
Linus Torvalds's avatar
Linus Torvalds committed
2388

2389 2390 2391 2392
			/*
			 * not a complete bvec done
			 */
			if (unlikely(nbytes > nr_bytes)) {
2393 2394
				bio_iovec_idx(bio, idx)->bv_offset += nr_bytes;
				bio_iovec_idx(bio, idx)->bv_len -= nr_bytes;
2395 2396 2397 2398
				bio_nbytes += nr_bytes;
				total_bytes += nr_bytes;
				break;
			}
Linus Torvalds's avatar
Linus Torvalds committed
2399

2400 2401 2402 2403 2404
			/*
			 * advance to the next vector
			 */
			next_idx++;
			bio_nbytes += nbytes;
2405
		}
Linus Torvalds's avatar
Linus Torvalds committed
2406

2407 2408
		total_bytes += nbytes;
		nr_bytes -= nbytes;
Jens Axboe's avatar
Jens Axboe committed
2409

Linus Torvalds's avatar
Linus Torvalds committed
2410
		if ((bio = req->bio)) {
Linus Torvalds's avatar
Linus Torvalds committed
2411 2412 2413
			/*
			 * end more in this run, or just return 'not-done'
			 */
2414
			if (unlikely(nr_bytes <= 0))
Jens Axboe's avatar
Jens Axboe committed
2415
				break;
Linus Torvalds's avatar
Linus Torvalds committed
2416 2417
		}
	}
Linus Torvalds's avatar
Linus Torvalds committed
2418

Jens Axboe's avatar
Jens Axboe committed
	/*
	 * completely done
	 */
	if (!req->bio)
		return 0;

	/*
	 * if the request wasn't completed, update state
	 */
	if (bio_nbytes) {
		bio_endio(bio, bio_nbytes, error);
		req->bio->bi_idx += next_idx;
	}

	blk_recalc_rq_sectors(req, total_bytes >> 9);
Jens Axboe's avatar
Jens Axboe committed
2434 2435
	blk_recalc_rq_segments(req);
	return 1;
Linus Torvalds's avatar
Linus Torvalds committed
2436 2437
}

/**
 * end_that_request_first - end I/O on a request
 * @req:      the request being processed
 * @uptodate: 0 for I/O error
 * @nr_sectors: number of sectors to end I/O on
 *
 * Description:
 *     Ends I/O on a number of sectors attached to @req, and sets it up
 *     for the next range of segments (if any) in the cluster.
 *
 * Return:
 *     0 - we are done with this request, call end_that_request_last()
 *     1 - still buffers pending for this request
 **/
int end_that_request_first(struct request *req, int uptodate, int nr_sectors)
{
	return __end_that_request_first(req, uptodate, nr_sectors << 9);
}

/**
 * end_that_request_chunk - end I/O on a request
 * @req:      the request being processed
 * @uptodate: 0 for I/O error
 * @nr_bytes: number of bytes to complete
 *
 * Description:
 *     Ends I/O on a number of bytes attached to @req, and sets it up
 *     for the next range of segments (if any). Like end_that_request_first(),
 *     but deals with bytes instead of sectors.
 *
 * Return:
 *     0 - we are done with this request, call end_that_request_last()
 *     1 - still buffers pending for this request
 **/
int end_that_request_chunk(struct request *req, int uptodate, int nr_bytes)
{
	return __end_that_request_first(req, uptodate, nr_bytes);
}
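
/*
 * Illustrative sketch (hypothetical driver, not from this file): the usual
 * completion pattern built on the helpers above; end_request() below is the
 * canned form of the same loop.
 *
 *	// called with q->queue_lock held
 *	static void mydev_complete(struct request *rq, int uptodate, int nsect)
 *	{
 *		if (!end_that_request_first(rq, uptodate, nsect)) {
 *			blkdev_dequeue_request(rq);	// all bios finished
 *			end_that_request_last(rq);
 *		}
 *	}
 */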

/*
 * queue lock must be held
 */
Linus Torvalds's avatar
Linus Torvalds committed
2480 2481
void end_that_request_last(struct request *req)
{
2482
	struct gendisk *disk = req->rq_disk;
2483
	struct completion *waiting = req->waiting;
Linus Torvalds's avatar
Linus Torvalds committed
2484

2485
	if (disk && blk_fs_request(req)) {
2486 2487 2488
		unsigned long duration = jiffies - req->start_time;
		switch (rq_data_dir(req)) {
		    case WRITE:
2489 2490
			disk_stat_inc(disk, writes);
			disk_stat_add(disk, write_ticks, duration);
2491 2492
			break;
		    case READ:
2493 2494
			disk_stat_inc(disk, reads);
			disk_stat_add(disk, read_ticks, duration);
2495 2496 2497
			break;
		}
		disk_round_stats(disk);
2498
		disk->in_flight--;
2499
	}
2500
	__blk_put_request(req->q, req);
2501 2502 2503
	/* Do this LAST! The structure may be freed immediately afterwards */
	if (waiting)
		complete(waiting);
Linus Torvalds's avatar
Linus Torvalds committed
2504 2505
}

void end_request(struct request *req, int uptodate)
{
	if (!end_that_request_first(req, uptodate, req->hard_cur_sectors)) {
		add_disk_randomness(req->rq_disk);
		blkdev_dequeue_request(req);
		end_that_request_last(req);
	}
}

void blk_rq_bio_prep(request_queue_t *q, struct request *rq, struct bio *bio)
{
	/* first three bits are identical in rq->flags and bio->bi_rw */
	rq->flags |= (bio->bi_rw & 7);

	rq->nr_phys_segments = bio_phys_segments(q, bio);
	rq->nr_hw_segments = bio_hw_segments(q, bio);
	rq->current_nr_sectors = bio_cur_sectors(bio);
	rq->hard_cur_sectors = rq->current_nr_sectors;
	rq->hard_nr_sectors = rq->nr_sectors = bio_sectors(bio);
Jens Axboe's avatar
Jens Axboe committed
2525 2526
	rq->nr_cbio_segments = bio_segments(bio);
	rq->nr_cbio_sectors = bio_sectors(bio);
2527 2528
	rq->buffer = bio_data(bio);

Jens Axboe's avatar
Jens Axboe committed
2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545
	rq->cbio = rq->bio = rq->biotail = bio;
}

void blk_rq_prep_restart(struct request *rq)
{
	struct bio *bio;

	bio = rq->cbio = rq->bio;
	if (bio) {
		rq->nr_cbio_segments = bio_segments(bio);
		rq->nr_cbio_sectors = bio_sectors(bio);
		rq->hard_cur_sectors = bio_cur_sectors(bio);
		rq->buffer = bio_data(bio);
	}
	rq->sector = rq->hard_sector;
	rq->nr_sectors = rq->hard_nr_sectors;
	rq->current_nr_sectors = rq->hard_cur_sectors;
2546 2547
}

int kblockd_schedule_work(struct work_struct *work)
{
	return queue_work(kblockd_workqueue, work);
}

void kblockd_flush(void)
{
	flush_workqueue(kblockd_workqueue);
}

Linus Torvalds's avatar
Linus Torvalds committed
2558 2559
int __init blk_dev_init(void)
{
2560
	int i;
Linus Torvalds's avatar
Linus Torvalds committed
2561

2562 2563 2564 2565
	kblockd_workqueue = create_workqueue("kblockd");
	if (!kblockd_workqueue)
		panic("Failed to create kblockd\n");

Linus Torvalds's avatar
Linus Torvalds committed
2566
	request_cachep = kmem_cache_create("blkdev_requests",
2567
			sizeof(struct request), 0, 0, NULL, NULL);
Linus Torvalds's avatar
Linus Torvalds committed
2568 2569 2570
	if (!request_cachep)
		panic("Can't create request pool slab cache\n");

Linus Torvalds's avatar
Linus Torvalds committed
2571 2572 2573
	blk_max_low_pfn = max_low_pfn;
	blk_max_pfn = max_pfn;

2574 2575
	for (i = 0; i < ARRAY_SIZE(congestion_wqh); i++)
		init_waitqueue_head(&congestion_wqh[i]);
Linus Torvalds's avatar
Linus Torvalds committed
2576
	return 0;
2577
}
Linus Torvalds's avatar
Linus Torvalds committed
2578

2579
static atomic_t nr_io_contexts = ATOMIC_INIT(0);
Andrew Morton's avatar
Andrew Morton committed

/*
 * IO Context helper functions
 */
void put_io_context(struct io_context *ioc)
{
	if (ioc == NULL)
		return;

	BUG_ON(atomic_read(&ioc->refcount) == 0);

	if (atomic_dec_and_test(&ioc->refcount)) {
		if (ioc->aic && ioc->aic->dtor)
			ioc->aic->dtor(ioc->aic);
		kfree(ioc);
2595
		atomic_dec(&nr_io_contexts);
Andrew Morton's avatar
Andrew Morton committed
	}
}

/* Called by the exiting task */
void exit_io_context(void)
{
	unsigned long flags;
	struct io_context *ioc;

	local_irq_save(flags);
	ioc = current->io_context;
	if (ioc) {
		if (ioc->aic && ioc->aic->exit)
			ioc->aic->exit(ioc->aic);
		put_io_context(ioc);
		current->io_context = NULL;
2612 2613
	} else
		WARN_ON(1);
Andrew Morton's avatar
Andrew Morton committed
	local_irq_restore(flags);
}

/*
 * If the current task has no IO context then create one and initialise it.
 * If it does have a context, take a ref on it.
 *
 * This is always called in the context of the task which submitted the I/O.
 * But weird things happen, so we disable local interrupts to ensure exclusive
 * access to *current.
 */
Andrew Morton's avatar
Andrew Morton committed
2625
struct io_context *get_io_context(int gfp_flags)
Andrew Morton's avatar
Andrew Morton committed
{
	struct task_struct *tsk = current;
	unsigned long flags;
	struct io_context *ret;

	local_irq_save(flags);
	ret = tsk->io_context;
	if (ret == NULL) {
		ret = kmalloc(sizeof(*ret), GFP_ATOMIC);
		if (ret) {
2636
			atomic_inc(&nr_io_contexts);
Andrew Morton's avatar
Andrew Morton committed
2637 2638
			atomic_set(&ret->refcount, 1);
			ret->pid = tsk->pid;
2639 2640
			ret->last_waited = jiffies; /* doesn't matter... */
			ret->nr_batch_requests = 0; /* because this is 0 */
Andrew Morton's avatar
Andrew Morton committed
2641 2642 2643 2644
			ret->aic = NULL;
			tsk->io_context = ret;
		}
	}
Andrew Morton's avatar
Andrew Morton committed
2645 2646
	if (ret)
		atomic_inc(&ret->refcount);
Andrew Morton's avatar
Andrew Morton committed
	local_irq_restore(flags);
	return ret;
}

void copy_io_context(struct io_context **pdst, struct io_context **psrc)
{
	struct io_context *src = *psrc;
	struct io_context *dst = *pdst;

	if (src) {
		BUG_ON(atomic_read(&src->refcount) == 0);
		atomic_inc(&src->refcount);
		put_io_context(dst);
		*pdst = src;
	}
}

void swap_io_context(struct io_context **ioc1, struct io_context **ioc2)
{
	struct io_context *temp;
	temp = *ioc1;
	*ioc1 = *ioc2;
	*ioc2 = temp;
}


/*
 * sysfs parts below
 */
struct queue_sysfs_entry {
	struct attribute attr;
	ssize_t (*show)(struct request_queue *, char *);
	ssize_t (*store)(struct request_queue *, const char *, size_t);
};

static ssize_t
queue_var_show(unsigned int var, char *page)
{
	return sprintf(page, "%d\n", var);
}

static ssize_t
queue_var_store(unsigned long *var, const char *page, size_t count)
{
	char *p = (char *) page;

	*var = simple_strtoul(p, &p, 10);
	return count;
}

static ssize_t queue_requests_show(struct request_queue *q, char *page)
{
	return queue_var_show(q->nr_requests, (page));
}

static ssize_t
queue_requests_store(struct request_queue *q, const char *page, size_t count)
{
	struct request_list *rl = &q->rq;

	int ret = queue_var_store(&q->nr_requests, page, count);
	if (q->nr_requests < BLKDEV_MIN_RQ)
		q->nr_requests = BLKDEV_MIN_RQ;

2711
	if (rl->count[READ] >= queue_congestion_on_threshold(q))
2712
		set_queue_congested(q, READ);
2713
	else if (rl->count[READ] < queue_congestion_off_threshold(q))
2714 2715
		clear_queue_congested(q, READ);

2716 2717 2718 2719
	if (rl->count[WRITE] >= queue_congestion_on_threshold(q))
		set_queue_congested(q, WRITE);
	else if (rl->count[WRITE] < queue_congestion_off_threshold(q))
		clear_queue_congested(q, WRITE);
2720

2721 2722
	if (rl->count[READ] >= q->nr_requests) {
		blk_set_queue_full(q, READ);
2723
	} else if (rl->count[READ]+1 <= q->nr_requests) {
2724
		blk_clear_queue_full(q, READ);
2725
		wake_up(&rl->wait[READ]);
2726 2727 2728 2729
	}

	if (rl->count[WRITE] >= q->nr_requests) {
		blk_set_queue_full(q, WRITE);
2730
	} else if (rl->count[WRITE]+1 <= q->nr_requests) {
2731
		blk_clear_queue_full(q, WRITE);
2732
		wake_up(&rl->wait[WRITE]);
2733
	}
	return ret;
}
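
/*
 * Note (illustrative): queue_requests_store() is what runs when a new value
 * is written to /sys/block/<disk>/queue/nr_requests, so the queue depth and
 * the congestion/"queue full" thresholds derived from it can be retuned at
 * runtime.
 */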

static struct queue_sysfs_entry queue_requests_entry = {
	.attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR },
	.show = queue_requests_show,
	.store = queue_requests_store,
};

static struct attribute *default_attrs[] = {
	&queue_requests_entry.attr,
	NULL,
};

#define to_queue(atr) container_of((atr), struct queue_sysfs_entry, attr)

static ssize_t
queue_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
{
	struct queue_sysfs_entry *entry = to_queue(attr);
	struct request_queue *q;

	q = container_of(kobj, struct request_queue, kobj);
	if (!entry->show)
		return 0;

	return entry->show(q, page);
}

static ssize_t
queue_attr_store(struct kobject *kobj, struct attribute *attr,
		    const char *page, size_t length)
{
	struct queue_sysfs_entry *entry = to_queue(attr);
	struct request_queue *q;

	q = container_of(kobj, struct request_queue, kobj);
	if (!entry->store)
		return -EINVAL;

	return entry->store(q, page, length);
}

static struct sysfs_ops queue_sysfs_ops = {
	.show	= queue_attr_show,
	.store	= queue_attr_store,
};

struct kobj_type queue_ktype = {
	.sysfs_ops	= &queue_sysfs_ops,
	.default_attrs	= default_attrs,
};

int blk_register_queue(struct gendisk *disk)
{
	int ret;

	request_queue_t *q = disk->queue;

	if (!q)
		return -ENXIO;

	q->kobj.parent = kobject_get(&disk->kobj);
	if (!q->kobj.parent)
		return -EBUSY;

	snprintf(q->kobj.name, KOBJ_NAME_LEN, "%s", "queue");
	q->kobj.ktype = &queue_ktype;

	ret = kobject_register(&q->kobj);
	if (ret < 0)
		return ret;

	ret = elv_register_queue(q);
	if (ret) {
		kobject_unregister(&q->kobj);
		return ret;
	}

	return 0;
}

void blk_unregister_queue(struct gendisk *disk)
{
	request_queue_t *q = disk->queue;

	if (q) {
		elv_unregister_queue(q);

		kobject_unregister(&q->kobj);
		kobject_put(&disk->kobj);
	}
}


Jens Axboe's avatar
Jens Axboe committed
2829
EXPORT_SYMBOL(process_that_request_first);
Linus Torvalds's avatar
Linus Torvalds committed
2830
EXPORT_SYMBOL(end_that_request_first);
2831
EXPORT_SYMBOL(end_that_request_chunk);
Linus Torvalds's avatar
Linus Torvalds committed
2832
EXPORT_SYMBOL(end_that_request_last);
2833
EXPORT_SYMBOL(end_request);
Linus Torvalds's avatar
Linus Torvalds committed
2834 2835
EXPORT_SYMBOL(blk_init_queue);
EXPORT_SYMBOL(blk_cleanup_queue);
2836 2837
EXPORT_SYMBOL(blk_get_queue);
EXPORT_SYMBOL(blk_alloc_queue);
Linus Torvalds's avatar
Linus Torvalds committed
2838
EXPORT_SYMBOL(blk_queue_make_request);
Linus Torvalds's avatar
Linus Torvalds committed
2839
EXPORT_SYMBOL(blk_queue_bounce_limit);
Linus Torvalds's avatar
Linus Torvalds committed
2840
EXPORT_SYMBOL(generic_make_request);
Linus Torvalds's avatar
Linus Torvalds committed
2841
EXPORT_SYMBOL(generic_unplug_device);
Jens Axboe's avatar
Jens Axboe committed
2842 2843
EXPORT_SYMBOL(blk_plug_device);
EXPORT_SYMBOL(blk_remove_plug);
Linus Torvalds's avatar
Linus Torvalds committed
2844
EXPORT_SYMBOL(blk_attempt_remerge);
Martin Dalecki's avatar
Martin Dalecki committed
2845
EXPORT_SYMBOL(__blk_attempt_remerge);
Linus Torvalds's avatar
Linus Torvalds committed
2846
EXPORT_SYMBOL(blk_max_low_pfn);
2847
EXPORT_SYMBOL(blk_max_pfn);
Linus Torvalds's avatar
Linus Torvalds committed
2848
EXPORT_SYMBOL(blk_queue_max_sectors);
Linus Torvalds's avatar
Linus Torvalds committed
2849 2850
EXPORT_SYMBOL(blk_queue_max_phys_segments);
EXPORT_SYMBOL(blk_queue_max_hw_segments);
Linus Torvalds's avatar
Linus Torvalds committed
2851 2852
EXPORT_SYMBOL(blk_queue_max_segment_size);
EXPORT_SYMBOL(blk_queue_hardsect_size);
2853
EXPORT_SYMBOL(blk_queue_stack_limits);
Linus Torvalds's avatar
Linus Torvalds committed
2854
EXPORT_SYMBOL(blk_queue_segment_boundary);
Jens Axboe's avatar
Jens Axboe committed
2855
EXPORT_SYMBOL(blk_queue_dma_alignment);
Linus Torvalds's avatar
Linus Torvalds committed
2856
EXPORT_SYMBOL(blk_rq_map_sg);
Linus Torvalds's avatar
Linus Torvalds committed
2857
EXPORT_SYMBOL(blk_dump_rq_flags);
Linus Torvalds's avatar
Linus Torvalds committed
2858
EXPORT_SYMBOL(submit_bio);
Linus Torvalds's avatar
Linus Torvalds committed
2859 2860
EXPORT_SYMBOL(blk_phys_contig_segment);
EXPORT_SYMBOL(blk_hw_contig_segment);
2861 2862
EXPORT_SYMBOL(blk_get_request);
EXPORT_SYMBOL(blk_put_request);
2863
EXPORT_SYMBOL(blk_insert_request);
2864
EXPORT_SYMBOL(blk_requeue_request);
Linus Torvalds's avatar
Linus Torvalds committed
2865 2866

EXPORT_SYMBOL(blk_queue_prep_rq);
2867
EXPORT_SYMBOL(blk_queue_merge_bvec);
2868

Jens Axboe's avatar
Jens Axboe committed
2869
EXPORT_SYMBOL(blk_queue_find_tag);
2870 2871 2872 2873 2874
EXPORT_SYMBOL(blk_queue_init_tags);
EXPORT_SYMBOL(blk_queue_free_tags);
EXPORT_SYMBOL(blk_queue_start_tag);
EXPORT_SYMBOL(blk_queue_end_tag);
EXPORT_SYMBOL(blk_queue_invalidate_tags);
Jens Axboe's avatar
Jens Axboe committed
2875 2876 2877

EXPORT_SYMBOL(blk_start_queue);
EXPORT_SYMBOL(blk_stop_queue);
2878
EXPORT_SYMBOL(blk_run_queue);
Jens Axboe's avatar
Jens Axboe committed
2879
EXPORT_SYMBOL(blk_run_queues);
2880 2881

EXPORT_SYMBOL(blk_rq_bio_prep);
Andrew Morton's avatar
Andrew Morton committed
2882
EXPORT_SYMBOL(blk_rq_prep_restart);