Commit 68fc0a78 authored by Jens Axboe's avatar Jens Axboe Committed by Linus Torvalds

[PATCH] rbtree core for io scheduler

This patch has a bunch of io scheduler goodies that are, by now, well
tested in -mm and by myself and Nick Piggin. In order of interest:

- Use rbtree data structure for sorting of requests. Even with the
  default queue lengths that are fairly short, this cuts a lot of run
  time for io scheduler intensive work loads. If we go to longer queue
  lengths, it very quickly becomes a necessity.

- Add sysfs interface for the tunables. At the same time, finally kill
  the BLKELVGET/BLKELVSET ioctls completely. I made these return -ENOTTY
  in 2.5.1, but there are left-overs around the kernel. This old interface
  was never any good; it was centered around just one io scheduler.

The io scheduler core itself has received countless hours of tuning by
myself and Nick, and should be in pretty good shape. Please apply.

Andrew, I made some sysfs changes to the version from 2.5.56-mm1. It
didn't even compile without warnings (or work, for that matter), as the
sysfs store/show procedures needed updating. Hmm?
parent 54779e07
......@@ -755,8 +755,6 @@ static struct ioctl32_list ioctl32_handler_table[] = {
IOCTL32_HANDLER(BLKSECTGET, w_long),
IOCTL32_DEFAULT(BLKSSZGET),
IOCTL32_HANDLER(BLKPG, blkpg_ioctl_trans),
IOCTL32_DEFAULT(BLKELVGET),
IOCTL32_DEFAULT(BLKELVSET),
IOCTL32_DEFAULT(BLKBSZGET),
IOCTL32_DEFAULT(BLKBSZSET),
......
......@@ -3464,9 +3464,6 @@ COMPATIBLE_IOCTL(DRM_IOCTL_LOCK)
COMPATIBLE_IOCTL(DRM_IOCTL_UNLOCK)
COMPATIBLE_IOCTL(DRM_IOCTL_FINISH)
#endif /* DRM */
/* elevator */
COMPATIBLE_IOCTL(BLKELVGET)
COMPATIBLE_IOCTL(BLKELVSET)
/* Big R */
COMPATIBLE_IOCTL(RNDGETENTCNT)
COMPATIBLE_IOCTL(RNDADDTOENTCNT)
......
......@@ -3613,22 +3613,10 @@ mtd_rw_oob(unsigned int fd, unsigned int cmd, unsigned long arg)
}
/* Fix sizeof(sizeof()) breakage */
#define BLKELVGET_32 _IOR(0x12,106,int)
#define BLKELVSET_32 _IOW(0x12,107,int)
#define BLKBSZGET_32 _IOR(0x12,112,int)
#define BLKBSZSET_32 _IOW(0x12,113,int)
#define BLKGETSIZE64_32 _IOR(0x12,114,int)
static int do_blkelvget(unsigned int fd, unsigned int cmd, unsigned long arg)
{
return sys_ioctl(fd, BLKELVGET, arg);
}
static int do_blkelvset(unsigned int fd, unsigned int cmd, unsigned long arg)
{
return sys_ioctl(fd, BLKELVSET, arg);
}
static int do_blkbszget(unsigned int fd, unsigned int cmd, unsigned long arg)
{
return sys_ioctl(fd, BLKBSZGET, arg);
......@@ -4459,9 +4447,6 @@ HANDLE_IOCTL(USBDEVFS_REAPURB32, do_usbdevfs_reapurb),
HANDLE_IOCTL(USBDEVFS_REAPURBNDELAY32, do_usbdevfs_reapurb),
HANDLE_IOCTL(USBDEVFS_DISCSIGNAL32, do_usbdevfs_discsignal),
/* take care of sizeof(sizeof()) breakage */
/* elevator */
HANDLE_IOCTL(BLKELVGET_32, do_blkelvget),
HANDLE_IOCTL(BLKELVSET_32, do_blkelvset),
/* block stuff */
HANDLE_IOCTL(BLKBSZGET_32, do_blkbszget),
HANDLE_IOCTL(BLKBSZSET_32, do_blkbszset),
......
......@@ -798,9 +798,6 @@ static struct ioctl32_list ioctl32_handler_table[] = {
IOCTL32_DEFAULT(BLKBSZGET),
IOCTL32_DEFAULT(BLKGETSIZE64),
IOCTL32_DEFAULT(BLKELVGET),
IOCTL32_DEFAULT(BLKELVSET),
IOCTL32_HANDLER(HDIO_GETGEO, hd_geometry_ioctl),
IOCTL32_DEFAULT(TCGETA),
......
......@@ -4244,22 +4244,10 @@ static int mtd_rw_oob(unsigned int fd, unsigned int cmd, unsigned long arg)
}
/* Fix sizeof(sizeof()) breakage */
#define BLKELVGET_32 _IOR(0x12,106,int)
#define BLKELVSET_32 _IOW(0x12,107,int)
#define BLKBSZGET_32 _IOR(0x12,112,int)
#define BLKBSZSET_32 _IOW(0x12,113,int)
#define BLKGETSIZE64_32 _IOR(0x12,114,int)
static int do_blkelvget(unsigned int fd, unsigned int cmd, unsigned long arg)
{
return sys_ioctl(fd, BLKELVGET, arg);
}
static int do_blkelvset(unsigned int fd, unsigned int cmd, unsigned long arg)
{
return sys_ioctl(fd, BLKELVSET, arg);
}
static int do_blkbszget(unsigned int fd, unsigned int cmd, unsigned long arg)
{
return sys_ioctl(fd, BLKBSZGET, arg);
......@@ -5203,9 +5191,6 @@ HANDLE_IOCTL(USBDEVFS_REAPURB32, do_usbdevfs_reapurb)
HANDLE_IOCTL(USBDEVFS_REAPURBNDELAY32, do_usbdevfs_reapurb)
HANDLE_IOCTL(USBDEVFS_DISCSIGNAL32, do_usbdevfs_discsignal)
/* take care of sizeof(sizeof()) breakage */
/* elevator */
HANDLE_IOCTL(BLKELVGET_32, do_blkelvget)
HANDLE_IOCTL(BLKELVSET_32, do_blkelvset)
/* block stuff */
HANDLE_IOCTL(BLKBSZGET_32, do_blkbszget)
HANDLE_IOCTL(BLKBSZSET_32, do_blkbszset)
......
......@@ -3025,22 +3025,10 @@ static int rtc32_ioctl(unsigned fd, unsigned cmd, unsigned long arg)
}
/* Fix sizeof(sizeof()) breakage */
#define BLKELVGET_32 _IOR(0x12,106,int)
#define BLKELVSET_32 _IOW(0x12,107,int)
#define BLKBSZGET_32 _IOR(0x12,112,int)
#define BLKBSZSET_32 _IOW(0x12,113,int)
#define BLKGETSIZE64_32 _IOR(0x12,114,int)
static int do_blkelvget(unsigned int fd, unsigned int cmd, unsigned long arg)
{
return sys_ioctl(fd, BLKELVGET, arg);
}
static int do_blkelvset(unsigned int fd, unsigned int cmd, unsigned long arg)
{
return sys_ioctl(fd, BLKELVSET, arg);
}
static int do_blkbszget(unsigned int fd, unsigned int cmd, unsigned long arg)
{
return sys_ioctl(fd, BLKBSZGET, arg);
......@@ -4427,9 +4415,6 @@ HANDLE_IOCTL(USBDEVFS_REAPURB32, do_usbdevfs_reapurb)
HANDLE_IOCTL(USBDEVFS_REAPURBNDELAY32, do_usbdevfs_reapurb)
HANDLE_IOCTL(USBDEVFS_DISCSIGNAL32, do_usbdevfs_discsignal)
/* take care of sizeof(sizeof()) breakage */
/* elevator */
HANDLE_IOCTL(BLKELVGET_32, do_blkelvget)
HANDLE_IOCTL(BLKELVSET_32, do_blkelvset)
/* block stuff */
HANDLE_IOCTL(BLKBSZGET_32, do_blkbszget)
HANDLE_IOCTL(BLKBSZSET_32, do_blkbszset)
......
This diff is collapsed.
......@@ -194,6 +194,12 @@ int elevator_noop_merge(request_queue_t *q, struct list_head **insert,
return ELEVATOR_NO_MERGE;
}
/*
 * Merge-requests hook of the noop elevator: take @next off whatever list
 * its queuelist node is linked into, using list_del_init().  Neither @q
 * nor @req is touched.
 */
void elevator_noop_merge_requests(request_queue_t *q, struct request *req,
				  struct request *next)
{
	struct list_head *merged = &next->queuelist;

	list_del_init(merged);
}
void elevator_noop_add_request(request_queue_t *q, struct request *rq,
struct list_head *insert_here)
{
......@@ -370,19 +376,70 @@ int elv_queue_empty(request_queue_t *q)
return list_empty(&q->queue_head);
}
inline struct list_head *elv_get_sort_head(request_queue_t *q,
struct request *rq)
/*
 * Find the request that follows @rq, or NULL if there is none.
 *
 * If the elevator provides elevator_latter_req_fn, its answer is returned
 * unchanged.  Otherwise the next entry on @rq's queuelist is used, unless
 * that entry is the queue head of @q or @rq's own list node.
 */
struct request *elv_latter_request(request_queue_t *q, struct request *rq)
{
	elevator_t *elv = &q->elevator;
	struct list_head *succ;

	if (elv->elevator_latter_req_fn != NULL)
		return elv->elevator_latter_req_fn(q, rq);

	succ = rq->queuelist.next;
	if (succ == &q->queue_head || succ == &rq->queuelist)
		return NULL;

	return list_entry_rq(succ);
}
/*
 * Find the request that precedes @rq, or NULL if there is none.
 *
 * If the elevator provides elevator_former_req_fn, its answer is returned
 * unchanged.  Otherwise the previous entry on @rq's queuelist is used,
 * unless that entry is the queue head of @q or @rq's own list node.
 */
struct request *elv_former_request(request_queue_t *q, struct request *rq)
{
	struct list_head *prev;
	elevator_t *e = &q->elevator;

	/* call the *former* hook; the latter hook would walk the wrong way */
	if (e->elevator_former_req_fn)
		return e->elevator_former_req_fn(q, rq);

	prev = rq->queuelist.prev;
	if (prev != &q->queue_head && prev != &rq->queuelist)
		return list_entry_rq(prev);

	return NULL;
}
int elv_register_queue(struct gendisk *disk)
{
request_queue_t *q = disk->queue;
elevator_t *e;
if (!q)
return -ENXIO;
e = &q->elevator;
e->kobj.parent = kobject_get(&disk->kobj);
if (!e->kobj.parent)
return -EBUSY;
snprintf(e->kobj.name, KOBJ_NAME_LEN, "%s", "iosched");
e->kobj.ktype = e->elevator_ktype;
return kobject_register(&e->kobj);
}
/*
 * Counterpart of elv_register_queue(): unregister the io scheduler kobject
 * of @disk's queue, then drop a reference on the disk's kobject (the one
 * elv_register_queue() obtained with kobject_get() for the parent link).
 *
 * elv_register_queue() returns -ENXIO for a disk without a queue before
 * taking any reference, so such a disk has nothing to undo and is left
 * alone here.
 */
void elv_unregister_queue(struct gendisk *disk)
{
	request_queue_t *q = disk->queue;
	elevator_t *e;

	if (!q)
		return;

	e = &q->elevator;
	kobject_unregister(&e->kobj);
	kobject_put(&disk->kobj);
}
/*
 * The noop elevator.  Only the merge, merge-requests, next-request and
 * add-request hooks are filled in; every member not named below is left
 * zero-initialised.
 */
elevator_t elevator_noop = {
	.elevator_add_req_fn	= elevator_noop_add_request,
	.elevator_next_req_fn	= elevator_noop_next_request,
	.elevator_merge_req_fn	= elevator_noop_merge_requests,
	.elevator_merge_fn	= elevator_noop_merge,
};
......
......@@ -112,6 +112,7 @@ void add_disk(struct gendisk *disk)
blk_register_region(MKDEV(disk->major, disk->first_minor), disk->minors,
NULL, exact_match, exact_lock, disk);
register_disk(disk);
elv_register_queue(disk);
}
EXPORT_SYMBOL(add_disk);
......@@ -119,6 +120,7 @@ EXPORT_SYMBOL(del_gendisk);
void unlink_gendisk(struct gendisk *disk)
{
elv_unregister_queue(disk);
blk_unregister_region(MKDEV(disk->major, disk->first_minor),
disk->minors);
}
......
......@@ -128,10 +128,6 @@ int blkdev_ioctl(struct inode *inode, struct file *file, unsigned cmd,
int ret, n;
switch (cmd) {
case BLKELVGET:
case BLKELVSET:
/* deprecated, use the /proc/iosched interface instead */
return -ENOTTY;
case BLKRAGET:
case BLKFRAGET:
if (!arg)
......
......@@ -68,7 +68,7 @@ static inline int queue_congestion_on_threshold(void)
{
int ret;
ret = queue_nr_requests / 4 - 1;
ret = queue_nr_requests / 8 - 1;
if (ret < 0)
ret = 1;
return ret;
......@@ -81,7 +81,7 @@ static inline int queue_congestion_off_threshold(void)
{
int ret;
ret = queue_nr_requests / 4 + 1;
ret = queue_nr_requests / 8 + 1;
if (ret > queue_nr_requests)
ret = queue_nr_requests;
return ret;
......@@ -1159,6 +1159,8 @@ void blk_cleanup_queue(request_queue_t * q)
{
int count = (queue_nr_requests*2);
elevator_exit(q);
count -= __blk_cleanup_queue(&q->rq[READ]);
count -= __blk_cleanup_queue(&q->rq[WRITE]);
......@@ -1168,8 +1170,6 @@ void blk_cleanup_queue(request_queue_t * q)
if (blk_queue_tagged(q))
blk_queue_free_tags(q);
elevator_exit(q);
memset(q, 0, sizeof(*q));
}
......@@ -1576,22 +1576,22 @@ void blk_congestion_wait(int rw, long timeout)
/*
* Has to be called with the request spinlock acquired
*/
static void attempt_merge(request_queue_t *q, struct request *req,
static int attempt_merge(request_queue_t *q, struct request *req,
struct request *next)
{
if (!rq_mergeable(req) || !rq_mergeable(next))
return;
return 0;
/*
* not contiguous
*/
if (req->sector + req->nr_sectors != next->sector)
return;
return 0;
if (rq_data_dir(req) != rq_data_dir(next)
|| req->rq_disk != next->rq_disk
|| next->waiting || next->special)
return;
return 0;
/*
* If we are allowed to merge, then append bio list
......@@ -1612,27 +1612,31 @@ static void attempt_merge(request_queue_t *q, struct request *req,
req->rq_disk->in_flight--;
}
blkdev_dequeue_request(next);
__blk_put_request(q, next);
return 1;
}
return 0;
}
static inline void attempt_back_merge(request_queue_t *q, struct request *rq)
static inline int attempt_back_merge(request_queue_t *q, struct request *rq)
{
struct list_head *next = rq->queuelist.next;
struct list_head *sort_head = elv_get_sort_head(q, rq);
struct request *next = elv_latter_request(q, rq);
if (next != sort_head)
attempt_merge(q, rq, list_entry_rq(next));
if (next)
return attempt_merge(q, rq, next);
return 0;
}
static inline void attempt_front_merge(request_queue_t *q, struct request *rq)
static inline int attempt_front_merge(request_queue_t *q, struct request *rq)
{
struct list_head *prev = rq->queuelist.prev;
struct list_head *sort_head = elv_get_sort_head(q, rq);
struct request *prev = elv_former_request(q, rq);
if (prev != sort_head)
attempt_merge(q, list_entry_rq(prev), rq);
if (prev)
return attempt_merge(q, prev, rq);
return 0;
}
/**
......@@ -1715,8 +1719,8 @@ static int __make_request(request_queue_t *q, struct bio *bio)
req->biotail = bio;
req->nr_sectors = req->hard_nr_sectors += nr_sectors;
drive_stat_acct(req, nr_sectors, 0);
elv_merged_request(q, req);
attempt_back_merge(q, req);
if (!attempt_back_merge(q, req))
elv_merged_request(q, req);
goto out;
case ELEVATOR_FRONT_MERGE:
......@@ -1742,8 +1746,8 @@ static int __make_request(request_queue_t *q, struct bio *bio)
req->sector = req->hard_sector = sector;
req->nr_sectors = req->hard_nr_sectors += nr_sectors;
drive_stat_acct(req, nr_sectors, 0);
elv_merged_request(q, req);
attempt_front_merge(q, req);
if (!attempt_front_merge(q, req))
elv_merged_request(q, req);
goto out;
/*
......@@ -2169,8 +2173,7 @@ int __init blk_dev_init(void)
int i;
request_cachep = kmem_cache_create("blkdev_requests",
sizeof(struct request), 0,
SLAB_HWCACHE_ALIGN, NULL, NULL);
sizeof(struct request), 0, 0, NULL, NULL);
if (!request_cachep)
panic("Can't create request pool slab cache\n");
......
......@@ -13,6 +13,7 @@ typedef struct request *(elevator_next_req_fn) (request_queue_t *);
typedef void (elevator_add_req_fn) (request_queue_t *, struct request *, struct list_head *);
typedef int (elevator_queue_empty_fn) (request_queue_t *);
typedef void (elevator_remove_req_fn) (request_queue_t *, struct request *);
typedef struct request *(elevator_request_list_fn) (request_queue_t *, struct request *);
typedef struct list_head *(elevator_get_sort_head_fn) (request_queue_t *, struct request *);
typedef int (elevator_init_fn) (request_queue_t *, elevator_t *);
......@@ -29,12 +30,17 @@ struct elevator_s
elevator_remove_req_fn *elevator_remove_req_fn;
elevator_queue_empty_fn *elevator_queue_empty_fn;
elevator_get_sort_head_fn *elevator_get_sort_head_fn;
elevator_request_list_fn *elevator_former_req_fn;
elevator_request_list_fn *elevator_latter_req_fn;
elevator_init_fn *elevator_init_fn;
elevator_exit_fn *elevator_exit_fn;
void *elevator_data;
struct kobject kobj;
struct kobj_type *elevator_ktype;
};
/*
......@@ -48,7 +54,10 @@ extern void elv_merge_requests(request_queue_t *, struct request *,
extern void elv_merged_request(request_queue_t *, struct request *);
extern void elv_remove_request(request_queue_t *, struct request *);
extern int elv_queue_empty(request_queue_t *);
extern inline struct list_head *elv_get_sort_head(request_queue_t *, struct request *);
extern struct request *elv_former_request(request_queue_t *, struct request *);
extern struct request *elv_latter_request(request_queue_t *, struct request *);
extern int elv_register_queue(struct gendisk *);
extern void elv_unregister_queue(struct gendisk *);
#define __elv_add_request_pos(q, rq, pos) \
(q)->elevator.elevator_add_req_fn((q), (rq), (pos))
......@@ -64,18 +73,6 @@ extern elevator_t elevator_noop;
*/
extern elevator_t iosched_deadline;
/*
* use the /proc/iosched interface, all the below is history ->
*/
typedef struct blkelv_ioctl_arg_s {
int queue_ID;
int read_latency;
int write_latency;
int max_bomb_segments;
} blkelv_ioctl_arg_t;
#define BLKELVGET _IOR(0x12,106,sizeof(blkelv_ioctl_arg_t))
#define BLKELVSET _IOW(0x12,107,sizeof(blkelv_ioctl_arg_t))
extern int elevator_init(request_queue_t *, elevator_t *);
extern void elevator_exit(request_queue_t *);
extern inline int bio_rq_in_between(struct bio *, struct request *, struct list_head *);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment