Commit 00d6555e authored by Andrew Morton, committed by Linus Torvalds

[PATCH] readahead fix

Changes the way in which the readahead code locates the readahead
setting for the underlying device.

- struct block_device and struct address_space gain a *pointer* to the
  current readahead tunable.

- The tunable lives in the request queue and is altered with the
  traditional ioctl.

- The value gets *copied* into the struct file at open() time.  So an
  fcntl() mode to modify it per-fd is simple.

- Filesystems which are not request_queue-backed get the address of the
  global `default_ra_pages'.  If we want, this can become a tunable.

- Filesystems are at liberty to alter address_space.ra_pages to point
  at some other fs-private default at new_inode/read_inode/alloc_inode
  time.  (A sketch of this follows the list.)

- The ra_pages pointer can become a structure pointer if, at some time
  in the future, high-level code needs more detailed information about
  device characteristics.

  In fact, it'll need to become a struct pointer for use by
  writeback: my current writeback code has the problem that multiple
  pdflush threads can get stuck on the same request queue.  That's a
  waste of resources.  I currently have a silly flag in the superblock
  to try to avoid this.

  The proper way to get this exclusion is for the high-level
  writeback code to be able to do a test-and-set against a
  per-request_queue flag.  That flag can live in a structure alongside
  ra_pages, conveniently accessible at the pagemap level.
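
For illustration, here is a minimal sketch of that fs-private default
(hypothetical code, not part of this patch; "myfs" and myfs_ra_pages are
made-up names).  The filesystem keeps its own tunable and points each
inode's mapping at it from its read_inode() hook, overriding the global
default installed by alloc_inode():

	/* Hypothetical -- only illustrates the fs-private default idea */
	static unsigned long myfs_ra_pages = (128 * 1024) / PAGE_CACHE_SIZE;

	static void myfs_read_inode(struct inode *inode)
	{
		/* ... fill in i_size, i_op, etc from the on-disk inode ... */

		/*
		 * Readahead against this inode is now governed by the
		 * fs-wide myfs_ra_pages value rather than the global
		 * default_ra_pages.
		 */
		inode->i_mapping->ra_pages = &myfs_ra_pages;
	}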

One thing still to be done is to go through all callers of blk_init_queue
and blk_queue_make_request and make sure that they're setting up a
sensible default.  ATA wants 248 sectors, and floppy drives don't want
128 kbytes, I suspect.  Later.
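
As a sketch of what such a per-driver fixup could look like (hypothetical
driver code, not part of this patch; the device, its make_request function
and the 32 kbyte figure are all illustrative), a driver wanting a smaller
window than the VM_MAX_READAHEAD-based default installed by
blk_queue_make_request() would simply override q->ra_pages after setting
up its queue:

	static void mydev_init_queue(request_queue_t *q)
	{
		/* installs the generic readahead default, among other things;
		 * mydev_make_request is the driver's make_request_fn,
		 * defined elsewhere */
		blk_queue_make_request(q, mydev_make_request);

		/* slow device: only want 32 kbytes of readahead */
		q->ra_pages = (32 * 1024) / PAGE_CACHE_SIZE;
	}
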
parent d878155c

@@ -219,6 +219,7 @@ int blk_ioctl(struct block_device *bdev, unsigned int cmd, unsigned long arg)
 	unsigned short usval;
 	kdev_t dev = to_kdev_t(bdev->bd_dev);
 	int holder;
+	unsigned long *ra_pages;
 
 	intval = block_ioctl(bdev, cmd, arg);
 	if (intval != -ENOTTY)
@@ -240,13 +241,21 @@ int blk_ioctl(struct block_device *bdev, unsigned int cmd, unsigned long arg)
 	case BLKFRASET:
 		if(!capable(CAP_SYS_ADMIN))
 			return -EACCES;
-		return blk_set_readahead(bdev, arg);
+		ra_pages = blk_get_ra_pages(dev);
+		if (ra_pages == NULL)
+			return -ENOTTY;
+		*ra_pages = (arg * 512) / PAGE_CACHE_SIZE;
+		return 0;
 	case BLKRAGET:
 	case BLKFRAGET:
 		if (!arg)
 			return -EINVAL;
-		return put_user(blk_get_readahead(bdev), (long *)arg);
+		ra_pages = blk_get_ra_pages(dev);
+		if (ra_pages == NULL)
+			return -ENOTTY;
+		return put_user((*ra_pages * PAGE_CACHE_SIZE) / 512,
+				(long *)arg);
 	case BLKSECTGET:
 		if ((q = blk_get_queue(dev)) == NULL)
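
For reference, the units visible to userspace do not change: BLKFRASET and
BLKFRAGET still speak 512-byte sectors, and the handler above converts to
and from ra_pages.  A minimal userspace sketch (not part of the patch;
/dev/hda is just an example device, and BLKFRASET needs CAP_SYS_ADMIN):

	#include <stdio.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/fs.h>		/* BLKFRASET, BLKFRAGET */

	int main(void)
	{
		long ra_sectors;
		int fd = open("/dev/hda", O_RDONLY);

		if (fd < 0)
			return 1;
		/* 256 sectors * 512 bytes = 128 kbytes of readahead */
		if (ioctl(fd, BLKFRASET, 256) == 0 &&
		    ioctl(fd, BLKFRAGET, &ra_sectors) == 0)
			printf("readahead: %ld sectors\n", ra_sectors);
		close(fd);
		return 0;
	}
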
@@ -109,46 +109,21 @@ inline request_queue_t *blk_get_queue(kdev_t dev)
 }
 
 /**
- * blk_set_readahead - set a queue's readahead tunable
+ * blk_get_ra_pages - get the address of a queue's readahead tunable
  * @dev: device
- * @sectors: readahead, in 512 byte sectors
  *
- * Returns zero on success, else negative errno
- */
-int blk_set_readahead(struct block_device *bdev, unsigned sectors)
-{
-	int ret = -EINVAL;
-	request_queue_t *q = blk_get_queue(to_kdev_t(bdev->bd_dev));
-
-	if (q) {
-		q->ra_sectors = sectors;
-		blk_put_queue(q);
-		ret = 0;
-	}
-	return ret;
-}
-
-/**
- * blk_get_readahead - query a queue's readahead tunable
- * @dev: device
- *
- * Locates the passed device's request queue and returns its
+ * Locates the passed device's request queue and returns the address of its
  * readahead setting.
  *
- * The returned value is in units of 512 byte sectors.
- *
- * Will return zero if the queue has never had its readahead
- * setting altered.
+ * Will return NULL if the request queue cannot be located.
  */
-unsigned blk_get_readahead(struct block_device *bdev)
+unsigned long *blk_get_ra_pages(kdev_t dev)
 {
-	unsigned ret = 0;
-	request_queue_t *q = blk_get_queue(to_kdev_t(bdev->bd_dev));
+	unsigned long *ret = NULL;
+	request_queue_t *q = blk_get_queue(dev);
 
-	if (q) {
-		ret = q->ra_sectors;
-		blk_put_queue(q);
-	}
+	if (q)
+		ret = &q->ra_pages;
 	return ret;
 }
@@ -187,7 +162,7 @@ void blk_queue_make_request(request_queue_t * q, make_request_fn * mfn)
 	q->max_phys_segments = MAX_PHYS_SEGMENTS;
 	q->max_hw_segments = MAX_HW_SEGMENTS;
 	q->make_request_fn = mfn;
-	q->ra_sectors = VM_MAX_READAHEAD << (10 - 9);	/* kbytes->sectors */
+	q->ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
 	blk_queue_max_sectors(q, MAX_SECTORS);
 	blk_queue_hardsect_size(q, 512);
@@ -1521,6 +1521,7 @@ static int device_size_calculation(mddev_t * mddev)
 	mdp_super_t *sb = mddev->sb;
 	struct list_head *tmp;
 	mdk_rdev_t *rdev;
+	unsigned long *ra_pages;
 
 	/*
 	 * Do device size calculation. Bail out if too small.
@@ -1577,7 +1578,10 @@ static int device_size_calculation(mddev_t * mddev)
 	if (!md_size[mdidx(mddev)])
 		md_size[mdidx(mddev)] = sb->size * data_disks;
 
-	readahead = (blk_get_readahead(rdev->bdev) * 512) / PAGE_SIZE;
+	readahead = (VM_MAX_READAHEAD * 1024) / PAGE_SIZE;
+	ra_pages = blk_get_ra_pages(rdev->dev);
+	if (ra_pages)
+		readahead = (*ra_pages * PAGE_CACHE_SIZE) / PAGE_SIZE;
 	if (!sb->level || (sb->level == 4) || (sb->level == 5)) {
 		readahead = (mddev->sb->chunk_size>>PAGE_SHIFT) * 4 * data_disks;
 		if (readahead < data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2)
@@ -349,6 +349,7 @@ struct block_device *bdget(dev_t dev)
 		struct inode *inode = new_inode(bd_mnt->mnt_sb);
 		if (inode) {
 			kdev_t kdev = to_kdev_t(dev);
+			unsigned long *ra_pages;
 			atomic_set(&new_bdev->bd_count,1);
 			new_bdev->bd_dev = dev;
 			new_bdev->bd_op = NULL;
@@ -360,6 +362,10 @@ struct block_device *bdget(dev_t dev)
 			inode->i_bdev = new_bdev;
 			inode->i_data.a_ops = &def_blk_aops;
 			inode->i_data.gfp_mask = GFP_USER;
+			ra_pages = blk_get_ra_pages(kdev);
+			if (ra_pages == NULL)
+				ra_pages = &default_ra_pages;
+			inode->i_data.ra_pages = ra_pages;
 			spin_lock(&bdev_lock);
 			bdev = bdfind(dev, head);
 			if (!bdev) {
@@ -109,6 +109,9 @@ static struct inode *alloc_inode(struct super_block *sb)
 		inode->i_data.host = inode;
 		inode->i_data.gfp_mask = GFP_HIGHUSER;
 		inode->i_mapping = &inode->i_data;
+		inode->i_data.ra_pages = &default_ra_pages;
+		if (sb->s_bdev)
+			inode->i_data.ra_pages = sb->s_bdev->bd_inode->i_mapping->ra_pages;
 		memset(&inode->u, 0, sizeof(inode->u));
 	}
 	return inode;
@@ -632,6 +632,7 @@ struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags)
 		goto cleanup_file;
 	}
 
+	f->f_ra.ra_pages = *inode->i_mapping->ra_pages;
 	f->f_dentry = dentry;
 	f->f_vfsmnt = mnt;
 	f->f_pos = 0;
@@ -150,9 +150,9 @@ struct request_queue
 
 	/*
 	 * The VM-level readahead tunable for this device.  In
-	 * units of 512-byte sectors.
+	 * units of PAGE_CACHE_SIZE pages.
 	 */
-	unsigned ra_sectors;
+	unsigned long ra_pages;
 
 	/*
 	 * The queue owner gets to use this for whatever they like.
@@ -310,8 +310,7 @@ extern void blk_queue_hardsect_size(request_queue_t *q, unsigned short);
 extern void blk_queue_segment_boundary(request_queue_t *q, unsigned long);
 extern void blk_queue_assign_lock(request_queue_t *q, spinlock_t *);
 extern void blk_queue_prep_rq(request_queue_t *q, prep_rq_fn *pfn);
-extern int blk_set_readahead(struct block_device *bdev, unsigned sectors);
-extern unsigned blk_get_readahead(struct block_device *bdev);
+extern unsigned long *blk_get_ra_pages(kdev_t kdev);
 extern int blk_rq_map_sg(request_queue_t *, struct request *, struct scatterlist *);
 extern void blk_dump_rq_flags(struct request *, char *);
@@ -398,6 +398,7 @@ struct address_space {
 	list_t			i_mmap_shared;	/* list of private mappings */
 	spinlock_t		i_shared_lock;	/* and spinlock protecting it */
 	int			gfp_mask;	/* how to allocate the pages */
+	unsigned long		*ra_pages;	/* device readahead */
 };
 
 struct char_device {
@@ -513,6 +514,7 @@ struct file_ra_state {
 	unsigned long prev_page;	/* Cache last read() position */
 	unsigned long ahead_start;	/* Ahead window */
 	unsigned long ahead_size;
+	unsigned long ra_pages;		/* Maximum readahead window */
 };
 
 struct file {
@@ -504,6 +504,7 @@ void do_page_cache_readahead(struct file *file,
 void page_cache_readahead(struct file *file, unsigned long offset);
 void page_cache_readaround(struct file *file, unsigned long offset);
 void handle_ra_thrashing(struct file *file);
+extern unsigned long default_ra_pages;
 
 /* vma is the first one with address < vma->vm_end,
  * and even address < vma->vm_start. Have to extend vma. */
@@ -12,39 +12,19 @@
 #include <linux/mm.h>
 #include <linux/blkdev.h>
 
-/*
- * The readahead logic manages two readahead windows.  The "current"
- * and the "ahead" windows.
- *
- * VM_MAX_READAHEAD specifies, in kilobytes, the maximum size of
- * each of the two windows.  So the amount of readahead which is
- * in front of the file pointer varies between VM_MAX_READAHEAD and
- * VM_MAX_READAHEAD * 2.
- *
- * VM_MAX_READAHEAD only applies if the underlying request queue
- * has a zero value of ra_sectors.
- */
+unsigned long default_ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
 
 /*
  * Return max readahead size for this inode in number-of-pages.
  */
-static int get_max_readahead(struct inode *inode)
+static inline unsigned long get_max_readahead(struct file *file)
 {
-	unsigned blk_ra_kbytes = 0;
-
-	if (inode->i_sb->s_bdev) {
-		blk_ra_kbytes = blk_get_readahead(inode->i_sb->s_bdev) / 2;
-	}
-	return blk_ra_kbytes >> (PAGE_CACHE_SHIFT - 10);
+	return file->f_ra.ra_pages;
 }
 
-static int get_min_readahead(struct inode *inode)
+static inline unsigned long get_min_readahead(struct file *file)
 {
-	int ret = VM_MIN_READAHEAD / PAGE_CACHE_SIZE;
-
-	if (ret < 2)
-		ret = 2;
-	return ret;
+	return (VM_MIN_READAHEAD * 1024) / PAGE_CACHE_SIZE;
 }
 
 /*
@@ -189,7 +169,6 @@ void do_page_cache_readahead(struct file *file,
  */
 void page_cache_readahead(struct file *file, unsigned long offset)
 {
-	struct inode *inode = file->f_dentry->d_inode->i_mapping->host;
 	struct file_ra_state *ra = &file->f_ra;
 	unsigned long max;
 	unsigned long min;
@@ -206,10 +185,10 @@ void page_cache_readahead(struct file *file, unsigned long offset)
 		goto out;
 	}
 
-	max = get_max_readahead(inode);
+	max = get_max_readahead(file);
 	if (max == 0)
 		goto out;	/* No readahead */
-	min = get_min_readahead(inode);
+	min = get_min_readahead(file);
 
 	if (ra->next_size == 0 && offset == 0) {
 		/*
@@ -309,9 +288,9 @@ void page_cache_readahead(struct file *file, unsigned long offset)
  */
 void page_cache_readaround(struct file *file, unsigned long offset)
 {
+	const unsigned long min = get_min_readahead(file) * 2;
 	unsigned long target;
 	unsigned long backward;
-	const int min = get_min_readahead(file->f_dentry->d_inode->i_mapping->host) * 2;
 
 	if (file->f_ra.next_size < min)
 		file->f_ra.next_size = min;
@@ -338,8 +317,7 @@ void page_cache_readaround(struct file *file, unsigned long offset)
  */
 void handle_ra_thrashing(struct file *file)
 {
-	struct address_space * mapping = file->f_dentry->d_inode->i_mapping;
-	const unsigned long min = get_min_readahead(mapping->host);
+	const unsigned long min = get_min_readahead(file);
 
 	file->f_ra.next_size -= 3;
 	if (file->f_ra.next_size < min)