Commit 00d6555e authored by Andrew Morton, committed by Linus Torvalds

[PATCH] readahead fix

Changes the way in which the readahead code locates the readahead
setting for the underlying device.

- struct block_device and struct address_space gain a *pointer* to the
  current readahead tunable.

- The tunable lives in the request queue and is altered with the
  traditional ioctl.

- The value gets *copied* into the struct file at open() time, so a
  fcntl() mode to modify it per-fd would be simple (see the sketch
  after this list).

- Filesystems which are not request_queue-backed get the address of the
  global `default_ra_pages'.  If we want, this can become a tunable.

- Filesystems are at liberty to alter address_space.ra_pages to point
  at some other fs-private default at new_inode/read_inode/alloc_inode
  time.

- The ra_pages pointer can become a structure pointer if, at some time
  in the future, high-level code needs more detailed information about
  device characteristics.

  In fact, it'll need to become a struct pointer for use by
  writeback: my current writeback code has the problem that multiple
  pdflush threads can get stuck on the same request queue.  That's a
  waste of resources.  I currently have a silly flag in the superblock
  to try to avoid this.

  The proper way to get this exclusion is for the high-level
  writeback code to be able to do a test-and-set against a
  per-request_queue flag.  That flag can live in a structure alongside
  ra_pages, conveniently accessible at the pagemap level.
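
As a concrete illustration of the plumbing described above, here is a
minimal, self-contained userspace model.  This is not kernel code: the
structures are simplified stand-ins, and only the ra_pages naming and
the copy-at-open behaviour mirror the patch.

    #include <stdio.h>

    /* Simplified stand-ins for the kernel structures touched by this patch. */
    struct request_queue { unsigned long ra_pages; };    /* the tunable lives here, in pages */
    struct address_space { unsigned long *ra_pages; };   /* points at the tunable (or a default) */
    struct file { unsigned long ra_pages; };              /* per-fd copy, taken at open() time */

    static unsigned long default_ra_pages = 32;   /* fallback for non-queue-backed filesystems */

    int main(void)
    {
        struct request_queue q = { 32 };

        /* bdget()/alloc_inode(): the mapping points at the queue's tunable,
           or at default_ra_pages when no request queue backs the fs. */
        struct address_space mapping = { &q.ra_pages };

        /* dentry_open(): the value is copied into the struct file, so a
           per-fd fcntl() could later adjust f.ra_pages independently. */
        struct file f = { *mapping.ra_pages };

        /* A BLKFRASET-style change hits the queue; the open file keeps its copy. */
        q.ra_pages = 64;

        printf("queue=%lu mapping=%lu file=%lu default=%lu\n",
               q.ra_pages, *mapping.ra_pages, f.ra_pages, default_ra_pages);
        return 0;
    }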

One thing still to be done is to go through all callers of blk_init_queue
and blk_queue_make_request and make sure that they're setting up a
sensible default.  ATA wants 248 sectors, and floppy drives don't want
128 kbytes, I suspect.  Later.
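
For reference, the unit conversions involved are simple.  The little
userspace sketch below (which assumes 4096-byte pages and a 128 kbyte
VM_MAX_READAHEAD; both are assumptions of the example) shows how the
sector- and kilobyte-based numbers above map onto the page-based tunable:

    #include <stdio.h>

    #define PAGE_CACHE_SIZE 4096UL    /* assumed page size for this example */

    /* BLKRASET/BLKFRASET arguments are 512-byte sectors; the queue stores pages. */
    static unsigned long sectors_to_ra_pages(unsigned long sectors)
    {
        return (sectors * 512) / PAGE_CACHE_SIZE;
    }

    /* Defaults expressed in kilobytes (VM_MAX_READAHEAD) convert like this. */
    static unsigned long kbytes_to_ra_pages(unsigned long kbytes)
    {
        return (kbytes * 1024) / PAGE_CACHE_SIZE;
    }

    int main(void)
    {
        /* 128 kbytes -> 32 pages: the default set up by blk_queue_make_request(). */
        printf("128 kbytes  = %lu pages\n", kbytes_to_ra_pages(128));

        /* 248 sectors -> 31 pages: the sort of limit ATA wants. */
        printf("248 sectors = %lu pages\n", sectors_to_ra_pages(248));
        return 0;
    }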
parent d878155c
@@ -219,6 +219,7 @@ int blk_ioctl(struct block_device *bdev, unsigned int cmd, unsigned long arg)
unsigned short usval;
kdev_t dev = to_kdev_t(bdev->bd_dev);
int holder;
unsigned long *ra_pages;
intval = block_ioctl(bdev, cmd, arg);
if (intval != -ENOTTY)
@@ -240,13 +241,21 @@ int blk_ioctl(struct block_device *bdev, unsigned int cmd, unsigned long arg)
case BLKFRASET:
if(!capable(CAP_SYS_ADMIN))
return -EACCES;
return blk_set_readahead(bdev, arg);
ra_pages = blk_get_ra_pages(dev);
if (ra_pages == NULL)
return -ENOTTY;
*ra_pages = (arg * 512) / PAGE_CACHE_SIZE;
return 0;
case BLKRAGET:
case BLKFRAGET:
if (!arg)
return -EINVAL;
return put_user(blk_get_readahead(bdev), (long *)arg);
ra_pages = blk_get_ra_pages(dev);
if (ra_pages == NULL)
return -ENOTTY;
return put_user((*ra_pages * PAGE_CACHE_SIZE) / 512,
(long *)arg);
case BLKSECTGET:
if ((q = blk_get_queue(dev)) == NULL)
......
@@ -109,46 +109,21 @@ inline request_queue_t *blk_get_queue(kdev_t dev)
}
/**
* blk_set_readahead - set a queue's readahead tunable
* blk_get_ra_pages - get the address of a queue's readahead tunable
* @dev: device
* @sectors: readahead, in 512 byte sectors
*
* Returns zero on success, else negative errno
*/
int blk_set_readahead(struct block_device *bdev, unsigned sectors)
{
int ret = -EINVAL;
request_queue_t *q = blk_get_queue(to_kdev_t(bdev->bd_dev));
if (q) {
q->ra_sectors = sectors;
blk_put_queue(q);
ret = 0;
}
return ret;
}
/**
* blk_get_readahead - query a queue's readahead tunable
* @dev: device
*
* Locates the passed device's request queue and returns its
* Locates the passed device's request queue and returns the address of its
* readahead setting.
*
* The returned value is in units of 512 byte sectors.
*
* Will return zero if the queue has never had its readahead
* setting altered.
* Will return NULL if the request queue cannot be located.
*/
unsigned blk_get_readahead(struct block_device *bdev)
unsigned long *blk_get_ra_pages(kdev_t dev)
{
unsigned ret = 0;
request_queue_t *q = blk_get_queue(to_kdev_t(bdev->bd_dev));
unsigned long *ret = NULL;
request_queue_t *q = blk_get_queue(dev);
if (q) {
ret = q->ra_sectors;
blk_put_queue(q);
}
if (q)
ret = &q->ra_pages;
return ret;
}
@@ -187,7 +162,7 @@ void blk_queue_make_request(request_queue_t * q, make_request_fn * mfn)
q->max_phys_segments = MAX_PHYS_SEGMENTS;
q->max_hw_segments = MAX_HW_SEGMENTS;
q->make_request_fn = mfn;
q->ra_sectors = VM_MAX_READAHEAD << (10 - 9); /* kbytes->sectors */
q->ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
blk_queue_max_sectors(q, MAX_SECTORS);
blk_queue_hardsect_size(q, 512);
......
@@ -1521,6 +1521,7 @@ static int device_size_calculation(mddev_t * mddev)
mdp_super_t *sb = mddev->sb;
struct list_head *tmp;
mdk_rdev_t *rdev;
unsigned long *ra_pages;
/*
* Do device size calculation. Bail out if too small.
@@ -1577,7 +1578,10 @@ static int device_size_calculation(mddev_t * mddev)
if (!md_size[mdidx(mddev)])
md_size[mdidx(mddev)] = sb->size * data_disks;
readahead = (blk_get_readahead(rdev->bdev) * 512) / PAGE_SIZE;
readahead = (VM_MAX_READAHEAD * 1024) / PAGE_SIZE;
ra_pages = blk_get_ra_pages(rdev->dev);
if (ra_pages)
readahead = (*ra_pages * PAGE_CACHE_SIZE) / PAGE_SIZE;
if (!sb->level || (sb->level == 4) || (sb->level == 5)) {
readahead = (mddev->sb->chunk_size>>PAGE_SHIFT) * 4 * data_disks;
if (readahead < data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2)
......
@@ -349,6 +349,8 @@ struct block_device *bdget(dev_t dev)
struct inode *inode = new_inode(bd_mnt->mnt_sb);
if (inode) {
kdev_t kdev = to_kdev_t(dev);
unsigned long *ra_pages;
atomic_set(&new_bdev->bd_count,1);
new_bdev->bd_dev = dev;
new_bdev->bd_op = NULL;
@@ -360,6 +362,10 @@ struct block_device *bdget(dev_t dev)
inode->i_bdev = new_bdev;
inode->i_data.a_ops = &def_blk_aops;
inode->i_data.gfp_mask = GFP_USER;
ra_pages = blk_get_ra_pages(kdev);
if (ra_pages == NULL)
ra_pages = &default_ra_pages;
inode->i_data.ra_pages = ra_pages;
spin_lock(&bdev_lock);
bdev = bdfind(dev, head);
if (!bdev) {
......
@@ -109,6 +109,9 @@ static struct inode *alloc_inode(struct super_block *sb)
inode->i_data.host = inode;
inode->i_data.gfp_mask = GFP_HIGHUSER;
inode->i_mapping = &inode->i_data;
inode->i_data.ra_pages = &default_ra_pages;
if (sb->s_bdev)
inode->i_data.ra_pages = sb->s_bdev->bd_inode->i_mapping->ra_pages;
memset(&inode->u, 0, sizeof(inode->u));
}
return inode;
......
@@ -632,6 +632,7 @@ struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags)
goto cleanup_file;
}
f->f_ra.ra_pages = *inode->i_mapping->ra_pages;
f->f_dentry = dentry;
f->f_vfsmnt = mnt;
f->f_pos = 0;
......
@@ -150,9 +150,9 @@ struct request_queue
/*
* The VM-level readahead tunable for this device. In
* units of 512-byte sectors.
* units of PAGE_CACHE_SIZE pages.
*/
unsigned ra_sectors;
unsigned long ra_pages;
/*
* The queue owner gets to use this for whatever they like.
@@ -310,8 +310,7 @@ extern void blk_queue_hardsect_size(request_queue_t *q, unsigned short);
extern void blk_queue_segment_boundary(request_queue_t *q, unsigned long);
extern void blk_queue_assign_lock(request_queue_t *q, spinlock_t *);
extern void blk_queue_prep_rq(request_queue_t *q, prep_rq_fn *pfn);
extern int blk_set_readahead(struct block_device *bdev, unsigned sectors);
extern unsigned blk_get_readahead(struct block_device *bdev);
extern unsigned long *blk_get_ra_pages(kdev_t kdev);
extern int blk_rq_map_sg(request_queue_t *, struct request *, struct scatterlist *);
extern void blk_dump_rq_flags(struct request *, char *);
......
@@ -398,6 +398,7 @@ struct address_space {
list_t i_mmap_shared; /* list of private mappings */
spinlock_t i_shared_lock; /* and spinlock protecting it */
int gfp_mask; /* how to allocate the pages */
unsigned long *ra_pages; /* device readahead */
};
struct char_device {
@@ -513,6 +514,7 @@ struct file_ra_state {
unsigned long prev_page; /* Cache last read() position */
unsigned long ahead_start; /* Ahead window */
unsigned long ahead_size;
unsigned long ra_pages; /* Maximum readahead window */
};
struct file {
......
@@ -504,6 +504,7 @@ void do_page_cache_readahead(struct file *file,
void page_cache_readahead(struct file *file, unsigned long offset);
void page_cache_readaround(struct file *file, unsigned long offset);
void handle_ra_thrashing(struct file *file);
extern unsigned long default_ra_pages;
/* vma is the first one with address < vma->vm_end,
* and even address < vma->vm_start. Have to extend vma. */
......
@@ -12,39 +12,19 @@
#include <linux/mm.h>
#include <linux/blkdev.h>
/*
* The readahead logic manages two readahead windows. The "current"
* and the "ahead" windows.
*
* VM_MAX_READAHEAD specifies, in kilobytes, the maximum size of
* each of the two windows. So the amount of readahead which is
* in front of the file pointer varies between VM_MAX_READAHEAD and
* VM_MAX_READAHEAD * 2.
*
* VM_MAX_READAHEAD only applies if the underlying request queue
* has a zero value of ra_sectors.
*/
unsigned long default_ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
/*
* Return max readahead size for this inode in number-of-pages.
*/
static int get_max_readahead(struct inode *inode)
static inline unsigned long get_max_readahead(struct file *file)
{
unsigned blk_ra_kbytes = 0;
if (inode->i_sb->s_bdev) {
blk_ra_kbytes = blk_get_readahead(inode->i_sb->s_bdev) / 2;
}
return blk_ra_kbytes >> (PAGE_CACHE_SHIFT - 10);
return file->f_ra.ra_pages;
}
static int get_min_readahead(struct inode *inode)
static inline unsigned long get_min_readahead(struct file *file)
{
int ret = VM_MIN_READAHEAD / PAGE_CACHE_SIZE;
if (ret < 2)
ret = 2;
return ret;
return (VM_MIN_READAHEAD * 1024) / PAGE_CACHE_SIZE;
}
/*
@@ -189,7 +169,6 @@ void do_page_cache_readahead(struct file *file,
*/
void page_cache_readahead(struct file *file, unsigned long offset)
{
struct inode *inode = file->f_dentry->d_inode->i_mapping->host;
struct file_ra_state *ra = &file->f_ra;
unsigned long max;
unsigned long min;
@@ -206,10 +185,10 @@ void page_cache_readahead(struct file *file, unsigned long offset)
goto out;
}
max = get_max_readahead(inode);
max = get_max_readahead(file);
if (max == 0)
goto out; /* No readahead */
min = get_min_readahead(inode);
min = get_min_readahead(file);
if (ra->next_size == 0 && offset == 0) {
/*
@@ -309,9 +288,9 @@ void page_cache_readahead(struct file *file, unsigned long offset)
*/
void page_cache_readaround(struct file *file, unsigned long offset)
{
const unsigned long min = get_min_readahead(file) * 2;
unsigned long target;
unsigned long backward;
const int min = get_min_readahead(file->f_dentry->d_inode->i_mapping->host) * 2;
if (file->f_ra.next_size < min)
file->f_ra.next_size = min;
@@ -338,8 +317,7 @@ void page_cache_readaround(struct file *file, unsigned long offset)
*/
void handle_ra_thrashing(struct file *file)
{
struct address_space * mapping = file->f_dentry->d_inode->i_mapping;
const unsigned long min = get_min_readahead(mapping->host);
const unsigned long min = get_min_readahead(file);
file->f_ra.next_size -= 3;
if (file->f_ra.next_size < min)
......
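
Seen from userspace nothing changes: the readahead ioctls still speak
512-byte sectors, and the kernel now converts to pages internally.  A
small sketch of how a tool might exercise the interface (error handling
trimmed; /dev/hda is just an example device):

    #include <stdio.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/fs.h>            /* BLKRAGET / BLKRASET */

    int main(void)
    {
        long ra;                                 /* readahead, in 512-byte sectors */
        int fd = open("/dev/hda", O_RDONLY);     /* example device */

        if (fd < 0)
            return 1;

        /* Query: the kernel converts q->ra_pages back into sectors. */
        if (ioctl(fd, BLKRAGET, &ra) == 0)
            printf("readahead: %ld sectors\n", ra);

        /* Set 256 sectors (128 kbytes); stored as pages in the request queue.
           Needs CAP_SYS_ADMIN. */
        ioctl(fd, BLKRASET, 256);

        close(fd);
        return 0;
    }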