Commit a83638a4 authored by Andrew Morton's avatar Andrew Morton Committed by Linus Torvalds

[PATCH] readv/writev speedup

This is Janet Morgan's patch which converts the readv/writev code
to submit all segments for IO before waiting on them, rather than
submitting each segment separately.

This is a critical performance fix for O_DIRECT reads and writes.
Prior to this change, O_DIRECT vectored IO was forced to wait for
completion against each segment of the iovec rather than submitting all
segments and waiting on the lot.  ie: for ten segments, this code will
be ten times faster.

There will also be moderate improvements for buffered IO - smaller code
paths, plus writev() only takes i_sem once.

The patch ended up quite large unfortunately - turned out that the only
sane way to implement this without duplicating significant amounts of
code (the generic_file_write() bounds checking, all the O_DIRECT
handling, etc) was to redo generic_file_read() and generic_file_write()
to take an iovec/nr_segs pair rather than `buf, count'.

New exported functions generic_file_readv() and generic_file_writev()
have been added:

ssize_t generic_file_readv(struct file *filp, const struct iovec *iov,
                          unsigned long nr_segs, loff_t *ppos);
ssize_t generic_file_writev(struct file *file, const struct iovec *iov,
                          unsigned long nr_segs, loff_t * ppos);

If a driver does not use these in their file_operations then they will
continue to use the old readv/writev code, which sits in a loop calling
fops->read() or fops->write().

ext2, ext3, JFS and the blockdev driver are currently using this
capability.

Some coding cleanups were made in fs/read_write.c.  Mainly:

- pass "READ" or "WRITE" around to indicate the direction of the
  operation, rather than the (confusing, inverted)
  VERIFY_READ/VERIFY_WRITE.

- Use the identifier `nr_segs' everywhere to indicate the iovec
  length rather than `count', which is often used to indicate the
  number of bytes in the syscall.  It was confusing the heck out of me.

- Some cleanups to the raw driver.

- Some additional generality in fs/direct_io.c: the core `struct dio'
  used to be a "populate-and-go" thing.  Janet has broken that up so
  you can initialise a struct dio once, then loop around feeding it
  more file segments, then wait on completion against everything.

- In a couple of places we needed to handle the situation where we
  knew, a-priori, that the user was going to get a short read or write.
  File size limit exceeded, read past i_size, etc.  We handled that by
  shortening the iovec in-place with iov_shorten().  Which is not
  particularly pretty, but neither were the alternatives.
parent d8fcce3f
...@@ -201,25 +201,29 @@ raw_ctl_ioctl(struct inode *inode, struct file *filp, ...@@ -201,25 +201,29 @@ raw_ctl_ioctl(struct inode *inode, struct file *filp,
} }
static ssize_t static ssize_t
rw_raw_dev(int rw, struct file *filp, char *buf, size_t size, loff_t *offp) rw_raw_dev(int rw, struct file *filp, const struct iovec *iov, unsigned long nr_segs, loff_t *offp)
{ {
const int minor = minor(filp->f_dentry->d_inode->i_rdev); const int minor = minor(filp->f_dentry->d_inode->i_rdev);
struct block_device *bdev = raw_devices[minor].binding; struct block_device *bdev = raw_devices[minor].binding;
struct inode *inode = bdev->bd_inode; struct inode *inode = bdev->bd_inode;
size_t count = iov_length(iov, nr_segs);
ssize_t ret = 0; ssize_t ret = 0;
if (size == 0) if (count == 0)
goto out; goto out;
ret = -EINVAL;
if (size < 0) if ((ssize_t)count < 0)
goto out; return -EINVAL;
ret = -ENXIO;
if (*offp >= inode->i_size) if (*offp >= inode->i_size)
goto out; return -ENXIO;
if (count + *offp > inode->i_size) {
count = inode->i_size - *offp;
nr_segs = iov_shorten((struct iovec *)iov, nr_segs, count);
}
ret = generic_file_direct_IO(rw, inode, iov, *offp, nr_segs);
if (size + *offp > inode->i_size)
size = inode->i_size - *offp;
ret = generic_file_direct_IO(rw, inode, buf, *offp, size);
if (ret > 0) if (ret > 0)
*offp += ret; *offp += ret;
out: out:
...@@ -227,15 +231,31 @@ rw_raw_dev(int rw, struct file *filp, char *buf, size_t size, loff_t *offp) ...@@ -227,15 +231,31 @@ rw_raw_dev(int rw, struct file *filp, char *buf, size_t size, loff_t *offp)
} }
static ssize_t static ssize_t
raw_read(struct file *filp, char * buf, size_t size, loff_t *offp) raw_read(struct file *filp, char *buf, size_t size, loff_t *offp)
{ {
return rw_raw_dev(READ, filp, buf, size, offp); struct iovec local_iov = { .iov_base = buf, .iov_len = size};
return rw_raw_dev(READ, filp, &local_iov, 1, offp);
} }
static ssize_t static ssize_t
raw_write(struct file *filp, const char *buf, size_t size, loff_t *offp) raw_write(struct file *filp, const char *buf, size_t size, loff_t *offp)
{ {
return rw_raw_dev(WRITE, filp, (char *)buf, size, offp); struct iovec local_iov = { .iov_base = buf, .iov_len = size};
return rw_raw_dev(WRITE, filp, &local_iov, 1, offp);
}
/*
 * Vectored read entry point for the raw device: hands the caller's
 * iovec array straight to rw_raw_dev() with the direction set to READ
 * and returns whatever rw_raw_dev() returns.
 */
static ssize_t
raw_readv(struct file *filp, const struct iovec *iov, unsigned long nr_segs, loff_t *offp)
{
	ssize_t nread;

	nread = rw_raw_dev(READ, filp, iov, nr_segs, offp);
	return nread;
}
static ssize_t
raw_writev(struct file *filp, const struct iovec *iov, unsigned long nr_segs, loff_t *offp)
{
return rw_raw_dev(WRITE, filp, iov, nr_segs, offp);
} }
static struct file_operations raw_fops = { static struct file_operations raw_fops = {
...@@ -244,6 +264,8 @@ static struct file_operations raw_fops = { ...@@ -244,6 +264,8 @@ static struct file_operations raw_fops = {
.open = raw_open, .open = raw_open,
.release= raw_release, .release= raw_release,
.ioctl = raw_ioctl, .ioctl = raw_ioctl,
.readv = raw_readv,
.writev = raw_writev,
.owner = THIS_MODULE, .owner = THIS_MODULE,
}; };
......
...@@ -116,11 +116,11 @@ blkdev_get_blocks(struct inode *inode, sector_t iblock, ...@@ -116,11 +116,11 @@ blkdev_get_blocks(struct inode *inode, sector_t iblock,
} }
static int static int
blkdev_direct_IO(int rw, struct inode *inode, char *buf, blkdev_direct_IO(int rw, struct inode *inode, const struct iovec *iov,
loff_t offset, size_t count) loff_t offset, unsigned long nr_segs)
{ {
return generic_direct_IO(rw, inode, buf, offset, return generic_direct_IO(rw, inode, iov, offset,
count, blkdev_get_blocks); nr_segs, blkdev_get_blocks);
} }
static int blkdev_writepage(struct page * page) static int blkdev_writepage(struct page * page)
...@@ -787,6 +787,14 @@ static int blkdev_reread_part(struct block_device *bdev) ...@@ -787,6 +787,14 @@ static int blkdev_reread_part(struct block_device *bdev)
return res; return res;
} }
/*
 * write() entry point for block devices.  Wraps the caller's flat
 * buffer in a one-element iovec and forwards it to
 * generic_file_write_nolock(), returning that function's result.
 */
static ssize_t blkdev_file_write(struct file *file, const char *buf,
				 size_t count, loff_t *ppos)
{
	struct iovec single_seg;

	/* The iovec API takes a non-const base pointer, hence the cast. */
	single_seg.iov_base = (void *)buf;
	single_seg.iov_len = count;

	return generic_file_write_nolock(file, &single_seg, 1, ppos);
}
static int blkdev_ioctl(struct inode *inode, struct file *file, unsigned cmd, static int blkdev_ioctl(struct inode *inode, struct file *file, unsigned cmd,
unsigned long arg) unsigned long arg)
{ {
...@@ -832,26 +840,28 @@ static int blkdev_ioctl(struct inode *inode, struct file *file, unsigned cmd, ...@@ -832,26 +840,28 @@ static int blkdev_ioctl(struct inode *inode, struct file *file, unsigned cmd,
} }
struct address_space_operations def_blk_aops = { struct address_space_operations def_blk_aops = {
readpage: blkdev_readpage, .readpage = blkdev_readpage,
writepage: blkdev_writepage, .writepage = blkdev_writepage,
sync_page: block_sync_page, .sync_page = block_sync_page,
prepare_write: blkdev_prepare_write, .prepare_write = blkdev_prepare_write,
commit_write: blkdev_commit_write, .commit_write = blkdev_commit_write,
writepages: generic_writepages, .writepages = generic_writepages,
vm_writeback: generic_vm_writeback, .vm_writeback = generic_vm_writeback,
direct_IO: blkdev_direct_IO, .direct_IO = blkdev_direct_IO,
}; };
struct file_operations def_blk_fops = { struct file_operations def_blk_fops = {
open: blkdev_open, .open = blkdev_open,
release: blkdev_close, .release = blkdev_close,
llseek: block_llseek, .llseek = block_llseek,
read: generic_file_read, .read = generic_file_read,
write: generic_file_write_nolock, .write = blkdev_file_write,
mmap: generic_file_mmap, .mmap = generic_file_mmap,
fsync: block_fsync, .fsync = block_fsync,
ioctl: blkdev_ioctl, .ioctl = blkdev_ioctl,
sendfile: generic_file_sendfile, .readv = generic_file_readv,
.writev = generic_file_writev,
.sendfile = generic_file_sendfile,
}; };
int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg) int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg)
......
...@@ -75,7 +75,7 @@ struct dio { ...@@ -75,7 +75,7 @@ struct dio {
*/ */
static inline unsigned dio_pages_present(struct dio *dio) static inline unsigned dio_pages_present(struct dio *dio)
{ {
return dio->head - dio->tail; return dio->tail - dio->head;
} }
/* /*
...@@ -265,6 +265,10 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio) ...@@ -265,6 +265,10 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio)
static int dio_await_completion(struct dio *dio) static int dio_await_completion(struct dio *dio)
{ {
int ret = 0; int ret = 0;
if (dio->bio)
dio_bio_submit(dio);
while (atomic_read(&dio->bio_count)) { while (atomic_read(&dio->bio_count)) {
struct bio *bio = dio_await_one(dio); struct bio *bio = dio_await_one(dio);
int ret2; int ret2;
...@@ -523,29 +527,16 @@ int do_direct_IO(struct dio *dio) ...@@ -523,29 +527,16 @@ int do_direct_IO(struct dio *dio)
return ret; return ret;
} }
/*
* The main direct-IO function. This is a library function for use by
* filesystem drivers.
*/
int int
generic_direct_IO(int rw, struct inode *inode, char *buf, loff_t offset, direct_io_worker(int rw, struct inode *inode, const struct iovec *iov,
size_t count, get_blocks_t get_blocks) loff_t offset, unsigned long nr_segs, get_blocks_t get_blocks)
{ {
const unsigned blkbits = inode->i_blkbits; const unsigned blkbits = inode->i_blkbits;
const unsigned blocksize_mask = (1 << blkbits) - 1; unsigned long user_addr;
const unsigned long user_addr = (unsigned long)buf; int seg, ret2, ret = 0;
int ret;
int ret2;
struct dio dio; struct dio dio;
size_t bytes; size_t bytes, tot_bytes = 0;
/* Check the memory alignment. Blocks cannot straddle pages */
if ((user_addr & blocksize_mask) || (count & blocksize_mask)) {
ret = -EINVAL;
goto out;
}
/* BIO submission state */
dio.bio = NULL; dio.bio = NULL;
dio.bvec = NULL; dio.bvec = NULL;
dio.inode = inode; dio.inode = inode;
...@@ -553,68 +544,87 @@ generic_direct_IO(int rw, struct inode *inode, char *buf, loff_t offset, ...@@ -553,68 +544,87 @@ generic_direct_IO(int rw, struct inode *inode, char *buf, loff_t offset,
dio.blkbits = blkbits; dio.blkbits = blkbits;
dio.block_in_file = offset >> blkbits; dio.block_in_file = offset >> blkbits;
dio.blocks_available = 0; dio.blocks_available = 0;
dio.final_block_in_request = (offset + count) >> blkbits;
/* Index into the first page of the first block */
dio.first_block_in_page = (user_addr & (PAGE_SIZE - 1)) >> blkbits;
dio.boundary = 0; dio.boundary = 0;
dio.reap_counter = 0; dio.reap_counter = 0;
dio.get_blocks = get_blocks; dio.get_blocks = get_blocks;
dio.last_block_in_bio = -1; dio.last_block_in_bio = -1;
dio.next_block_in_bio = -1; dio.next_block_in_bio = -1;
dio.page_errors = 0;
/* BIO completion state */
atomic_set(&dio.bio_count, 0);
spin_lock_init(&dio.bio_list_lock);
dio.bio_list = NULL;
dio.waiter = NULL;
for (seg = 0; seg < nr_segs; seg++) {
user_addr = (unsigned long)iov[seg].iov_base;
bytes = iov[seg].iov_len;
/* Index into the first page of the first block */
dio.first_block_in_page = (user_addr & (PAGE_SIZE - 1)) >> blkbits;
dio.final_block_in_request = dio.block_in_file + (bytes >> blkbits);
/* Page fetching state */ /* Page fetching state */
dio.head = 0;
dio.tail = 0;
dio.curr_page = 0; dio.curr_page = 0;
bytes = count;
dio.total_pages = 0; dio.total_pages = 0;
if (user_addr & (PAGE_SIZE - 1)) { if (user_addr & (PAGE_SIZE-1)) {
dio.total_pages++; dio.total_pages++;
bytes -= PAGE_SIZE - (user_addr & (PAGE_SIZE - 1)); bytes -= PAGE_SIZE - (user_addr & (PAGE_SIZE - 1));
} }
dio.total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE; dio.total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
dio.curr_user_address = user_addr; dio.curr_user_address = user_addr;
/* Page queue */
dio.head = 0;
dio.tail = 0;
dio.page_errors = 0;
/* BIO completion state */
atomic_set(&dio.bio_count, 0);
spin_lock_init(&dio.bio_list_lock);
dio.bio_list = NULL;
dio.waiter = NULL;
ret = do_direct_IO(&dio); ret = do_direct_IO(&dio);
if (dio.bio) if (ret) {
dio_bio_submit(&dio);
if (ret)
dio_cleanup(&dio); dio_cleanup(&dio);
break;
}
tot_bytes += iov[seg].iov_len - ((dio.final_block_in_request -
dio.block_in_file) << blkbits);
} /* end iovec loop */
ret2 = dio_await_completion(&dio); ret2 = dio_await_completion(&dio);
if (ret == 0) if (ret == 0)
ret = ret2; ret = ret2;
if (ret == 0) if (ret == 0)
ret = dio.page_errors; ret = dio.page_errors;
if (ret == 0) if (ret == 0)
ret = count - ((dio.final_block_in_request - ret = tot_bytes;
dio.block_in_file) << blkbits);
out:
return ret; return ret;
} }
ssize_t /*
generic_file_direct_IO(int rw, struct inode *inode, char *buf, * This is a library function for use by filesystem drivers.
loff_t offset, size_t count) */
int
generic_direct_IO(int rw, struct inode *inode, const struct iovec *iov,
loff_t offset, unsigned long nr_segs, get_blocks_t get_blocks)
{ {
int seg;
size_t size;
unsigned long addr;
struct address_space *mapping = inode->i_mapping; struct address_space *mapping = inode->i_mapping;
unsigned blocksize_mask; unsigned blocksize_mask = (1 << inode->i_blkbits) - 1;
ssize_t retval; ssize_t retval = -EINVAL;
blocksize_mask = (1 << inode->i_blkbits) - 1; if (offset & blocksize_mask) {
if ((offset & blocksize_mask) || (count & blocksize_mask)) { goto out;
retval = -EINVAL; }
/* Check the memory alignment. Blocks cannot straddle pages */
for (seg = 0; seg < nr_segs; seg++) {
addr = (unsigned long)iov[seg].iov_base;
size = iov[seg].iov_len;
if ((addr & blocksize_mask) || (size & blocksize_mask))
goto out; goto out;
} }
...@@ -625,9 +635,21 @@ generic_file_direct_IO(int rw, struct inode *inode, char *buf, ...@@ -625,9 +635,21 @@ generic_file_direct_IO(int rw, struct inode *inode, char *buf,
if (retval) if (retval)
goto out; goto out;
} }
retval = mapping->a_ops->direct_IO(rw, inode, buf, offset, count);
retval = direct_io_worker(rw, inode, iov, offset, nr_segs, get_blocks);
out:
return retval;
}
ssize_t
generic_file_direct_IO(int rw, struct inode *inode, const struct iovec *iov,
loff_t offset, unsigned long nr_segs)
{
struct address_space *mapping = inode->i_mapping;
ssize_t retval;
retval = mapping->a_ops->direct_IO(rw, inode, iov, offset, nr_segs);
if (inode->i_mapping->nrpages) if (inode->i_mapping->nrpages)
invalidate_inode_pages2(inode->i_mapping); invalidate_inode_pages2(inode->i_mapping);
out:
return retval; return retval;
} }
...@@ -46,6 +46,8 @@ struct file_operations ext2_file_operations = { ...@@ -46,6 +46,8 @@ struct file_operations ext2_file_operations = {
.open = generic_file_open, .open = generic_file_open,
.release = ext2_release_file, .release = ext2_release_file,
.fsync = ext2_sync_file, .fsync = ext2_sync_file,
.readv = generic_file_readv,
.writev = generic_file_writev,
.sendfile = generic_file_sendfile, .sendfile = generic_file_sendfile,
}; };
......
...@@ -619,11 +619,11 @@ ext2_get_blocks(struct inode *inode, sector_t iblock, unsigned long max_blocks, ...@@ -619,11 +619,11 @@ ext2_get_blocks(struct inode *inode, sector_t iblock, unsigned long max_blocks,
} }
static int static int
ext2_direct_IO(int rw, struct inode *inode, char *buf, ext2_direct_IO(int rw, struct inode *inode, const struct iovec *iov,
loff_t offset, size_t count) loff_t offset, unsigned long nr_segs)
{ {
return generic_direct_IO(rw, inode, buf, return generic_direct_IO(rw, inode, iov,
offset, count, ext2_get_blocks); offset, nr_segs, ext2_get_blocks);
} }
static int static int
......
...@@ -76,19 +76,21 @@ ext3_file_write(struct file *file, const char *buf, size_t count, loff_t *ppos) ...@@ -76,19 +76,21 @@ ext3_file_write(struct file *file, const char *buf, size_t count, loff_t *ppos)
} }
struct file_operations ext3_file_operations = { struct file_operations ext3_file_operations = {
.llseek = generic_file_llseek, /* BKL held */ .llseek = generic_file_llseek,
.read = generic_file_read, /* BKL not held. Don't need */ .read = generic_file_read,
.write = ext3_file_write, /* BKL not held. Don't need */ .write = ext3_file_write,
.ioctl = ext3_ioctl, /* BKL held */ .readv = generic_file_readv,
.writev = generic_file_writev,
.ioctl = ext3_ioctl,
.mmap = generic_file_mmap, .mmap = generic_file_mmap,
.open = ext3_open_file, /* BKL not held. Don't need */ .open = ext3_open_file,
.release = ext3_release_file, /* BKL not held. Don't need */ .release = ext3_release_file,
.fsync = ext3_sync_file, /* BKL held */ .fsync = ext3_sync_file,
.sendfile = generic_file_sendfile, /* BKL not held. Don't need */ .sendfile = generic_file_sendfile,
}; };
struct inode_operations ext3_file_inode_operations = { struct inode_operations ext3_file_inode_operations = {
.truncate = ext3_truncate, /* BKL held */ .truncate = ext3_truncate,
.setattr = ext3_setattr, /* BKL held */ .setattr = ext3_setattr,
}; };
...@@ -1399,13 +1399,15 @@ static int ext3_releasepage(struct page *page, int wait) ...@@ -1399,13 +1399,15 @@ static int ext3_releasepage(struct page *page, int wait)
* If the O_DIRECT write is instantiating holes inside i_size and the machine * If the O_DIRECT write is instantiating holes inside i_size and the machine
* crashes then stale disk data _may_ be exposed inside the file. * crashes then stale disk data _may_ be exposed inside the file.
*/ */
static int ext3_direct_IO(int rw, struct inode *inode, char *buf, static int ext3_direct_IO(int rw, struct inode *inode,
loff_t offset, size_t count) const struct iovec *iov, loff_t offset,
unsigned long nr_segs)
{ {
struct ext3_inode_info *ei = EXT3_I(inode); struct ext3_inode_info *ei = EXT3_I(inode);
handle_t *handle = NULL; handle_t *handle = NULL;
int ret; int ret;
int orphan = 0; int orphan = 0;
size_t count = iov_length(iov, nr_segs);
if (rw == WRITE) { if (rw == WRITE) {
loff_t final_size = offset + count; loff_t final_size = offset + count;
...@@ -1428,8 +1430,8 @@ static int ext3_direct_IO(int rw, struct inode *inode, char *buf, ...@@ -1428,8 +1430,8 @@ static int ext3_direct_IO(int rw, struct inode *inode, char *buf,
} }
} }
ret = generic_direct_IO(rw, inode, buf, offset, ret = generic_direct_IO(rw, inode, iov, offset,
count, ext3_direct_io_get_blocks); nr_segs, ext3_direct_io_get_blocks);
out_stop: out_stop:
if (handle) { if (handle) {
......
...@@ -108,6 +108,8 @@ struct file_operations jfs_file_operations = { ...@@ -108,6 +108,8 @@ struct file_operations jfs_file_operations = {
.write = generic_file_write, .write = generic_file_write,
.read = generic_file_read, .read = generic_file_read,
.mmap = generic_file_mmap, .mmap = generic_file_mmap,
.readv = generic_file_readv,
.writev = generic_file_writev,
.sendfile = generic_file_sendfile, .sendfile = generic_file_sendfile,
.fsync = jfs_fsync, .fsync = jfs_fsync,
}; };
...@@ -309,11 +309,11 @@ static int jfs_bmap(struct address_space *mapping, long block) ...@@ -309,11 +309,11 @@ static int jfs_bmap(struct address_space *mapping, long block)
return generic_block_bmap(mapping, block, jfs_get_block); return generic_block_bmap(mapping, block, jfs_get_block);
} }
static int jfs_direct_IO(int rw, struct inode *inode, char *buf, static int jfs_direct_IO(int rw, struct inode *inode, const struct iovec *iov,
loff_t offset, size_t count) loff_t offset, unsigned long nr_segs)
{ {
return generic_direct_IO(rw, inode, buf, return generic_direct_IO(rw, inode, iov,
offset, count, jfs_get_blocks); offset, nr_segs, jfs_get_blocks);
} }
struct address_space_operations jfs_aops = { struct address_space_operations jfs_aops = {
......
...@@ -286,9 +286,29 @@ asmlinkage ssize_t sys_pwrite64(unsigned int fd, const char *buf, ...@@ -286,9 +286,29 @@ asmlinkage ssize_t sys_pwrite64(unsigned int fd, const char *buf,
return ret; return ret;
} }
/*
 * Trim an iovec array in place so that it covers at most `to' bytes.
 *
 * Segments are walked from the front while keeping a running byte total.
 * The first segment at which the total reaches or exceeds `to' has its
 * iov_len cut down to the bytes still needed, and the walk stops there;
 * later segments are left untouched.
 *
 * Returns the number of segments that remain in use: the index of the
 * trimmed segment plus one, or nr_segs if the whole array is shorter
 * than `to'.
 */
unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to)
{
	unsigned long idx;
	size_t consumed = 0;

	for (idx = 0; idx < nr_segs; idx++) {
		struct iovec *cur = &iov[idx];

		if (consumed + cur->iov_len >= to) {
			/* This segment reaches the limit: keep only the remainder. */
			cur->iov_len = to - consumed;
			return idx + 1;
		}
		consumed += cur->iov_len;
	}
	return idx;
}
static ssize_t do_readv_writev(int type, struct file *file, static ssize_t do_readv_writev(int type, struct file *file,
const struct iovec * vector, const struct iovec * vector,
unsigned long count) unsigned long nr_segs)
{ {
typedef ssize_t (*io_fn_t)(struct file *, char *, size_t, loff_t *); typedef ssize_t (*io_fn_t)(struct file *, char *, size_t, loff_t *);
typedef ssize_t (*iov_fn_t)(struct file *, const struct iovec *, unsigned long, loff_t *); typedef ssize_t (*iov_fn_t)(struct file *, const struct iovec *, unsigned long, loff_t *);
...@@ -296,73 +316,86 @@ static ssize_t do_readv_writev(int type, struct file *file, ...@@ -296,73 +316,86 @@ static ssize_t do_readv_writev(int type, struct file *file,
size_t tot_len; size_t tot_len;
struct iovec iovstack[UIO_FASTIOV]; struct iovec iovstack[UIO_FASTIOV];
struct iovec *iov=iovstack; struct iovec *iov=iovstack;
ssize_t ret, i; ssize_t ret = -EINVAL;
int seg;
io_fn_t fn; io_fn_t fn;
iov_fn_t fnv; iov_fn_t fnv;
struct inode *inode; struct inode *inode;
/*
* SuS says "The readv() function *may* fail if the iovcnt argument
* was less than or equal to 0, or greater than {IOV_MAX}."  Linux has
* traditionally returned -EINVAL for zero segments, so...
*/
if (nr_segs == 0)
goto out;
/* /*
* First get the "struct iovec" from user memory and * First get the "struct iovec" from user memory and
* verify all the pointers * verify all the pointers
*/ */
ret = 0; if ((nr_segs > UIO_MAXIOV) || (nr_segs <= 0))
if (!count) goto out;
goto out_nofree;
ret = -EINVAL;
if (count > UIO_MAXIOV)
goto out_nofree;
if (!file->f_op) if (!file->f_op)
goto out_nofree; goto out;
if (count > UIO_FASTIOV) { if (nr_segs > UIO_FASTIOV) {
ret = -ENOMEM; ret = -ENOMEM;
iov = kmalloc(count*sizeof(struct iovec), GFP_KERNEL); iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
if (!iov) if (!iov)
goto out_nofree; goto out;
} }
ret = -EFAULT; ret = -EFAULT;
if (copy_from_user(iov, vector, count*sizeof(*vector))) if (copy_from_user(iov, vector, nr_segs*sizeof(*vector)))
goto out; goto out;
/* /*
* Single unix specification: * Single unix specification:
* We should -EINVAL if an element length is not >= 0 and fitting an ssize_t * We should -EINVAL if an element length is not >= 0 and fitting an
* The total length is fitting an ssize_t * ssize_t. The total length is fitting an ssize_t
* *
* Be careful here because iov_len is a size_t not an ssize_t * Be careful here because iov_len is a size_t not an ssize_t
*/ */
tot_len = 0; tot_len = 0;
ret = -EINVAL; ret = -EINVAL;
for (i = 0 ; i < count ; i++) { for (seg = 0 ; seg < nr_segs; seg++) {
ssize_t tmp = tot_len; ssize_t tmp = tot_len;
ssize_t len = (ssize_t)iov[i].iov_len; ssize_t len = (ssize_t)iov[seg].iov_len;
if (len < 0) /* size_t not fitting an ssize_t .. */ if (len < 0) /* size_t not fitting an ssize_t .. */
goto out; goto out;
tot_len += len; tot_len += len;
if (tot_len < tmp) /* maths overflow on the ssize_t */ if (tot_len < tmp) /* maths overflow on the ssize_t */
goto out; goto out;
} }
if (tot_len == 0) {
ret = 0;
goto out;
}
inode = file->f_dentry->d_inode; inode = file->f_dentry->d_inode;
/* VERIFY_WRITE actually means a read, as we write to user space */ /* VERIFY_WRITE actually means a read, as we write to user space */
ret = locks_verify_area((type == VERIFY_WRITE ret = locks_verify_area((type == READ
? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE), ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE),
inode, file, file->f_pos, tot_len); inode, file, file->f_pos, tot_len);
if (ret) goto out; if (ret)
goto out;
fnv = (type == VERIFY_WRITE ? file->f_op->readv : file->f_op->writev); fnv = NULL;
if (type == READ) {
fn = file->f_op->read;
fnv = file->f_op->readv;
} else {
fn = (io_fn_t)file->f_op->write;
fnv = file->f_op->writev;
}
if (fnv) { if (fnv) {
ret = fnv(file, iov, count, &file->f_pos); ret = fnv(file, iov, nr_segs, &file->f_pos);
goto out; goto out;
} }
/* VERIFY_WRITE actually means a read, as we write to user space */ /* Do it by hand, with file-ops */
fn = (type == VERIFY_WRITE ? file->f_op->read :
(io_fn_t) file->f_op->write);
ret = 0; ret = 0;
vector = iov; vector = iov;
while (count > 0) { while (nr_segs > 0) {
void * base; void * base;
size_t len; size_t len;
ssize_t nr; ssize_t nr;
...@@ -370,7 +403,7 @@ static ssize_t do_readv_writev(int type, struct file *file, ...@@ -370,7 +403,7 @@ static ssize_t do_readv_writev(int type, struct file *file,
base = vector->iov_base; base = vector->iov_base;
len = vector->iov_len; len = vector->iov_len;
vector++; vector++;
count--; nr_segs--;
nr = fn(file, base, len, &file->f_pos); nr = fn(file, base, len, &file->f_pos);
...@@ -382,20 +415,18 @@ static ssize_t do_readv_writev(int type, struct file *file, ...@@ -382,20 +415,18 @@ static ssize_t do_readv_writev(int type, struct file *file,
if (nr != len) if (nr != len)
break; break;
} }
out: out:
if (iov != iovstack) if (iov != iovstack)
kfree(iov); kfree(iov);
out_nofree: if ((ret + (type == READ)) > 0)
/* VERIFY_WRITE actually means a read, as we write to user space */
if ((ret + (type == VERIFY_WRITE)) > 0)
dnotify_parent(file->f_dentry, dnotify_parent(file->f_dentry,
(type == VERIFY_WRITE) ? DN_MODIFY : DN_ACCESS); (type == READ) ? DN_MODIFY : DN_ACCESS);
return ret; return ret;
} }
asmlinkage ssize_t sys_readv(unsigned long fd, const struct iovec * vector,
unsigned long count) asmlinkage ssize_t
sys_readv(unsigned long fd, const struct iovec *vector, unsigned long nr_segs)
{ {
struct file * file; struct file * file;
ssize_t ret; ssize_t ret;
...@@ -409,7 +440,7 @@ asmlinkage ssize_t sys_readv(unsigned long fd, const struct iovec * vector, ...@@ -409,7 +440,7 @@ asmlinkage ssize_t sys_readv(unsigned long fd, const struct iovec * vector,
(file->f_op->readv || file->f_op->read)) { (file->f_op->readv || file->f_op->read)) {
ret = security_ops->file_permission (file, MAY_READ); ret = security_ops->file_permission (file, MAY_READ);
if (!ret) if (!ret)
ret = do_readv_writev(VERIFY_WRITE, file, vector, count); ret = do_readv_writev(READ, file, vector, nr_segs);
} }
fput(file); fput(file);
...@@ -417,8 +448,8 @@ asmlinkage ssize_t sys_readv(unsigned long fd, const struct iovec * vector, ...@@ -417,8 +448,8 @@ asmlinkage ssize_t sys_readv(unsigned long fd, const struct iovec * vector,
return ret; return ret;
} }
asmlinkage ssize_t sys_writev(unsigned long fd, const struct iovec * vector, asmlinkage ssize_t
unsigned long count) sys_writev(unsigned long fd, const struct iovec * vector, unsigned long nr_segs)
{ {
struct file * file; struct file * file;
ssize_t ret; ssize_t ret;
...@@ -432,7 +463,7 @@ asmlinkage ssize_t sys_writev(unsigned long fd, const struct iovec * vector, ...@@ -432,7 +463,7 @@ asmlinkage ssize_t sys_writev(unsigned long fd, const struct iovec * vector,
(file->f_op->writev || file->f_op->write)) { (file->f_op->writev || file->f_op->write)) {
ret = security_ops->file_permission (file, MAY_WRITE); ret = security_ops->file_permission (file, MAY_WRITE);
if (!ret) if (!ret)
ret = do_readv_writev(VERIFY_READ, file, vector, count); ret = do_readv_writev(WRITE, file, vector, nr_segs);
} }
fput(file); fput(file);
......
...@@ -307,8 +307,7 @@ struct address_space_operations { ...@@ -307,8 +307,7 @@ struct address_space_operations {
int (*bmap)(struct address_space *, long); int (*bmap)(struct address_space *, long);
int (*invalidatepage) (struct page *, unsigned long); int (*invalidatepage) (struct page *, unsigned long);
int (*releasepage) (struct page *, int); int (*releasepage) (struct page *, int);
int (*direct_IO)(int, struct inode *, char *buf, int (*direct_IO)(int, struct inode *, const struct iovec *iov, loff_t offset, unsigned long nr_segs);
loff_t offset, size_t count);
}; };
struct backing_dev_info; struct backing_dev_info;
...@@ -1245,14 +1244,18 @@ extern int generic_file_mmap(struct file *, struct vm_area_struct *); ...@@ -1245,14 +1244,18 @@ extern int generic_file_mmap(struct file *, struct vm_area_struct *);
extern int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size); extern int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size);
extern ssize_t generic_file_read(struct file *, char *, size_t, loff_t *); extern ssize_t generic_file_read(struct file *, char *, size_t, loff_t *);
extern ssize_t generic_file_write(struct file *, const char *, size_t, loff_t *); extern ssize_t generic_file_write(struct file *, const char *, size_t, loff_t *);
extern ssize_t generic_file_write_nolock(struct file *, const char *, size_t, loff_t *); ssize_t generic_file_write_nolock(struct file *file, const struct iovec *iov,
unsigned long nr_segs, loff_t *ppos);
extern ssize_t generic_file_sendfile(struct file *, struct file *, loff_t *, size_t); extern ssize_t generic_file_sendfile(struct file *, struct file *, loff_t *, size_t);
extern void do_generic_file_read(struct file *, loff_t *, read_descriptor_t *, read_actor_t); extern void do_generic_file_read(struct file *, loff_t *, read_descriptor_t *, read_actor_t);
ssize_t generic_file_direct_IO(int rw, struct inode *inode, char *buf, extern ssize_t generic_file_direct_IO(int rw, struct inode *inode,
loff_t offset, size_t count); const struct iovec *iov, loff_t offset, unsigned long nr_segs);
int generic_direct_IO(int rw, struct inode *inode, char *buf, extern int generic_direct_IO(int rw, struct inode *inode, const struct iovec
loff_t offset, size_t count, get_blocks_t *get_blocks); *iov, loff_t offset, unsigned long nr_segs, get_blocks_t *get_blocks);
extern ssize_t generic_file_readv(struct file *filp, const struct iovec *iov,
unsigned long nr_segs, loff_t *ppos);
ssize_t generic_file_writev(struct file *filp, const struct iovec *iov,
unsigned long nr_segs, loff_t *ppos);
extern loff_t no_llseek(struct file *file, loff_t offset, int origin); extern loff_t no_llseek(struct file *file, loff_t offset, int origin);
extern loff_t generic_file_llseek(struct file *file, loff_t offset, int origin); extern loff_t generic_file_llseek(struct file *file, loff_t offset, int origin);
extern loff_t remote_llseek(struct file *file, loff_t offset, int origin); extern loff_t remote_llseek(struct file *file, loff_t offset, int origin);
......
...@@ -34,4 +34,19 @@ struct iovec ...@@ -34,4 +34,19 @@ struct iovec
/* Beg pardon: BSD has 1024 --ANK */ /* Beg pardon: BSD has 1024 --ANK */
#endif #endif
/*
 * Sum the iov_len fields of the first nr_segs entries of an iovec array
 * and return the total as a size_t.  Returns 0 when nr_segs is 0.  No
 * overflow check is done on the sum.
 */
static inline size_t iov_length(const struct iovec *iov, unsigned long nr_segs)
{
	size_t total = 0;

	while (nr_segs-- > 0) {
		total += iov->iov_len;
		iov++;
	}
	return total;
}
unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to);
#endif #endif
...@@ -42,6 +42,7 @@ ...@@ -42,6 +42,7 @@
#include <linux/highuid.h> #include <linux/highuid.h>
#include <linux/brlock.h> #include <linux/brlock.h>
#include <linux/fs.h> #include <linux/fs.h>
#include <linux/uio.h>
#include <linux/tty.h> #include <linux/tty.h>
#include <linux/in6.h> #include <linux/in6.h>
#include <linux/completion.h> #include <linux/completion.h>
...@@ -343,6 +344,9 @@ EXPORT_SYMBOL(register_disk); ...@@ -343,6 +344,9 @@ EXPORT_SYMBOL(register_disk);
EXPORT_SYMBOL(read_dev_sector); EXPORT_SYMBOL(read_dev_sector);
EXPORT_SYMBOL(init_buffer); EXPORT_SYMBOL(init_buffer);
EXPORT_SYMBOL_GPL(generic_file_direct_IO); EXPORT_SYMBOL_GPL(generic_file_direct_IO);
EXPORT_SYMBOL(generic_file_readv);
EXPORT_SYMBOL(generic_file_writev);
EXPORT_SYMBOL(iov_shorten);
/* tty routines */ /* tty routines */
EXPORT_SYMBOL(tty_hangup); EXPORT_SYMBOL(tty_hangup);
......
...@@ -18,6 +18,7 @@ ...@@ -18,6 +18,7 @@
#include <linux/mman.h> #include <linux/mman.h>
#include <linux/pagemap.h> #include <linux/pagemap.h>
#include <linux/file.h> #include <linux/file.h>
#include <linux/uio.h>
#include <linux/iobuf.h> #include <linux/iobuf.h>
#include <linux/hash.h> #include <linux/hash.h>
#include <linux/writeback.h> #include <linux/writeback.h>
...@@ -1121,14 +1122,18 @@ int file_read_actor(read_descriptor_t *desc, struct page *page, ...@@ -1121,14 +1122,18 @@ int file_read_actor(read_descriptor_t *desc, struct page *page,
* This is the "read()" routine for all filesystems * This is the "read()" routine for all filesystems
* that can use the page cache directly. * that can use the page cache directly.
*/ */
ssize_t static ssize_t
generic_file_read(struct file *filp, char *buf, size_t count, loff_t *ppos) __generic_file_read(struct file *filp, const struct iovec *iov,
unsigned long nr_segs, loff_t *ppos)
{ {
ssize_t retval; ssize_t retval;
unsigned long seg;
size_t count = iov_length(iov, nr_segs);
if ((ssize_t) count < 0) if ((ssize_t) count < 0)
return -EINVAL; return -EINVAL;
/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
if (filp->f_flags & O_DIRECT) { if (filp->f_flags & O_DIRECT) {
loff_t pos = *ppos, size; loff_t pos = *ppos, size;
struct address_space *mapping; struct address_space *mapping;
...@@ -1141,10 +1146,13 @@ generic_file_read(struct file *filp, char *buf, size_t count, loff_t *ppos) ...@@ -1141,10 +1146,13 @@ generic_file_read(struct file *filp, char *buf, size_t count, loff_t *ppos)
goto out; /* skip atime */ goto out; /* skip atime */
size = inode->i_size; size = inode->i_size;
if (pos < size) { if (pos < size) {
if (pos + count > size) if (pos + count > size) {
count = size - pos; count = size - pos;
nr_segs = iov_shorten((struct iovec *)iov,
nr_segs, count);
}
retval = generic_file_direct_IO(READ, inode, retval = generic_file_direct_IO(READ, inode,
buf, pos, count); iov, pos, nr_segs);
if (retval > 0) if (retval > 0)
*ppos = pos + retval; *ppos = pos + retval;
} }
...@@ -1152,27 +1160,42 @@ generic_file_read(struct file *filp, char *buf, size_t count, loff_t *ppos) ...@@ -1152,27 +1160,42 @@ generic_file_read(struct file *filp, char *buf, size_t count, loff_t *ppos)
goto out; goto out;
} }
retval = -EFAULT; for (seg = 0; seg < nr_segs; seg++) {
if (access_ok(VERIFY_WRITE, buf, count)) { if (!access_ok(VERIFY_WRITE,iov[seg].iov_base,iov[seg].iov_len))
retval = 0; return -EFAULT;
}
retval = 0;
if (count) { if (count) {
for (seg = 0; seg < nr_segs; seg++) {
read_descriptor_t desc; read_descriptor_t desc;
desc.written = 0; desc.written = 0;
desc.count = count; desc.buf = iov[seg].iov_base;
desc.buf = buf; desc.count = iov[seg].iov_len;
if (desc.count == 0)
continue;
desc.error = 0; desc.error = 0;
do_generic_file_read(filp,ppos,&desc,file_read_actor); do_generic_file_read(filp,ppos,&desc,file_read_actor);
retval = desc.written; retval += desc.written;
if (!retval) if (!retval) {
retval = desc.error; retval = desc.error;
break;
}
} }
} }
out: out:
return retval; return retval;
} }
/*
 * generic_file_read - single-buffer read entry point.
 *
 * Describes the caller's (buf, count) pair as a one-element iovec and
 * passes it to __generic_file_read() with nr_segs == 1.  The return value
 * is whatever __generic_file_read() returns.
 */
ssize_t
generic_file_read(struct file *filp, char *buf, size_t count, loff_t *ppos)
{
	struct iovec local_iov;

	local_iov.iov_base = buf;
	local_iov.iov_len = count;

	return __generic_file_read(filp, &local_iov, 1, ppos);
}
static int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size) static int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
{ {
ssize_t written; ssize_t written;
...@@ -1926,11 +1949,14 @@ filemap_copy_from_user(struct page *page, unsigned long offset, ...@@ -1926,11 +1949,14 @@ filemap_copy_from_user(struct page *page, unsigned long offset,
* it for writing by marking it dirty. * it for writing by marking it dirty.
* okir@monad.swb.de * okir@monad.swb.de
*/ */
ssize_t generic_file_write_nolock(struct file *file, const char *buf, ssize_t
size_t count, loff_t *ppos) generic_file_write_nolock(struct file *file, const struct iovec *iov,
unsigned long nr_segs, loff_t *ppos)
{ {
struct address_space * mapping = file->f_dentry->d_inode->i_mapping; struct address_space * mapping = file->f_dentry->d_inode->i_mapping;
struct address_space_operations *a_ops = mapping->a_ops; struct address_space_operations *a_ops = mapping->a_ops;
const size_t ocount = iov_length(iov, nr_segs);
size_t count = ocount;
struct inode *inode = mapping->host; struct inode *inode = mapping->host;
unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur; unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
long status = 0; long status = 0;
...@@ -1942,12 +1968,19 @@ ssize_t generic_file_write_nolock(struct file *file, const char *buf, ...@@ -1942,12 +1968,19 @@ ssize_t generic_file_write_nolock(struct file *file, const char *buf,
unsigned bytes; unsigned bytes;
time_t time_now; time_t time_now;
struct pagevec lru_pvec; struct pagevec lru_pvec;
struct iovec *cur_iov;
unsigned iov_bytes; /* Cumulative count to the end of the
current iovec */
unsigned long seg;
char *buf;
if (unlikely((ssize_t)count < 0)) if (unlikely((ssize_t)count < 0))
return -EINVAL; return -EINVAL;
if (unlikely(!access_ok(VERIFY_READ, buf, count))) for (seg = 0; seg < nr_segs; seg++) {
if (!access_ok(VERIFY_READ,iov[seg].iov_base,iov[seg].iov_len))
return -EFAULT; return -EFAULT;
}
pos = *ppos; pos = *ppos;
if (unlikely(pos < 0)) if (unlikely(pos < 0))
...@@ -2045,9 +2078,13 @@ ssize_t generic_file_write_nolock(struct file *file, const char *buf, ...@@ -2045,9 +2078,13 @@ ssize_t generic_file_write_nolock(struct file *file, const char *buf,
mark_inode_dirty_sync(inode); mark_inode_dirty_sync(inode);
} }
/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
if (unlikely(file->f_flags & O_DIRECT)) { if (unlikely(file->f_flags & O_DIRECT)) {
if (count != ocount)
nr_segs = iov_shorten((struct iovec *)iov,
nr_segs, count);
written = generic_file_direct_IO(WRITE, inode, written = generic_file_direct_IO(WRITE, inode,
(char *)buf, pos, count); iov, pos, nr_segs);
if (written > 0) { if (written > 0) {
loff_t end = pos + written; loff_t end = pos + written;
if (end > inode->i_size && !S_ISBLK(inode->i_mode)) { if (end > inode->i_size && !S_ISBLK(inode->i_mode)) {
...@@ -2065,6 +2102,9 @@ ssize_t generic_file_write_nolock(struct file *file, const char *buf, ...@@ -2065,6 +2102,9 @@ ssize_t generic_file_write_nolock(struct file *file, const char *buf,
goto out_status; goto out_status;
} }
cur_iov = (struct iovec *)iov;
iov_bytes = cur_iov->iov_len;
buf = cur_iov->iov_base;
do { do {
unsigned long index; unsigned long index;
unsigned long offset; unsigned long offset;
...@@ -2075,6 +2115,8 @@ ssize_t generic_file_write_nolock(struct file *file, const char *buf, ...@@ -2075,6 +2115,8 @@ ssize_t generic_file_write_nolock(struct file *file, const char *buf,
bytes = PAGE_CACHE_SIZE - offset; bytes = PAGE_CACHE_SIZE - offset;
if (bytes > count) if (bytes > count)
bytes = count; bytes = count;
if (bytes + written > iov_bytes)
bytes = iov_bytes - written;
/* /*
* Bring in the user page that we will copy from _first_. * Bring in the user page that we will copy from _first_.
...@@ -2084,7 +2126,7 @@ ssize_t generic_file_write_nolock(struct file *file, const char *buf, ...@@ -2084,7 +2126,7 @@ ssize_t generic_file_write_nolock(struct file *file, const char *buf,
*/ */
fault_in_pages_readable(buf, bytes); fault_in_pages_readable(buf, bytes);
page = __grab_cache_page(mapping, index, &cached_page, &lru_pvec); page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec);
if (!page) { if (!page) {
status = -ENOMEM; status = -ENOMEM;
break; break;
...@@ -2115,6 +2157,11 @@ ssize_t generic_file_write_nolock(struct file *file, const char *buf, ...@@ -2115,6 +2157,11 @@ ssize_t generic_file_write_nolock(struct file *file, const char *buf,
count -= status; count -= status;
pos += status; pos += status;
buf += status; buf += status;
if (written == iov_bytes && count) {
cur_iov++;
iov_bytes += cur_iov->iov_len;
buf = cur_iov->iov_base;
}
} }
} }
if (!PageReferenced(page)) if (!PageReferenced(page))
...@@ -2151,10 +2198,29 @@ ssize_t generic_file_write(struct file *file, const char *buf, ...@@ -2151,10 +2198,29 @@ ssize_t generic_file_write(struct file *file, const char *buf,
{ {
struct inode *inode = file->f_dentry->d_inode->i_mapping->host; struct inode *inode = file->f_dentry->d_inode->i_mapping->host;
int err; int err;
struct iovec local_iov = { .iov_base = (void *)buf, .iov_len = count };
down(&inode->i_sem); down(&inode->i_sem);
err = generic_file_write_nolock(file, buf, count, ppos); err = generic_file_write_nolock(file, &local_iov, 1, ppos);
up(&inode->i_sem); up(&inode->i_sem);
return err; return err;
} }
/*
 * generic_file_readv - vectored read entry point.
 *
 * Passes the caller's iovec array and segment count unchanged to
 * __generic_file_read() and returns its result.
 */
ssize_t generic_file_readv(struct file *filp, const struct iovec *iov,
			unsigned long nr_segs, loff_t *ppos)
{
	ssize_t nread;

	nread = __generic_file_read(filp, iov, nr_segs, ppos);
	return nread;
}
/*
 * generic_file_writev - vectored write entry point.
 *
 * Takes i_sem once, hands the whole iovec array to
 * generic_file_write_nolock(), then releases the semaphore.  The return
 * value is whatever generic_file_write_nolock() returns.
 *
 * The semaphore is taken on the mapping's host inode, the same inode that
 * generic_file_write() locks and that generic_file_write_nolock() operates
 * on (mapping->host).  Locking file->f_dentry->d_inode directly would
 * serialise on a different inode whenever the two are not the same.
 */
ssize_t generic_file_writev(struct file *file, const struct iovec *iov,
			unsigned long nr_segs, loff_t * ppos)
{
	struct inode *inode = file->f_dentry->d_inode->i_mapping->host;
	ssize_t ret;

	down(&inode->i_sem);
	ret = generic_file_write_nolock(file, iov, nr_segs, ppos);
	up(&inode->i_sem);

	return ret;
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment