Commit 42ec8bc1 authored by Andrew Morton, committed by Linus Torvalds

[PATCH] direct-to-BIO for O_DIRECT

Here's a patch which converts O_DIRECT to go direct-to-BIO, bypassing
the kiovec layer.  It's followed by a patch which converts the raw
driver to use the O_DIRECT engine.

CPU utilisation is about the same as the kiovec-based implementation.
Read and write bandwidth are the same too, for 128k chunks.   But with
one megabyte chunks, this implementation is 20% faster at writing.

I assume this is because the kiobuf-based implementation has to stop
and wait for each 128k chunk, whereas this code streams the entire
request, regardless of its size.

This is with a single (oldish) scsi disk on aic7xxx.  I'd expect the
margin to widen on higher-end hardware which likes to have more
requests in flight.

Question is: what do we want to do with this sucker?  These are the
remaining users of kiovecs:

	drivers/md/lvm-snap.c
	drivers/media/video/video-buf.c
	drivers/mtd/devices/blkmtd.c
	drivers/scsi/sg.c

The video and mtd drivers seem to be fairly easy to de-kiobufize.
I'm aware of one proprietary driver which uses kiobufs.  XFS uses
kiobufs a little bit - just to map the pages.

So with a bit of effort and maintainer-irritation, we can extract
the kiobuf layer from the kernel.
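
For context, here is a minimal user-space sketch (not part of this patch) of the kind of request the new engine services.  The 4096-byte alignment and the error handling are illustrative assumptions; the real requirement is that the user buffer, the file offset and the transfer size all be aligned to the underlying sector size, which is what the old kiobuf-based rw_raw_dev() below checks with sector_mask and what the BIO-based path still enforces.

/*
 * Illustrative only -- not part of this patch.  A minimal O_DIRECT
 * reader: buffer, offset and size must be sector-aligned, otherwise
 * the kernel returns -EINVAL.  4096 is an assumed safe alignment.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#define CHUNK	(1024 * 1024)	/* one-megabyte chunks, as benchmarked above */

int main(int argc, char **argv)
{
	void *buf;
	ssize_t n;
	int fd;

	if (argc < 2) {
		fprintf(stderr, "usage: %s <device-or-file>\n", argv[0]);
		return 1;
	}
	if (posix_memalign(&buf, 4096, CHUNK))	/* aligned buffer for O_DIRECT */
		return 1;
	fd = open(argv[1], O_RDONLY | O_DIRECT);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	n = read(fd, buf, CHUNK);	/* serviced by the direct-IO engine */
	if (n < 0)
		perror("read");
	else
		printf("read %zd bytes\n", n);
	close(fd);
	free(buf);
	return 0;
}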
parent 2dbd1502
......@@ -8,8 +8,8 @@
* device are used to bind the other minor numbers to block devices.
*/
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/iobuf.h>
#include <linux/major.h>
#include <linux/blkdev.h>
#include <linux/raw.h>
......@@ -86,12 +86,6 @@ int raw_open(struct inode *inode, struct file *filp)
return 0;
}
if (!filp->f_iobuf) {
err = alloc_kiovec(1, &filp->f_iobuf);
if (err)
return err;
}
down(&raw_devices[minor].mutex);
/*
* No, it is a normal raw device. All we need to do on open is
......@@ -256,124 +250,46 @@ int raw_ctl_ioctl(struct inode *inode,
return err;
}
ssize_t raw_read(struct file *filp, char * buf,
size_t size, loff_t *offp)
ssize_t raw_read(struct file *filp, char * buf, size_t size, loff_t *offp)
{
return rw_raw_dev(READ, filp, buf, size, offp);
}
ssize_t raw_write(struct file *filp, const char *buf,
size_t size, loff_t *offp)
ssize_t raw_write(struct file *filp, const char *buf, size_t size, loff_t *offp)
{
return rw_raw_dev(WRITE, filp, (char *) buf, size, offp);
}
#define SECTOR_BITS 9
#define SECTOR_SIZE (1U << SECTOR_BITS)
#define SECTOR_MASK (SECTOR_SIZE - 1)
ssize_t rw_raw_dev(int rw, struct file *filp, char *buf,
size_t size, loff_t *offp)
ssize_t
rw_raw_dev(int rw, struct file *filp, char *buf, size_t size, loff_t *offp)
{
struct kiobuf * iobuf;
int new_iobuf;
int err = 0;
unsigned long blocks;
size_t transferred;
int iosize;
int minor;
kdev_t dev;
unsigned long limit;
int sector_size, sector_bits, sector_mask;
sector_t blocknr;
struct block_device *bdev;
/*
* First, a few checks on device size limits
*/
struct inode *inode;
int minor;
ssize_t ret = 0;
minor = minor(filp->f_dentry->d_inode->i_rdev);
new_iobuf = 0;
iobuf = filp->f_iobuf;
if (test_and_set_bit(0, &filp->f_iobuf_lock)) {
/*
* A parallel read/write is using the preallocated iobuf
* so just run slow and allocate a new one.
*/
err = alloc_kiovec(1, &iobuf);
if (err)
goto out;
new_iobuf = 1;
}
bdev = raw_devices[minor].binding;
dev = to_kdev_t(bdev->bd_dev);
sector_size = raw_devices[minor].sector_size;
sector_bits = raw_devices[minor].sector_bits;
sector_mask = sector_size - 1;
limit = bdev->bd_inode->i_size >> sector_bits;
if (!limit)
limit = INT_MAX;
dprintk ("rw_raw_dev: dev %d:%d (+%d)\n",
major(dev), minor(dev), limit);
err = -EINVAL;
if ((*offp & sector_mask) || (size & sector_mask))
goto out_free;
err = 0;
if (size)
err = -ENXIO;
if ((*offp >> sector_bits) >= limit)
goto out_free;
transferred = 0;
blocknr = *offp >> sector_bits;
while (size > 0) {
blocks = size >> sector_bits;
if (blocks > limit - blocknr)
blocks = limit - blocknr;
if (!blocks)
break;
iosize = blocks << sector_bits;
inode = bdev->bd_inode;
err = map_user_kiobuf(rw, iobuf, (unsigned long) buf, iosize);
if (err)
break;
err = brw_kiovec(rw, 1, &iobuf, raw_devices[minor].binding, &blocknr, sector_size);
if (rw == READ && err > 0)
mark_dirty_kiobuf(iobuf, err);
if (err >= 0) {
transferred += err;
size -= err;
buf += err;
}
blocknr += blocks;
unmap_kiobuf(iobuf);
if (err != iosize)
break;
if (size == 0)
goto out;
if (size < 0) {
ret = -EINVAL;
goto out;
}
if (transferred) {
*offp += transferred;
err = transferred;
if (*offp >= inode->i_size) {
ret = -ENXIO;
goto out;
}
out_free:
if (!new_iobuf)
clear_bit(0, &filp->f_iobuf_lock);
else
free_kiovec(1, &iobuf);
out:
return err;
if (size + *offp > inode->i_size)
size = inode->i_size - *offp;
ret = generic_file_direct_IO(rw, inode, buf, *offp, size);
if (ret > 0)
*offp += ret;
if (inode->i_mapping->nrpages)
invalidate_inode_pages2(inode->i_mapping);
out:
return ret;
}
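
Because the hunk above interleaves the removed kiobuf code with the new implementation, here is a sketch of roughly what rw_raw_dev() reduces to after the patch.  It is reconstructed from the fragments shown above; the exact ordering of the checks, the signed-size cast and the error paths are assumptions.

/* Reconstructed sketch of the post-patch rw_raw_dev(); details assumed. */
ssize_t
rw_raw_dev(int rw, struct file *filp, char *buf, size_t size, loff_t *offp)
{
	struct block_device *bdev;
	struct inode *inode;
	int minor;
	ssize_t ret = 0;

	minor = minor(filp->f_dentry->d_inode->i_rdev);
	bdev = raw_devices[minor].binding;
	inode = bdev->bd_inode;

	if (size == 0)
		goto out;
	if ((ssize_t) size < 0) {
		ret = -EINVAL;
		goto out;
	}
	if (*offp >= inode->i_size) {
		ret = -ENXIO;
		goto out;
	}
	if (size + *offp > inode->i_size)
		size = inode->i_size - *offp;

	/* Hand the whole request to the BIO-based engine in one go. */
	ret = generic_file_direct_IO(rw, inode, buf, *offp, size);
	if (ret > 0)
		*offp += ret;
	if (inode->i_mapping->nrpages)
		invalidate_inode_pages2(inode->i_mapping);
out:
	return ret;
}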
......@@ -15,7 +15,7 @@ obj-y := open.o read_write.o devices.o file_table.o buffer.o \
namei.o fcntl.o ioctl.o readdir.o select.o fifo.o locks.o \
dcache.o inode.o attr.o bad_inode.o file.o iobuf.o dnotify.o \
filesystems.o namespace.o seq_file.o xattr.o libfs.o \
fs-writeback.o mpage.o
fs-writeback.o mpage.o direct-io.o
ifneq ($(CONFIG_NFSD),n)
ifneq ($(CONFIG_NFSD),)
......
......@@ -106,9 +106,12 @@ static int blkdev_get_block(struct inode * inode, sector_t iblock, struct buffer
return 0;
}
static int blkdev_direct_IO(int rw, struct inode * inode, struct kiobuf * iobuf, unsigned long blocknr, int blocksize)
static int
blkdev_direct_IO(int rw, struct inode *inode, char *buf,
loff_t offset, size_t count)
{
return generic_direct_IO(rw, inode, iobuf, blocknr, blocksize, blkdev_get_block);
return generic_direct_IO(rw, inode, buf, offset,
count, blkdev_get_block);
}
static int blkdev_writepage(struct page * page)
......
......@@ -2311,6 +2311,7 @@ sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
return tmp.b_blocknr;
}
#if 0
int generic_direct_IO(int rw, struct inode *inode,
struct kiobuf *iobuf, unsigned long blocknr,
int blocksize, get_block_t *get_block)
......@@ -2355,6 +2356,7 @@ int generic_direct_IO(int rw, struct inode *inode,
out:
return retval;
}
#endif
/*
* Start I/O on a physical range of kernel memory, defined by a vector
......
This diff is collapsed.
......@@ -607,11 +607,10 @@ static int ext2_bmap(struct address_space *mapping, long block)
}
static int
ext2_direct_IO(int rw, struct inode *inode, struct kiobuf *iobuf,
unsigned long blocknr, int blocksize)
ext2_direct_IO(int rw, struct inode *inode, char *buf,
loff_t offset, size_t count)
{
return generic_direct_IO(rw, inode, iobuf, blocknr,
blocksize, ext2_get_block);
return generic_direct_IO(rw, inode, buf, offset, count, ext2_get_block);
}
static int
......
......@@ -185,8 +185,6 @@ int block_sync_page(struct page *);
sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *);
int generic_commit_write(struct file *, struct page *, unsigned, unsigned);
int block_truncate_page(struct address_space *, loff_t, get_block_t *);
int generic_direct_IO(int, struct inode *, struct kiobuf *,
unsigned long, int, get_block_t *);
int file_fsync(struct file *, struct dentry *, int);
#define OSYNC_METADATA (1<<0)
......
......@@ -303,8 +303,8 @@ struct address_space_operations {
int (*bmap)(struct address_space *, long);
int (*invalidatepage) (struct page *, unsigned long);
int (*releasepage) (struct page *, int);
#define KERNEL_HAS_O_DIRECT /* this is for modules out of the kernel */
int (*direct_IO)(int, struct inode *, struct kiobuf *, unsigned long, int);
int (*direct_IO)(int, struct inode *, char *buf,
loff_t offset, size_t count);
};
struct backing_dev_info;
......@@ -1128,7 +1128,7 @@ extern int check_disk_change(kdev_t);
extern int invalidate_inodes(struct super_block *);
extern int invalidate_device(kdev_t, int);
extern void invalidate_inode_pages(struct inode *);
extern void invalidate_inode_pages2(struct address_space *);
extern void invalidate_inode_pages2(struct address_space *mapping);
extern void write_inode_now(struct inode *, int);
extern int filemap_fdatawrite(struct address_space *);
extern int filemap_fdatawait(struct address_space *);
......@@ -1233,6 +1233,11 @@ extern int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned
extern ssize_t generic_file_read(struct file *, char *, size_t, loff_t *);
extern ssize_t generic_file_write(struct file *, const char *, size_t, loff_t *);
extern void do_generic_file_read(struct file *, loff_t *, read_descriptor_t *, read_actor_t);
ssize_t generic_file_direct_IO(int rw, struct inode *inode, char *buf,
loff_t offset, size_t count);
int generic_direct_IO(int rw, struct inode *inode, char *buf,
loff_t offset, size_t count, get_block_t *get_block);
extern loff_t no_llseek(struct file *file, loff_t offset, int origin);
extern loff_t generic_file_llseek(struct file *file, loff_t offset, int origin);
extern loff_t remote_llseek(struct file *file, loff_t offset, int origin);
......
......@@ -414,7 +414,7 @@ static int invalidate_list_pages2(struct address_space * mapping,
* free the pages because they're mapped.
* @mapping: the address_space which pages we want to invalidate
*/
void invalidate_inode_pages2(struct address_space * mapping)
void invalidate_inode_pages2(struct address_space *mapping)
{
int unlocked;
......@@ -1102,6 +1102,7 @@ void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t *
UPDATE_ATIME(inode);
}
#if 0
static ssize_t generic_file_direct_IO(int rw, struct file * filp, char * buf, size_t count, loff_t offset)
{
ssize_t retval;
......@@ -1182,6 +1183,7 @@ static ssize_t generic_file_direct_IO(int rw, struct file * filp, char * buf, si
out:
return retval;
}
#endif
int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
{
......@@ -1209,15 +1211,36 @@ int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long o
* This is the "read()" routine for all filesystems
* that can use the page cache directly.
*/
ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *ppos)
ssize_t
generic_file_read(struct file *filp, char *buf, size_t count, loff_t *ppos)
{
ssize_t retval;
if ((ssize_t) count < 0)
return -EINVAL;
if (filp->f_flags & O_DIRECT)
goto o_direct;
if (filp->f_flags & O_DIRECT) {
loff_t pos = *ppos, size;
struct address_space *mapping;
struct inode *inode;
mapping = filp->f_dentry->d_inode->i_mapping;
inode = mapping->host;
retval = 0;
if (!count)
goto out; /* skip atime */
size = inode->i_size;
if (pos < size) {
if (pos + count > size)
count = size - pos;
retval = generic_file_direct_IO(READ, inode,
buf, pos, count);
if (retval > 0)
*ppos = pos + retval;
}
UPDATE_ATIME(filp->f_dentry->d_inode);
goto out;
}
retval = -EFAULT;
if (access_ok(VERIFY_WRITE, buf, count)) {
......@@ -1230,36 +1253,14 @@ ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *
desc.count = count;
desc.buf = buf;
desc.error = 0;
do_generic_file_read(filp, ppos, &desc, file_read_actor);
do_generic_file_read(filp,ppos,&desc,file_read_actor);
retval = desc.written;
if (!retval)
retval = desc.error;
}
}
out:
out:
return retval;
o_direct:
{
loff_t pos = *ppos, size;
struct address_space *mapping = filp->f_dentry->d_inode->i_mapping;
struct inode *inode = mapping->host;
retval = 0;
if (!count)
goto out; /* skip atime */
size = inode->i_size;
if (pos < size) {
if (pos + count > size)
count = size - pos;
retval = generic_file_direct_IO(READ, filp, buf, count, pos);
if (retval > 0)
*ppos = pos + retval;
}
UPDATE_ATIME(filp->f_dentry->d_inode);
goto out;
}
}
static int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
......@@ -2186,8 +2187,8 @@ generic_file_write(struct file *file, const char *buf,
}
if (unlikely(file->f_flags & O_DIRECT)) {
written = generic_file_direct_IO(WRITE, file,
(char *) buf, count, pos);
written = generic_file_direct_IO(WRITE, inode,
(char *)buf, pos, count);
if (written > 0) {
loff_t end = pos + written;
if (end > inode->i_size && !S_ISBLK(inode->i_mode)) {
......@@ -2195,7 +2196,8 @@ generic_file_write(struct file *file, const char *buf,
mark_inode_dirty(inode);
}
*ppos = end;
invalidate_inode_pages2(mapping);
if (mapping->nrpages)
invalidate_inode_pages2(mapping);
}
/*
* Sync the fs metadata but not the minor inode changes and
......