[PATCH] direct_io mopup

Some cleanup from the surprise direct-to-bio for O_DIRECT merge.

- Remove bits and pieces from the kiobuf implementation.
- Replace the waitqueue in struct dio with just a task_struct pointer and use wake_up_process. (Ben).
- Only take mmap_sem around the individual calls to get_user_pages(). (It pins the vmas, yes?)
- Remove some debug code.
- Fix JFS.

[PATCH] direct_io mopup
Some cleanup from the surprise direct-to-bio for O_DIRECT merge.

- Remove bits and pieces from the kiobuf implementation.
- Replace the waitqueue in struct dio with just a task_struct pointer and use wake_up_process. (Ben).
- Only take mmap_sem around the individual calls to get_user_pages(). (It pins the vmas, yes?)
- Remove some debug code.
- Fix JFS.
e3339bee · Andrew Morton · Linus Torvalds · 4504a57e · e3339bee · e3339bee
Commit e3339bee authored Jul 18, 2002 by Andrew Morton Committed by Linus Torvalds Jul 18, 2002
8 changed files
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2309,55 +2309,6 @@ sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
 	return tmp.b_blocknr;
 }

-#if 0
-int generic_direct_IO(int rw, struct inode *inode,
-			struct kiobuf *iobuf, unsigned long blocknr,
-			int blocksize, get_block_t *get_block)
-{
-	int i, nr_blocks, retval = 0;
-	sector_t *blocks = iobuf->blocks;
-	struct block_device *bdev = NULL;
-
-	nr_blocks = iobuf->length / blocksize;
-	/* build the blocklist */
-	for (i = 0; i < nr_blocks; i++, blocknr++) {
-		struct buffer_head bh;
-
-		bh.b_state = 0;
-		bh.b_size = blocksize;
-
-		retval = get_block(inode, blocknr, &bh, rw & 1);
-		if (retval)
-			goto out;
-
-		if (rw == READ) {
-			if (buffer_new(&bh))
-				BUG();
-			if (!buffer_mapped(&bh)) {
-				/* there was an hole in the filesystem */
-				blocks[i] = -1UL;
-				continue;
-			}
-		} else {
-			if (buffer_new(&bh))
-				unmap_underlying_metadata(bh.b_bdev,
-							bh.b_blocknr);
-			if (!buffer_mapped(&bh))
-				BUG();
-		}
-		blocks[i] = bh.b_blocknr;
-		bdev = bh.b_bdev;
-	}
-
-	/* This does not understand multi-device filesystems currently */
-	if (bdev)
-		retval = brw_kiovec(rw, 1, &iobuf, bdev, blocks, blocksize);
-
- out:
-	return retval;
-}
-#endif
-
 /*
 * Start I/O on a physical range of kernel memory, defined by a vector
 * of kiobuf structs (much like a user-space iovec list).

--- a/fs/direct-io.c
+++ b/fs/direct-io.c
 /*
- * mm/direct-io.c
+ * fs/direct-io.c
 *
 * Copyright (C) 2002, Linus Torvalds.
 *
@@ -61,7 +61,7 @@ struct dio {
 	atomic_t bio_count;
 	spinlock_t bio_list_lock;
 	struct bio *bio_list;		/* singly linked via bi_private */
-	wait_queue_head_t wait_q;
+	struct task_struct *waiter;
 };

 /*
@@ -81,6 +81,7 @@ static int dio_refill_pages(struct dio *dio)
 	int nr_pages;

 	nr_pages = min(dio->total_pages - dio->curr_page, DIO_PAGES);
+	down_read(&current->mm->mmap_sem);
 	ret = get_user_pages(
 		current,			/* Task for fault acounting */
 		current->mm,			/* whose pages? */
@@ -90,6 +91,7 @@ static int dio_refill_pages(struct dio *dio)
 		0,				/* force (?) */
 		&dio->pages[0],
 		NULL);				/* vmas */
+	up_read(&current->mm->mmap_sem);

 	if (ret >= 0) {
 		dio->curr_user_address += ret * PAGE_SIZE;
@@ -139,7 +141,7 @@ static void dio_bio_end_io(struct bio *bio)
 	bio->bi_private = dio->bio_list;
 	dio->bio_list = bio;
 	spin_unlock_irqrestore(&dio->bio_list_lock, flags);
-	wake_up(&dio->wait_q);
+	wake_up_process(dio->waiter);
 }

 static int
@@ -193,13 +195,11 @@ static void dio_cleanup(struct dio *dio)
 */
 static struct bio *dio_await_one(struct dio *dio)
 {
-	DECLARE_WAITQUEUE(wait, current);
 	unsigned long flags;
 	struct bio *bio;

 	spin_lock_irqsave(&dio->bio_list_lock, flags);
 	while (dio->bio_list == NULL) {
-		add_wait_queue(&dio->wait_q, &wait);
 		set_current_state(TASK_UNINTERRUPTIBLE);
 		if (dio->bio_list == NULL) {
 			spin_unlock_irqrestore(&dio->bio_list_lock, flags);
@@ -208,7 +208,6 @@ static struct bio *dio_await_one(struct dio *dio)
 			spin_lock_irqsave(&dio->bio_list_lock, flags);
 		}
 		set_current_state(TASK_RUNNING);
-		remove_wait_queue(&dio->wait_q, &wait);
 	}
 	bio = dio->bio_list;
 	dio->bio_list = bio->bi_private;
@@ -224,23 +223,17 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio)
 	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 	struct bio_vec *bvec = bio->bi_io_vec;
 	int page_no;
-	int ret = 0;

 	for (page_no = 0; page_no < bio->bi_vcnt; page_no++) {
 		struct page *page = bvec[page_no].bv_page;

-		if (!uptodate) {
-			if (ret == 0)
-				ret = -EIO;
-		}
-
 		if (dio->rw == READ)
 			set_page_dirty(page);
 		page_cache_release(page);
 	}
 	atomic_dec(&dio->bio_count);
 	bio_put(bio);
-	return ret;
+	return uptodate ? 0 : -EIO;
 }

 /*
@@ -265,7 +258,7 @@ static int dio_await_completion(struct dio *dio)
 * to keep the memory consumption sane we periodically reap any completed BIOs
 * during the BIO generation phase.
 *
- * This also helps to limis the peak amount of pinned userspace memory.
+ * This also helps to limit the peak amount of pinned userspace memory.
 */
 static int dio_bio_reap(struct dio *dio)
 {
@@ -388,15 +381,13 @@ int do_direct_IO(struct dio *dio)
 	return ret;
 }

-struct dio *g_dio;
-
 int
 generic_direct_IO(int rw, struct inode *inode, char *buf, loff_t offset,
 			size_t count, get_block_t get_block)
 {
 	const unsigned blocksize_mask = (1 << inode->i_blkbits) - 1;
 	const unsigned long user_addr = (unsigned long)buf;
-	int ret = 0;
+	int ret;
 	int ret2;
 	struct dio dio;
 	size_t bytes;
@@ -407,8 +398,6 @@ generic_direct_IO(int rw, struct inode *inode, char *buf, loff_t offset,
 		goto out;
 	}

-	g_dio = &dio;
-
 	/* BIO submission state */
 	dio.bio = NULL;
 	dio.bvec = NULL;
@@ -444,11 +433,9 @@ generic_direct_IO(int rw, struct inode *inode, char *buf, loff_t offset,
 	atomic_set(&dio.bio_count, 0);
 	spin_lock_init(&dio.bio_list_lock);
 	dio.bio_list = NULL;
-	init_waitqueue_head(&dio.wait_q);
+	dio.waiter = current;

-	down_read(&current->mm->mmap_sem);
 	ret = do_direct_IO(&dio);
-	up_read(&current->mm->mmap_sem);

 	if (dio.bio)
 		dio_bio_submit(&dio);

--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -248,23 +248,6 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
 		if (!inode->i_mapping || !inode->i_mapping->a_ops ||
 			!inode->i_mapping->a_ops->direct_IO)
 				return -EINVAL;
-
-		/*
-		 * alloc_kiovec() can sleep and we are only serialized by
-		 * the big kernel lock here, so abuse the i_sem to serialize
-		 * this case too. We of course wouldn't need to go deep down
-		 * to the inode layer, we could stay at the file layer, but
-		 * we don't want to pay for the memory of a semaphore in each
-		 * file structure too and we use the inode semaphore that we just
-		 * pay for anyways.
-		 */
-		error = 0;
-		down(&inode->i_sem);
-		if (!filp->f_iobuf)
-			error = alloc_kiovec(1, &filp->f_iobuf);
-		up(&inode->i_sem);
-		if (error < 0)
-			return error;
 	}

 	/* required for strict SunOS emulation */

--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -115,9 +115,6 @@ void __fput(struct file * file)

 	locks_remove_flock(file);

-	if (file->f_iobuf)
-		free_kiovec(1, &file->f_iobuf);
-
 	if (file->f_op && file->f_op->release)
 		file->f_op->release(inode, file);
 	fops_put(file->f_op);

--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -293,11 +293,10 @@ static int jfs_bmap(struct address_space *mapping, long block)
 	return generic_block_bmap(mapping, block, jfs_get_block);
 }

-static int jfs_direct_IO(int rw, struct inode *inode, struct kiobuf *iobuf,
-			 unsigned long blocknr, int blocksize)
+static int jfs_direct_IO(int rw, struct inode *inode, char *buf,
+			loff_t offset, size_t count)
 {
-	return generic_direct_IO(rw, inode, iobuf, blocknr,
-				 blocksize, jfs_get_block);
+	return generic_direct_IO(rw, inode, buf, offset, count, jfs_get_block);
 }

 struct address_space_operations jfs_aops = {

--- a/fs/open.c
+++ b/fs/open.c
@@ -647,15 +647,6 @@ struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags)
 	f->f_op = fops_get(inode->i_fop);
 	file_move(f, &inode->i_sb->s_files);

-	/* preallocate kiobuf for O_DIRECT */
-	f->f_iobuf = NULL;
-	f->f_iobuf_lock = 0;
-	if (f->f_flags & O_DIRECT) {
-		error = alloc_kiovec(1, &f->f_iobuf);
-		if (error)
-			goto cleanup_all;
-	}
-
 	if (f->f_op && f->f_op->open) {
 		error = f->f_op->open(inode,f);
 		if (error)
@@ -675,8 +666,6 @@ struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags)
 	return f;

 cleanup_all:
-	if (f->f_iobuf)
-		free_kiovec(1, &f->f_iobuf);
 	fops_put(f->f_op);
 	if (f->f_mode & FMODE_WRITE)
 		put_write_access(inode);

--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -274,7 +274,6 @@ struct iattr {
 */
 struct page;
 struct address_space;
-struct kiobuf;

 struct address_space_operations {
 	int (*writepage)(struct page *);
@@ -493,10 +492,6 @@ struct file {

 	/* needed for tty driver, and maybe others */
 	void			*private_data;
-
-	/* preallocated helper kiobuf to speedup O_DIRECT */
-	struct kiobuf		*f_iobuf;
-	long			f_iobuf_lock;
 };
 extern spinlock_t files_lock;
 #define file_list_lock() spin_lock(&files_lock);

--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1102,89 +1102,6 @@ void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t *
 	UPDATE_ATIME(inode);
 }

-#if 0
-static ssize_t generic_file_direct_IO(int rw, struct file * filp, char * buf, size_t count, loff_t offset)
-{
-	ssize_t retval;
-	int new_iobuf, chunk_size, blocksize_mask, blocksize, blocksize_bits, iosize, progress;
-	struct kiobuf * iobuf;
-	struct address_space * mapping = filp->f_dentry->d_inode->i_mapping;
-	struct inode * inode = mapping->host;
-
-	new_iobuf = 0;
-	iobuf = filp->f_iobuf;
-	if (test_and_set_bit(0, &filp->f_iobuf_lock)) {
-		/*
-		 * A parallel read/write is using the preallocated iobuf
-		 * so just run slow and allocate a new one.
-		 */
-		retval = alloc_kiovec(1, &iobuf);
-		if (retval)
-			goto out;
-		new_iobuf = 1;
-	}
-
-	blocksize = 1 << inode->i_blkbits;
-	blocksize_bits = inode->i_blkbits;
-	blocksize_mask = blocksize - 1;
-	chunk_size = KIO_MAX_ATOMIC_IO << 10;
-
-	retval = -EINVAL;
-	if ((offset & blocksize_mask) || (count & blocksize_mask))
-		goto out_free;
-
-	/*
-	 * Flush to disk exclusively the _data_, metadata must remain
-	 * completly asynchronous or performance will go to /dev/null.
-	 */
-	retval = filemap_fdatawait(mapping);
-	if (retval == 0)
-		retval = filemap_fdatawrite(mapping);
-	if (retval == 0)
-		retval = filemap_fdatawait(mapping);
-	if (retval < 0)
-		goto out_free;
-
-	progress = retval = 0;
-	while (count > 0) {
-		iosize = count;
-		if (iosize > chunk_size)
-			iosize = chunk_size;
-
-		retval = map_user_kiobuf(rw, iobuf, (unsigned long) buf, iosize);
-		if (retval)
-			break;
-
-		retval = mapping->a_ops->direct_IO(rw, inode, iobuf, (offset+progress) >> blocksize_bits, blocksize);
-
-		if (rw == READ && retval > 0)
-			mark_dirty_kiobuf(iobuf, retval);
-		
-		if (retval >= 0) {
-			count -= retval;
-			buf += retval;
-			progress += retval;
-		}
-
-		unmap_kiobuf(iobuf);
-
-		if (retval != iosize)
-			break;
-	}
-
-	if (progress)
-		retval = progress;
-
- out_free:
-	if (!new_iobuf)
-		clear_bit(0, &filp->f_iobuf_lock);
-	else
-		free_kiovec(1, &iobuf);
- out:	
-	return retval;
-}
-#endif
-
 int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
 {
 	char *kaddr;