v2.4.10 -> v2.4.10.0.1

- me/al/andrea: buffers-in-pagecache coherency, buffer.c cleanups

v2.4.10 -> v2.4.10.0.1
- me/al/andrea: buffers-in-pagecache coherency, buffer.c cleanups
a41cd6e4 · Linus Torvalds · 8c7cba55 · a41cd6e4 · a41cd6e4 · a41cd6e4
Commit a41cd6e4 authored Feb 04, 2002 by Linus Torvalds
10 changed files
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -22,63 +22,85 @@

 #include <asm/uaccess.h>

-static inline int blkdev_get_block(struct inode * inode, long iblock, struct buffer_head * bh_result)
-{
-	int err;
+#define MAX_BUF_PER_PAGE (PAGE_CACHE_SIZE / 512)

-	err = -EIO;
-	if (iblock >= buffered_blk_size(inode->i_rdev) >> (BUFFERED_BLOCKSIZE_BITS - BLOCK_SIZE_BITS))
-		goto out;
+static inline unsigned int blksize_bits(unsigned int size)
+{
+	unsigned int bits = 8;
+	do {
+		bits++;
+		size >>= 1;
+	} while (size > 256);
+	return bits;
+}

-	bh_result->b_blocknr = iblock;
-	bh_result->b_state |= 1UL << BH_Mapped;
-	err = 0;
+static inline unsigned int block_size(kdev_t dev)
+{
+	int retval = BLOCK_SIZE;
+	int major = MAJOR(dev);

- out:
-	return err;
+	if (blksize_size[major]) {
+		int minor = MINOR(dev);
+		if (blksize_size[major][minor])
+			retval = blksize_size[major][minor];
+	}
+	return retval;
 }

-static int blkdev_direct_IO(int rw, struct inode * inode, struct kiobuf * iobuf, unsigned long blocknr, int blocksize)
+static unsigned int max_block(kdev_t dev)
 {
-	int i, nr_blocks, retval, dev = inode->i_rdev;
-	unsigned long * blocks = iobuf->blocks;
+	unsigned int retval = ~0U;
+	int major = MAJOR(dev);

-	if (blocksize != BUFFERED_BLOCKSIZE)
-		BUG();
+	if (blk_size[major]) {
+		int minor = MINOR(dev);
+		unsigned int blocks = blk_size[major][minor];
+		if (blocks) {
+			unsigned int size = block_size(dev);
+			unsigned int sizebits = blksize_bits(size);
+			blocks += (size-1) >> BLOCK_SIZE_BITS;
+			retval = blocks << (BLOCK_SIZE_BITS - sizebits);
+			if (sizebits > BLOCK_SIZE_BITS)
+				retval = blocks >> (sizebits - BLOCK_SIZE_BITS);
+		}
+	}
+	return retval;
+}

-	nr_blocks = iobuf->length >> BUFFERED_BLOCKSIZE_BITS;
-	/* build the blocklist */
-	for (i = 0; i < nr_blocks; i++, blocknr++) {
-		struct buffer_head bh;

-		retval = blkdev_get_block(inode, blocknr, &bh);
-		if (retval)
-			goto out;
+static inline int blkdev_get_block(struct inode * inode, long iblock, struct buffer_head * bh_result)
+{
+	int err;

-		blocks[i] = bh.b_blocknr;
-	}
+	err = -EIO;
+	if (iblock >= max_block(inode->i_rdev))
+		goto out;

-	retval = brw_kiovec(rw, 1, &iobuf, dev, iobuf->blocks, blocksize);
+	bh_result->b_blocknr = iblock;
+	bh_result->b_state |= 1UL << BH_Mapped;
+	err = 0;

 out:
-	return retval;
+	return err;
 }

 static int blkdev_writepage(struct page * page)
 {
 	int err, i;
+	unsigned int blocksize;
 	unsigned long block;
 	struct buffer_head *bh, *head;
 	struct inode *inode = page->mapping->host;

 	if (!PageLocked(page))
 		BUG();
+	blocksize = block_size(inode->i_rdev);	

 	if (!page->buffers)
-		create_empty_buffers(page, inode->i_rdev, BUFFERED_BLOCKSIZE);
+		create_empty_buffers(page, inode->i_rdev, blocksize);
 	head = page->buffers;

-	block = page->index << (PAGE_CACHE_SHIFT - BUFFERED_BLOCKSIZE_BITS);
+	block = page->index << (PAGE_CACHE_SHIFT - blksize_bits(blocksize));

 	bh = head;
 	i = 0;
@@ -132,19 +154,21 @@ static int blkdev_readpage(struct file * file, struct page * page)
 	struct inode *inode = page->mapping->host;
 	kdev_t dev = inode->i_rdev;
 	unsigned long iblock, lblock;
-	struct buffer_head *bh, *head, *arr[1 << (PAGE_CACHE_SHIFT - BUFFERED_BLOCKSIZE_BITS)];
-	unsigned int blocks;
+	struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
+	unsigned int blocks, blocksize, blocksize_bits;
 	int nr, i;

 	if (!PageLocked(page))
 		PAGE_BUG(page);
+	blocksize = block_size(dev);
+	blocksize_bits = blksize_bits(blocksize);
 	if (!page->buffers)
-		create_empty_buffers(page, dev, BUFFERED_BLOCKSIZE);
+		create_empty_buffers(page, dev, blocksize);
 	head = page->buffers;

-	blocks = PAGE_CACHE_SIZE >> BUFFERED_BLOCKSIZE_BITS;
-	iblock = page->index << (PAGE_CACHE_SHIFT - BUFFERED_BLOCKSIZE_BITS);
-	lblock = buffered_blk_size(dev) >> (BUFFERED_BLOCKSIZE_BITS - BLOCK_SIZE_BITS);
+	blocks = PAGE_CACHE_SIZE >> blocksize_bits;
+	iblock = page->index << (PAGE_CACHE_SHIFT - blocksize_bits);
+	lblock = max_block(dev);
 	bh = head;
 	nr = 0;
 	i = 0;
@@ -159,7 +183,7 @@ static int blkdev_readpage(struct file * file, struct page * page)
 					continue;
 			}
 			if (!buffer_mapped(bh)) {
-				memset(kmap(page) + i * BUFFERED_BLOCKSIZE, 0, BUFFERED_BLOCKSIZE);
+				memset(kmap(page) + i * blocksize, 0, blocksize);
 				flush_dcache_page(page);
 				kunmap(page);
 				set_bit(BH_Uptodate, &bh->b_state);
@@ -206,19 +230,21 @@ static int __blkdev_prepare_write(struct inode *inode, struct page *page,
 	unsigned long block;
 	int err = 0;
 	struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
-	kmap(page);
+	unsigned int blocksize, blocksize_bits;

+	blocksize = block_size(dev);
+	blocksize_bits = blksize_bits(blocksize);
 	if (!page->buffers)
-		create_empty_buffers(page, dev, BUFFERED_BLOCKSIZE);
+		create_empty_buffers(page, dev, blocksize);
 	head = page->buffers;

-	block = page->index << (PAGE_CACHE_SHIFT - BUFFERED_BLOCKSIZE_BITS);
+	block = page->index << (PAGE_CACHE_SHIFT - blocksize_bits);

 	for(bh = head, block_start = 0; bh != head || !block_start;
 	    block++, block_start=block_end, bh = bh->b_this_page) {
 		if (!bh)
 			BUG();
-		block_end = block_start + BUFFERED_BLOCKSIZE;
+		block_end = block_start + blocksize;
 		if (block_end <= from)
 			continue;
 		if (block_start >= to)
@@ -258,7 +284,6 @@ static int blkdev_prepare_write(struct file *file, struct page *page, unsigned f
 	int err = __blkdev_prepare_write(inode, page, from, to);
 	if (err) {
 		ClearPageUptodate(page);
-		kunmap(page);
 	}
 	return err;
 }
@@ -269,11 +294,13 @@ static int __blkdev_commit_write(struct inode *inode, struct page *page,
 	unsigned block_start, block_end;
 	int partial = 0, need_balance_dirty = 0;
 	struct buffer_head *bh, *head;
+	unsigned int blocksize;

+	blocksize = block_size(inode->i_rdev);
 	for(bh = head = page->buffers, block_start = 0;
 	    bh != head || !block_start;
 	    block_start=block_end, bh = bh->b_this_page) {
-		block_end = block_start + BUFFERED_BLOCKSIZE;
+		block_end = block_start + blocksize;
 		if (block_end <= from || block_start >= to) {
 			if (!buffer_uptodate(bh))
 				partial = 1;
@@ -305,7 +332,6 @@ static int blkdev_commit_write(struct file *file, struct page *page,
 {
 	struct inode *inode = page->mapping->host;
 	__blkdev_commit_write(inode,page,from,to);
-	kunmap(page);
 	return 0;
 }

@@ -797,8 +823,6 @@ int blkdev_put(struct block_device *bdev, int kind)
 				invalidate_buffers(bd_inode->i_rdev);
 			}
 			lock_super(sb);
-			if (sb->s_flags & MS_RDONLY)
-				update_buffers(bd_inode->i_rdev);
 			unlock_super(sb);
 			drop_super(sb);
 		}
@@ -837,7 +861,6 @@ struct address_space_operations def_blk_aops = {
 	sync_page: block_sync_page,
 	prepare_write: blkdev_prepare_write,
 	commit_write: blkdev_commit_write,
-	direct_IO: blkdev_direct_IO,
 };

 struct file_operations def_blk_fops = {

--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -96,7 +96,8 @@ struct bh_free_head {
 };
 static struct bh_free_head free_list[NR_SIZES];

-static int grow_buffers(int size);
+static void truncate_buffers(kdev_t dev);
+static int grow_buffers(kdev_t dev, int block, int size);
 static void __refile_buffer(struct buffer_head *);

 /* This is used by some architectures to estimate available memory. */
@@ -559,59 +560,28 @@ static void __insert_into_queues(struct buffer_head *bh)
 	__insert_into_lru_list(bh, bh->b_list);
 }

-/* This function must only run if there are no other
- * references _anywhere_ to this buffer head.
- */
-static void put_last_free(struct buffer_head * bh)
+struct buffer_head * get_hash_table(kdev_t dev, int block, int size)
 {
-	struct bh_free_head *head = &free_list[BUFSIZE_INDEX(bh->b_size)];
-	struct buffer_head **bhp = &head->list;
-
-	bh->b_state = 0;
-
-	spin_lock(&head->lock);
-	bh->b_dev = B_FREE;
-	if(!*bhp) {
-		*bhp = bh;
-		bh->b_prev_free = bh;
-	}
-	bh->b_next_free = *bhp;
-	bh->b_prev_free = (*bhp)->b_prev_free;
-	(*bhp)->b_prev_free->b_next_free = bh;
-	(*bhp)->b_prev_free = bh;
-	spin_unlock(&head->lock);
-}
+	struct buffer_head *bh, **p = &hash(dev, block);

-/*
- * Why like this, I hear you say... The reason is race-conditions.
- * As we don't lock buffers (unless we are reading them, that is),
- * something might happen to it while we sleep (ie a read-error
- * will force it bad). This shouldn't really happen currently, but
- * the code is ready.
- */
-static inline struct buffer_head * __get_hash_table(kdev_t dev, int block, int size)
-{
-	struct buffer_head *bh = hash(dev, block);
+	read_lock(&hash_table_lock);

-	for (; bh; bh = bh->b_next)
-		if (bh->b_blocknr == block	&&
-		    bh->b_size    == size	&&
-		    bh->b_dev     == dev)
+	for (;;) {
+		bh = *p;
+		if (!bh)
 			break;
-	if (bh)
+		p = &bh->b_next;
+		if (bh->b_blocknr != block)
+			continue;
+		if (bh->b_size != size)
+			continue;
+		if (bh->b_dev != dev)
+			continue;
 		get_bh(bh);
+		break;
+	}

-	return bh;
-}
-
-struct buffer_head * get_hash_table(kdev_t dev, int block, int size)
-{
-	struct buffer_head *bh;
-
-	read_lock(&hash_table_lock);
-	bh = __get_hash_table(dev, block, size);
 	read_unlock(&hash_table_lock);
-
 	return bh;
 }

@@ -688,7 +658,7 @@ int inode_has_buffers(struct inode *inode)
   we think the disk contains more recent information than the buffercache.
   The update == 1 pass marks the buffers we need to update, the update == 2
   pass does the actual I/O. */
-void __invalidate_buffers(kdev_t dev, int destroy_dirty_buffers, int update)
+void __invalidate_buffers(kdev_t dev, int destroy_dirty_buffers)
 {
 	int i, nlist, slept;
 	struct buffer_head * bh, * bh_next;
@@ -722,33 +692,18 @@ void __invalidate_buffers(kdev_t dev, int destroy_dirty_buffers, int update)
 			/* All buffers in the lru lists are mapped */
 			if (!buffer_mapped(bh))
 				BUG();
+			if (buffer_dirty(bh))
+				printk("invalidate: dirty buffer\n");
 			if (!atomic_read(&bh->b_count)) {
 				if (destroy_dirty_buffers || !buffer_dirty(bh)) {
 					remove_inode_queue(bh);
+#if 0
 					__remove_from_queues(bh);
 					put_last_free(bh);
+#endif
 				}
-			} else if (update) {
-				if ((update == 2) ^ buffer_uptodate(bh)  &&
-				    (update == 2) ^ buffer_req(bh)) {
-					write_unlock(&hash_table_lock);
-					atomic_inc(&bh->b_count);
-					spin_unlock(&lru_list_lock);
-
-					if (update == 2) {
-						ll_rw_block(READ, 1, &bh);
-						wait_on_buffer(bh);
-					} else {
-						lock_buffer(bh);
-						clear_bit(BH_Uptodate, &bh->b_state);
-						clear_bit(BH_Req, &bh->b_state);
-						unlock_buffer(bh);
-					}						
-
-					atomic_dec(&bh->b_count);
-					goto retry;
-				}
-			}
+			} else
+				printk("invalidate: busy buffer\n");

 			write_unlock(&hash_table_lock);
 			if (slept)
@@ -759,13 +714,14 @@ void __invalidate_buffers(kdev_t dev, int destroy_dirty_buffers, int update)
 	spin_unlock(&lru_list_lock);
 	if (slept)
 		goto retry;
+
+	/* Get rid of the page cache */
+	truncate_buffers(dev);
 }

 void set_blocksize(kdev_t dev, int size)
 {
 	extern int *blksize_size[];
-	int i, nlist, slept;
-	struct buffer_head * bh, * bh_next;

 	if (!blksize_size[MAJOR(dev)])
 		return;
@@ -780,60 +736,10 @@ void set_blocksize(kdev_t dev, int size)
 	}
 	if (blksize_size[MAJOR(dev)][MINOR(dev)] == size)
 		return;
+
 	sync_buffers(dev, 2);
 	blksize_size[MAJOR(dev)][MINOR(dev)] = size;
-
- retry:
-	slept = 0;
-	spin_lock(&lru_list_lock);
-	for(nlist = 0; nlist < NR_LIST; nlist++) {
-		bh = lru_list[nlist];
-		if (!bh)
-			continue;
-		for (i = nr_buffers_type[nlist]; i > 0 ; bh = bh_next, i--) {
-			bh_next = bh->b_next_free;
-			if (bh->b_dev != dev || bh->b_size == size)
-				continue;
-			/* Unhashed? */
-			if (!bh->b_pprev)
-				continue;
-			if (buffer_locked(bh)) {
-				get_bh(bh);
-				spin_unlock(&lru_list_lock);
-				wait_on_buffer(bh);
-				slept = 1;
-				spin_lock(&lru_list_lock);
-				put_bh(bh);
-			}
-
-			write_lock(&hash_table_lock);
-			if (!atomic_read(&bh->b_count)) {
-				if (buffer_dirty(bh))
-					printk(KERN_WARNING
-					       "set_blocksize: dev %s buffer_dirty %lu size %hu\n",
-					       kdevname(dev), bh->b_blocknr, bh->b_size);
-				remove_inode_queue(bh);
-				__remove_from_queues(bh);
-				put_last_free(bh);
-			} else {
-				if (atomic_set_buffer_clean(bh))
-					__refile_buffer(bh);
-				clear_bit(BH_Uptodate, &bh->b_state);
-				printk(KERN_WARNING
-				       "set_blocksize: "
-				       "b_count %d, dev %s, block %lu, from %p\n",
-				       atomic_read(&bh->b_count), bdevname(bh->b_dev),
-				       bh->b_blocknr, __builtin_return_address(0));
-			}
-			write_unlock(&hash_table_lock);
-			if (slept)
-				goto out;
-		}
-	}
- out:
-	spin_unlock(&lru_list_lock);
-	if (slept)
-		goto retry;
+	invalidate_buffers(dev);
 }

 static void free_more_memory(void)
@@ -1137,57 +1043,16 @@ void invalidate_inode_buffers(struct inode *inode)
 */
 struct buffer_head * getblk(kdev_t dev, int block, int size)
 {
-	struct buffer_head * bh;
-	int isize;
-
-repeat:
-	spin_lock(&lru_list_lock);
-	write_lock(&hash_table_lock);
-	bh = __get_hash_table(dev, block, size);
-	if (bh)
-		goto out;
-
-	isize = BUFSIZE_INDEX(size);
-	spin_lock(&free_list[isize].lock);
-	bh = free_list[isize].list;
-	if (bh) {
-		__remove_from_free_list(bh, isize);
-		atomic_set(&bh->b_count, 1);
-	}
-	spin_unlock(&free_list[isize].lock);
+	for (;;) {
+		struct buffer_head * bh;

-	/*
-	 * OK, FINALLY we know that this buffer is the only one of
-	 * its kind, we hold a reference (b_count>0), it is unlocked,
-	 * and it is clean.
-	 */
-	if (bh) {
-		init_buffer(bh, NULL, NULL);
-		bh->b_dev = dev;
-		bh->b_blocknr = block;
-		bh->b_state = 1 << BH_Mapped;
+		bh = get_hash_table(dev, block, size);
+		if (bh)
+			return bh;

-		/* Insert the buffer into the regular lists */
-		__insert_into_queues(bh);
-	out:
-		write_unlock(&hash_table_lock);
-		spin_unlock(&lru_list_lock);
-		touch_buffer(bh);
-		return bh;
+		if (!grow_buffers(dev, block, size))
+			free_more_memory();
 	}
-
-	/*
-	 * If we block while refilling the free list, somebody may
-	 * create the buffer first ... search the hashes again.
-	 */
-	write_unlock(&hash_table_lock);
-	spin_unlock(&lru_list_lock);
-
-	if (!grow_buffers(size))
-		free_more_memory();
-
-	/* FIXME: getblk should fail if there's no enough memory */
-	goto repeat;
 }

 /* -1 -> no need to flush
@@ -1313,22 +1178,7 @@ void __brelse(struct buffer_head * buf)
 */
 void __bforget(struct buffer_head * buf)
 {
-	/* grab the lru lock here to block bdflush. */
-	spin_lock(&lru_list_lock);
-	write_lock(&hash_table_lock);
-	if (!atomic_dec_and_test(&buf->b_count) || buffer_locked(buf))
-		goto in_use;
-	__hash_unlink(buf);
-	write_unlock(&hash_table_lock);
-	remove_inode_queue(buf);
-	__remove_from_lru_list(buf, buf->b_list);
-	spin_unlock(&lru_list_lock);
-	put_last_free(buf);
-	return;
-
- in_use:
-	write_unlock(&hash_table_lock);
-	spin_unlock(&lru_list_lock);
+	__brelse(buf);
 }

 /**
@@ -1524,17 +1374,17 @@ static struct buffer_head * create_buffers(struct page * page, unsigned long siz
 	goto try_again;
 }

-static void unmap_buffer(struct buffer_head * bh)
+/*
+ * Called when truncating a buffer on a page completely.
+ *
+ * We can avoid IO by marking it clean.
+ * FIXME!! FIXME!! FIXME!! We need to unmap it too,
+ * so that the filesystem won't write to it. There's
+ * some bug somewhere..
+ */
+static void discard_buffer(struct buffer_head * bh)
 {
-	if (buffer_mapped(bh)) {
-		mark_buffer_clean(bh);
-		lock_buffer(bh);
-		clear_bit(BH_Uptodate, &bh->b_state);
-		clear_bit(BH_Mapped, &bh->b_state);
-		clear_bit(BH_Req, &bh->b_state);
-		clear_bit(BH_New, &bh->b_state);
-		unlock_buffer(bh);
-	}
+	mark_buffer_clean(bh);
 }

 /*
@@ -1564,7 +1414,7 @@ int discard_bh_page(struct page *page, unsigned long offset, int drop_pagecache)
 		 * is this block fully flushed?
 		 */
 		if (offset <= curr_off)
-			unmap_buffer(bh);
+			discard_buffer(bh);
 		curr_off = next_off;
 		bh = next;
 	} while (bh != head);
@@ -2141,47 +1991,6 @@ int generic_block_bmap(struct address_space *mapping, long block, get_block_t *g
 	return tmp.b_blocknr;
 }

-int generic_direct_IO(int rw, struct inode * inode, struct kiobuf * iobuf, unsigned long blocknr, int blocksize, get_block_t * get_block)
-{
-	int i, nr_blocks, retval;
-	unsigned long * blocks = iobuf->blocks;
-
-	nr_blocks = iobuf->length / blocksize;
-	/* build the blocklist */
-	for (i = 0; i < nr_blocks; i++, blocknr++) {
-		struct buffer_head bh;
-
-		bh.b_state = 0;
-		bh.b_dev = inode->i_dev;
-		bh.b_size = blocksize;
-
-		retval = get_block(inode, blocknr, &bh, rw == READ ? 0 : 1);
-		if (retval)
-			goto out;
-
-		if (rw == READ) {
-			if (buffer_new(&bh))
-				BUG();
-			if (!buffer_mapped(&bh)) {
-				/* there was an hole in the filesystem */
-				blocks[i] = -1UL;
-				continue;
-			}
-		} else {
-			if (buffer_new(&bh))
-				unmap_underlying_metadata(&bh);
-			if (!buffer_mapped(&bh))
-				BUG();
-		}
-		blocks[i] = bh.b_blocknr;
-	}
-
-	retval = brw_kiovec(rw, 1, &iobuf, inode->i_dev, iobuf->blocks, blocksize);
-
- out:
-	return retval;
-}
-
 /*
 * IO completion routine for a buffer_head being used for kiobuf IO: we
 * can't dispatch the kiobuf callback until io_count reaches 0.  
@@ -2447,67 +2256,125 @@ int block_symlink(struct inode *inode, const char *symname, int len)
 	return err;
 }

+/*
+ * Create the page-cache page that contains the requested block
+ */
+static struct page * grow_dev_page(struct block_device *bdev, unsigned long index, int size)
+{
+	struct page * page;
+
+	page = find_or_create_page(bdev->bd_inode->i_mapping, index, GFP_NOFS);
+	if (IS_ERR(page))
+		return NULL;
+
+	if (!PageLocked(page))
+		BUG();
+
+	if (!page->buffers) {
+		struct buffer_head *bh, *tail;
+		struct buffer_head *head = create_buffers(page, size, 0);
+		if (!head)
+			goto failed;
+
+		bh = head;
+		do {
+			tail = bh;
+			bh = bh->b_this_page;
+		} while (bh);
+		tail->b_this_page = head;
+		page->buffers = head;
+		page_cache_get(page);
+		atomic_inc(&buffermem_pages);
+	}
+	return page;
+
+failed:
+	UnlockPage(page);
+	page_cache_release(page);
+	return NULL;
+}
+
+static void hash_page_buffers(struct page *page, kdev_t dev, int block, int size)
+{
+	struct buffer_head *head = page->buffers;
+	struct buffer_head *bh = head;
+	unsigned int uptodate;
+
+	uptodate = 1 << BH_Mapped;
+	if (Page_Uptodate(page))
+		uptodate |= 1 << BH_Uptodate;
+
+	spin_lock(&lru_list_lock);
+	write_lock(&hash_table_lock);
+	do {
+		if (!(bh->b_state & (1 << BH_Mapped))) {
+			init_buffer(bh, NULL, NULL);
+			bh->b_dev = dev;
+			bh->b_blocknr = block;
+			bh->b_state = uptodate;
+		}
+
+		/* Insert the buffer into the regular lists */
+		if (!bh->b_pprev) {
+			__insert_into_queues(bh);
+		}
+
+		block++;
+		bh = bh->b_this_page;
+	} while (bh != head);
+	write_unlock(&hash_table_lock);
+	spin_unlock(&lru_list_lock);
+}
+
 /*
 * Try to increase the number of buffers available: the size argument
 * is used to determine what kind of buffers we want.
 */
-static int grow_buffers(int size)
+static int grow_buffers(kdev_t dev, int block, int size)
 {
 	struct page * page;
-	struct buffer_head *bh, *tmp;
-	struct buffer_head * insert_point;
-	int isize;
+	struct block_device *bdev;
+	unsigned long index;
+	int sizebits;

 	if ((size & 511) || (size > PAGE_SIZE)) {
 		printk(KERN_ERR "VFS: grow_buffers: size = %d\n",size);
 		return 0;
 	}
+	sizebits = -1;
+	do {
+		sizebits++;
+	} while ((size << sizebits) < PAGE_SIZE);

-	page = alloc_page(GFP_NOFS);
-	if (!page)
-		goto out;
-	LockPage(page);
-	bh = create_buffers(page, size, 0);
-	if (!bh)
-		goto no_buffer_head;
-
-	isize = BUFSIZE_INDEX(size);
+	index = block >> sizebits;
+	block = index << sizebits;

-	spin_lock(&free_list[isize].lock);
-	insert_point = free_list[isize].list;
-	tmp = bh;
-	while (1) {
-		if (insert_point) {
-			tmp->b_next_free = insert_point->b_next_free;
-			tmp->b_prev_free = insert_point;
-			insert_point->b_next_free->b_prev_free = tmp;
-			insert_point->b_next_free = tmp;
-		} else {
-			tmp->b_prev_free = tmp;
-			tmp->b_next_free = tmp;
-		}
-		insert_point = tmp;
-		if (tmp->b_this_page)
-			tmp = tmp->b_this_page;
-		else
-			break;
+	bdev = bdget(kdev_t_to_nr(dev));
+	if (!bdev) {
+		printk("No block device for %s\n", kdevname(dev));
+		BUG();
 	}
-	tmp->b_this_page = bh;
-	free_list[isize].list = bh;
-	spin_unlock(&free_list[isize].lock);

-	page->buffers = bh;
-	page->flags &= ~(1 << PG_referenced);
-	lru_cache_add(page);
-	UnlockPage(page);
-	atomic_inc(&buffermem_pages);
-	return 1;
+	/* Create a page with the proper size buffers.. */
+	page = grow_dev_page(bdev, index, size);
+
+	/* This is "wrong" - talk to Al Viro */
+	atomic_dec(&bdev->bd_count);
+	if (!page)
+		return 0;

-no_buffer_head:
+	/* Hash in the buffers on the hash list */
+	hash_page_buffers(page, dev, block, size);
 	UnlockPage(page);
 	page_cache_release(page);
-out:
-	return 0;
+	return 1;
+}
+
+static void truncate_buffers(kdev_t dev)
+{
+	struct block_device *bdev = bdget(kdev_t_to_nr(dev));
+	truncate_inode_pages(bdev->bd_inode->i_mapping, 0);
+	atomic_dec(&bdev->bd_count);
 }

 static int sync_page_buffers(struct buffer_head *bh, unsigned int gfp_mask)

--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -586,10 +586,6 @@ static int ext2_bmap(struct address_space *mapping, long block)
 {
 	return generic_block_bmap(mapping,block,ext2_get_block);
 }
-static int ext2_direct_IO(int rw, struct inode * inode, struct kiobuf * iobuf, unsigned long blocknr, int blocksize)
-{
-	return generic_direct_IO(rw, inode, iobuf, blocknr, blocksize, ext2_get_block);
-}
 struct address_space_operations ext2_aops = {
 	readpage: ext2_readpage,
 	writepage: ext2_writepage,
@@ -597,7 +593,6 @@ struct address_space_operations ext2_aops = {
 	prepare_write: ext2_prepare_write,
 	commit_write: generic_commit_write,
 	bmap: ext2_bmap,
-	direct_IO: ext2_direct_IO,
 };

 /*

--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -203,15 +203,4 @@ static inline int get_hardsect_size(kdev_t dev)
 #define blk_finished_io(nsects)	do { } while (0)
 #define blk_started_io(nsects)	do { } while (0)

-static inline int buffered_blk_size(kdev_t dev)
-{
-	int ret = INT_MAX;
-	int major = MAJOR(dev);
-
-	if (blk_size[major])
-		ret = blk_size[major][MINOR(dev)] + ((BUFFERED_BLOCKSIZE-1) >> BLOCK_SIZE_BITS);
-
-	return ret;
-}
-
 #endif
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -46,10 +46,6 @@ struct poll_table_struct;
 #define BLOCK_SIZE_BITS 10
 #define BLOCK_SIZE (1<<BLOCK_SIZE_BITS)

-/* buffer header fixed size for the blkdev I/O through pagecache */
-#define BUFFERED_BLOCKSIZE_BITS 10
-#define BUFFERED_BLOCKSIZE (1 << BUFFERED_BLOCKSIZE_BITS)
-
 /* And dynamically-tunable limits and defaults: */
 struct files_stat_struct {
 	int nr_files;		/* read only */
@@ -1174,14 +1170,9 @@ extern int invalidate_device(kdev_t, int);
 extern void invalidate_inode_pages(struct inode *);
 extern void invalidate_inode_pages2(struct address_space *);
 extern void invalidate_inode_buffers(struct inode *);
-#define invalidate_buffers(dev)	__invalidate_buffers((dev), 0, 0)
-#define destroy_buffers(dev)	__invalidate_buffers((dev), 1, 0)
-#define update_buffers(dev)			\
-do {						\
-	__invalidate_buffers((dev), 0, 1);	\
-	__invalidate_buffers((dev), 0, 2);	\
-} while (0)
-extern void __invalidate_buffers(kdev_t dev, int, int);
+#define invalidate_buffers(dev)	__invalidate_buffers((dev), 0)
+#define destroy_buffers(dev)	__invalidate_buffers((dev), 1)
+extern void __invalidate_buffers(kdev_t dev, int);
 extern void sync_inodes(kdev_t);
 extern void sync_unlocked_inodes(void);
 extern void write_inode_now(struct inode *, int);
@@ -1367,7 +1358,6 @@ extern int block_sync_page(struct page *);
 int generic_block_bmap(struct address_space *, long, get_block_t *);
 int generic_commit_write(struct file *, struct page *, unsigned, unsigned);
 int block_truncate_page(struct address_space *, loff_t, get_block_t *);
-extern int generic_direct_IO(int, struct inode *, struct kiobuf *, unsigned long, int, get_block_t *);
 extern void create_empty_buffers(struct page *, kdev_t, unsigned long);

 extern int waitfor_one_page(struct page*);

--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -76,6 +76,9 @@ extern struct page * __find_get_page(struct address_space *mapping,
 	__find_get_page(mapping, index, page_hash(mapping, index))
 extern struct page * __find_lock_page (struct address_space * mapping,
 				unsigned long index, struct page **hash);
+extern struct page * find_or_create_page(struct address_space *mapping,
+				unsigned long index, unsigned int gfp_mask);
+
 extern void lock_page(struct page *page);
 #define find_lock_page(mapping, index) \
 	__find_lock_page(mapping, index, page_hash(mapping, index))

--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -131,6 +131,7 @@ extern struct page * read_swap_cache_async(swp_entry_t);
 extern void oom_kill(void);

 /* linux/mm/swapfile.c */
+extern int total_swap_pages;
 extern unsigned int nr_swapfiles;
 extern struct swap_info_struct swap_info[];
 extern int is_swap_partition(kdev_t);

--- a/kernel/ksyms.c
+++ b/kernel/ksyms.c
@@ -210,7 +210,6 @@ EXPORT_SYMBOL(waitfor_one_page);
 EXPORT_SYMBOL(generic_file_read);
 EXPORT_SYMBOL(do_generic_file_read);
 EXPORT_SYMBOL(generic_file_write);
-EXPORT_SYMBOL(generic_direct_IO);
 EXPORT_SYMBOL(generic_file_mmap);
 EXPORT_SYMBOL(generic_ro_fops);
 EXPORT_SYMBOL(generic_buffer_fdatasync);

--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -23,6 +23,7 @@
 #include <linux/init.h>
 #include <linux/mm.h>
 #include <linux/iobuf.h>
+#include <linux/compiler.h>

 #include <asm/pgalloc.h>
 #include <asm/uaccess.h>
@@ -56,6 +57,7 @@ spinlock_t pagemap_lru_lock ____cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
 #define CLUSTER_PAGES		(1 << page_cluster)
 #define CLUSTER_OFFSET(x)	(((x) >> page_cluster) << page_cluster)

+static void FASTCALL(add_page_to_hash_queue(struct page * page, struct page **p));
 static void add_page_to_hash_queue(struct page * page, struct page **p)
 {
 	struct page *next = *p;
@@ -792,11 +794,13 @@ struct page * __find_get_page(struct address_space *mapping,
 }

 /*
- * Same as the above, but lock the page too, verifying that
- * it's still valid once we own it.
+ * Must be called with the pagecache lock held,
+ * will return with it held (but it may be dropped
+ * during blocking operations).
 */
-struct page * __find_lock_page (struct address_space *mapping,
-				unsigned long offset, struct page **hash)
+static struct page * FASTCALL(__find_lock_page_helper(struct address_space *, unsigned long, struct page *));
+static struct page * __find_lock_page_helper(struct address_space *mapping,
+					unsigned long offset, struct page *hash)
 {
 	struct page *page;

@@ -805,27 +809,72 @@ struct page * __find_lock_page (struct address_space *mapping,
 	 * the hash-list needs a held write-lock.
 	 */
 repeat:
-	spin_lock(&pagecache_lock);
-	page = __find_page_nolock(mapping, offset, *hash);
+	page = __find_page_nolock(mapping, offset, hash);
 	if (page) {
 		page_cache_get(page);
-		spin_unlock(&pagecache_lock);
+		if (TryLockPage(page)) {
+			spin_unlock(&pagecache_lock);
+			lock_page(page);
+			spin_lock(&pagecache_lock);

-		lock_page(page);
+			/* Has the page been re-allocated while we slept? */
+			if (page->mapping != mapping || page->index != offset) {
+				UnlockPage(page);
+				page_cache_release(page);
+				goto repeat;
+			}
+		}
+	}
+	return page;
+}

-		/* Is the page still hashed? Ok, good.. */
-		if (page->mapping == mapping && page->index == offset)
-			return page;
+/*
+ * Same as __find_get_page(), but lock the page too, verifying that
+ * it's still valid once we own it.
+ */
+struct page * __find_lock_page (struct address_space *mapping,
+				unsigned long offset, struct page **hash)
+{
+	struct page *page;

-		/* Nope: we raced. Release and try again.. */
-		UnlockPage(page);
-		page_cache_release(page);
-		goto repeat;
-	}
+	spin_lock(&pagecache_lock);
+	page = __find_lock_page_helper(mapping, offset, *hash);
 	spin_unlock(&pagecache_lock);
-	return NULL;
+	return page;
 }

+/*
+ * Same as __find_lock_page(), but create the page if required..
+ */
+struct page * find_or_create_page(struct address_space *mapping, unsigned long index, unsigned int gfp_mask)
+{
+	struct page *page;
+	struct page **hash = page_hash(mapping, index);
+
+	spin_lock(&pagecache_lock);
+	page = __find_lock_page_helper(mapping, index, *hash);
+	spin_unlock(&pagecache_lock);
+	if (!page) {
+		struct page *newpage = alloc_page(gfp_mask);
+		page = ERR_PTR(-ENOMEM);
+		if (newpage) {
+			spin_lock(&pagecache_lock);
+			page = __find_lock_page_helper(mapping, index, *hash);
+			if (likely(!page)) {
+				page = newpage;
+				__add_to_page_cache(page, mapping, index, hash);
+				newpage = NULL;
+			}
+			spin_unlock(&pagecache_lock);
+			if (unlikely(newpage != NULL))
+				page_cache_release(newpage);
+		}
+	}
+	return page;	
+}
+
+
+
 #if 0
 #define PROFILE_READAHEAD
 #define DEBUG_READAHEAD
@@ -960,10 +1009,7 @@ static inline unsigned long calc_end_index(struct inode * inode)
 {
 	unsigned long end_index;

-	if (!S_ISBLK(inode->i_mode))
-		end_index = inode->i_size >> PAGE_CACHE_SHIFT;
-	else
-		end_index = buffered_blk_size(inode->i_rdev) >> (PAGE_CACHE_SHIFT - BLOCK_SIZE_BITS);
+	end_index = inode->i_size >> PAGE_CACHE_SHIFT;

 	return end_index;
 }
@@ -972,10 +1018,7 @@ static inline loff_t calc_rsize(struct inode * inode)
 {
 	loff_t rsize;

-	if (!S_ISBLK(inode->i_mode))
-		rsize = inode->i_size;
-	else
-		rsize = (loff_t) buffered_blk_size(inode->i_rdev) << BLOCK_SIZE_BITS;
+	rsize = inode->i_size;

 	return rsize;
 }
@@ -1316,92 +1359,6 @@ void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t *
 	UPDATE_ATIME(inode);
 }

-static ssize_t generic_file_direct_IO(int rw, struct file * filp, char * buf, size_t count, loff_t offset)
-{
-	ssize_t retval;
-	int new_iobuf, chunk_size, blocksize_mask, blocksize, blocksize_bits, iosize, progress;
-	struct kiobuf * iobuf;
-	struct inode * inode = filp->f_dentry->d_inode;
-	struct address_space * mapping = inode->i_mapping;
-
-	new_iobuf = 0;
-	iobuf = filp->f_iobuf;
-	if (test_and_set_bit(0, &filp->f_iobuf_lock)) {
-		/*
-		 * A parallel read/write is using the preallocated iobuf
-		 * so just run slow and allocate a new one.
-		 */
-		retval = alloc_kiovec(1, &iobuf);
-		if (retval)
-			goto out;
-		new_iobuf = 1;
-	}
-
-	if (!S_ISBLK(inode->i_mode)) {
-		blocksize = inode->i_sb->s_blocksize;
-		blocksize_bits = inode->i_sb->s_blocksize_bits;
-	} else {
-		blocksize = BUFFERED_BLOCKSIZE;
-		blocksize_bits = BUFFERED_BLOCKSIZE_BITS;
-	}
-	blocksize_mask = blocksize - 1;
-	chunk_size = KIO_MAX_ATOMIC_IO << 10;
-
-	retval = -EINVAL;
-	if ((offset & blocksize_mask) || (count & blocksize_mask))
-		goto out_free;
-	if (!mapping->a_ops->direct_IO)
-		goto out_free;
-
-	/*
-	 * Flush to disk exlusively the _data_, metadata must remains
-	 * completly asynchronous or performance will go to /dev/null.
-	 */
-	filemap_fdatasync(mapping);
-	retval = fsync_inode_data_buffers(inode);
-	filemap_fdatawait(mapping);
-	if (retval < 0)
-		goto out_free;
-
-	progress = retval = 0;
-	while (count > 0) {
-		iosize = count;
-		if (iosize > chunk_size)
-			iosize = chunk_size;
-
-		retval = map_user_kiobuf(rw, iobuf, (unsigned long) buf, iosize);
-		if (retval)
-			break;
-
-		retval = mapping->a_ops->direct_IO(rw, inode, iobuf, (offset+progress) >> blocksize_bits, blocksize);
-
-		if (rw == READ && retval > 0)
-			mark_dirty_kiobuf(iobuf, retval);
-		
-		if (retval >= 0) {
-			count -= retval;
-			buf += retval;
-			progress += retval;
-		}
-
-		unmap_kiobuf(iobuf);
-
-		if (retval != iosize)
-			break;
-	}
-
-	if (progress)
-		retval = progress;
-
- out_free:
-	if (!new_iobuf)
-		clear_bit(0, &filp->f_iobuf_lock);
-	else
-		free_kiovec(1, &iobuf);
- out:	
-	return retval;
-}
-
 int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
 {
 	char *kaddr;
@@ -1435,9 +1392,6 @@ ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *
 	if ((ssize_t) count < 0)
 		return -EINVAL;

-	if (filp->f_flags & O_DIRECT)
-		goto o_direct;
-
 	retval = -EFAULT;
 	if (access_ok(VERIFY_WRITE, buf, count)) {
 		retval = 0;
@@ -1456,28 +1410,7 @@ ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *
 				retval = desc.error;
 		}
 	}
- out:
 	return retval;
-
- o_direct:
-	{
-		loff_t pos = *ppos, size;
-		struct inode * inode = filp->f_dentry->d_inode;
-
-		retval = 0;
-		if (!count)
-			goto out; /* skip atime */
-		size = calc_rsize(inode);
-		if (pos < size) {
-			if (pos + count > size)
-				count = size - pos;
-			retval = generic_file_direct_IO(READ, filp, buf, count, pos);
-			if (retval > 0)
-				*ppos = pos + retval;
-		}
-		UPDATE_ATIME(filp->f_dentry->d_inode);
-		goto out;
-	}
 }

 static int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset , unsigned long size)
@@ -2778,9 +2711,6 @@ generic_file_write(struct file *file,const char *buf,size_t count, loff_t *ppos)
 	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
 	mark_inode_dirty_sync(inode);

-	if (file->f_flags & O_DIRECT)
-		goto o_direct;
-
 	do {
 		unsigned long index, offset;
 		long page_fault;
@@ -2855,7 +2785,6 @@ generic_file_write(struct file *file,const char *buf,size_t count, loff_t *ppos)
 	if ((status >= 0) && (file->f_flags & O_SYNC))
 		status = generic_osync_inode(inode, OSYNC_METADATA|OSYNC_DATA);
 	
-out_status:	
 	err = written ? written : status;
 out:

@@ -2864,25 +2793,6 @@ generic_file_write(struct file *file,const char *buf,size_t count, loff_t *ppos)
 fail_write:
 	status = -EFAULT;
 	goto unlock;
-
-o_direct:
-	written = generic_file_direct_IO(WRITE, file, (char *) buf, count, pos);
-	if (written > 0) {
-		loff_t end = pos + written;
-		if (end > inode->i_size && !S_ISBLK(inode->i_mode)) {
-			inode->i_size = end;
-			mark_inode_dirty(inode);
-		}
-		*ppos = end;
-		invalidate_inode_pages2(mapping);
-	}
-	/*
-	 * Sync the fs metadata but not the minor inode changes and
-	 * of course not the data as we did direct DMA for the IO.
-	 */
-	if (written >= 0 && file->f_flags & O_SYNC)
-		status = generic_osync_inode(inode, OSYNC_METADATA);
-	goto out_status;
 }

 void __init page_cache_init(unsigned long mempages)

--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1101,6 +1101,10 @@ void swapin_readahead(swp_entry_t entry)
 	return;
 }

+/* Swap 80% full? Release the pages as they are paged in.. */
+#define vm_swap_full() \
+	(swapper_space.nrpages*5 > total_swap_pages*4)
+
 /*
 * We hold the mm semaphore and the page_table_lock on entry and exit.
 */
@@ -1158,10 +1162,12 @@ static int do_swap_page(struct mm_struct * mm,
 	swap_free(entry);
 	mark_page_accessed(page);
 	if (exclusive_swap_page(page)) {
-		if (vma->vm_flags & VM_WRITE)
-			pte = pte_mkwrite(pte);
-		pte = pte_mkdirty(pte);
-		delete_from_swap_cache(page);
+		if (write_access || vm_swap_full()) {
+			pte = pte_mkdirty(pte);
+			if (vma->vm_flags & VM_WRITE)
+				pte = pte_mkwrite(pte);
+			delete_from_swap_cache(page);
+		}
 	}
 	UnlockPage(page);