[PATCH] no-buffer-head ext2 option

Implements a new set of block address_space_operations which will never attach buffer_heads to file pagecache. These can be turned on for ext2 with the `nobh' mount option. During write-intensive testing on a 7G machine, total buffer_head storage remained below 0.3 megabytes. And those buffer_heads are against ZONE_NORMAL pagecache and will be reclaimed by ZONE_NORMAL memory pressure. This work is, of course, a special case for the huge highmem machines. Possibly it obsoletes the buffer_heads_over_limit stuff (which doesn't work terribly well), but that code is simple, and will provide relief for other filesystems. It should be noted that the nobh_prepare_write() function and the PageMappedToDisk() infrastructure are what is needed to solve the problem of user data corruption when the filesystem which backs a sparse MAP_SHARED mapping runs out of space. We can use this code in filemap_nopage() to ensure that all mapped pages have space allocated on-disk, and deliver SIGBUS on ENOSPC. This will require a new address_space op, I expect.

[PATCH] no-buffer-head ext2 option
Implements a new set of block address_space_operations which will never attach buffer_heads to file pagecache. These can be turned on for ext2 with the `nobh' mount option. During write-intensive testing on a 7G machine, total buffer_head storage remained below 0.3 megabytes. And those buffer_heads are against ZONE_NORMAL pagecache and will be reclaimed by ZONE_NORMAL memory pressure. This work is, of course, a special case for the huge highmem machines. Possibly it obsoletes the buffer_heads_over_limit stuff (which doesn't work terribly well), but that code is simple, and will provide relief for other filesystems. It should be noted that the nobh_prepare_write() function and the PageMappedToDisk() infrastructure are what is needed to solve the problem of user data corruption when the filesystem which backs a sparse MAP_SHARED mapping runs out of space. We can use this code in filemap_nopage() to ensure that all mapped pages have space allocated on-disk, and deliver SIGBUS on ENOSPC. This will require a new address_space op, I expect.
b1ad1f4e · Andrew Morton · Linus Torvalds · 36fb7f84 · b1ad1f4e · b1ad1f4e
Commit b1ad1f4e authored Nov 21, 2002 by Andrew Morton Committed by Linus Torvalds Nov 21, 2002
11 changed files
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1964,6 +1964,7 @@ int block_read_full_page(struct page *page, get_block_t *get_block)
 	struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
 	unsigned int blocksize, blocks;
 	int nr, i;
+	int fully_mapped = 1;

 	if (!PageLocked(page))
 		PAGE_BUG(page);
@@ -1986,6 +1987,7 @@ int block_read_full_page(struct page *page, get_block_t *get_block)
 			continue;

 		if (!buffer_mapped(bh)) {
+			fully_mapped = 0;
 			if (iblock < lblock) {
 				if (get_block(inode, iblock, bh, 0))
 					SetPageError(page);
@@ -2008,6 +2010,9 @@ int block_read_full_page(struct page *page, get_block_t *get_block)
 		arr[nr++] = bh;
 	} while (i++, iblock++, (bh = bh->b_this_page) != head);

+	if (fully_mapped)
+		SetPageMappedToDisk(page);
+
 	if (!nr) {
 		/*
 		 * All buffers are uptodate - we can set the page uptodate
@@ -2204,6 +2209,198 @@ int generic_commit_write(struct file *file, struct page *page,
 	return 0;
 }

+/*
+ * On entry, the page is fully not uptodate.  Returns 0, or an error code.
+ * On exit the page is fully uptodate in the areas outside (from,to)
+ */
+int nobh_prepare_write(struct page *page, unsigned from, unsigned to,
+			get_block_t *get_block)
+{
+	struct inode *inode = page->mapping->host;
+	const unsigned blkbits = inode->i_blkbits;
+	const unsigned blocksize = 1 << blkbits;
+	struct buffer_head map_bh;	/* on-stack scratch bh for get_block */
+	struct buffer_head *read_bh[MAX_BUF_PER_PAGE];	/* bhs needing a read */
+	unsigned block_in_page;
+	unsigned block_start;
+	sector_t block_in_file;
+	char *kaddr;
+	int nr_reads = 0;	/* number of valid entries in read_bh[] */
+	int i;
+	int ret = 0;
+	int is_mapped_to_disk = 1;	/* cleared on the first unmapped block */
+	int dirtied_it = 0;	/* set when part of the page gets zeroed */
+
+	if (PageMappedToDisk(page))
+		return 0;	/* flag already set: skip all the work below */
+
+	block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
+	map_bh.b_page = page;
+
+	/*
+	 * We loop across all blocks in the page, whether or not they are
+	 * part of the affected region.  This is so we can discover if the
+	 * page is fully mapped-to-disk.
+	 */
+	for (block_start = 0, block_in_page = 0;
+		  block_start < PAGE_CACHE_SIZE;
+		  block_in_page++, block_start += blocksize) {
+		unsigned block_end = block_start + blocksize;
+		int create;
+
+		map_bh.b_state = 0;	/* start each lookup with clean state */
+		create = 1;
+		if (block_start >= to)
+			create = 0;	/* block lies wholly beyond `to' */
+		ret = get_block(inode, block_in_file + block_in_page,
+					&map_bh, create);
+		if (ret)
+			goto failed;
+		if (!buffer_mapped(&map_bh))
+			is_mapped_to_disk = 0;
+		if (buffer_new(&map_bh))
+			unmap_underlying_metadata(map_bh.b_bdev,
+							map_bh.b_blocknr);
+		if (PageUptodate(page))
+			continue;	/* nothing to zero or read */
+		if (buffer_new(&map_bh) || !buffer_mapped(&map_bh)) {
+			/* new block or hole: zero the parts outside (from,to) */
+			kaddr = kmap_atomic(page, KM_USER0);
+			if (block_start < from) {
+				memset(kaddr+block_start, 0, from-block_start);
+				dirtied_it = 1;
+			}
+			if (block_end > to) {
+				memset(kaddr + to, 0, block_end - to);
+				dirtied_it = 1;
+			}
+			flush_dcache_page(page);
+			kunmap_atomic(kaddr, KM_USER0);
+			continue;
+		}
+		if (buffer_uptodate(&map_bh))
+			continue;	/* reiserfs does this */
+		if (block_start < from || block_end > to) {
+			/* block not wholly inside (from,to): queue a read */
+			struct buffer_head *bh = alloc_buffer_head();
+
+			if (!bh) {
+				ret = -ENOMEM;
+				goto failed;
+			}
+			bh->b_state = map_bh.b_state;
+			atomic_set(&bh->b_count, 0);
+			bh->b_this_page = 0;
+			bh->b_page = page;
+			bh->b_blocknr = map_bh.b_blocknr;
+			bh->b_size = blocksize;
+			bh->b_data = (char *)block_start;	/* page offset */
+			bh->b_bdev = map_bh.b_bdev;
+			bh->b_private = NULL;
+			read_bh[nr_reads++] = bh;
+		}
+	}
+
+	if (nr_reads) {
+		ll_rw_block(READ, nr_reads, read_bh);
+		for (i = 0; i < nr_reads; i++) {
+			wait_on_buffer(read_bh[i]);
+			if (!buffer_uptodate(read_bh[i]))
+				ret = -EIO;	/* keep going: free every bh */
+			free_buffer_head(read_bh[i]);
+			read_bh[i] = NULL;	/* `failed' must not free it again */
+		}
+		if (ret)
+			goto failed;
+	}
+
+	if (is_mapped_to_disk)
+		SetPageMappedToDisk(page);
+	SetPageUptodate(page);
+
+	/*
+	 * Setting the page dirty here isn't necessary for the prepare_write
+	 * function - commit_write will do that.  But if/when this function is
+	 * used within the pagefault handler to ensure that all mmapped pages
+	 * have backing space in the filesystem, we will need to dirty the page
+	 * if its contents were altered.
+	 */
+	if (dirtied_it)
+		set_page_dirty(page);
+
+	return 0;
+
+failed:
+	for (i = 0; i < nr_reads; i++) {
+		if (read_bh[i])	/* NULL if already freed after the read */
+			free_buffer_head(read_bh[i]);
+	}
+
+	/*
+	 * Error recovery is pretty slack.  Clear the page and mark it dirty
+	 * so we'll later zero out any blocks which _were_ allocated.
+	 */
+	kaddr = kmap_atomic(page, KM_USER0);
+	memset(kaddr, 0, PAGE_CACHE_SIZE);
+	kunmap_atomic(kaddr, KM_USER0);
+	SetPageUptodate(page);
+	set_page_dirty(page);
+	return ret;
+}
+EXPORT_SYMBOL(nobh_prepare_write);
+
+int nobh_commit_write(struct file *file, struct page *page,
+		unsigned from, unsigned to)	/* file and from are unused */
+{
+	struct inode *inode = page->mapping->host;
+	loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;	/* file offset of end of write */
+
+	set_page_dirty(page);	/* the page is dirtied as a whole */
+	if (pos > inode->i_size) {	/* the write extended the file */
+		inode->i_size = pos;
+		mark_inode_dirty(inode);
+	}
+	return 0;	/* always succeeds */
+}
+EXPORT_SYMBOL(nobh_commit_write);
+
+/*
+ * Zero the page from `from' on.  Assumes ->prepare_write() uses nobh_prepare_write().
+ */
+int nobh_truncate_page(struct address_space *mapping, loff_t from)
+{
+	struct inode *inode = mapping->host;
+	unsigned blocksize = 1 << inode->i_blkbits;
+	pgoff_t index = from >> PAGE_CACHE_SHIFT;	/* page containing `from' */
+	unsigned offset = from & (PAGE_CACHE_SIZE-1);	/* offset of `from' in that page */
+	unsigned to;
+	struct page *page;
+	struct address_space_operations *a_ops = mapping->a_ops;
+	char *kaddr;
+	int ret = 0;
+
+	if ((offset & (blocksize - 1)) == 0)
+		goto out;	/* `from' is block-aligned: nothing to zero, return 0 */
+
+	ret = -ENOMEM;	/* returned if grab_cache_page() fails */
+	page = grab_cache_page(mapping, index);
+	if (!page)
+		goto out;
+
+	to = (offset + blocksize) & ~(blocksize - 1);	/* end of the block containing `from' */
+	ret = a_ops->prepare_write(NULL, page, offset, to);	/* no struct file is passed */
+	if (ret == 0) {
+		kaddr = kmap_atomic(page, KM_USER0);
+		memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);	/* zero to end of page */
+		flush_dcache_page(page);
+		kunmap_atomic(kaddr, KM_USER0);
+		set_page_dirty(page);
+	}
+	unlock_page(page);
+	page_cache_release(page);
+out:
+	return ret;
+}
+EXPORT_SYMBOL(nobh_truncate_page);
+
 int block_truncate_page(struct address_space *mapping,
 			loff_t from, get_block_t *get_block)
 {

--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -120,6 +120,7 @@ extern struct file_operations ext2_file_operations;

 /* inode.c */
 extern struct address_space_operations ext2_aops;
+extern struct address_space_operations ext2_nobh_aops;

 /* namei.c */
 extern struct inode_operations ext2_dir_inode_operations;

--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -612,6 +612,13 @@ ext2_prepare_write(struct file *file, struct page *page,
 	return block_prepare_write(page,from,to,ext2_get_block);
 }

+static int
+ext2_nobh_prepare_write(struct file *file, struct page *page,
+			unsigned from, unsigned to)
+{
+	return nobh_prepare_write(page,from,to,ext2_get_block);	/* `file' is not passed on */
+}
+
 static sector_t ext2_bmap(struct address_space *mapping, sector_t block)
 {
 	return generic_block_bmap(mapping,block,ext2_get_block);
@@ -657,6 +664,18 @@ struct address_space_operations ext2_aops = {
 	.writepages		= ext2_writepages,
 };

+struct address_space_operations ext2_nobh_aops = {	/* installed when test_opt(sb, NOBH) is set */
+	.readpage		= ext2_readpage,
+	.readpages		= ext2_readpages,
+	.writepage		= ext2_writepage,
+	.sync_page		= block_sync_page,
+	.prepare_write		= ext2_nobh_prepare_write,	/* wraps nobh_prepare_write() */
+	.commit_write		= nobh_commit_write,	/* generic nobh helper from fs/buffer.c */
+	.bmap			= ext2_bmap,
+	.direct_IO		= ext2_direct_IO,
+	.writepages		= ext2_writepages,
+};
+
 /*
 * Probably it should be a library function... search for first non-zero word
 * or memcmp with zero_page, whatever is better for particular architecture.
@@ -864,7 +883,11 @@ void ext2_truncate (struct inode * inode)
 	iblock = (inode->i_size + blocksize-1)
 					>> EXT2_BLOCK_SIZE_BITS(inode->i_sb);

-	block_truncate_page(inode->i_mapping, inode->i_size, ext2_get_block);
+	if (test_opt(inode->i_sb, NOBH))
+		nobh_truncate_page(inode->i_mapping, inode->i_size);
+	else
+		block_truncate_page(inode->i_mapping,
+				inode->i_size, ext2_get_block);

 	n = ext2_block_to_path(inode, iblock, offsets, NULL);
 	if (n == 0)
@@ -1044,17 +1067,26 @@ void ext2_read_inode (struct inode * inode)
 	if (S_ISREG(inode->i_mode)) {
 		inode->i_op = &ext2_file_inode_operations;
 		inode->i_fop = &ext2_file_operations;
-		inode->i_mapping->a_ops = &ext2_aops;
+		if (test_opt(inode->i_sb, NOBH))
+			inode->i_mapping->a_ops = &ext2_nobh_aops;
+		else
+			inode->i_mapping->a_ops = &ext2_aops;
 	} else if (S_ISDIR(inode->i_mode)) {
 		inode->i_op = &ext2_dir_inode_operations;
 		inode->i_fop = &ext2_dir_operations;
-		inode->i_mapping->a_ops = &ext2_aops;
+		if (test_opt(inode->i_sb, NOBH))
+			inode->i_mapping->a_ops = &ext2_nobh_aops;
+		else
+			inode->i_mapping->a_ops = &ext2_aops;
 	} else if (S_ISLNK(inode->i_mode)) {
 		if (ext2_inode_is_fast_symlink(inode))
 			inode->i_op = &ext2_fast_symlink_inode_operations;
 		else {
 			inode->i_op = &ext2_symlink_inode_operations;
-			inode->i_mapping->a_ops = &ext2_aops;
+			if (test_opt(inode->i_sb, NOBH))
+				inode->i_mapping->a_ops = &ext2_nobh_aops;
+			else
+				inode->i_mapping->a_ops = &ext2_aops;
 		}
 	} else {
 		inode->i_op = &ext2_special_inode_operations;

--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -127,7 +127,10 @@ static int ext2_create (struct inode * dir, struct dentry * dentry, int mode)
 	if (!IS_ERR(inode)) {
 		inode->i_op = &ext2_file_inode_operations;
 		inode->i_fop = &ext2_file_operations;
-		inode->i_mapping->a_ops = &ext2_aops;
+		if (test_opt(inode->i_sb, NOBH))
+			inode->i_mapping->a_ops = &ext2_nobh_aops;
+		else
+			inode->i_mapping->a_ops = &ext2_aops;
 		mark_inode_dirty(inode);
 		err = ext2_add_nondir(dentry, inode);
 	}
@@ -168,7 +171,10 @@ static int ext2_symlink (struct inode * dir, struct dentry * dentry,
 	if (l > sizeof (EXT2_I(inode)->i_data)) {
 		/* slow symlink */
 		inode->i_op = &ext2_symlink_inode_operations;
-		inode->i_mapping->a_ops = &ext2_aops;
+		if (test_opt(inode->i_sb, NOBH))
+			inode->i_mapping->a_ops = &ext2_nobh_aops;
+		else
+			inode->i_mapping->a_ops = &ext2_aops;
 		err = page_symlink(inode, symname, l);
 		if (err)
 			goto out_fail;
@@ -222,7 +228,10 @@ static int ext2_mkdir(struct inode * dir, struct dentry * dentry, int mode)

 	inode->i_op = &ext2_dir_inode_operations;
 	inode->i_fop = &ext2_dir_operations;
-	inode->i_mapping->a_ops = &ext2_aops;
+	if (test_opt(inode->i_sb, NOBH))
+		inode->i_mapping->a_ops = &ext2_nobh_aops;
+	else
+		inode->i_mapping->a_ops = &ext2_aops;

 	ext2_inc_count(inode);


--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -391,6 +391,8 @@ static int parse_options (char * options,
 			set_opt (sbi->s_mount_opt, OLDALLOC);
 		else if (!strcmp (this_char, "orlov"))
 			clear_opt (sbi->s_mount_opt, OLDALLOC);
+		else if (!strcmp (this_char, "nobh"))
+			set_opt(sbi->s_mount_opt, NOBH);
 		/* Silently ignore the quota options */
 		else if (!strcmp (this_char, "grpquota")
 		         || !strcmp (this_char, "noquota")

--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -178,6 +178,7 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
 	struct block_device *bdev = NULL;
 	struct buffer_head bh;
 	int length;
+	int fully_mapped = 1;

 	if (page_has_buffers(page))
 		goto confused;
@@ -194,6 +195,7 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
 		}

 		if (!buffer_mapped(&bh)) {
+			fully_mapped = 0;
 			if (first_hole == blocks_per_page)
 				first_hole = page_block;
 			continue;
@@ -220,6 +222,8 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
 			unlock_page(page);
 			goto out;
 		}
+	} else if (fully_mapped) {
+		SetPageMappedToDisk(page);
 	}

 	/*

--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -190,6 +190,9 @@ sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *);
 int generic_commit_write(struct file *, struct page *, unsigned, unsigned);
 int block_truncate_page(struct address_space *, loff_t, get_block_t *);
 int file_fsync(struct file *, struct dentry *, int);
+int nobh_prepare_write(struct page*, unsigned, unsigned, get_block_t*);
+int nobh_commit_write(struct file *, struct page *, unsigned, unsigned);
+int nobh_truncate_page(struct address_space *, loff_t);

 #define OSYNC_METADATA	(1<<0)
 #define OSYNC_DATA	(1<<1)

--- a/include/linux/ext2_fs.h
+++ b/include/linux/ext2_fs.h
@@ -308,6 +308,7 @@ struct ext2_inode {
 #define EXT2_MOUNT_ERRORS_RO		0x0020	/* Remount fs ro on errors */
 #define EXT2_MOUNT_ERRORS_PANIC		0x0040	/* Panic on errors */
 #define EXT2_MOUNT_MINIX_DF		0x0080	/* Mimics the Minix statfs */
+#define EXT2_MOUNT_NOBH			0x0100	/* No buffer_heads */
 #define EXT2_MOUNT_NO_UID32		0x0200  /* Disable 32-bit UIDs */
 #define EXT2_MOUNT_XATTR_USER		0x4000	/* Extended user attributes */
 #define EXT2_MOUNT_POSIX_ACL		0x8000	/* POSIX Access Control Lists */

--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -70,6 +70,7 @@
 #define PG_chainlock		15	/* lock bit for ->pte_chain */

 #define PG_direct		16	/* ->pte_chain points directly at pte */
+#define PG_mappedtodisk		17	/* Has blocks allocated on-disk */

 /*
 * Global page accounting.  One instance per CPU.  Only unsigned longs are
@@ -233,6 +234,10 @@ extern void get_full_page_state(struct page_state *ret);
 #define ClearPageDirect(page)		clear_bit(PG_direct, &(page)->flags)
 #define TestClearPageDirect(page)	test_and_clear_bit(PG_direct, &(page)->flags)

+#define PageMappedToDisk(page)	test_bit(PG_mappedtodisk, &(page)->flags)
+#define SetPageMappedToDisk(page) set_bit(PG_mappedtodisk, &(page)->flags)
+#define ClearPageMappedToDisk(page) clear_bit(PG_mappedtodisk, &(page)->flags)
+
 /*
 * The PageSwapCache predicate doesn't use a PG_flag at this time,
 * but it may again do so one day.

--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -265,7 +265,7 @@ static void prep_new_page(struct page *page, int order)

 	page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
 			1 << PG_referenced | 1 << PG_arch_1 |
-			1 << PG_checked);
+			1 << PG_checked | 1 << PG_mappedtodisk);
 	set_page_refs(page, order);
 }


--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -52,6 +52,7 @@ truncate_complete_page(struct address_space *mapping, struct page *page)

 	clear_page_dirty(page);
 	ClearPageUptodate(page);
+	ClearPageMappedToDisk(page);
 	remove_from_page_cache(page);
 	page_cache_release(page);	/* pagecache ref */
 }