Commit b1ad1f4e authored by Andrew Morton, committed by Linus Torvalds

[PATCH] no-buffer-head ext2 option

Implements a new set of block address_space_operations which will never
attach buffer_heads to file pagecache.  These can be turned on for ext2
with the `nobh' mount option.

During write-intensive testing on a 7G machine, total buffer_head
storage remained below 0.3 megabytes.  And those buffer_heads are
against ZONE_NORMAL pagecache and will be reclaimed by ZONE_NORMAL
memory pressure.

This work is, of course, aimed specifically at the huge highmem machines.
Possibly it obsoletes the buffer_heads_over_limit stuff (which doesn't
work terribly well), but that code is simple, and will provide relief
for other filesystems.


It should be noted that the nobh_prepare_write() function and the
PageMappedToDisk() infrastructure are what is needed to solve the
problem of user data corruption when the filesystem which backs a
sparse MAP_SHARED mapping runs out of space.  We can use this code in
filemap_nopage() to ensure that all mapped pages have space allocated
on-disk.  Deliver SIGBUS on ENOSPC.

This will require a new address_space op, I expect.
parent 36fb7f84
......@@ -1964,6 +1964,7 @@ int block_read_full_page(struct page *page, get_block_t *get_block)
struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
unsigned int blocksize, blocks;
int nr, i;
int fully_mapped = 1;
if (!PageLocked(page))
PAGE_BUG(page);
......@@ -1986,6 +1987,7 @@ int block_read_full_page(struct page *page, get_block_t *get_block)
continue;
if (!buffer_mapped(bh)) {
fully_mapped = 0;
if (iblock < lblock) {
if (get_block(inode, iblock, bh, 0))
SetPageError(page);
......@@ -2008,6 +2010,9 @@ int block_read_full_page(struct page *page, get_block_t *get_block)
arr[nr++] = bh;
} while (i++, iblock++, (bh = bh->b_this_page) != head);
if (fully_mapped)
SetPageMappedToDisk(page);
if (!nr) {
/*
* All buffers are uptodate - we can set the page uptodate
......@@ -2204,6 +2209,198 @@ int generic_commit_write(struct file *file, struct page *page,
return 0;
}
/*
 * nobh_prepare_write - prepare a pagecache page for a write into the byte
 * range [from, to) without attaching buffer_heads to the page.
 *
 * @page:      the page being written; its mapping's host inode is used
 * @from:      byte offset within the page where the write starts
 * @to:        byte offset within the page where the write ends
 * @get_block: filesystem callback used to map (and, before @to, allocate)
 *             each block of the page
 *
 * On entry, the page is fully not uptodate.
 * On exit the page is fully uptodate in the areas outside (from,to)
 *
 * Returns 0 on success.  On failure returns the error from @get_block,
 * -ENOMEM if a temporary buffer_head could not be allocated, or -EIO if a
 * block read did not complete uptodate; in all failure cases the whole page
 * is zeroed, marked uptodate and dirtied before returning.
 */
int nobh_prepare_write(struct page *page, unsigned from, unsigned to,
			get_block_t *get_block)
{
	struct inode *inode = page->mapping->host;
	const unsigned blkbits = inode->i_blkbits;
	const unsigned blocksize = 1 << blkbits;
	struct buffer_head map_bh;
	struct buffer_head *read_bh[MAX_BUF_PER_PAGE];
	unsigned block_in_page;
	unsigned block_start;
	sector_t block_in_file;
	char *kaddr;
	int nr_reads = 0;
	int i;
	int ret = 0;
	int is_mapped_to_disk = 1;
	int dirtied_it = 0;

	/* Every block already has disk space: nothing to map or read. */
	if (PageMappedToDisk(page))
		return 0;

	block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
	map_bh.b_page = page;

	/*
	 * We loop across all blocks in the page, whether or not they are
	 * part of the affected region.  This is so we can discover if the
	 * page is fully mapped-to-disk.
	 */
	for (block_start = 0, block_in_page = 0;
	     block_start < PAGE_CACHE_SIZE;
	     block_in_page++, block_start += blocksize) {
		unsigned block_end = block_start + blocksize;
		int create;

		map_bh.b_state = 0;

		/* Blocks starting at or beyond @to are looked up, not allocated. */
		create = 1;
		if (block_start >= to)
			create = 0;
		ret = get_block(inode, block_in_file + block_in_page,
					&map_bh, create);
		if (ret)
			goto failed;
		if (!buffer_mapped(&map_bh))
			is_mapped_to_disk = 0;
		if (buffer_new(&map_bh))
			unmap_underlying_metadata(map_bh.b_bdev,
							map_bh.b_blocknr);
		if (PageUptodate(page))
			continue;

		/*
		 * Freshly allocated blocks and holes have no on-disk contents
		 * to read: zero the parts lying outside the write range.
		 */
		if (buffer_new(&map_bh) || !buffer_mapped(&map_bh)) {
			kaddr = kmap_atomic(page, KM_USER0);
			if (block_start < from) {
				memset(kaddr+block_start, 0, from-block_start);
				dirtied_it = 1;
			}
			if (block_end > to) {
				memset(kaddr + to, 0, block_end - to);
				dirtied_it = 1;
			}
			flush_dcache_page(page);
			kunmap_atomic(kaddr, KM_USER0);
			continue;
		}
		if (buffer_uptodate(&map_bh))
			continue;	/* reiserfs does this */

		/*
		 * A mapped block that the write only partially covers (or does
		 * not cover at all) must be read in.  Queue a temporary
		 * buffer_head for it; it is freed once the read completes.
		 */
		if (block_start < from || block_end > to) {
			struct buffer_head *bh = alloc_buffer_head();

			if (!bh) {
				ret = -ENOMEM;
				goto failed;
			}
			bh->b_state = map_bh.b_state;
			atomic_set(&bh->b_count, 0);
			bh->b_this_page = NULL;
			bh->b_page = page;
			bh->b_blocknr = map_bh.b_blocknr;
			bh->b_size = blocksize;
			/*
			 * b_data carries the block's byte offset within the
			 * page, not a mapped address.  Widen through unsigned
			 * long so the integer-to-pointer cast is size-correct
			 * on 64-bit builds.
			 * NOTE(review): presumably the same offset convention
			 * used for highmem pages elsewhere - confirm.
			 */
			bh->b_data = (char *)(unsigned long)block_start;
			bh->b_bdev = map_bh.b_bdev;
			bh->b_private = NULL;
			read_bh[nr_reads++] = bh;
		}
	}

	if (nr_reads) {
		ll_rw_block(READ, nr_reads, read_bh);
		for (i = 0; i < nr_reads; i++) {
			wait_on_buffer(read_bh[i]);
			if (!buffer_uptodate(read_bh[i]))
				ret = -EIO;
			free_buffer_head(read_bh[i]);
			/* Cleared so the failure path does not free it twice. */
			read_bh[i] = NULL;
		}
		if (ret)
			goto failed;
	}

	if (is_mapped_to_disk)
		SetPageMappedToDisk(page);
	SetPageUptodate(page);

	/*
	 * Setting the page dirty here isn't necessary for the prepare_write
	 * function - commit_write will do that.  But if/when this function is
	 * used within the pagefault handler to ensure that all mmapped pages
	 * have backing space in the filesystem, we will need to dirty the page
	 * if its contents were altered.
	 */
	if (dirtied_it)
		set_page_dirty(page);

	return 0;

failed:
	/* Release any read buffer_heads that were queued but not yet freed. */
	for (i = 0; i < nr_reads; i++) {
		if (read_bh[i])
			free_buffer_head(read_bh[i]);
	}

	/*
	 * Error recovery is pretty slack.  Clear the page and mark it dirty
	 * so we'll later zero out any blocks which _were_ allocated.
	 */
	kaddr = kmap_atomic(page, KM_USER0);
	memset(kaddr, 0, PAGE_CACHE_SIZE);
	/* Flush after writing through the kmap, as the zeroing paths above do. */
	flush_dcache_page(page);
	kunmap_atomic(kaddr, KM_USER0);
	SetPageUptodate(page);
	set_page_dirty(page);
	return ret;
}
EXPORT_SYMBOL(nobh_prepare_write);
int nobh_commit_write(struct file *file, struct page *page,
unsigned from, unsigned to)
{
struct inode *inode = page->mapping->host;
loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
set_page_dirty(page);
if (pos > inode->i_size) {
inode->i_size = pos;
mark_inode_dirty(inode);
}
return 0;
}
EXPORT_SYMBOL(nobh_commit_write);
/*
 * nobh_truncate_page - zero the tail of the page containing file offset
 * @from, for mappings that do not use buffer_heads.
 *
 * Nothing is done (and 0 is returned) when @from is block-aligned.
 * Otherwise the page is grabbed, ->prepare_write() is called for the range
 * from @from's offset to the end of its block, and on success everything
 * from that offset to the end of the page is zeroed and the page dirtied.
 *
 * Returns 0 on success, -ENOMEM if the page could not be obtained, or the
 * error returned by ->prepare_write().
 *
 * This function assumes that ->prepare_write() uses nobh_prepare_write().
 */
int nobh_truncate_page(struct address_space *mapping, loff_t from)
{
	struct inode *host = mapping->host;
	struct address_space_operations *aops = mapping->a_ops;
	const unsigned blocksize = 1 << host->i_blkbits;
	const pgoff_t pgidx = from >> PAGE_CACHE_SHIFT;
	const unsigned pgoff = from & (PAGE_CACHE_SIZE-1);
	struct page *page;
	unsigned block_end;
	char *vaddr;
	int err;

	/* A block-aligned truncation point leaves no partial block to clear. */
	if (!(pgoff & (blocksize - 1)))
		return 0;

	page = grab_cache_page(mapping, pgidx);
	if (!page)
		return -ENOMEM;

	/* End of the block that contains pgoff. */
	block_end = (pgoff + blocksize) & ~(blocksize - 1);

	err = aops->prepare_write(NULL, page, pgoff, block_end);
	if (!err) {
		vaddr = kmap_atomic(page, KM_USER0);
		memset(vaddr + pgoff, 0, PAGE_CACHE_SIZE - pgoff);
		flush_dcache_page(page);
		kunmap_atomic(vaddr, KM_USER0);
		set_page_dirty(page);
	}

	unlock_page(page);
	page_cache_release(page);
	return err;
}
EXPORT_SYMBOL(nobh_truncate_page);
int block_truncate_page(struct address_space *mapping,
loff_t from, get_block_t *get_block)
{
......
......@@ -120,6 +120,7 @@ extern struct file_operations ext2_file_operations;
/* inode.c */
extern struct address_space_operations ext2_aops;
extern struct address_space_operations ext2_nobh_aops;
/* namei.c */
extern struct inode_operations ext2_dir_inode_operations;
......
......@@ -612,6 +612,13 @@ ext2_prepare_write(struct file *file, struct page *page,
return block_prepare_write(page,from,to,ext2_get_block);
}
/*
 * ->prepare_write() entry for ext2_nobh_aops: hands the page and byte range
 * to nobh_prepare_write() with ext2_get_block as the block mapper.
 * @file is not used.
 */
static int ext2_nobh_prepare_write(struct file *file, struct page *page,
				   unsigned from, unsigned to)
{
	return nobh_prepare_write(page, from, to, ext2_get_block);
}
static sector_t ext2_bmap(struct address_space *mapping, sector_t block)
{
return generic_block_bmap(mapping,block,ext2_get_block);
......@@ -657,6 +664,18 @@ struct address_space_operations ext2_aops = {
.writepages = ext2_writepages,
};
/*
 * Address space operations installed on ext2 inodes when the filesystem is
 * mounted with the `nobh' option.  The buffered-write hooks are the nobh
 * variants, which do not attach buffer_heads to the page.
 */
struct address_space_operations ext2_nobh_aops = {
	/* buffered write path: the buffer_head-free variants */
	.prepare_write	= ext2_nobh_prepare_write,
	.commit_write	= nobh_commit_write,

	/* read path */
	.readpage	= ext2_readpage,
	.readpages	= ext2_readpages,

	/* writeback */
	.writepage	= ext2_writepage,
	.writepages	= ext2_writepages,

	/* everything else */
	.sync_page	= block_sync_page,
	.bmap		= ext2_bmap,
	.direct_IO	= ext2_direct_IO,
};
/*
* Probably it should be a library function... search for first non-zero word
* or memcmp with zero_page, whatever is better for particular architecture.
......@@ -864,7 +883,11 @@ void ext2_truncate (struct inode * inode)
iblock = (inode->i_size + blocksize-1)
>> EXT2_BLOCK_SIZE_BITS(inode->i_sb);
block_truncate_page(inode->i_mapping, inode->i_size, ext2_get_block);
if (test_opt(inode->i_sb, NOBH))
nobh_truncate_page(inode->i_mapping, inode->i_size);
else
block_truncate_page(inode->i_mapping,
inode->i_size, ext2_get_block);
n = ext2_block_to_path(inode, iblock, offsets, NULL);
if (n == 0)
......@@ -1044,17 +1067,26 @@ void ext2_read_inode (struct inode * inode)
if (S_ISREG(inode->i_mode)) {
inode->i_op = &ext2_file_inode_operations;
inode->i_fop = &ext2_file_operations;
inode->i_mapping->a_ops = &ext2_aops;
if (test_opt(inode->i_sb, NOBH))
inode->i_mapping->a_ops = &ext2_nobh_aops;
else
inode->i_mapping->a_ops = &ext2_aops;
} else if (S_ISDIR(inode->i_mode)) {
inode->i_op = &ext2_dir_inode_operations;
inode->i_fop = &ext2_dir_operations;
inode->i_mapping->a_ops = &ext2_aops;
if (test_opt(inode->i_sb, NOBH))
inode->i_mapping->a_ops = &ext2_nobh_aops;
else
inode->i_mapping->a_ops = &ext2_aops;
} else if (S_ISLNK(inode->i_mode)) {
if (ext2_inode_is_fast_symlink(inode))
inode->i_op = &ext2_fast_symlink_inode_operations;
else {
inode->i_op = &ext2_symlink_inode_operations;
inode->i_mapping->a_ops = &ext2_aops;
if (test_opt(inode->i_sb, NOBH))
inode->i_mapping->a_ops = &ext2_nobh_aops;
else
inode->i_mapping->a_ops = &ext2_aops;
}
} else {
inode->i_op = &ext2_special_inode_operations;
......
......@@ -127,7 +127,10 @@ static int ext2_create (struct inode * dir, struct dentry * dentry, int mode)
if (!IS_ERR(inode)) {
inode->i_op = &ext2_file_inode_operations;
inode->i_fop = &ext2_file_operations;
inode->i_mapping->a_ops = &ext2_aops;
if (test_opt(inode->i_sb, NOBH))
inode->i_mapping->a_ops = &ext2_nobh_aops;
else
inode->i_mapping->a_ops = &ext2_aops;
mark_inode_dirty(inode);
err = ext2_add_nondir(dentry, inode);
}
......@@ -168,7 +171,10 @@ static int ext2_symlink (struct inode * dir, struct dentry * dentry,
if (l > sizeof (EXT2_I(inode)->i_data)) {
/* slow symlink */
inode->i_op = &ext2_symlink_inode_operations;
inode->i_mapping->a_ops = &ext2_aops;
if (test_opt(inode->i_sb, NOBH))
inode->i_mapping->a_ops = &ext2_nobh_aops;
else
inode->i_mapping->a_ops = &ext2_aops;
err = page_symlink(inode, symname, l);
if (err)
goto out_fail;
......@@ -222,7 +228,10 @@ static int ext2_mkdir(struct inode * dir, struct dentry * dentry, int mode)
inode->i_op = &ext2_dir_inode_operations;
inode->i_fop = &ext2_dir_operations;
inode->i_mapping->a_ops = &ext2_aops;
if (test_opt(inode->i_sb, NOBH))
inode->i_mapping->a_ops = &ext2_nobh_aops;
else
inode->i_mapping->a_ops = &ext2_aops;
ext2_inc_count(inode);
......
......@@ -391,6 +391,8 @@ static int parse_options (char * options,
set_opt (sbi->s_mount_opt, OLDALLOC);
else if (!strcmp (this_char, "orlov"))
clear_opt (sbi->s_mount_opt, OLDALLOC);
else if (!strcmp (this_char, "nobh"))
set_opt(sbi->s_mount_opt, NOBH);
/* Silently ignore the quota options */
else if (!strcmp (this_char, "grpquota")
|| !strcmp (this_char, "noquota")
......
......@@ -178,6 +178,7 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
struct block_device *bdev = NULL;
struct buffer_head bh;
int length;
int fully_mapped = 1;
if (page_has_buffers(page))
goto confused;
......@@ -194,6 +195,7 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
}
if (!buffer_mapped(&bh)) {
fully_mapped = 0;
if (first_hole == blocks_per_page)
first_hole = page_block;
continue;
......@@ -220,6 +222,8 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
unlock_page(page);
goto out;
}
} else if (fully_mapped) {
SetPageMappedToDisk(page);
}
/*
......
......@@ -190,6 +190,9 @@ sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *);
int generic_commit_write(struct file *, struct page *, unsigned, unsigned);
int block_truncate_page(struct address_space *, loff_t, get_block_t *);
int file_fsync(struct file *, struct dentry *, int);
/* Block address_space helpers that never attach buffer_heads to the page */
int nobh_prepare_write(struct page *, unsigned, unsigned, get_block_t *);
int nobh_commit_write(struct file *, struct page *, unsigned, unsigned);
int nobh_truncate_page(struct address_space *, loff_t);
#define OSYNC_METADATA (1<<0)
#define OSYNC_DATA (1<<1)
......
......@@ -308,6 +308,7 @@ struct ext2_inode {
#define EXT2_MOUNT_ERRORS_RO 0x0020 /* Remount fs ro on errors */
#define EXT2_MOUNT_ERRORS_PANIC 0x0040 /* Panic on errors */
#define EXT2_MOUNT_MINIX_DF 0x0080 /* Mimics the Minix statfs */
#define EXT2_MOUNT_NOBH 0x0100 /* No buffer_heads */
#define EXT2_MOUNT_NO_UID32 0x0200 /* Disable 32-bit UIDs */
#define EXT2_MOUNT_XATTR_USER 0x4000 /* Extended user attributes */
#define EXT2_MOUNT_POSIX_ACL 0x8000 /* POSIX Access Control Lists */
......
......@@ -70,6 +70,7 @@
#define PG_chainlock 15 /* lock bit for ->pte_chain */
#define PG_direct 16 /* ->pte_chain points directly at pte */
#define PG_mappedtodisk 17 /* Has blocks allocated on-disk */
/*
* Global page accounting. One instance per CPU. Only unsigned longs are
......@@ -233,6 +234,10 @@ extern void get_full_page_state(struct page_state *ret);
#define ClearPageDirect(page) clear_bit(PG_direct, &(page)->flags)
#define TestClearPageDirect(page) test_and_clear_bit(PG_direct, &(page)->flags)
/*
 * PG_mappedtodisk accessors: test, set and clear the bit in page->flags.
 */
#define PageMappedToDisk(page) test_bit(PG_mappedtodisk, &(page)->flags)
#define SetPageMappedToDisk(page) set_bit(PG_mappedtodisk, &(page)->flags)
#define ClearPageMappedToDisk(page) clear_bit(PG_mappedtodisk, &(page)->flags)
/*
* The PageSwapCache predicate doesn't use a PG_flag at this time,
* but it may again do so one day.
......
......@@ -265,7 +265,7 @@ static void prep_new_page(struct page *page, int order)
page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
1 << PG_referenced | 1 << PG_arch_1 |
1 << PG_checked);
1 << PG_checked | 1 << PG_mappedtodisk);
set_page_refs(page, order);
}
......
......@@ -52,6 +52,7 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
clear_page_dirty(page);
ClearPageUptodate(page);
ClearPageMappedToDisk(page);
remove_from_page_cache(page);
page_cache_release(page); /* pagecache ref */
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment