Commit b1ad1f4e, authored by Andrew Morton, committed by Linus Torvalds

[PATCH] no-buffer-head ext2 option

Implements a new set of block address_space_operations which will never
attach buffer_heads to file pagecache.  These can be turned on for ext2
with the `nobh' mount option.

During write-intensive testing on a 7G machine, total buffer_head
storage remained below 0.3 megabytes.  And those buffer_heads are
against ZONE_NORMAL pagecache and will be reclaimed by ZONE_NORMAL
memory pressure.

This work is, of course, aimed specifically at the huge highmem machines.
Possibly it obsoletes the buffer_heads_over_limit stuff (which doesn't
work terribly well), but that code is simple, and will provide relief
for other filesystems.


It should be noted that the nobh_prepare_write() function and the
PageMappedToDisk() infrastructure are what is needed to solve the
problem of user data corruption when the filesystem which backs a
sparse MAP_SHARED mapping runs out of space.  We can use this code in
filemap_nopage() to ensure that all mapped pages have space allocated
on-disk.  Deliver SIGBUS on ENOSPC.

This will require a new address_space op, I expect.
parent 36fb7f84
...@@ -1964,6 +1964,7 @@ int block_read_full_page(struct page *page, get_block_t *get_block) ...@@ -1964,6 +1964,7 @@ int block_read_full_page(struct page *page, get_block_t *get_block)
struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE]; struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
unsigned int blocksize, blocks; unsigned int blocksize, blocks;
int nr, i; int nr, i;
int fully_mapped = 1;
if (!PageLocked(page)) if (!PageLocked(page))
PAGE_BUG(page); PAGE_BUG(page);
...@@ -1986,6 +1987,7 @@ int block_read_full_page(struct page *page, get_block_t *get_block) ...@@ -1986,6 +1987,7 @@ int block_read_full_page(struct page *page, get_block_t *get_block)
continue; continue;
if (!buffer_mapped(bh)) { if (!buffer_mapped(bh)) {
fully_mapped = 0;
if (iblock < lblock) { if (iblock < lblock) {
if (get_block(inode, iblock, bh, 0)) if (get_block(inode, iblock, bh, 0))
SetPageError(page); SetPageError(page);
...@@ -2008,6 +2010,9 @@ int block_read_full_page(struct page *page, get_block_t *get_block) ...@@ -2008,6 +2010,9 @@ int block_read_full_page(struct page *page, get_block_t *get_block)
arr[nr++] = bh; arr[nr++] = bh;
} while (i++, iblock++, (bh = bh->b_this_page) != head); } while (i++, iblock++, (bh = bh->b_this_page) != head);
if (fully_mapped)
SetPageMappedToDisk(page);
if (!nr) { if (!nr) {
/* /*
* All buffers are uptodate - we can set the page uptodate * All buffers are uptodate - we can set the page uptodate
...@@ -2204,6 +2209,198 @@ int generic_commit_write(struct file *file, struct page *page, ...@@ -2204,6 +2209,198 @@ int generic_commit_write(struct file *file, struct page *page,
return 0; return 0;
} }
/*
 * nobh_prepare_write - get a pagecache page ready for a write to the byte
 * range [from, to) without attaching buffer_heads to the page.
 *
 * @page:      the page about to be written to
 * @from:      offset within the page of the first byte to be written
 * @to:        offset within the page one past the last byte to be written
 * @get_block: filesystem callback used to map each block of the page
 *
 * On entry, the page is fully not uptodate.
 * On exit the page is fully uptodate in the areas outside (from,to).
 *
 * If the page is already flagged PageMappedToDisk nothing is done and 0 is
 * returned.  Otherwise every block of the page is handed to @get_block:
 * blocks which start before @to are mapped with create=1, blocks which start
 * at or after @to are only looked up (create=0).  Holes and freshly
 * allocated blocks have their bytes outside the write range zeroed in the
 * page; already-allocated blocks which are only partially covered by the
 * write are read from disk through temporary buffer_heads which are freed
 * again before returning.
 *
 * On success the page is marked uptodate, is flagged PageMappedToDisk if
 * every block turned out to be mapped, is dirtied if any part of it was
 * zeroed here, and 0 is returned.
 *
 * On failure (an error from @get_block, -ENOMEM from alloc_buffer_head(), or
 * -EIO when a read did not come back uptodate) the whole page is zeroed,
 * marked uptodate and dirty, and the error is returned.
 */
int nobh_prepare_write(struct page *page, unsigned from, unsigned to,
			get_block_t *get_block)
{
	struct inode *inode = page->mapping->host;
	const unsigned blkbits = inode->i_blkbits;
	const unsigned blocksize = 1 << blkbits;
	struct buffer_head map_bh;	/* on-stack scratch bh for get_block */
	struct buffer_head *read_bh[MAX_BUF_PER_PAGE];
	unsigned block_in_page;
	unsigned block_start;
	sector_t block_in_file;
	char *kaddr;
	int nr_reads = 0;
	int i;
	int ret = 0;
	int is_mapped_to_disk = 1;
	int dirtied_it = 0;

	if (PageMappedToDisk(page))
		return 0;

	/* File-relative number of the first block held by this page. */
	block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
	map_bh.b_page = page;

	/*
	 * We loop across all blocks in the page, whether or not they are
	 * part of the affected region.  This is so we can discover if the
	 * page is fully mapped-to-disk.
	 */
	for (block_start = 0, block_in_page = 0;
		  block_start < PAGE_CACHE_SIZE;
		  block_in_page++, block_start += blocksize) {
		unsigned block_end = block_start + blocksize;
		int create;

		/* Start each lookup with a clean state word. */
		map_bh.b_state = 0;

		/* Only look up, never allocate, blocks beyond the write. */
		create = 1;
		if (block_start >= to)
			create = 0;
		ret = get_block(inode, block_in_file + block_in_page,
					&map_bh, create);
		if (ret)
			goto failed;
		if (!buffer_mapped(&map_bh))
			is_mapped_to_disk = 0;
		if (buffer_new(&map_bh))
			unmap_underlying_metadata(map_bh.b_bdev,
							map_bh.b_blocknr);

		/* Page contents are already valid: nothing to zero or read. */
		if (PageUptodate(page))
			continue;

		if (buffer_new(&map_bh) || !buffer_mapped(&map_bh)) {
			/*
			 * Nothing useful on disk for this block: zero the
			 * parts of it which the caller will not overwrite.
			 *
			 * NOTE(review): for a block lying wholly before @from
			 * (or wholly after @to) these memsets reach into the
			 * neighbouring blocks.  Blocks which need real data
			 * are read in after this loop, which overwrites the
			 * zeroes - confirm this is acceptable for blocks
			 * taking the buffer_uptodate() path below.
			 */
			kaddr = kmap_atomic(page, KM_USER0);
			if (block_start < from) {
				memset(kaddr+block_start, 0, from-block_start);
				dirtied_it = 1;
			}
			if (block_end > to) {
				memset(kaddr + to, 0, block_end - to);
				dirtied_it = 1;
			}
			flush_dcache_page(page);
			kunmap_atomic(kaddr, KM_USER0);
			continue;
		}
		if (buffer_uptodate(&map_bh))
			continue;	/* reiserfs does this */

		/*
		 * Mapped block only partially covered by the write: queue a
		 * read of it so the bytes outside (from,to) become valid.
		 */
		if (block_start < from || block_end > to) {
			struct buffer_head *bh = alloc_buffer_head();

			if (!bh) {
				ret = -ENOMEM;
				goto failed;
			}
			bh->b_state = map_bh.b_state;
			atomic_set(&bh->b_count, 0);
			bh->b_this_page = NULL;
			bh->b_page = page;
			bh->b_blocknr = map_bh.b_blocknr;
			bh->b_size = blocksize;
			/*
			 * b_data carries the byte offset of the block within
			 * the page, not a kernel address.  Widen through
			 * unsigned long so the integer-to-pointer conversion
			 * is pointer-sized on 64-bit builds.
			 * NOTE(review): presumably the I/O submission path
			 * recovers the in-page offset from b_data - confirm.
			 */
			bh->b_data = (char *)(unsigned long)block_start;
			bh->b_bdev = map_bh.b_bdev;
			bh->b_private = NULL;
			read_bh[nr_reads++] = bh;
		}
	}

	if (nr_reads) {
		ll_rw_block(READ, nr_reads, read_bh);
		for (i = 0; i < nr_reads; i++) {
			wait_on_buffer(read_bh[i]);
			if (!buffer_uptodate(read_bh[i]))
				ret = -EIO;
			free_buffer_head(read_bh[i]);
			/* NULL the slot so the failure path skips it. */
			read_bh[i] = NULL;
		}
		if (ret)
			goto failed;
	}

	if (is_mapped_to_disk)
		SetPageMappedToDisk(page);
	SetPageUptodate(page);

	/*
	 * Setting the page dirty here isn't necessary for the prepare_write
	 * function - commit_write will do that.  But if/when this function is
	 * used within the pagefault handler to ensure that all mmapped pages
	 * have backing space in the filesystem, we will need to dirty the page
	 * if its contents were altered.
	 */
	if (dirtied_it)
		set_page_dirty(page);

	return 0;

failed:
	/* Release any temporary read buffer_heads still outstanding. */
	for (i = 0; i < nr_reads; i++) {
		if (read_bh[i])
			free_buffer_head(read_bh[i]);
	}

	/*
	 * Error recovery is pretty slack.  Clear the page and mark it dirty
	 * so we'll later zero out any blocks which _were_ allocated.
	 */
	kaddr = kmap_atomic(page, KM_USER0);
	memset(kaddr, 0, PAGE_CACHE_SIZE);
	kunmap_atomic(kaddr, KM_USER0);
	SetPageUptodate(page);
	set_page_dirty(page);
	return ret;
}
EXPORT_SYMBOL(nobh_prepare_write);
int nobh_commit_write(struct file *file, struct page *page,
unsigned from, unsigned to)
{
struct inode *inode = page->mapping->host;
loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
set_page_dirty(page);
if (pos > inode->i_size) {
inode->i_size = pos;
mark_inode_dirty(inode);
}
return 0;
}
EXPORT_SYMBOL(nobh_commit_write);
/*
 * nobh_truncate_page - zero the tail of the page which contains byte
 * offset @from of @mapping.
 *
 * If @from falls on a block boundary there is nothing to do and 0 is
 * returned.  Otherwise the page holding @from is grabbed, the mapping's
 * ->prepare_write() is called for the range running from @from's offset in
 * the page up to the end of the block containing it, and on success the
 * page is zeroed from that offset to the end of the page and dirtied.
 *
 * Returns 0 on success, -ENOMEM if the page could not be obtained, or the
 * error returned by ->prepare_write().
 *
 * This function assumes that ->prepare_write() uses nobh_prepare_write().
 */
int nobh_truncate_page(struct address_space *mapping, loff_t from)
{
	struct inode *host = mapping->host;
	struct address_space_operations *aops = mapping->a_ops;
	const unsigned blksz = 1 << host->i_blkbits;
	const pgoff_t pgidx = from >> PAGE_CACHE_SHIFT;
	const unsigned start = from & (PAGE_CACHE_SIZE-1);
	struct page *pg;
	unsigned blk_end;
	int err;

	/* Block-aligned truncation point: no partial block to clear. */
	if (!(start & (blksz - 1)))
		return 0;

	pg = grab_cache_page(mapping, pgidx);
	if (pg == NULL)
		return -ENOMEM;

	/* Round up to the end of the block which contains @from. */
	blk_end = (start + blksz) & ~(blksz - 1);
	err = aops->prepare_write(NULL, pg, start, blk_end);
	if (!err) {
		char *vaddr = kmap_atomic(pg, KM_USER0);

		memset(vaddr + start, 0, PAGE_CACHE_SIZE - start);
		flush_dcache_page(pg);
		kunmap_atomic(vaddr, KM_USER0);
		set_page_dirty(pg);
	}

	/*
	 * NOTE(review): presumably grab_cache_page() hands the page back
	 * locked and with a reference held - both are dropped here.
	 */
	unlock_page(pg);
	page_cache_release(pg);
	return err;
}
EXPORT_SYMBOL(nobh_truncate_page);
int block_truncate_page(struct address_space *mapping, int block_truncate_page(struct address_space *mapping,
loff_t from, get_block_t *get_block) loff_t from, get_block_t *get_block)
{ {
......
...@@ -120,6 +120,7 @@ extern struct file_operations ext2_file_operations; ...@@ -120,6 +120,7 @@ extern struct file_operations ext2_file_operations;
/* inode.c */ /* inode.c */
extern struct address_space_operations ext2_aops; extern struct address_space_operations ext2_aops;
extern struct address_space_operations ext2_nobh_aops;
/* namei.c */ /* namei.c */
extern struct inode_operations ext2_dir_inode_operations; extern struct inode_operations ext2_dir_inode_operations;
......
...@@ -612,6 +612,13 @@ ext2_prepare_write(struct file *file, struct page *page, ...@@ -612,6 +612,13 @@ ext2_prepare_write(struct file *file, struct page *page,
return block_prepare_write(page,from,to,ext2_get_block); return block_prepare_write(page,from,to,ext2_get_block);
} }
/*
 * ->prepare_write() for the no-buffer-head address space operations:
 * hands the request to nobh_prepare_write() with ext2_get_block as the
 * block-mapping callback.  @file is not used.
 */
static int ext2_nobh_prepare_write(struct file *file, struct page *page,
				   unsigned from, unsigned to)
{
	return nobh_prepare_write(page, from, to, ext2_get_block);
}
static sector_t ext2_bmap(struct address_space *mapping, sector_t block) static sector_t ext2_bmap(struct address_space *mapping, sector_t block)
{ {
return generic_block_bmap(mapping,block,ext2_get_block); return generic_block_bmap(mapping,block,ext2_get_block);
...@@ -657,6 +664,18 @@ struct address_space_operations ext2_aops = { ...@@ -657,6 +664,18 @@ struct address_space_operations ext2_aops = {
.writepages = ext2_writepages, .writepages = ext2_writepages,
}; };
/*
 * Address space operations in which write preparation and commit go
 * through the nobh helpers; the remaining entries are the ext2 and generic
 * block routines named below.
 */
struct address_space_operations ext2_nobh_aops = {
	/* buffered write path: the no-buffer-head variants */
	.prepare_write	= ext2_nobh_prepare_write,
	.commit_write	= nobh_commit_write,

	/* page read-in */
	.readpage	= ext2_readpage,
	.readpages	= ext2_readpages,

	/* page write-out */
	.writepage	= ext2_writepage,
	.writepages	= ext2_writepages,

	/* everything else */
	.sync_page	= block_sync_page,
	.bmap		= ext2_bmap,
	.direct_IO	= ext2_direct_IO,
};
/* /*
* Probably it should be a library function... search for first non-zero word * Probably it should be a library function... search for first non-zero word
* or memcmp with zero_page, whatever is better for particular architecture. * or memcmp with zero_page, whatever is better for particular architecture.
...@@ -864,7 +883,11 @@ void ext2_truncate (struct inode * inode) ...@@ -864,7 +883,11 @@ void ext2_truncate (struct inode * inode)
iblock = (inode->i_size + blocksize-1) iblock = (inode->i_size + blocksize-1)
>> EXT2_BLOCK_SIZE_BITS(inode->i_sb); >> EXT2_BLOCK_SIZE_BITS(inode->i_sb);
block_truncate_page(inode->i_mapping, inode->i_size, ext2_get_block); if (test_opt(inode->i_sb, NOBH))
nobh_truncate_page(inode->i_mapping, inode->i_size);
else
block_truncate_page(inode->i_mapping,
inode->i_size, ext2_get_block);
n = ext2_block_to_path(inode, iblock, offsets, NULL); n = ext2_block_to_path(inode, iblock, offsets, NULL);
if (n == 0) if (n == 0)
...@@ -1044,17 +1067,26 @@ void ext2_read_inode (struct inode * inode) ...@@ -1044,17 +1067,26 @@ void ext2_read_inode (struct inode * inode)
if (S_ISREG(inode->i_mode)) { if (S_ISREG(inode->i_mode)) {
inode->i_op = &ext2_file_inode_operations; inode->i_op = &ext2_file_inode_operations;
inode->i_fop = &ext2_file_operations; inode->i_fop = &ext2_file_operations;
inode->i_mapping->a_ops = &ext2_aops; if (test_opt(inode->i_sb, NOBH))
inode->i_mapping->a_ops = &ext2_nobh_aops;
else
inode->i_mapping->a_ops = &ext2_aops;
} else if (S_ISDIR(inode->i_mode)) { } else if (S_ISDIR(inode->i_mode)) {
inode->i_op = &ext2_dir_inode_operations; inode->i_op = &ext2_dir_inode_operations;
inode->i_fop = &ext2_dir_operations; inode->i_fop = &ext2_dir_operations;
inode->i_mapping->a_ops = &ext2_aops; if (test_opt(inode->i_sb, NOBH))
inode->i_mapping->a_ops = &ext2_nobh_aops;
else
inode->i_mapping->a_ops = &ext2_aops;
} else if (S_ISLNK(inode->i_mode)) { } else if (S_ISLNK(inode->i_mode)) {
if (ext2_inode_is_fast_symlink(inode)) if (ext2_inode_is_fast_symlink(inode))
inode->i_op = &ext2_fast_symlink_inode_operations; inode->i_op = &ext2_fast_symlink_inode_operations;
else { else {
inode->i_op = &ext2_symlink_inode_operations; inode->i_op = &ext2_symlink_inode_operations;
inode->i_mapping->a_ops = &ext2_aops; if (test_opt(inode->i_sb, NOBH))
inode->i_mapping->a_ops = &ext2_nobh_aops;
else
inode->i_mapping->a_ops = &ext2_aops;
} }
} else { } else {
inode->i_op = &ext2_special_inode_operations; inode->i_op = &ext2_special_inode_operations;
......
...@@ -127,7 +127,10 @@ static int ext2_create (struct inode * dir, struct dentry * dentry, int mode) ...@@ -127,7 +127,10 @@ static int ext2_create (struct inode * dir, struct dentry * dentry, int mode)
if (!IS_ERR(inode)) { if (!IS_ERR(inode)) {
inode->i_op = &ext2_file_inode_operations; inode->i_op = &ext2_file_inode_operations;
inode->i_fop = &ext2_file_operations; inode->i_fop = &ext2_file_operations;
inode->i_mapping->a_ops = &ext2_aops; if (test_opt(inode->i_sb, NOBH))
inode->i_mapping->a_ops = &ext2_nobh_aops;
else
inode->i_mapping->a_ops = &ext2_aops;
mark_inode_dirty(inode); mark_inode_dirty(inode);
err = ext2_add_nondir(dentry, inode); err = ext2_add_nondir(dentry, inode);
} }
...@@ -168,7 +171,10 @@ static int ext2_symlink (struct inode * dir, struct dentry * dentry, ...@@ -168,7 +171,10 @@ static int ext2_symlink (struct inode * dir, struct dentry * dentry,
if (l > sizeof (EXT2_I(inode)->i_data)) { if (l > sizeof (EXT2_I(inode)->i_data)) {
/* slow symlink */ /* slow symlink */
inode->i_op = &ext2_symlink_inode_operations; inode->i_op = &ext2_symlink_inode_operations;
inode->i_mapping->a_ops = &ext2_aops; if (test_opt(inode->i_sb, NOBH))
inode->i_mapping->a_ops = &ext2_nobh_aops;
else
inode->i_mapping->a_ops = &ext2_aops;
err = page_symlink(inode, symname, l); err = page_symlink(inode, symname, l);
if (err) if (err)
goto out_fail; goto out_fail;
...@@ -222,7 +228,10 @@ static int ext2_mkdir(struct inode * dir, struct dentry * dentry, int mode) ...@@ -222,7 +228,10 @@ static int ext2_mkdir(struct inode * dir, struct dentry * dentry, int mode)
inode->i_op = &ext2_dir_inode_operations; inode->i_op = &ext2_dir_inode_operations;
inode->i_fop = &ext2_dir_operations; inode->i_fop = &ext2_dir_operations;
inode->i_mapping->a_ops = &ext2_aops; if (test_opt(inode->i_sb, NOBH))
inode->i_mapping->a_ops = &ext2_nobh_aops;
else
inode->i_mapping->a_ops = &ext2_aops;
ext2_inc_count(inode); ext2_inc_count(inode);
......
...@@ -391,6 +391,8 @@ static int parse_options (char * options, ...@@ -391,6 +391,8 @@ static int parse_options (char * options,
set_opt (sbi->s_mount_opt, OLDALLOC); set_opt (sbi->s_mount_opt, OLDALLOC);
else if (!strcmp (this_char, "orlov")) else if (!strcmp (this_char, "orlov"))
clear_opt (sbi->s_mount_opt, OLDALLOC); clear_opt (sbi->s_mount_opt, OLDALLOC);
else if (!strcmp (this_char, "nobh"))
set_opt(sbi->s_mount_opt, NOBH);
/* Silently ignore the quota options */ /* Silently ignore the quota options */
else if (!strcmp (this_char, "grpquota") else if (!strcmp (this_char, "grpquota")
|| !strcmp (this_char, "noquota") || !strcmp (this_char, "noquota")
......
...@@ -178,6 +178,7 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages, ...@@ -178,6 +178,7 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
struct block_device *bdev = NULL; struct block_device *bdev = NULL;
struct buffer_head bh; struct buffer_head bh;
int length; int length;
int fully_mapped = 1;
if (page_has_buffers(page)) if (page_has_buffers(page))
goto confused; goto confused;
...@@ -194,6 +195,7 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages, ...@@ -194,6 +195,7 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
} }
if (!buffer_mapped(&bh)) { if (!buffer_mapped(&bh)) {
fully_mapped = 0;
if (first_hole == blocks_per_page) if (first_hole == blocks_per_page)
first_hole = page_block; first_hole = page_block;
continue; continue;
...@@ -220,6 +222,8 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages, ...@@ -220,6 +222,8 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
unlock_page(page); unlock_page(page);
goto out; goto out;
} }
} else if (fully_mapped) {
SetPageMappedToDisk(page);
} }
/* /*
......
...@@ -190,6 +190,9 @@ sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *); ...@@ -190,6 +190,9 @@ sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *);
int generic_commit_write(struct file *, struct page *, unsigned, unsigned); int generic_commit_write(struct file *, struct page *, unsigned, unsigned);
int block_truncate_page(struct address_space *, loff_t, get_block_t *); int block_truncate_page(struct address_space *, loff_t, get_block_t *);
int file_fsync(struct file *, struct dentry *, int); int file_fsync(struct file *, struct dentry *, int);
int nobh_prepare_write(struct page*, unsigned, unsigned, get_block_t*);
int nobh_commit_write(struct file *, struct page *, unsigned, unsigned);
int nobh_truncate_page(struct address_space *, loff_t);
#define OSYNC_METADATA (1<<0) #define OSYNC_METADATA (1<<0)
#define OSYNC_DATA (1<<1) #define OSYNC_DATA (1<<1)
......
...@@ -308,6 +308,7 @@ struct ext2_inode { ...@@ -308,6 +308,7 @@ struct ext2_inode {
#define EXT2_MOUNT_ERRORS_RO 0x0020 /* Remount fs ro on errors */ #define EXT2_MOUNT_ERRORS_RO 0x0020 /* Remount fs ro on errors */
#define EXT2_MOUNT_ERRORS_PANIC 0x0040 /* Panic on errors */ #define EXT2_MOUNT_ERRORS_PANIC 0x0040 /* Panic on errors */
#define EXT2_MOUNT_MINIX_DF 0x0080 /* Mimics the Minix statfs */ #define EXT2_MOUNT_MINIX_DF 0x0080 /* Mimics the Minix statfs */
#define EXT2_MOUNT_NOBH 0x0100 /* No buffer_heads */
#define EXT2_MOUNT_NO_UID32 0x0200 /* Disable 32-bit UIDs */ #define EXT2_MOUNT_NO_UID32 0x0200 /* Disable 32-bit UIDs */
#define EXT2_MOUNT_XATTR_USER 0x4000 /* Extended user attributes */ #define EXT2_MOUNT_XATTR_USER 0x4000 /* Extended user attributes */
#define EXT2_MOUNT_POSIX_ACL 0x8000 /* POSIX Access Control Lists */ #define EXT2_MOUNT_POSIX_ACL 0x8000 /* POSIX Access Control Lists */
......
...@@ -70,6 +70,7 @@ ...@@ -70,6 +70,7 @@
#define PG_chainlock 15 /* lock bit for ->pte_chain */ #define PG_chainlock 15 /* lock bit for ->pte_chain */
#define PG_direct 16 /* ->pte_chain points directly at pte */ #define PG_direct 16 /* ->pte_chain points directly at pte */
#define PG_mappedtodisk 17 /* Has blocks allocated on-disk */
/* /*
* Global page accounting. One instance per CPU. Only unsigned longs are * Global page accounting. One instance per CPU. Only unsigned longs are
...@@ -233,6 +234,10 @@ extern void get_full_page_state(struct page_state *ret); ...@@ -233,6 +234,10 @@ extern void get_full_page_state(struct page_state *ret);
#define ClearPageDirect(page) clear_bit(PG_direct, &(page)->flags) #define ClearPageDirect(page) clear_bit(PG_direct, &(page)->flags)
#define TestClearPageDirect(page) test_and_clear_bit(PG_direct, &(page)->flags) #define TestClearPageDirect(page) test_and_clear_bit(PG_direct, &(page)->flags)
#define PageMappedToDisk(page) test_bit(PG_mappedtodisk, &(page)->flags)
#define SetPageMappedToDisk(page) set_bit(PG_mappedtodisk, &(page)->flags)
#define ClearPageMappedToDisk(page) clear_bit(PG_mappedtodisk, &(page)->flags)
/* /*
* The PageSwapCache predicate doesn't use a PG_flag at this time, * The PageSwapCache predicate doesn't use a PG_flag at this time,
* but it may again do so one day. * but it may again do so one day.
......
...@@ -265,7 +265,7 @@ static void prep_new_page(struct page *page, int order) ...@@ -265,7 +265,7 @@ static void prep_new_page(struct page *page, int order)
page->flags &= ~(1 << PG_uptodate | 1 << PG_error | page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
1 << PG_referenced | 1 << PG_arch_1 | 1 << PG_referenced | 1 << PG_arch_1 |
1 << PG_checked); 1 << PG_checked | 1 << PG_mappedtodisk);
set_page_refs(page, order); set_page_refs(page, order);
} }
......
...@@ -52,6 +52,7 @@ truncate_complete_page(struct address_space *mapping, struct page *page) ...@@ -52,6 +52,7 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
clear_page_dirty(page); clear_page_dirty(page);
ClearPageUptodate(page); ClearPageUptodate(page);
ClearPageMappedToDisk(page);
remove_from_page_cache(page); remove_from_page_cache(page);
page_cache_release(page); /* pagecache ref */ page_cache_release(page); /* pagecache ref */
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment