Commit 7feebd5c authored by Andrew Morton, committed by Linus Torvalds

[PATCH] Fix huge sparse tmpfs files

From: Hugh Dickins <hugh@veritas.com>

Kevin P. Fleming pointed out that the 2.6 tmpfs does not allow writing huge
sparse files.  This is an unintended side-effect of the strict memory commit
changes, which should have made no difference here.

The solution is to treat the tmpfs files (of variable size) and the shmem
objects (of fixed size) differently: sounds nasty but works out well.  The
shmem objects follow the VM preallocation convention as before, but the tmpfs
files revert to allocation on demand as a filesystem would.  If there's not
enough memory to write to a tmpfs hole, it is reported as -ENOSPC rather than
-ENOMEM, so the mmap writer gets SIGBUS rather than everyone else getting
OOM-killed.
parent 77c8efae
...@@ -123,6 +123,42 @@ static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb) ...@@ -123,6 +123,42 @@ static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
return sb->s_fs_info; return sb->s_fs_info;
} }
/*
* shmem_file_setup pre-accounts the whole fixed size of a VM object,
* for shared memory and for shared anonymous (/dev/zero) mappings
* (unless MAP_NORESERVE and sysctl_overcommit_memory <= 1),
* consistent with the pre-accounting of private mappings ...
*/
/*
 * Charge the full @size in one go, but only when @flags carries VM_ACCOUNT.
 * Returns whatever security_vm_enough_memory() reports for that charge,
 * or 0 when VM_ACCOUNT is clear and nothing is charged.
 */
static inline int shmem_acct_size(unsigned long flags, loff_t size)
{
	if (!(flags & VM_ACCOUNT))
		return 0;

	return security_vm_enough_memory(VM_ACCT(size));
}
/*
 * Counterpart of shmem_acct_size(): give back the charge for @size.
 * Does nothing unless @flags carries VM_ACCOUNT.
 */
static inline void shmem_unacct_size(unsigned long flags, loff_t size)
{
	if (!(flags & VM_ACCOUNT))
		return;

	vm_unacct_memory(VM_ACCT(size));
}
/*
* ... whereas tmpfs objects are accounted incrementally as
* pages are allocated, in order to allow huge sparse files.
* shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM,
* so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM.
*/
/*
 * Charge a single page (PAGE_CACHE_SIZE) for an object that is accounted
 * page by page, i.e. one whose @flags lack VM_ACCOUNT.
 * Returns 0 without charging when VM_ACCOUNT is set; otherwise returns
 * the result of security_vm_enough_memory() for the one-page charge.
 */
static inline int shmem_acct_block(unsigned long flags)
{
	if (flags & VM_ACCOUNT)
		return 0;

	return security_vm_enough_memory(VM_ACCT(PAGE_CACHE_SIZE));
}
/*
 * Counterpart of shmem_acct_block(): give back the charge for @pages
 * pages of PAGE_CACHE_SIZE each.  Does nothing when @flags carries
 * VM_ACCOUNT.
 */
static inline void shmem_unacct_blocks(unsigned long flags, long pages)
{
	if (flags & VM_ACCOUNT)
		return;

	vm_unacct_memory(pages * VM_ACCT(PAGE_CACHE_SIZE));
}
static struct super_operations shmem_ops; static struct super_operations shmem_ops;
static struct address_space_operations shmem_aops; static struct address_space_operations shmem_aops;
static struct file_operations shmem_file_operations; static struct file_operations shmem_file_operations;
...@@ -173,6 +209,7 @@ static void shmem_recalc_inode(struct inode *inode) ...@@ -173,6 +209,7 @@ static void shmem_recalc_inode(struct inode *inode)
sbinfo->free_blocks += freed; sbinfo->free_blocks += freed;
inode->i_blocks -= freed*BLOCKS_PER_PAGE; inode->i_blocks -= freed*BLOCKS_PER_PAGE;
spin_unlock(&sbinfo->stat_lock); spin_unlock(&sbinfo->stat_lock);
shmem_unacct_blocks(info->flags, freed);
} }
} }
...@@ -456,7 +493,7 @@ static void shmem_truncate(struct inode *inode) ...@@ -456,7 +493,7 @@ static void shmem_truncate(struct inode *inode)
shmem_dir_unmap(dir); shmem_dir_unmap(dir);
if (empty) { if (empty) {
shmem_dir_free(empty); shmem_dir_free(empty);
info->alloced++; shmem_free_block(inode);
} }
empty = subdir; empty = subdir;
cond_resched_lock(&info->lock); cond_resched_lock(&info->lock);
...@@ -479,19 +516,19 @@ static void shmem_truncate(struct inode *inode) ...@@ -479,19 +516,19 @@ static void shmem_truncate(struct inode *inode)
else if (subdir) { else if (subdir) {
*dir = NULL; *dir = NULL;
shmem_dir_free(subdir); shmem_dir_free(subdir);
info->alloced++; shmem_free_block(inode);
} }
} }
done1: done1:
shmem_dir_unmap(dir-1); shmem_dir_unmap(dir-1);
if (empty) { if (empty) {
shmem_dir_free(empty); shmem_dir_free(empty);
info->alloced++; shmem_free_block(inode);
} }
if (info->next_index <= SHMEM_NR_DIRECT) { if (info->next_index <= SHMEM_NR_DIRECT) {
shmem_dir_free(info->i_indirect); shmem_dir_free(info->i_indirect);
info->i_indirect = NULL; info->i_indirect = NULL;
info->alloced++; shmem_free_block(inode);
} }
done2: done2:
BUG_ON(info->swapped > info->next_index); BUG_ON(info->swapped > info->next_index);
...@@ -516,20 +553,10 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr) ...@@ -516,20 +553,10 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
{ {
struct inode *inode = dentry->d_inode; struct inode *inode = dentry->d_inode;
struct page *page = NULL; struct page *page = NULL;
long change = 0;
int error; int error;
if ((attr->ia_valid & ATTR_SIZE) && (attr->ia_size <= SHMEM_MAX_BYTES)) { if (attr->ia_valid & ATTR_SIZE) {
/* if (attr->ia_size < inode->i_size) {
* Account swap file usage based on new file size,
* but just let vmtruncate fail on out-of-range sizes.
*/
change = VM_ACCT(attr->ia_size) - VM_ACCT(inode->i_size);
if (change > 0) {
if (security_vm_enough_memory(change))
return -ENOMEM;
} else if (attr->ia_size < inode->i_size) {
vm_unacct_memory(-change);
/* /*
* If truncating down to a partial page, then * If truncating down to a partial page, then
* if that page is already allocated, hold it * if that page is already allocated, hold it
...@@ -563,8 +590,6 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr) ...@@ -563,8 +590,6 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
error = inode_setattr(inode, attr); error = inode_setattr(inode, attr);
if (page) if (page)
page_cache_release(page); page_cache_release(page);
if (error)
vm_unacct_memory(change);
return error; return error;
} }
...@@ -577,8 +602,7 @@ static void shmem_delete_inode(struct inode *inode) ...@@ -577,8 +602,7 @@ static void shmem_delete_inode(struct inode *inode)
spin_lock(&shmem_ilock); spin_lock(&shmem_ilock);
list_del(&info->list); list_del(&info->list);
spin_unlock(&shmem_ilock); spin_unlock(&shmem_ilock);
if (info->flags & VM_ACCOUNT) shmem_unacct_size(info->flags, inode->i_size);
vm_unacct_memory(VM_ACCT(inode->i_size));
inode->i_size = 0; inode->i_size = 0;
shmem_truncate(inode); shmem_truncate(inode);
} }
...@@ -909,7 +933,7 @@ static int shmem_getpage(struct inode *inode, unsigned long idx, struct page **p ...@@ -909,7 +933,7 @@ static int shmem_getpage(struct inode *inode, unsigned long idx, struct page **p
shmem_swp_unmap(entry); shmem_swp_unmap(entry);
sbinfo = SHMEM_SB(inode->i_sb); sbinfo = SHMEM_SB(inode->i_sb);
spin_lock(&sbinfo->stat_lock); spin_lock(&sbinfo->stat_lock);
if (sbinfo->free_blocks == 0) { if (sbinfo->free_blocks == 0 || shmem_acct_block(info->flags)) {
spin_unlock(&sbinfo->stat_lock); spin_unlock(&sbinfo->stat_lock);
spin_unlock(&info->lock); spin_unlock(&info->lock);
error = -ENOSPC; error = -ENOSPC;
...@@ -923,6 +947,7 @@ static int shmem_getpage(struct inode *inode, unsigned long idx, struct page **p ...@@ -923,6 +947,7 @@ static int shmem_getpage(struct inode *inode, unsigned long idx, struct page **p
spin_unlock(&info->lock); spin_unlock(&info->lock);
filepage = page_cache_alloc(mapping); filepage = page_cache_alloc(mapping);
if (!filepage) { if (!filepage) {
shmem_unacct_blocks(info->flags, 1);
shmem_free_block(inode); shmem_free_block(inode);
error = -ENOMEM; error = -ENOMEM;
goto failed; goto failed;
...@@ -940,6 +965,7 @@ static int shmem_getpage(struct inode *inode, unsigned long idx, struct page **p ...@@ -940,6 +965,7 @@ static int shmem_getpage(struct inode *inode, unsigned long idx, struct page **p
filepage, mapping, idx, GFP_ATOMIC)) { filepage, mapping, idx, GFP_ATOMIC)) {
spin_unlock(&info->lock); spin_unlock(&info->lock);
page_cache_release(filepage); page_cache_release(filepage);
shmem_unacct_blocks(info->flags, 1);
shmem_free_block(inode); shmem_free_block(inode);
filepage = NULL; filepage = NULL;
if (error) if (error)
...@@ -1094,7 +1120,6 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev) ...@@ -1094,7 +1120,6 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
info = SHMEM_I(inode); info = SHMEM_I(inode);
memset(info, 0, (char *)inode - (char *)info); memset(info, 0, (char *)inode - (char *)info);
spin_lock_init(&info->lock); spin_lock_init(&info->lock);
info->flags = VM_ACCOUNT;
switch (mode & S_IFMT) { switch (mode & S_IFMT) {
default: default:
init_special_inode(inode, mode, dev); init_special_inode(inode, mode, dev);
...@@ -1167,7 +1192,6 @@ shmem_file_write(struct file *file, const char __user *buf, size_t count, loff_t ...@@ -1167,7 +1192,6 @@ shmem_file_write(struct file *file, const char __user *buf, size_t count, loff_t
loff_t pos; loff_t pos;
unsigned long written; unsigned long written;
int err; int err;
loff_t maxpos;
if ((ssize_t) count < 0) if ((ssize_t) count < 0)
return -EINVAL; return -EINVAL;
...@@ -1184,15 +1208,6 @@ shmem_file_write(struct file *file, const char __user *buf, size_t count, loff_t ...@@ -1184,15 +1208,6 @@ shmem_file_write(struct file *file, const char __user *buf, size_t count, loff_t
if (err || !count) if (err || !count)
goto out; goto out;
maxpos = inode->i_size;
if (maxpos < pos + count) {
maxpos = pos + count;
if (security_vm_enough_memory(VM_ACCT(maxpos) - VM_ACCT(inode->i_size))) {
err = -ENOMEM;
goto out;
}
}
err = remove_suid(file->f_dentry); err = remove_suid(file->f_dentry);
if (err) if (err)
goto out; goto out;
...@@ -1267,10 +1282,6 @@ shmem_file_write(struct file *file, const char __user *buf, size_t count, loff_t ...@@ -1267,10 +1282,6 @@ shmem_file_write(struct file *file, const char __user *buf, size_t count, loff_t
*ppos = pos; *ppos = pos;
if (written) if (written)
err = written; err = written;
/* Short writes give back address space */
if (inode->i_size != maxpos)
vm_unacct_memory(VM_ACCT(maxpos) - VM_ACCT(inode->i_size));
out: out:
up(&inode->i_sem); up(&inode->i_sem);
return err; return err;
...@@ -1551,13 +1562,8 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s ...@@ -1551,13 +1562,8 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
memcpy(info, symname, len); memcpy(info, symname, len);
inode->i_op = &shmem_symlink_inline_operations; inode->i_op = &shmem_symlink_inline_operations;
} else { } else {
if (security_vm_enough_memory(VM_ACCT(1))) {
iput(inode);
return -ENOMEM;
}
error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL); error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL);
if (error) { if (error) {
vm_unacct_memory(VM_ACCT(1));
iput(inode); iput(inode);
return error; return error;
} }
...@@ -1947,7 +1953,7 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) ...@@ -1947,7 +1953,7 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
if (size > SHMEM_MAX_BYTES) if (size > SHMEM_MAX_BYTES)
return ERR_PTR(-EINVAL); return ERR_PTR(-EINVAL);
if ((flags & VM_ACCOUNT) && security_vm_enough_memory(VM_ACCT(size))) if (shmem_acct_size(flags, size))
return ERR_PTR(-ENOMEM); return ERR_PTR(-ENOMEM);
error = -ENOMEM; error = -ENOMEM;
...@@ -1969,7 +1975,7 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) ...@@ -1969,7 +1975,7 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
if (!inode) if (!inode)
goto close_file; goto close_file;
SHMEM_I(inode)->flags &= flags; SHMEM_I(inode)->flags = flags & VM_ACCOUNT;
d_instantiate(dentry, inode); d_instantiate(dentry, inode);
inode->i_size = size; inode->i_size = size;
inode->i_nlink = 0; /* It is unlinked */ inode->i_nlink = 0; /* It is unlinked */
...@@ -1985,8 +1991,7 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) ...@@ -1985,8 +1991,7 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
put_dentry: put_dentry:
dput(dentry); dput(dentry);
put_memory: put_memory:
if (flags & VM_ACCOUNT) shmem_unacct_size(flags, size);
vm_unacct_memory(VM_ACCT(size));
return ERR_PTR(error); return ERR_PTR(error);
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment