Commit e2ea8374 authored by Andrew Morton, committed by Linus Torvalds

[PATCH] mremap: move_vma fixes and cleanup

From: Hugh Dickins <hugh@veritas.com>

Partial rewrite of mremap's move_vma.  Rajesh Venkatasubramanian has pointed
out that vmtruncate could miss ptes, leaving orphaned pages, because move_vma
only made the new vma visible after filling it.  We see no good reason for
that, and it is time to make move_vma more robust.

Removed all of move_vma's own vma merging decisions, leaving them to mmap.c's
vma_merge, with copy_vma added.  Removed the duplicated is_mergeable_vma test
from vma_merge, and the duplicated validate_mm from insert_vm_struct.
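
(For orientation, a condensed sketch of the new merge-or-allocate flow,
paraphrased from the mm/mmap.c hunk below rather than quoted verbatim:
vma_merge now returns the merged vma, or NULL, and copy_vma only allocates
a fresh vma when no merge is possible.)

	new_vma = vma_merge(mm, prev, rb_parent, addr, addr + len,
			vma->vm_flags, vma->vm_file, pgoff);
	if (!new_vma) {
		new_vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
		if (new_vma) {
			*new_vma = *vma;	/* then fix up vm_start/vm_end/vm_pgoff */
			vma_link(mm, new_vma, prev, rb_link, rb_parent);
		}
	}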

move_vma now moves from old to new and then unmaps old; but on error it moves
back from new to old and unmaps new.  Don't unwind within move_page_tables;
instead let move_vma call it explicitly to unwind, with the right source vma.
Get the VM_ACCOUNTing right even when the final do_munmap fails.
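
(Condensed sketch of the new move_vma sequence described above, paraphrased
from the mm/mremap.c hunk below with the accounting details omitted;
move_page_tables now returns how many bytes it managed to move, so a short
count signals the error path.)

	new_vma = copy_vma(vma, new_addr, new_len, new_pgoff);	/* new vma visible before filling */
	if (!new_vma)
		return -ENOMEM;

	moved_len = move_page_tables(vma, new_addr, old_addr, old_len);
	if (moved_len < old_len) {
		/* move entries back; the old page tables are still there, so this succeeds */
		move_page_tables(new_vma, old_addr, new_addr, moved_len);
		/* from here on, unmap the new range instead of the old one */
		vma = new_vma;
		old_len = new_len;
		old_addr = new_addr;
		new_addr = -ENOMEM;
	}
	do_munmap(mm, old_addr, old_len);
	return new_addr;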
parent 209b450c
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -541,6 +541,8 @@ extern void si_meminfo_node(struct sysinfo *val, int nid);
 extern void insert_vm_struct(struct mm_struct *, struct vm_area_struct *);
 extern void __vma_link_rb(struct mm_struct *, struct vm_area_struct *,
 	struct rb_node **, struct rb_node *);
+extern struct vm_area_struct *copy_vma(struct vm_area_struct *,
+	unsigned long addr, unsigned long len, unsigned long pgoff);
 extern void exit_mmap(struct mm_struct *);
 extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -385,7 +385,8 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
  * whether that can be merged with its predecessor or its successor. Or
  * both (it neatly fills a hole).
  */
-static int vma_merge(struct mm_struct *mm, struct vm_area_struct *prev,
+static struct vm_area_struct *vma_merge(struct mm_struct *mm,
+			struct vm_area_struct *prev,
 			struct rb_node *rb_parent, unsigned long addr,
 			unsigned long end, unsigned long vm_flags,
 			struct file *file, unsigned long pgoff)
@@ -399,7 +400,7 @@ static int vma_merge(struct mm_struct *mm, struct vm_area_struct *prev,
 	 * vma->vm_flags & VM_SPECIAL, too.
 	 */
 	if (vm_flags & VM_SPECIAL)
-		return 0;
+		return NULL;
 
 	i_shared_sem = file ? &file->f_mapping->i_shared_sem : NULL;
@@ -412,7 +413,6 @@ static int vma_merge(struct mm_struct *mm, struct vm_area_struct *prev,
 	 * Can it merge with the predecessor?
 	 */
 	if (prev->vm_end == addr &&
-			is_mergeable_vma(prev, file, vm_flags) &&
 			can_vma_merge_after(prev, vm_flags, file, pgoff)) {
 		struct vm_area_struct *next;
 		int need_up = 0;
@@ -443,12 +443,12 @@ static int vma_merge(struct mm_struct *mm, struct vm_area_struct *prev,
 			mm->map_count--;
 			kmem_cache_free(vm_area_cachep, next);
-			return 1;
+			return prev;
 		}
 		spin_unlock(lock);
 		if (need_up)
 			up(i_shared_sem);
-		return 1;
+		return prev;
 	}
 
 	/*
@@ -459,7 +459,7 @@ static int vma_merge(struct mm_struct *mm, struct vm_area_struct *prev,
 merge_next:
 		if (!can_vma_merge_before(prev, vm_flags, file,
 				pgoff, (end - addr) >> PAGE_SHIFT))
-			return 0;
+			return NULL;
 		if (end == prev->vm_start) {
 			if (file)
 				down(i_shared_sem);
@@ -469,11 +469,11 @@ static int vma_merge(struct mm_struct *mm, struct vm_area_struct *prev,
 			spin_unlock(lock);
 			if (file)
 				up(i_shared_sem);
-			return 1;
+			return prev;
 		}
 	}
 
-	return 0;
+	return NULL;
 }
 
 /*
@@ -1492,5 +1492,36 @@ void insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
 	if (__vma && __vma->vm_start < vma->vm_end)
 		BUG();
 	vma_link(mm, vma, prev, rb_link, rb_parent);
-	validate_mm(mm);
 }
+
+/*
+ * Copy the vma structure to a new location in the same mm,
+ * prior to moving page table entries, to effect an mremap move.
+ */
+struct vm_area_struct *copy_vma(struct vm_area_struct *vma,
+	unsigned long addr, unsigned long len, unsigned long pgoff)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	struct vm_area_struct *new_vma, *prev;
+	struct rb_node **rb_link, *rb_parent;
+
+	find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
+	new_vma = vma_merge(mm, prev, rb_parent, addr, addr + len,
+			vma->vm_flags, vma->vm_file, pgoff);
+	if (!new_vma) {
+		new_vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+		if (new_vma) {
+			*new_vma = *vma;
+			INIT_LIST_HEAD(&new_vma->shared);
+			new_vma->vm_start = addr;
+			new_vma->vm_end = addr + len;
+			new_vma->vm_pgoff = pgoff;
+			if (new_vma->vm_file)
+				get_file(new_vma->vm_file);
+			if (new_vma->vm_ops && new_vma->vm_ops->open)
+				new_vma->vm_ops->open(new_vma);
+			vma_link(mm, new_vma, prev, rb_link, rb_parent);
+		}
+	}
+	return new_vma;
+}
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -148,7 +148,7 @@ move_one_page(struct vm_area_struct *vma, unsigned long old_addr,
 static int move_page_tables(struct vm_area_struct *vma,
 	unsigned long new_addr, unsigned long old_addr, unsigned long len)
 {
-	unsigned long offset = len;
+	unsigned long offset;
 
 	flush_cache_range(vma, old_addr, old_addr + len);
@@ -157,137 +157,75 @@ static int move_page_tables(struct vm_area_struct *vma,
 	 * easy way out on the assumption that most remappings will be
 	 * only a few pages.. This also makes error recovery easier.
 	 */
-	while (offset) {
-		offset -= PAGE_SIZE;
-		if (move_one_page(vma, old_addr + offset, new_addr + offset))
-			goto oops_we_failed;
+	for (offset = 0; offset < len; offset += PAGE_SIZE) {
+		if (move_one_page(vma, old_addr+offset, new_addr+offset) < 0)
+			break;
 	}
-	return 0;
-
-	/*
-	 * Ok, the move failed because we didn't have enough pages for
-	 * the new page table tree. This is unlikely, but we have to
-	 * take the possibility into account. In that case we just move
-	 * all the pages back (this will work, because we still have
-	 * the old page tables)
-	 */
-oops_we_failed:
-	flush_cache_range(vma, new_addr, new_addr + len);
-	while ((offset += PAGE_SIZE) < len)
-		move_one_page(vma, new_addr + offset, old_addr + offset);
-	zap_page_range(vma, new_addr, len);
-	return -1;
+	return offset;
 }
 
 static unsigned long move_vma(struct vm_area_struct *vma,
-	unsigned long addr, unsigned long old_len, unsigned long new_len,
-	unsigned long new_addr)
+	unsigned long old_addr, unsigned long old_len,
+	unsigned long new_len, unsigned long new_addr)
 {
 	struct mm_struct *mm = vma->vm_mm;
-	struct vm_area_struct *new_vma, *next, *prev;
-	int allocated_vma;
+	struct vm_area_struct *new_vma;
+	unsigned long vm_flags = vma->vm_flags;
+	unsigned long new_pgoff;
+	unsigned long moved_len;
+	unsigned long excess = 0;
 	int split = 0;
 
-	new_vma = NULL;
-	next = find_vma_prev(mm, new_addr, &prev);
-	if (next) {
-		if (prev && prev->vm_end == new_addr &&
-		    can_vma_merge(prev, vma->vm_flags) && !vma->vm_file &&
-		    !(vma->vm_flags & VM_SHARED)) {
-			spin_lock(&mm->page_table_lock);
-			prev->vm_end = new_addr + new_len;
-			spin_unlock(&mm->page_table_lock);
-			new_vma = prev;
-			if (next != prev->vm_next)
-				BUG();
-			if (prev->vm_end == next->vm_start &&
-					can_vma_merge(next, prev->vm_flags)) {
-				spin_lock(&mm->page_table_lock);
-				prev->vm_end = next->vm_end;
-				__vma_unlink(mm, next, prev);
-				spin_unlock(&mm->page_table_lock);
-				if (vma == next)
-					vma = prev;
-				mm->map_count--;
-				kmem_cache_free(vm_area_cachep, next);
-			}
-		} else if (next->vm_start == new_addr + new_len &&
-			can_vma_merge(next, vma->vm_flags) &&
-			!vma->vm_file && !(vma->vm_flags & VM_SHARED)) {
-			spin_lock(&mm->page_table_lock);
-			next->vm_start = new_addr;
-			spin_unlock(&mm->page_table_lock);
-			new_vma = next;
-		}
-	} else {
-		prev = find_vma(mm, new_addr-1);
-		if (prev && prev->vm_end == new_addr &&
-		    can_vma_merge(prev, vma->vm_flags) && !vma->vm_file &&
-		    !(vma->vm_flags & VM_SHARED)) {
-			spin_lock(&mm->page_table_lock);
-			prev->vm_end = new_addr + new_len;
-			spin_unlock(&mm->page_table_lock);
-			new_vma = prev;
-		}
-	}
-
-	allocated_vma = 0;
-	if (!new_vma) {
-		new_vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
-		if (!new_vma)
-			goto out;
-		allocated_vma = 1;
-	}
+	new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
+	new_vma = copy_vma(vma, new_addr, new_len, new_pgoff);
+	if (!new_vma)
+		return -ENOMEM;
 
-	if (!move_page_tables(vma, new_addr, addr, old_len)) {
-		unsigned long vm_locked = vma->vm_flags & VM_LOCKED;
-
-		if (allocated_vma) {
-			*new_vma = *vma;
-			INIT_LIST_HEAD(&new_vma->shared);
-			new_vma->vm_start = new_addr;
-			new_vma->vm_end = new_addr+new_len;
-			new_vma->vm_pgoff += (addr-vma->vm_start) >> PAGE_SHIFT;
-			if (new_vma->vm_file)
-				get_file(new_vma->vm_file);
-			if (new_vma->vm_ops && new_vma->vm_ops->open)
-				new_vma->vm_ops->open(new_vma);
-			insert_vm_struct(current->mm, new_vma);
-		}
+	moved_len = move_page_tables(vma, new_addr, old_addr, old_len);
+	if (moved_len < old_len) {
+		/*
+		 * On error, move entries back from new area to old,
+		 * which will succeed since page tables still there,
+		 * and then proceed to unmap new area instead of old.
+		 */
+		move_page_tables(new_vma, old_addr, new_addr, moved_len);
+		vma = new_vma;
+		old_len = new_len;
+		old_addr = new_addr;
+		new_addr = -ENOMEM;
+	}
 
-		/* Conceal VM_ACCOUNT so old reservation is not undone */
-		if (vma->vm_flags & VM_ACCOUNT) {
-			vma->vm_flags &= ~VM_ACCOUNT;
-			if (addr > vma->vm_start) {
-				if (addr + old_len < vma->vm_end)
-					split = 1;
-			} else if (addr + old_len == vma->vm_end)
-				vma = NULL;	/* it will be removed */
-		} else
-			vma = NULL;	/* nothing more to do */
+	/* Conceal VM_ACCOUNT so old reservation is not undone */
+	if (vm_flags & VM_ACCOUNT) {
+		vma->vm_flags &= ~VM_ACCOUNT;
+		excess = vma->vm_end - vma->vm_start - old_len;
+		if (old_addr > vma->vm_start &&
+		    old_addr + old_len < vma->vm_end)
+			split = 1;
+	}
 
-		do_munmap(current->mm, addr, old_len);
+	if (do_munmap(mm, old_addr, old_len) < 0) {
+		/* OOM: unable to split vma, just get accounts right */
+		vm_unacct_memory(excess >> PAGE_SHIFT);
+		excess = 0;
+	}
 
-		/* Restore VM_ACCOUNT if one or two pieces of vma left */
-		if (vma) {
-			vma->vm_flags |= VM_ACCOUNT;
-			if (split)
-				vma->vm_next->vm_flags |= VM_ACCOUNT;
-		}
+	/* Restore VM_ACCOUNT if one or two pieces of vma left */
+	if (excess) {
+		vma->vm_flags |= VM_ACCOUNT;
+		if (split)
+			vma->vm_next->vm_flags |= VM_ACCOUNT;
+	}
 
-		current->mm->total_vm += new_len >> PAGE_SHIFT;
-		if (vm_locked) {
-			current->mm->locked_vm += new_len >> PAGE_SHIFT;
-			if (new_len > old_len)
-				make_pages_present(new_addr + old_len,
-						   new_addr + new_len);
-		}
-		return new_addr;
-	}
-	if (allocated_vma)
-		kmem_cache_free(vm_area_cachep, new_vma);
-out:
-	return -ENOMEM;
+	mm->total_vm += new_len >> PAGE_SHIFT;
+	if (vm_flags & VM_LOCKED) {
+		mm->locked_vm += new_len >> PAGE_SHIFT;
+		if (new_len > old_len)
+			make_pages_present(new_addr + old_len,
+					   new_addr + new_len);
+	}
+
+	return new_addr;
 }
 
 /*