/*
 * mm/mmap.c
 *
 * Written by obz.
 *
 * Address space accounting code	<alan@redhat.com>
 */

#include <linux/slab.h>
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/syscalls.h>
#include <linux/init.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/personality.h>
#include <linux/security.h>
#include <linux/hugetlb.h>
#include <linux/profile.h>
#include <linux/module.h>
#include <linux/mount.h>
#include <linux/mempolicy.h>
#include <linux/rmap.h>

#include <asm/uaccess.h>
#include <asm/cacheflush.h>
#include <asm/tlb.h>

/*
 * WARNING: the debugging will use recursive algorithms so never enable this
 * unless you know what you are doing.
 */
#undef DEBUG_MM_RB

/* description of effects of mapping type and prot in current implementation.
 * this is due to the limited x86 page protection hardware.  The expected
 * behavior is in parens:
 *
 * map_type	prot
 *		PROT_NONE	PROT_READ	PROT_WRITE	PROT_EXEC
 * MAP_SHARED	r: (no) no	r: (yes) yes	r: (no) yes	r: (no) yes
 *		w: (no) no	w: (no) no	w: (yes) yes	w: (no) no
 *		x: (no) no	x: (no) yes	x: (no) yes	x: (yes) yes
 *		
 * MAP_PRIVATE	r: (no) no	r: (yes) yes	r: (no) yes	r: (no) yes
 *		w: (no) no	w: (no) no	w: (copy) copy	w: (no) no
 *		x: (no) no	x: (no) yes	x: (no) yes	x: (yes) yes
 *
 */
pgprot_t protection_map[16] = {
	__P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111,
	__S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111
};
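/*
 * The table is indexed by the low four vm_flags bits
 * (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED); do_mmap_pgoff() below does
 * protection_map[vm_flags & 0x0f] to pick the vma's page protection.
 */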

int sysctl_overcommit_memory = 0;	/* default is heuristic overcommit */
int sysctl_overcommit_ratio = 50;	/* default is 50% */
int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
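/* Running total of committed (accounted) virtual memory, used by the overcommit checks. */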
atomic_t vm_committed_space = ATOMIC_INIT(0);

EXPORT_SYMBOL(sysctl_overcommit_memory);
EXPORT_SYMBOL(sysctl_overcommit_ratio);
EXPORT_SYMBOL(sysctl_max_map_count);
EXPORT_SYMBOL(vm_committed_space);

/*
 * Requires inode->i_mapping->i_mmap_lock
 */
static void __remove_shared_vm_struct(struct vm_area_struct *vma,
		struct file *file, struct address_space *mapping)
{
	if (vma->vm_flags & VM_DENYWRITE)
		atomic_inc(&file->f_dentry->d_inode->i_writecount);
	if (vma->vm_flags & VM_SHARED)
		mapping->i_mmap_writable--;

	flush_dcache_mmap_lock(mapping);
	if (unlikely(vma->vm_flags & VM_NONLINEAR))
		list_del_init(&vma->shared.vm_set.list);
	else
		vma_prio_tree_remove(vma, &mapping->i_mmap);
	flush_dcache_mmap_unlock(mapping);
}

/*
 * Remove one vm structure and free it.
 */
static void remove_vm_struct(struct vm_area_struct *vma)
{
	struct file *file = vma->vm_file;

	if (file) {
		struct address_space *mapping = file->f_mapping;
		spin_lock(&mapping->i_mmap_lock);
		__remove_shared_vm_struct(vma, file, mapping);
		spin_unlock(&mapping->i_mmap_lock);
	}
	if (vma->vm_ops && vma->vm_ops->close)
		vma->vm_ops->close(vma);
	if (file)
		fput(file);
	anon_vma_unlink(vma);
	mpol_free(vma_policy(vma));
	kmem_cache_free(vm_area_cachep, vma);
}

/*
 *  sys_brk() for the most part doesn't need the global kernel
 *  lock, except when an application is doing something nasty
 *  like trying to un-brk an area that has already been mapped
 *  to a regular file.  in this case, the unmapping will need
 *  to invoke file system routines that need the global lock.
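 *
 *  The return value is the resulting (possibly unchanged) mm->brk, not
 *  an error code: callers compare it with the value they requested.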
 */
asmlinkage unsigned long sys_brk(unsigned long brk)
{
	unsigned long rlim, retval;
	unsigned long newbrk, oldbrk;
	struct mm_struct *mm = current->mm;

	down_write(&mm->mmap_sem);

	if (brk < mm->end_code)
		goto out;
	newbrk = PAGE_ALIGN(brk);
	oldbrk = PAGE_ALIGN(mm->brk);
	if (oldbrk == newbrk)
		goto set_brk;

	/* Always allow shrinking brk. */
	if (brk <= mm->brk) {
		if (!do_munmap(mm, newbrk, oldbrk-newbrk))
			goto set_brk;
		goto out;
	}

	/* Check against rlimit.. */
	rlim = current->rlim[RLIMIT_DATA].rlim_cur;
	if (rlim < RLIM_INFINITY && brk - mm->start_data > rlim)
		goto out;

	/* Check against existing mmap mappings. */
	if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE))
		goto out;

	/* Ok, looks good - let it rip. */
	if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk)
		goto out;
set_brk:
	mm->brk = brk;
out:
	retval = mm->brk;
	up_write(&mm->mmap_sem);
	return retval;
}

#ifdef DEBUG_MM_RB
static int browse_rb(struct rb_root *root)
{
	int i = 0, j;
	struct rb_node *nd, *pn = NULL;
	unsigned long prev = 0, pend = 0;

	for (nd = rb_first(root); nd; nd = rb_next(nd)) {
		struct vm_area_struct *vma;
		vma = rb_entry(nd, struct vm_area_struct, vm_rb);
		if (vma->vm_start < prev)
			printk("vm_start %lx prev %lx\n", vma->vm_start, prev), i = -1;
		if (vma->vm_start < pend)
			printk("vm_start %lx pend %lx\n", vma->vm_start, pend);
		if (vma->vm_start > vma->vm_end)
			printk("vm_end %lx < vm_start %lx\n", vma->vm_end, vma->vm_start);
		i++;
		pn = nd;
	}
	j = 0;
	for (nd = pn; nd; nd = rb_prev(nd)) {
		j++;
	}
	if (i != j)
		printk("backwards %d, forwards %d\n", j, i), i = 0;
	return i;
}

void validate_mm(struct mm_struct *mm)
{
	int bug = 0;
	int i = 0;
	struct vm_area_struct *tmp = mm->mmap;
	while (tmp) {
		tmp = tmp->vm_next;
		i++;
	}
	if (i != mm->map_count)
		printk("map_count %d vm_next %d\n", mm->map_count, i), bug = 1;
	i = browse_rb(&mm->mm_rb);
	if (i != mm->map_count)
		printk("map_count %d rb %d\n", mm->map_count, i), bug = 1;
	if (bug)
		BUG();
}
#else
#define validate_mm(mm) do { } while (0)
#endif

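/*
 * Find the first vma that overlaps or follows 'addr', and at the same
 * time work out where a vma covering 'addr' would be linked: *pprev is
 * set to the preceding vma (or NULL), *rb_link/*rb_parent to the rb-tree
 * slot for the insertion.  Used by __insert_vm_struct() and by the
 * munmap_back loop in do_mmap_pgoff().
 */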
static struct vm_area_struct *
find_vma_prepare(struct mm_struct *mm, unsigned long addr,
		struct vm_area_struct **pprev, struct rb_node ***rb_link,
		struct rb_node ** rb_parent)
{
	struct vm_area_struct * vma;
	struct rb_node ** __rb_link, * __rb_parent, * rb_prev;

	__rb_link = &mm->mm_rb.rb_node;
	rb_prev = __rb_parent = NULL;
	vma = NULL;

	while (*__rb_link) {
		struct vm_area_struct *vma_tmp;

		__rb_parent = *__rb_link;
		vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb);

		if (vma_tmp->vm_end > addr) {
			vma = vma_tmp;
			if (vma_tmp->vm_start <= addr)
				return vma;
			__rb_link = &__rb_parent->rb_left;
		} else {
			rb_prev = __rb_parent;
			__rb_link = &__rb_parent->rb_right;
		}
	}

	*pprev = NULL;
	if (rb_prev)
		*pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);
	*rb_link = __rb_link;
	*rb_parent = __rb_parent;
	return vma;
}

static inline void
__vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
		struct vm_area_struct *prev, struct rb_node *rb_parent)
{
	if (prev) {
		vma->vm_next = prev->vm_next;
		prev->vm_next = vma;
	} else {
		mm->mmap = vma;
		if (rb_parent)
			vma->vm_next = rb_entry(rb_parent,
					struct vm_area_struct, vm_rb);
		else
			vma->vm_next = NULL;
	}
}

void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
		struct rb_node **rb_link, struct rb_node *rb_parent)
{
	rb_link_node(&vma->vm_rb, rb_parent, rb_link);
	rb_insert_color(&vma->vm_rb, &mm->mm_rb);
}

static inline void __vma_link_file(struct vm_area_struct *vma)
{
	struct file * file;

	file = vma->vm_file;
	if (file) {
		struct address_space *mapping = file->f_mapping;

		if (vma->vm_flags & VM_DENYWRITE)
			atomic_dec(&file->f_dentry->d_inode->i_writecount);
		if (vma->vm_flags & VM_SHARED)
			mapping->i_mmap_writable++;

		flush_dcache_mmap_lock(mapping);
		if (unlikely(vma->vm_flags & VM_NONLINEAR))
			list_add_tail(&vma->shared.vm_set.list,
					&mapping->i_mmap_nonlinear);
		else
			vma_prio_tree_insert(vma, &mapping->i_mmap);
		flush_dcache_mmap_unlock(mapping);
	}
}

static void
__vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
	struct vm_area_struct *prev, struct rb_node **rb_link,
	struct rb_node *rb_parent)
{
	__vma_link_list(mm, vma, prev, rb_parent);
	__vma_link_rb(mm, vma, rb_link, rb_parent);
	__anon_vma_link(vma);
}

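/*
 * Link a new vma into the mm: takes the file's i_mmap_lock (if any) and
 * then the anon_vma lock around the actual linking into the vma list,
 * the rb-tree, the anon_vma and the address_space.
 */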
static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
			struct vm_area_struct *prev, struct rb_node **rb_link,
			struct rb_node *rb_parent)
{
	struct address_space *mapping = NULL;

	if (vma->vm_file)
		mapping = vma->vm_file->f_mapping;

	if (mapping)
		spin_lock(&mapping->i_mmap_lock);
	anon_vma_lock(vma);

	__vma_link(mm, vma, prev, rb_link, rb_parent);
	__vma_link_file(vma);

	anon_vma_unlock(vma);
	if (mapping)
		spin_unlock(&mapping->i_mmap_lock);

	mark_mm_hugetlb(mm, vma);
	mm->map_count++;
	validate_mm(mm);
}

/*
 * Helper for vma_adjust in the split_vma insert case:
 * insert vm structure into list and rbtree and anon_vma,
 * but it has already been inserted into prio_tree earlier.
 */
static void
__insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
{
	struct vm_area_struct * __vma, * prev;
	struct rb_node ** rb_link, * rb_parent;

	__vma = find_vma_prepare(mm, vma->vm_start,&prev, &rb_link, &rb_parent);
	if (__vma && __vma->vm_start < vma->vm_end)
		BUG();
	__vma_link(mm, vma, prev, rb_link, rb_parent);
	mm->map_count++;
}

static inline void
__vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
		struct vm_area_struct *prev)
{
	prev->vm_next = vma->vm_next;
	rb_erase(&vma->vm_rb, &mm->mm_rb);
	if (mm->mmap_cache == vma)
		mm->mmap_cache = prev;
}

/*
 * We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that
 * is already present in an i_mmap tree without adjusting the tree.
 * The following helper function should be used when such adjustments
 * are necessary.  The "insert" vma (if any) is to be inserted
 * before we drop the necessary locks.
 */
void vma_adjust(struct vm_area_struct *vma, unsigned long start,
	unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert)
{
	struct mm_struct *mm = vma->vm_mm;
	struct vm_area_struct *next = vma->vm_next;
	struct vm_area_struct *importer = NULL;
	struct address_space *mapping = NULL;
	struct prio_tree_root *root = NULL;
	struct file *file = vma->vm_file;
	struct anon_vma *anon_vma = NULL;
	long adjust_next = 0;
	int remove_next = 0;

	if (next && !insert) {
		if (end >= next->vm_end) {
			/*
			 * vma expands, overlapping all the next, and
			 * perhaps the one after too (mprotect case 6).
			 */
again:			remove_next = 1 + (end > next->vm_end);
			end = next->vm_end;
			anon_vma = next->anon_vma;
		} else if (end > next->vm_start) {
			/*
			 * vma expands, overlapping part of the next:
			 * mprotect case 5 shifting the boundary up.
			 */
			adjust_next = (end - next->vm_start) >> PAGE_SHIFT;
			anon_vma = next->anon_vma;
			importer = vma;
		} else if (end < vma->vm_end) {
			/*
			 * vma shrinks, and !insert tells it's not
			 * split_vma inserting another: so it must be
			 * mprotect case 4 shifting the boundary down.
			 */
			adjust_next = - ((vma->vm_end - end) >> PAGE_SHIFT);
			anon_vma = next->anon_vma;
			importer = next;
		}
	}

	if (file) {
		mapping = file->f_mapping;
		if (!(vma->vm_flags & VM_NONLINEAR))
			root = &mapping->i_mmap;
		spin_lock(&mapping->i_mmap_lock);
		if (insert) {
			/*
			 * Put into prio_tree now, so instantiated pages
			 * are visible to arm/parisc __flush_dcache_page
			 * throughout; but we cannot insert into address
			 * space until vma start or end is updated.
			 */
			__vma_link_file(insert);
		}
	}

	/*
	 * When changing only vma->vm_end, we don't really need
	 * anon_vma lock: but is that case worth optimizing out?
	 */
	if (vma->anon_vma)
		anon_vma = vma->anon_vma;
	if (anon_vma) {
		spin_lock(&anon_vma->lock);
		/*
		 * Easily overlooked: when mprotect shifts the boundary,
		 * make sure the expanding vma has anon_vma set if the
		 * shrinking vma had, to cover any anon pages imported.
		 */
		if (importer && !importer->anon_vma) {
			importer->anon_vma = anon_vma;
			__anon_vma_link(importer);
		}
	}

	if (root) {
		flush_dcache_mmap_lock(mapping);
		vma_prio_tree_remove(vma, root);
		if (adjust_next)
			vma_prio_tree_remove(next, root);
	}

	vma->vm_start = start;
	vma->vm_end = end;
	vma->vm_pgoff = pgoff;
	if (adjust_next) {
		next->vm_start += adjust_next << PAGE_SHIFT;
		next->vm_pgoff += adjust_next;
	}

	if (root) {
		if (adjust_next) {
			vma_prio_tree_init(next);
			vma_prio_tree_insert(next, root);
		}
		vma_prio_tree_init(vma);
		vma_prio_tree_insert(vma, root);
		flush_dcache_mmap_unlock(mapping);
	}

	if (remove_next) {
		/*
		 * vma_merge has merged next into vma, and needs
		 * us to remove next before dropping the locks.
		 */
		__vma_unlink(mm, next, vma);
		if (file)
			__remove_shared_vm_struct(next, file, mapping);
		if (next->anon_vma)
			__anon_vma_merge(vma, next);
	} else if (insert) {
		/*
		 * split_vma has split insert from vma, and needs
		 * us to insert it before dropping the locks
		 * (it may either follow vma or precede it).
		 */
		__insert_vm_struct(mm, insert);
	}

	if (anon_vma)
		spin_unlock(&anon_vma->lock);
	if (mapping)
		spin_unlock(&mapping->i_mmap_lock);

	if (remove_next) {
		if (file)
			fput(file);
		mm->map_count--;
		mpol_free(vma_policy(next));
		kmem_cache_free(vm_area_cachep, next);
		/*
		 * In mprotect's case 6 (see comments on vma_merge),
		 * we must remove another next too. It would clutter
		 * up the code too much to do both in one go.
		 */
		if (remove_next == 2) {
			next = vma->vm_next;
			goto again;
		}
	}

	validate_mm(mm);
}

/*
 * If the vma has a ->close operation then the driver probably needs to release
 * per-vma resources, so we don't attempt to merge those.
 */
#define VM_SPECIAL (VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_RESERVED)

static inline int is_mergeable_vma(struct vm_area_struct *vma,
			struct file *file, unsigned long vm_flags)
{
	if (vma->vm_flags != vm_flags)
		return 0;
	if (vma->vm_file != file)
		return 0;
	if (vma->vm_ops && vma->vm_ops->close)
		return 0;
	return 1;
}

static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1,
					struct anon_vma *anon_vma2)
{
	return !anon_vma1 || !anon_vma2 || (anon_vma1 == anon_vma2);
}

/*
 * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff)
 * in front of (at a lower virtual address and file offset than) the vma.
 *
 * We cannot merge two vmas if they have differently assigned (non-NULL)
 * anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
 *
 * We don't check here for the merged mmap wrapping around the end of pagecache
 * indices (16TB on ia32) because do_mmap_pgoff() does not permit mmap's which
 * wrap, nor mmaps which cover the final page at index -1UL.
 */
static int
can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
	struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
{
	if (is_mergeable_vma(vma, file, vm_flags) &&
	    is_mergeable_anon_vma(anon_vma, vma->anon_vma)) {
		if (vma->vm_pgoff == vm_pgoff)
			return 1;
	}
	return 0;
}

/*
 * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff)
 * beyond (at a higher virtual address and file offset than) the vma.
 *
 * We cannot merge two vmas if they have differently assigned (non-NULL)
 * anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
 */
static int
can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
	struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
{
	if (is_mergeable_vma(vma, file, vm_flags) &&
	    is_mergeable_anon_vma(anon_vma, vma->anon_vma)) {
		pgoff_t vm_pglen;
		vm_pglen = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
		if (vma->vm_pgoff + vm_pglen == vm_pgoff)
			return 1;
	}
	return 0;
}

/*
 * Given a mapping request (addr,end,vm_flags,file,pgoff), figure out
 * whether that can be merged with its predecessor or its successor.
 * Or both (it neatly fills a hole).
 *
 * In most cases - when called for mmap, brk or mremap - [addr,end) is
 * certain not to be mapped by the time vma_merge is called; but when
 * called for mprotect, it is certain to be already mapped (either at
 * an offset within prev, or at the start of next), and the flags of
 * this area are about to be changed to vm_flags - and the no-change
 * case has already been eliminated.
 *
 * The following mprotect cases have to be considered, where AAAA is
 * the area passed down from mprotect_fixup, never extending beyond one
 * vma, PPPPPP is the prev vma specified, and NNNNNN the next vma after:
 *
 *     AAAA             AAAA                AAAA          AAAA
 *    PPPPPPNNNNNN    PPPPPPNNNNNN    PPPPPPNNNNNN    PPPPNNNNXXXX
 *    cannot merge    might become    might become    might become
 *                    PPNNNNNNNNNN    PPPPPPPPPPNN    PPPPPPPPPPPP 6 or
 *    mmap, brk or    case 4 below    case 5 below    PPPPPPPPXXXX 7 or
 *    mremap move:                                    PPPPNNNNNNNN 8
 *        AAAA
 *    PPPP    NNNN    PPPPPPPPPPPP    PPPPPPPPNNNN    PPPPNNNNNNNN
 *    might become    case 1 below    case 2 below    case 3 below
 *
 * Odd one out? Case 8, because it extends NNNN but needs flags of XXXX:
 * mprotect_fixup updates vm_flags & vm_page_prot on successful return.
602
 */
603
struct vm_area_struct *vma_merge(struct mm_struct *mm,
604
			struct vm_area_struct *prev, unsigned long addr,
605
			unsigned long end, unsigned long vm_flags,
606 607
		     	struct anon_vma *anon_vma, struct file *file,
			pgoff_t pgoff, struct mempolicy *policy)
Linus Torvalds's avatar
Linus Torvalds committed
608
{
609
	pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
610
	struct vm_area_struct *area, *next;
611

612
	/*
Andrew Morton's avatar
Andrew Morton committed
613 614
	 * We later require that vma->vm_flags == vm_flags,
	 * so this tests vma->vm_flags & VM_SPECIAL, too.
615 616
	 */
	if (vm_flags & VM_SPECIAL)
617
		return NULL;
618

619 620 621
	if (prev)
		next = prev->vm_next;
	else
622
		next = mm->mmap;
623 624 625
	area = next;
	if (next && next->vm_end == end)		/* cases 6, 7, 8 */
		next = next->vm_next;
Linus Torvalds's avatar
Linus Torvalds committed
626

627 628 629
	/*
	 * Can it merge with the predecessor?
	 */
630
	if (prev && prev->vm_end == addr &&
Andrew Morton's avatar
Andrew Morton committed
631
  			mpol_equal(vma_policy(prev), policy) &&
632 633
			can_vma_merge_after(prev, vm_flags,
						anon_vma, file, pgoff)) {
634
		/*
Andrew Morton's avatar
Andrew Morton committed
635
		 * OK, it can.  Can we now merge in the successor as well?
636
		 */
Andrew Morton's avatar
Andrew Morton committed
637 638
		if (next && end == next->vm_start &&
				mpol_equal(policy, vma_policy(next)) &&
639 640 641 642
				can_vma_merge_before(next, vm_flags,
					anon_vma, file, pgoff+pglen) &&
				is_mergeable_anon_vma(prev->anon_vma,
						      next->anon_vma)) {
643
							/* cases 1, 6 */
Andrew Morton's avatar
Andrew Morton committed
644
			vma_adjust(prev, prev->vm_start,
645 646
				next->vm_end, prev->vm_pgoff, NULL);
		} else					/* cases 2, 5, 7 */
Andrew Morton's avatar
Andrew Morton committed
647 648
			vma_adjust(prev, prev->vm_start,
				end, prev->vm_pgoff, NULL);
649
		return prev;
Linus Torvalds's avatar
Linus Torvalds committed
650 651
	}

652
	/*
Andrew Morton's avatar
Andrew Morton committed
653
	 * Can this new request be merged in front of next?
654
	 */
655 656
	if (next && end == next->vm_start &&
 			mpol_equal(policy, vma_policy(next)) &&
657 658
			can_vma_merge_before(next, vm_flags,
					anon_vma, file, pgoff+pglen)) {
659 660 661 662 663
		if (prev && addr < prev->vm_end)	/* case 4 */
			vma_adjust(prev, prev->vm_start,
				addr, prev->vm_pgoff, NULL);
		else					/* cases 3, 8 */
			vma_adjust(area, addr, next->vm_end,
664
				next->vm_pgoff - pglen, NULL);
665
		return area;
Linus Torvalds's avatar
Linus Torvalds committed
666 667
	}

668
	return NULL;
Linus Torvalds's avatar
Linus Torvalds committed
669 670
}

671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735
/*
 * find_mergeable_anon_vma is used by anon_vma_prepare, to check
 * neighbouring vmas for a suitable anon_vma, before it goes off
 * to allocate a new anon_vma.  It checks because a repetitive
 * sequence of mprotects and faults may otherwise lead to distinct
 * anon_vmas being allocated, preventing vma merge in subsequent
 * mprotect.
 */
struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma)
{
	struct vm_area_struct *near;
	unsigned long vm_flags;

	near = vma->vm_next;
	if (!near)
		goto try_prev;

	/*
	 * Since only mprotect tries to remerge vmas, match flags
	 * which might be mprotected into each other later on.
	 * Neither mlock nor madvise tries to remerge at present,
	 * so leave their flags as obstructing a merge.
	 */
	vm_flags = vma->vm_flags & ~(VM_READ|VM_WRITE|VM_EXEC);
	vm_flags |= near->vm_flags & (VM_READ|VM_WRITE|VM_EXEC);

	if (near->anon_vma && vma->vm_end == near->vm_start &&
 			mpol_equal(vma_policy(vma), vma_policy(near)) &&
			can_vma_merge_before(near, vm_flags,
				NULL, vma->vm_file, vma->vm_pgoff +
				((vma->vm_end - vma->vm_start) >> PAGE_SHIFT)))
		return near->anon_vma;
try_prev:
	/*
	 * It is potentially slow to have to call find_vma_prev here.
	 * But it's only on the first write fault on the vma, not
	 * every time, and we could devise a way to avoid it later
	 * (e.g. stash info in next's anon_vma_node when assigning
	 * an anon_vma, or when trying vma_merge).  Another time.
	 */
	if (find_vma_prev(vma->vm_mm, vma->vm_start, &near) != vma)
		BUG();
	if (!near)
		goto none;

	vm_flags = vma->vm_flags & ~(VM_READ|VM_WRITE|VM_EXEC);
	vm_flags |= near->vm_flags & (VM_READ|VM_WRITE|VM_EXEC);

	if (near->anon_vma && near->vm_end == vma->vm_start &&
  			mpol_equal(vma_policy(near), vma_policy(vma)) &&
			can_vma_merge_after(near, vm_flags,
				NULL, vma->vm_file, vma->vm_pgoff))
		return near->anon_vma;
none:
	/*
	 * There's no absolute need to look only at touching neighbours:
	 * we could search further afield for "compatible" anon_vmas.
	 * But it would probably just be a waste of time searching,
	 * or lead to too many vmas hanging off the same anon_vma.
	 * We're trying to allow mprotect remerging later on,
	 * not trying to minimize memory used for anon_vmas.
	 */
	return NULL;
}

Andrew Morton's avatar
Andrew Morton committed
736 737 738 739
/*
 * The caller must hold down_write(current->mm->mmap_sem).
 */

Andrew Morton's avatar
Andrew Morton committed
740 741 742
unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
			unsigned long len, unsigned long prot,
			unsigned long flags, unsigned long pgoff)
Linus Torvalds's avatar
Linus Torvalds committed
743 744
{
	struct mm_struct * mm = current->mm;
Linus Torvalds's avatar
Linus Torvalds committed
745
	struct vm_area_struct * vma, * prev;
746
	struct inode *inode;
Linus Torvalds's avatar
Linus Torvalds committed
747
	unsigned int vm_flags;
Linus Torvalds's avatar
Linus Torvalds committed
748 749
	int correct_wcount = 0;
	int error;
750
	struct rb_node ** rb_link, * rb_parent;
751
	int accountable = 1;
Andrew Morton's avatar
Andrew Morton committed
752
	unsigned long charged = 0;
Linus Torvalds's avatar
Linus Torvalds committed
753

754
	if (file) {
755 756 757
		if (is_file_hugepages(file))
			accountable = 0;

758 759
		if (!file->f_op || !file->f_op->mmap)
			return -ENODEV;
Linus Torvalds's avatar
Linus Torvalds committed
760

761 762
		if ((prot & PROT_EXEC) &&
		    (file->f_vfsmnt->mnt_flags & MNT_NOEXEC))
763 764
			return -EPERM;
	}
765

766
	if (!len)
Linus Torvalds's avatar
Linus Torvalds committed
767 768
		return addr;

769
	/* Careful about overflows.. */
770
	len = PAGE_ALIGN(len);
771 772
	if (!len || len > TASK_SIZE)
		return -EINVAL;
773

Linus Torvalds's avatar
Linus Torvalds committed
774 775 776 777 778
	/* offset overflow? */
	if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
		return -EINVAL;

	/* Too many mappings? */
779
	if (mm->map_count > sysctl_max_map_count)
Linus Torvalds's avatar
Linus Torvalds committed
780 781
		return -ENOMEM;

Linus Torvalds's avatar
Linus Torvalds committed
782 783 784
	/* Obtain the address to map to. we verify (or select) it and ensure
	 * that it represents a valid section of the address space.
	 */
Linus Torvalds's avatar
Linus Torvalds committed
785 786 787
	addr = get_unmapped_area(file, addr, len, pgoff, flags);
	if (addr & ~PAGE_MASK)
		return addr;
Linus Torvalds's avatar
Linus Torvalds committed
788 789 790 791 792

	/* Do simple checking here so the lower-level routines won't have
	 * to. we assume access permissions have been handled by the open
	 * of the memory object, so we don't do any here.
	 */
793 794
	vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |
			mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
Linus Torvalds's avatar
Linus Torvalds committed
795

Andrew Morton's avatar
Andrew Morton committed
796 797 798 799 800
	if (flags & MAP_LOCKED) {
		if (!capable(CAP_IPC_LOCK))
			return -EPERM;
		vm_flags |= VM_LOCKED;
	}
Linus Torvalds's avatar
Linus Torvalds committed
801
	/* mlock MCL_FUTURE? */
Linus Torvalds's avatar
Linus Torvalds committed
802
	if (vm_flags & VM_LOCKED) {
Linus Torvalds's avatar
Linus Torvalds committed
803 804 805 806 807 808
		unsigned long locked = mm->locked_vm << PAGE_SHIFT;
		locked += len;
		if (locked > current->rlim[RLIMIT_MEMLOCK].rlim_cur)
			return -EAGAIN;
	}

809 810
	inode = file ? file->f_dentry->d_inode : NULL;

Linus Torvalds's avatar
Linus Torvalds committed
811
	if (file) {
Linus Torvalds's avatar
Linus Torvalds committed
812 813
		switch (flags & MAP_TYPE) {
		case MAP_SHARED:
814
			if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE))
Linus Torvalds's avatar
Linus Torvalds committed
815 816
				return -EACCES;

817 818 819 820
			/*
			 * Make sure we don't allow writing to an append-only
			 * file..
			 */
821
			if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE))
Linus Torvalds's avatar
Linus Torvalds committed
822 823
				return -EACCES;

824 825 826
			/*
			 * Make sure there are no mandatory locks on the file.
			 */
827
			if (locks_verify_locked(inode))
Linus Torvalds's avatar
Linus Torvalds committed
828 829
				return -EAGAIN;

Linus Torvalds's avatar
Linus Torvalds committed
830 831 832 833
			vm_flags |= VM_SHARED | VM_MAYSHARE;
			if (!(file->f_mode & FMODE_WRITE))
				vm_flags &= ~(VM_MAYWRITE | VM_SHARED);

Linus Torvalds's avatar
Linus Torvalds committed
834 835 836 837 838 839 840 841 842
			/* fall through */
		case MAP_PRIVATE:
			if (!(file->f_mode & FMODE_READ))
				return -EACCES;
			break;

		default:
			return -EINVAL;
		}
Linus Torvalds's avatar
Linus Torvalds committed
843 844 845
	} else {
		switch (flags & MAP_TYPE) {
		case MAP_SHARED:
846 847 848
			vm_flags |= VM_SHARED | VM_MAYSHARE;
			break;
		case MAP_PRIVATE:
849 850 851 852
			/*
			 * Set pgoff according to addr for anon_vma.
			 */
			pgoff = addr >> PAGE_SHIFT;
Linus Torvalds's avatar
Linus Torvalds committed
853
			break;
854 855
		default:
			return -EINVAL;
Linus Torvalds's avatar
Linus Torvalds committed
856
		}
Linus Torvalds's avatar
Linus Torvalds committed
857 858
	}

859 860
	error = security_file_mmap(file, prot, flags);
	if (error)
861 862
		return error;
		
Linus Torvalds's avatar
Linus Torvalds committed
863 864
	/* Clear old maps */
	error = -ENOMEM;
Linus Torvalds's avatar
Linus Torvalds committed
865 866 867
munmap_back:
	vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
	if (vma && vma->vm_start < addr + len) {
868
		if (do_munmap(mm, addr, len))
Linus Torvalds's avatar
Linus Torvalds committed
869 870 871
			return -ENOMEM;
		goto munmap_back;
	}
Linus Torvalds's avatar
Linus Torvalds committed
872 873 874 875 876 877

	/* Check against address space limit. */
	if ((mm->total_vm << PAGE_SHIFT) + len
	    > current->rlim[RLIMIT_AS].rlim_cur)
		return -ENOMEM;

878 879
	if (accountable && (!(flags & MAP_NORESERVE) ||
			sysctl_overcommit_memory > 1)) {
880 881 882 883
		if (vm_flags & VM_SHARED) {
			/* Check memory availability in shmem_file_setup? */
			vm_flags |= VM_ACCOUNT;
		} else if (vm_flags & VM_WRITE) {
884 885 886
			/*
			 * Private writable mapping: check memory availability
			 */
887
			charged = len >> PAGE_SHIFT;
888
			if (security_vm_enough_memory(charged))
889 890 891
				return -ENOMEM;
			vm_flags |= VM_ACCOUNT;
		}
Andrew Morton's avatar
Andrew Morton committed
892
	}
Linus Torvalds's avatar
Linus Torvalds committed
893

894 895 896 897 898 899
	/*
	 * Can we just expand an old private anonymous mapping?
	 * The VM_SHARED test is necessary because shmem_zero_setup
	 * will create the file object for a shared anonymous map below.
	 */
	if (!file && !(vm_flags & VM_SHARED) &&
900 901
	    vma_merge(mm, prev, addr, addr + len, vm_flags,
					NULL, NULL, pgoff, NULL))
902
		goto out;
Linus Torvalds's avatar
Linus Torvalds committed
903

904 905
	/*
	 * Determine the object being mapped and call the appropriate
Linus Torvalds's avatar
Linus Torvalds committed
906 907 908 909
	 * specific mapper. the address has already been validated, but
	 * not unmapped, but the maps are removed from the list.
	 */
	vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
Andrew Morton's avatar
Andrew Morton committed
910 911
	if (!vma) {
		error = -ENOMEM;
Andrew Morton's avatar
Andrew Morton committed
912
		goto unacct_error;
Andrew Morton's avatar
Andrew Morton committed
913 914
	}
	memset(vma, 0, sizeof(*vma));
Linus Torvalds's avatar
Linus Torvalds committed
915 916 917 918

	vma->vm_mm = mm;
	vma->vm_start = addr;
	vma->vm_end = addr + len;
Linus Torvalds's avatar
Linus Torvalds committed
919 920
	vma->vm_flags = vm_flags;
	vma->vm_page_prot = protection_map[vm_flags & 0x0f];
Linus Torvalds's avatar
Linus Torvalds committed
921 922 923
	vma->vm_pgoff = pgoff;

	if (file) {
Linus Torvalds's avatar
Linus Torvalds committed
924 925 926
		error = -EINVAL;
		if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
			goto free_vma;
Linus Torvalds's avatar
Linus Torvalds committed
927
		if (vm_flags & VM_DENYWRITE) {
Linus Torvalds's avatar
Linus Torvalds committed
928 929 930 931 932 933 934 935 936 937
			error = deny_write_access(file);
			if (error)
				goto free_vma;
			correct_wcount = 1;
		}
		vma->vm_file = file;
		get_file(file);
		error = file->f_op->mmap(file, vma);
		if (error)
			goto unmap_and_free_vma;
938
	} else if (vm_flags & VM_SHARED) {
Linus Torvalds's avatar
Linus Torvalds committed
939 940 941 942 943
		error = shmem_zero_setup(vma);
		if (error)
			goto free_vma;
	}

944 945 946 947 948 949 950 951
	/* We set VM_ACCOUNT in a shared mapping's vm_flags, to inform
	 * shmem_zero_setup (perhaps called through /dev/zero's ->mmap)
	 * that memory reservation must be checked; but that reservation
	 * belongs to shared memory object, not to vma: so now clear it.
	 */
	if ((vm_flags & (VM_SHARED|VM_ACCOUNT)) == (VM_SHARED|VM_ACCOUNT))
		vma->vm_flags &= ~VM_ACCOUNT;

Linus Torvalds's avatar
Linus Torvalds committed
952 953 954 955 956 957 958
	/* Can addr have changed??
	 *
	 * Answer: Yes, several device drivers can do it in their
	 *         f_op->mmap method. -DaveM
	 */
	addr = vma->vm_start;

959
	if (!file || !vma_merge(mm, prev, addr, vma->vm_end,
960
			vma->vm_flags, NULL, file, pgoff, vma_policy(vma))) {
961 962 963 964 965 966 967 968 969
		vma_link(mm, vma, prev, rb_link, rb_parent);
		if (correct_wcount)
			atomic_inc(&inode->i_writecount);
	} else {
		if (file) {
			if (correct_wcount)
				atomic_inc(&inode->i_writecount);
			fput(file);
		}
970
		mpol_free(vma_policy(vma));
971 972
		kmem_cache_free(vm_area_cachep, vma);
	}
Linus Torvalds's avatar
Linus Torvalds committed
973
out:	
Linus Torvalds's avatar
Linus Torvalds committed
974
	mm->total_vm += len >> PAGE_SHIFT;
Linus Torvalds's avatar
Linus Torvalds committed
975
	if (vm_flags & VM_LOCKED) {
Linus Torvalds's avatar
Linus Torvalds committed
976 977 978
		mm->locked_vm += len >> PAGE_SHIFT;
		make_pages_present(addr, addr + len);
	}
Andrew Morton's avatar
Andrew Morton committed
979 980
	if (flags & MAP_POPULATE) {
		up_write(&mm->mmap_sem);
981
		sys_remap_file_pages(addr, len, 0,
Andrew Morton's avatar
Andrew Morton committed
982 983 984
					pgoff, flags & MAP_NONBLOCK);
		down_write(&mm->mmap_sem);
	}
Linus Torvalds's avatar
Linus Torvalds committed
985 986 987 988
	return addr;

unmap_and_free_vma:
	if (correct_wcount)
989
		atomic_inc(&inode->i_writecount);
Linus Torvalds's avatar
Linus Torvalds committed
990 991
	vma->vm_file = NULL;
	fput(file);
Linus Torvalds's avatar
Linus Torvalds committed
992

Linus Torvalds's avatar
Linus Torvalds committed
993
	/* Undo any partial mapping done by a device driver. */
994
	zap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start, NULL);
Linus Torvalds's avatar
Linus Torvalds committed
995 996
free_vma:
	kmem_cache_free(vm_area_cachep, vma);
Andrew Morton's avatar
Andrew Morton committed
997 998 999
unacct_error:
	if (charged)
		vm_unacct_memory(charged);
Linus Torvalds's avatar
Linus Torvalds committed
1000 1001 1002
	return error;
}

1003 1004
EXPORT_SYMBOL(do_mmap_pgoff);

Linus Torvalds's avatar
Linus Torvalds committed
1005
/* Get an address range which is currently unmapped.
Linus Torvalds's avatar
Linus Torvalds committed
1006 1007 1008 1009 1010 1011 1012 1013 1014
 * For shmat() with addr=0.
 *
 * Ugly calling convention alert:
 * Return value with the low bits set means error value,
 * ie
 *	if (ret & ~PAGE_MASK)
 *		error = ret;
 *
 * This function "knows" that -ENOMEM has the bits set.
Linus Torvalds's avatar
Linus Torvalds committed
1015 1016
 */
#ifndef HAVE_ARCH_UNMAPPED_AREA
1017 1018 1019
static inline unsigned long
arch_get_unmapped_area(struct file *filp, unsigned long addr,
		unsigned long len, unsigned long pgoff, unsigned long flags)
Linus Torvalds's avatar
Linus Torvalds committed
1020
{
1021
	struct mm_struct *mm = current->mm;
Linus Torvalds's avatar
Linus Torvalds committed
1022
	struct vm_area_struct *vma;
1023
	unsigned long start_addr;
Linus Torvalds's avatar
Linus Torvalds committed
1024 1025

	if (len > TASK_SIZE)
Linus Torvalds's avatar
Linus Torvalds committed
1026
		return -ENOMEM;
Linus Torvalds's avatar
Linus Torvalds committed
1027 1028 1029

	if (addr) {
		addr = PAGE_ALIGN(addr);
1030
		vma = find_vma(mm, addr);
Linus Torvalds's avatar
Linus Torvalds committed
1031 1032 1033
		if (TASK_SIZE - len >= addr &&
		    (!vma || addr + len <= vma->vm_start))
			return addr;
1034 1035
	}
	start_addr = addr = mm->free_area_cache;
Linus Torvalds's avatar
Linus Torvalds committed
1036

1037
full_search:
1038
	for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
Linus Torvalds's avatar
Linus Torvalds committed
1039
		/* At this point:  (!vma || addr < vma->vm_end). */
1040 1041 1042 1043 1044 1045 1046 1047 1048
		if (TASK_SIZE - len < addr) {
			/*
			 * Start a new search - just in case we missed
			 * some holes.
			 */
			if (start_addr != TASK_UNMAPPED_BASE) {
				start_addr = addr = TASK_UNMAPPED_BASE;
				goto full_search;
			}
Linus Torvalds's avatar
Linus Torvalds committed
1049
			return -ENOMEM;
1050
		}
1051 1052 1053 1054 1055
		if (!vma || addr + len <= vma->vm_start) {
			/*
			 * Remember the place where we stopped the search:
			 */
			mm->free_area_cache = addr + len;
Linus Torvalds's avatar
Linus Torvalds committed
1056
			return addr;
1057
		}
Linus Torvalds's avatar
Linus Torvalds committed
1058
		addr = vma->vm_end;
Linus Torvalds's avatar
Linus Torvalds committed
1059 1060
	}
}
Linus Torvalds's avatar
Linus Torvalds committed
1061
#else
1062 1063 1064
extern unsigned long
arch_get_unmapped_area(struct file *, unsigned long, unsigned long,
			unsigned long, unsigned long);
Linus Torvalds's avatar
Linus Torvalds committed
1065 1066
#endif	

1067 1068 1069
unsigned long
get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
		unsigned long pgoff, unsigned long flags)
Linus Torvalds's avatar
Linus Torvalds committed
1070 1071
{
	if (flags & MAP_FIXED) {
1072 1073
		unsigned long ret;

Linus Torvalds's avatar
Linus Torvalds committed
1074
		if (addr > TASK_SIZE - len)
1075
			return -ENOMEM;
Linus Torvalds's avatar
Linus Torvalds committed
1076 1077
		if (addr & ~PAGE_MASK)
			return -EINVAL;
1078 1079
		if (file && is_file_hugepages(file))  {
			/*
1080 1081
			 * Check if the given range is hugepage aligned, and
			 * can be made suitable for hugepages.
1082
			 */
1083
			ret = prepare_hugepage_range(addr, len);
1084 1085 1086 1087 1088 1089
		} else {
			/*
			 * Ensure that a normal request is not falling in a
			 * reserved hugepage range.  For some archs like IA-64,
			 * there is a separate region for hugepages.
			 */
1090
			ret = is_hugepage_only_range(addr, len);
1091
		}
1092
		if (ret)
1093
			return -EINVAL;
Linus Torvalds's avatar
Linus Torvalds committed
1094 1095 1096 1097
		return addr;
	}

	if (file && file->f_op && file->f_op->get_unmapped_area)
		return file->f_op->get_unmapped_area(file, addr, len,
						pgoff, flags);

	return arch_get_unmapped_area(file, addr, len, pgoff, flags);
}

EXPORT_SYMBOL(get_unmapped_area);

/* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */
struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr)
{
	struct vm_area_struct *vma = NULL;

	if (mm) {
		/* Check the cache first. */
		/* (Cache hit rate is typically around 35%.) */
		vma = mm->mmap_cache;
		if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
			struct rb_node * rb_node;

			rb_node = mm->mm_rb.rb_node;
			vma = NULL;

			while (rb_node) {
				struct vm_area_struct * vma_tmp;

				vma_tmp = rb_entry(rb_node,
						struct vm_area_struct, vm_rb);

				if (vma_tmp->vm_end > addr) {
					vma = vma_tmp;
					if (vma_tmp->vm_start <= addr)
						break;
					rb_node = rb_node->rb_left;
				} else
					rb_node = rb_node->rb_right;
			}
			if (vma)
				mm->mmap_cache = vma;
		}
	}
	return vma;
}

EXPORT_SYMBOL(find_vma);

/* Same as find_vma, but also return a pointer to the previous VMA in *pprev. */
struct vm_area_struct *
find_vma_prev(struct mm_struct *mm, unsigned long addr,
			struct vm_area_struct **pprev)
{
	struct vm_area_struct *vma = NULL, *prev = NULL;
	struct rb_node * rb_node;
	if (!mm)
		goto out;

	/* Guard against addr being lower than the first VMA */
	vma = mm->mmap;

	/* Go through the RB tree quickly. */
	rb_node = mm->mm_rb.rb_node;

	while (rb_node) {
		struct vm_area_struct *vma_tmp;
		vma_tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb);

		if (addr < vma_tmp->vm_end) {
			rb_node = rb_node->rb_left;
		} else {
			prev = vma_tmp;
			if (!prev->vm_next || (addr < prev->vm_next->vm_end))
				break;
			rb_node = rb_node->rb_right;
		}
	}

out:
	*pprev = prev;
	return prev ? prev->vm_next : vma;
}

#ifdef CONFIG_STACK_GROWSUP
/*
 * vma is the first one with address > vma->vm_end.  Have to extend vma.
 */
int expand_stack(struct vm_area_struct * vma, unsigned long address)
{
	unsigned long grow;

	if (!(vma->vm_flags & VM_GROWSUP))
		return -EFAULT;

	/*
	 * We must make sure the anon_vma is allocated
	 * so that the anon_vma locking is not a noop.
	 */
	if (unlikely(anon_vma_prepare(vma)))
		return -ENOMEM;
	anon_vma_lock(vma);

	/*
	 * vma->vm_start/vm_end cannot change under us because the caller
	 * is required to hold the mmap_sem in read mode.  We need the
	 * anon_vma lock to serialize against concurrent expand_stacks.
	 */
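	/* Equivalent to rounding address + 4 up to the next page boundary. */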
	address += 4 + PAGE_SIZE - 1;
	address &= PAGE_MASK;
	grow = (address - vma->vm_end) >> PAGE_SHIFT;

	/* Overcommit.. */
	if (security_vm_enough_memory(grow)) {
		anon_vma_unlock(vma);
		return -ENOMEM;
	}
	
	if (address - vma->vm_start > current->rlim[RLIMIT_STACK].rlim_cur ||
			((vma->vm_mm->total_vm + grow) << PAGE_SHIFT) >
			current->rlim[RLIMIT_AS].rlim_cur) {
		anon_vma_unlock(vma);
		vm_unacct_memory(grow);
		return -ENOMEM;
	}
	vma->vm_end = address;
	vma->vm_mm->total_vm += grow;
	if (vma->vm_flags & VM_LOCKED)
		vma->vm_mm->locked_vm += grow;
	anon_vma_unlock(vma);
	return 0;
}

struct vm_area_struct *
find_extend_vma(struct mm_struct *mm, unsigned long addr)
{
	struct vm_area_struct *vma, *prev;

	addr &= PAGE_MASK;
	vma = find_vma_prev(mm, addr, &prev);
	if (vma && (vma->vm_start <= addr))
		return vma;
	if (!prev || expand_stack(prev, addr))
		return NULL;
	if (prev->vm_flags & VM_LOCKED) {
		make_pages_present(addr, prev->vm_end);
	}
	return prev;
}
#else
/*
 * vma is the first one with address < vma->vm_start.  Have to extend vma.
 */
int expand_stack(struct vm_area_struct *vma, unsigned long address)
{
	unsigned long grow;

	/*
	 * We must make sure the anon_vma is allocated
	 * so that the anon_vma locking is not a noop.
	 */
	if (unlikely(anon_vma_prepare(vma)))
		return -ENOMEM;
	anon_vma_lock(vma);

	/*
	 * vma->vm_start/vm_end cannot change under us because the caller
	 * is required to hold the mmap_sem in read mode.  We need the
	 * anon_vma lock to serialize against concurrent expand_stacks.
	 */
	address &= PAGE_MASK;
	grow = (vma->vm_start - address) >> PAGE_SHIFT;

	/* Overcommit.. */
	if (security_vm_enough_memory(grow)) {
		anon_vma_unlock(vma);
		return -ENOMEM;
	}
	
	if (vma->vm_end - address > current->rlim[RLIMIT_STACK].rlim_cur ||
			((vma->vm_mm->total_vm + grow) << PAGE_SHIFT) >
			current->rlim[RLIMIT_AS].rlim_cur) {
		anon_vma_unlock(vma);
		vm_unacct_memory(grow);
		return -ENOMEM;
	}
	vma->vm_start = address;
	vma->vm_pgoff -= grow;
	vma->vm_mm->total_vm += grow;
	if (vma->vm_flags & VM_LOCKED)
		vma->vm_mm->locked_vm += grow;
	anon_vma_unlock(vma);
	return 0;
}

struct vm_area_struct *
find_extend_vma(struct mm_struct * mm, unsigned long addr)
{
	struct vm_area_struct * vma;
	unsigned long start;

	addr &= PAGE_MASK;
	vma = find_vma(mm,addr);
	if (!vma)
		return NULL;
	if (vma->vm_start <= addr)
		return vma;
	if (!(vma->vm_flags & VM_GROWSDOWN))
		return NULL;
	start = vma->vm_start;
	if (expand_stack(vma, addr))
		return NULL;
	if (vma->vm_flags & VM_LOCKED) {
		make_pages_present(addr, start);
	}
	return vma;
}
#endif

/*
 * Try to free as many page directory entries as we can,
 * without having to work very hard at actually scanning
 * the page tables themselves.
 *
 * Right now we try to free page tables if we have a nice
 * PGDIR-aligned area that got free'd up. We could be more
 * granular if we want to, but this is fast and simple,
 * and covers the bad cases.
 *
 * "prev", if it exists, points to a vma before the one
 * we just free'd - but there's no telling how much before.
 */
static void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *prev,
	unsigned long start, unsigned long end)
{
	unsigned long first = start & PGDIR_MASK;
	unsigned long last = end + PGDIR_SIZE - 1;
	unsigned long start_index, end_index;
	struct mm_struct *mm = tlb->mm;

	if (!prev) {
		prev = mm->mmap;
		if (!prev)
			goto no_mmaps;
		if (prev->vm_end > start) {
			if (last > prev->vm_start)
				last = prev->vm_start;
			goto no_mmaps;
		}
	}
	for (;;) {
		struct vm_area_struct *next = prev->vm_next;

		if (next) {
			if (next->vm_start < start) {
				prev = next;
				continue;
			}
			if (last > next->vm_start)
				last = next->vm_start;
		}
		if (prev->vm_end > first)
			first = prev->vm_end + PGDIR_SIZE - 1;
		break;
	}
no_mmaps:
	if (last < first)	/* for arches with discontiguous pgd indices */
		return;
	/*
	 * If the PGD bits are not consecutive in the virtual address, the
	 * old method of shifting the VA >> by PGDIR_SHIFT doesn't work.
	 */
	start_index = pgd_index(first);
	if (start_index < FIRST_USER_PGD_NR)
		start_index = FIRST_USER_PGD_NR;
	end_index = pgd_index(last);
	if (end_index > start_index) {
		clear_page_tables(tlb, start_index, end_index - start_index);
		flush_tlb_pgtables(mm, first & PGDIR_MASK, last & PGDIR_MASK);
	}
}

/* Normal function to fix up a mapping
 * This function is the default for when an area has no specific
 * function.  This may be used as part of a more specific routine.
 *
 * By the time this function is called, the area struct has been
 * removed from the process mapping list.
 */
static void unmap_vma(struct mm_struct *mm, struct vm_area_struct *area)
{
	size_t len = area->vm_end - area->vm_start;

	area->vm_mm->total_vm -= len >> PAGE_SHIFT;
	if (area->vm_flags & VM_LOCKED)
		area->vm_mm->locked_vm -= len >> PAGE_SHIFT;
	/*
	 * Is this a new hole at the lowest possible address?
	 */
	if (area->vm_start >= TASK_UNMAPPED_BASE &&
				area->vm_start < area->vm_mm->free_area_cache)
	      area->vm_mm->free_area_cache = area->vm_start;

	remove_vm_struct(area);
}

/*
 * Update the VMA and inode share lists.
 *
 * Ok - we have the memory areas we should free on the 'free' list,
 * so release them, and do the vma updates.
 */
static void unmap_vma_list(struct mm_struct *mm,
	struct vm_area_struct *mpnt)
{
	do {
		struct vm_area_struct *next = mpnt->vm_next;
		unmap_vma(mm, mpnt);
		mpnt = next;
	} while (mpnt != NULL);
	validate_mm(mm);
}

/*
 * Get rid of page table information in the indicated region.
 *
 * Called with the page table lock held.
 */
static void unmap_region(struct mm_struct *mm,
	struct vm_area_struct *vma,
	struct vm_area_struct *prev,
	unsigned long start,
	unsigned long end)
{
	struct mmu_gather *tlb;
	unsigned long nr_accounted = 0;

	lru_add_drain();
	tlb = tlb_gather_mmu(mm, 0);
	unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted, NULL);
	vm_unacct_memory(nr_accounted);

	if (is_hugepage_only_range(start, end - start))
		hugetlb_free_pgtables(tlb, prev, start, end);
	else
		free_pgtables(tlb, prev, start, end);
	tlb_finish_mmu(tlb, start, end);
}

/*
 * Create a list of vma's touched by the unmap, removing them from the mm's
 * vma list as we go.
 */
static void
detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
	struct vm_area_struct *prev, unsigned long end)
{
	struct vm_area_struct **insertion_point;
	struct vm_area_struct *tail_vma = NULL;

	insertion_point = (prev ? &prev->vm_next : &mm->mmap);
	do {
		rb_erase(&vma->vm_rb, &mm->mm_rb);
		mm->map_count--;
		tail_vma = vma;
		vma = vma->vm_next;
	} while (vma && vma->vm_start < end);
	*insertion_point = vma;
	tail_vma->vm_next = NULL;
	mm->mmap_cache = NULL;		/* Kill the cache. */
}

/*
 * Split a vma into two pieces at address 'addr', a new vma is allocated
 * either for the first part or the tail.
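 * If 'new_below' is set the new vma becomes the lower piece, ending at
 * 'addr'; otherwise it becomes the tail starting at 'addr'.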
 */
int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
	      unsigned long addr, int new_below)
{
	struct mempolicy *pol;
	struct vm_area_struct *new;

	if (mm->map_count >= sysctl_max_map_count)
		return -ENOMEM;

	new = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
	if (!new)
		return -ENOMEM;

	/* most fields are the same, copy all, and then fixup */
	*new = *vma;
	vma_prio_tree_init(new);

	if (new_below)
		new->vm_end = addr;
	else {
		new->vm_start = addr;
		new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT);
	}

	pol = mpol_copy(vma_policy(vma));
	if (IS_ERR(pol)) {
		kmem_cache_free(vm_area_cachep, new);
		return PTR_ERR(pol);
	}
	vma_set_policy(new, pol);

	if (new->vm_file)
		get_file(new->vm_file);

	if (new->vm_ops && new->vm_ops->open)
		new->vm_ops->open(new);

	if (new_below)
		vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff +
			((addr - new->vm_start) >> PAGE_SHIFT), new);
	else
		vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new);

	return 0;
}

/* Munmap is split into 2 main parts -- this part which finds
 * what needs doing, and the areas themselves, which do the
 * work.  This now handles partial unmappings.
 * Jeremy Fitzhardinge <jeremy@goop.org>
1521
 */
1522
int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538
{
	unsigned long end;
	struct vm_area_struct *mpnt, *prev, *last;

	if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start)
		return -EINVAL;

	if ((len = PAGE_ALIGN(len)) == 0)
		return -EINVAL;

	/* Find the first overlapping VMA */
	mpnt = find_vma_prev(mm, start, &prev);
	if (!mpnt)
		return 0;
	/* we have  start < mpnt->vm_end  */

	if (is_vm_hugetlb_page(mpnt)) {
		int ret = is_aligned_hugepage_range(start, len);

		if (ret)
			return ret;
	}

	/* if it doesn't overlap, we have nothing.. */
	end = start + len;
	if (mpnt->vm_start >= end)
		return 0;

	/* Something will probably happen, so notify. */
	if (mpnt->vm_file && (mpnt->vm_flags & VM_EXEC))
		profile_exec_unmap(mm);

	/*
	 * If we need to split any vma, do it now to save pain later.
	 *
	 * Note: mremap's move_vma VM_ACCOUNT handling assumes a partially
	 * unmapped vm_area_struct will remain in use: so lower split_vma
	 * places tmp vma above, and higher split_vma places tmp vma below.
	 */
	if (start > mpnt->vm_start) {
		if (split_vma(mm, mpnt, start, 0))
			return -ENOMEM;
		prev = mpnt;
	}

	/* Does it split the last one? */
	last = find_vma(mm, end);
	if (last && end > last->vm_start) {
		if (split_vma(mm, last, end, 1))
			return -ENOMEM;
	}
	mpnt = prev? prev->vm_next: mm->mmap;

	/*
	 * Remove the vma's, and unmap the actual pages
	 */
	detach_vmas_to_be_unmapped(mm, mpnt, prev, end);
	spin_lock(&mm->page_table_lock);
	unmap_region(mm, mpnt, prev, start, end);
	spin_unlock(&mm->page_table_lock);

	/* Fix up all other VM information */
	unmap_vma_list(mm, mpnt);

	return 0;
}

EXPORT_SYMBOL(do_munmap);

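/*
 * Worked example of a partial unmap (illustrative only): with a single
 * vma covering [0x1000000, 0x1008000), do_munmap(mm, 0x1002000, 0x2000)
 * calls split_vma() at 0x1002000 and again at 0x1004000, then detaches
 * and unmaps just the middle piece, leaving two vmas behind:
 * [0x1000000, 0x1002000) and [0x1004000, 0x1008000).
 */
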
asmlinkage long sys_munmap(unsigned long addr, size_t len)
{
	int ret;
	struct mm_struct *mm = current->mm;

	down_write(&mm->mmap_sem);
	ret = do_munmap(mm, addr, len);
	up_write(&mm->mmap_sem);
	return ret;
}

/*
 *  this is really a simplified "do_mmap".  it only handles
 *  anonymous maps.  eventually we may be able to do some
 *  brk-specific accounting here.
 */
unsigned long do_brk(unsigned long addr, unsigned long len)
{
	struct mm_struct * mm = current->mm;
	struct vm_area_struct * vma, * prev;
	unsigned long flags;
	struct rb_node ** rb_link, * rb_parent;
	pgoff_t pgoff = addr >> PAGE_SHIFT;

	len = PAGE_ALIGN(len);
	if (!len)
		return addr;

	if ((addr + len) > TASK_SIZE || (addr + len) < addr)
		return -EINVAL;

	/*
	 * mlock MCL_FUTURE?
	 */
	if (mm->def_flags & VM_LOCKED) {
		unsigned long locked = mm->locked_vm << PAGE_SHIFT;
		locked += len;
		if (locked > current->rlim[RLIMIT_MEMLOCK].rlim_cur)
			return -EAGAIN;
	}

	/*
	 * Clear old maps.  this also does some error checking for us
	 */
 munmap_back:
	vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
	if (vma && vma->vm_start < addr + len) {
		if (do_munmap(mm, addr, len))
			return -ENOMEM;
		goto munmap_back;
	}

	/* Check against address space limits *after* clearing old maps... */
	if ((mm->total_vm << PAGE_SHIFT) + len
	    > current->rlim[RLIMIT_AS].rlim_cur)
		return -ENOMEM;

	if (mm->map_count > sysctl_max_map_count)
		return -ENOMEM;

	if (security_vm_enough_memory(len >> PAGE_SHIFT))
		return -ENOMEM;

	flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;

	/* Can we just expand an old private anonymous mapping? */
	if (vma_merge(mm, prev, addr, addr + len, flags,
					NULL, NULL, pgoff, NULL))
		goto out;

	/*
	 * create a vma struct for an anonymous mapping
	 */
	vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
	if (!vma) {
		vm_unacct_memory(len >> PAGE_SHIFT);
		return -ENOMEM;
	}
	memset(vma, 0, sizeof(*vma));

	vma->vm_mm = mm;
	vma->vm_start = addr;
	vma->vm_end = addr + len;
	vma->vm_pgoff = pgoff;
	vma->vm_flags = flags;
	vma->vm_page_prot = protection_map[flags & 0x0f];
	vma_link(mm, vma, prev, rb_link, rb_parent);
out:
	mm->total_vm += len >> PAGE_SHIFT;
	if (flags & VM_LOCKED) {
		mm->locked_vm += len >> PAGE_SHIFT;
		make_pages_present(addr, addr + len);
	}
	return addr;
}

EXPORT_SYMBOL(do_brk);
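
/*
 * Rough sketch of the expected caller pattern (illustrative only; sys_brk
 * earlier in this file does essentially this, under mmap_sem held for
 * writing):
 *
 *	down_write(&mm->mmap_sem);
 *	if (do_brk(oldbrk, newbrk - oldbrk) == oldbrk)
 *		mm->brk = newbrk;
 *	up_write(&mm->mmap_sem);
 *
 * do_brk() returns the start address on success, so callers compare the
 * return value against the requested address rather than testing for zero.
 */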

/* Release all mmaps. */
void exit_mmap(struct mm_struct *mm)
{
	struct mmu_gather *tlb;
	struct vm_area_struct *vma;
	unsigned long nr_accounted = 0;

	profile_exit_mmap(mm);
 
	lru_add_drain();

	spin_lock(&mm->page_table_lock);

	tlb = tlb_gather_mmu(mm, 1);
	flush_cache_mm(mm);
	/* Use ~0UL here to ensure all VMAs in the mm are unmapped */
	mm->map_count -= unmap_vmas(&tlb, mm, mm->mmap, 0,
					~0UL, &nr_accounted, NULL);
	vm_unacct_memory(nr_accounted);
	BUG_ON(mm->map_count);	/* This is just debugging */
	clear_page_tables(tlb, FIRST_USER_PGD_NR, USER_PTRS_PER_PGD);
	tlb_finish_mmu(tlb, 0, MM_VM_SIZE(mm));

	vma = mm->mmap;
	mm->mmap = mm->mmap_cache = NULL;
	mm->mm_rb = RB_ROOT;
	mm->rss = 0;
	mm->total_vm = 0;
	mm->locked_vm = 0;

	spin_unlock(&mm->page_table_lock);

	/*
	 * Walk the list again, actually closing and freeing it
	 * without holding any MM locks.
	 */
	while (vma) {
		struct vm_area_struct *next = vma->vm_next;
		remove_vm_struct(vma);
		vma = next;
	}
}

/* Insert vm structure into process list sorted by address
 * and into the inode's i_mmap tree.  If vm_file is non-NULL
 * then i_mmap_lock is taken here.
 */
void insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
{
	struct vm_area_struct * __vma, * prev;
	struct rb_node ** rb_link, * rb_parent;

	/*
	 * The vm_pgoff of a purely anonymous vma should be irrelevant
	 * until its first write fault, when page's anon_vma and index
	 * are set.  But now set the vm_pgoff it will almost certainly
	 * end up with (unless mremap moves it elsewhere before that
	 * first wfault), so /proc/pid/maps tells a consistent story.
	 *
	 * By setting it to reflect the virtual start address of the
	 * vma, merges and splits can happen in a seamless way, just
	 * using the existing file pgoff checks and manipulations.
	 * Similarly in do_mmap_pgoff and in do_brk.
	 */
	if (!vma->vm_file) {
		BUG_ON(vma->anon_vma);
		vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
	}
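	/*
	 * For example, with 4 KB pages (PAGE_SHIFT == 12) an anonymous
	 * vma starting at 0x400000 gets vm_pgoff 0x400, so the usual
	 * file-style pgoff arithmetic in vma_merge() and split_vma()
	 * applies to it unchanged.
	 */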
	__vma = find_vma_prepare(mm,vma->vm_start,&prev,&rb_link,&rb_parent);
	if (__vma && __vma->vm_start < vma->vm_end)
		BUG();
	vma_link(mm, vma, prev, rb_link, rb_parent);
}

/*
 * Copy the vma structure to a new location in the same mm,
 * prior to moving page table entries, to effect an mremap move.
 */
struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
	unsigned long addr, unsigned long len, pgoff_t pgoff)
{
	struct vm_area_struct *vma = *vmap;
	unsigned long vma_start = vma->vm_start;
	struct mm_struct *mm = vma->vm_mm;
	struct vm_area_struct *new_vma, *prev;
	struct rb_node **rb_link, *rb_parent;
	struct mempolicy *pol;

	/*
	 * If anonymous vma has not yet been faulted, update new pgoff
	 * to match new location, to increase its chance of merging.
	 */
	if (!vma->vm_file && !vma->anon_vma)
		pgoff = addr >> PAGE_SHIFT;

	find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
	new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
			vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma));
	if (new_vma) {
		/*
		 * Source vma may have been merged into new_vma
		 */
		if (vma_start >= new_vma->vm_start &&
		    vma_start < new_vma->vm_end)
			*vmap = new_vma;
	} else {
		new_vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
		if (new_vma) {
			*new_vma = *vma;
			vma_prio_tree_init(new_vma);
			pol = mpol_copy(vma_policy(vma));
			if (IS_ERR(pol)) {
				kmem_cache_free(vm_area_cachep, new_vma);
				return NULL;
			}
			vma_set_policy(new_vma, pol);
			new_vma->vm_start = addr;
			new_vma->vm_end = addr + len;
			new_vma->vm_pgoff = pgoff;
			if (new_vma->vm_file)
				get_file(new_vma->vm_file);
			if (new_vma->vm_ops && new_vma->vm_ops->open)
				new_vma->vm_ops->open(new_vma);
			vma_link(mm, new_vma, prev, rb_link, rb_parent);
		}
	}
	return new_vma;
}
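
/*
 * Rough caller sketch (illustrative only): move_vma() in mm/mremap.c does
 * roughly
 *
 *	new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
 *	new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff);
 *	if (!new_vma)
 *		return -ENOMEM;
 *
 * before calling move_page_tables().  Note that copy_vma() may update
 * *vmap when the source vma was merged into new_vma, which is why the
 * caller passes &vma rather than the vma itself.
 */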