Commit 968f11a8 authored by Jamie Lokier, committed by Linus Torvalds

[PATCH] Unpinned futexes v2: indexing changes

This changes the way futexes are indexed, so that they don't pin pages. 
It also fixes some bugs with private mappings and COW pages.

Currently, all futexes look up the page at the userspace address and pin
it, using the pair (page,offset) as an index into a table of waiting
futexes.  Any page with a futex waiting on it remains pinned in RAM,
which is a problem when many futexes are used, especially with FUTEX_FD.

Another problem is that the page is not always the correct one, if it
can be changed later by a COW (copy on write) operation.  This can
happen when waiting on a futex without writing to it after fork(),
exec() or mmap(), if the page is then written to before attempting to
wake a futex at the same address.

There are two symptoms of the COW problem:
 - The wrong process can receive wakeups
 - A process can fail to receive required wakeups. 

This patch fixes both by changing the indexing so that VM_SHARED
mappings use the triple (inode,offset,index), and private mappings use
the pair (mm,virtual_address).

The former correctly handles all shared mappings, including tmpfs and
therefore all kinds of shared memory (IPC shm, /dev/shm and
MAP_ANON|MAP_SHARED).  This works because every mapping which is
VM_SHARED has an associated non-zero vma->vm_file, and hence inode.
(This is ensured in do_mmap_pgoff, where it calls shmem_zero_setup). 

The latter handles all private mappings, both files and anonymous.  It
isn't affected by COW, because it doesn't care about the actual pages,
just the virtual address.
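
To make this concrete, the two key formats could be represented roughly as
below.  This is only an illustrative sketch: the union, field and function
names are assumptions for exposition, not necessarily what the real
kernel/futex.c code uses.

#include <linux/mm.h>		/* struct vm_area_struct, PAGE_MASK */
#include <linux/fs.h>		/* struct inode */
#include <linux/sched.h>	/* struct mm_struct */

/*
 * Illustrative futex key: shared mappings are identified by
 * (inode, page index, offset-in-page); private mappings by
 * (mm, page-aligned virtual address, offset-in-page).  The 'both'
 * overlay just lets a hash treat the two formats uniformly.
 */
union futex_key {
	struct {
		unsigned long pgoff;
		struct inode *inode;
		int offset;
	} shared;
	struct {
		unsigned long address;
		struct mm_struct *mm;
		int offset;
	} private;
	struct {
		unsigned long word;
		void *ptr;
		int offset;
	} both;
};

/* Pick the key format for a futex word at userspace address uaddr. */
static void sketch_get_key(struct mm_struct *mm, struct vm_area_struct *vma,
			   unsigned long uaddr, union futex_key *key)
{
	if (vma->vm_flags & VM_SHARED) {
		/* Every VM_SHARED vma has a vm_file, hence an inode. */
		key->shared.inode = vma->vm_file->f_dentry->d_inode;
		key->shared.pgoff = ((uaddr - vma->vm_start) >> PAGE_SHIFT)
					+ vma->vm_pgoff;
		key->shared.offset = uaddr & ~PAGE_MASK;
	} else {
		key->private.mm = mm;
		key->private.address = uaddr & PAGE_MASK;
		key->private.offset = uaddr & ~PAGE_MASK;
	}
}

Note that the linear pgoff calculation above matches the check added to
sys_remap_file_pages() further down; for a nonlinear mapping it would be
wrong, which is presumably what the new VM_NONLINEAR flag lets the futex
code detect.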

The patch has a few bonuses:

        1. It removes the vcache implementation, as only futexes were
           using it, and they don't any more.

        2. Removing the vcache should make COW page faults a bit faster.

        3. Futex operations no longer take the page table lock, walk
           the page table, fault in pages that aren't mapped in the
           page table, or do a vcache hash lookup - they are mostly a
           simple offset calculation with one hash for the futex
           table, so they should be noticeably faster.  A sketch of
           such a lookup follows this list.
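
Continuing the sketch above, the lookup in point 3 can be little more than
one hash over the key.  FUTEX_HASHBITS, futex_queues and the use of
hash_long() here are illustrative assumptions, not necessarily the actual
hash used in kernel/futex.c.

#include <linux/hash.h>		/* hash_long() */
#include <linux/list.h>

#define FUTEX_HASHBITS 8	/* illustrative table size: 256 buckets */

static struct list_head futex_queues[1 << FUTEX_HASHBITS];

/*
 * Map a futex key to its wait-queue bucket.  No page is pinned and no
 * page table is walked: the key fields are folded together and hashed
 * once.  The 'both' overlay covers the shared and private formats.
 */
static struct list_head *hash_futex_key(const union futex_key *key)
{
	unsigned long folded = key->both.word
				+ (unsigned long)key->both.ptr
				+ key->both.offset;

	return &futex_queues[hash_long(folded, FUTEX_HASHBITS)];
}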

Special thanks to Hugh Dickins, Andrew Morton and Rusty Russell for
insightful feedback.  All suggestions are included.
parent 707c584e
@@ -110,6 +110,7 @@ struct vm_area_struct {
 #define VM_RESERVED	0x00080000	/* Don't unmap it from swap_out */
 #define VM_ACCOUNT	0x00100000	/* Is a VM accounted object */
 #define VM_HUGETLB	0x00400000	/* Huge TLB Page VM */
+#define VM_NONLINEAR	0x00800000	/* Is non-linear (remap_file_pages) */
 #ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */
 #define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS
/*
 * virtual => physical mapping cache support.
 */
#ifndef _LINUX_VCACHE_H
#define _LINUX_VCACHE_H

typedef struct vcache_s {
	unsigned long address;
	struct mm_struct *mm;
	struct list_head hash_entry;
	void (*callback)(struct vcache_s *data, struct page *new_page);
} vcache_t;

extern spinlock_t vcache_lock;

extern void __attach_vcache(vcache_t *vcache,
		unsigned long address,
		struct mm_struct *mm,
		void (*callback)(struct vcache_s *data, struct page *new_page));

extern void __detach_vcache(vcache_t *vcache);

extern void invalidate_vcache(unsigned long address, struct mm_struct *mm,
		struct page *new_page);

#endif
This diff is collapsed.
@@ -9,6 +9,6 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \
 obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
 	   page_alloc.o page-writeback.o pdflush.o readahead.o \
-	   slab.o swap.o truncate.o vcache.o vmscan.o $(mmu-y)
+	   slab.o swap.o truncate.o vmscan.o $(mmu-y)
 obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o
@@ -144,9 +144,10 @@ long sys_remap_file_pages(unsigned long start, unsigned long size,
 		return err;
 #endif
-	down_read(&mm->mmap_sem);
+	/* We need down_write() to change vma->vm_flags. */
+	down_write(&mm->mmap_sem);
 	vma = find_vma(mm, start);
 	/*
 	 * Make sure the vma is shared, that it supports prefaulting,
 	 * and that the remapped range is valid and fully within
@@ -155,11 +156,27 @@ long sys_remap_file_pages(unsigned long start, unsigned long size,
 	if (vma && (vma->vm_flags & VM_SHARED) &&
 		vma->vm_ops && vma->vm_ops->populate &&
 			end > start && start >= vma->vm_start &&
-				end <= vma->vm_end)
-		err = vma->vm_ops->populate(vma, start, size, vma->vm_page_prot,
-						pgoff, flags & MAP_NONBLOCK);
-	up_read(&mm->mmap_sem);
+				end <= vma->vm_end) {
+		/* Must set VM_NONLINEAR before any pages are populated. */
+		if (pgoff != ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff)
+			vma->vm_flags |= VM_NONLINEAR;
+		/* ->populate can take a long time, so downgrade the lock. */
+		downgrade_write(&mm->mmap_sem);
+		err = vma->vm_ops->populate(vma, start, size,
+					    vma->vm_page_prot,
+					    pgoff, flags & MAP_NONBLOCK);
+		/*
+		 * We can't clear VM_NONLINEAR because we'd have to do
+		 * it after ->populate completes, and that would prevent
+		 * downgrading the lock.  (Locks can't be upgraded).
+		 */
+		up_read(&mm->mmap_sem);
+	} else {
+		up_write(&mm->mmap_sem);
+	}
 	return err;
 }
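
For context on the VM_NONLINEAR check above: remap_file_pages() rebinds
pages of an existing MAP_SHARED mapping to arbitrary file offsets, so an
address in such a mapping no longer satisfies the linear relation
pgoff == ((start - vm_start) >> PAGE_SHIFT) + vm_pgoff.  A minimal
user-space sketch follows (error handling omitted, "datafile" is a
placeholder, and availability of the glibc remap_file_pages() wrapper is
assumed):

#define _GNU_SOURCE
#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	int fd = open("datafile", O_RDWR);	/* placeholder file */

	/* Linear MAP_SHARED mapping of the first four pages of the file. */
	char *addr = mmap(NULL, 4 * page, PROT_READ | PROT_WRITE,
			  MAP_SHARED, fd, 0);

	/*
	 * Rebind the first page of the mapping to file page 3.  For that
	 * range pgoff != ((start - vm_start) >> PAGE_SHIFT) + vm_pgoff,
	 * so the hunk above marks the vma VM_NONLINEAR.
	 */
	remap_file_pages(addr, page, 0, 3, 0);

	close(fd);
	return 0;
}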
@@ -43,7 +43,6 @@
 #include <linux/swap.h>
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
-#include <linux/vcache.h>
 #include <linux/rmap-locking.h>
 #include <linux/module.h>
@@ -962,7 +961,6 @@ static inline void establish_pte(struct vm_area_struct * vma, unsigned long addr
 static inline void break_cow(struct vm_area_struct * vma, struct page * new_page, unsigned long address,
 		pte_t *page_table)
 {
-	invalidate_vcache(address, vma->vm_mm, new_page);
 	flush_cache_page(vma, address);
 	establish_pte(vma, address, page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot))));
 }
/*
 * linux/mm/vcache.c
 *
 * virtual => physical page mapping cache. Users of this mechanism
 * register callbacks for a given (virt,mm,phys) page mapping, and
 * the kernel guarantees to call back when this mapping is invalidated.
 * (ie. upon COW or unmap.)
 *
 * Started by Ingo Molnar, Copyright (C) 2002
 */
#include <linux/mm.h>
#include <linux/init.h>
#include <linux/hash.h>
#include <linux/vcache.h>

#define VCACHE_HASHBITS 8
#define VCACHE_HASHSIZE (1 << VCACHE_HASHBITS)

spinlock_t vcache_lock = SPIN_LOCK_UNLOCKED;

static struct list_head hash[VCACHE_HASHSIZE];

static struct list_head *hash_vcache(unsigned long address,
				struct mm_struct *mm)
{
	return &hash[hash_long(address + (unsigned long)mm, VCACHE_HASHBITS)];
}

void __attach_vcache(vcache_t *vcache,
		unsigned long address,
		struct mm_struct *mm,
		void (*callback)(struct vcache_s *data, struct page *new))
{
	struct list_head *hash_head;

	address &= PAGE_MASK;
	vcache->address = address;
	vcache->mm = mm;
	vcache->callback = callback;

	hash_head = hash_vcache(address, mm);
	list_add_tail(&vcache->hash_entry, hash_head);
}

void __detach_vcache(vcache_t *vcache)
{
	list_del_init(&vcache->hash_entry);
}

void invalidate_vcache(unsigned long address, struct mm_struct *mm,
				struct page *new_page)
{
	struct list_head *l, *hash_head;
	vcache_t *vcache;

	address &= PAGE_MASK;

	hash_head = hash_vcache(address, mm);
	/*
	 * This is safe, because this path is called with the pagetable
	 * lock held. So while other mm's might add new entries in
	 * parallel, *this* mm is locked out, so if the list is empty
	 * now then we do not have to take the vcache lock to see it's
	 * really empty.
	 */
	if (likely(list_empty(hash_head)))
		return;

	spin_lock(&vcache_lock);
	list_for_each(l, hash_head) {
		vcache = list_entry(l, vcache_t, hash_entry);
		if (vcache->address != address || vcache->mm != mm)
			continue;
		vcache->callback(vcache, new_page);
	}
	spin_unlock(&vcache_lock);
}

static int __init vcache_init(void)
{
	unsigned int i;

	for (i = 0; i < VCACHE_HASHSIZE; i++)
		INIT_LIST_HEAD(hash + i);
	return 0;
}
__initcall(vcache_init);