Commit 8462805c authored by Jamie Lokier's avatar Jamie Lokier Committed by Linus Torvalds

[PATCH] futex bug fixes

This fixes two serious bugs in the futex code.

One is a race condition which results in list corruption when
FUTEX_REQUEUE is used.  It is due to the split locks change introduced
in 2.6.0-test6, and oopses when triggered.

The other is a security hole.  A program can use FUTEX_FD to create
futexes on mms or inodes which don't reference them, and when those
structures are reused by a different mm or inode, the addresses match.
The effect is that a malicious or flawed program can steal wakeups from
completely unrelated tasks, causing them to block (or worse if they are
counting on the token passing property).

These are the specific changes:

    1. Each futex_q retains a reference to its key mm or inode.

    2. The condition for a futex_q to indicate that it's woken can usually
       be interrogated lock-free.

    3. futex_wait calls the hash function once instead of three times,
       and usually takes the per-bucket lock once too.

    4. When a futex is woken, the per-bucket lock is not usually taken,
       so that's one less cache line transfer during heavy SMP futex use.

    5. The wait condition and barriers in futex_wait are simpler.

    5. FUTEX_REQUEUE is fixed.  The per-bucket lock juggling is done
       in such a way that there are no race conditions against the tests
       for whether a futex is woken.


This patch is an combination of patches previously sent to the list.  An
equivalent patch has been in Andrew Morton's tree for a while, with no
failure reports.  Also I have been running it on my own SMP box for a
while.  Conversely, we have received an oops report for the 2.6.0-test6
code, so the fix is needed.
parent c69f2894
...@@ -45,6 +45,9 @@ ...@@ -45,6 +45,9 @@
* Futexes are matched on equal values of this key. * Futexes are matched on equal values of this key.
* The key type depends on whether it's a shared or private mapping. * The key type depends on whether it's a shared or private mapping.
* Don't rearrange members without looking at hash_futex(). * Don't rearrange members without looking at hash_futex().
*
* offset is aligned to a multiple of sizeof(u32) (== 4) by definition.
* We set bit 0 to indicate if it's an inode-based key.
*/ */
union futex_key { union futex_key {
struct { struct {
...@@ -66,12 +69,20 @@ union futex_key { ...@@ -66,12 +69,20 @@ union futex_key {
/* /*
* We use this hashed waitqueue instead of a normal wait_queue_t, so * We use this hashed waitqueue instead of a normal wait_queue_t, so
* we can wake only the relevant ones (hashed queues may be shared): * we can wake only the relevant ones (hashed queues may be shared).
*
* A futex_q has a woken state, just like tasks have TASK_RUNNING.
* It is considered woken when list_empty(&q->list) || q->lock_ptr == 0.
* The order of wakup is always to make the first condition true, then
* wake up q->waiters, then make the second condition true.
*/ */
struct futex_q { struct futex_q {
struct list_head list; struct list_head list;
wait_queue_head_t waiters; wait_queue_head_t waiters;
/* Which hash list lock to use. */
spinlock_t *lock_ptr;
/* Key which the futex is hashed on. */ /* Key which the futex is hashed on. */
union futex_key key; union futex_key key;
...@@ -124,8 +135,7 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2) ...@@ -124,8 +135,7 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2)
* Returns: 0, or negative error code. * Returns: 0, or negative error code.
* The key words are stored in *key on success. * The key words are stored in *key on success.
* *
* Should be called with &current->mm->mmap_sem, * Should be called with &current->mm->mmap_sem but NOT any spinlocks.
* but NOT &futex_lock or &current->mm->page_table_lock.
*/ */
static int get_futex_key(unsigned long uaddr, union futex_key *key) static int get_futex_key(unsigned long uaddr, union futex_key *key)
{ {
...@@ -172,9 +182,10 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key) ...@@ -172,9 +182,10 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key)
} }
/* /*
* Linear mappings are also simple. * Linear file mappings are also simple.
*/ */
key->shared.inode = vma->vm_file->f_dentry->d_inode; key->shared.inode = vma->vm_file->f_dentry->d_inode;
key->both.offset++; /* Bit 0 of offset indicates inode-based key. */
if (likely(!(vma->vm_flags & VM_NONLINEAR))) { if (likely(!(vma->vm_flags & VM_NONLINEAR))) {
key->shared.pgoff = (((uaddr - vma->vm_start) >> PAGE_SHIFT) key->shared.pgoff = (((uaddr - vma->vm_start) >> PAGE_SHIFT)
+ vma->vm_pgoff); + vma->vm_pgoff);
...@@ -214,16 +225,68 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key) ...@@ -214,16 +225,68 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key)
return err; return err;
} }
/*
* Take a reference to the resource addressed by a key.
* Can be called while holding spinlocks.
*
* NOTE: mmap_sem MUST be held between get_futex_key() and calling this
* function, if it is called at all. mmap_sem keeps key->shared.inode valid.
*/
static inline void get_key_refs(union futex_key *key)
{
if (key->both.ptr != 0) {
if (key->both.offset & 1)
atomic_inc(&key->shared.inode->i_count);
else
atomic_inc(&key->private.mm->mm_count);
}
}
/*
* Drop a reference to the resource addressed by a key.
* The hash bucket spinlock must not be held.
*/
static inline void drop_key_refs(union futex_key *key)
{
if (key->both.ptr != 0) {
if (key->both.offset & 1)
iput(key->shared.inode);
else
mmdrop(key->private.mm);
}
}
/*
* The hash bucket lock must be held when this is called.
* Afterwards, the futex_q must not be accessed.
*/
static inline void wake_futex(struct futex_q *q)
{
list_del_init(&q->list);
if (q->filp)
send_sigio(&q->filp->f_owner, q->fd, POLL_IN);
/*
* The lock in wake_up_all() is a crucial memory barrier after the
* list_del_init() and also before assigning to q->lock_ptr.
*/
wake_up_all(&q->waiters);
/*
* The waiting task can free the futex_q as soon as this is written,
* without taking any locks. This must come last.
*/
q->lock_ptr = 0;
}
/* /*
* Wake up all waiters hashed on the physical page that is mapped * Wake up all waiters hashed on the physical page that is mapped
* to this virtual address: * to this virtual address:
*/ */
static int futex_wake(unsigned long uaddr, int num) static int futex_wake(unsigned long uaddr, int nr_wake)
{ {
struct list_head *i, *next, *head;
struct futex_hash_bucket *bh;
union futex_key key; union futex_key key;
struct futex_hash_bucket *bh;
struct list_head *head;
struct futex_q *this, *next;
int ret; int ret;
down_read(&current->mm->mmap_sem); down_read(&current->mm->mmap_sem);
...@@ -236,21 +299,15 @@ static int futex_wake(unsigned long uaddr, int num) ...@@ -236,21 +299,15 @@ static int futex_wake(unsigned long uaddr, int num)
spin_lock(&bh->lock); spin_lock(&bh->lock);
head = &bh->chain; head = &bh->chain;
list_for_each_safe(i, next, head) { list_for_each_entry_safe(this, next, head, list) {
struct futex_q *this = list_entry(i, struct futex_q, list);
if (match_futex (&this->key, &key)) { if (match_futex (&this->key, &key)) {
list_del_init(i); wake_futex(this);
wake_up_all(&this->waiters); if (++ret >= nr_wake)
if (this->filp)
send_sigio(&this->filp->f_owner, this->fd, POLL_IN);
ret++;
if (ret >= num)
break; break;
} }
} }
spin_unlock(&bh->lock);
spin_unlock(&bh->lock);
out: out:
up_read(&current->mm->mmap_sem); up_read(&current->mm->mmap_sem);
return ret; return ret;
...@@ -263,10 +320,11 @@ static int futex_wake(unsigned long uaddr, int num) ...@@ -263,10 +320,11 @@ static int futex_wake(unsigned long uaddr, int num)
static int futex_requeue(unsigned long uaddr1, unsigned long uaddr2, static int futex_requeue(unsigned long uaddr1, unsigned long uaddr2,
int nr_wake, int nr_requeue) int nr_wake, int nr_requeue)
{ {
struct list_head *i, *next, *head1, *head2;
struct futex_hash_bucket *bh1, *bh2;
union futex_key key1, key2; union futex_key key1, key2;
int ret; struct futex_hash_bucket *bh1, *bh2;
struct list_head *head1;
struct futex_q *this, *next;
int ret, drop_count = 0;
down_read(&current->mm->mmap_sem); down_read(&current->mm->mmap_sem);
...@@ -279,78 +337,107 @@ static int futex_requeue(unsigned long uaddr1, unsigned long uaddr2, ...@@ -279,78 +337,107 @@ static int futex_requeue(unsigned long uaddr1, unsigned long uaddr2,
bh1 = hash_futex(&key1); bh1 = hash_futex(&key1);
bh2 = hash_futex(&key2); bh2 = hash_futex(&key2);
if (bh1 < bh2) {
if (bh1 < bh2)
spin_lock(&bh1->lock); spin_lock(&bh1->lock);
spin_lock(&bh2->lock); spin_lock(&bh2->lock);
} else {
spin_lock(&bh2->lock);
if (bh1 > bh2) if (bh1 > bh2)
spin_lock(&bh1->lock); spin_lock(&bh1->lock);
}
head1 = &bh1->chain;
head2 = &bh2->chain;
list_for_each_safe(i, next, head1) {
struct futex_q *this = list_entry(i, struct futex_q, list);
if (match_futex (&this->key, &key1)) { head1 = &bh1->chain;
list_del_init(i); list_for_each_entry_safe(this, next, head1, list) {
if (!match_futex (&this->key, &key1))
continue;
if (++ret <= nr_wake) { if (++ret <= nr_wake) {
wake_up_all(&this->waiters); wake_futex(this);
if (this->filp)
send_sigio(&this->filp->f_owner,
this->fd, POLL_IN);
} else { } else {
list_add_tail(i, head2); list_move_tail(&this->list, &bh2->chain);
this->lock_ptr = &bh2->lock;
this->key = key2; this->key = key2;
get_key_refs(&key2);
drop_count++;
if (ret - nr_wake >= nr_requeue) if (ret - nr_wake >= nr_requeue)
break; break;
/* Make sure to stop if key1 == key2 */ /* Make sure to stop if key1 == key2 */
if (head1 == head2 && head1 != next) if (head1 == &bh2->chain && head1 != &next->list)
head1 = i; head1 = &this->list;
}
} }
} }
if (bh1 < bh2) {
spin_unlock(&bh2->lock);
spin_unlock(&bh1->lock);
} else {
if (bh1 > bh2)
spin_unlock(&bh1->lock); spin_unlock(&bh1->lock);
if (bh1 != bh2)
spin_unlock(&bh2->lock); spin_unlock(&bh2->lock);
}
/* drop_key_refs() must be called outside the spinlocks. */
while (--drop_count >= 0)
drop_key_refs(&key1);
out: out:
up_read(&current->mm->mmap_sem); up_read(&current->mm->mmap_sem);
return ret; return ret;
} }
static inline void queue_me(struct futex_q *q, union futex_key *key, /*
int fd, struct file *filp) * queue_me and unqueue_me must be called as a pair, each
* exactly once. They are called with the hashed spinlock held.
*/
/* The key must be already stored in q->key. */
static inline void queue_me(struct futex_q *q, int fd, struct file *filp)
{ {
struct futex_hash_bucket *bh = hash_futex(key); struct futex_hash_bucket *bh;
struct list_head *head = &bh->chain;
q->key = *key;
q->fd = fd; q->fd = fd;
q->filp = filp; q->filp = filp;
init_waitqueue_head(&q->waiters);
get_key_refs(&q->key);
bh = hash_futex(&q->key);
q->lock_ptr = &bh->lock;
spin_lock(&bh->lock); spin_lock(&bh->lock);
list_add_tail(&q->list, head); list_add_tail(&q->list, &bh->chain);
spin_unlock(&bh->lock); spin_unlock(&bh->lock);
} }
/* Return 1 if we were still queued (ie. 0 means we were woken) */ /* Return 1 if we were still queued (ie. 0 means we were woken) */
static inline int unqueue_me(struct futex_q *q) static int unqueue_me(struct futex_q *q)
{ {
struct futex_hash_bucket *bh = hash_futex(&q->key);
int ret = 0; int ret = 0;
spinlock_t *lock_ptr;
spin_lock(&bh->lock); /* In the common case we don't take the spinlock, which is nice. */
if (!list_empty(&q->list)) { retry:
lock_ptr = q->lock_ptr;
if (lock_ptr != 0) {
spin_lock(lock_ptr);
/*
* q->lock_ptr can change between reading it and
* spin_lock(), causing us to take the wrong lock. This
* corrects the race condition.
*
* Reasoning goes like this: if we have the wrong lock,
* q->lock_ptr must have changed (maybe several times)
* between reading it and the spin_lock(). It can
* change again after the spin_lock() but only if it was
* already changed before the spin_lock(). It cannot,
* however, change back to the original value. Therefore
* we can detect whether we acquired the correct lock.
*/
if (unlikely(lock_ptr != q->lock_ptr)) {
spin_unlock(lock_ptr);
goto retry;
}
if (likely(!list_empty(&q->list))) {
list_del(&q->list); list_del(&q->list);
ret = 1; ret = 1;
} }
spin_unlock(&bh->lock); spin_unlock(lock_ptr);
}
drop_key_refs(&q->key);
return ret; return ret;
} }
...@@ -358,19 +445,15 @@ static int futex_wait(unsigned long uaddr, int val, unsigned long time) ...@@ -358,19 +445,15 @@ static int futex_wait(unsigned long uaddr, int val, unsigned long time)
{ {
DECLARE_WAITQUEUE(wait, current); DECLARE_WAITQUEUE(wait, current);
int ret, curval; int ret, curval;
union futex_key key;
struct futex_q q; struct futex_q q;
struct futex_hash_bucket *bh = NULL;
init_waitqueue_head(&q.waiters);
down_read(&current->mm->mmap_sem); down_read(&current->mm->mmap_sem);
ret = get_futex_key(uaddr, &key); ret = get_futex_key(uaddr, &q.key);
if (unlikely(ret != 0)) if (unlikely(ret != 0))
goto out_release_sem; goto out_release_sem;
queue_me(&q, &key, -1, NULL); queue_me(&q, -1, NULL);
/* /*
* Access the page after the futex is queued. * Access the page after the futex is queued.
...@@ -400,23 +483,17 @@ static int futex_wait(unsigned long uaddr, int val, unsigned long time) ...@@ -400,23 +483,17 @@ static int futex_wait(unsigned long uaddr, int val, unsigned long time)
* rely on the futex_wake() code removing us from hash when it * rely on the futex_wake() code removing us from hash when it
* wakes us up. * wakes us up.
*/ */
add_wait_queue(&q.waiters, &wait);
bh = hash_futex(&key);
spin_lock(&bh->lock);
set_current_state(TASK_INTERRUPTIBLE);
if (unlikely(list_empty(&q.list))) { /* add_wait_queue is the barrier after __set_current_state. */
__set_current_state(TASK_INTERRUPTIBLE);
add_wait_queue(&q.waiters, &wait);
/* /*
* We were woken already. * !list_empty() is safe here without any lock.
* q.lock_ptr != 0 is not safe, because of ordering against wakeup.
*/ */
spin_unlock(&bh->lock); if (likely(!list_empty(&q.list)))
set_current_state(TASK_RUNNING);
return 0;
}
spin_unlock(&bh->lock);
time = schedule_timeout(time); time = schedule_timeout(time);
set_current_state(TASK_RUNNING); __set_current_state(TASK_RUNNING);
/* /*
* NOTE: we don't remove ourselves from the waitqueue because * NOTE: we don't remove ourselves from the waitqueue because
...@@ -446,7 +523,7 @@ static int futex_close(struct inode *inode, struct file *filp) ...@@ -446,7 +523,7 @@ static int futex_close(struct inode *inode, struct file *filp)
struct futex_q *q = filp->private_data; struct futex_q *q = filp->private_data;
unqueue_me(q); unqueue_me(q);
kfree(filp->private_data); kfree(q);
return 0; return 0;
} }
...@@ -455,14 +532,16 @@ static unsigned int futex_poll(struct file *filp, ...@@ -455,14 +532,16 @@ static unsigned int futex_poll(struct file *filp,
struct poll_table_struct *wait) struct poll_table_struct *wait)
{ {
struct futex_q *q = filp->private_data; struct futex_q *q = filp->private_data;
struct futex_hash_bucket *bh = hash_futex(&q->key);
int ret = 0; int ret = 0;
poll_wait(filp, &q->waiters, wait); poll_wait(filp, &q->waiters, wait);
spin_lock(&bh->lock);
/*
* list_empty() is safe here without any lock.
* q->lock_ptr != 0 is not safe, because of ordering against wakeup.
*/
if (list_empty(&q->list)) if (list_empty(&q->list))
ret = POLLIN | POLLRDNORM; ret = POLLIN | POLLRDNORM;
spin_unlock(&bh->lock);
return ret; return ret;
} }
...@@ -472,12 +551,13 @@ static struct file_operations futex_fops = { ...@@ -472,12 +551,13 @@ static struct file_operations futex_fops = {
.poll = futex_poll, .poll = futex_poll,
}; };
/* Signal allows caller to avoid the race which would occur if they /*
set the sigio stuff up afterwards. */ * Signal allows caller to avoid the race which would occur if they
* set the sigio stuff up afterwards.
*/
static int futex_fd(unsigned long uaddr, int signal) static int futex_fd(unsigned long uaddr, int signal)
{ {
struct futex_q *q; struct futex_q *q;
union futex_key key;
struct file *filp; struct file *filp;
int ret, err; int ret, err;
...@@ -519,20 +599,24 @@ static int futex_fd(unsigned long uaddr, int signal) ...@@ -519,20 +599,24 @@ static int futex_fd(unsigned long uaddr, int signal)
} }
down_read(&current->mm->mmap_sem); down_read(&current->mm->mmap_sem);
err = get_futex_key(uaddr, &key); err = get_futex_key(uaddr, &q->key);
up_read(&current->mm->mmap_sem);
if (unlikely(err != 0)) { if (unlikely(err != 0)) {
up_read(&current->mm->mmap_sem);
put_unused_fd(ret); put_unused_fd(ret);
put_filp(filp); put_filp(filp);
kfree(q); kfree(q);
return err; return err;
} }
init_waitqueue_head(&q->waiters); /*
* queue_me() must be called before releasing mmap_sem, because
* key->shared.inode needs to be referenced while holding it.
*/
filp->private_data = q; filp->private_data = q;
queue_me(q, &key, ret, filp); queue_me(q, ret, filp);
up_read(&current->mm->mmap_sem);
/* Now we map fd to filp, so userspace can access it */ /* Now we map fd to filp, so userspace can access it */
fd_install(ret, filp); fd_install(ret, filp);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment