Commit 4f07ec0d authored by Thomas Gleixner's avatar Thomas Gleixner

futex: Prevent inconsistent state and exit race

The recent rework of the requeue PI code introduced a possibility for
going back to user space in inconsistent state:

CPU 0				CPU 1

requeue_futex()
  if (lock_pifutex_user()) {
      dequeue_waiter();
      wake_waiter(task);
				sched_in(task);
     				return_from_futex_syscall();

  ---> Inconsistent state because PI state is not established

It becomes worse if the woken up task immediately exits:

				sys_exit();
				
      attach_pistate(vpid);	<--- FAIL


Attach the pi state before dequeuing and waking the waiter. If the waiter
gets a spurious wakeup before the dequeue operation it will wait in
futex_requeue_pi_wakeup_sync() and therefore cannot return and exit.

Fixes: 07d91ef5 ("futex: Prevent requeue_pi() lock nesting issue on RT")
Reported-by: syzbot+4d1bd0725ef09168e1a0@syzkaller.appspotmail.com
Signed-off-by: default avatarThomas Gleixner <tglx@linutronix.de>
Acked-by: default avatarPeter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20210902094414.558914045@linutronix.de
parent a974b540
...@@ -1454,8 +1454,23 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, ...@@ -1454,8 +1454,23 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
newval |= FUTEX_WAITERS; newval |= FUTEX_WAITERS;
ret = lock_pi_update_atomic(uaddr, uval, newval); ret = lock_pi_update_atomic(uaddr, uval, newval);
/* If the take over worked, return 1 */ if (ret)
return ret < 0 ? ret : 1; return ret;
/*
* If the waiter bit was requested the caller also needs PI
* state attached to the new owner of the user space futex.
*
* @task is guaranteed to be alive and it cannot be exiting
* because it is either sleeping or waiting in
* futex_requeue_pi_wakeup_sync().
*/
if (set_waiters) {
ret = attach_to_pi_owner(uaddr, newval, key, ps,
exiting);
WARN_ON(ret);
}
return 1;
} }
/* /*
...@@ -2036,17 +2051,24 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1, ...@@ -2036,17 +2051,24 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1,
return -EAGAIN; return -EAGAIN;
/* /*
* Try to take the lock for top_waiter. Set the FUTEX_WAITERS bit in * Try to take the lock for top_waiter and set the FUTEX_WAITERS bit
* the contended case or if set_waiters is 1. The pi_state is returned * in the contended case or if @set_waiters is true.
* in ps in contended cases. *
* In the contended case PI state is attached to the lock owner. If
* the user space lock can be acquired then PI state is attached to
* the new owner (@top_waiter->task) when @set_waiters is true.
*/ */
vpid = task_pid_vnr(top_waiter->task); vpid = task_pid_vnr(top_waiter->task);
ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task, ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
exiting, set_waiters); exiting, set_waiters);
if (ret == 1) { if (ret == 1) {
/* Dequeue, wake up and update top_waiter::requeue_state */ /*
* Lock was acquired in user space and PI state was
* attached to @top_waiter->task. That means state is fully
* consistent and the waiter can return to user space
* immediately after the wakeup.
*/
requeue_pi_wake_futex(top_waiter, key2, hb2); requeue_pi_wake_futex(top_waiter, key2, hb2);
return vpid;
} else if (ret < 0) { } else if (ret < 0) {
/* Rewind top_waiter::requeue_state */ /* Rewind top_waiter::requeue_state */
futex_requeue_pi_complete(top_waiter, ret); futex_requeue_pi_complete(top_waiter, ret);
...@@ -2208,19 +2230,26 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, ...@@ -2208,19 +2230,26 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
&exiting, nr_requeue); &exiting, nr_requeue);
/* /*
* At this point the top_waiter has either taken uaddr2 or is * At this point the top_waiter has either taken uaddr2 or
* waiting on it. If the former, then the pi_state will not * is waiting on it. In both cases pi_state has been
* exist yet, look it up one more time to ensure we have a * established and an initial refcount on it. In case of an
* reference to it. If the lock was taken, @ret contains the * error there's nothing.
* VPID of the top waiter task.
* If the lock was not taken, we have pi_state and an initial
* refcount on it. In case of an error we have nothing.
* *
* The top waiter's requeue_state is up to date: * The top waiter's requeue_state is up to date:
* *
* - If the lock was acquired atomically (ret > 0), then * - If the lock was acquired atomically (ret == 1), then
* the state is Q_REQUEUE_PI_LOCKED. * the state is Q_REQUEUE_PI_LOCKED.
* *
* The top waiter has been dequeued and woken up and can
* return to user space immediately. The kernel/user
* space state is consistent. In case that there must be
* more waiters requeued the WAITERS bit in the user
* space futex is set so the top waiter task has to go
* into the syscall slowpath to unlock the futex. This
* will block until this requeue operation has been
* completed and the hash bucket locks have been
* dropped.
*
* - If the trylock failed with an error (ret < 0) then * - If the trylock failed with an error (ret < 0) then
* the state is either Q_REQUEUE_PI_NONE, i.e. "nothing * the state is either Q_REQUEUE_PI_NONE, i.e. "nothing
* happened", or Q_REQUEUE_PI_IGNORE when there was an * happened", or Q_REQUEUE_PI_IGNORE when there was an
...@@ -2234,36 +2263,20 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, ...@@ -2234,36 +2263,20 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
* the same sanity checks for requeue_pi as the loop * the same sanity checks for requeue_pi as the loop
* below does. * below does.
*/ */
if (ret > 0) {
WARN_ON(pi_state);
task_count++;
/*
* If futex_proxy_trylock_atomic() acquired the
* user space futex, then the user space value
* @uaddr2 has been set to the @hb1's top waiter
* task VPID. This task is guaranteed to be alive
* and cannot be exiting because it is either
* sleeping or blocked on @hb2 lock.
*
* The @uaddr2 futex cannot have waiters either as
* otherwise futex_proxy_trylock_atomic() would not
* have succeeded.
*
* In order to requeue waiters to @hb2, pi state is
* required. Hand in the VPID value (@ret) and
* allocate PI state with an initial refcount on
* it.
*/
ret = attach_to_pi_owner(uaddr2, ret, &key2, &pi_state,
&exiting);
WARN_ON(ret);
}
switch (ret) { switch (ret) {
case 0: case 0:
/* We hold a reference on the pi state. */ /* We hold a reference on the pi state. */
break; break;
case 1:
/*
* futex_proxy_trylock_atomic() acquired the user space
* futex. Adjust task_count.
*/
task_count++;
ret = 0;
break;
/* /*
* If the above failed, then pi_state is NULL and * If the above failed, then pi_state is NULL and
* waiter::requeue_state is correct. * waiter::requeue_state is correct.
...@@ -2395,9 +2408,8 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, ...@@ -2395,9 +2408,8 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
} }
/* /*
* We took an extra initial reference to the pi_state either in * We took an extra initial reference to the pi_state in
* futex_proxy_trylock_atomic() or in attach_to_pi_owner(). We need * futex_proxy_trylock_atomic(). We need to drop it here again.
* to drop it here again.
*/ */
put_pi_state(pi_state); put_pi_state(pi_state);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment