Commit fbeb558b authored by Peter Zijlstra's avatar Peter Zijlstra

futex/pi: Fix recursive rt_mutex waiter state

Some new assertions pointed out that the existing code has nested rt_mutex wait
state in the futex code.

Specifically, the futex_lock_pi() cancel case uses spin_lock() while there
still is a rt_waiter enqueued for this task, resulting in a state where there
are two waiters for the same task (and task_struct::pi_blocked_on gets
scrambled).

The reason to take hb->lock at this point is to avoid the wake_futex_pi()
EAGAIN case.

This happens when futex_top_waiter() and rt_mutex_top_waiter() state becomes
inconsistent. The current rules are such that this inconsistency will not be
observed.

Notably the case that needs to be avoided is where futex_lock_pi() and
futex_unlock_pi() interleave such that unlock will fail to observe a new
waiter.

*However* the case at hand is where a waiter is leaving, in this case the race
means a waiter that is going away is not observed -- which is harmless,
provided this race is explicitly handled.

This is a somewhat dangerous proposition because the converse race is not
observing a new waiter, which must absolutely not happen. But since the race is
valid this cannot be asserted.
Signed-off-by: default avatarPeter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: default avatarThomas Gleixner <tglx@linutronix.de>
Reviewed-by: default avatarSebastian Andrzej Siewior <bigeasy@linutronix.de>
Tested-by: default avatarSebastian Andrzej Siewior <bigeasy@linutronix.de>
Link: https://lkml.kernel.org/r/20230915151943.GD6743@noisy.programming.kicks-ass.net
parent 45f67f30
...@@ -611,29 +611,16 @@ int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, ...@@ -611,29 +611,16 @@ int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
/* /*
* Caller must hold a reference on @pi_state. * Caller must hold a reference on @pi_state.
*/ */
static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state) static int wake_futex_pi(u32 __user *uaddr, u32 uval,
struct futex_pi_state *pi_state,
struct rt_mutex_waiter *top_waiter)
{ {
struct rt_mutex_waiter *top_waiter;
struct task_struct *new_owner; struct task_struct *new_owner;
bool postunlock = false; bool postunlock = false;
DEFINE_RT_WAKE_Q(wqh); DEFINE_RT_WAKE_Q(wqh);
u32 curval, newval; u32 curval, newval;
int ret = 0; int ret = 0;
top_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex);
if (WARN_ON_ONCE(!top_waiter)) {
/*
* As per the comment in futex_unlock_pi() this should not happen.
*
* When this happens, give up our locks and try again, giving
* the futex_lock_pi() instance time to complete, either by
* waiting on the rtmutex or removing itself from the futex
* queue.
*/
ret = -EAGAIN;
goto out_unlock;
}
new_owner = top_waiter->task; new_owner = top_waiter->task;
/* /*
...@@ -1046,19 +1033,33 @@ int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int tryl ...@@ -1046,19 +1033,33 @@ int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int tryl
ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter); ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);
cleanup: cleanup:
spin_lock(q.lock_ptr);
/* /*
* If we failed to acquire the lock (deadlock/signal/timeout), we must * If we failed to acquire the lock (deadlock/signal/timeout), we must
* first acquire the hb->lock before removing the lock from the	 * must unwind the above, however we cannot lock hb->lock because
* rt_mutex waitqueue, such that we can keep the hb and rt_mutex wait * rt_mutex already has a waiter enqueued and hb->lock can itself try
* lists consistent. * and enqueue an rt_waiter through rtlock.
*
* Doing the cleanup without holding hb->lock can cause inconsistent
* state between hb and pi_state, but only in the direction of not
* seeing a waiter that is leaving.
*
* See futex_unlock_pi(), it deals with this inconsistency.
*
* There be dragons here, since we must deal with the inconsistency on
* the way out (here), it is impossible to detect/warn about the race
* the other way around (missing an incoming waiter).
* *
* In particular; it is important that futex_unlock_pi() can not * What could possibly go wrong...
* observe this inconsistency.
*/ */
if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter)) if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
ret = 0; ret = 0;
/*
* Now that the rt_waiter has been dequeued, it is safe to use
* spinlock/rtlock (which might enqueue its own rt_waiter) and fix up
* the
*/
spin_lock(q.lock_ptr);
/* /*
* Waiter is unqueued. * Waiter is unqueued.
*/ */
...@@ -1143,6 +1144,7 @@ int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) ...@@ -1143,6 +1144,7 @@ int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
top_waiter = futex_top_waiter(hb, &key); top_waiter = futex_top_waiter(hb, &key);
if (top_waiter) { if (top_waiter) {
struct futex_pi_state *pi_state = top_waiter->pi_state; struct futex_pi_state *pi_state = top_waiter->pi_state;
struct rt_mutex_waiter *rt_waiter;
ret = -EINVAL; ret = -EINVAL;
if (!pi_state) if (!pi_state)
...@@ -1155,22 +1157,39 @@ int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) ...@@ -1155,22 +1157,39 @@ int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
if (pi_state->owner != current) if (pi_state->owner != current)
goto out_unlock; goto out_unlock;
get_pi_state(pi_state);
/* /*
* By taking wait_lock while still holding hb->lock, we ensure * By taking wait_lock while still holding hb->lock, we ensure
* there is no point where we hold neither; and therefore * there is no point where we hold neither; and thereby
* wake_futex_p() must observe a state consistent with what we * wake_futex_pi() must observe any new waiters.
* observed. *
* Since the cleanup: case in futex_lock_pi() removes the
* rt_waiter without holding hb->lock, it is possible for
* wake_futex_pi() to not find a waiter while the above does,
* in this case the waiter is on the way out and it can be
* ignored.
* *
* In particular; this forces __rt_mutex_start_proxy() to * In particular; this forces __rt_mutex_start_proxy() to
* complete such that we're guaranteed to observe the * complete such that we're guaranteed to observe the
* rt_waiter. Also see the WARN in wake_futex_pi(). * rt_waiter.
*/ */
raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
/*
* Futex vs rt_mutex waiter state -- if there are no rt_mutex
* waiters even though futex thinks there are, then the waiter
* is leaving and the uncontended path is safe to take.
*/
rt_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex);
if (!rt_waiter) {
raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
goto do_uncontended;
}
get_pi_state(pi_state);
spin_unlock(&hb->lock); spin_unlock(&hb->lock);
/* drops pi_state->pi_mutex.wait_lock */ /* drops pi_state->pi_mutex.wait_lock */
ret = wake_futex_pi(uaddr, uval, pi_state); ret = wake_futex_pi(uaddr, uval, pi_state, rt_waiter);
put_pi_state(pi_state); put_pi_state(pi_state);
...@@ -1198,6 +1217,7 @@ int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) ...@@ -1198,6 +1217,7 @@ int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
return ret; return ret;
} }
do_uncontended:
/* /*
* We have no kernel internal state, i.e. no waiters in the * We have no kernel internal state, i.e. no waiters in the
* kernel. Waiters which are about to queue themselves are stuck * kernel. Waiters which are about to queue themselves are stuck
......
...@@ -850,11 +850,13 @@ int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, ...@@ -850,11 +850,13 @@ int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
pi_mutex = &q.pi_state->pi_mutex; pi_mutex = &q.pi_state->pi_mutex;
ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter); ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter);
/* Current is not longer pi_blocked_on */ /*
spin_lock(q.lock_ptr); * See futex_unlock_pi()'s cleanup: comment.
*/
if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter)) if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter))
ret = 0; ret = 0;
spin_lock(q.lock_ptr);
debug_rt_mutex_free_waiter(&rt_waiter); debug_rt_mutex_free_waiter(&rt_waiter);
/* /*
* Fixup the pi_state owner and possibly acquire the lock if we * Fixup the pi_state owner and possibly acquire the lock if we
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment