Commit 3d4775df authored by Thomas Gleixner

futex: Replace PF_EXITPIDONE with a state

The futex exit handling relies on PF_ flags. That's suboptimal as it
requires a smp_mb() and an ugly lock/unlock of the exiting tasks pi_lock in
the middle of do_exit() to enforce the observability of PF_EXITING in the
futex code.

Add a futex_state member to task_struct and convert the PF_EXITPIDONE logic
over to the new state. The PF_EXITING dependency will be cleaned up in a
later step.

This prepares for handling various futex exit issues later.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20191106224556.149449274@linutronix.de
parent ba31c1a4
...@@ -50,6 +50,10 @@ union futex_key { ...@@ -50,6 +50,10 @@ union futex_key {
#define FUTEX_KEY_INIT (union futex_key) { .both = { .ptr = NULL } } #define FUTEX_KEY_INIT (union futex_key) { .both = { .ptr = NULL } }
#ifdef CONFIG_FUTEX #ifdef CONFIG_FUTEX
/*
 * Futex exit state of a task, stored in task_struct::futex_state.
 */
enum {
/* Initial state, assigned by futex_init_task(). */
FUTEX_STATE_OK,
/*
 * Assigned by futex_exit_done(). While a task's state is not yet
 * FUTEX_STATE_DEAD, handle_exit_race() returns -EAGAIN for it.
 */
FUTEX_STATE_DEAD,
};
static inline void futex_init_task(struct task_struct *tsk) static inline void futex_init_task(struct task_struct *tsk)
{ {
...@@ -59,6 +63,34 @@ static inline void futex_init_task(struct task_struct *tsk) ...@@ -59,6 +63,34 @@ static inline void futex_init_task(struct task_struct *tsk)
#endif #endif
INIT_LIST_HEAD(&tsk->pi_state_list); INIT_LIST_HEAD(&tsk->pi_state_list);
tsk->pi_state_cache = NULL; tsk->pi_state_cache = NULL;
tsk->futex_state = FUTEX_STATE_OK;
}
/**
* futex_exit_done - Sets the tasks futex state to FUTEX_STATE_DEAD
* @tsk: task to set the state on
*
* Set the futex exit state of the task lockless. The futex waiter code
* observes that state when a task is exiting and loops until the task has
* actually finished the futex cleanup. The worst case for this is that the
* waiter runs through the wait loop until the state becomes visible.
*
* This has two callers:
*
* - futex_mm_release() after the futex exit cleanup has been done
*
* - do_exit() from the recursive fault handling path.
*
* In case of a recursive fault this is best effort. Either the futex exit
* code has run already or not. If the OWNER_DIED bit has been set on the
* futex then the waiter can take it over. If not, the problem is pushed
* back to user space. If the futex exit code did not run yet, then an
* already queued waiter might block forever, but there is nothing which
* can be done about that.
*/
static inline void futex_exit_done(struct task_struct *tsk)
{
tsk->futex_state = FUTEX_STATE_DEAD;
} }
void futex_mm_release(struct task_struct *tsk); void futex_mm_release(struct task_struct *tsk);
...@@ -68,6 +100,7 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, ...@@ -68,6 +100,7 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
#else #else
static inline void futex_init_task(struct task_struct *tsk) { } static inline void futex_init_task(struct task_struct *tsk) { }
static inline void futex_mm_release(struct task_struct *tsk) { } static inline void futex_mm_release(struct task_struct *tsk) { }
static inline void futex_exit_done(struct task_struct *tsk) { }
static inline long do_futex(u32 __user *uaddr, int op, u32 val, static inline long do_futex(u32 __user *uaddr, int op, u32 val,
ktime_t *timeout, u32 __user *uaddr2, ktime_t *timeout, u32 __user *uaddr2,
u32 val2, u32 val3) u32 val2, u32 val3)
......
...@@ -1053,6 +1053,7 @@ struct task_struct { ...@@ -1053,6 +1053,7 @@ struct task_struct {
#endif #endif
struct list_head pi_state_list; struct list_head pi_state_list;
struct futex_pi_state *pi_state_cache; struct futex_pi_state *pi_state_cache;
unsigned int futex_state;
#endif #endif
#ifdef CONFIG_PERF_EVENTS #ifdef CONFIG_PERF_EVENTS
struct perf_event_context *perf_event_ctxp[perf_nr_task_contexts]; struct perf_event_context *perf_event_ctxp[perf_nr_task_contexts];
...@@ -1441,7 +1442,6 @@ extern struct pid *cad_pid; ...@@ -1441,7 +1442,6 @@ extern struct pid *cad_pid;
*/ */
#define PF_IDLE 0x00000002 /* I am an IDLE thread */ #define PF_IDLE 0x00000002 /* I am an IDLE thread */
#define PF_EXITING 0x00000004 /* Getting shut down */ #define PF_EXITING 0x00000004 /* Getting shut down */
#define PF_EXITPIDONE 0x00000008 /* PI exit done on shut down */
#define PF_VCPU 0x00000010 /* I'm a virtual CPU */ #define PF_VCPU 0x00000010 /* I'm a virtual CPU */
#define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */ #define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */
#define PF_FORKNOEXEC 0x00000040 /* Forked but didn't exec */ #define PF_FORKNOEXEC 0x00000040 /* Forked but didn't exec */
......
...@@ -746,16 +746,7 @@ void __noreturn do_exit(long code) ...@@ -746,16 +746,7 @@ void __noreturn do_exit(long code)
*/ */
if (unlikely(tsk->flags & PF_EXITING)) { if (unlikely(tsk->flags & PF_EXITING)) {
pr_alert("Fixing recursive fault but reboot is needed!\n"); pr_alert("Fixing recursive fault but reboot is needed!\n");
/* futex_exit_done(tsk);
* We can do this unlocked here. The futex code uses
* this flag just to verify whether the pi state
* cleanup has been done or not. In the worst case it
* loops once more. We pretend that the cleanup was
* done as there is no way to return. Either the
* OWNER_DIED bit is set by now or we push the blocked
* task into the wait for ever nirwana as well.
*/
tsk->flags |= PF_EXITPIDONE;
set_current_state(TASK_UNINTERRUPTIBLE); set_current_state(TASK_UNINTERRUPTIBLE);
schedule(); schedule();
} }
...@@ -846,12 +837,7 @@ void __noreturn do_exit(long code) ...@@ -846,12 +837,7 @@ void __noreturn do_exit(long code)
* Make sure we are holding no locks: * Make sure we are holding no locks:
*/ */
debug_check_no_locks_held(); debug_check_no_locks_held();
/* futex_exit_done(tsk);
* We can do this unlocked here. The futex code uses this flag
* just to verify whether the pi state cleanup has been done
* or not. In the worst case it loops once more.
*/
tsk->flags |= PF_EXITPIDONE;
if (tsk->io_context) if (tsk->io_context)
exit_io_context(tsk); exit_io_context(tsk);
......
...@@ -1182,9 +1182,10 @@ static int handle_exit_race(u32 __user *uaddr, u32 uval, ...@@ -1182,9 +1182,10 @@ static int handle_exit_race(u32 __user *uaddr, u32 uval,
u32 uval2; u32 uval2;
/* /*
* If PF_EXITPIDONE is not yet set, then try again. * If the futex exit state is not yet FUTEX_STATE_DEAD, wait
* for it to finish.
*/ */
if (tsk && !(tsk->flags & PF_EXITPIDONE)) if (tsk && tsk->futex_state != FUTEX_STATE_DEAD)
return -EAGAIN; return -EAGAIN;
/* /*
...@@ -1203,8 +1204,9 @@ static int handle_exit_race(u32 __user *uaddr, u32 uval, ...@@ -1203,8 +1204,9 @@ static int handle_exit_race(u32 __user *uaddr, u32 uval,
* *uaddr = 0xC0000000; tsk = get_task(PID); * *uaddr = 0xC0000000; tsk = get_task(PID);
* } if (!(tsk->flags & PF_EXITING)) { * } if (!(tsk->flags & PF_EXITING)) {
* ... attach(); * ... attach();
* tsk->flags |= PF_EXITPIDONE; } else { * tsk->futex_state = } else {
* if (!(tsk->flags & PF_EXITPIDONE)) * FUTEX_STATE_DEAD; if (tsk->futex_state !=
* FUTEX_STATE_DEAD)
* return -EAGAIN; * return -EAGAIN;
* return -ESRCH; <--- FAIL * return -ESRCH; <--- FAIL
* } * }
...@@ -1260,17 +1262,16 @@ static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key, ...@@ -1260,17 +1262,16 @@ static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
} }
/* /*
* We need to look at the task state flags to figure out, * We need to look at the task state to figure out, whether the
* whether the task is exiting. To protect against the do_exit * task is exiting. To protect against the change of the task state
* change of the task flags, we do this protected by * in futex_exit_release(), we do this protected by p->pi_lock:
* p->pi_lock:
*/ */
raw_spin_lock_irq(&p->pi_lock); raw_spin_lock_irq(&p->pi_lock);
if (unlikely(p->flags & PF_EXITING)) { if (unlikely(p->futex_state != FUTEX_STATE_OK)) {
/* /*
* The task is on the way out. When PF_EXITPIDONE is * The task is on the way out. When the futex state is
* set, we know that the task has finished the * FUTEX_STATE_DEAD, we know that the task has finished
* cleanup: * the cleanup:
*/ */
int ret = handle_exit_race(uaddr, uval, p); int ret = handle_exit_race(uaddr, uval, p);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment