Commit 887d9dc9 authored by Peter Zijlstra, committed by Thomas Gleixner

hrtimer: Allow hrtimer::function() to free the timer

Currently an hrtimer callback function cannot free its own timer
because __run_hrtimer() still needs to clear HRTIMER_STATE_CALLBACK
after it. Freeing the timer would result in a clear use-after-free.

Solve this by using a scheme similar to regular timers; track the
current running timer in hrtimer_cpu_base::running.
Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: ktkhai@parallels.com
Cc: rostedt@goodmis.org
Cc: juri.lelli@gmail.com
Cc: pang.xunlei@linaro.org
Cc: wanpeng.li@linux.intel.com
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul McKenney <paulmck@linux.vnet.ibm.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: umgwanakikbuti@gmail.com
Link: http://lkml.kernel.org/r/20150611124743.471563047@infradead.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
parent c4bfa3f5
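Below is a minimal sketch, in kernel-style C, of the usage this change permits: an hrtimer callback that frees the structure embedding its own timer and returns HRTIMER_NORESTART, so __run_hrtimer() never needs to touch the timer again. It is an illustration only, not part of the patch; struct one_shot, one_shot_fire() and one_shot_arm() are invented names, while the hrtimer/slab calls are the regular kernel APIs.

#include <linux/errno.h>
#include <linux/hrtimer.h>
#include <linux/kernel.h>
#include <linux/ktime.h>
#include <linux/slab.h>

/* Hypothetical container: the timer lives inside the object it times. */
struct one_shot {
	struct hrtimer	timer;
	void		(*done)(void);
};

static enum hrtimer_restart one_shot_fire(struct hrtimer *t)
{
	struct one_shot *os = container_of(t, struct one_shot, timer);

	os->done();

	/*
	 * Safe after this change: the callback state now lives in
	 * cpu_base->running, so __run_hrtimer() no longer clears a
	 * state bit in the timer once the callback has returned
	 * HRTIMER_NORESTART.
	 */
	kfree(os);
	return HRTIMER_NORESTART;
}

static int one_shot_arm(void (*done)(void), unsigned int ms)
{
	struct one_shot *os = kzalloc(sizeof(*os), GFP_KERNEL);

	if (!os)
		return -ENOMEM;

	os->done = done;
	hrtimer_init(&os->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	os->timer.function = one_shot_fire;
	hrtimer_start(&os->timer, ms_to_ktime(ms), HRTIMER_MODE_REL);
	return 0;
}

Before this change the same callback would have had to defer the kfree() (for example to a workqueue or an RCU callback), because __run_hrtimer() still cleared HRTIMER_STATE_CALLBACK in the timer after the callback returned.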
@@ -53,30 +53,25 @@ enum hrtimer_restart {
  *
  * 0x00		inactive
  * 0x01		enqueued into rbtree
- * 0x02		callback function running
- * 0x04		timer is migrated to another cpu
  *
- * Special cases:
- * 0x03		callback function running and enqueued
- *		(was requeued on another CPU)
- * 0x05		timer was migrated on CPU hotunplug
+ * The callback state is not part of the timer->state because clearing it would
+ * mean touching the timer after the callback, this makes it impossible to free
+ * the timer from the callback function.
  *
- * The "callback function running and enqueued" status is only possible on
- * SMP. It happens for example when a posix timer expired and the callback
+ * Therefore we track the callback state in:
+ *
+ *	timer->base->cpu_base->running == timer
+ *
+ * On SMP it is possible to have a "callback function running and enqueued"
+ * status. It happens for example when a posix timer expired and the callback
  * queued a signal. Between dropping the lock which protects the posix timer
  * and reacquiring the base lock of the hrtimer, another CPU can deliver the
- * signal and rearm the timer. We have to preserve the callback running state,
- * as otherwise the timer could be removed before the softirq code finishes the
- * the handling of the timer.
- *
- * The HRTIMER_STATE_ENQUEUED bit is always or'ed to the current state
- * to preserve the HRTIMER_STATE_CALLBACK in the above scenario.
+ * signal and rearm the timer.
  *
  * All state transitions are protected by cpu_base->lock.
  */
 #define HRTIMER_STATE_INACTIVE	0x00
 #define HRTIMER_STATE_ENQUEUED	0x01
-#define HRTIMER_STATE_CALLBACK	0x02
 
 /**
  * struct hrtimer - the basic hrtimer structure
@@ -163,6 +158,8 @@ enum hrtimer_base_type {
  * struct hrtimer_cpu_base - the per cpu clock bases
  * @lock:		lock protecting the base and associated clock bases
  *			and timers
+ * @seq:		seqcount around __run_hrtimer
+ * @running:		pointer to the currently running hrtimer
  * @cpu:		cpu number
  * @active_bases:	Bitfield to mark bases with active timers
  * @clock_was_set_seq:	Sequence counter of clock was set events
@@ -184,6 +181,8 @@ enum hrtimer_base_type {
  */
 struct hrtimer_cpu_base {
 	raw_spinlock_t			lock;
+	seqcount_t			seq;
+	struct hrtimer			*running;
 	unsigned int			cpu;
 	unsigned int			active_bases;
 	unsigned int			clock_was_set_seq;
@@ -391,15 +390,7 @@ extern ktime_t hrtimer_get_remaining(const struct hrtimer *timer);
 extern u64 hrtimer_get_next_event(void);
 
-/*
- * A timer is active, when it is enqueued into the rbtree or the
- * callback function is running or it's in the state of being migrated
- * to another cpu.
- */
-static inline int hrtimer_active(const struct hrtimer *timer)
-{
-	return timer->state != HRTIMER_STATE_INACTIVE;
-}
+extern bool hrtimer_active(const struct hrtimer *timer);
 
 /*
  * Helper function to check, whether the timer is on one of the queues
@@ -415,7 +406,7 @@ static inline int hrtimer_is_queued(struct hrtimer *timer)
  */
 static inline int hrtimer_callback_running(struct hrtimer *timer)
 {
-	return timer->state & HRTIMER_STATE_CALLBACK;
+	return timer->base->cpu_base->running == timer;
 }
 
 /* Forward a hrtimer so it expires after now: */

...
@@ -67,6 +67,7 @@
 DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
 {
 	.lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock),
+	.seq = SEQCNT_ZERO(hrtimer_bases.seq),
 	.clock_base =
 	{
 		{
@@ -110,6 +111,18 @@ static inline int hrtimer_clockid_to_base(clockid_t clock_id)
  */
 #ifdef CONFIG_SMP
 
+/*
+ * We require the migration_base for lock_hrtimer_base()/switch_hrtimer_base()
+ * such that hrtimer_callback_running() can unconditionally dereference
+ * timer->base->cpu_base
+ */
+static struct hrtimer_cpu_base migration_cpu_base = {
+	.seq = SEQCNT_ZERO(migration_cpu_base),
+	.clock_base = { { .cpu_base = &migration_cpu_base, }, },
+};
+
+#define migration_base	migration_cpu_base.clock_base[0]
+
 /*
  * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock
  * means that all timers which are tied to this base via timer->base are
@@ -119,8 +132,8 @@ static inline int hrtimer_clockid_to_base(clockid_t clock_id)
  * be found on the lists/queues.
  *
  * When the timer's base is locked, and the timer removed from list, it is
- * possible to set timer->base = NULL and drop the lock: the timer remains
- * locked.
+ * possible to set timer->base = &migration_base and drop the lock: the timer
+ * remains locked.
  */
 static
 struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
@@ -130,7 +143,7 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
 	for (;;) {
 		base = timer->base;
-		if (likely(base != NULL)) {
+		if (likely(base != &migration_base)) {
 			raw_spin_lock_irqsave(&base->cpu_base->lock, *flags);
 			if (likely(base == timer->base))
 				return base;
@@ -194,8 +207,8 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
 		if (unlikely(hrtimer_callback_running(timer)))
 			return base;
 
-		/* See the comment in lock_timer_base() */
-		timer->base = NULL;
+		/* See the comment in lock_hrtimer_base() */
+		timer->base = &migration_base;
 		raw_spin_unlock(&base->cpu_base->lock);
 		raw_spin_lock(&new_base->cpu_base->lock);
@@ -838,11 +851,7 @@ static int enqueue_hrtimer(struct hrtimer *timer,
 	base->cpu_base->active_bases |= 1 << base->index;
 
-	/*
-	 * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the
-	 * state of a possibly running callback.
-	 */
-	timer->state |= HRTIMER_STATE_ENQUEUED;
+	timer->state = HRTIMER_STATE_ENQUEUED;
 
 	return timerqueue_add(&base->active, &timer->node);
 }
@@ -907,14 +916,9 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool restart)
 		timer_stats_hrtimer_clear_start_info(timer);
 		reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases);
-		if (!restart) {
-			/*
-			 * We must preserve the CALLBACK state flag here,
-			 * otherwise we could move the timer base in
-			 * switch_hrtimer_base.
-			 */
-			state &= HRTIMER_STATE_CALLBACK;
-		}
+
+		if (!restart)
+			state = HRTIMER_STATE_INACTIVE;
+
 		__remove_hrtimer(timer, base, state, reprogram);
 		return 1;
 	}
@@ -1115,6 +1119,51 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
 }
 EXPORT_SYMBOL_GPL(hrtimer_init);
 
+/*
+ * A timer is active, when it is enqueued into the rbtree or the
+ * callback function is running or it's in the state of being migrated
+ * to another cpu.
+ *
+ * It is important for this function to not return a false negative.
+ */
+bool hrtimer_active(const struct hrtimer *timer)
+{
+	struct hrtimer_cpu_base *cpu_base;
+	unsigned int seq;
+
+	do {
+		cpu_base = READ_ONCE(timer->base->cpu_base);
+		seq = raw_read_seqcount_begin(&cpu_base->seq);
+
+		if (timer->state != HRTIMER_STATE_INACTIVE ||
+		    cpu_base->running == timer)
+			return true;
+
+	} while (read_seqcount_retry(&cpu_base->seq, seq) ||
+		 cpu_base != READ_ONCE(timer->base->cpu_base));
+
+	return false;
+}
+EXPORT_SYMBOL_GPL(hrtimer_active);
+
+/*
+ * The write_seqcount_barrier()s in __run_hrtimer() split the thing into 3
+ * distinct sections:
+ *
+ *  - queued:	the timer is queued
+ *  - callback:	the timer is being ran
+ *  - post:	the timer is inactive or (re)queued
+ *
+ * On the read side we ensure we observe timer->state and cpu_base->running
+ * from the same section, if anything changed while we looked at it, we retry.
+ * This includes timer->base changing because sequence numbers alone are
+ * insufficient for that.
+ *
+ * The sequence numbers are required because otherwise we could still observe
+ * a false negative if the read side got smeared over multiple consequtive
+ * __run_hrtimer() invocations.
+ */
 static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
 			  struct hrtimer_clock_base *base,
 			  struct hrtimer *timer, ktime_t *now)
@@ -1122,10 +1171,21 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
 	enum hrtimer_restart (*fn)(struct hrtimer *);
 	int restart;
 
-	WARN_ON(!irqs_disabled());
+	lockdep_assert_held(&cpu_base->lock);
 
 	debug_deactivate(timer);
-	__remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
+	cpu_base->running = timer;
+
+	/*
+	 * Separate the ->running assignment from the ->state assignment.
+	 *
+	 * As with a regular write barrier, this ensures the read side in
+	 * hrtimer_active() cannot observe cpu_base->running == NULL &&
+	 * timer->state == INACTIVE.
+	 */
+	raw_write_seqcount_barrier(&cpu_base->seq);
+
+	__remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);
 	timer_stats_account_hrtimer(timer);
 	fn = timer->function;
@@ -1141,7 +1201,7 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
 	raw_spin_lock(&cpu_base->lock);
 
 	/*
-	 * Note: We clear the CALLBACK bit after enqueue_hrtimer and
+	 * Note: We clear the running state after enqueue_hrtimer and
 	 * we do not reprogramm the event hardware. Happens either in
 	 * hrtimer_start_range_ns() or in hrtimer_interrupt()
 	 *
@@ -1153,9 +1213,17 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
 	    !(timer->state & HRTIMER_STATE_ENQUEUED))
 		enqueue_hrtimer(timer, base);
 
-	WARN_ON_ONCE(!(timer->state & HRTIMER_STATE_CALLBACK));
+	/*
+	 * Separate the ->running assignment from the ->state assignment.
+	 *
+	 * As with a regular write barrier, this ensures the read side in
+	 * hrtimer_active() cannot observe cpu_base->running == NULL &&
+	 * timer->state == INACTIVE.
+	 */
+	raw_write_seqcount_barrier(&cpu_base->seq);
 
-	timer->state &= ~HRTIMER_STATE_CALLBACK;
+	WARN_ON_ONCE(cpu_base->running != timer);
+	cpu_base->running = NULL;
 }
 
 static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
...
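As an aside, the ordering argument in the __run_hrtimer() comments above can be modelled outside the kernel. The following stand-alone C11 sketch is an illustration only: model_timer, model_fire() and model_active() are invented names, and sequentially consistent atomics stand in for the kernel's cheaper raw_write_seqcount_barrier()/raw_read_seqcount_begin() primitives. It shows why a reader that observes neither the enqueued state nor the running marker retries instead of reporting a false negative.

#include <stdatomic.h>
#include <stdbool.h>

struct model_timer {
	atomic_uint	seq;		/* bumped at each section boundary   */
	atomic_bool	enqueued;	/* models timer->state               */
	atomic_bool	running;	/* models cpu_base->running == timer */
};

/* Writer: models one __run_hrtimer() invocation on an enqueued timer. */
static void model_fire(struct model_timer *t, bool (*fn)(void))
{
	atomic_store(&t->running, true);
	atomic_fetch_add(&t->seq, 1);		/* queued -> callback section  */
	atomic_store(&t->enqueued, false);

	if (fn())				/* callback may ask to requeue */
		atomic_store(&t->enqueued, true);

	atomic_fetch_add(&t->seq, 1);		/* callback -> post section    */
	atomic_store(&t->running, false);
}

/* Reader: models hrtimer_active(); must never return a false negative. */
static bool model_active(struct model_timer *t)
{
	unsigned int seq;

	do {
		seq = atomic_load(&t->seq);
		if (atomic_load(&t->enqueued) || atomic_load(&t->running))
			return true;
	} while (atomic_load(&t->seq) != seq);	/* section changed: retry */

	return false;
}

The kernel code additionally re-checks timer->base->cpu_base after the retry loop, because the timer can migrate to another base while it is being examined; the model above leaves that detail out.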