Commit 0adb8f3d authored by Linus Torvalds

Merge branch 'timers-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull timer fix from Ingo Molnar:
 "Fix a timer granularity handling race+bug, which would manifest itself
  by spuriously increasing timeouts of some timers (from 1 jiffy to ~500
  jiffies in the worst case measured) in certain nohz states"

* 'timers-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  timers: Fix excessive granularity of new timers after a nohz idle
parents 53ede64d 2fe59f50
@@ -203,6 +203,7 @@ struct timer_base {
 	bool			migration_enabled;
 	bool			nohz_active;
 	bool			is_idle;
+	bool			must_forward_clk;
 	DECLARE_BITMAP(pending_map, WHEEL_SIZE);
 	struct hlist_head	vectors[WHEEL_SIZE];
 } ____cacheline_aligned;
@@ -856,13 +857,19 @@ get_target_base(struct timer_base *base, unsigned tflags)
 static inline void forward_timer_base(struct timer_base *base)
 {
-	unsigned long jnow = READ_ONCE(jiffies);
+	unsigned long jnow;
 
 	/*
-	 * We only forward the base when it's idle and we have a delta between
-	 * base clock and jiffies.
+	 * We only forward the base when we are idle or have just come out of
+	 * idle (must_forward_clk logic), and have a delta between base clock
+	 * and jiffies. In the common case, run_timers will take care of it.
 	 */
-	if (!base->is_idle || (long) (jnow - base->clk) < 2)
+	if (likely(!base->must_forward_clk))
+		return;
+
+	jnow = READ_ONCE(jiffies);
+	base->must_forward_clk = base->is_idle;
+	if ((long)(jnow - base->clk) < 2)
 		return;
 
 	/*
@@ -938,6 +945,11 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
 	 * same array bucket then just return:
 	 */
 	if (timer_pending(timer)) {
+		/*
+		 * The downside of this optimization is that it can result in
+		 * larger granularity than you would get from adding a new
+		 * timer with this expiry.
+		 */
 		if (timer->expires == expires)
 			return 1;
@@ -948,6 +960,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
 		 * dequeue/enqueue dance.
 		 */
 		base = lock_timer_base(timer, &flags);
+		forward_timer_base(base);
 
 		clk = base->clk;
 		idx = calc_wheel_index(expires, clk);
@@ -964,6 +977,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
 		}
 	} else {
 		base = lock_timer_base(timer, &flags);
+		forward_timer_base(base);
 	}
 
 	ret = detach_if_pending(timer, base, false);
@@ -991,12 +1005,10 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
 			raw_spin_lock(&base->lock);
 			WRITE_ONCE(timer->flags,
 				   (timer->flags & ~TIMER_BASEMASK) | base->cpu);
+			forward_timer_base(base);
 		}
 	}
 
-	/* Try to forward a stale timer base clock */
-	forward_timer_base(base);
-
 	timer->expires = expires;
 	/*
 	 * If 'idx' was calculated above and the base time did not advance
@@ -1112,6 +1124,7 @@ void add_timer_on(struct timer_list *timer, int cpu)
 		WRITE_ONCE(timer->flags,
 			   (timer->flags & ~TIMER_BASEMASK) | cpu);
 	}
+	forward_timer_base(base);
 
 	debug_activate(timer, timer->expires);
 	internal_add_timer(base, timer);
@@ -1497,11 +1510,17 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
 		if (!is_max_delta)
 			expires = basem + (u64)(nextevt - basej) * TICK_NSEC;
 		/*
-		 * If we expect to sleep more than a tick, mark the base idle:
+		 * If we expect to sleep more than a tick, mark the base idle.
+		 * Also the tick is stopped so any added timer must forward
+		 * the base clk itself to keep granularity small. This idle
+		 * logic is only maintained for the BASE_STD base, deferrable
+		 * timers may still see large granularity skew (by design).
 		 */
-		if ((expires - basem) > TICK_NSEC)
+		if ((expires - basem) > TICK_NSEC) {
+			base->must_forward_clk = true;
 			base->is_idle = true;
+		}
 	}
 	raw_spin_unlock(&base->lock);
 
 	return cmp_next_hrtimer_event(basem, expires);
@@ -1611,6 +1630,19 @@ static __latent_entropy void run_timer_softirq(struct softirq_action *h)
 {
 	struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
 
+	/*
+	 * must_forward_clk must be cleared before running timers so that any
+	 * timer functions that call mod_timer will not try to forward the
+	 * base. Idle tracking / clock forwarding logic is only used with
+	 * BASE_STD timers.
+	 *
+	 * The deferrable base does not do idle tracking at all, so we do
+	 * not forward it. This can result in very large variations in
+	 * granularity for deferrable timers, but they can be deferred for
+	 * long periods due to idle.
+	 */
+	base->must_forward_clk = false;
+
 	__run_timers(base);
 	if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active)
 		__run_timers(this_cpu_ptr(&timer_bases[BASE_DEF]));