Commit 620a30fa authored by Frederic Weisbecker, committed by Thomas Gleixner

timers/nohz: Protect idle/iowait sleep time under seqcount

Reading idle/IO sleep time (eg: from /proc/stat) can race with idle exit
updates because the state machine handling the stats is not atomic and
requires a coherent read batch.

As a result reading the sleep time may report irrelevant or backward
values.

Fix this by protecting the simple state machine within a seqcount.
This is expected to be cheap enough not to add measurable performance
impact on the idle path.

Note this only partially fixes the reader vs. writer race condition. A race
remains that involves remote updates of the CPU iowait task counter. It
can hardly be fixed.
Reported-by: Yu Liao <liaoyu15@huawei.com>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20230222144649.624380-4-frederic@kernel.org
parent 07b65a80
...@@ -646,6 +646,7 @@ static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now) ...@@ -646,6 +646,7 @@ static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now)
delta = ktime_sub(now, ts->idle_entrytime); delta = ktime_sub(now, ts->idle_entrytime);
write_seqcount_begin(&ts->idle_sleeptime_seq);
if (nr_iowait_cpu(smp_processor_id()) > 0) if (nr_iowait_cpu(smp_processor_id()) > 0)
ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta); ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
else else
...@@ -653,14 +654,18 @@ static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now) ...@@ -653,14 +654,18 @@ static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now)
ts->idle_entrytime = now; ts->idle_entrytime = now;
ts->idle_active = 0; ts->idle_active = 0;
write_seqcount_end(&ts->idle_sleeptime_seq);
sched_clock_idle_wakeup_event(); sched_clock_idle_wakeup_event();
} }
static void tick_nohz_start_idle(struct tick_sched *ts) static void tick_nohz_start_idle(struct tick_sched *ts)
{ {
write_seqcount_begin(&ts->idle_sleeptime_seq);
ts->idle_entrytime = ktime_get(); ts->idle_entrytime = ktime_get();
ts->idle_active = 1; ts->idle_active = 1;
write_seqcount_end(&ts->idle_sleeptime_seq);
sched_clock_idle_sleep_event(); sched_clock_idle_sleep_event();
} }
...@@ -668,6 +673,7 @@ static u64 get_cpu_sleep_time_us(struct tick_sched *ts, ktime_t *sleeptime, ...@@ -668,6 +673,7 @@ static u64 get_cpu_sleep_time_us(struct tick_sched *ts, ktime_t *sleeptime,
bool compute_delta, u64 *last_update_time) bool compute_delta, u64 *last_update_time)
{ {
ktime_t now, idle; ktime_t now, idle;
unsigned int seq;
if (!tick_nohz_active) if (!tick_nohz_active)
return -1; return -1;
...@@ -676,6 +682,9 @@ static u64 get_cpu_sleep_time_us(struct tick_sched *ts, ktime_t *sleeptime, ...@@ -676,6 +682,9 @@ static u64 get_cpu_sleep_time_us(struct tick_sched *ts, ktime_t *sleeptime,
if (last_update_time) if (last_update_time)
*last_update_time = ktime_to_us(now); *last_update_time = ktime_to_us(now);
do {
seq = read_seqcount_begin(&ts->idle_sleeptime_seq);
if (ts->idle_active && compute_delta) { if (ts->idle_active && compute_delta) {
ktime_t delta = ktime_sub(now, ts->idle_entrytime); ktime_t delta = ktime_sub(now, ts->idle_entrytime);
...@@ -683,6 +692,7 @@ static u64 get_cpu_sleep_time_us(struct tick_sched *ts, ktime_t *sleeptime, ...@@ -683,6 +692,7 @@ static u64 get_cpu_sleep_time_us(struct tick_sched *ts, ktime_t *sleeptime,
} else { } else {
idle = *sleeptime; idle = *sleeptime;
} }
} while (read_seqcount_retry(&ts->idle_sleeptime_seq, seq));
return ktime_to_us(idle); return ktime_to_us(idle);
......
...@@ -75,6 +75,7 @@ struct tick_sched { ...@@ -75,6 +75,7 @@ struct tick_sched {
ktime_t idle_waketime; ktime_t idle_waketime;
/* Idle entry */ /* Idle entry */
seqcount_t idle_sleeptime_seq;
ktime_t idle_entrytime; ktime_t idle_entrytime;
/* Tick stop */ /* Tick stop */
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment