Commit 1f01bf90 authored by Nicholas Piggin, committed by Michael Ellerman

powerpc/watchdog: read TB close to where it is used

When taking watchdog actions, printing messages, comparing and
re-setting wd_smp_last_reset_tb, etc., read TB close to the point of use
and under wd_smp_lock or printing lock (if applicable).

This should keep timebase mostly monotonic with kernel log messages, and
could prevent (in theory) a laggy CPU updating wd_smp_last_reset_tb to
something a long way in the past, and causing other CPUs to appear to be
stuck.

These additional TB reads are all slowpath (lockup has been detected),
so performance does not matter.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Reviewed-by: Laurent Dufour <ldufour@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20211110025056.2084347-5-npiggin@gmail.com
parent 76521c4b
...@@ -157,7 +157,7 @@ static void wd_lockup_ipi(struct pt_regs *regs) ...@@ -157,7 +157,7 @@ static void wd_lockup_ipi(struct pt_regs *regs)
/* Do not panic from here because that can recurse into NMI IPI layer */ /* Do not panic from here because that can recurse into NMI IPI layer */
} }
static bool set_cpu_stuck(int cpu, u64 tb) static bool set_cpu_stuck(int cpu)
{ {
cpumask_set_cpu(cpu, &wd_smp_cpus_stuck); cpumask_set_cpu(cpu, &wd_smp_cpus_stuck);
cpumask_clear_cpu(cpu, &wd_smp_cpus_pending); cpumask_clear_cpu(cpu, &wd_smp_cpus_pending);
...@@ -166,7 +166,7 @@ static bool set_cpu_stuck(int cpu, u64 tb) ...@@ -166,7 +166,7 @@ static bool set_cpu_stuck(int cpu, u64 tb)
*/ */
smp_mb(); smp_mb();
if (cpumask_empty(&wd_smp_cpus_pending)) { if (cpumask_empty(&wd_smp_cpus_pending)) {
wd_smp_last_reset_tb = tb; wd_smp_last_reset_tb = get_tb();
cpumask_andnot(&wd_smp_cpus_pending, cpumask_andnot(&wd_smp_cpus_pending,
&wd_cpus_enabled, &wd_cpus_enabled,
&wd_smp_cpus_stuck); &wd_smp_cpus_stuck);
...@@ -175,14 +175,16 @@ static bool set_cpu_stuck(int cpu, u64 tb) ...@@ -175,14 +175,16 @@ static bool set_cpu_stuck(int cpu, u64 tb)
return false; return false;
} }
static void watchdog_smp_panic(int cpu, u64 tb) static void watchdog_smp_panic(int cpu)
{ {
static cpumask_t wd_smp_cpus_ipi; // protected by reporting static cpumask_t wd_smp_cpus_ipi; // protected by reporting
unsigned long flags; unsigned long flags;
u64 tb;
int c; int c;
wd_smp_lock(&flags); wd_smp_lock(&flags);
/* Double check some things under lock */ /* Double check some things under lock */
tb = get_tb();
if ((s64)(tb - wd_smp_last_reset_tb) < (s64)wd_smp_panic_timeout_tb) if ((s64)(tb - wd_smp_last_reset_tb) < (s64)wd_smp_panic_timeout_tb)
goto out; goto out;
if (cpumask_test_cpu(cpu, &wd_smp_cpus_pending)) if (cpumask_test_cpu(cpu, &wd_smp_cpus_pending))
...@@ -196,7 +198,7 @@ static void watchdog_smp_panic(int cpu, u64 tb) ...@@ -196,7 +198,7 @@ static void watchdog_smp_panic(int cpu, u64 tb)
continue; // should not happen continue; // should not happen
__cpumask_set_cpu(c, &wd_smp_cpus_ipi); __cpumask_set_cpu(c, &wd_smp_cpus_ipi);
if (set_cpu_stuck(c, tb)) if (set_cpu_stuck(c))
break; break;
} }
if (cpumask_empty(&wd_smp_cpus_ipi)) { if (cpumask_empty(&wd_smp_cpus_ipi)) {
...@@ -242,7 +244,7 @@ static void watchdog_smp_panic(int cpu, u64 tb) ...@@ -242,7 +244,7 @@ static void watchdog_smp_panic(int cpu, u64 tb)
wd_smp_unlock(&flags); wd_smp_unlock(&flags);
} }
static void wd_smp_clear_cpu_pending(int cpu, u64 tb) static void wd_smp_clear_cpu_pending(int cpu)
{ {
if (!cpumask_test_cpu(cpu, &wd_smp_cpus_pending)) { if (!cpumask_test_cpu(cpu, &wd_smp_cpus_pending)) {
if (unlikely(cpumask_test_cpu(cpu, &wd_smp_cpus_stuck))) { if (unlikely(cpumask_test_cpu(cpu, &wd_smp_cpus_stuck))) {
...@@ -250,7 +252,7 @@ static void wd_smp_clear_cpu_pending(int cpu, u64 tb) ...@@ -250,7 +252,7 @@ static void wd_smp_clear_cpu_pending(int cpu, u64 tb)
unsigned long flags; unsigned long flags;
pr_emerg("CPU %d became unstuck TB:%lld\n", pr_emerg("CPU %d became unstuck TB:%lld\n",
cpu, tb); cpu, get_tb());
print_irqtrace_events(current); print_irqtrace_events(current);
if (regs) if (regs)
show_regs(regs); show_regs(regs);
...@@ -316,7 +318,7 @@ static void wd_smp_clear_cpu_pending(int cpu, u64 tb) ...@@ -316,7 +318,7 @@ static void wd_smp_clear_cpu_pending(int cpu, u64 tb)
*/ */
wd_smp_lock(&flags); wd_smp_lock(&flags);
if (cpumask_empty(&wd_smp_cpus_pending)) { if (cpumask_empty(&wd_smp_cpus_pending)) {
wd_smp_last_reset_tb = tb; wd_smp_last_reset_tb = get_tb();
cpumask_andnot(&wd_smp_cpus_pending, cpumask_andnot(&wd_smp_cpus_pending,
&wd_cpus_enabled, &wd_cpus_enabled,
&wd_smp_cpus_stuck); &wd_smp_cpus_stuck);
...@@ -331,10 +333,10 @@ static void watchdog_timer_interrupt(int cpu) ...@@ -331,10 +333,10 @@ static void watchdog_timer_interrupt(int cpu)
per_cpu(wd_timer_tb, cpu) = tb; per_cpu(wd_timer_tb, cpu) = tb;
wd_smp_clear_cpu_pending(cpu, tb); wd_smp_clear_cpu_pending(cpu);
if ((s64)(tb - wd_smp_last_reset_tb) >= (s64)wd_smp_panic_timeout_tb) if ((s64)(tb - wd_smp_last_reset_tb) >= (s64)wd_smp_panic_timeout_tb)
watchdog_smp_panic(cpu, tb); watchdog_smp_panic(cpu);
} }
DEFINE_INTERRUPT_HANDLER_NMI(soft_nmi_interrupt) DEFINE_INTERRUPT_HANDLER_NMI(soft_nmi_interrupt)
...@@ -371,7 +373,7 @@ DEFINE_INTERRUPT_HANDLER_NMI(soft_nmi_interrupt) ...@@ -371,7 +373,7 @@ DEFINE_INTERRUPT_HANDLER_NMI(soft_nmi_interrupt)
return 0; return 0;
} }
set_cpu_stuck(cpu, tb); set_cpu_stuck(cpu);
wd_smp_unlock(&flags); wd_smp_unlock(&flags);
...@@ -432,7 +434,7 @@ void arch_touch_nmi_watchdog(void) ...@@ -432,7 +434,7 @@ void arch_touch_nmi_watchdog(void)
tb = get_tb(); tb = get_tb();
if (tb - per_cpu(wd_timer_tb, cpu) >= ticks) { if (tb - per_cpu(wd_timer_tb, cpu) >= ticks) {
per_cpu(wd_timer_tb, cpu) = tb; per_cpu(wd_timer_tb, cpu) = tb;
wd_smp_clear_cpu_pending(cpu, tb); wd_smp_clear_cpu_pending(cpu);
} }
} }
EXPORT_SYMBOL(arch_touch_nmi_watchdog); EXPORT_SYMBOL(arch_touch_nmi_watchdog);
...@@ -490,7 +492,7 @@ static void stop_watchdog(void *arg) ...@@ -490,7 +492,7 @@ static void stop_watchdog(void *arg)
cpumask_clear_cpu(cpu, &wd_cpus_enabled); cpumask_clear_cpu(cpu, &wd_cpus_enabled);
wd_smp_unlock(&flags); wd_smp_unlock(&flags);
wd_smp_clear_cpu_pending(cpu, get_tb()); wd_smp_clear_cpu_pending(cpu);
} }
static int stop_watchdog_on_cpu(unsigned int cpu) static int stop_watchdog_on_cpu(unsigned int cpu)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment