Commit 6f3f04c1 authored by Linus Torvalds

Merge tag 'sched-core-2022-05-23' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Ingo Molnar:

 - Updates to scheduler metrics:
     - PELT fixes & enhancements
     - PSI fixes & enhancements
     - Refactor cpu_util_without()

 - Updates to instrumentation/debugging:
     - Remove sched_trace_*() helper functions - can be done via debug
       info
     - Fix double update_rq_clock() warnings

 - Introduce & use "preemption model accessors" to simplify some of the
   Kconfig complexity.

 - Make softirq handling RT-safe.

 - Misc smaller fixes & cleanups.

* tag 'sched-core-2022-05-23' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  topology: Remove unused cpu_cluster_mask()
  sched: Reverse sched_class layout
  sched/deadline: Remove superfluous rq clock update in push_dl_task()
  sched/core: Avoid obvious double update_rq_clock warning
  smp: Make softirq handling RT safe in flush_smp_call_function_queue()
  smp: Rename flush_smp_call_function_from_idle()
  sched: Fix missing prototype warnings
  sched/fair: Remove cfs_rq_tg_path()
  sched/fair: Remove sched_trace_*() helper functions
  sched/fair: Refactor cpu_util_without()
  sched/fair: Revise comment about lb decision matrix
  sched/psi: report zeroes for CPU full at the system level
  sched/fair: Delete useless condition in tg_unthrottle_up()
  sched/fair: Fix cfs_rq_clock_pelt() for throttled cfs_rq
  sched/fair: Move calculate of avg_load to a better location
  mailmap: Update my email address to @redhat.com
  MAINTAINERS: Add myself as scheduler topology reviewer
  psi: Fix trigger being fired unexpectedly at initial
  ftrace: Use preemption model accessors for trace header printout
  kcsan: Use preemption model accessors
parents cfeb2522 991d8d81
...@@ -398,6 +398,7 @@ Vasily Averin <vasily.averin@linux.dev> <vvs@virtuozzo.com> ...@@ -398,6 +398,7 @@ Vasily Averin <vasily.averin@linux.dev> <vvs@virtuozzo.com>
Vasily Averin <vasily.averin@linux.dev> <vvs@openvz.org> Vasily Averin <vasily.averin@linux.dev> <vvs@openvz.org>
Vasily Averin <vasily.averin@linux.dev> <vvs@parallels.com> Vasily Averin <vasily.averin@linux.dev> <vvs@parallels.com>
Vasily Averin <vasily.averin@linux.dev> <vvs@sw.ru> Vasily Averin <vasily.averin@linux.dev> <vvs@sw.ru>
Valentin Schneider <vschneid@redhat.com> <valentin.schneider@arm.com>
Vinod Koul <vkoul@kernel.org> <vinod.koul@intel.com> Vinod Koul <vkoul@kernel.org> <vinod.koul@intel.com>
Vinod Koul <vkoul@kernel.org> <vinod.koul@linux.intel.com> Vinod Koul <vkoul@kernel.org> <vinod.koul@linux.intel.com>
Vinod Koul <vkoul@kernel.org> <vkoul@infradead.org> Vinod Koul <vkoul@kernel.org> <vkoul@infradead.org>
......
...@@ -37,11 +37,7 @@ Pressure interface ...@@ -37,11 +37,7 @@ Pressure interface
Pressure information for each resource is exported through the Pressure information for each resource is exported through the
respective file in /proc/pressure/ -- cpu, memory, and io. respective file in /proc/pressure/ -- cpu, memory, and io.
The format for CPU is as such:: The format is as such::
some avg10=0.00 avg60=0.00 avg300=0.00 total=0
and for memory and IO::
some avg10=0.00 avg60=0.00 avg300=0.00 total=0 some avg10=0.00 avg60=0.00 avg300=0.00 total=0
full avg10=0.00 avg60=0.00 avg300=0.00 total=0 full avg10=0.00 avg60=0.00 avg300=0.00 total=0
...@@ -58,6 +54,9 @@ situation from a state where some tasks are stalled but the CPU is ...@@ -58,6 +54,9 @@ situation from a state where some tasks are stalled but the CPU is
still doing productive work. As such, time spent in this subset of the still doing productive work. As such, time spent in this subset of the
stall state is tracked separately and exported in the "full" averages. stall state is tracked separately and exported in the "full" averages.
CPU full is undefined at the system level, but has been reported
since 5.13, so it is set to zero for backward compatibility.
The ratios (in %) are tracked as recent trends over ten, sixty, and The ratios (in %) are tracked as recent trends over ten, sixty, and
three hundred second windows, which gives insight into short term events three hundred second windows, which gives insight into short term events
as well as medium and long term trends. The total absolute stall time as well as medium and long term trends. The total absolute stall time
......
...@@ -17524,6 +17524,7 @@ R: Steven Rostedt <rostedt@goodmis.org> (SCHED_FIFO/SCHED_RR) ...@@ -17524,6 +17524,7 @@ R: Steven Rostedt <rostedt@goodmis.org> (SCHED_FIFO/SCHED_RR)
R: Ben Segall <bsegall@google.com> (CONFIG_CFS_BANDWIDTH) R: Ben Segall <bsegall@google.com> (CONFIG_CFS_BANDWIDTH)
R: Mel Gorman <mgorman@suse.de> (CONFIG_NUMA_BALANCING) R: Mel Gorman <mgorman@suse.de> (CONFIG_NUMA_BALANCING)
R: Daniel Bristot de Oliveira <bristot@redhat.com> (SCHED_DEADLINE) R: Daniel Bristot de Oliveira <bristot@redhat.com> (SCHED_DEADLINE)
R: Valentin Schneider <vschneid@redhat.com> (TOPOLOGY)
L: linux-kernel@vger.kernel.org L: linux-kernel@vger.kernel.org
S: Maintained S: Maintained
T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched/core T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched/core
......
...@@ -126,13 +126,13 @@ ...@@ -126,13 +126,13 @@
*/ */
#define SCHED_DATA \ #define SCHED_DATA \
STRUCT_ALIGN(); \ STRUCT_ALIGN(); \
__begin_sched_classes = .; \ __sched_class_highest = .; \
*(__idle_sched_class) \
*(__fair_sched_class) \
*(__rt_sched_class) \
*(__dl_sched_class) \
*(__stop_sched_class) \ *(__stop_sched_class) \
__end_sched_classes = .; *(__dl_sched_class) \
*(__rt_sched_class) \
*(__fair_sched_class) \
*(__idle_sched_class) \
__sched_class_lowest = .;
/* The actual configuration determine if the init/exit sections /* The actual configuration determine if the init/exit sections
* are handled as text/data or they can be discarded (which * are handled as text/data or they can be discarded (which
......
...@@ -589,6 +589,15 @@ struct softirq_action ...@@ -589,6 +589,15 @@ struct softirq_action
asmlinkage void do_softirq(void); asmlinkage void do_softirq(void);
asmlinkage void __do_softirq(void); asmlinkage void __do_softirq(void);
#ifdef CONFIG_PREEMPT_RT
/*
 * RT: softirqs raised while flushing the SMP-call queue need special
 * handling (see the out-of-line definition in kernel/softirq.c), so the
 * caller passes a snapshot of the softirq bits pending before the flush.
 */
extern void do_softirq_post_smp_call_flush(unsigned int was_pending);
#else
/* !RT: run pending softirqs directly; the pending snapshot is unused. */
static inline void do_softirq_post_smp_call_flush(unsigned int unused)
{
do_softirq();
}
#endif
extern void open_softirq(int nr, void (*action)(struct softirq_action *)); extern void open_softirq(int nr, void (*action)(struct softirq_action *));
extern void softirq_init(void); extern void softirq_init(void);
extern void __raise_softirq_irqoff(unsigned int nr); extern void __raise_softirq_irqoff(unsigned int nr);
......
...@@ -2382,20 +2382,6 @@ static inline void rseq_syscall(struct pt_regs *regs) ...@@ -2382,20 +2382,6 @@ static inline void rseq_syscall(struct pt_regs *regs)
#endif #endif
const struct sched_avg *sched_trace_cfs_rq_avg(struct cfs_rq *cfs_rq);
char *sched_trace_cfs_rq_path(struct cfs_rq *cfs_rq, char *str, int len);
int sched_trace_cfs_rq_cpu(struct cfs_rq *cfs_rq);
const struct sched_avg *sched_trace_rq_avg_rt(struct rq *rq);
const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq);
const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq);
int sched_trace_rq_cpu(struct rq *rq);
int sched_trace_rq_cpu_capacity(struct rq *rq);
int sched_trace_rq_nr_running(struct rq *rq);
const struct cpumask *sched_trace_rd_span(struct root_domain *rd);
#ifdef CONFIG_SCHED_CORE #ifdef CONFIG_SCHED_CORE
extern void sched_core_free(struct task_struct *tsk); extern void sched_core_free(struct task_struct *tsk);
extern void sched_core_fork(struct task_struct *p); extern void sched_core_fork(struct task_struct *p);
...@@ -2406,4 +2392,6 @@ static inline void sched_core_free(struct task_struct *tsk) { } ...@@ -2406,4 +2392,6 @@ static inline void sched_core_free(struct task_struct *tsk) { }
static inline void sched_core_fork(struct task_struct *p) { } static inline void sched_core_fork(struct task_struct *p) { }
#endif #endif
extern void sched_set_stop_task(int cpu, struct task_struct *stop);
#endif #endif
...@@ -240,13 +240,6 @@ static inline const struct cpumask *cpu_smt_mask(int cpu) ...@@ -240,13 +240,6 @@ static inline const struct cpumask *cpu_smt_mask(int cpu)
} }
#endif #endif
#if defined(CONFIG_SCHED_CLUSTER) && !defined(cpu_cluster_mask)
static inline const struct cpumask *cpu_cluster_mask(int cpu)
{
return topology_cluster_cpumask(cpu);
}
#endif
static inline const struct cpumask *cpu_cpu_mask(int cpu) static inline const struct cpumask *cpu_cpu_mask(int cpu)
{ {
return cpumask_of_node(cpu_to_node(cpu)); return cpumask_of_node(cpu_to_node(cpu));
......
...@@ -1380,13 +1380,14 @@ static const void *nthreads_gen_params(const void *prev, char *desc) ...@@ -1380,13 +1380,14 @@ static const void *nthreads_gen_params(const void *prev, char *desc)
else else
nthreads *= 2; nthreads *= 2;
if (!IS_ENABLED(CONFIG_PREEMPT) || !IS_ENABLED(CONFIG_KCSAN_INTERRUPT_WATCHER)) { if (!preempt_model_preemptible() ||
!IS_ENABLED(CONFIG_KCSAN_INTERRUPT_WATCHER)) {
/* /*
* Without any preemption, keep 2 CPUs free for other tasks, one * Without any preemption, keep 2 CPUs free for other tasks, one
* of which is the main test case function checking for * of which is the main test case function checking for
* completion or failure. * completion or failure.
*/ */
const long min_unused_cpus = IS_ENABLED(CONFIG_PREEMPT_NONE) ? 2 : 0; const long min_unused_cpus = preempt_model_none() ? 2 : 0;
const long min_required_cpus = 2 + min_unused_cpus; const long min_required_cpus = 2 + min_unused_cpus;
if (num_online_cpus() < min_required_cpus) { if (num_online_cpus() < min_required_cpus) {
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
/* Headers: */ /* Headers: */
#include <linux/sched/clock.h> #include <linux/sched/clock.h>
#include <linux/sched/cputime.h> #include <linux/sched/cputime.h>
#include <linux/sched/hotplug.h>
#include <linux/sched/posix-timers.h> #include <linux/sched/posix-timers.h>
#include <linux/sched/rt.h> #include <linux/sched/rt.h>
...@@ -31,6 +32,7 @@ ...@@ -31,6 +32,7 @@
#include <uapi/linux/sched/types.h> #include <uapi/linux/sched/types.h>
#include "sched.h" #include "sched.h"
#include "smp.h"
#include "autogroup.h" #include "autogroup.h"
#include "stats.h" #include "stats.h"
......
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#include <linux/sched/debug.h> #include <linux/sched/debug.h>
#include <linux/sched/isolation.h> #include <linux/sched/isolation.h>
#include <linux/sched/loadavg.h> #include <linux/sched/loadavg.h>
#include <linux/sched/nohz.h>
#include <linux/sched/mm.h> #include <linux/sched/mm.h>
#include <linux/sched/rseq_api.h> #include <linux/sched/rseq_api.h>
#include <linux/sched/task_stack.h> #include <linux/sched/task_stack.h>
......
...@@ -26,7 +26,10 @@ ...@@ -26,7 +26,10 @@
#include <linux/topology.h> #include <linux/topology.h>
#include <linux/sched/clock.h> #include <linux/sched/clock.h>
#include <linux/sched/cond_resched.h> #include <linux/sched/cond_resched.h>
#include <linux/sched/cputime.h>
#include <linux/sched/debug.h> #include <linux/sched/debug.h>
#include <linux/sched/hotplug.h>
#include <linux/sched/init.h>
#include <linux/sched/isolation.h> #include <linux/sched/isolation.h>
#include <linux/sched/loadavg.h> #include <linux/sched/loadavg.h>
#include <linux/sched/mm.h> #include <linux/sched/mm.h>
...@@ -610,10 +613,10 @@ void double_rq_lock(struct rq *rq1, struct rq *rq2) ...@@ -610,10 +613,10 @@ void double_rq_lock(struct rq *rq1, struct rq *rq2)
swap(rq1, rq2); swap(rq1, rq2);
raw_spin_rq_lock(rq1); raw_spin_rq_lock(rq1);
if (__rq_lockp(rq1) == __rq_lockp(rq2)) if (__rq_lockp(rq1) != __rq_lockp(rq2))
return; raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING);
raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING); double_rq_clock_clear_update(rq1, rq2);
} }
#endif #endif
...@@ -2190,7 +2193,7 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) ...@@ -2190,7 +2193,7 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
{ {
if (p->sched_class == rq->curr->sched_class) if (p->sched_class == rq->curr->sched_class)
rq->curr->sched_class->check_preempt_curr(rq, p, flags); rq->curr->sched_class->check_preempt_curr(rq, p, flags);
else if (p->sched_class > rq->curr->sched_class) else if (sched_class_above(p->sched_class, rq->curr->sched_class))
resched_curr(rq); resched_curr(rq);
/* /*
...@@ -2408,7 +2411,7 @@ static int migration_cpu_stop(void *data) ...@@ -2408,7 +2411,7 @@ static int migration_cpu_stop(void *data)
* __migrate_task() such that we will not miss enforcing cpus_ptr * __migrate_task() such that we will not miss enforcing cpus_ptr
* during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test. * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
*/ */
flush_smp_call_function_from_idle(); flush_smp_call_function_queue();
raw_spin_lock(&p->pi_lock); raw_spin_lock(&p->pi_lock);
rq_lock(rq, &rf); rq_lock(rq, &rf);
...@@ -5689,7 +5692,7 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) ...@@ -5689,7 +5692,7 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
* higher scheduling class, because otherwise those lose the * higher scheduling class, because otherwise those lose the
* opportunity to pull in more work from other CPUs. * opportunity to pull in more work from other CPUs.
*/ */
if (likely(prev->sched_class <= &fair_sched_class && if (likely(!sched_class_above(prev->sched_class, &fair_sched_class) &&
rq->nr_running == rq->cfs.h_nr_running)) { rq->nr_running == rq->cfs.h_nr_running)) {
p = pick_next_task_fair(rq, prev, rf); p = pick_next_task_fair(rq, prev, rf);
...@@ -9469,11 +9472,11 @@ void __init sched_init(void) ...@@ -9469,11 +9472,11 @@ void __init sched_init(void)
int i; int i;
/* Make sure the linker didn't screw up */ /* Make sure the linker didn't screw up */
BUG_ON(&idle_sched_class + 1 != &fair_sched_class || BUG_ON(&idle_sched_class != &fair_sched_class + 1 ||
&fair_sched_class + 1 != &rt_sched_class || &fair_sched_class != &rt_sched_class + 1 ||
&rt_sched_class + 1 != &dl_sched_class); &rt_sched_class != &dl_sched_class + 1);
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
BUG_ON(&dl_sched_class + 1 != &stop_sched_class); BUG_ON(&dl_sched_class != &stop_sched_class + 1);
#endif #endif
wait_bit_init(); wait_bit_init();
......
...@@ -1220,8 +1220,6 @@ int dl_runtime_exceeded(struct sched_dl_entity *dl_se) ...@@ -1220,8 +1220,6 @@ int dl_runtime_exceeded(struct sched_dl_entity *dl_se)
return (dl_se->runtime <= 0); return (dl_se->runtime <= 0);
} }
extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
/* /*
* This function implements the GRUB accounting rule: * This function implements the GRUB accounting rule:
* according to the GRUB reclaiming algorithm, the runtime is * according to the GRUB reclaiming algorithm, the runtime is
...@@ -1832,6 +1830,7 @@ select_task_rq_dl(struct task_struct *p, int cpu, int flags) ...@@ -1832,6 +1830,7 @@ select_task_rq_dl(struct task_struct *p, int cpu, int flags)
static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused) static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused)
{ {
struct rq_flags rf;
struct rq *rq; struct rq *rq;
if (READ_ONCE(p->__state) != TASK_WAKING) if (READ_ONCE(p->__state) != TASK_WAKING)
...@@ -1843,7 +1842,7 @@ static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused ...@@ -1843,7 +1842,7 @@ static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused
* from try_to_wake_up(). Hence, p->pi_lock is locked, but * from try_to_wake_up(). Hence, p->pi_lock is locked, but
* rq->lock is not... So, lock it * rq->lock is not... So, lock it
*/ */
raw_spin_rq_lock(rq); rq_lock(rq, &rf);
if (p->dl.dl_non_contending) { if (p->dl.dl_non_contending) {
update_rq_clock(rq); update_rq_clock(rq);
sub_running_bw(&p->dl, &rq->dl); sub_running_bw(&p->dl, &rq->dl);
...@@ -1859,7 +1858,7 @@ static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused ...@@ -1859,7 +1858,7 @@ static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused
put_task_struct(p); put_task_struct(p);
} }
sub_rq_bw(&p->dl, &rq->dl); sub_rq_bw(&p->dl, &rq->dl);
raw_spin_rq_unlock(rq); rq_unlock(rq, &rf);
} }
static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
...@@ -2319,13 +2318,7 @@ static int push_dl_task(struct rq *rq) ...@@ -2319,13 +2318,7 @@ static int push_dl_task(struct rq *rq)
deactivate_task(rq, next_task, 0); deactivate_task(rq, next_task, 0);
set_task_cpu(next_task, later_rq->cpu); set_task_cpu(next_task, later_rq->cpu);
activate_task(later_rq, next_task, 0);
/*
* Update the later_rq clock here, because the clock is used
* by the cpufreq_update_util() inside __add_running_bw().
*/
update_rq_clock(later_rq);
activate_task(later_rq, next_task, ENQUEUE_NOCLOCK);
ret = 1; ret = 1;
resched_curr(later_rq); resched_curr(later_rq);
......
This diff is collapsed.
...@@ -327,7 +327,7 @@ static void do_idle(void) ...@@ -327,7 +327,7 @@ static void do_idle(void)
* RCU relies on this call to be done outside of an RCU read-side * RCU relies on this call to be done outside of an RCU read-side
* critical section. * critical section.
*/ */
flush_smp_call_function_from_idle(); flush_smp_call_function_queue();
schedule_idle(); schedule_idle();
if (unlikely(klp_patch_pending(current))) if (unlikely(klp_patch_pending(current)))
......
...@@ -145,9 +145,9 @@ static inline u64 rq_clock_pelt(struct rq *rq) ...@@ -145,9 +145,9 @@ static inline u64 rq_clock_pelt(struct rq *rq)
static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
{ {
if (unlikely(cfs_rq->throttle_count)) if (unlikely(cfs_rq->throttle_count))
return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time; return cfs_rq->throttled_clock_pelt - cfs_rq->throttled_clock_pelt_time;
return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time; return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_pelt_time;
} }
#else #else
static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
......
...@@ -1060,14 +1060,17 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) ...@@ -1060,14 +1060,17 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
mutex_unlock(&group->avgs_lock); mutex_unlock(&group->avgs_lock);
for (full = 0; full < 2; full++) { for (full = 0; full < 2; full++) {
unsigned long avg[3]; unsigned long avg[3] = { 0, };
u64 total; u64 total = 0;
int w; int w;
for (w = 0; w < 3; w++) /* CPU FULL is undefined at the system level */
avg[w] = group->avg[res * 2 + full][w]; if (!(group == &psi_system && res == PSI_CPU && full)) {
total = div_u64(group->total[PSI_AVGS][res * 2 + full], for (w = 0; w < 3; w++)
NSEC_PER_USEC); avg[w] = group->avg[res * 2 + full][w];
total = div_u64(group->total[PSI_AVGS][res * 2 + full],
NSEC_PER_USEC);
}
seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n", seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n",
full ? "full" : "some", full ? "full" : "some",
...@@ -1117,7 +1120,8 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group, ...@@ -1117,7 +1120,8 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
t->state = state; t->state = state;
t->threshold = threshold_us * NSEC_PER_USEC; t->threshold = threshold_us * NSEC_PER_USEC;
t->win.size = window_us * NSEC_PER_USEC; t->win.size = window_us * NSEC_PER_USEC;
window_reset(&t->win, 0, 0, 0); window_reset(&t->win, sched_clock(),
group->total[PSI_POLL][t->state], 0);
t->event = 0; t->event = 0;
t->last_event_time = 0; t->last_event_time = 0;
......
...@@ -871,6 +871,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) ...@@ -871,6 +871,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
int enqueue = 0; int enqueue = 0;
struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
struct rq *rq = rq_of_rt_rq(rt_rq); struct rq *rq = rq_of_rt_rq(rt_rq);
struct rq_flags rf;
int skip; int skip;
/* /*
...@@ -885,7 +886,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) ...@@ -885,7 +886,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
if (skip) if (skip)
continue; continue;
raw_spin_rq_lock(rq); rq_lock(rq, &rf);
update_rq_clock(rq); update_rq_clock(rq);
if (rt_rq->rt_time) { if (rt_rq->rt_time) {
...@@ -923,7 +924,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) ...@@ -923,7 +924,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
if (enqueue) if (enqueue)
sched_rt_rq_enqueue(rt_rq); sched_rt_rq_enqueue(rt_rq);
raw_spin_rq_unlock(rq); rq_unlock(rq, &rf);
} }
if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)) if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF))
......
...@@ -603,8 +603,8 @@ struct cfs_rq { ...@@ -603,8 +603,8 @@ struct cfs_rq {
s64 runtime_remaining; s64 runtime_remaining;
u64 throttled_clock; u64 throttled_clock;
u64 throttled_clock_task; u64 throttled_clock_pelt;
u64 throttled_clock_task_time; u64 throttled_clock_pelt_time;
int throttled; int throttled;
int throttle_count; int throttle_count;
struct list_head throttled_list; struct list_head throttled_list;
...@@ -1827,12 +1827,7 @@ static inline void dirty_sched_domain_sysctl(int cpu) ...@@ -1827,12 +1827,7 @@ static inline void dirty_sched_domain_sysctl(int cpu)
#endif #endif
extern int sched_update_scaling(void); extern int sched_update_scaling(void);
#endif /* CONFIG_SMP */
extern void flush_smp_call_function_from_idle(void);
#else /* !CONFIG_SMP: */
static inline void flush_smp_call_function_from_idle(void) { }
#endif
#include "stats.h" #include "stats.h"
...@@ -2182,6 +2177,8 @@ static inline void set_next_task(struct rq *rq, struct task_struct *next) ...@@ -2182,6 +2177,8 @@ static inline void set_next_task(struct rq *rq, struct task_struct *next)
* *
* include/asm-generic/vmlinux.lds.h * include/asm-generic/vmlinux.lds.h
* *
* *CAREFUL* they are laid out in *REVERSE* order!!!
*
* Also enforce alignment on the instance, not the type, to guarantee layout. * Also enforce alignment on the instance, not the type, to guarantee layout.
*/ */
#define DEFINE_SCHED_CLASS(name) \ #define DEFINE_SCHED_CLASS(name) \
...@@ -2190,17 +2187,16 @@ const struct sched_class name##_sched_class \ ...@@ -2190,17 +2187,16 @@ const struct sched_class name##_sched_class \
__section("__" #name "_sched_class") __section("__" #name "_sched_class")
/* Defined in include/asm-generic/vmlinux.lds.h */ /* Defined in include/asm-generic/vmlinux.lds.h */
extern struct sched_class __begin_sched_classes[]; extern struct sched_class __sched_class_highest[];
extern struct sched_class __end_sched_classes[]; extern struct sched_class __sched_class_lowest[];
#define sched_class_highest (__end_sched_classes - 1)
#define sched_class_lowest (__begin_sched_classes - 1)
#define for_class_range(class, _from, _to) \ #define for_class_range(class, _from, _to) \
for (class = (_from); class != (_to); class--) for (class = (_from); class < (_to); class++)
#define for_each_class(class) \ #define for_each_class(class) \
for_class_range(class, sched_class_highest, sched_class_lowest) for_class_range(class, __sched_class_highest, __sched_class_lowest)
#define sched_class_above(_a, _b) ((_a) < (_b))
extern const struct sched_class stop_sched_class; extern const struct sched_class stop_sched_class;
extern const struct sched_class dl_sched_class; extern const struct sched_class dl_sched_class;
...@@ -2309,6 +2305,7 @@ extern void resched_cpu(int cpu); ...@@ -2309,6 +2305,7 @@ extern void resched_cpu(int cpu);
extern struct rt_bandwidth def_rt_bandwidth; extern struct rt_bandwidth def_rt_bandwidth;
extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime); extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime);
extern void init_dl_task_timer(struct sched_dl_entity *dl_se); extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
...@@ -2478,6 +2475,24 @@ unsigned long arch_scale_freq_capacity(int cpu) ...@@ -2478,6 +2475,24 @@ unsigned long arch_scale_freq_capacity(int cpu)
} }
#endif #endif
#ifdef CONFIG_SCHED_DEBUG
/*
 * In double_lock_balance()/double_rq_lock(), we use raw_spin_rq_lock() to
 * acquire rq lock instead of rq_lock(). So at the end of these two functions
 * we need to call double_rq_clock_clear_update() to clear RQCF_UPDATED of
 * rq->clock_update_flags to avoid the WARN_DOUBLE_CLOCK warning.
 */
static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2)
{
/* Masking with the two SKIP bits clears RQCF_UPDATED (and any other bit). */
rq1->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
/* rq1 == rq2 for !CONFIG_SMP, so just clear RQCF_UPDATED once. */
#ifdef CONFIG_SMP
rq2->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
#endif
}
#else
/* !SCHED_DEBUG: WARN_DOUBLE_CLOCK checking is compiled out; nothing to clear. */
static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2) {}
#endif
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
...@@ -2543,14 +2558,15 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) ...@@ -2543,14 +2558,15 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
__acquires(busiest->lock) __acquires(busiest->lock)
__acquires(this_rq->lock) __acquires(this_rq->lock)
{ {
if (__rq_lockp(this_rq) == __rq_lockp(busiest)) if (__rq_lockp(this_rq) == __rq_lockp(busiest) ||
return 0; likely(raw_spin_rq_trylock(busiest))) {
double_rq_clock_clear_update(this_rq, busiest);
if (likely(raw_spin_rq_trylock(busiest)))
return 0; return 0;
}
if (rq_order_less(this_rq, busiest)) { if (rq_order_less(this_rq, busiest)) {
raw_spin_rq_lock_nested(busiest, SINGLE_DEPTH_NESTING); raw_spin_rq_lock_nested(busiest, SINGLE_DEPTH_NESTING);
double_rq_clock_clear_update(this_rq, busiest);
return 0; return 0;
} }
...@@ -2644,6 +2660,7 @@ static inline void double_rq_lock(struct rq *rq1, struct rq *rq2) ...@@ -2644,6 +2660,7 @@ static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
BUG_ON(rq1 != rq2); BUG_ON(rq1 != rq2);
raw_spin_rq_lock(rq1); raw_spin_rq_lock(rq1);
__acquire(rq2->lock); /* Fake it out ;) */ __acquire(rq2->lock); /* Fake it out ;) */
double_rq_clock_clear_update(rq1, rq2);
} }
/* /*
......
...@@ -7,3 +7,9 @@ ...@@ -7,3 +7,9 @@
extern void sched_ttwu_pending(void *arg); extern void sched_ttwu_pending(void *arg);
extern void send_call_function_single_ipi(int cpu); extern void send_call_function_single_ipi(int cpu);
#ifdef CONFIG_SMP
/* Flush pending smp-call-function callbacks from task context (idle,
 * migration thread); defined in kernel/smp.c. */
extern void flush_smp_call_function_queue(void);
#else
/* UP: there is no cross-CPU call-function queue to flush. */
static inline void flush_smp_call_function_queue(void) { }
#endif
...@@ -96,7 +96,7 @@ static DEFINE_PER_CPU_ALIGNED(struct call_function_data, cfd_data); ...@@ -96,7 +96,7 @@ static DEFINE_PER_CPU_ALIGNED(struct call_function_data, cfd_data);
static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue); static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue);
static void flush_smp_call_function_queue(bool warn_cpu_offline); static void __flush_smp_call_function_queue(bool warn_cpu_offline);
int smpcfd_prepare_cpu(unsigned int cpu) int smpcfd_prepare_cpu(unsigned int cpu)
{ {
...@@ -141,7 +141,7 @@ int smpcfd_dying_cpu(unsigned int cpu) ...@@ -141,7 +141,7 @@ int smpcfd_dying_cpu(unsigned int cpu)
* ensure that the outgoing CPU doesn't go offline with work * ensure that the outgoing CPU doesn't go offline with work
* still pending. * still pending.
*/ */
flush_smp_call_function_queue(false); __flush_smp_call_function_queue(false);
irq_work_run(); irq_work_run();
return 0; return 0;
} }
...@@ -544,11 +544,11 @@ void generic_smp_call_function_single_interrupt(void) ...@@ -544,11 +544,11 @@ void generic_smp_call_function_single_interrupt(void)
{ {
cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->gotipi, CFD_SEQ_NOCPU, cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->gotipi, CFD_SEQ_NOCPU,
smp_processor_id(), CFD_SEQ_GOTIPI); smp_processor_id(), CFD_SEQ_GOTIPI);
flush_smp_call_function_queue(true); __flush_smp_call_function_queue(true);
} }
/** /**
* flush_smp_call_function_queue - Flush pending smp-call-function callbacks * __flush_smp_call_function_queue - Flush pending smp-call-function callbacks
* *
* @warn_cpu_offline: If set to 'true', warn if callbacks were queued on an * @warn_cpu_offline: If set to 'true', warn if callbacks were queued on an
* offline CPU. Skip this check if set to 'false'. * offline CPU. Skip this check if set to 'false'.
...@@ -561,7 +561,7 @@ void generic_smp_call_function_single_interrupt(void) ...@@ -561,7 +561,7 @@ void generic_smp_call_function_single_interrupt(void)
* Loop through the call_single_queue and run all the queued callbacks. * Loop through the call_single_queue and run all the queued callbacks.
* Must be called with interrupts disabled. * Must be called with interrupts disabled.
*/ */
static void flush_smp_call_function_queue(bool warn_cpu_offline) static void __flush_smp_call_function_queue(bool warn_cpu_offline)
{ {
call_single_data_t *csd, *csd_next; call_single_data_t *csd, *csd_next;
struct llist_node *entry, *prev; struct llist_node *entry, *prev;
...@@ -684,8 +684,22 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) ...@@ -684,8 +684,22 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
smp_processor_id(), CFD_SEQ_HDLEND); smp_processor_id(), CFD_SEQ_HDLEND);
} }
void flush_smp_call_function_from_idle(void)
/**
* flush_smp_call_function_queue - Flush pending smp-call-function callbacks
* from task context (idle, migration thread)
*
* When TIF_POLLING_NRFLAG is supported and a CPU is in idle and has it
* set, then remote CPUs can avoid sending IPIs and wake the idle CPU by
* setting TIF_NEED_RESCHED. The idle task on the woken up CPU has to
* handle queued SMP function calls before scheduling.
*
* The migration thread has to ensure that an eventually pending wakeup has
* been handled before it migrates a task.
*/
void flush_smp_call_function_queue(void)
{ {
unsigned int was_pending;
unsigned long flags; unsigned long flags;
if (llist_empty(this_cpu_ptr(&call_single_queue))) if (llist_empty(this_cpu_ptr(&call_single_queue)))
...@@ -694,9 +708,11 @@ void flush_smp_call_function_from_idle(void) ...@@ -694,9 +708,11 @@ void flush_smp_call_function_from_idle(void)
cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->idle, CFD_SEQ_NOCPU, cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->idle, CFD_SEQ_NOCPU,
smp_processor_id(), CFD_SEQ_IDLE); smp_processor_id(), CFD_SEQ_IDLE);
local_irq_save(flags); local_irq_save(flags);
flush_smp_call_function_queue(true); /* Get the already pending soft interrupts for RT enabled kernels */
was_pending = local_softirq_pending();
__flush_smp_call_function_queue(true);
if (local_softirq_pending()) if (local_softirq_pending())
do_softirq(); do_softirq_post_smp_call_flush(was_pending);
local_irq_restore(flags); local_irq_restore(flags);
} }
......
...@@ -294,6 +294,19 @@ static inline void invoke_softirq(void) ...@@ -294,6 +294,19 @@ static inline void invoke_softirq(void)
wakeup_softirqd(); wakeup_softirqd();
} }
/*
 * flush_smp_call_function_queue() can raise a soft interrupt in a function
 * call. On RT kernels this is undesired and the only known functionality
 * in the block layer which does this is disabled on RT. If soft interrupts
 * get raised which haven't been raised before the flush, warn so it can be
 * investigated.
 *
 * @was_pending: snapshot of local_softirq_pending() taken before the flush.
 */
void do_softirq_post_smp_call_flush(unsigned int was_pending)
{
/* Only invoke softirqs if the flush raised new ones — and warn about it. */
if (WARN_ON_ONCE(was_pending != local_softirq_pending()))
invoke_softirq();
}
#else /* CONFIG_PREEMPT_RT */ #else /* CONFIG_PREEMPT_RT */
/* /*
......
...@@ -535,8 +535,6 @@ void stop_machine_park(int cpu) ...@@ -535,8 +535,6 @@ void stop_machine_park(int cpu)
kthread_park(stopper->thread); kthread_park(stopper->thread);
} }
extern void sched_set_stop_task(int cpu, struct task_struct *stop);
static void cpu_stop_create(unsigned int cpu) static void cpu_stop_create(unsigned int cpu)
{ {
sched_set_stop_task(cpu, per_cpu(cpu_stopper.thread, cpu)); sched_set_stop_task(cpu, per_cpu(cpu_stopper.thread, cpu));
......
...@@ -4289,17 +4289,11 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) ...@@ -4289,17 +4289,11 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
entries, entries,
total, total,
buf->cpu, buf->cpu,
#if defined(CONFIG_PREEMPT_NONE) preempt_model_none() ? "server" :
"server", preempt_model_voluntary() ? "desktop" :
#elif defined(CONFIG_PREEMPT_VOLUNTARY) preempt_model_full() ? "preempt" :
"desktop", preempt_model_rt() ? "preempt_rt" :
#elif defined(CONFIG_PREEMPT)
"preempt",
#elif defined(CONFIG_PREEMPT_RT)
"preempt_rt",
#else
"unknown", "unknown",
#endif
/* These are reserved for later use */ /* These are reserved for later use */
0, 0, 0, 0); 0, 0, 0, 0);
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment