Commit f7951c33 authored by Linus Torvalds's avatar Linus Torvalds

Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Thomas Gleixner:

 - Cleanup and improvement of NUMA balancing

 - Refactoring and improvements to the PELT (Per Entity Load Tracking)
   code

 - Watchdog simplification and related cleanups

 - The usual pile of small incremental fixes and improvements

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (41 commits)
  watchdog: Reduce message verbosity
  stop_machine: Reflow cpu_stop_queue_two_works()
  sched/numa: Move task_numa_placement() closer to numa_migrate_preferred()
  sched/numa: Use group_weights to identify if migration degrades locality
  sched/numa: Update the scan period without holding the numa_group lock
  sched/numa: Remove numa_has_capacity()
  sched/numa: Modify migrate_swap() to accept additional parameters
  sched/numa: Remove unused task_capacity from 'struct numa_stats'
  sched/numa: Skip nodes that are at 'hoplimit'
  sched/debug: Reverse the order of printing faults
  sched/numa: Use task faults only if numa_group is not yet set up
  sched/numa: Set preferred_node based on best_cpu
  sched/numa: Simplify load_too_imbalanced()
  sched/numa: Evaluate move once per node
  sched/numa: Remove redundant field
  sched/debug: Show the sum wait time of a task group
  sched/fair: Remove #ifdefs from scale_rt_capacity()
  sched/core: Remove get_cpu() from sched_fork()
  sched/cpufreq: Clarify sugov_get_util()
  sched/sysctl: Remove unused sched_time_avg_ms sysctl
  ...
parents 2406fb8d 1b6266eb
......@@ -515,7 +515,7 @@ int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
dvcpu->arch.wait = 0;
if (swq_has_sleeper(&dvcpu->wq))
swake_up(&dvcpu->wq);
swake_up_one(&dvcpu->wq);
return 0;
}
......@@ -1204,7 +1204,7 @@ static void kvm_mips_comparecount_func(unsigned long data)
vcpu->arch.wait = 0;
if (swq_has_sleeper(&vcpu->wq))
swake_up(&vcpu->wq);
swake_up_one(&vcpu->wq);
}
/* low level hrtimer wake routine */
......
......@@ -216,7 +216,7 @@ static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu)
wqp = kvm_arch_vcpu_wq(vcpu);
if (swq_has_sleeper(wqp)) {
swake_up(wqp);
swake_up_one(wqp);
++vcpu->stat.halt_wakeup;
}
......@@ -3188,7 +3188,7 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
}
}
prepare_to_swait(&vc->wq, &wait, TASK_INTERRUPTIBLE);
prepare_to_swait_exclusive(&vc->wq, &wait, TASK_INTERRUPTIBLE);
if (kvmppc_vcore_check_block(vc)) {
finish_swait(&vc->wq, &wait);
......@@ -3311,7 +3311,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
kvmppc_start_thread(vcpu, vc);
trace_kvm_guest_enter(vcpu);
} else if (vc->vcore_state == VCORE_SLEEPING) {
swake_up(&vc->wq);
swake_up_one(&vc->wq);
}
}
......
......@@ -1145,7 +1145,7 @@ void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu)
* yield-candidate.
*/
vcpu->preempted = true;
swake_up(&vcpu->wq);
swake_up_one(&vcpu->wq);
vcpu->stat.halt_wakeup++;
}
/*
......
......@@ -154,7 +154,7 @@ void kvm_async_pf_task_wait(u32 token, int interrupt_kernel)
for (;;) {
if (!n.halted)
prepare_to_swait(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
prepare_to_swait_exclusive(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
if (hlist_unhashed(&n.link))
break;
......@@ -188,7 +188,7 @@ static void apf_task_wake_one(struct kvm_task_sleep_node *n)
if (n->halted)
smp_send_reschedule(n->cpu);
else if (swq_has_sleeper(&n->wq))
swake_up(&n->wq);
swake_up_one(&n->wq);
}
static void apf_task_wake_all(void)
......
......@@ -1379,7 +1379,7 @@ static void apic_timer_expired(struct kvm_lapic *apic)
* using swait_active() is safe.
*/
if (swait_active(q))
swake_up(q);
swake_up_one(q);
if (apic_lvtt_tscdeadline(apic))
ktimer->expired_tscdeadline = ktimer->tscdeadline;
......
......@@ -164,6 +164,7 @@ enum cpuhp_state {
CPUHP_AP_PERF_POWERPC_NEST_IMC_ONLINE,
CPUHP_AP_PERF_POWERPC_CORE_IMC_ONLINE,
CPUHP_AP_PERF_POWERPC_THREAD_IMC_ONLINE,
CPUHP_AP_WATCHDOG_ONLINE,
CPUHP_AP_WORKQUEUE_ONLINE,
CPUHP_AP_RCUTREE_ONLINE,
CPUHP_AP_ONLINE_DYN,
......
......@@ -45,12 +45,18 @@ extern void touch_softlockup_watchdog(void);
extern void touch_softlockup_watchdog_sync(void);
extern void touch_all_softlockup_watchdogs(void);
extern unsigned int softlockup_panic;
#else
extern int lockup_detector_online_cpu(unsigned int cpu);
extern int lockup_detector_offline_cpu(unsigned int cpu);
#else /* CONFIG_SOFTLOCKUP_DETECTOR */
static inline void touch_softlockup_watchdog_sched(void) { }
static inline void touch_softlockup_watchdog(void) { }
static inline void touch_softlockup_watchdog_sync(void) { }
static inline void touch_all_softlockup_watchdogs(void) { }
#endif
#define lockup_detector_online_cpu NULL
#define lockup_detector_offline_cpu NULL
#endif /* CONFIG_SOFTLOCKUP_DETECTOR */
#ifdef CONFIG_DETECT_HUNG_TASK
void reset_hung_task_detector(void);
......
......@@ -1017,7 +1017,6 @@ struct task_struct {
u64 last_sum_exec_runtime;
struct callback_head numa_work;
struct list_head numa_entry;
struct numa_group *numa_group;
/*
......
......@@ -40,7 +40,6 @@ extern unsigned int sysctl_numa_balancing_scan_size;
#ifdef CONFIG_SCHED_DEBUG
extern __read_mostly unsigned int sysctl_sched_migration_cost;
extern __read_mostly unsigned int sysctl_sched_nr_migrate;
extern __read_mostly unsigned int sysctl_sched_time_avg;
int sched_proc_update_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length,
......
......@@ -25,8 +25,6 @@ struct smpboot_thread_data;
* parked (cpu offline)
* @unpark: Optional unpark function, called when the thread is
* unparked (cpu online)
* @cpumask: Internal state. To update which threads are unparked,
* call smpboot_update_cpumask_percpu_thread().
* @selfparking: Thread is not parked by the park function.
* @thread_comm: The base name of the thread
*/
......@@ -40,23 +38,12 @@ struct smp_hotplug_thread {
void (*cleanup)(unsigned int cpu, bool online);
void (*park)(unsigned int cpu);
void (*unpark)(unsigned int cpu);
cpumask_var_t cpumask;
bool selfparking;
const char *thread_comm;
};
int smpboot_register_percpu_thread_cpumask(struct smp_hotplug_thread *plug_thread,
const struct cpumask *cpumask);
static inline int
smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
{
return smpboot_register_percpu_thread_cpumask(plug_thread,
cpu_possible_mask);
}
int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread);
void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread);
void smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread,
const struct cpumask *);
#endif
......@@ -16,7 +16,7 @@
* wait-queues, but the semantics are actually completely different, and
* every single user we have ever had has been buggy (or pointless).
*
* A "swake_up()" only wakes up _one_ waiter, which is not at all what
* A "swake_up_one()" only wakes up _one_ waiter, which is not at all what
* "wake_up()" does, and has led to problems. In other cases, it has
* been fine, because there's only ever one waiter (kvm), but in that
* case gthe whole "simple" wait-queue is just pointless to begin with,
......@@ -38,8 +38,8 @@
* all wakeups are TASK_NORMAL in order to avoid O(n) lookups for the right
* sleeper state.
*
* - the exclusive mode; because this requires preserving the list order
* and this is hard.
* - the !exclusive mode; because that leads to O(n) wakeups, everything is
* exclusive.
*
* - custom wake callback functions; because you cannot give any guarantees
* about random code. This also allows swait to be used in RT, such that
......@@ -115,7 +115,7 @@ extern void __init_swait_queue_head(struct swait_queue_head *q, const char *name
* CPU0 - waker CPU1 - waiter
*
* for (;;) {
* @cond = true; prepare_to_swait(&wq_head, &wait, state);
* @cond = true; prepare_to_swait_exclusive(&wq_head, &wait, state);
* smp_mb(); // smp_mb() from set_current_state()
* if (swait_active(wq_head)) if (@cond)
* wake_up(wq_head); break;
......@@ -157,20 +157,20 @@ static inline bool swq_has_sleeper(struct swait_queue_head *wq)
return swait_active(wq);
}
extern void swake_up(struct swait_queue_head *q);
extern void swake_up_one(struct swait_queue_head *q);
extern void swake_up_all(struct swait_queue_head *q);
extern void swake_up_locked(struct swait_queue_head *q);
extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait);
extern void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state);
extern void prepare_to_swait_exclusive(struct swait_queue_head *q, struct swait_queue *wait, int state);
extern long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state);
extern void __finish_swait(struct swait_queue_head *q, struct swait_queue *wait);
extern void finish_swait(struct swait_queue_head *q, struct swait_queue *wait);
/* as per ___wait_event() but for swait, therefore "exclusive == 0" */
/* as per ___wait_event() but for swait, therefore "exclusive == 1" */
#define ___swait_event(wq, condition, state, ret, cmd) \
({ \
__label__ __out; \
struct swait_queue __wait; \
long __ret = ret; \
\
......@@ -183,20 +183,20 @@ extern void finish_swait(struct swait_queue_head *q, struct swait_queue *wait);
\
if (___wait_is_interruptible(state) && __int) { \
__ret = __int; \
break; \
goto __out; \
} \
\
cmd; \
} \
finish_swait(&wq, &__wait); \
__ret; \
__out: __ret; \
})
#define __swait_event(wq, condition) \
(void)___swait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, \
schedule())
#define swait_event(wq, condition) \
#define swait_event_exclusive(wq, condition) \
do { \
if (condition) \
break; \
......@@ -208,7 +208,7 @@ do { \
TASK_UNINTERRUPTIBLE, timeout, \
__ret = schedule_timeout(__ret))
#define swait_event_timeout(wq, condition, timeout) \
#define swait_event_timeout_exclusive(wq, condition, timeout) \
({ \
long __ret = timeout; \
if (!___wait_cond_timeout(condition)) \
......@@ -220,7 +220,7 @@ do { \
___swait_event(wq, condition, TASK_INTERRUPTIBLE, 0, \
schedule())
#define swait_event_interruptible(wq, condition) \
#define swait_event_interruptible_exclusive(wq, condition) \
({ \
int __ret = 0; \
if (!(condition)) \
......@@ -233,7 +233,7 @@ do { \
TASK_INTERRUPTIBLE, timeout, \
__ret = schedule_timeout(__ret))
#define swait_event_interruptible_timeout(wq, condition, timeout) \
#define swait_event_interruptible_timeout_exclusive(wq, condition, timeout)\
({ \
long __ret = timeout; \
if (!___wait_cond_timeout(condition)) \
......@@ -246,7 +246,7 @@ do { \
(void)___swait_event(wq, condition, TASK_IDLE, 0, schedule())
/**
* swait_event_idle - wait without system load contribution
* swait_event_idle_exclusive - wait without system load contribution
* @wq: the waitqueue to wait on
* @condition: a C expression for the event to wait for
*
......@@ -257,7 +257,7 @@ do { \
* condition and doesn't want to contribute to system load. Signals are
* ignored.
*/
#define swait_event_idle(wq, condition) \
#define swait_event_idle_exclusive(wq, condition) \
do { \
if (condition) \
break; \
......@@ -270,7 +270,7 @@ do { \
__ret = schedule_timeout(__ret))
/**
* swait_event_idle_timeout - wait up to timeout without load contribution
* swait_event_idle_timeout_exclusive - wait up to timeout without load contribution
* @wq: the waitqueue to wait on
* @condition: a C expression for the event to wait for
* @timeout: timeout at which we'll give up in jiffies
......@@ -288,7 +288,7 @@ do { \
* or the remaining jiffies (at least 1) if the @condition evaluated
* to %true before the @timeout elapsed.
*/
#define swait_event_idle_timeout(wq, condition, timeout) \
#define swait_event_idle_timeout_exclusive(wq, condition, timeout) \
({ \
long __ret = timeout; \
if (!___wait_cond_timeout(condition)) \
......
......@@ -1344,6 +1344,11 @@ static struct cpuhp_step cpuhp_hp_states[] = {
.startup.single = perf_event_init_cpu,
.teardown.single = perf_event_exit_cpu,
},
[CPUHP_AP_WATCHDOG_ONLINE] = {
.name = "lockup_detector:online",
.startup.single = lockup_detector_online_cpu,
.teardown.single = lockup_detector_offline_cpu,
},
[CPUHP_AP_WORKQUEUE_ONLINE] = {
.name = "workqueue:online",
.startup.single = workqueue_online_cpu,
......
......@@ -190,7 +190,7 @@ static void __kthread_parkme(struct kthread *self)
if (!test_bit(KTHREAD_SHOULD_PARK, &self->flags))
break;
complete_all(&self->parked);
complete(&self->parked);
schedule();
}
__set_current_state(TASK_RUNNING);
......@@ -471,7 +471,6 @@ void kthread_unpark(struct task_struct *k)
if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags))
__kthread_bind(k, kthread->cpu, TASK_PARKED);
reinit_completion(&kthread->parked);
clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
/*
* __kthread_parkme() will either see !SHOULD_PARK or get the wakeup.
......@@ -499,6 +498,9 @@ int kthread_park(struct task_struct *k)
if (WARN_ON(k->flags & PF_EXITING))
return -ENOSYS;
if (WARN_ON_ONCE(test_bit(KTHREAD_SHOULD_PARK, &kthread->flags)))
return -EBUSY;
set_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
if (k != current) {
wake_up_process(k);
......
......@@ -92,7 +92,7 @@ static void s2idle_enter(void)
/* Push all the CPUs into the idle loop. */
wake_up_all_idle_cpus();
/* Make the current CPU wait so it can enter the idle loop too. */
swait_event(s2idle_wait_head,
swait_event_exclusive(s2idle_wait_head,
s2idle_state == S2IDLE_STATE_WAKE);
cpuidle_pause();
......@@ -160,7 +160,7 @@ void s2idle_wake(void)
raw_spin_lock_irqsave(&s2idle_lock, flags);
if (s2idle_state > S2IDLE_STATE_NONE) {
s2idle_state = S2IDLE_STATE_WAKE;
swake_up(&s2idle_wait_head);
swake_up_one(&s2idle_wait_head);
}
raw_spin_unlock_irqrestore(&s2idle_lock, flags);
}
......
......@@ -110,7 +110,7 @@ void __srcu_read_unlock(struct srcu_struct *sp, int idx)
WRITE_ONCE(sp->srcu_lock_nesting[idx], newval);
if (!newval && READ_ONCE(sp->srcu_gp_waiting))
swake_up(&sp->srcu_wq);
swake_up_one(&sp->srcu_wq);
}
EXPORT_SYMBOL_GPL(__srcu_read_unlock);
......@@ -140,7 +140,7 @@ void srcu_drive_gp(struct work_struct *wp)
idx = sp->srcu_idx;
WRITE_ONCE(sp->srcu_idx, !sp->srcu_idx);
WRITE_ONCE(sp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! */
swait_event(sp->srcu_wq, !READ_ONCE(sp->srcu_lock_nesting[idx]));
swait_event_exclusive(sp->srcu_wq, !READ_ONCE(sp->srcu_lock_nesting[idx]));
WRITE_ONCE(sp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */
/* Invoke the callbacks we removed above. */
......
......@@ -1701,7 +1701,7 @@ static void rcu_gp_kthread_wake(struct rcu_state *rsp)
!READ_ONCE(rsp->gp_flags) ||
!rsp->gp_kthread)
return;
swake_up(&rsp->gp_wq);
swake_up_one(&rsp->gp_wq);
}
/*
......@@ -2015,7 +2015,7 @@ static bool rcu_gp_init(struct rcu_state *rsp)
}
/*
* Helper function for swait_event_idle() wakeup at force-quiescent-state
* Helper function for swait_event_idle_exclusive() wakeup at force-quiescent-state
* time.
*/
static bool rcu_gp_fqs_check_wake(struct rcu_state *rsp, int *gfp)
......@@ -2163,7 +2163,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
READ_ONCE(rsp->gp_seq),
TPS("reqwait"));
rsp->gp_state = RCU_GP_WAIT_GPS;
swait_event_idle(rsp->gp_wq, READ_ONCE(rsp->gp_flags) &
swait_event_idle_exclusive(rsp->gp_wq, READ_ONCE(rsp->gp_flags) &
RCU_GP_FLAG_INIT);
rsp->gp_state = RCU_GP_DONE_GPS;
/* Locking provides needed memory barrier. */
......@@ -2191,7 +2191,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
READ_ONCE(rsp->gp_seq),
TPS("fqswait"));
rsp->gp_state = RCU_GP_WAIT_FQS;
ret = swait_event_idle_timeout(rsp->gp_wq,
ret = swait_event_idle_timeout_exclusive(rsp->gp_wq,
rcu_gp_fqs_check_wake(rsp, &gf), j);
rsp->gp_state = RCU_GP_DOING_FQS;
/* Locking provides needed memory barriers. */
......
......@@ -212,7 +212,7 @@ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
if (wake) {
smp_mb(); /* EGP done before wake_up(). */
swake_up(&rsp->expedited_wq);
swake_up_one(&rsp->expedited_wq);
}
break;
}
......@@ -526,7 +526,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
jiffies_start = jiffies;
for (;;) {
ret = swait_event_timeout(
ret = swait_event_timeout_exclusive(
rsp->expedited_wq,
sync_rcu_preempt_exp_done_unlocked(rnp_root),
jiffies_stall);
......
......@@ -1926,8 +1926,8 @@ static void __wake_nocb_leader(struct rcu_data *rdp, bool force,
WRITE_ONCE(rdp_leader->nocb_leader_sleep, false);
del_timer(&rdp->nocb_timer);
raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
smp_mb(); /* ->nocb_leader_sleep before swake_up(). */
swake_up(&rdp_leader->nocb_wq);
smp_mb(); /* ->nocb_leader_sleep before swake_up_one(). */
swake_up_one(&rdp_leader->nocb_wq);
} else {
raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
}
......@@ -2159,7 +2159,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
*/
trace_rcu_this_gp(rnp, rdp, c, TPS("StartWait"));
for (;;) {
swait_event_interruptible(
swait_event_interruptible_exclusive(
rnp->nocb_gp_wq[rcu_seq_ctr(c) & 0x1],
(d = rcu_seq_done(&rnp->gp_seq, c)));
if (likely(d))
......@@ -2188,7 +2188,7 @@ static void nocb_leader_wait(struct rcu_data *my_rdp)
/* Wait for callbacks to appear. */
if (!rcu_nocb_poll) {
trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, TPS("Sleep"));
swait_event_interruptible(my_rdp->nocb_wq,
swait_event_interruptible_exclusive(my_rdp->nocb_wq,
!READ_ONCE(my_rdp->nocb_leader_sleep));
raw_spin_lock_irqsave(&my_rdp->nocb_lock, flags);
my_rdp->nocb_leader_sleep = true;
......@@ -2253,7 +2253,7 @@ static void nocb_leader_wait(struct rcu_data *my_rdp)
raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
if (rdp != my_rdp && tail == &rdp->nocb_follower_head) {
/* List was empty, so wake up the follower. */
swake_up(&rdp->nocb_wq);
swake_up_one(&rdp->nocb_wq);
}
}
......@@ -2270,7 +2270,7 @@ static void nocb_follower_wait(struct rcu_data *rdp)
{
for (;;) {
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("FollowerSleep"));
swait_event_interruptible(rdp->nocb_wq,
swait_event_interruptible_exclusive(rdp->nocb_wq,
READ_ONCE(rdp->nocb_follower_head));
if (smp_load_acquire(&rdp->nocb_follower_head)) {
/* ^^^ Ensure CB invocation follows _head test. */
......
......@@ -20,7 +20,7 @@ obj-y += core.o loadavg.o clock.o cputime.o
obj-y += idle.o fair.o rt.o deadline.o
obj-y += wait.o wait_bit.o swait.o completion.o
obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o
obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o
obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o
obj-$(CONFIG_SCHEDSTATS) += stats.o
obj-$(CONFIG_SCHED_DEBUG) += debug.o
......
......@@ -17,6 +17,8 @@
#include "../workqueue_internal.h"
#include "../smpboot.h"
#include "pelt.h"
#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>
......@@ -44,14 +46,6 @@ const_debug unsigned int sysctl_sched_features =
*/
const_debug unsigned int sysctl_sched_nr_migrate = 32;
/*
* period over which we average the RT time consumption, measured
* in ms.
*
* default: 1s
*/
const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
/*
* period over which we measure -rt task CPU usage in us.
* default: 1s
......@@ -183,9 +177,9 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
rq->clock_task += delta;
#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
#ifdef HAVE_SCHED_AVG_IRQ
if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
sched_rt_avg_update(rq, irq_delta + steal);
update_irq_load_avg(rq, irq_delta + steal);
#endif
}
......@@ -649,23 +643,6 @@ bool sched_can_stop_tick(struct rq *rq)
return true;
}
#endif /* CONFIG_NO_HZ_FULL */
void sched_avg_update(struct rq *rq)
{
s64 period = sched_avg_period();
while ((s64)(rq_clock(rq) - rq->age_stamp) > period) {
/*
* Inline assembly required to prevent the compiler
* optimising this loop into a divmod call.
* See __iter_div_u64_rem() for another example of this.
*/
asm("" : "+rm" (rq->age_stamp));
rq->age_stamp += period;
rq->rt_avg /= 2;
}
}
#endif /* CONFIG_SMP */
#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
......@@ -1199,6 +1176,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
__set_task_cpu(p, new_cpu);
}
#ifdef CONFIG_NUMA_BALANCING
static void __migrate_swap_task(struct task_struct *p, int cpu)
{
if (task_on_rq_queued(p)) {
......@@ -1280,16 +1258,17 @@ static int migrate_swap_stop(void *data)
/*
* Cross migrate two tasks
*/
int migrate_swap(struct task_struct *cur, struct task_struct *p)
int migrate_swap(struct task_struct *cur, struct task_struct *p,
int target_cpu, int curr_cpu)
{
struct migration_swap_arg arg;
int ret = -EINVAL;
arg = (struct migration_swap_arg){
.src_task = cur,
.src_cpu = task_cpu(cur),
.src_cpu = curr_cpu,
.dst_task = p,
.dst_cpu = task_cpu(p),
.dst_cpu = target_cpu,
};
if (arg.src_cpu == arg.dst_cpu)
......@@ -1314,6 +1293,7 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p)
out:
return ret;
}
#endif /* CONFIG_NUMA_BALANCING */
/*
* wait_task_inactive - wait for a thread to unschedule.
......@@ -2317,7 +2297,6 @@ static inline void init_schedstats(void) {}
int sched_fork(unsigned long clone_flags, struct task_struct *p)
{
unsigned long flags;
int cpu = get_cpu();
__sched_fork(clone_flags, p);
/*
......@@ -2353,14 +2332,12 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
p->sched_reset_on_fork = 0;
}
if (dl_prio(p->prio)) {
put_cpu();
if (dl_prio(p->prio))
return -EAGAIN;
} else if (rt_prio(p->prio)) {
else if (rt_prio(p->prio))
p->sched_class = &rt_sched_class;
} else {
else
p->sched_class = &fair_sched_class;
}
init_entity_runnable_average(&p->se);
......@@ -2376,7 +2353,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
* We're setting the CPU for the first time, we don't migrate,
* so use __set_task_cpu().
*/
__set_task_cpu(p, cpu);
__set_task_cpu(p, smp_processor_id());
if (p->sched_class->task_fork)
p->sched_class->task_fork(p);
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
......@@ -2393,8 +2370,6 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
plist_node_init(&p->pushable_tasks, MAX_PRIO);
RB_CLEAR_NODE(&p->pushable_dl_tasks);
#endif
put_cpu();
return 0;
}
......@@ -5714,13 +5689,6 @@ void set_rq_offline(struct rq *rq)
}
}
static void set_cpu_rq_start_time(unsigned int cpu)
{
struct rq *rq = cpu_rq(cpu);
rq->age_stamp = sched_clock_cpu(cpu);
}
/*
* used to mark begin/end of suspend/resume:
*/
......@@ -5838,7 +5806,6 @@ static void sched_rq_cpu_starting(unsigned int cpu)
int sched_cpu_starting(unsigned int cpu)
{
set_cpu_rq_start_time(cpu);
sched_rq_cpu_starting(cpu);
sched_tick_start(cpu);
return 0;
......@@ -6106,7 +6073,6 @@ void __init sched_init(void)
#ifdef CONFIG_SMP
idle_thread_set_boot_cpu();
set_cpu_rq_start_time(smp_processor_id());
#endif
init_sched_fair_class();
......@@ -6785,6 +6751,16 @@ static int cpu_cfs_stat_show(struct seq_file *sf, void *v)
seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled);
seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time);
if (schedstat_enabled() && tg != &root_task_group) {
u64 ws = 0;
int i;
for_each_possible_cpu(i)
ws += schedstat_val(tg->se[i]->statistics.wait_sum);
seq_printf(sf, "wait_sum %llu\n", ws);
}
return 0;
}
#endif /* CONFIG_CFS_BANDWIDTH */
......
......@@ -53,9 +53,7 @@ struct sugov_cpu {
unsigned int iowait_boost_max;
u64 last_update;
/* The fields below are only needed when sharing a policy: */
unsigned long util_cfs;
unsigned long util_dl;
unsigned long bw_dl;
unsigned long max;
/* The field below is for single-CPU policies only: */
......@@ -179,33 +177,90 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy,
return cpufreq_driver_resolve_freq(policy, freq);
}
static void sugov_get_util(struct sugov_cpu *sg_cpu)
/*
* This function computes an effective utilization for the given CPU, to be
* used for frequency selection given the linear relation: f = u * f_max.
*
* The scheduler tracks the following metrics:
*
* cpu_util_{cfs,rt,dl,irq}()
* cpu_bw_dl()
*
* Where the cfs,rt and dl util numbers are tracked with the same metric and
* synchronized windows and are thus directly comparable.
*
* The cfs,rt,dl utilization are the running times measured with rq->clock_task
* which excludes things like IRQ and steal-time. These latter are then accrued
* in the irq utilization.
*
* The DL bandwidth number otoh is not a measured metric but a value computed
* based on the task model parameters and gives the minimal utilization
* required to meet deadlines.
*/
static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu)
{
struct rq *rq = cpu_rq(sg_cpu->cpu);
unsigned long util, irq, max;
sg_cpu->max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu);
sg_cpu->util_cfs = cpu_util_cfs(rq);
sg_cpu->util_dl = cpu_util_dl(rq);
}
static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu)
{
struct rq *rq = cpu_rq(sg_cpu->cpu);
sg_cpu->max = max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu);
sg_cpu->bw_dl = cpu_bw_dl(rq);
if (rt_rq_is_runnable(&rq->rt))
return sg_cpu->max;
return max;
/*
* Early check to see if IRQ/steal time saturates the CPU, can be
* because of inaccuracies in how we track these -- see
* update_irq_load_avg().
*/
irq = cpu_util_irq(rq);
if (unlikely(irq >= max))
return max;
/*
* Because the time spend on RT/DL tasks is visible as 'lost' time to
* CFS tasks and we use the same metric to track the effective
* utilization (PELT windows are synchronized) we can directly add them
* to obtain the CPU's actual utilization.
*/
util = cpu_util_cfs(rq);
util += cpu_util_rt(rq);
/*
* We do not make cpu_util_dl() a permanent part of this sum because we
* want to use cpu_bw_dl() later on, but we need to check if the
* CFS+RT+DL sum is saturated (ie. no idle time) such that we select
* f_max when there is no idle time.
*
* NOTE: numerical errors or stop class might cause us to not quite hit
* saturation when we should -- something for later.
*/
if ((util + cpu_util_dl(rq)) >= max)
return max;
/*
* There is still idle time; further improve the number by using the
* irq metric. Because IRQ/steal time is hidden from the task clock we
* need to scale the task numbers:
*
* 1 - irq
* U' = irq + ------- * U
* max
*/
util = scale_irq_capacity(util, irq, max);
util += irq;
/*
* Utilization required by DEADLINE must always be granted while, for
* FAIR, we use blocked utilization of IDLE CPUs as a mechanism to
* gracefully reduce the frequency when no tasks show up for longer
* Bandwidth required by DEADLINE must always be granted while, for
* FAIR and RT, we use blocked utilization of IDLE CPUs as a mechanism
* to gracefully reduce the frequency when no tasks show up for longer
* periods of time.
*
* Ideally we would like to set util_dl as min/guaranteed freq and
* util_cfs + util_dl as requested freq. However, cpufreq is not yet
* ready for such an interface. So, we only do the latter for now.
* Ideally we would like to set bw_dl as min/guaranteed freq and util +
* bw_dl as requested freq. However, cpufreq is not yet ready for such
* an interface. So, we only do the latter for now.
*/
return min(sg_cpu->max, (sg_cpu->util_dl + sg_cpu->util_cfs));
return min(max, util + sg_cpu->bw_dl);
}
/**
......@@ -360,7 +415,7 @@ static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; }
*/
static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu, struct sugov_policy *sg_policy)
{
if (cpu_util_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->util_dl)
if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_dl)
sg_policy->need_freq_update = true;
}
......@@ -383,9 +438,8 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
busy = sugov_cpu_is_busy(sg_cpu);
sugov_get_util(sg_cpu);
util = sugov_get_util(sg_cpu);
max = sg_cpu->max;
util = sugov_aggregate_util(sg_cpu);
sugov_iowait_apply(sg_cpu, time, &util, &max);
next_f = get_next_freq(sg_policy, util, max);
/*
......@@ -424,9 +478,8 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j);
unsigned long j_util, j_max;
sugov_get_util(j_sg_cpu);
j_util = sugov_get_util(j_sg_cpu);
j_max = j_sg_cpu->max;
j_util = sugov_aggregate_util(j_sg_cpu);
sugov_iowait_apply(j_sg_cpu, time, &j_util, &j_max);
if (j_util * max > j_max * util) {
......
......@@ -16,6 +16,7 @@
* Fabio Checconi <fchecconi@gmail.com>
*/
#include "sched.h"
#include "pelt.h"
struct dl_bandwidth def_dl_bandwidth;
......@@ -1179,8 +1180,6 @@ static void update_curr_dl(struct rq *rq)
curr->se.exec_start = now;
cgroup_account_cputime(curr, delta_exec);
sched_rt_avg_update(rq, delta_exec);
if (dl_entity_is_special(dl_se))
return;
......@@ -1761,6 +1760,9 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
deadline_queue_push_tasks(rq);
if (rq->curr->sched_class != &dl_sched_class)
update_dl_rq_load_avg(rq_clock_task(rq), rq, 0);
return p;
}
......@@ -1768,6 +1770,7 @@ static void put_prev_task_dl(struct rq *rq, struct task_struct *p)
{
update_curr_dl(rq);
update_dl_rq_load_avg(rq_clock_task(rq), rq, 1);
if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1)
enqueue_pushable_dl_task(rq, p);
}
......@@ -1784,6 +1787,7 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued)
{
update_curr_dl(rq);
update_dl_rq_load_avg(rq_clock_task(rq), rq, 1);
/*
* Even when we have runtime, update_curr_dl() might have resulted in us
* not being the leftmost task anymore. In that case NEED_RESCHED will
......
......@@ -111,20 +111,19 @@ static int sched_feat_set(char *cmp)
cmp += 3;
}
for (i = 0; i < __SCHED_FEAT_NR; i++) {
if (strcmp(cmp, sched_feat_names[i]) == 0) {
if (neg) {
sysctl_sched_features &= ~(1UL << i);
sched_feat_disable(i);
} else {
sysctl_sched_features |= (1UL << i);
sched_feat_enable(i);
}
break;
}
i = match_string(sched_feat_names, __SCHED_FEAT_NR, cmp);
if (i < 0)
return i;
if (neg) {
sysctl_sched_features &= ~(1UL << i);
sched_feat_disable(i);
} else {
sysctl_sched_features |= (1UL << i);
sched_feat_enable(i);
}
return i;
return 0;
}
static ssize_t
......@@ -133,7 +132,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
{
char buf[64];
char *cmp;
int i;
int ret;
struct inode *inode;
if (cnt > 63)
......@@ -148,10 +147,10 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
/* Ensure the static_key remains in a consistent state */
inode = file_inode(filp);
inode_lock(inode);
i = sched_feat_set(cmp);
ret = sched_feat_set(cmp);
inode_unlock(inode);
if (i == __SCHED_FEAT_NR)
return -EINVAL;
if (ret < 0)
return ret;
*ppos += cnt;
......@@ -843,8 +842,8 @@ void print_numa_stats(struct seq_file *m, int node, unsigned long tsf,
unsigned long tpf, unsigned long gsf, unsigned long gpf)
{
SEQ_printf(m, "numa_faults node=%d ", node);
SEQ_printf(m, "task_private=%lu task_shared=%lu ", tsf, tpf);
SEQ_printf(m, "group_private=%lu group_shared=%lu\n", gsf, gpf);
SEQ_printf(m, "task_private=%lu task_shared=%lu ", tpf, tsf);
SEQ_printf(m, "group_private=%lu group_shared=%lu\n", gpf, gsf);
}
#endif
......
This diff is collapsed.
This diff is collapsed.
#ifdef CONFIG_SMP
int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se);
int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se);
int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq);
int update_rt_rq_load_avg(u64 now, struct rq *rq, int running);
int update_dl_rq_load_avg(u64 now, struct rq *rq, int running);
#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
int update_irq_load_avg(struct rq *rq, u64 running);
#else
static inline int
update_irq_load_avg(struct rq *rq, u64 running)
{
return 0;
}
#endif
/*
* When a task is dequeued, its estimated utilization should not be update if
* its util_avg has not been updated at least once.
* This flag is used to synchronize util_avg updates with util_est updates.
* We map this information into the LSB bit of the utilization saved at
* dequeue time (i.e. util_est.dequeued).
*/
#define UTIL_AVG_UNCHANGED 0x1
static inline void cfs_se_util_change(struct sched_avg *avg)
{
unsigned int enqueued;
if (!sched_feat(UTIL_EST))
return;
/* Avoid store if the flag has been already set */
enqueued = avg->util_est.enqueued;
if (!(enqueued & UTIL_AVG_UNCHANGED))
return;
/* Reset flag to report util_avg has been updated */
enqueued &= ~UTIL_AVG_UNCHANGED;
WRITE_ONCE(avg->util_est.enqueued, enqueued);
}
#else
static inline int
update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
{
return 0;
}
static inline int
update_rt_rq_load_avg(u64 now, struct rq *rq, int running)
{
return 0;
}
static inline int
update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
{
return 0;
}
static inline int
update_irq_load_avg(struct rq *rq, u64 running)
{
return 0;
}
#endif
......@@ -5,6 +5,8 @@
*/
#include "sched.h"
#include "pelt.h"
int sched_rr_timeslice = RR_TIMESLICE;
int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
......@@ -973,8 +975,6 @@ static void update_curr_rt(struct rq *rq)
curr->se.exec_start = now;
cgroup_account_cputime(curr, delta_exec);
sched_rt_avg_update(rq, delta_exec);
if (!rt_bandwidth_enabled())
return;
......@@ -1578,6 +1578,14 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
rt_queue_push_tasks(rq);
/*
* If prev task was rt, put_prev_task() has already updated the
* utilization. We only care of the case where we start to schedule a
* rt task
*/
if (rq->curr->sched_class != &rt_sched_class)
update_rt_rq_load_avg(rq_clock_task(rq), rq, 0);
return p;
}
......@@ -1585,6 +1593,8 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
{
update_curr_rt(rq);
update_rt_rq_load_avg(rq_clock_task(rq), rq, 1);
/*
* The previous task needs to be made eligible for pushing
* if it is still active
......@@ -2314,6 +2324,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
struct sched_rt_entity *rt_se = &p->rt;
update_curr_rt(rq);
update_rt_rq_load_avg(rq_clock_task(rq), rq, 1);
watchdog(rq, p);
......
......@@ -594,6 +594,7 @@ struct rt_rq {
unsigned long rt_nr_total;
int overloaded;
struct plist_head pushable_tasks;
#endif /* CONFIG_SMP */
int rt_queued;
......@@ -673,7 +674,26 @@ struct dl_rq {
u64 bw_ratio;
};
#ifdef CONFIG_FAIR_GROUP_SCHED
/* An entity is a task if it doesn't "own" a runqueue */
#define entity_is_task(se) (!se->my_q)
#else
#define entity_is_task(se) 1
#endif
#ifdef CONFIG_SMP
/*
* XXX we want to get rid of these helpers and use the full load resolution.
*/
static inline long se_weight(struct sched_entity *se)
{
return scale_load_down(se->load.weight);
}
static inline long se_runnable(struct sched_entity *se)
{
return scale_load_down(se->runnable_weight);
}
static inline bool sched_asym_prefer(int a, int b)
{
......@@ -833,8 +853,12 @@ struct rq {
struct list_head cfs_tasks;
u64 rt_avg;
u64 age_stamp;
struct sched_avg avg_rt;
struct sched_avg avg_dl;
#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
#define HAVE_SCHED_AVG_IRQ
struct sched_avg avg_irq;
#endif
u64 idle_stamp;
u64 avg_idle;
......@@ -1075,7 +1099,8 @@ enum numa_faults_stats {
};
extern void sched_setnuma(struct task_struct *p, int node);
extern int migrate_task_to(struct task_struct *p, int cpu);
extern int migrate_swap(struct task_struct *, struct task_struct *);
extern int migrate_swap(struct task_struct *p, struct task_struct *t,
int cpu, int scpu);
extern void init_numa_balancing(unsigned long clone_flags, struct task_struct *p);
#else
static inline void
......@@ -1690,15 +1715,9 @@ extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags);
extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
extern const_debug unsigned int sysctl_sched_time_avg;
extern const_debug unsigned int sysctl_sched_nr_migrate;
extern const_debug unsigned int sysctl_sched_migration_cost;
static inline u64 sched_avg_period(void)
{
return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
}
#ifdef CONFIG_SCHED_HRTICK
/*
......@@ -1735,8 +1754,6 @@ unsigned long arch_scale_freq_capacity(int cpu)
#endif
#ifdef CONFIG_SMP
extern void sched_avg_update(struct rq *rq);
#ifndef arch_scale_cpu_capacity
static __always_inline
unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
......@@ -1747,12 +1764,6 @@ unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
return SCHED_CAPACITY_SCALE;
}
#endif
static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
{
rq->rt_avg += rt_delta * arch_scale_freq_capacity(cpu_of(rq));
sched_avg_update(rq);
}
#else
#ifndef arch_scale_cpu_capacity
static __always_inline
......@@ -1761,8 +1772,6 @@ unsigned long arch_scale_cpu_capacity(void __always_unused *sd, int cpu)
return SCHED_CAPACITY_SCALE;
}
#endif
static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { }
static inline void sched_avg_update(struct rq *rq) { }
#endif
struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
......@@ -2177,11 +2186,16 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
#endif
#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
static inline unsigned long cpu_util_dl(struct rq *rq)
static inline unsigned long cpu_bw_dl(struct rq *rq)
{
return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT;
}
static inline unsigned long cpu_util_dl(struct rq *rq)
{
return READ_ONCE(rq->avg_dl.util_avg);
}
static inline unsigned long cpu_util_cfs(struct rq *rq)
{
unsigned long util = READ_ONCE(rq->cfs.avg.util_avg);
......@@ -2193,4 +2207,37 @@ static inline unsigned long cpu_util_cfs(struct rq *rq)
return util;
}
static inline unsigned long cpu_util_rt(struct rq *rq)
{
return READ_ONCE(rq->avg_rt.util_avg);
}
#endif
#ifdef HAVE_SCHED_AVG_IRQ
static inline unsigned long cpu_util_irq(struct rq *rq)
{
return rq->avg_irq.util_avg;
}
static inline
unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max)
{
util *= (max - irq);
util /= max;
return util;
}
#else
static inline unsigned long cpu_util_irq(struct rq *rq)
{
return 0;
}
static inline
unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max)
{
return util;
}
#endif
......@@ -32,7 +32,7 @@ void swake_up_locked(struct swait_queue_head *q)
}
EXPORT_SYMBOL(swake_up_locked);
void swake_up(struct swait_queue_head *q)
void swake_up_one(struct swait_queue_head *q)
{
unsigned long flags;
......@@ -40,7 +40,7 @@ void swake_up(struct swait_queue_head *q)
swake_up_locked(q);
raw_spin_unlock_irqrestore(&q->lock, flags);
}
EXPORT_SYMBOL(swake_up);
EXPORT_SYMBOL(swake_up_one);
/*
* Does not allow usage from IRQ disabled, since we must be able to
......@@ -69,14 +69,14 @@ void swake_up_all(struct swait_queue_head *q)
}
EXPORT_SYMBOL(swake_up_all);
void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait)
static void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait)
{
wait->task = current;
if (list_empty(&wait->task_list))
list_add(&wait->task_list, &q->task_list);
list_add_tail(&wait->task_list, &q->task_list);
}
void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state)
void prepare_to_swait_exclusive(struct swait_queue_head *q, struct swait_queue *wait, int state)
{
unsigned long flags;
......@@ -85,16 +85,28 @@ void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int
set_current_state(state);
raw_spin_unlock_irqrestore(&q->lock, flags);
}
EXPORT_SYMBOL(prepare_to_swait);
EXPORT_SYMBOL(prepare_to_swait_exclusive);
long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state)
{
if (signal_pending_state(state, current))
return -ERESTARTSYS;
unsigned long flags;
long ret = 0;
prepare_to_swait(q, wait, state);
raw_spin_lock_irqsave(&q->lock, flags);
if (unlikely(signal_pending_state(state, current))) {
/*
* See prepare_to_wait_event(). TL;DR, subsequent swake_up_one()
* must not see us.
*/
list_del_init(&wait->task_list);
ret = -ERESTARTSYS;
} else {
__prepare_to_swait(q, wait);
set_current_state(state);
}
raw_spin_unlock_irqrestore(&q->lock, flags);
return 0;
return ret;
}
EXPORT_SYMBOL(prepare_to_swait_event);
......
......@@ -238,8 +238,7 @@ int smpboot_unpark_threads(unsigned int cpu)
mutex_lock(&smpboot_threads_lock);
list_for_each_entry(cur, &hotplug_threads, list)
if (cpumask_test_cpu(cpu, cur->cpumask))
smpboot_unpark_thread(cur, cpu);
smpboot_unpark_thread(cur, cpu);
mutex_unlock(&smpboot_threads_lock);
return 0;
}
......@@ -280,34 +279,26 @@ static void smpboot_destroy_threads(struct smp_hotplug_thread *ht)
}
/**
* smpboot_register_percpu_thread_cpumask - Register a per_cpu thread related
* smpboot_register_percpu_thread - Register a per_cpu thread related
* to hotplug
* @plug_thread: Hotplug thread descriptor
* @cpumask: The cpumask where threads run
*
* Creates and starts the threads on all online cpus.
*/
int smpboot_register_percpu_thread_cpumask(struct smp_hotplug_thread *plug_thread,
const struct cpumask *cpumask)
int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
{
unsigned int cpu;
int ret = 0;
if (!alloc_cpumask_var(&plug_thread->cpumask, GFP_KERNEL))
return -ENOMEM;
cpumask_copy(plug_thread->cpumask, cpumask);
get_online_cpus();
mutex_lock(&smpboot_threads_lock);
for_each_online_cpu(cpu) {
ret = __smpboot_create_thread(plug_thread, cpu);
if (ret) {
smpboot_destroy_threads(plug_thread);
free_cpumask_var(plug_thread->cpumask);
goto out;
}
if (cpumask_test_cpu(cpu, cpumask))
smpboot_unpark_thread(plug_thread, cpu);
smpboot_unpark_thread(plug_thread, cpu);
}
list_add(&plug_thread->list, &hotplug_threads);
out:
......@@ -315,7 +306,7 @@ int smpboot_register_percpu_thread_cpumask(struct smp_hotplug_thread *plug_threa
put_online_cpus();
return ret;
}
EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread_cpumask);
EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread);
/**
* smpboot_unregister_percpu_thread - Unregister a per_cpu thread related to hotplug
......@@ -331,44 +322,9 @@ void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread)
smpboot_destroy_threads(plug_thread);
mutex_unlock(&smpboot_threads_lock);
put_online_cpus();
free_cpumask_var(plug_thread->cpumask);
}
EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread);
/**
* smpboot_update_cpumask_percpu_thread - Adjust which per_cpu hotplug threads stay parked
* @plug_thread: Hotplug thread descriptor
* @new: Revised mask to use
*
* The cpumask field in the smp_hotplug_thread must not be updated directly
* by the client, but only by calling this function.
* This function can only be called on a registered smp_hotplug_thread.
*/
void smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread,
const struct cpumask *new)
{
struct cpumask *old = plug_thread->cpumask;
static struct cpumask tmp;
unsigned int cpu;
lockdep_assert_cpus_held();
mutex_lock(&smpboot_threads_lock);
/* Park threads that were exclusively enabled on the old mask. */
cpumask_andnot(&tmp, old, new);
for_each_cpu_and(cpu, &tmp, cpu_online_mask)
smpboot_park_thread(plug_thread, cpu);
/* Unpark threads that are exclusively enabled on the new mask. */
cpumask_andnot(&tmp, new, old);
for_each_cpu_and(cpu, &tmp, cpu_online_mask)
smpboot_unpark_thread(plug_thread, cpu);
cpumask_copy(old, new);
mutex_unlock(&smpboot_threads_lock);
}
static DEFINE_PER_CPU(atomic_t, cpu_hotplug_state) = ATOMIC_INIT(CPU_POST_DEAD);
/*
......
......@@ -238,13 +238,24 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1,
struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper, cpu2);
DEFINE_WAKE_Q(wakeq);
int err;
retry:
/*
* The waking up of stopper threads has to happen in the same
* scheduling context as the queueing. Otherwise, there is a
* possibility of one of the above stoppers being woken up by another
* CPU, and preempting us. This will cause us to not wake up the other
* stopper forever.
*/
preempt_disable();
raw_spin_lock_irq(&stopper1->lock);
raw_spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING);
err = -ENOENT;
if (!stopper1->enabled || !stopper2->enabled)
if (!stopper1->enabled || !stopper2->enabled) {
err = -ENOENT;
goto unlock;
}
/*
* Ensure that if we race with __stop_cpus() the stoppers won't get
* queued up in reverse order leading to system deadlock.
......@@ -255,36 +266,30 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1,
* It can be falsely true but it is safe to spin until it is cleared,
* queue_stop_cpus_work() does everything under preempt_disable().
*/
err = -EDEADLK;
if (unlikely(stop_cpus_in_progress))
goto unlock;
if (unlikely(stop_cpus_in_progress)) {
err = -EDEADLK;
goto unlock;
}
err = 0;
__cpu_stop_queue_work(stopper1, work1, &wakeq);
__cpu_stop_queue_work(stopper2, work2, &wakeq);
/*
* The waking up of stopper threads has to happen
* in the same scheduling context as the queueing.
* Otherwise, there is a possibility of one of the
* above stoppers being woken up by another CPU,
* and preempting us. This will cause us to n ot
* wake up the other stopper forever.
*/
preempt_disable();
unlock:
raw_spin_unlock(&stopper2->lock);
raw_spin_unlock_irq(&stopper1->lock);
if (unlikely(err == -EDEADLK)) {
preempt_enable();
while (stop_cpus_in_progress)
cpu_relax();
goto retry;
}
if (!err) {
wake_up_q(&wakeq);
preempt_enable();
}
wake_up_q(&wakeq);
preempt_enable();
return err;
}
......
......@@ -368,14 +368,6 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "sched_time_avg_ms",
.data = &sysctl_sched_time_avg,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = &one,
},
#ifdef CONFIG_SCHEDSTATS
{
.procname = "sched_schedstats",
......
......@@ -18,18 +18,14 @@
#include <linux/init.h>
#include <linux/module.h>
#include <linux/sysctl.h>
#include <linux/smpboot.h>
#include <linux/sched/rt.h>
#include <uapi/linux/sched/types.h>
#include <linux/tick.h>
#include <linux/workqueue.h>
#include <linux/sched/clock.h>
#include <linux/sched/debug.h>
#include <linux/sched/isolation.h>
#include <linux/stop_machine.h>
#include <asm/irq_regs.h>
#include <linux/kvm_para.h>
#include <linux/kthread.h>
static DEFINE_MUTEX(watchdog_mutex);
......@@ -169,11 +165,10 @@ static void lockup_detector_update_enable(void)
unsigned int __read_mostly softlockup_panic =
CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;
static bool softlockup_threads_initialized __read_mostly;
static bool softlockup_initialized __read_mostly;
static u64 __read_mostly sample_period;
static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer);
static DEFINE_PER_CPU(bool, softlockup_touch_sync);
static DEFINE_PER_CPU(bool, soft_watchdog_warn);
......@@ -335,6 +330,27 @@ static void watchdog_interrupt_count(void)
__this_cpu_inc(hrtimer_interrupts);
}
static DEFINE_PER_CPU(struct completion, softlockup_completion);
static DEFINE_PER_CPU(struct cpu_stop_work, softlockup_stop_work);
/*
* The watchdog thread function - touches the timestamp.
*
* It only runs once every sample_period seconds (4 seconds by
* default) to reset the softlockup timestamp. If this gets delayed
* for more than 2*watchdog_thresh seconds then the debug-printout
* triggers in watchdog_timer_fn().
*/
static int softlockup_fn(void *data)
{
__this_cpu_write(soft_lockup_hrtimer_cnt,
__this_cpu_read(hrtimer_interrupts));
__touch_watchdog();
complete(this_cpu_ptr(&softlockup_completion));
return 0;
}
/* watchdog kicker functions */
static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
{
......@@ -350,7 +366,12 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
watchdog_interrupt_count();
/* kick the softlockup detector */
wake_up_process(__this_cpu_read(softlockup_watchdog));
if (completion_done(this_cpu_ptr(&softlockup_completion))) {
reinit_completion(this_cpu_ptr(&softlockup_completion));
stop_one_cpu_nowait(smp_processor_id(),
softlockup_fn, NULL,
this_cpu_ptr(&softlockup_stop_work));
}
/* .. and repeat */
hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period));
......@@ -448,16 +469,15 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
return HRTIMER_RESTART;
}
static void watchdog_set_prio(unsigned int policy, unsigned int prio)
{
struct sched_param param = { .sched_priority = prio };
sched_setscheduler(current, policy, &param);
}
static void watchdog_enable(unsigned int cpu)
{
struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer);
struct completion *done = this_cpu_ptr(&softlockup_completion);
WARN_ON_ONCE(cpu != smp_processor_id());
init_completion(done);
complete(done);
/*
* Start the timer first to prevent the NMI watchdog triggering
......@@ -473,15 +493,14 @@ static void watchdog_enable(unsigned int cpu)
/* Enable the perf event */
if (watchdog_enabled & NMI_WATCHDOG_ENABLED)
watchdog_nmi_enable(cpu);
watchdog_set_prio(SCHED_FIFO, MAX_RT_PRIO - 1);
}
static void watchdog_disable(unsigned int cpu)
{
struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer);
watchdog_set_prio(SCHED_NORMAL, 0);
WARN_ON_ONCE(cpu != smp_processor_id());
/*
* Disable the perf event first. That prevents that a large delay
* between disabling the timer and disabling the perf event causes
......@@ -489,79 +508,66 @@ static void watchdog_disable(unsigned int cpu)
*/
watchdog_nmi_disable(cpu);
hrtimer_cancel(hrtimer);
wait_for_completion(this_cpu_ptr(&softlockup_completion));
}
static void watchdog_cleanup(unsigned int cpu, bool online)
static int softlockup_stop_fn(void *data)
{
watchdog_disable(cpu);
watchdog_disable(smp_processor_id());
return 0;
}
static int watchdog_should_run(unsigned int cpu)
static void softlockup_stop_all(void)
{
return __this_cpu_read(hrtimer_interrupts) !=
__this_cpu_read(soft_lockup_hrtimer_cnt);
int cpu;
if (!softlockup_initialized)
return;
for_each_cpu(cpu, &watchdog_allowed_mask)
smp_call_on_cpu(cpu, softlockup_stop_fn, NULL, false);
cpumask_clear(&watchdog_allowed_mask);
}
/*
* The watchdog thread function - touches the timestamp.
*
* It only runs once every sample_period seconds (4 seconds by
* default) to reset the softlockup timestamp. If this gets delayed
* for more than 2*watchdog_thresh seconds then the debug-printout
* triggers in watchdog_timer_fn().
*/
static void watchdog(unsigned int cpu)
static int softlockup_start_fn(void *data)
{
__this_cpu_write(soft_lockup_hrtimer_cnt,
__this_cpu_read(hrtimer_interrupts));
__touch_watchdog();
watchdog_enable(smp_processor_id());
return 0;
}
static struct smp_hotplug_thread watchdog_threads = {
.store = &softlockup_watchdog,
.thread_should_run = watchdog_should_run,
.thread_fn = watchdog,
.thread_comm = "watchdog/%u",
.setup = watchdog_enable,
.cleanup = watchdog_cleanup,
.park = watchdog_disable,
.unpark = watchdog_enable,
};
static void softlockup_update_smpboot_threads(void)
static void softlockup_start_all(void)
{
lockdep_assert_held(&watchdog_mutex);
if (!softlockup_threads_initialized)
return;
int cpu;
smpboot_update_cpumask_percpu_thread(&watchdog_threads,
&watchdog_allowed_mask);
cpumask_copy(&watchdog_allowed_mask, &watchdog_cpumask);
for_each_cpu(cpu, &watchdog_allowed_mask)
smp_call_on_cpu(cpu, softlockup_start_fn, NULL, false);
}
/* Temporarily park all watchdog threads */
static void softlockup_park_all_threads(void)
int lockup_detector_online_cpu(unsigned int cpu)
{
cpumask_clear(&watchdog_allowed_mask);
softlockup_update_smpboot_threads();
watchdog_enable(cpu);
return 0;
}
/* Unpark enabled threads */
static void softlockup_unpark_threads(void)
int lockup_detector_offline_cpu(unsigned int cpu)
{
cpumask_copy(&watchdog_allowed_mask, &watchdog_cpumask);
softlockup_update_smpboot_threads();
watchdog_disable(cpu);
return 0;
}
static void lockup_detector_reconfigure(void)
{
cpus_read_lock();
watchdog_nmi_stop();
softlockup_park_all_threads();
softlockup_stop_all();
set_sample_period();
lockup_detector_update_enable();
if (watchdog_enabled && watchdog_thresh)
softlockup_unpark_threads();
softlockup_start_all();
watchdog_nmi_start();
cpus_read_unlock();
/*
......@@ -580,8 +586,6 @@ static void lockup_detector_reconfigure(void)
*/
static __init void lockup_detector_setup(void)
{
int ret;
/*
* If sysctl is off and watchdog got disabled on the command line,
* nothing to do here.
......@@ -592,24 +596,13 @@ static __init void lockup_detector_setup(void)
!(watchdog_enabled && watchdog_thresh))
return;
ret = smpboot_register_percpu_thread_cpumask(&watchdog_threads,
&watchdog_allowed_mask);
if (ret) {
pr_err("Failed to initialize soft lockup detector threads\n");
return;
}
mutex_lock(&watchdog_mutex);
softlockup_threads_initialized = true;
lockup_detector_reconfigure();
softlockup_initialized = true;
mutex_unlock(&watchdog_mutex);
}
#else /* CONFIG_SOFTLOCKUP_DETECTOR */
static inline int watchdog_park_threads(void) { return 0; }
static inline void watchdog_unpark_threads(void) { }
static inline int watchdog_enable_all_cpus(void) { return 0; }
static inline void watchdog_disable_all_cpus(void) { }
static void lockup_detector_reconfigure(void)
{
cpus_read_lock();
......
......@@ -175,8 +175,8 @@ static int hardlockup_detector_event_create(void)
evt = perf_event_create_kernel_counter(wd_attr, cpu, NULL,
watchdog_overflow_callback, NULL);
if (IS_ERR(evt)) {
pr_info("Perf event create on CPU %d failed with %ld\n", cpu,
PTR_ERR(evt));
pr_debug("Perf event create on CPU %d failed with %ld\n", cpu,
PTR_ERR(evt));
return PTR_ERR(evt);
}
this_cpu_write(watchdog_ev, evt);
......
......@@ -604,7 +604,7 @@ void kvm_arm_resume_guest(struct kvm *kvm)
kvm_for_each_vcpu(i, vcpu, kvm) {
vcpu->arch.pause = false;
swake_up(kvm_arch_vcpu_wq(vcpu));
swake_up_one(kvm_arch_vcpu_wq(vcpu));
}
}
......@@ -612,7 +612,7 @@ static void vcpu_req_sleep(struct kvm_vcpu *vcpu)
{
struct swait_queue_head *wq = kvm_arch_vcpu_wq(vcpu);
swait_event_interruptible(*wq, ((!vcpu->arch.power_off) &&
swait_event_interruptible_exclusive(*wq, ((!vcpu->arch.power_off) &&
(!vcpu->arch.pause)));
if (vcpu->arch.power_off || vcpu->arch.pause) {
......
......@@ -155,7 +155,7 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
smp_mb(); /* Make sure the above is visible */
wq = kvm_arch_vcpu_wq(vcpu);
swake_up(wq);
swake_up_one(wq);
return PSCI_RET_SUCCESS;
}
......
......@@ -107,7 +107,7 @@ static void async_pf_execute(struct work_struct *work)
trace_kvm_async_pf_completed(addr, gva);
if (swq_has_sleeper(&vcpu->wq))
swake_up(&vcpu->wq);
swake_up_one(&vcpu->wq);
mmput(mm);
kvm_put_kvm(vcpu->kvm);
......
......@@ -2172,7 +2172,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
kvm_arch_vcpu_blocking(vcpu);
for (;;) {
prepare_to_swait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
prepare_to_swait_exclusive(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
if (kvm_vcpu_check_block(vcpu) < 0)
break;
......@@ -2214,7 +2214,7 @@ bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu)
wqp = kvm_arch_vcpu_wq(vcpu);
if (swq_has_sleeper(wqp)) {
swake_up(wqp);
swake_up_one(wqp);
++vcpu->stat.halt_wakeup;
return true;
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment