Commit 6fb2489d authored by Linus Torvalds

Merge branch 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler fixes from Thomas Gleixner:

 - The hopefully final fix for the reported race problems in
   kthread_parkme(). The previous attempt still left a hole and was
   partially wrong.

 - Plug a race in the remote tick mechanism which triggers a warning
   about updates not being done correctly. That's a false positive when
   the race is hit while the remote CPU is idle. Plug it by checking the
   condition again while holding the run queue lock.

 - Fix a bug in the utilization estimation of a run queue which causes
   the estimation to be 0 when a run queue is throttled.

 - Advance the global expiration of the period timer when the timer is
   restarted after an idle period. Otherwise the expiry time is stale and
   the timer fires prematurely.

 - Cure the drift between the bandwidth timer and the runqueue
   accounting, which leads to bogus throttling of runqueues.

 - Place the call to cpufreq_update_util() correctly so the function
   will observe the correct number of running RT tasks and not a stale
   one.

* 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  kthread, sched/core: Fix kthread_parkme() (again...)
  sched/util_est: Fix util_est_dequeue() for throttled cfs_rq
  sched/fair: Advance global expiration when period timer is restarted
  sched/fair: Fix bandwidth timer clock drift condition
  sched/rt: Fix call to cpufreq_update_util()
  sched/nohz: Skip remote tick on idle task entirely
parents f5c926b9 1cef1150
@@ -62,7 +62,6 @@ void *kthread_probe_data(struct task_struct *k);
 int kthread_park(struct task_struct *k);
 void kthread_unpark(struct task_struct *k);
 void kthread_parkme(void);
-void kthread_park_complete(struct task_struct *k);
 int kthreadd(void *unused);
 extern struct task_struct *kthreadd_task;
...
@@ -118,7 +118,7 @@ struct task_group;
  * the comment with set_special_state().
  */
 #define is_special_task_state(state)                                   \
-        ((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_DEAD))
+        ((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_PARKED | TASK_DEAD))
 #define __set_current_state(state_value)                               \
         do {                                                           \
...
@@ -177,9 +177,20 @@ void *kthread_probe_data(struct task_struct *task)
 static void __kthread_parkme(struct kthread *self)
 {
         for (;;) {
-                set_current_state(TASK_PARKED);
+                /*
+                 * TASK_PARKED is a special state; we must serialize against
+                 * possible pending wakeups to avoid store-store collisions on
+                 * task->state.
+                 *
+                 * Such a collision might possibly result in the task state
+                 * changing from TASK_PARKED and us failing the
+                 * wait_task_inactive() in kthread_park().
+                 */
+                set_special_state(TASK_PARKED);
                 if (!test_bit(KTHREAD_SHOULD_PARK, &self->flags))
                         break;
+
+                complete_all(&self->parked);
                 schedule();
         }
         __set_current_state(TASK_RUNNING);
@@ -191,11 +202,6 @@ void kthread_parkme(void)
 }
 EXPORT_SYMBOL_GPL(kthread_parkme);
 
-void kthread_park_complete(struct task_struct *k)
-{
-        complete_all(&to_kthread(k)->parked);
-}
-
 static int kthread(void *_create)
 {
         /* Copy data: it's on kthread's stack */
@@ -461,6 +467,9 @@ void kthread_unpark(struct task_struct *k)
         reinit_completion(&kthread->parked);
         clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
+        /*
+         * __kthread_parkme() will either see !SHOULD_PARK or get the wakeup.
+         */
         wake_up_state(k, TASK_PARKED);
 }
 EXPORT_SYMBOL_GPL(kthread_unpark);
@@ -487,7 +496,16 @@ int kthread_park(struct task_struct *k)
         set_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
         if (k != current) {
                 wake_up_process(k);
+                /*
+                 * Wait for __kthread_parkme() to complete(), this means we
+                 * _will_ have TASK_PARKED and are about to call schedule().
+                 */
                 wait_for_completion(&kthread->parked);
+                /*
+                 * Now wait for that schedule() to complete and the task to
+                 * get scheduled out.
+                 */
+                WARN_ON_ONCE(!wait_task_inactive(k, TASK_PARKED));
         }
 
         return 0;
...
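The park handshake above relies on a particular ordering: the parker wakes the worker and then blocks until the worker has announced the parked state, while the worker announces that state before it blocks. The fragment below is a simplified userspace analogue of just that ordering; a mutex, a condition variable and plain flags stand in for the kernel's completion, the KTHREAD_SHOULD_PARK bit and wake_up_state(). All identifiers (struct worker, park(), unpark(), stop_worker()) are invented for illustration, and the sketch deliberately ignores the TASK_PARKED/wait_task_inactive() subtleties that the actual fix addresses.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct worker {
        pthread_mutex_t lock;
        pthread_cond_t  cond;
        bool            should_park;    /* plays the KTHREAD_SHOULD_PARK bit */
        bool            parked;         /* plays the "parked" completion */
        bool            should_stop;
};

static void *worker_fn(void *arg)
{
        struct worker *w = arg;

        pthread_mutex_lock(&w->lock);
        while (!w->should_stop) {
                if (w->should_park) {
                        /* like __kthread_parkme(): announce the parked state... */
                        w->parked = true;
                        pthread_cond_broadcast(&w->cond);       /* ~ complete_all() */
                        /* ...then block, like schedule() in the parked state */
                        while (w->should_park && !w->should_stop)
                                pthread_cond_wait(&w->cond, &w->lock);
                        w->parked = false;
                        continue;
                }
                /* "work" loop: nothing to do here, wait for a state change */
                pthread_cond_wait(&w->cond, &w->lock);
        }
        pthread_mutex_unlock(&w->lock);
        return NULL;
}

static void park(struct worker *w)
{
        pthread_mutex_lock(&w->lock);
        w->should_park = true;
        pthread_cond_broadcast(&w->cond);       /* ~ wake_up_process() */
        while (!w->parked)                      /* ~ wait_for_completion() */
                pthread_cond_wait(&w->cond, &w->lock);
        pthread_mutex_unlock(&w->lock);
}

static void unpark(struct worker *w)
{
        pthread_mutex_lock(&w->lock);
        w->should_park = false;
        pthread_cond_broadcast(&w->cond);       /* ~ wake_up_state(TASK_PARKED) */
        pthread_mutex_unlock(&w->lock);
}

static void stop_worker(struct worker *w)
{
        pthread_mutex_lock(&w->lock);
        w->should_stop = true;
        pthread_cond_broadcast(&w->cond);
        pthread_mutex_unlock(&w->lock);
}

int main(void)
{
        struct worker w = {
                .lock = PTHREAD_MUTEX_INITIALIZER,
                .cond = PTHREAD_COND_INITIALIZER,
        };
        pthread_t tid;

        pthread_create(&tid, NULL, worker_fn, &w);
        park(&w);
        printf("worker is parked\n");
        unpark(&w);
        stop_worker(&w);
        pthread_join(&tid, NULL);
        return 0;
}

Because the worker announces the parked state while holding the lock and only then blocks, the parker cannot return from park() before the worker has actually reached that state; the kernel fix adds the extra wait_task_inactive() step on top of this to also guarantee the worker has been scheduled out.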
@@ -7,7 +7,6 @@
  */
 #include "sched.h"
 
-#include <linux/kthread.h>
 #include <linux/nospec.h>
 
 #include <linux/kcov.h>
@@ -2724,28 +2723,20 @@ static struct rq *finish_task_switch(struct task_struct *prev)
                 membarrier_mm_sync_core_before_usermode(mm);
                 mmdrop(mm);
         }
-        if (unlikely(prev_state & (TASK_DEAD|TASK_PARKED))) {
-                switch (prev_state) {
-                case TASK_DEAD:
-                        if (prev->sched_class->task_dead)
-                                prev->sched_class->task_dead(prev);
-
-                        /*
-                         * Remove function-return probe instances associated with this
-                         * task and put them back on the free list.
-                         */
-                        kprobe_flush_task(prev);
-
-                        /* Task is done with its stack. */
-                        put_task_stack(prev);
-
-                        put_task_struct(prev);
-                        break;
-                case TASK_PARKED:
-                        kthread_park_complete(prev);
-                        break;
-                }
+        if (unlikely(prev_state == TASK_DEAD)) {
+                if (prev->sched_class->task_dead)
+                        prev->sched_class->task_dead(prev);
+
+                /*
+                 * Remove function-return probe instances associated with this
+                 * task and put them back on the free list.
+                 */
+                kprobe_flush_task(prev);
+
+                /* Task is done with its stack. */
+                put_task_stack(prev);
+
+                put_task_struct(prev);
         }
 
         tick_nohz_task_switch();
@@ -3113,7 +3104,9 @@ static void sched_tick_remote(struct work_struct *work)
         struct tick_work *twork = container_of(dwork, struct tick_work, work);
         int cpu = twork->cpu;
         struct rq *rq = cpu_rq(cpu);
+        struct task_struct *curr;
         struct rq_flags rf;
+        u64 delta;
 
         /*
          * Handle the tick only if it appears the remote CPU is running in full
@@ -3122,24 +3115,28 @@ static void sched_tick_remote(struct work_struct *work)
          * statistics and checks timeslices in a time-independent way, regardless
          * of when exactly it is running.
          */
-        if (!idle_cpu(cpu) && tick_nohz_tick_stopped_cpu(cpu)) {
-                struct task_struct *curr;
-                u64 delta;
+        if (idle_cpu(cpu) || !tick_nohz_tick_stopped_cpu(cpu))
+                goto out_requeue;
 
-                rq_lock_irq(rq, &rf);
-                update_rq_clock(rq);
-                curr = rq->curr;
-                delta = rq_clock_task(rq) - curr->se.exec_start;
+        rq_lock_irq(rq, &rf);
+        curr = rq->curr;
+        if (is_idle_task(curr))
+                goto out_unlock;
 
-                /*
-                 * Make sure the next tick runs within a reasonable
-                 * amount of time.
-                 */
-                WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
-                curr->sched_class->task_tick(rq, curr, 0);
-                rq_unlock_irq(rq, &rf);
-        }
+        update_rq_clock(rq);
+        delta = rq_clock_task(rq) - curr->se.exec_start;
+
+        /*
+         * Make sure the next tick runs within a reasonable
+         * amount of time.
+         */
+        WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
+        curr->sched_class->task_tick(rq, curr, 0);
+
+out_unlock:
+        rq_unlock_irq(rq, &rf);
+
+out_requeue:
         /*
          * Run the remote tick once per second (1Hz). This arbitrary
          * frequency is large enough to avoid overload but short enough
...
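The sched_tick_remote() rework is an instance of a general pattern: a cheap unlocked check to skip the common case, followed by a re-check of the same condition once the lock is held, because the state can change in between. The fragment below is a minimal userspace sketch of that pattern using pthreads and C11 atomics; struct cpu_state, remote_tick() and the idle flag are invented stand-ins, not the kernel's types.

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct cpu_state {
        pthread_mutex_t lock;           /* stands in for the rq lock */
        atomic_bool     idle;           /* flipped by the "remote" CPU */
        unsigned long   ticks;          /* only updated under the lock */
};

static void remote_tick(struct cpu_state *cs)
{
        /* Unlocked fast path: don't bother taking the lock for an idle CPU. */
        if (atomic_load(&cs->idle))
                return;

        pthread_mutex_lock(&cs->lock);

        /*
         * The CPU may have gone idle between the check above and taking
         * the lock; re-check before charging it with a tick, otherwise the
         * accounting below would be wrong (the false positive described in
         * the commit message).
         */
        if (atomic_load(&cs->idle))
                goto out_unlock;

        cs->ticks++;                    /* the work that needs the lock */

out_unlock:
        pthread_mutex_unlock(&cs->lock);
}

int main(void)
{
        struct cpu_state cs = {
                .lock = PTHREAD_MUTEX_INITIALIZER,
                .idle = false,
        };

        remote_tick(&cs);               /* CPU busy: accounted */
        atomic_store(&cs.idle, true);
        remote_tick(&cs);               /* CPU idle: skipped */
        printf("ticks accounted: %lu\n", cs.ticks);     /* prints 1 */
        return 0;
}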
@@ -192,7 +192,7 @@ static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu)
 {
         struct rq *rq = cpu_rq(sg_cpu->cpu);
 
-        if (rq->rt.rt_nr_running)
+        if (rt_rq_is_runnable(&rq->rt))
                 return sg_cpu->max;
 
         /*
...
@@ -3982,18 +3982,10 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
         if (!sched_feat(UTIL_EST))
                 return;
 
-        /*
-         * Update root cfs_rq's estimated utilization
-         *
-         * If *p is the last task then the root cfs_rq's estimated utilization
-         * of a CPU is 0 by definition.
-         */
-        ue.enqueued = 0;
-        if (cfs_rq->nr_running) {
-                ue.enqueued  = cfs_rq->avg.util_est.enqueued;
-                ue.enqueued -= min_t(unsigned int, ue.enqueued,
-                                     (_task_util_est(p) | UTIL_AVG_UNCHANGED));
-        }
+        /* Update root cfs_rq's estimated utilization */
+        ue.enqueued  = cfs_rq->avg.util_est.enqueued;
+        ue.enqueued -= min_t(unsigned int, ue.enqueued,
+                             (_task_util_est(p) | UTIL_AVG_UNCHANGED));
+
         WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued);
 
         /*
@@ -4590,6 +4582,7 @@ void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
         now = sched_clock_cpu(smp_processor_id());
         cfs_b->runtime = cfs_b->quota;
         cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
+        cfs_b->expires_seq++;
 }
 
 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
@@ -4612,6 +4605,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
         struct task_group *tg = cfs_rq->tg;
         struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
         u64 amount = 0, min_amount, expires;
+        int expires_seq;
 
         /* note: this is a positive sum as runtime_remaining <= 0 */
         min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
@@ -4628,6 +4622,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
                         cfs_b->idle = 0;
                 }
         }
+        expires_seq = cfs_b->expires_seq;
         expires = cfs_b->runtime_expires;
         raw_spin_unlock(&cfs_b->lock);
 
@@ -4637,8 +4632,10 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
          * spread between our sched_clock and the one on which runtime was
          * issued.
          */
-        if ((s64)(expires - cfs_rq->runtime_expires) > 0)
+        if (cfs_rq->expires_seq != expires_seq) {
+                cfs_rq->expires_seq = expires_seq;
                 cfs_rq->runtime_expires = expires;
+        }
 
         return cfs_rq->runtime_remaining > 0;
 }
@@ -4664,12 +4661,9 @@ static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
          * has not truly expired.
          *
          * Fortunately we can determine whether this is the case by checking
-         * whether the global deadline has advanced. It is valid to compare
-         * cfs_b->runtime_expires without any locks since we only care about
-         * exact equality, so a partial write will still work.
+         * whether the global deadline (cfs_b->expires_seq) has advanced.
          */
-        if (cfs_rq->runtime_expires != cfs_b->runtime_expires) {
+        if (cfs_rq->expires_seq == cfs_b->expires_seq) {
                 /* extend local deadline, drift is bounded above by 2 ticks */
                 cfs_rq->runtime_expires += TICK_NSEC;
         } else {
@@ -5202,13 +5196,18 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 
 void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 {
+        u64 overrun;
+
         lockdep_assert_held(&cfs_b->lock);
 
-        if (!cfs_b->period_active) {
-                cfs_b->period_active = 1;
-                hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
-                hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
-        }
+        if (cfs_b->period_active)
+                return;
+
+        cfs_b->period_active = 1;
+        overrun = hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
+        cfs_b->runtime_expires += (overrun + 1) * ktime_to_ns(cfs_b->period);
+        cfs_b->expires_seq++;
+        hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
 }
 
 static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
...
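The expires_seq changes above replace a cross-CPU timestamp comparison, which is sensitive to sched_clock drift between CPUs, with a sequence-number comparison: the global pool bumps expires_seq every time it publishes a new expiry, and a runqueue's cached runtime is only treated as truly expired once that sequence has moved on. Below is a self-contained model of the idea; struct global_pool and struct local_cache are invented types that merely mirror the relevant fields, not the kernel structures.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct global_pool {
        int      expires_seq;           /* bumped on every refill */
        uint64_t runtime_expires;       /* stamped with the *global* clock */
};

struct local_cache {
        int      expires_seq;           /* snapshot taken at assignment */
        uint64_t runtime_expires;       /* may be skewed vs. the local clock */
};

/* Global side: start a new bandwidth period. */
static void refill(struct global_pool *g, uint64_t now, uint64_t period)
{
        g->runtime_expires = now + period;
        g->expires_seq++;
}

/* Local side: take runtime and snapshot the current period. */
static void assign_runtime(struct local_cache *l, const struct global_pool *g)
{
        if (l->expires_seq != g->expires_seq) {
                l->expires_seq = g->expires_seq;
                l->runtime_expires = g->runtime_expires;
        }
}

/*
 * Local side: has the cached runtime really expired? If the global
 * sequence is unchanged, an apparent expiry is just clock skew and the
 * local deadline can be nudged forward instead of discarding runtime.
 */
static bool runtime_truly_expired(const struct local_cache *l,
                                  const struct global_pool *g)
{
        return l->expires_seq != g->expires_seq;
}

int main(void)
{
        struct global_pool g = { 0 };
        struct local_cache l = { 0 };

        refill(&g, 1000, 100);          /* period 1: expires at 1100 */
        assign_runtime(&l, &g);

        /* Local clock may read past 1100 (skew), but no new period started. */
        printf("truly expired: %d\n", runtime_truly_expired(&l, &g)); /* 0 */

        refill(&g, 1100, 100);          /* period 2 published */
        printf("truly expired: %d\n", runtime_truly_expired(&l, &g)); /* 1 */
        return 0;
}

Comparing sequence numbers sidesteps the question of whose clock the two timestamps were taken from, which is exactly the drift problem the commit message describes.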
@@ -508,8 +508,11 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
 
         rt_se = rt_rq->tg->rt_se[cpu];
 
-        if (!rt_se)
+        if (!rt_se) {
                 dequeue_top_rt_rq(rt_rq);
+                /* Kick cpufreq (see the comment in kernel/sched/sched.h). */
+                cpufreq_update_util(rq_of_rt_rq(rt_rq), 0);
+        }
         else if (on_rt_rq(rt_se))
                 dequeue_rt_entity(rt_se, 0);
 }
@@ -1001,8 +1004,6 @@ dequeue_top_rt_rq(struct rt_rq *rt_rq)
 
         sub_nr_running(rq, rt_rq->rt_nr_running);
         rt_rq->rt_queued = 0;
-
-        /* Kick cpufreq (see the comment in kernel/sched/sched.h). */
-        cpufreq_update_util(rq, 0);
 }
 
 static void
@@ -1014,11 +1015,14 @@ enqueue_top_rt_rq(struct rt_rq *rt_rq)
 
         if (rt_rq->rt_queued)
                 return;
-        if (rt_rq_throttled(rt_rq) || !rt_rq->rt_nr_running)
+
+        if (rt_rq_throttled(rt_rq))
                 return;
 
-        add_nr_running(rq, rt_rq->rt_nr_running);
-        rt_rq->rt_queued = 1;
+        if (rt_rq->rt_nr_running) {
+                add_nr_running(rq, rt_rq->rt_nr_running);
+                rt_rq->rt_queued = 1;
+        }
+
         /* Kick cpufreq (see the comment in kernel/sched/sched.h). */
         cpufreq_update_util(rq, 0);
...
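The rt.c change is about ordering: the cpufreq_update_util() kick has to run after add_nr_running()/sub_nr_running() have updated the runqueue, otherwise the governor samples a stale task count. The toy program below only illustrates why the placement matters; struct toy_rq and governor_update() are stand-ins invented for this sketch, not kernel code.

#include <stdio.h>

struct toy_rq {
        unsigned int nr_running;
};

/* Stand-in for cpufreq_update_util(): the governor samples the rq. */
static void governor_update(const struct toy_rq *rq)
{
        printf("governor sees nr_running = %u\n", rq->nr_running);
}

/* Wrong order: notify first, then update accounting (stale sample). */
static void enqueue_notify_first(struct toy_rq *rq, unsigned int n)
{
        governor_update(rq);            /* still sees the old value */
        rq->nr_running += n;
}

/* Fixed order: update accounting, then notify (fresh sample). */
static void enqueue_update_first(struct toy_rq *rq, unsigned int n)
{
        rq->nr_running += n;
        governor_update(rq);            /* sees the new value */
}

int main(void)
{
        struct toy_rq a = { 0 }, b = { 0 };

        enqueue_notify_first(&a, 2);    /* prints 0: stale */
        enqueue_update_first(&b, 2);    /* prints 2: fresh */
        return 0;
}

With the call placed after the accounting (and, on the dequeue side, issued by the caller once dequeue_top_rt_rq() has run), the governor always observes the post-update count, which is the behaviour the commit message asks for.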
@@ -334,9 +334,10 @@ struct cfs_bandwidth {
         u64                     runtime;
         s64                     hierarchical_quota;
         u64                     runtime_expires;
+        int                     expires_seq;
 
-        int                     idle;
-        int                     period_active;
+        short                   idle;
+        short                   period_active;
         struct hrtimer          period_timer;
         struct hrtimer          slack_timer;
         struct list_head        throttled_cfs_rq;
@@ -551,6 +552,7 @@ struct cfs_rq {
 
 #ifdef CONFIG_CFS_BANDWIDTH
         int                     runtime_enabled;
+        int                     expires_seq;
         u64                     runtime_expires;
         s64                     runtime_remaining;
@@ -609,6 +611,11 @@ struct rt_rq {
 #endif
 };
 
+static inline bool rt_rq_is_runnable(struct rt_rq *rt_rq)
+{
+        return rt_rq->rt_queued && rt_rq->rt_nr_running;
+}
+
 /* Deadline class' related fields in a runqueue */
 struct dl_rq {
         /* runqueue is an rbtree, ordered by deadline */
...