Commit 6fb2489d authored by Linus Torvalds

Merge branch 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler fixes from Thomas Gleixner:

 - The hopefully final fix for the reported race problems in
   kthread_parkme(). The previous attempt still left a hole and was
   partially wrong.

 - Plug a race in the remote tick mechanism which triggers a warning
   about updates not being done correctly. That's a false positive when
   the race is hit while the remote CPU is idle. Plug it by checking the
   condition again while holding the run queue lock.

 - Fix a bug in the utilization estimation of a run queue which causes
   the estimation to be 0 when a run queue is throttled.

 - Advance the global expiration of the period timer when the timer is
   restarted after an idle period. Otherwise the expiry time is stale and
   the timer fires prematurely.

 - Cure the drift between the bandwidth timer and the runqueue
   accounting, which leads to bogus throttling of runqueues.

 - Place the call to cpufreq_update_util() correctly so the function
   will observe the correct number of running RT tasks and not a stale
   one.

* 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  kthread, sched/core: Fix kthread_parkme() (again...)
  sched/util_est: Fix util_est_dequeue() for throttled cfs_rq
  sched/fair: Advance global expiration when period timer is restarted
  sched/fair: Fix bandwidth timer clock drift condition
  sched/rt: Fix call to cpufreq_update_util()
  sched/nohz: Skip remote tick on idle task entirely
parents f5c926b9 1cef1150
@@ -62,7 +62,6 @@ void *kthread_probe_data(struct task_struct *k);
 int kthread_park(struct task_struct *k);
 void kthread_unpark(struct task_struct *k);
 void kthread_parkme(void);
-void kthread_park_complete(struct task_struct *k);
 int kthreadd(void *unused);
 extern struct task_struct *kthreadd_task;
...
@@ -118,7 +118,7 @@ struct task_group;
  * the comment with set_special_state().
  */
 #define is_special_task_state(state)                                   \
-        ((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_DEAD))
+        ((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_PARKED | TASK_DEAD))
 #define __set_current_state(state_value)                               \
         do {                                                           \
...
@@ -177,9 +177,20 @@ void *kthread_probe_data(struct task_struct *task)
 static void __kthread_parkme(struct kthread *self)
 {
         for (;;) {
-                set_current_state(TASK_PARKED);
+                /*
+                 * TASK_PARKED is a special state; we must serialize against
+                 * possible pending wakeups to avoid store-store collisions on
+                 * task->state.
+                 *
+                 * Such a collision might possibly result in the task state
+                 * changing from TASK_PARKED and us failing the
+                 * wait_task_inactive() in kthread_park().
+                 */
+                set_special_state(TASK_PARKED);
                 if (!test_bit(KTHREAD_SHOULD_PARK, &self->flags))
                         break;
+
+                complete_all(&self->parked);
                 schedule();
         }
         __set_current_state(TASK_RUNNING);
@@ -191,11 +202,6 @@ void kthread_parkme(void)
 }
 EXPORT_SYMBOL_GPL(kthread_parkme);
 
-void kthread_park_complete(struct task_struct *k)
-{
-        complete_all(&to_kthread(k)->parked);
-}
-
 static int kthread(void *_create)
 {
         /* Copy data: it's on kthread's stack */
@@ -461,6 +467,9 @@ void kthread_unpark(struct task_struct *k)
         reinit_completion(&kthread->parked);
         clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
+        /*
+         * __kthread_parkme() will either see !SHOULD_PARK or get the wakeup.
+         */
         wake_up_state(k, TASK_PARKED);
 }
 EXPORT_SYMBOL_GPL(kthread_unpark);
@@ -487,7 +496,16 @@ int kthread_park(struct task_struct *k)
         set_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
         if (k != current) {
                 wake_up_process(k);
+                /*
+                 * Wait for __kthread_parkme() to complete(), this means we
+                 * _will_ have TASK_PARKED and are about to call schedule().
+                 */
                 wait_for_completion(&kthread->parked);
+                /*
+                 * Now wait for that schedule() to complete and the task to
+                 * get scheduled out.
+                 */
+                WARN_ON_ONCE(!wait_task_inactive(k, TASK_PARKED));
         }
 
         return 0;
...
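The park handshake above relies on a particular ordering: the parker wakes the worker and then blocks until the worker has announced the parked state, while the worker announces that state before it blocks. The fragment below is a simplified userspace analogue of just that ordering; a mutex, a condition variable and plain flags stand in for the kernel's completion, the KTHREAD_SHOULD_PARK bit and wake_up_state(). All identifiers (struct worker, park(), unpark(), stop_worker()) are invented for illustration, and the sketch deliberately ignores the TASK_PARKED/wait_task_inactive() subtleties that the actual fix addresses.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct worker {
        pthread_mutex_t lock;
        pthread_cond_t  cond;
        bool            should_park;    /* plays the KTHREAD_SHOULD_PARK bit */
        bool            parked;         /* plays the "parked" completion */
        bool            should_stop;
};

static void *worker_fn(void *arg)
{
        struct worker *w = arg;

        pthread_mutex_lock(&w->lock);
        while (!w->should_stop) {
                if (w->should_park) {
                        /* like __kthread_parkme(): announce the parked state... */
                        w->parked = true;
                        pthread_cond_broadcast(&w->cond);       /* ~ complete_all() */
                        /* ...then block, like schedule() in the parked state */
                        while (w->should_park && !w->should_stop)
                                pthread_cond_wait(&w->cond, &w->lock);
                        w->parked = false;
                        continue;
                }
                /* "work" loop: nothing to do here, wait for a state change */
                pthread_cond_wait(&w->cond, &w->lock);
        }
        pthread_mutex_unlock(&w->lock);
        return NULL;
}

static void park(struct worker *w)
{
        pthread_mutex_lock(&w->lock);
        w->should_park = true;
        pthread_cond_broadcast(&w->cond);       /* ~ wake_up_process() */
        while (!w->parked)                      /* ~ wait_for_completion() */
                pthread_cond_wait(&w->cond, &w->lock);
        pthread_mutex_unlock(&w->lock);
}

static void unpark(struct worker *w)
{
        pthread_mutex_lock(&w->lock);
        w->should_park = false;
        pthread_cond_broadcast(&w->cond);       /* ~ wake_up_state(TASK_PARKED) */
        pthread_mutex_unlock(&w->lock);
}

static void stop_worker(struct worker *w)
{
        pthread_mutex_lock(&w->lock);
        w->should_stop = true;
        pthread_cond_broadcast(&w->cond);
        pthread_mutex_unlock(&w->lock);
}

int main(void)
{
        struct worker w = {
                .lock = PTHREAD_MUTEX_INITIALIZER,
                .cond = PTHREAD_COND_INITIALIZER,
        };
        pthread_t tid;

        pthread_create(&tid, NULL, worker_fn, &w);
        park(&w);
        printf("worker is parked\n");
        unpark(&w);
        stop_worker(&w);
        pthread_join(&tid, NULL);
        return 0;
}

Because the worker announces the parked state while holding the lock and only then blocks, the parker cannot return from park() before the worker has actually reached that state; the kernel fix adds the extra wait_task_inactive() step on top of this to also guarantee the worker has been scheduled out.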
@@ -7,7 +7,6 @@
  */
 #include "sched.h"
 
-#include <linux/kthread.h>
 #include <linux/nospec.h>
 
 #include <linux/kcov.h>
@@ -2724,28 +2723,20 @@ static struct rq *finish_task_switch(struct task_struct *prev)
                 membarrier_mm_sync_core_before_usermode(mm);
                 mmdrop(mm);
         }
-        if (unlikely(prev_state & (TASK_DEAD|TASK_PARKED))) {
-                switch (prev_state) {
-                case TASK_DEAD:
-                        if (prev->sched_class->task_dead)
-                                prev->sched_class->task_dead(prev);
-
-                        /*
-                         * Remove function-return probe instances associated with this
-                         * task and put them back on the free list.
-                         */
-                        kprobe_flush_task(prev);
-
-                        /* Task is done with its stack. */
-                        put_task_stack(prev);
-
-                        put_task_struct(prev);
-                        break;
-                case TASK_PARKED:
-                        kthread_park_complete(prev);
-                        break;
-                }
+        if (unlikely(prev_state == TASK_DEAD)) {
+                if (prev->sched_class->task_dead)
+                        prev->sched_class->task_dead(prev);
+
+                /*
+                 * Remove function-return probe instances associated with this
+                 * task and put them back on the free list.
+                 */
+                kprobe_flush_task(prev);
+
+                /* Task is done with its stack. */
+                put_task_stack(prev);
+
+                put_task_struct(prev);
         }
 
         tick_nohz_task_switch();
@@ -3113,7 +3104,9 @@ static void sched_tick_remote(struct work_struct *work)
         struct tick_work *twork = container_of(dwork, struct tick_work, work);
         int cpu = twork->cpu;
         struct rq *rq = cpu_rq(cpu);
+        struct task_struct *curr;
         struct rq_flags rf;
+        u64 delta;
 
         /*
          * Handle the tick only if it appears the remote CPU is running in full
@@ -3122,24 +3115,28 @@ static void sched_tick_remote(struct work_struct *work)
          * statistics and checks timeslices in a time-independent way, regardless
          * of when exactly it is running.
          */
-        if (!idle_cpu(cpu) && tick_nohz_tick_stopped_cpu(cpu)) {
-                struct task_struct *curr;
-                u64 delta;
+        if (idle_cpu(cpu) || !tick_nohz_tick_stopped_cpu(cpu))
+                goto out_requeue;
 
-                rq_lock_irq(rq, &rf);
-                update_rq_clock(rq);
-                curr = rq->curr;
-                delta = rq_clock_task(rq) - curr->se.exec_start;
+        rq_lock_irq(rq, &rf);
+        curr = rq->curr;
+        if (is_idle_task(curr))
+                goto out_unlock;
 
-                /*
-                 * Make sure the next tick runs within a reasonable
-                 * amount of time.
-                 */
-                WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
-                curr->sched_class->task_tick(rq, curr, 0);
-                rq_unlock_irq(rq, &rf);
-        }
+        update_rq_clock(rq);
+        delta = rq_clock_task(rq) - curr->se.exec_start;
+
+        /*
+         * Make sure the next tick runs within a reasonable
+         * amount of time.
+         */
+        WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
+        curr->sched_class->task_tick(rq, curr, 0);
+
+out_unlock:
+        rq_unlock_irq(rq, &rf);
+
+out_requeue:
         /*
          * Run the remote tick once per second (1Hz). This arbitrary
          * frequency is large enough to avoid overload but short enough
...
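The sched_tick_remote() rework is an instance of a general pattern: a cheap unlocked check to skip the common case, followed by a re-check of the same condition once the lock is held, because the state can change in between. The fragment below is a minimal userspace sketch of that pattern using pthreads and C11 atomics; struct cpu_state, remote_tick() and the idle flag are invented stand-ins, not the kernel's types.

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct cpu_state {
        pthread_mutex_t lock;           /* stands in for the rq lock */
        atomic_bool     idle;           /* flipped by the "remote" CPU */
        unsigned long   ticks;          /* only updated under the lock */
};

static void remote_tick(struct cpu_state *cs)
{
        /* Unlocked fast path: don't bother taking the lock for an idle CPU. */
        if (atomic_load(&cs->idle))
                return;

        pthread_mutex_lock(&cs->lock);

        /*
         * The CPU may have gone idle between the check above and taking
         * the lock; re-check before charging it with a tick, otherwise the
         * accounting below would be wrong (the false positive described in
         * the commit message).
         */
        if (atomic_load(&cs->idle))
                goto out_unlock;

        cs->ticks++;                    /* the work that needs the lock */

out_unlock:
        pthread_mutex_unlock(&cs->lock);
}

int main(void)
{
        struct cpu_state cs = {
                .lock = PTHREAD_MUTEX_INITIALIZER,
                .idle = false,
        };

        remote_tick(&cs);               /* CPU busy: accounted */
        atomic_store(&cs.idle, true);
        remote_tick(&cs);               /* CPU idle: skipped */
        printf("ticks accounted: %lu\n", cs.ticks);     /* prints 1 */
        return 0;
}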
@@ -192,7 +192,7 @@ static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu)
 {
         struct rq *rq = cpu_rq(sg_cpu->cpu);
 
-        if (rq->rt.rt_nr_running)
+        if (rt_rq_is_runnable(&rq->rt))
                 return sg_cpu->max;
 
         /*
...
@@ -3982,18 +3982,10 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
         if (!sched_feat(UTIL_EST))
                 return;
 
-        /*
-         * Update root cfs_rq's estimated utilization
-         *
-         * If *p is the last task then the root cfs_rq's estimated utilization
-         * of a CPU is 0 by definition.
-         */
-        ue.enqueued = 0;
-        if (cfs_rq->nr_running) {
-                ue.enqueued  = cfs_rq->avg.util_est.enqueued;
-                ue.enqueued -= min_t(unsigned int, ue.enqueued,
-                                     (_task_util_est(p) | UTIL_AVG_UNCHANGED));
-        }
+        /* Update root cfs_rq's estimated utilization */
+        ue.enqueued  = cfs_rq->avg.util_est.enqueued;
+        ue.enqueued -= min_t(unsigned int, ue.enqueued,
+                             (_task_util_est(p) | UTIL_AVG_UNCHANGED));
+
         WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued);
 
         /*
@@ -4590,6 +4582,7 @@ void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
         now = sched_clock_cpu(smp_processor_id());
         cfs_b->runtime = cfs_b->quota;
         cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
+        cfs_b->expires_seq++;
 }
 
 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
@@ -4612,6 +4605,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
         struct task_group *tg = cfs_rq->tg;
         struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
         u64 amount = 0, min_amount, expires;
+        int expires_seq;
 
         /* note: this is a positive sum as runtime_remaining <= 0 */
         min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
@@ -4628,6 +4622,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
                         cfs_b->idle = 0;
                 }
         }
+        expires_seq = cfs_b->expires_seq;
         expires = cfs_b->runtime_expires;
         raw_spin_unlock(&cfs_b->lock);
 
@@ -4637,8 +4632,10 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
          * spread between our sched_clock and the one on which runtime was
          * issued.
          */
-        if ((s64)(expires - cfs_rq->runtime_expires) > 0)
+        if (cfs_rq->expires_seq != expires_seq) {
+                cfs_rq->expires_seq = expires_seq;
                 cfs_rq->runtime_expires = expires;
+        }
 
         return cfs_rq->runtime_remaining > 0;
 }
@@ -4664,12 +4661,9 @@ static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
          * has not truly expired.
          *
          * Fortunately we can determine whether this is the case by checking
-         * whether the global deadline has advanced. It is valid to compare
-         * cfs_b->runtime_expires without any locks since we only care about
-         * exact equality, so a partial write will still work.
+         * whether the global deadline (cfs_b->expires_seq) has advanced.
          */
-        if (cfs_rq->runtime_expires != cfs_b->runtime_expires) {
+        if (cfs_rq->expires_seq == cfs_b->expires_seq) {
                 /* extend local deadline, drift is bounded above by 2 ticks */
                 cfs_rq->runtime_expires += TICK_NSEC;
         } else {
@@ -5202,13 +5196,18 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 
 void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 {
+        u64 overrun;
+
         lockdep_assert_held(&cfs_b->lock);
 
-        if (!cfs_b->period_active) {
-                cfs_b->period_active = 1;
-                hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
-                hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
-        }
+        if (cfs_b->period_active)
+                return;
+
+        cfs_b->period_active = 1;
+        overrun = hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
+        cfs_b->runtime_expires += (overrun + 1) * ktime_to_ns(cfs_b->period);
+        cfs_b->expires_seq++;
+        hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
 }
 
 static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
...
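The expires_seq changes above replace a cross-CPU timestamp comparison, which is sensitive to sched_clock drift between CPUs, with a sequence-number comparison: the global pool bumps expires_seq every time it publishes a new expiry, and a runqueue's cached runtime is only treated as truly expired once that sequence has moved on. Below is a self-contained model of the idea; struct global_pool and struct local_cache are invented types that merely mirror the relevant fields, not the kernel structures.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct global_pool {
        int      expires_seq;           /* bumped on every refill */
        uint64_t runtime_expires;       /* stamped with the *global* clock */
};

struct local_cache {
        int      expires_seq;           /* snapshot taken at assignment */
        uint64_t runtime_expires;       /* may be skewed vs. the local clock */
};

/* Global side: start a new bandwidth period. */
static void refill(struct global_pool *g, uint64_t now, uint64_t period)
{
        g->runtime_expires = now + period;
        g->expires_seq++;
}

/* Local side: take runtime and snapshot the current period. */
static void assign_runtime(struct local_cache *l, const struct global_pool *g)
{
        if (l->expires_seq != g->expires_seq) {
                l->expires_seq = g->expires_seq;
                l->runtime_expires = g->runtime_expires;
        }
}

/*
 * Local side: has the cached runtime really expired? If the global
 * sequence is unchanged, an apparent expiry is just clock skew and the
 * local deadline can be nudged forward instead of discarding runtime.
 */
static bool runtime_truly_expired(const struct local_cache *l,
                                  const struct global_pool *g)
{
        return l->expires_seq != g->expires_seq;
}

int main(void)
{
        struct global_pool g = { 0 };
        struct local_cache l = { 0 };

        refill(&g, 1000, 100);          /* period 1: expires at 1100 */
        assign_runtime(&l, &g);

        /* Local clock may read past 1100 (skew), but no new period started. */
        printf("truly expired: %d\n", runtime_truly_expired(&l, &g)); /* 0 */

        refill(&g, 1100, 100);          /* period 2 published */
        printf("truly expired: %d\n", runtime_truly_expired(&l, &g)); /* 1 */
        return 0;
}

Comparing sequence numbers sidesteps the question of whose clock the two timestamps were taken from, which is exactly the drift problem the commit message describes.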
@@ -508,8 +508,11 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
 
         rt_se = rt_rq->tg->rt_se[cpu];
 
-        if (!rt_se)
+        if (!rt_se) {
                 dequeue_top_rt_rq(rt_rq);
+                /* Kick cpufreq (see the comment in kernel/sched/sched.h). */
+                cpufreq_update_util(rq_of_rt_rq(rt_rq), 0);
+        }
         else if (on_rt_rq(rt_se))
                 dequeue_rt_entity(rt_se, 0);
 }
@@ -1001,8 +1004,6 @@ dequeue_top_rt_rq(struct rt_rq *rt_rq)
 
         sub_nr_running(rq, rt_rq->rt_nr_running);
         rt_rq->rt_queued = 0;
-
-        /* Kick cpufreq (see the comment in kernel/sched/sched.h). */
-        cpufreq_update_util(rq, 0);
 }
 
 static void
@@ -1014,11 +1015,14 @@ enqueue_top_rt_rq(struct rt_rq *rt_rq)
 
         if (rt_rq->rt_queued)
                 return;
-        if (rt_rq_throttled(rt_rq) || !rt_rq->rt_nr_running)
+
+        if (rt_rq_throttled(rt_rq))
                 return;
 
-        add_nr_running(rq, rt_rq->rt_nr_running);
-        rt_rq->rt_queued = 1;
+        if (rt_rq->rt_nr_running) {
+                add_nr_running(rq, rt_rq->rt_nr_running);
+                rt_rq->rt_queued = 1;
+        }
+
         /* Kick cpufreq (see the comment in kernel/sched/sched.h). */
         cpufreq_update_util(rq, 0);
...
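The rt.c change is about ordering: the cpufreq_update_util() kick has to run after add_nr_running()/sub_nr_running() have updated the runqueue, otherwise the governor samples a stale task count. The toy program below only illustrates why the placement matters; struct toy_rq and governor_update() are stand-ins invented for this sketch, not kernel code.

#include <stdio.h>

struct toy_rq {
        unsigned int nr_running;
};

/* Stand-in for cpufreq_update_util(): the governor samples the rq. */
static void governor_update(const struct toy_rq *rq)
{
        printf("governor sees nr_running = %u\n", rq->nr_running);
}

/* Wrong order: notify first, then update accounting (stale sample). */
static void enqueue_notify_first(struct toy_rq *rq, unsigned int n)
{
        governor_update(rq);            /* still sees the old value */
        rq->nr_running += n;
}

/* Fixed order: update accounting, then notify (fresh sample). */
static void enqueue_update_first(struct toy_rq *rq, unsigned int n)
{
        rq->nr_running += n;
        governor_update(rq);            /* sees the new value */
}

int main(void)
{
        struct toy_rq a = { 0 }, b = { 0 };

        enqueue_notify_first(&a, 2);    /* prints 0: stale */
        enqueue_update_first(&b, 2);    /* prints 2: fresh */
        return 0;
}

With the call placed after the accounting (and, on the dequeue side, issued by the caller once dequeue_top_rt_rq() has run), the governor always observes the post-update count, which is the behaviour the commit message asks for.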
@@ -334,9 +334,10 @@ struct cfs_bandwidth {
         u64                     runtime;
         s64                     hierarchical_quota;
         u64                     runtime_expires;
+        int                     expires_seq;
 
-        int                     idle;
-        int                     period_active;
+        short                   idle;
+        short                   period_active;
         struct hrtimer          period_timer;
         struct hrtimer          slack_timer;
         struct list_head        throttled_cfs_rq;
@@ -551,6 +552,7 @@ struct cfs_rq {
 
 #ifdef CONFIG_CFS_BANDWIDTH
         int                     runtime_enabled;
+        int                     expires_seq;
         u64                     runtime_expires;
         s64                     runtime_remaining;
@@ -609,6 +611,11 @@ struct rt_rq {
 #endif
 };
 
+static inline bool rt_rq_is_runnable(struct rt_rq *rt_rq)
+{
+        return rt_rq->rt_queued && rt_rq->rt_nr_running;
+}
+
 /* Deadline class' related fields in a runqueue */
 struct dl_rq {
         /* runqueue is an rbtree, ordered by deadline */
...