sched: Add support for throttling group entities

Now that consumption is tracked (via update_curr()), we add support to throttle group entities (and their corresponding cfs_rqs) in the case where there is no run-time remaining. Throttled entities are dequeued to prevent scheduling; additionally, we mark them as throttled (using cfs_rq->throttled) to prevent them from becoming re-enqueued until they are unthrottled. A list of a task_group's throttled entities is maintained on the cfs_bandwidth structure. Note: While the machinery for throttling is added in this patch, the act of throttling an entity exceeding its bandwidth is deferred until later within the series. Signed-off-by: Paul Turner <pjt@google.com> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Link: http://lkml.kernel.org/r/20110721184757.480608533@google.com Signed-off-by: Ingo Molnar <mingo@elte.hu>

sched: Add support for throttling group entities
Now that consumption is tracked (via update_curr()), we add support to throttle group entities (and their corresponding cfs_rqs) in the case where there is no run-time remaining. Throttled entities are dequeued to prevent scheduling; additionally, we mark them as throttled (using cfs_rq->throttled) to prevent them from becoming re-enqueued until they are unthrottled. A list of a task_group's throttled entities is maintained on the cfs_bandwidth structure. Note: While the machinery for throttling is added in this patch, the act of throttling an entity exceeding its bandwidth is deferred until later within the series. Signed-off-by: Paul Turner <pjt@google.com> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Link: http://lkml.kernel.org/r/20110721184757.480608533@google.com Signed-off-by: Ingo Molnar <mingo@elte.hu>
85dac906 · Paul Turner · Ingo Molnar · a9cf55b2 · 85dac906 · 85dac906
Commit 85dac906 authored Jul 21, 2011 by Paul Turner Committed by Ingo Molnar Aug 14, 2011
Show whitespace changes
Inline Side-by-side

Showing with 92 additions and 4 deletions

kernel/sched.c kernel/sched.c +7 -0

kernel/sched_fair.c kernel/sched_fair.c +85 -4

No files found.
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -260,6 +260,8 @@ struct cfs_bandwidth {
 	int idle, timer_active;
 	struct hrtimer period_timer;
+	struct list_head throttled_cfs_rq;
 #endif
 };
@@ -399,6 +401,9 @@ struct cfs_rq {
 	int runtime_enabled;
 	u64 runtime_expires;
 	s64 runtime_remaining;
+	int throttled;
+	struct list_head throttled_list;
 #endif
 #endif
 };
@@ -441,6 +446,7 @@ static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 	cfs_b->quota = RUNTIME_INF;
 	cfs_b->period = ns_to_ktime(default_cfs_period());
+	INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
 	hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	cfs_b->period_timer.function = sched_cfs_period_timer;
 }
@@ -448,6 +454,7 @@ static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 {
 	cfs_rq->runtime_enabled = 0;
+	INIT_LIST_HEAD(&cfs_rq->throttled_list);
 }
 /* requires cfs_b->lock, may release to reprogram timer */

--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1291,7 +1291,8 @@ static void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
 	cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
 }
-static void assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+/* returns 0 on failure to allocate runtime */
+static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 {
 	struct task_group *tg = cfs_rq->tg;
 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
@@ -1332,6 +1333,8 @@ static void assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 	 */
 	if ((s64)(expires - cfs_rq->runtime_expires) > 0)
 		cfs_rq->runtime_expires = expires;
+	return cfs_rq->runtime_remaining > 0;
 }
 /*
@@ -1378,7 +1381,12 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
 	if (likely(cfs_rq->runtime_remaining > 0))
 		return;
-	assign_cfs_rq_runtime(cfs_rq);
+	/*
+	 * if we're unable to extend our runtime we resched so that the active
+	 * hierarchy can be throttled
+	 */
+	if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
+		resched_task(rq_of(cfs_rq)->curr);
 }
 static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
@@ -1390,6 +1398,47 @@ static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
 	__account_cfs_rq_runtime(cfs_rq, delta_exec);
 }
+static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
+{
+	return cfs_rq->throttled;
+}
+static __used void throttle_cfs_rq(struct cfs_rq *cfs_rq)
+{
+	struct rq *rq = rq_of(cfs_rq);
+	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
+	struct sched_entity *se;
+	long task_delta, dequeue = 1;
+	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
+	/* account load preceding throttle */
+	update_cfs_load(cfs_rq, 0);
+	task_delta = cfs_rq->h_nr_running;
+	for_each_sched_entity(se) {
+		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
+		/* throttled entity or throttle-on-deactivate */
+		if (!se->on_rq)
+			break;
+		if (dequeue)
+			dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
+		qcfs_rq->h_nr_running -= task_delta;
+		if (qcfs_rq->load.weight)
+			dequeue = 0;
+	}
+	if (!se)
+		rq->nr_running -= task_delta;
+	cfs_rq->throttled = 1;
+	raw_spin_lock(&cfs_b->lock);
+	list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
+	raw_spin_unlock(&cfs_b->lock);
+}
 /*
 * Responsible for refilling a task_group's bandwidth and unthrottling its
 * cfs_rqs as appropriate. If there has been no activity within the last
@@ -1425,6 +1474,11 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
 #else
 static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
 				     unsigned long delta_exec) {}
+static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
+{
+	return 0;
+}
 #endif
 /**************************************************
@@ -1503,7 +1557,17 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 			break;
 		cfs_rq = cfs_rq_of(se);
 		enqueue_entity(cfs_rq, se, flags);
+		/*
+		 * end evaluation on encountering a throttled cfs_rq
+		 *
+		 * note: in the case of encountering a throttled cfs_rq we will
+		 * post the final h_nr_running increment below.
+		*/
+		if (cfs_rq_throttled(cfs_rq))
+			break;
 		cfs_rq->h_nr_running++;
 		flags = ENQUEUE_WAKEUP;
 	}
@@ -1511,10 +1575,14 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		cfs_rq = cfs_rq_of(se);
 		cfs_rq->h_nr_running++;
+		if (cfs_rq_throttled(cfs_rq))
+			break;
 		update_cfs_load(cfs_rq, 0);
 		update_cfs_shares(cfs_rq);
 	}
+	if (!se)
 		inc_nr_running(rq);
 	hrtick_update(rq);
 }
@@ -1535,6 +1603,15 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	for_each_sched_entity(se) {
 		cfs_rq = cfs_rq_of(se);
 		dequeue_entity(cfs_rq, se, flags);
+		/*
+		 * end evaluation on encountering a throttled cfs_rq
+		 *
+		 * note: in the case of encountering a throttled cfs_rq we will
+		 * post the final h_nr_running decrement below.
+		*/
+		if (cfs_rq_throttled(cfs_rq))
+			break;
 		cfs_rq->h_nr_running--;
 		/* Don't dequeue parent if it has other entities besides us */
@@ -1557,10 +1634,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		cfs_rq = cfs_rq_of(se);
 		cfs_rq->h_nr_running--;
+		if (cfs_rq_throttled(cfs_rq))
+			break;
 		update_cfs_load(cfs_rq, 0);
 		update_cfs_shares(cfs_rq);
 	}
+	if (!se)
 		dec_nr_running(rq);
 	hrtick_update(rq);
 }