Commit 62fb1851 authored by Peter Zijlstra's avatar Peter Zijlstra Committed by Ingo Molnar

sched: revert load_balance_monitor() changes

The following commits cause a number of regressions:

  commit 58e2d4ca
  Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
  Date:   Fri Jan 25 21:08:00 2008 +0100
  sched: group scheduling, change how cpu load is calculated

  commit 6b2d7700
  Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
  Date:   Fri Jan 25 21:08:00 2008 +0100
  sched: group scheduler, fix fairness of cpu bandwidth allocation for task groups

Namely:
 - very frequent wakeups on SMP, reported by PowerTop users.
 - cacheline thrashing on (large) SMP
 - some latencies larger than 500ms

While there is a mergeable patch to fix the latter, the former issues
are not fixable in a manner suitable for .25 (we're at -rc3 now).

Hence we revert them and try again in v2.6.26.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
CC: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Tested-by: Alexey Zaytsev <alexey.zaytsev@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
parent 976dde01
...@@ -1542,10 +1542,6 @@ extern unsigned int sysctl_sched_child_runs_first; ...@@ -1542,10 +1542,6 @@ extern unsigned int sysctl_sched_child_runs_first;
extern unsigned int sysctl_sched_features; extern unsigned int sysctl_sched_features;
extern unsigned int sysctl_sched_migration_cost; extern unsigned int sysctl_sched_migration_cost;
extern unsigned int sysctl_sched_nr_migrate; extern unsigned int sysctl_sched_nr_migrate;
#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
extern unsigned int sysctl_sched_min_bal_int_shares;
extern unsigned int sysctl_sched_max_bal_int_shares;
#endif
int sched_nr_latency_handler(struct ctl_table *table, int write, int sched_nr_latency_handler(struct ctl_table *table, int write,
struct file *file, void __user *buffer, size_t *length, struct file *file, void __user *buffer, size_t *length,
......
This diff is collapsed.
...@@ -727,8 +727,6 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se) ...@@ -727,8 +727,6 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
return se->parent; return se->parent;
} }
#define GROUP_IMBALANCE_PCT 20
#else /* CONFIG_FAIR_GROUP_SCHED */ #else /* CONFIG_FAIR_GROUP_SCHED */
#define for_each_sched_entity(se) \ #define for_each_sched_entity(se) \
...@@ -819,26 +817,15 @@ hrtick_start_fair(struct rq *rq, struct task_struct *p) ...@@ -819,26 +817,15 @@ hrtick_start_fair(struct rq *rq, struct task_struct *p)
static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
{ {
struct cfs_rq *cfs_rq; struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se, struct sched_entity *se = &p->se;
*topse = NULL; /* Highest schedulable entity */
int incload = 1;
for_each_sched_entity(se) { for_each_sched_entity(se) {
topse = se; if (se->on_rq)
if (se->on_rq) {
incload = 0;
break; break;
}
cfs_rq = cfs_rq_of(se); cfs_rq = cfs_rq_of(se);
enqueue_entity(cfs_rq, se, wakeup); enqueue_entity(cfs_rq, se, wakeup);
wakeup = 1; wakeup = 1;
} }
/* Increment cpu load if we just enqueued the first task of a group on
* 'rq->cpu'. 'topse' represents the group to which task 'p' belongs
* at the highest grouping level.
*/
if (incload)
inc_cpu_load(rq, topse->load.weight);
hrtick_start_fair(rq, rq->curr); hrtick_start_fair(rq, rq->curr);
} }
...@@ -851,28 +838,16 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) ...@@ -851,28 +838,16 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep) static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep)
{ {
struct cfs_rq *cfs_rq; struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se, struct sched_entity *se = &p->se;
*topse = NULL; /* Highest schedulable entity */
int decload = 1;
for_each_sched_entity(se) { for_each_sched_entity(se) {
topse = se;
cfs_rq = cfs_rq_of(se); cfs_rq = cfs_rq_of(se);
dequeue_entity(cfs_rq, se, sleep); dequeue_entity(cfs_rq, se, sleep);
/* Don't dequeue parent if it has other entities besides us */ /* Don't dequeue parent if it has other entities besides us */
if (cfs_rq->load.weight) { if (cfs_rq->load.weight)
if (parent_entity(se))
decload = 0;
break; break;
}
sleep = 1; sleep = 1;
} }
/* Decrement cpu load if we just dequeued the last task of a group on
* 'rq->cpu'. 'topse' represents the group to which task 'p' belongs
* at the highest grouping level.
*/
if (decload)
dec_cpu_load(rq, topse->load.weight);
hrtick_start_fair(rq, rq->curr); hrtick_start_fair(rq, rq->curr);
} }
...@@ -1186,6 +1161,25 @@ static struct task_struct *load_balance_next_fair(void *arg) ...@@ -1186,6 +1161,25 @@ static struct task_struct *load_balance_next_fair(void *arg)
return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr); return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr);
} }
#ifdef CONFIG_FAIR_GROUP_SCHED
static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
{
struct sched_entity *curr;
struct task_struct *p;
if (!cfs_rq->nr_running || !first_fair(cfs_rq))
return MAX_PRIO;
curr = cfs_rq->curr;
if (!curr)
curr = __pick_next_entity(cfs_rq);
p = task_of(curr);
return p->prio;
}
#endif
static unsigned long static unsigned long
load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
unsigned long max_load_move, unsigned long max_load_move,
...@@ -1195,45 +1189,28 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, ...@@ -1195,45 +1189,28 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
struct cfs_rq *busy_cfs_rq; struct cfs_rq *busy_cfs_rq;
long rem_load_move = max_load_move; long rem_load_move = max_load_move;
struct rq_iterator cfs_rq_iterator; struct rq_iterator cfs_rq_iterator;
unsigned long load_moved;
cfs_rq_iterator.start = load_balance_start_fair; cfs_rq_iterator.start = load_balance_start_fair;
cfs_rq_iterator.next = load_balance_next_fair; cfs_rq_iterator.next = load_balance_next_fair;
for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
#ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_FAIR_GROUP_SCHED
struct cfs_rq *this_cfs_rq = busy_cfs_rq->tg->cfs_rq[this_cpu]; struct cfs_rq *this_cfs_rq;
unsigned long maxload, task_load, group_weight; long imbalance;
unsigned long thisload, per_task_load; unsigned long maxload;
struct sched_entity *se = busy_cfs_rq->tg->se[busiest->cpu];
task_load = busy_cfs_rq->load.weight;
group_weight = se->load.weight;
/* this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu);
* 'group_weight' is contributed by tasks of total weight
* 'task_load'. To move 'rem_load_move' worth of weight only,
* we need to move a maximum task load of:
*
* maxload = (remload / group_weight) * task_load;
*/
maxload = (rem_load_move * task_load) / group_weight;
if (!maxload || !task_load) imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight;
/* Don't pull if this_cfs_rq has more load than busy_cfs_rq */
if (imbalance <= 0)
continue; continue;
per_task_load = task_load / busy_cfs_rq->nr_running; /* Don't pull more than imbalance/2 */
/* imbalance /= 2;
* balance_tasks will try to forcibly move atleast one task if maxload = min(rem_load_move, imbalance);
* possible (because of SCHED_LOAD_SCALE_FUZZ). Avoid that if
* maxload is less than GROUP_IMBALANCE_FUZZ% the per_task_load.
*/
if (100 * maxload < GROUP_IMBALANCE_PCT * per_task_load)
continue;
/* Disable priority-based load balance */ *this_best_prio = cfs_rq_best_prio(this_cfs_rq);
*this_best_prio = 0;
thisload = this_cfs_rq->load.weight;
#else #else
# define maxload rem_load_move # define maxload rem_load_move
#endif #endif
...@@ -1242,33 +1219,11 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, ...@@ -1242,33 +1219,11 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
* load_balance_[start|next]_fair iterators * load_balance_[start|next]_fair iterators
*/ */
cfs_rq_iterator.arg = busy_cfs_rq; cfs_rq_iterator.arg = busy_cfs_rq;
load_moved = balance_tasks(this_rq, this_cpu, busiest, rem_load_move -= balance_tasks(this_rq, this_cpu, busiest,
maxload, sd, idle, all_pinned, maxload, sd, idle, all_pinned,
this_best_prio, this_best_prio,
&cfs_rq_iterator); &cfs_rq_iterator);
#ifdef CONFIG_FAIR_GROUP_SCHED
/*
* load_moved holds the task load that was moved. The
* effective (group) weight moved would be:
* load_moved_eff = load_moved/task_load * group_weight;
*/
load_moved = (group_weight * load_moved) / task_load;
/* Adjust shares on both cpus to reflect load_moved */
group_weight -= load_moved;
set_se_shares(se, group_weight);
se = busy_cfs_rq->tg->se[this_cpu];
if (!thisload)
group_weight = load_moved;
else
group_weight = se->load.weight + load_moved;
set_se_shares(se, group_weight);
#endif
rem_load_move -= load_moved;
if (rem_load_move <= 0) if (rem_load_move <= 0)
break; break;
} }
......
...@@ -393,8 +393,6 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) ...@@ -393,8 +393,6 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
*/ */
for_each_sched_rt_entity(rt_se) for_each_sched_rt_entity(rt_se)
enqueue_rt_entity(rt_se); enqueue_rt_entity(rt_se);
inc_cpu_load(rq, p->se.load.weight);
} }
static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
...@@ -414,8 +412,6 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) ...@@ -414,8 +412,6 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
if (rt_rq && rt_rq->rt_nr_running) if (rt_rq && rt_rq->rt_nr_running)
enqueue_rt_entity(rt_se); enqueue_rt_entity(rt_se);
} }
dec_cpu_load(rq, p->se.load.weight);
} }
/* /*
......
...@@ -311,24 +311,6 @@ static struct ctl_table kern_table[] = { ...@@ -311,24 +311,6 @@ static struct ctl_table kern_table[] = {
.mode = 0644, .mode = 0644,
.proc_handler = &proc_dointvec, .proc_handler = &proc_dointvec,
}, },
#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
{
.ctl_name = CTL_UNNUMBERED,
.procname = "sched_min_bal_int_shares",
.data = &sysctl_sched_min_bal_int_shares,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = &proc_dointvec,
},
{
.ctl_name = CTL_UNNUMBERED,
.procname = "sched_max_bal_int_shares",
.data = &sysctl_sched_max_bal_int_shares,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = &proc_dointvec,
},
#endif
#endif #endif
{ {
.ctl_name = CTL_UNNUMBERED, .ctl_name = CTL_UNNUMBERED,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment