Commit 62fb1851 authored by Peter Zijlstra, committed by Ingo Molnar

sched: revert load_balance_monitor() changes

The following commits cause a number of regressions:

  commit 58e2d4ca
  Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
  Date:   Fri Jan 25 21:08:00 2008 +0100
  sched: group scheduling, change how cpu load is calculated

  commit 6b2d7700
  Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
  Date:   Fri Jan 25 21:08:00 2008 +0100
  sched: group scheduler, fix fairness of cpu bandwidth allocation for task groups

Namely:
 - very frequent wakeups on SMP, reported by PowerTop users.
 - cacheline thrashing on (large) SMP
 - some latencies larger than 500ms

While there is a mergeable patch to fix the last of these (the latencies),
the first two issues are not fixable in a manner suitable for .25 (we're at
-rc3 now).

Hence we revert them and try again in v2.6.26.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
CC: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Tested-by: Alexey Zaytsev <alexey.zaytsev@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
parent 976dde01
......@@ -1542,10 +1542,6 @@ extern unsigned int sysctl_sched_child_runs_first;
extern unsigned int sysctl_sched_features;
extern unsigned int sysctl_sched_migration_cost;
extern unsigned int sysctl_sched_nr_migrate;
#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
extern unsigned int sysctl_sched_min_bal_int_shares;
extern unsigned int sysctl_sched_max_bal_int_shares;
#endif
int sched_nr_latency_handler(struct ctl_table *table, int write,
struct file *file, void __user *buffer, size_t *length,
......
This diff is collapsed.
......@@ -727,8 +727,6 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
return se->parent;
}
#define GROUP_IMBALANCE_PCT 20
#else /* CONFIG_FAIR_GROUP_SCHED */
#define for_each_sched_entity(se) \
......@@ -819,26 +817,15 @@ hrtick_start_fair(struct rq *rq, struct task_struct *p)
static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
{
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se,
*topse = NULL; /* Highest schedulable entity */
int incload = 1;
struct sched_entity *se = &p->se;
for_each_sched_entity(se) {
topse = se;
if (se->on_rq) {
incload = 0;
if (se->on_rq)
break;
}
cfs_rq = cfs_rq_of(se);
enqueue_entity(cfs_rq, se, wakeup);
wakeup = 1;
}
/* Increment cpu load if we just enqueued the first task of a group on
* 'rq->cpu'. 'topse' represents the group to which task 'p' belongs
* at the highest grouping level.
*/
if (incload)
inc_cpu_load(rq, topse->load.weight);
hrtick_start_fair(rq, rq->curr);
}
......@@ -851,28 +838,16 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep)
{
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se,
*topse = NULL; /* Highest schedulable entity */
int decload = 1;
struct sched_entity *se = &p->se;
for_each_sched_entity(se) {
topse = se;
cfs_rq = cfs_rq_of(se);
dequeue_entity(cfs_rq, se, sleep);
/* Don't dequeue parent if it has other entities besides us */
if (cfs_rq->load.weight) {
if (parent_entity(se))
decload = 0;
if (cfs_rq->load.weight)
break;
}
sleep = 1;
}
/* Decrement cpu load if we just dequeued the last task of a group on
* 'rq->cpu'. 'topse' represents the group to which task 'p' belongs
* at the highest grouping level.
*/
if (decload)
dec_cpu_load(rq, topse->load.weight);
hrtick_start_fair(rq, rq->curr);
}
......@@ -1186,6 +1161,25 @@ static struct task_struct *load_balance_next_fair(void *arg)
return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr);
}
#ifdef CONFIG_FAIR_GROUP_SCHED
/*
 * cfs_rq_best_prio - priority value reported for a cfs_rq.
 * @cfs_rq: the CFS runqueue to inspect.
 *
 * Returns MAX_PRIO when @cfs_rq->nr_running is zero or first_fair()
 * yields nothing.  Otherwise returns ->prio of the task obtained via
 * task_of() from cfs_rq->curr, or, when curr is NULL, from the entity
 * returned by __pick_next_entity().
 *
 * load_balance_fair() stores the result in *this_best_prio for
 * this_cfs_rq before calling balance_tasks().
 */
static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
{
struct sched_entity *curr;
struct task_struct *p;
/* No runnable entity on this cfs_rq: report MAX_PRIO. */
if (!cfs_rq->nr_running || !first_fair(cfs_rq))
return MAX_PRIO;
/* Prefer the currently running entity; fall back to the next pick. */
curr = cfs_rq->curr;
if (!curr)
curr = __pick_next_entity(cfs_rq);
/* NOTE(review): task_of() presumably requires curr to be a task entity, not a group entity - confirm. */
p = task_of(curr);
return p->prio;
}
#endif
static unsigned long
load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
unsigned long max_load_move,
......@@ -1195,45 +1189,28 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
struct cfs_rq *busy_cfs_rq;
long rem_load_move = max_load_move;
struct rq_iterator cfs_rq_iterator;
unsigned long load_moved;
cfs_rq_iterator.start = load_balance_start_fair;
cfs_rq_iterator.next = load_balance_next_fair;
for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
#ifdef CONFIG_FAIR_GROUP_SCHED
struct cfs_rq *this_cfs_rq = busy_cfs_rq->tg->cfs_rq[this_cpu];
unsigned long maxload, task_load, group_weight;
unsigned long thisload, per_task_load;
struct sched_entity *se = busy_cfs_rq->tg->se[busiest->cpu];
task_load = busy_cfs_rq->load.weight;
group_weight = se->load.weight;
struct cfs_rq *this_cfs_rq;
long imbalance;
unsigned long maxload;
/*
* 'group_weight' is contributed by tasks of total weight
* 'task_load'. To move 'rem_load_move' worth of weight only,
* we need to move a maximum task load of:
*
* maxload = (remload / group_weight) * task_load;
*/
maxload = (rem_load_move * task_load) / group_weight;
this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu);
if (!maxload || !task_load)
imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight;
/* Don't pull if this_cfs_rq has more load than busy_cfs_rq */
if (imbalance <= 0)
continue;
per_task_load = task_load / busy_cfs_rq->nr_running;
/*
* balance_tasks will try to forcibly move atleast one task if
* possible (because of SCHED_LOAD_SCALE_FUZZ). Avoid that if
* maxload is less than GROUP_IMBALANCE_FUZZ% the per_task_load.
*/
if (100 * maxload < GROUP_IMBALANCE_PCT * per_task_load)
continue;
/* Don't pull more than imbalance/2 */
imbalance /= 2;
maxload = min(rem_load_move, imbalance);
/* Disable priority-based load balance */
*this_best_prio = 0;
thisload = this_cfs_rq->load.weight;
*this_best_prio = cfs_rq_best_prio(this_cfs_rq);
#else
# define maxload rem_load_move
#endif
......@@ -1242,33 +1219,11 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
* load_balance_[start|next]_fair iterators
*/
cfs_rq_iterator.arg = busy_cfs_rq;
load_moved = balance_tasks(this_rq, this_cpu, busiest,
rem_load_move -= balance_tasks(this_rq, this_cpu, busiest,
maxload, sd, idle, all_pinned,
this_best_prio,
&cfs_rq_iterator);
#ifdef CONFIG_FAIR_GROUP_SCHED
/*
* load_moved holds the task load that was moved. The
* effective (group) weight moved would be:
* load_moved_eff = load_moved/task_load * group_weight;
*/
load_moved = (group_weight * load_moved) / task_load;
/* Adjust shares on both cpus to reflect load_moved */
group_weight -= load_moved;
set_se_shares(se, group_weight);
se = busy_cfs_rq->tg->se[this_cpu];
if (!thisload)
group_weight = load_moved;
else
group_weight = se->load.weight + load_moved;
set_se_shares(se, group_weight);
#endif
rem_load_move -= load_moved;
if (rem_load_move <= 0)
break;
}
......
......@@ -393,8 +393,6 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
*/
for_each_sched_rt_entity(rt_se)
enqueue_rt_entity(rt_se);
inc_cpu_load(rq, p->se.load.weight);
}
static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
......@@ -414,8 +412,6 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
if (rt_rq && rt_rq->rt_nr_running)
enqueue_rt_entity(rt_se);
}
dec_cpu_load(rq, p->se.load.weight);
}
/*
......
......@@ -311,24 +311,6 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = &proc_dointvec,
},
#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
{
.ctl_name = CTL_UNNUMBERED,
.procname = "sched_min_bal_int_shares",
.data = &sysctl_sched_min_bal_int_shares,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = &proc_dointvec,
},
{
.ctl_name = CTL_UNNUMBERED,
.procname = "sched_max_bal_int_shares",
.data = &sysctl_sched_max_bal_int_shares,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = &proc_dointvec,
},
#endif
#endif
{
.ctl_name = CTL_UNNUMBERED,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment