Commit d0797b39 authored by Linus Torvalds

Merge git://git.kernel.org/pub/scm/linux/kernel/git/mingo/linux-2.6-sched

* git://git.kernel.org/pub/scm/linux/kernel/git/mingo/linux-2.6-sched:
  sched: tweak the sched_runtime_limit tunable
  sched: skip updating rq's next_balance under null SD
  sched: fix broken SMT/MC optimizations
  sched: accounting regression since rc1
  sched: fix sysctl directory permissions
  sched: sched_clock_idle_[sleep|wakeup]_event()
parents 0542170d 505c0efd
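
Note on the diffs below (which appear to touch arch/i386/kernel/tsc.c, drivers/acpi/processor_idle.c, fs/proc/array.c, include/linux/sched.h, kernel/sched.c and kernel/sched_debug.c): the central change replaces sched_clock_unstable_event() with the sched_clock_idle_sleep_event()/sched_clock_idle_wakeup_event() pair, so a platform idle routine can tell the scheduler how long the CPU sat in a deep C-state while the TSC may have been halted; the wakeup side then advances the per-CPU rq clock by the externally measured delta instead of trusting sched_clock(). A minimal sketch of the calling pattern, modelled on the processor_idle.c hunks below — pm_timer_read(), enter_deep_idle() and NSEC_PER_PM_TICK are illustrative placeholders, not kernel APIs:

/* Sketch: bracket deep idle so the per-CPU rq clock does not drift
 * while the TSC is stopped (interrupts assumed disabled here). */
static void platform_deep_idle(void)
{
	u64 t1, t2, idle_ns;

	t1 = pm_timer_read();			/* external timestamp (placeholder) */
	sched_clock_idle_sleep_event();		/* sync the rq clock before sleeping */

	enter_deep_idle();			/* C2/C3-style sleep; TSC may halt */

	t2 = pm_timer_read();			/* external timestamp (placeholder) */
	idle_ns = (t2 - t1) * NSEC_PER_PM_TICK;	/* ticks -> nanoseconds */

	/* advance the rq clock by the externally measured idle time */
	sched_clock_idle_wakeup_event(idle_ns);
}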
@@ -292,7 +292,6 @@ static struct clocksource clocksource_tsc = {
 void mark_tsc_unstable(char *reason)
 {
-	sched_clock_unstable_event();
 	if (!tsc_unstable) {
 		tsc_unstable = 1;
 		tsc_enabled = 0;
......
@@ -63,6 +63,7 @@
 ACPI_MODULE_NAME("processor_idle");
 #define ACPI_PROCESSOR_FILE_POWER	"power"
 #define US_TO_PM_TIMER_TICKS(t)	((t * (PM_TIMER_FREQUENCY/1000)) / 1000)
+#define PM_TIMER_TICK_NS	(1000000000ULL/PM_TIMER_FREQUENCY)
 #define C2_OVERHEAD	4	/* 1us (3.579 ticks per us) */
 #define C3_OVERHEAD	4	/* 1us (3.579 ticks per us) */
 static void (*pm_idle_save) (void) __read_mostly;
@@ -462,6 +463,9 @@ static void acpi_processor_idle(void)
		 * TBD: Can't get time duration while in C1, as resumes
		 * go to an ISR rather than here.  Need to instrument
		 * base interrupt handler.
+		 *
+		 * Note: the TSC better not stop in C1, sched_clock() will
+		 *       skew otherwise.
		 */
 		sleep_ticks = 0xFFFFFFFF;
 		break;
@@ -469,6 +473,8 @@ static void acpi_processor_idle(void)
 	case ACPI_STATE_C2:
 		/* Get start time (ticks) */
 		t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+		/* Tell the scheduler that we are going deep-idle: */
+		sched_clock_idle_sleep_event();
 		/* Invoke C2 */
 		acpi_state_timer_broadcast(pr, cx, 1);
 		acpi_cstate_enter(cx);
@@ -479,17 +485,22 @@ static void acpi_processor_idle(void)
 		/* TSC halts in C2, so notify users */
 		mark_tsc_unstable("possible TSC halt in C2");
 #endif
+		/* Compute time (ticks) that we were actually asleep */
+		sleep_ticks = ticks_elapsed(t1, t2);
+		/* Tell the scheduler how much we idled: */
+		sched_clock_idle_wakeup_event(sleep_ticks*PM_TIMER_TICK_NS);
 		/* Re-enable interrupts */
 		local_irq_enable();
+		/* Do not account our idle-switching overhead: */
+		sleep_ticks -= cx->latency_ticks + C2_OVERHEAD;
 		current_thread_info()->status |= TS_POLLING;
-		/* Compute time (ticks) that we were actually asleep */
-		sleep_ticks =
-		    ticks_elapsed(t1, t2) - cx->latency_ticks - C2_OVERHEAD;
 		acpi_state_timer_broadcast(pr, cx, 0);
 		break;
 	case ACPI_STATE_C3:
		/*
		 * disable bus master
		 * bm_check implies we need ARB_DIS
@@ -518,6 +529,8 @@ static void acpi_processor_idle(void)
 		t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
 		/* Invoke C3 */
 		acpi_state_timer_broadcast(pr, cx, 1);
+		/* Tell the scheduler that we are going deep-idle: */
+		sched_clock_idle_sleep_event();
 		acpi_cstate_enter(cx);
 		/* Get end time (ticks) */
 		t2 = inl(acpi_gbl_FADT.xpm_timer_block.address);
@@ -531,12 +544,17 @@ static void acpi_processor_idle(void)
 		/* TSC halts in C3, so notify users */
 		mark_tsc_unstable("TSC halts in C3");
 #endif
+		/* Compute time (ticks) that we were actually asleep */
+		sleep_ticks = ticks_elapsed(t1, t2);
+		/* Tell the scheduler how much we idled: */
+		sched_clock_idle_wakeup_event(sleep_ticks*PM_TIMER_TICK_NS);
 		/* Re-enable interrupts */
 		local_irq_enable();
+		/* Do not account our idle-switching overhead: */
+		sleep_ticks -= cx->latency_ticks + C3_OVERHEAD;
 		current_thread_info()->status |= TS_POLLING;
-		/* Compute time (ticks) that we were actually asleep */
-		sleep_ticks =
-		    ticks_elapsed(t1, t2) - cx->latency_ticks - C3_OVERHEAD;
 		acpi_state_timer_broadcast(pr, cx, 0);
 		break;
......
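
For reference, the PM_TIMER_TICK_NS constant introduced above converts ACPI PM-timer ticks into the nanosecond argument that sched_clock_idle_wakeup_event() expects. A rough worked example, assuming the kernel's usual PM_TIMER_FREQUENCY of 3579545 Hz (3.579545 MHz, matching the "3.579 ticks per us" comment in the hunk):

/* One PM-timer tick is ~279 ns with integer math: */
#define PM_TIMER_FREQUENCY	3579545					/* ticks per second */
#define PM_TIMER_TICK_NS	(1000000000ULL / PM_TIMER_FREQUENCY)	/* = 279 */

/* e.g. 10000 ticks of measured idle ~= 2.79 ms reported to the scheduler: */
u64 idle_ns = 10000ULL * PM_TIMER_TICK_NS;	/* 2790000 ns */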
@@ -320,7 +320,21 @@ int proc_pid_status(struct task_struct *task, char *buffer)
 	return buffer - orig;
 }
-static clock_t task_utime(struct task_struct *p)
+/*
+ * Use precise platform statistics if available:
+ */
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+static cputime_t task_utime(struct task_struct *p)
+{
+	return p->utime;
+}
+
+static cputime_t task_stime(struct task_struct *p)
+{
+	return p->stime;
+}
+#else
+static cputime_t task_utime(struct task_struct *p)
 {
 	clock_t utime = cputime_to_clock_t(p->utime),
 		total = utime + cputime_to_clock_t(p->stime);
@@ -337,10 +351,10 @@ static clock_t task_utime(struct task_struct *p)
 	}
 	utime = (clock_t)temp;
-	return utime;
+	return clock_t_to_cputime(utime);
 }
-static clock_t task_stime(struct task_struct *p)
+static cputime_t task_stime(struct task_struct *p)
 {
 	clock_t stime;
@@ -349,10 +363,12 @@ static clock_t task_stime(struct task_struct *p)
	 * the total, to make sure the total observed by userspace
	 * grows monotonically - apps rely on that):
	 */
-	stime = nsec_to_clock_t(p->se.sum_exec_runtime) - task_utime(p);
+	stime = nsec_to_clock_t(p->se.sum_exec_runtime) -
+			cputime_to_clock_t(task_utime(p));
-	return stime;
+	return clock_t_to_cputime(stime);
 }
+#endif
 static int do_task_stat(struct task_struct *task, char *buffer, int whole)
 {
@@ -368,8 +384,7 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole)
 	unsigned long long start_time;
 	unsigned long cmin_flt = 0, cmaj_flt = 0;
 	unsigned long min_flt = 0, maj_flt = 0;
-	cputime_t cutime, cstime;
-	clock_t utime, stime;
+	cputime_t cutime, cstime, utime, stime;
 	unsigned long rsslim = 0;
 	char tcomm[sizeof(task->comm)];
 	unsigned long flags;
@@ -387,8 +402,7 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole)
 	sigemptyset(&sigign);
 	sigemptyset(&sigcatch);
-	cutime = cstime = cputime_zero;
-	utime = stime = 0;
+	cutime = cstime = utime = stime = cputime_zero;
 	rcu_read_lock();
 	if (lock_task_sighand(task, &flags)) {
@@ -414,15 +428,15 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole)
 			do {
 				min_flt += t->min_flt;
 				maj_flt += t->maj_flt;
-				utime += task_utime(t);
-				stime += task_stime(t);
+				utime = cputime_add(utime, task_utime(t));
+				stime = cputime_add(stime, task_stime(t));
 				t = next_thread(t);
 			} while (t != task);
 			min_flt += sig->min_flt;
 			maj_flt += sig->maj_flt;
-			utime += cputime_to_clock_t(sig->utime);
-			stime += cputime_to_clock_t(sig->stime);
+			utime = cputime_add(utime, sig->utime);
+			stime = cputime_add(stime, sig->stime);
 		}
 		sid = signal_session(sig);
@@ -471,8 +485,8 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole)
 		cmin_flt,
 		maj_flt,
 		cmaj_flt,
-		utime,
-		stime,
+		cputime_to_clock_t(utime),
+		cputime_to_clock_t(stime),
 		cputime_to_clock_t(cutime),
 		cputime_to_clock_t(cstime),
 		priority,
......
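
The fs/proc changes above fix the accounting regression by accumulating utime/stime as cputime_t (via cputime_add()) and converting to clock ticks only when the values are written out. A minimal sketch of that accumulation pattern, following the hunk above; the surrounding locking and buffer handling are omitted:

	/* Sum per-thread times in cputime_t ... */
	cputime_t utime = cputime_zero, stime = cputime_zero;
	struct task_struct *t = task;

	do {
		utime = cputime_add(utime, task_utime(t));
		stime = cputime_add(stime, task_stime(t));
		t = next_thread(t);
	} while (t != task);

	/* ... and convert to USER_HZ clock ticks only at output time: */
	clock_t ut = cputime_to_clock_t(utime);
	clock_t st = cputime_to_clock_t(stime);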
@@ -681,7 +681,7 @@ enum cpu_idle_type {
 #define SCHED_LOAD_SHIFT	10
 #define SCHED_LOAD_SCALE	(1L << SCHED_LOAD_SHIFT)
-#define SCHED_LOAD_SCALE_FUZZ	(SCHED_LOAD_SCALE >> 1)
+#define SCHED_LOAD_SCALE_FUZZ	SCHED_LOAD_SCALE
 #ifdef CONFIG_SMP
 #define SD_LOAD_BALANCE		1	/* Do load balancing on this domain. */
@@ -1388,7 +1388,8 @@ extern void sched_exec(void);
 #define sched_exec()   {}
 #endif
-extern void sched_clock_unstable_event(void);
+extern void sched_clock_idle_sleep_event(void);
+extern void sched_clock_idle_wakeup_event(u64 delta_ns);
 #ifdef CONFIG_HOTPLUG_CPU
 extern void idle_task_exit(void);
......
@@ -262,7 +262,8 @@ struct rq {
 	s64 clock_max_delta;
 	unsigned int clock_warps, clock_overflows;
-	unsigned int clock_unstable_events;
+	u64 idle_clock;
+	unsigned int clock_deep_idle_events;
 	u64 tick_timestamp;
 	atomic_t nr_iowait;
@@ -556,18 +557,40 @@ static inline struct rq *this_rq_lock(void)
 }
 /*
- * CPU frequency is/was unstable - start new by setting prev_clock_raw:
+ * We are going deep-idle (irqs are disabled):
  */
-void sched_clock_unstable_event(void)
+void sched_clock_idle_sleep_event(void)
 {
-	unsigned long flags;
-	struct rq *rq;
+	struct rq *rq = cpu_rq(smp_processor_id());
-	rq = task_rq_lock(current, &flags);
-	rq->prev_clock_raw = sched_clock();
-	rq->clock_unstable_events++;
-	task_rq_unlock(rq, &flags);
+	spin_lock(&rq->lock);
+	__update_rq_clock(rq);
+	spin_unlock(&rq->lock);
+	rq->clock_deep_idle_events++;
 }
+EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
+
+/*
+ * We just idled delta nanoseconds (called with irqs disabled):
+ */
+void sched_clock_idle_wakeup_event(u64 delta_ns)
+{
+	struct rq *rq = cpu_rq(smp_processor_id());
+	u64 now = sched_clock();
+
+	rq->idle_clock += delta_ns;
+	/*
+	 * Override the previous timestamp and ignore all
+	 * sched_clock() deltas that occured while we idled,
+	 * and use the PM-provided delta_ns to advance the
+	 * rq clock:
+	 */
+	spin_lock(&rq->lock);
+	rq->prev_clock_raw = now;
+	rq->clock += delta_ns;
+	spin_unlock(&rq->lock);
+}
+EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
 /*
  * resched_task - mark a task 'to be rescheduled now'.
@@ -2494,7 +2517,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
	 * a think about bumping its value to force at least one task to be
	 * moved
	 */
-	if (*imbalance + SCHED_LOAD_SCALE_FUZZ < busiest_load_per_task/2) {
+	if (*imbalance + SCHED_LOAD_SCALE_FUZZ < busiest_load_per_task) {
 		unsigned long tmp, pwr_now, pwr_move;
 		unsigned int imbn;
@@ -3020,6 +3043,7 @@ static inline void rebalance_domains(int cpu, enum cpu_idle_type idle)
 	struct sched_domain *sd;
 	/* Earliest time when we have to do rebalance again */
 	unsigned long next_balance = jiffies + 60*HZ;
+	int update_next_balance = 0;
 	for_each_domain(cpu, sd) {
 		if (!(sd->flags & SD_LOAD_BALANCE))
@@ -3056,8 +3080,10 @@ static inline void rebalance_domains(int cpu, enum cpu_idle_type idle)
 		if (sd->flags & SD_SERIALIZE)
 			spin_unlock(&balancing);
 out:
-		if (time_after(next_balance, sd->last_balance + interval))
+		if (time_after(next_balance, sd->last_balance + interval)) {
 			next_balance = sd->last_balance + interval;
+			update_next_balance = 1;
+		}
		/*
		 * Stop the load balance at this level. There is another
@@ -3067,6 +3093,13 @@ static inline void rebalance_domains(int cpu, enum cpu_idle_type idle)
 		if (!balance)
 			break;
 	}
+	/*
+	 * next_balance will be updated only when there is a need.
+	 * When the cpu is attached to null domain for ex, it will not be
+	 * updated.
+	 */
-	rq->next_balance = next_balance;
+	if (likely(update_next_balance))
+		rq->next_balance = next_balance;
 }
@@ -4890,7 +4923,7 @@ static inline void sched_init_granularity(void)
 	if (sysctl_sched_granularity > gran_limit)
 		sysctl_sched_granularity = gran_limit;
-	sysctl_sched_runtime_limit = sysctl_sched_granularity * 4;
+	sysctl_sched_runtime_limit = sysctl_sched_granularity * 8;
 	sysctl_sched_wakeup_granularity = sysctl_sched_granularity / 2;
 }
@@ -5234,15 +5267,16 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
 static struct ctl_table sd_ctl_dir[] = {
 	{
 		.procname	= "sched_domain",
-		.mode		= 0755,
+		.mode		= 0555,
 	},
 	{0,},
 };
 static struct ctl_table sd_ctl_root[] = {
 	{
+		.ctl_name	= CTL_KERN,
 		.procname	= "kernel",
-		.mode		= 0755,
+		.mode		= 0555,
 		.child		= sd_ctl_dir,
 	},
 	{0,},
@@ -5318,7 +5352,7 @@ static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
 	for_each_domain(cpu, sd) {
 		snprintf(buf, 32, "domain%d", i);
 		entry->procname = kstrdup(buf, GFP_KERNEL);
-		entry->mode = 0755;
+		entry->mode = 0555;
 		entry->child = sd_alloc_ctl_domain_table(sd);
 		entry++;
 		i++;
@@ -5338,7 +5372,7 @@ static void init_sched_domain_sysctl(void)
 	for (i = 0; i < cpu_num; i++, entry++) {
 		snprintf(buf, 32, "cpu%d", i);
 		entry->procname = kstrdup(buf, GFP_KERNEL);
-		entry->mode = 0755;
+		entry->mode = 0555;
 		entry->child = sd_alloc_ctl_cpu_table(i);
 	}
 	sd_sysctl_header = register_sysctl_table(sd_ctl_root);
......
@@ -154,10 +154,11 @@ static void print_cpu(struct seq_file *m, int cpu)
 	P(next_balance);
 	P(curr->pid);
 	P(clock);
+	P(idle_clock);
 	P(prev_clock_raw);
 	P(clock_warps);
 	P(clock_overflows);
-	P(clock_unstable_events);
+	P(clock_deep_idle_events);
 	P(clock_max_delta);
 	P(cpu_load[0]);
 	P(cpu_load[1]);
......