Commit b11ce8a2 authored by Linus Torvalds's avatar Linus Torvalds

Merge branch 'sched-v28-for-linus' of...

Merge branch 'sched-v28-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip

* 'sched-v28-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (38 commits)
  sched debug: add name to sched_domain sysctl entries
  sched: sync wakeups vs avg_overlap
  sched: remove redundant code in cpu_cgroup_create()
  sched_rt.c: resch needed in rt_rq_enqueue() for the root rt_rq
  cpusets: scan_for_empty_cpusets(), cpuset doesn't seem to be so const
  sched: minor optimizations in wake_affine and select_task_rq_fair
  sched: maintain only task entities in cfs_rq->tasks list
  sched: fixup buddy selection
  sched: more sanity checks on the bandwidth settings
  sched: add some comments to the bandwidth code
  sched: fixlet for group load balance
  sched: rework wakeup preemption
  CFS scheduler: documentation about scheduling policies
  sched: clarify ifdef tangle
  sched: fix list traversal to use _rcu variant
  sched: turn off WAKEUP_OVERLAP
  sched: wakeup preempt when small overlap
  kernel/cpu.c: create a CPU_STARTING cpu_chain notifier
  kernel/cpu.c: Move the CPU_DYING notifiers
  sched: fix __load_balance_iterator() for cfq with only one task
  ...
parents f6bccf69 a5d8c348
......@@ -168,10 +168,10 @@ if ($#ARGV < 0) {
mkdir $ARGV[0],0777;
$state = 0;
while (<STDIN>) {
if (/^\.TH \"[^\"]*\" 4 \"([^\"]*)\"/) {
if (/^\.TH \"[^\"]*\" 9 \"([^\"]*)\"/) {
if ($state == 1) { close OUT }
$state = 1;
$fn = "$ARGV[0]/$1.4";
$fn = "$ARGV[0]/$1.9";
print STDERR "Creating $fn\n";
open OUT, ">$fn" or die "can't open $fn: $!\n";
print OUT $_;
......
......@@ -149,6 +149,9 @@ smp_callin(void)
atomic_inc(&init_mm.mm_count);
current->active_mm = &init_mm;
/* inform the notifiers about the new cpu */
notify_cpu_starting(cpuid);
/* Must have completely accurate bogos. */
local_irq_enable();
......
......@@ -277,6 +277,7 @@ asmlinkage void __cpuinit secondary_start_kernel(void)
/*
* Enable local interrupts.
*/
notify_cpu_starting(cpu);
local_irq_enable();
local_fiq_enable();
......
......@@ -178,6 +178,7 @@ void __init smp_callin(void)
unmask_irq(IPI_INTR_VECT);
unmask_irq(TIMER0_INTR_VECT);
preempt_disable();
notify_cpu_starting(cpu);
local_irq_enable();
cpu_set(cpu, cpu_online_map);
......
......@@ -401,6 +401,7 @@ smp_callin (void)
spin_lock(&vector_lock);
/* Setup the per cpu irq handling data structures */
__setup_vector_irq(cpuid);
notify_cpu_starting(cpuid);
cpu_set(cpuid, cpu_online_map);
per_cpu(cpu_state, cpuid) = CPU_ONLINE;
spin_unlock(&vector_lock);
......
......@@ -498,6 +498,8 @@ static void __init smp_online(void)
{
int cpu_id = smp_processor_id();
notify_cpu_starting(cpu_id);
local_irq_enable();
/* Get our bogomips. */
......
......@@ -121,6 +121,8 @@ asmlinkage __cpuinit void start_secondary(void)
cpu = smp_processor_id();
cpu_data[cpu].udelay_val = loops_per_jiffy;
notify_cpu_starting(cpu);
mp_ops->smp_finish();
set_cpu_sibling_map(cpu);
......
......@@ -453,6 +453,7 @@ int __devinit start_secondary(void *unused)
secondary_cpu_time_init();
ipi_call_lock();
notify_cpu_starting(cpu);
cpu_set(cpu, cpu_online_map);
/* Update sibling maps */
base = cpu_first_thread_in_core(cpu);
......
......@@ -585,6 +585,8 @@ int __cpuinit start_secondary(void *cpuvoid)
/* Enable pfault pseudo page faults on this cpu. */
pfault_init();
/* call cpu notifiers */
notify_cpu_starting(smp_processor_id());
/* Mark this cpu as online */
spin_lock(&call_lock);
cpu_set(smp_processor_id(), cpu_online_map);
......
......@@ -82,6 +82,8 @@ asmlinkage void __cpuinit start_secondary(void)
preempt_disable();
notify_cpu_starting(smp_processor_id());
local_irq_enable();
calibrate_delay();
......
......@@ -88,6 +88,7 @@ void __init smp4d_callin(void)
local_flush_cache_all();
local_flush_tlb_all();
notify_cpu_starting(cpuid);
/*
* Unblock the master CPU _only_ when the scheduler state
* of all secondary CPUs will be up-to-date, so after
......
......@@ -71,6 +71,8 @@ void __cpuinit smp4m_callin(void)
local_flush_cache_all();
local_flush_tlb_all();
notify_cpu_starting(cpuid);
/* Get our local ticker going. */
smp_setup_percpu_timer();
......
......@@ -85,6 +85,7 @@ static int idle_proc(void *cpup)
while (!cpu_isset(cpu, smp_commenced_mask))
cpu_relax();
notify_cpu_starting(cpu);
cpu_set(cpu, cpu_online_map);
default_idle();
return 0;
......
......@@ -257,6 +257,7 @@ static void __cpuinit smp_callin(void)
end_local_APIC_setup();
map_cpu_to_logical_apicid();
notify_cpu_starting(cpuid);
/*
* Get our bogomips.
*
......
......@@ -448,6 +448,8 @@ static void __init start_secondary(void *unused)
VDEBUG(("VOYAGER SMP: CPU%d, stack at about %p\n", cpuid, &cpuid));
notify_cpu_starting(cpuid);
/* enable interrupts */
local_irq_enable();
......
......@@ -10,6 +10,18 @@
#include <linux/wait.h>
/**
* struct completion - structure used to maintain state for a "completion"
*
* This is the opaque structure used to maintain the state for a "completion".
* Completions currently use a FIFO to queue threads that have to wait for
* the "completion" event.
*
* See also: complete(), wait_for_completion() (and friends _timeout,
* _interruptible, _interruptible_timeout, and _killable), init_completion(),
* and macros DECLARE_COMPLETION(), DECLARE_COMPLETION_ONSTACK(), and
* INIT_COMPLETION().
*/
struct completion {
unsigned int done;
wait_queue_head_t wait;
......@@ -21,6 +33,14 @@ struct completion {
#define COMPLETION_INITIALIZER_ONSTACK(work) \
({ init_completion(&work); work; })
/**
* DECLARE_COMPLETION: - declare and initialize a completion structure
* @work: identifier for the completion structure
*
* This macro declares and initializes a completion structure. Generally used
* for static declarations. You should use the _ONSTACK variant for automatic
* variables.
*/
#define DECLARE_COMPLETION(work) \
struct completion work = COMPLETION_INITIALIZER(work)
......@@ -29,6 +49,13 @@ struct completion {
* completions - so we use the _ONSTACK() variant for those that
* are on the kernel stack:
*/
/**
* DECLARE_COMPLETION_ONSTACK: - declare and initialize a completion structure
* @work: identifier for the completion structure
*
* This macro declares and initializes a completion structure on the kernel
* stack.
*/
#ifdef CONFIG_LOCKDEP
# define DECLARE_COMPLETION_ONSTACK(work) \
struct completion work = COMPLETION_INITIALIZER_ONSTACK(work)
......@@ -36,6 +63,13 @@ struct completion {
# define DECLARE_COMPLETION_ONSTACK(work) DECLARE_COMPLETION(work)
#endif
/**
* init_completion: - Initialize a dynamically allocated completion
* @x: completion structure that is to be initialized
*
* This inline function will initialize a dynamically created completion
* structure.
*/
static inline void init_completion(struct completion *x)
{
x->done = 0;
......@@ -55,6 +89,13 @@ extern bool completion_done(struct completion *x);
extern void complete(struct completion *);
extern void complete_all(struct completion *);
/**
* INIT_COMPLETION: - reinitialize a completion structure
* @x: completion structure to be reinitialized
*
* This macro should be used to reinitialize a completion structure so it can
* be reused. This is especially important after complete_all() is used.
*/
#define INIT_COMPLETION(x) ((x).done = 0)
......
......@@ -69,6 +69,7 @@ static inline void unregister_cpu_notifier(struct notifier_block *nb)
#endif
int cpu_up(unsigned int cpu);
void notify_cpu_starting(unsigned int cpu);
extern void cpu_hotplug_init(void);
extern void cpu_maps_update_begin(void);
extern void cpu_maps_update_done(void);
......
......@@ -213,9 +213,16 @@ static inline int notifier_to_errno(int ret)
#define CPU_DOWN_FAILED 0x0006 /* CPU (unsigned)v NOT going down */
#define CPU_DEAD 0x0007 /* CPU (unsigned)v dead */
#define CPU_DYING 0x0008 /* CPU (unsigned)v not running any task,
* not handling interrupts, soon dead */
* not handling interrupts, soon dead.
* Called on the dying cpu, interrupts
* are already disabled. Must not
* sleep, must not fail */
#define CPU_POST_DEAD 0x0009 /* CPU (unsigned)v dead, cpu_hotplug
* lock is dropped */
#define CPU_STARTING 0x000A /* CPU (unsigned)v soon running.
* Called on the new cpu, just before
* enabling interrupts. Must not sleep,
* must not fail */
/* Used for CPU hotplug events occuring while tasks are frozen due to a suspend
* operation in progress
......@@ -229,6 +236,7 @@ static inline int notifier_to_errno(int ret)
#define CPU_DOWN_FAILED_FROZEN (CPU_DOWN_FAILED | CPU_TASKS_FROZEN)
#define CPU_DEAD_FROZEN (CPU_DEAD | CPU_TASKS_FROZEN)
#define CPU_DYING_FROZEN (CPU_DYING | CPU_TASKS_FROZEN)
#define CPU_STARTING_FROZEN (CPU_STARTING | CPU_TASKS_FROZEN)
/* Hibernation and suspend events */
#define PM_HIBERNATION_PREPARE 0x0001 /* Going to hibernate */
......
......@@ -104,8 +104,8 @@ struct prop_local_single {
* snapshot of the last seen global state
* and a lock protecting this state
*/
int shift;
unsigned long period;
int shift;
spinlock_t lock; /* protect the snapshot state */
};
......
......@@ -451,8 +451,8 @@ struct signal_struct {
* - everyone except group_exit_task is stopped during signal delivery
* of fatal signals, group_exit_task processes the signal.
*/
struct task_struct *group_exit_task;
int notify_count;
struct task_struct *group_exit_task;
/* thread group stop support, overloads group_exit_code too */
int group_stop_count;
......@@ -824,6 +824,9 @@ struct sched_domain {
unsigned int ttwu_move_affine;
unsigned int ttwu_move_balance;
#endif
#ifdef CONFIG_SCHED_DEBUG
char *name;
#endif
};
extern void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
......@@ -897,7 +900,7 @@ struct sched_class {
void (*yield_task) (struct rq *rq);
int (*select_task_rq)(struct task_struct *p, int sync);
void (*check_preempt_curr) (struct rq *rq, struct task_struct *p);
void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int sync);
struct task_struct * (*pick_next_task) (struct rq *rq);
void (*put_prev_task) (struct rq *rq, struct task_struct *p);
......@@ -1010,8 +1013,8 @@ struct sched_entity {
struct sched_rt_entity {
struct list_head run_list;
unsigned int time_slice;
unsigned long timeout;
unsigned int time_slice;
int nr_cpus_allowed;
struct sched_rt_entity *back;
......
......@@ -199,13 +199,14 @@ static int __ref take_cpu_down(void *_param)
struct take_cpu_down_param *param = _param;
int err;
raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod,
param->hcpu);
/* Ensure this CPU doesn't handle any more interrupts. */
err = __cpu_disable();
if (err < 0)
return err;
raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod,
param->hcpu);
/* Force idle task to run as soon as we yield: it should
immediately notice cpu is offline and die quickly. */
sched_idle_next();
......@@ -453,6 +454,25 @@ void __ref enable_nonboot_cpus(void)
}
#endif /* CONFIG_PM_SLEEP_SMP */
/**
* notify_cpu_starting(cpu) - call the CPU_STARTING notifiers
* @cpu: cpu that just started
*
* This function calls the cpu_chain notifiers with CPU_STARTING.
* It must be called by the arch code on the new cpu, before the new cpu
* enables interrupts and before the "boot" cpu returns from __cpu_up().
*/
void notify_cpu_starting(unsigned int cpu)
{
unsigned long val = CPU_STARTING;
#ifdef CONFIG_PM_SLEEP_SMP
if (cpu_isset(cpu, frozen_cpus))
val = CPU_STARTING_FROZEN;
#endif /* CONFIG_PM_SLEEP_SMP */
raw_notifier_call_chain(&cpu_chain, val, (void *)(long)cpu);
}
#endif /* CONFIG_SMP */
/*
......
......@@ -1921,7 +1921,7 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
* that has tasks along with an empty 'mems'. But if we did see such
* a cpuset, we'd handle it just like we do if its 'cpus' was empty.
*/
static void scan_for_empty_cpusets(const struct cpuset *root)
static void scan_for_empty_cpusets(struct cpuset *root)
{
LIST_HEAD(queue);
struct cpuset *cp; /* scans cpusets being updated */
......
This diff is collapsed.
This diff is collapsed.
......@@ -11,3 +11,4 @@ SCHED_FEAT(ASYM_GRAN, 1)
SCHED_FEAT(LB_BIAS, 1)
SCHED_FEAT(LB_WAKEUP_UPDATE, 1)
SCHED_FEAT(ASYM_EFF_LOAD, 1)
SCHED_FEAT(WAKEUP_OVERLAP, 0)
......@@ -14,7 +14,7 @@ static int select_task_rq_idle(struct task_struct *p, int sync)
/*
* Idle tasks are unconditionally rescheduled:
*/
static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p)
static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int sync)
{
resched_task(rq->idle);
}
......@@ -76,7 +76,7 @@ static void switched_to_idle(struct rq *rq, struct task_struct *p,
if (running)
resched_task(rq->curr);
else
check_preempt_curr(rq, p);
check_preempt_curr(rq, p, 0);
}
static void prio_changed_idle(struct rq *rq, struct task_struct *p,
......@@ -93,7 +93,7 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p,
if (p->prio > oldprio)
resched_task(rq->curr);
} else
check_preempt_curr(rq, p);
check_preempt_curr(rq, p, 0);
}
/*
......
......@@ -102,11 +102,11 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
{
struct sched_rt_entity *rt_se = rt_rq->rt_se;
if (rt_se && !on_rt_rq(rt_se) && rt_rq->rt_nr_running) {
struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
struct sched_rt_entity *rt_se = rt_rq->rt_se;
if (rt_rq->rt_nr_running) {
if (rt_se && !on_rt_rq(rt_se))
enqueue_rt_entity(rt_se);
if (rt_rq->highest_prio < curr->prio)
resched_task(curr);
......@@ -231,6 +231,9 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
#endif /* CONFIG_RT_GROUP_SCHED */
#ifdef CONFIG_SMP
/*
* We ran out of runtime, see if we can borrow some from our neighbours.
*/
static int do_balance_runtime(struct rt_rq *rt_rq)
{
struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
......@@ -250,9 +253,18 @@ static int do_balance_runtime(struct rt_rq *rt_rq)
continue;
spin_lock(&iter->rt_runtime_lock);
/*
* Either all rqs have inf runtime and there's nothing to steal
* or __disable_runtime() below sets a specific rq to inf to
* indicate its been disabled and disalow stealing.
*/
if (iter->rt_runtime == RUNTIME_INF)
goto next;
/*
* From runqueues with spare time, take 1/n part of their
* spare time, but no more than our period.
*/
diff = iter->rt_runtime - iter->rt_time;
if (diff > 0) {
diff = div_u64((u64)diff, weight);
......@@ -274,6 +286,9 @@ static int do_balance_runtime(struct rt_rq *rt_rq)
return more;
}
/*
* Ensure this RQ takes back all the runtime it lend to its neighbours.
*/
static void __disable_runtime(struct rq *rq)
{
struct root_domain *rd = rq->rd;
......@@ -289,17 +304,33 @@ static void __disable_runtime(struct rq *rq)
spin_lock(&rt_b->rt_runtime_lock);
spin_lock(&rt_rq->rt_runtime_lock);
/*
* Either we're all inf and nobody needs to borrow, or we're
* already disabled and thus have nothing to do, or we have
* exactly the right amount of runtime to take out.
*/
if (rt_rq->rt_runtime == RUNTIME_INF ||
rt_rq->rt_runtime == rt_b->rt_runtime)
goto balanced;
spin_unlock(&rt_rq->rt_runtime_lock);
/*
* Calculate the difference between what we started out with
* and what we current have, that's the amount of runtime
* we lend and now have to reclaim.
*/
want = rt_b->rt_runtime - rt_rq->rt_runtime;
/*
* Greedy reclaim, take back as much as we can.
*/
for_each_cpu_mask(i, rd->span) {
struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
s64 diff;
/*
* Can't reclaim from ourselves or disabled runqueues.
*/
if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
continue;
......@@ -319,8 +350,16 @@ static void __disable_runtime(struct rq *rq)
}
spin_lock(&rt_rq->rt_runtime_lock);
/*
* We cannot be left wanting - that would mean some runtime
* leaked out of the system.
*/
BUG_ON(want);
balanced:
/*
* Disable all the borrow logic by pretending we have inf
* runtime - in which case borrowing doesn't make sense.
*/
rt_rq->rt_runtime = RUNTIME_INF;
spin_unlock(&rt_rq->rt_runtime_lock);
spin_unlock(&rt_b->rt_runtime_lock);
......@@ -343,6 +382,9 @@ static void __enable_runtime(struct rq *rq)
if (unlikely(!scheduler_running))
return;
/*
* Reset each runqueue's bandwidth settings
*/
for_each_leaf_rt_rq(rt_rq, rq) {
struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
......@@ -389,7 +431,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
int i, idle = 1;
cpumask_t span;
if (rt_b->rt_runtime == RUNTIME_INF)
if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
return 1;
span = sched_rt_period_mask();
......@@ -487,6 +529,9 @@ static void update_curr_rt(struct rq *rq)
curr->se.exec_start = rq->clock;
cpuacct_charge(curr, delta_exec);
if (!rt_bandwidth_enabled())
return;
for_each_sched_rt_entity(rt_se) {
rt_rq = rt_rq_of_se(rt_se);
......@@ -784,7 +829,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
/*
* Preempt the current task with a newly woken task if needed:
*/
static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p)
static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int sync)
{
if (p->prio < rq->curr->prio) {
resched_task(rq->curr);
......
......@@ -169,7 +169,7 @@ static ssize_t cpu_rt_runtime_show(struct kobject *kobj,
{
struct user_struct *up = container_of(kobj, struct user_struct, kobj);
return sprintf(buf, "%lu\n", sched_group_rt_runtime(up->tg));
return sprintf(buf, "%ld\n", sched_group_rt_runtime(up->tg));
}
static ssize_t cpu_rt_runtime_store(struct kobject *kobj,
......@@ -180,7 +180,7 @@ static ssize_t cpu_rt_runtime_store(struct kobject *kobj,
unsigned long rt_runtime;
int rc;
sscanf(buf, "%lu", &rt_runtime);
sscanf(buf, "%ld", &rt_runtime);
rc = sched_group_set_rt_runtime(up->tg, rt_runtime);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment