Commit 80fe02b5 authored by Linus Torvalds


Merge branches 'sched-core-for-linus' and 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (60 commits)
  sched: Fix and optimise calculation of the weight-inverse
  sched: Avoid going ahead if ->cpus_allowed is not changed
  sched, rt: Update rq clock when unthrottling of an otherwise idle CPU
  sched: Remove unused parameters from sched_fork() and wake_up_new_task()
  sched: Shorten the construction of the span cpu mask of sched domain
  sched: Wrap the 'cfs_rq->nr_spread_over' field with CONFIG_SCHED_DEBUG
  sched: Remove unused 'this_best_prio arg' from balance_tasks()
  sched: Remove noop in alloc_rt_sched_group()
  sched: Get rid of lock_depth
  sched: Remove obsolete comment from scheduler_tick()
  sched: Fix sched_domain iterations vs. RCU
  sched: Next buddy hint on sleep and preempt path
  sched: Make set_*_buddy() work on non-task entities
  sched: Remove need_migrate_task()
  sched: Move the second half of ttwu() to the remote cpu
  sched: Restructure ttwu() some more
  sched: Rename ttwu_post_activation() to ttwu_do_wakeup()
  sched: Remove rq argument from ttwu_stat()
  sched: Remove rq->lock from the first half of ttwu()
  sched: Drop rq->lock from sched_exec()
  ...

* 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip:
  sched: Fix rt_rq runtime leakage bug
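
Editorial note: the recurring change across the architecture patches in the diff below is that a reschedule IPI is no longer a no-op handled implicitly on the interrupt return path; each architecture's handler now calls scheduler_ipi(), which (with the new TTWU queueing) drains wakeups queued on the remote runqueue. A minimal sketch of the resulting handler shape is given here for orientation; the "example_*" names are placeholders and not part of any real architecture, only scheduler_ipi() and the irqreturn_t convention come from the diff itself.

```c
#include <linux/irqreturn.h>
#include <linux/sched.h>	/* scheduler_ipi() */

/*
 * Illustrative reschedule-IPI handler after this series: instead of
 * "nothing to do, the interrupt return path reschedules for us", the
 * handler explicitly invokes the scheduler hook so that remote wakeups
 * queued on rq->wake_list are processed.
 */
static irqreturn_t example_resched_interrupt(int irq, void *dev_id)
{
	scheduler_ipi();
	return IRQ_HANDLED;
}
```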
@@ -120,7 +120,6 @@ format:
field:unsigned char common_flags; offset:2; size:1; signed:0;
field:unsigned char common_preempt_count; offset:3; size:1;signed:0;
field:int common_pid; offset:4; size:4; signed:1;
- field:int common_lock_depth; offset:8; size:4; signed:1;
field:unsigned long __probe_ip; offset:12; size:4; signed:0;
field:int __probe_nargs; offset:16; size:4; signed:1;
...
@@ -585,8 +585,7 @@ handle_ipi(struct pt_regs *regs)
switch (which) {
case IPI_RESCHEDULE:
- /* Reschedule callback. Everything to be done
- is done by the interrupt return path. */
+ scheduler_ipi();
break;
case IPI_CALL_FUNC:
...
@@ -560,10 +560,7 @@ asmlinkage void __exception_irq_entry do_IPI(int ipinr, struct pt_regs *regs)
break;
case IPI_RESCHEDULE:
- /*
- * nothing more to do - eveything is
- * done on the interrupt return path
- */
+ scheduler_ipi();
break;
case IPI_CALL_FUNC:
...
@@ -177,6 +177,9 @@ static irqreturn_t ipi_handler_int1(int irq, void *dev_instance)
while (msg_queue->count) {
msg = &msg_queue->ipi_message[msg_queue->head];
switch (msg->type) {
+ case BFIN_IPI_RESCHEDULE:
+ scheduler_ipi();
+ break;
case BFIN_IPI_CALL_FUNC:
spin_unlock_irqrestore(&msg_queue->lock, flags);
ipi_call_function(cpu, msg);
...
@@ -342,6 +342,9 @@ irqreturn_t crisv32_ipi_interrupt(int irq, void *dev_id)
ipi = REG_RD(intr_vect, irq_regs[smp_processor_id()], rw_ipi);
+ if (ipi.vector & IPI_SCHEDULE) {
+ scheduler_ipi();
+ }
if (ipi.vector & IPI_CALL) {
func(info);
}
...
@@ -31,6 +31,7 @@
#include <linux/irq.h>
#include <linux/ratelimit.h>
#include <linux/acpi.h>
+ #include <linux/sched.h>
#include <asm/delay.h>
#include <asm/intrinsics.h>
@@ -496,6 +497,7 @@ ia64_handle_irq (ia64_vector vector, struct pt_regs *regs)
smp_local_flush_tlb();
kstat_incr_irqs_this_cpu(irq, desc);
} else if (unlikely(IS_RESCHEDULE(vector))) {
+ scheduler_ipi();
kstat_incr_irqs_this_cpu(irq, desc);
} else {
ia64_setreg(_IA64_REG_CR_TPR, vector);
...
@@ -92,6 +92,8 @@ static unsigned short saved_irq_cnt;
static int xen_slab_ready;
#ifdef CONFIG_SMP
+ #include <linux/sched.h>
/* Dummy stub. Though we may check XEN_RESCHEDULE_VECTOR before __do_IRQ,
* it ends up to issue several memory accesses upon percpu data and
* thus adds unnecessary traffic to other paths.
@@ -99,7 +101,13 @@ static int xen_slab_ready;
static irqreturn_t
xen_dummy_handler(int irq, void *dev_id)
{
+ return IRQ_HANDLED;
+ }
+ static irqreturn_t
+ xen_resched_handler(int irq, void *dev_id)
+ {
+ scheduler_ipi();
return IRQ_HANDLED;
}
@@ -110,7 +118,7 @@ static struct irqaction xen_ipi_irqaction = {
};
static struct irqaction xen_resched_irqaction = {
- .handler = xen_dummy_handler,
+ .handler = xen_resched_handler,
.flags = IRQF_DISABLED,
.name = "resched"
};
...
@@ -122,8 +122,6 @@ void smp_send_reschedule(int cpu_id)
*
* Description: This routine executes on CPU which received
* 'RESCHEDULE_IPI'.
- * Rescheduling is processed at the exit of interrupt
- * operation.
*
* Born on Date: 2002.02.05
*
@@ -138,7 +136,7 @@ void smp_send_reschedule(int cpu_id)
*==========================================================================*/
void smp_reschedule_interrupt(void)
{
- /* nothing to do */
+ scheduler_ipi();
}
/*==========================================================================*
...
@@ -44,6 +44,8 @@ static irqreturn_t mailbox_interrupt(int irq, void *dev_id)
if (action & SMP_CALL_FUNCTION)
smp_call_function_interrupt();
+ if (action & SMP_RESCHEDULE_YOURSELF)
+ scheduler_ipi();
/* Check if we've been told to flush the icache */
if (action & SMP_ICACHE_FLUSH)
...
@@ -929,7 +929,7 @@ static void post_direct_ipi(int cpu, struct smtc_ipi *pipi)
static void ipi_resched_interrupt(void)
{
- /* Return from interrupt should be enough to cause scheduler check */
+ scheduler_ipi();
}
static void ipi_call_interrupt(void)
...
@@ -308,6 +308,8 @@ static void ipi_call_dispatch(void)
static irqreturn_t ipi_resched_interrupt(int irq, void *dev_id)
{
+ scheduler_ipi();
return IRQ_HANDLED;
}
...
@@ -55,6 +55,8 @@ void titan_mailbox_irq(void)
if (status & 0x2)
smp_call_function_interrupt();
+ if (status & 0x4)
+ scheduler_ipi();
break;
case 1:
@@ -63,6 +65,8 @@ void titan_mailbox_irq(void)
if (status & 0x2)
smp_call_function_interrupt();
+ if (status & 0x4)
+ scheduler_ipi();
break;
}
}
...
@@ -147,8 +147,10 @@ static void ip27_do_irq_mask0(void)
#ifdef CONFIG_SMP
if (pend0 & (1UL << CPU_RESCHED_A_IRQ)) {
LOCAL_HUB_CLR_INTR(CPU_RESCHED_A_IRQ);
+ scheduler_ipi();
} else if (pend0 & (1UL << CPU_RESCHED_B_IRQ)) {
LOCAL_HUB_CLR_INTR(CPU_RESCHED_B_IRQ);
+ scheduler_ipi();
} else if (pend0 & (1UL << CPU_CALL_A_IRQ)) {
LOCAL_HUB_CLR_INTR(CPU_CALL_A_IRQ);
smp_call_function_interrupt();
...
@@ -20,6 +20,7 @@
#include <linux/delay.h>
#include <linux/smp.h>
#include <linux/kernel_stat.h>
+ #include <linux/sched.h>
#include <asm/mmu_context.h>
#include <asm/io.h>
@@ -189,10 +190,8 @@ void bcm1480_mailbox_interrupt(void)
/* Clear the mailbox to clear the interrupt */
__raw_writeq(((u64)action)<<48, mailbox_0_clear_regs[cpu]);
- /*
- * Nothing to do for SMP_RESCHEDULE_YOURSELF; returning from the
- * interrupt will do the reschedule for us
- */
+ if (action & SMP_RESCHEDULE_YOURSELF)
+ scheduler_ipi();
if (action & SMP_CALL_FUNCTION)
smp_call_function_interrupt();
...
@@ -21,6 +21,7 @@
#include <linux/interrupt.h>
#include <linux/smp.h>
#include <linux/kernel_stat.h>
+ #include <linux/sched.h>
#include <asm/mmu_context.h>
#include <asm/io.h>
@@ -177,10 +178,8 @@ void sb1250_mailbox_interrupt(void)
/* Clear the mailbox to clear the interrupt */
____raw_writeq(((u64)action) << 48, mailbox_clear_regs[cpu]);
- /*
- * Nothing to do for SMP_RESCHEDULE_YOURSELF; returning from the
- * interrupt will do the reschedule for us
- */
+ if (action & SMP_RESCHEDULE_YOURSELF)
+ scheduler_ipi();
if (action & SMP_CALL_FUNCTION)
smp_call_function_interrupt();
...
@@ -494,14 +494,11 @@ void smp_send_stop(void)
* @irq: The interrupt number.
* @dev_id: The device ID.
*
- * We need do nothing here, since the scheduling will be effected on our way
- * back through entry.S.
- *
* Returns IRQ_HANDLED to indicate we handled the interrupt successfully.
*/
static irqreturn_t smp_reschedule_interrupt(int irq, void *dev_id)
{
- /* do nothing */
+ scheduler_ipi();
return IRQ_HANDLED;
}
...
@@ -155,10 +155,7 @@ ipi_interrupt(int irq, void *dev_id)
case IPI_RESCHEDULE:
smp_debug(100, KERN_DEBUG "CPU%d IPI_RESCHEDULE\n", this_cpu);
- /*
- * Reschedule callback. Everything to be
- * done is done by the interrupt return path.
- */
+ scheduler_ipi();
break;
case IPI_CALL_FUNC:
...
@@ -116,7 +116,7 @@ void smp_message_recv(int msg)
generic_smp_call_function_interrupt();
break;
case PPC_MSG_RESCHEDULE:
- /* we notice need_resched on exit */
+ scheduler_ipi();
break;
case PPC_MSG_CALL_FUNC_SINGLE:
generic_smp_call_function_single_interrupt();
@@ -146,7 +146,7 @@ static irqreturn_t call_function_action(int irq, void *data)
static irqreturn_t reschedule_action(int irq, void *data)
{
- /* we just need the return path side effect of checking need_resched */
+ scheduler_ipi();
return IRQ_HANDLED;
}
...
@@ -165,12 +165,12 @@ static void do_ext_call_interrupt(unsigned int ext_int_code,
kstat_cpu(smp_processor_id()).irqs[EXTINT_IPI]++;
/*
* handle bit signal external calls
- *
- * For the ec_schedule signal we have to do nothing. All the work
- * is done automatically when we return from the interrupt.
*/
bits = xchg(&S390_lowcore.ext_call_fast, 0);
+ if (test_bit(ec_schedule, &bits))
+ scheduler_ipi();
if (test_bit(ec_call_function, &bits))
generic_smp_call_function_interrupt();
...
@@ -20,6 +20,7 @@
#include <linux/module.h>
#include <linux/cpu.h>
#include <linux/interrupt.h>
+ #include <linux/sched.h>
#include <asm/atomic.h>
#include <asm/processor.h>
#include <asm/system.h>
@@ -323,6 +324,7 @@ void smp_message_recv(unsigned int msg)
generic_smp_call_function_interrupt();
break;
case SMP_MSG_RESCHEDULE:
+ scheduler_ipi();
break;
case SMP_MSG_FUNCTION_SINGLE:
generic_smp_call_function_single_interrupt();
...
@@ -65,6 +65,10 @@ static inline int pcibus_to_node(struct pci_bus *pbus)
#define smt_capable() (sparc64_multi_core)
#endif /* CONFIG_SMP */
- #define cpu_coregroup_mask(cpu) (&cpu_core_map[cpu])
+ extern cpumask_t cpu_core_map[NR_CPUS];
+ static inline const struct cpumask *cpu_coregroup_mask(int cpu)
+ {
+ return &cpu_core_map[cpu];
+ }
#endif /* _ASM_SPARC64_TOPOLOGY_H */
@@ -129,7 +129,9 @@ struct linux_prom_registers smp_penguin_ctable __cpuinitdata = { 0 };
void smp_send_reschedule(int cpu)
{
- /* See sparc64 */
+ /*
+ * XXX missing reschedule IPI, see scheduler_ipi()
+ */
}
void smp_send_stop(void)
...
@@ -1368,6 +1368,7 @@ void smp_send_reschedule(int cpu)
void __irq_entry smp_receive_signal_client(int irq, struct pt_regs *regs)
{
clear_softint(1 << irq);
+ scheduler_ipi();
}
/* This is a nop because we capture all other cpus
...
@@ -189,12 +189,8 @@ void flush_icache_range(unsigned long start, unsigned long end)
/* Called when smp_send_reschedule() triggers IRQ_RESCHEDULE. */
static irqreturn_t handle_reschedule_ipi(int irq, void *token)
{
- /*
- * Nothing to do here; when we return from interrupt, the
- * rescheduling will occur there. But do bump the interrupt
- * profiler count in the meantime.
- */
__get_cpu_var(irq_stat).irq_resched_count++;
+ scheduler_ipi();
return IRQ_HANDLED;
}
...
@@ -173,7 +173,7 @@ void IPI_handler(int cpu)
break;
case 'R':
- set_tsk_need_resched(current);
+ scheduler_ipi();
break;
case 'S':
...
@@ -194,14 +194,13 @@ static void native_stop_other_cpus(int wait)
}
/*
- * Reschedule call back. Nothing to do,
- * all the work is done automatically when
- * we return from the interrupt.
+ * Reschedule call back.
*/
void smp_reschedule_interrupt(struct pt_regs *regs)
{
ack_APIC_irq();
inc_irq_stat(irq_resched_count);
+ scheduler_ipi();
/*
* KVM uses this interrupt to force a cpu out of guest mode
*/
...
@@ -46,13 +46,12 @@ static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id);
static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id);
/*
- * Reschedule call back. Nothing to do,
- * all the work is done automatically when
- * we return from the interrupt.
+ * Reschedule call back.
*/
static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id)
{
inc_irq_stat(irq_resched_count);
+ scheduler_ipi();
return IRQ_HANDLED;
}
...
@@ -134,7 +134,6 @@ extern struct cred init_cred;
.stack = &init_thread_info, \
.usage = ATOMIC_INIT(2), \
.flags = PF_KTHREAD, \
- .lock_depth = -1, \
.prio = MAX_PRIO-20, \
.static_prio = MAX_PRIO-20, \
.normal_prio = MAX_PRIO-20, \
...
@@ -51,7 +51,7 @@ struct mutex {
spinlock_t wait_lock;
struct list_head wait_list;
#if defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_SMP)
- struct thread_info *owner;
+ struct task_struct *owner;
#endif
#ifdef CONFIG_DEBUG_MUTEXES
const char *name;
...
@@ -360,7 +360,7 @@ extern signed long schedule_timeout_interruptible(signed long timeout);
extern signed long schedule_timeout_killable(signed long timeout);
extern signed long schedule_timeout_uninterruptible(signed long timeout);
asmlinkage void schedule(void);
- extern int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner);
+ extern int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner);
struct nsproxy;
struct user_namespace;
@@ -731,10 +731,6 @@ struct sched_info {
/* timestamps */
unsigned long long last_arrival,/* when we last ran on a cpu */
last_queued; /* when we were last queued to run */
- #ifdef CONFIG_SCHEDSTATS
- /* BKL stats */
- unsigned int bkl_count;
- #endif
};
#endif /* defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) */
@@ -868,6 +864,7 @@ static inline int sd_power_saving_flags(void)
struct sched_group {
struct sched_group *next; /* Must be a circular list */
+ atomic_t ref;
/*
* CPU power of this group, SCHED_LOAD_SCALE being max power for a
@@ -882,9 +879,6 @@ struct sched_group {
* NOTE: this field is variable length. (Allocated dynamically
* by attaching extra space to the end of the structure,
* depending on how many CPUs the kernel has booted up with)
- *
- * It is also be embedded into static data structures at build
- * time. (See 'struct static_sched_group' in kernel/sched.c)
*/
unsigned long cpumask[0];
};
@@ -894,17 +888,6 @@ static inline struct cpumask *sched_group_cpus(struct sched_group *sg)
return to_cpumask(sg->cpumask);
}
- enum sched_domain_level {
- SD_LV_NONE = 0,
- SD_LV_SIBLING,
- SD_LV_MC,
- SD_LV_BOOK,
- SD_LV_CPU,
- SD_LV_NODE,
- SD_LV_ALLNODES,
- SD_LV_MAX
- };
struct sched_domain_attr {
int relax_domain_level;
};
@@ -913,6 +896,8 @@ struct sched_domain_attr {
.relax_domain_level = -1, \
}
+ extern int sched_domain_level_max;
struct sched_domain {
/* These fields must be setup */
struct sched_domain *parent; /* top domain must be null terminated */
@@ -930,7 +915,7 @@ struct sched_domain {
unsigned int forkexec_idx;
unsigned int smt_gain;
int flags; /* See SD_* */
- enum sched_domain_level level;
+ int level;
/* Runtime fields. */
unsigned long last_balance; /* init to jiffies. units in jiffies */
@@ -973,6 +958,10 @@ struct sched_domain {
#ifdef CONFIG_SCHED_DEBUG
char *name;
#endif
+ union {
+ void *private; /* used during construction */
+ struct rcu_head rcu; /* used during destruction */
+ };
unsigned int span_weight;
/*
@@ -981,9 +970,6 @@ struct sched_domain {
* NOTE: this field is variable length. (Allocated dynamically
* by attaching extra space to the end of the structure,
* depending on how many CPUs the kernel has booted up with)
- *
- * It is also be embedded into static data structures at build
- * time. (See 'struct static_sched_domain' in kernel/sched.c)
*/
unsigned long span[0];
};
@@ -1048,8 +1034,12 @@ struct sched_domain;
#define WF_FORK 0x02 /* child wakeup after fork */
#define ENQUEUE_WAKEUP 1
- #define ENQUEUE_WAKING 2
+ #define ENQUEUE_HEAD 2
- #define ENQUEUE_HEAD 4
+ #ifdef CONFIG_SMP
+ #define ENQUEUE_WAKING 4 /* sched_class::task_waking was called */
+ #else
+ #define ENQUEUE_WAKING 0
+ #endif
#define DEQUEUE_SLEEP 1
@@ -1067,12 +1057,11 @@ struct sched_class {
void (*put_prev_task) (struct rq *rq, struct task_struct *p);
#ifdef CONFIG_SMP
- int (*select_task_rq)(struct rq *rq, struct task_struct *p,
+ int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags);
- int sd_flag, int flags);
void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
void (*post_schedule) (struct rq *this_rq);
- void (*task_waking) (struct rq *this_rq, struct task_struct *task);
+ void (*task_waking) (struct task_struct *task);
void (*task_woken) (struct rq *this_rq, struct task_struct *task);
void (*set_cpus_allowed)(struct task_struct *p,
@@ -1197,13 +1186,11 @@ struct task_struct {
unsigned int flags; /* per process flags, defined below */
unsigned int ptrace;
- int lock_depth; /* BKL lock depth */
#ifdef CONFIG_SMP
- #ifdef __ARCH_WANT_UNLOCKED_CTXSW
+ struct task_struct *wake_entry;
- int oncpu;
+ int on_cpu;
- #endif
#endif
+ int on_rq;
int prio, static_prio, normal_prio;
unsigned int rt_priority;
@@ -1274,6 +1261,7 @@ struct task_struct {
/* Revert to default priority/policy when forking */
unsigned sched_reset_on_fork:1;
+ unsigned sched_contributes_to_load:1;
pid_t pid;
pid_t tgid;
@@ -2063,14 +2051,13 @@ extern void xtime_update(unsigned long ticks);
extern int wake_up_state(struct task_struct *tsk, unsigned int state);
extern int wake_up_process(struct task_struct *tsk);
- extern void wake_up_new_task(struct task_struct *tsk,
+ extern void wake_up_new_task(struct task_struct *tsk);
- unsigned long clone_flags);
#ifdef CONFIG_SMP
extern void kick_process(struct task_struct *tsk);
#else
static inline void kick_process(struct task_struct *tsk) { }
#endif
- extern void sched_fork(struct task_struct *p, int clone_flags);
+ extern void sched_fork(struct task_struct *p);
extern void sched_dead(struct task_struct *p);
extern void proc_caches_init(void);
@@ -2195,8 +2182,10 @@ extern void set_task_comm(struct task_struct *tsk, char *from);
extern char *get_task_comm(char *to, struct task_struct *tsk);
#ifdef CONFIG_SMP
+ void scheduler_ipi(void);
extern unsigned long wait_task_inactive(struct task_struct *, long match_state);
#else
+ static inline void scheduler_ipi(void) { }
static inline unsigned long wait_task_inactive(struct task_struct *p,
long match_state)
{
...
@@ -827,6 +827,11 @@ config SCHED_AUTOGROUP
desktop applications. Task group autogeneration is currently based
upon task session.
+ config SCHED_TTWU_QUEUE
+ bool
+ depends on !SPARC32
+ default y
config MM_OWNER
bool
...
@@ -1159,7 +1159,7 @@ int current_cpuset_is_being_rebound(void)
static int update_relax_domain_level(struct cpuset *cs, s64 val)
{
#ifdef CONFIG_SMP
- if (val < -1 || val >= SD_LV_MAX)
+ if (val < -1 || val >= sched_domain_level_max)
return -EINVAL;
#endif
...
@@ -1103,7 +1103,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
posix_cpu_timers_init(p);
- p->lock_depth = -1; /* -1 = no lock */
do_posix_clock_monotonic_gettime(&p->start_time);
p->real_start_time = p->start_time;
monotonic_to_bootbased(&p->real_start_time);
@@ -1153,7 +1152,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
#endif
/* Perform scheduler related setup. Assign this task to a CPU. */
- sched_fork(p, clone_flags);
+ sched_fork(p);
retval = perf_event_init_task(p);
if (retval)
@@ -1464,7 +1463,7 @@ long do_fork(unsigned long clone_flags,
*/
p->flags &= ~PF_STARTING;
- wake_up_new_task(p, clone_flags);
+ wake_up_new_task(p);
tracehook_report_clone_complete(trace, regs,
clone_flags, nr, p);
...
@@ -75,7 +75,7 @@ void debug_mutex_unlock(struct mutex *lock)
return;
DEBUG_LOCKS_WARN_ON(lock->magic != lock);
- DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info());
+ DEBUG_LOCKS_WARN_ON(lock->owner != current);
DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
mutex_clear_owner(lock);
}
...
@@ -29,7 +29,7 @@ extern void debug_mutex_init(struct mutex *lock, const char *name,
static inline void mutex_set_owner(struct mutex *lock)
{
- lock->owner = current_thread_info();
+ lock->owner = current;
}
static inline void mutex_clear_owner(struct mutex *lock)
...
@@ -160,14 +160,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
*/
for (;;) {
- struct thread_info *owner;
+ struct task_struct *owner;
- /*
- * If we own the BKL, then don't spin. The owner of
- * the mutex might be waiting on us to release the BKL.
- */
- if (unlikely(current->lock_depth >= 0))
- break;
/*
* If there's an owner, wait for it to either
...
@@ -19,7 +19,7 @@
#ifdef CONFIG_SMP
static inline void mutex_set_owner(struct mutex *lock)
{
- lock->owner = current_thread_info();
+ lock->owner = current;
}
static inline void mutex_clear_owner(struct mutex *lock)
...
@@ -231,7 +231,7 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
#endif
/*
- * sched_domains_mutex serializes calls to arch_init_sched_domains,
+ * sched_domains_mutex serializes calls to init_sched_domains,
* detach_destroy_domains and partition_sched_domains.
*/
static DEFINE_MUTEX(sched_domains_mutex);
@@ -312,6 +312,9 @@ struct cfs_rq {
u64 exec_clock;
u64 min_vruntime;
+ #ifndef CONFIG_64BIT
+ u64 min_vruntime_copy;
+ #endif
struct rb_root tasks_timeline;
struct rb_node *rb_leftmost;
@@ -325,7 +328,9 @@ struct cfs_rq {
*/
struct sched_entity *curr, *next, *last, *skip;
+ #ifdef CONFIG_SCHED_DEBUG
unsigned int nr_spread_over;
+ #endif
#ifdef CONFIG_FAIR_GROUP_SCHED
struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
@@ -417,6 +422,7 @@ struct rt_rq {
*/
struct root_domain {
atomic_t refcount;
+ struct rcu_head rcu;
cpumask_var_t span;
cpumask_var_t online;
@@ -460,7 +466,7 @@ struct rq {
u64 nohz_stamp;
unsigned char nohz_balance_kick;
#endif
- unsigned int skip_clock_update;
+ int skip_clock_update;
/* capture load from *all* tasks on this cpu: */
struct load_weight load;
@@ -553,6 +559,10 @@ struct rq {
unsigned int ttwu_count;
unsigned int ttwu_local;
#endif
+ #ifdef CONFIG_SMP
+ struct task_struct *wake_list;
+ #endif
};
static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
@@ -571,7 +581,7 @@ static inline int cpu_of(struct rq *rq)
#define rcu_dereference_check_sched_domain(p) \
rcu_dereference_check((p), \
- rcu_read_lock_sched_held() || \
+ rcu_read_lock_held() || \
lockdep_is_held(&sched_domains_mutex))
/*
@@ -596,7 +606,7 @@ static inline int cpu_of(struct rq *rq)
* Return the group to which this tasks belongs.
*
* We use task_subsys_state_check() and extend the RCU verification
- * with lockdep_is_held(&task_rq(p)->lock) because cpu_cgroup_attach()
+ * with lockdep_is_held(&p->pi_lock) because cpu_cgroup_attach()
* holds that lock for each task it moves into the cgroup. Therefore
* by holding that lock, we pin the task to the current cgroup.
*/
@@ -606,7 +616,7 @@ static inline struct task_group *task_group(struct task_struct *p)
struct cgroup_subsys_state *css;
css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
- lockdep_is_held(&task_rq(p)->lock));
+ lockdep_is_held(&p->pi_lock));
tg = container_of(css, struct task_group, css);
return autogroup_task_group(p, tg);
@@ -642,7 +652,7 @@ static void update_rq_clock(struct rq *rq)
{
s64 delta;
- if (rq->skip_clock_update)
+ if (rq->skip_clock_update > 0)
return;
delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
@@ -838,18 +848,39 @@ static inline int task_current(struct rq *rq, struct task_struct *p)
return rq->curr == p;
}
- #ifndef __ARCH_WANT_UNLOCKED_CTXSW
static inline int task_running(struct rq *rq, struct task_struct *p)
{
+ #ifdef CONFIG_SMP
+ return p->on_cpu;
+ #else
return task_current(rq, p);
+ #endif
}
+ #ifndef __ARCH_WANT_UNLOCKED_CTXSW
static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
{
+ #ifdef CONFIG_SMP
+ /*
+ * We can optimise this out completely for !SMP, because the
+ * SMP rebalancing from interrupt is the only thing that cares
+ * here.
+ */
+ next->on_cpu = 1;
+ #endif
}
static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
{
+ #ifdef CONFIG_SMP
+ /*
+ * After ->on_cpu is cleared, the task can be moved to a different CPU.
+ * We must ensure this doesn't happen until the switch is completely
+ * finished.
+ */
+ smp_wmb();
+ prev->on_cpu = 0;
+ #endif
#ifdef CONFIG_DEBUG_SPINLOCK
/* this is a valid case when another task releases the spinlock */
rq->lock.owner = current;
@@ -865,15 +896,6 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
}
#else /* __ARCH_WANT_UNLOCKED_CTXSW */
- static inline int task_running(struct rq *rq, struct task_struct *p)
- {
- #ifdef CONFIG_SMP
- return p->oncpu;
- #else
- return task_current(rq, p);
- #endif
- }
static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
{
#ifdef CONFIG_SMP
@@ -882,7 +904,7 @@ static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
* SMP rebalancing from interrupt is the only thing that cares
* here.
*/
- next->oncpu = 1;
+ next->on_cpu = 1;
#endif
#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
raw_spin_unlock_irq(&rq->lock);
@@ -895,12 +917,12 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
{
#ifdef CONFIG_SMP
/*
- * After ->oncpu is cleared, the task can be moved to a different CPU.
+ * After ->on_cpu is cleared, the task can be moved to a different CPU.
* We must ensure this doesn't happen until the switch is completely
* finished.
*/
smp_wmb();
- prev->oncpu = 0;
+ prev->on_cpu = 0;
#endif
#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
local_irq_enable();
@@ -909,23 +931,15 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
/*
- * Check whether the task is waking, we use this to synchronize ->cpus_allowed
+ * __task_rq_lock - lock the rq @p resides on.
- * against ttwu().
- */
- static inline int task_is_waking(struct task_struct *p)
- {
- return unlikely(p->state == TASK_WAKING);
- }
- /*
- * __task_rq_lock - lock the runqueue a given task resides on.
- * Must be called interrupts disabled.
*/
static inline struct rq *__task_rq_lock(struct task_struct *p)
__acquires(rq->lock)
{
struct rq *rq;
+ lockdep_assert_held(&p->pi_lock);
for (;;) {
rq = task_rq(p);
raw_spin_lock(&rq->lock);
@@ -936,22 +950,22 @@ static inline struct rq *__task_rq_lock(struct task_struct *p)
}
/*
- * task_rq_lock - lock the runqueue a given task resides on and disable
+ * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
- * interrupts. Note the ordering: we can safely lookup the task_rq without
- * explicitly disabling preemption.
*/
static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
+ __acquires(p->pi_lock)
__acquires(rq->lock)
{
struct rq *rq;
for (;;) {
- local_irq_save(*flags);
+ raw_spin_lock_irqsave(&p->pi_lock, *flags);
rq = task_rq(p);
raw_spin_lock(&rq->lock);
if (likely(rq == task_rq(p)))
return rq;
- raw_spin_unlock_irqrestore(&rq->lock, *flags);
+ raw_spin_unlock(&rq->lock);
+ raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
}
}
@@ -961,10 +975,13 @@ static void __task_rq_unlock(struct rq *rq)
raw_spin_unlock(&rq->lock);
}
- static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
+ static inline void
+ task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
__releases(rq->lock)
+ __releases(p->pi_lock)
{
- raw_spin_unlock_irqrestore(&rq->lock, *flags);
+ raw_spin_unlock(&rq->lock);
+ raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
}
/*
@@ -1193,11 +1210,17 @@ int get_nohz_timer_target(void)
int i;
struct sched_domain *sd;
+ rcu_read_lock();
for_each_domain(cpu, sd) {
- for_each_cpu(i, sched_domain_span(sd))
+ for_each_cpu(i, sched_domain_span(sd)) {
- if (!idle_cpu(i))
+ if (!idle_cpu(i)) {
- return i;
+ cpu = i;
+ goto unlock;
+ }
+ }
}
+ unlock:
+ rcu_read_unlock();
return cpu;
}
/*
@@ -1307,15 +1330,15 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,
{
u64 tmp;
+ tmp = (u64)delta_exec * weight;
if (!lw->inv_weight) {
if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST))
lw->inv_weight = 1;
else
- lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)
+ lw->inv_weight = WMULT_CONST / lw->weight;
- / (lw->weight+1);
}
- tmp = (u64)delta_exec * weight;
/*
* Check whether we'd overflow the 64-bit multiplication:
*/
@@ -1773,7 +1796,6 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
update_rq_clock(rq);
sched_info_queued(p);
p->sched_class->enqueue_task(rq, p, flags);
- p->se.on_rq = 1;
}
static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
@@ -1781,7 +1803,6 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
update_rq_clock(rq);
sched_info_dequeued(p);
p->sched_class->dequeue_task(rq, p, flags);
- p->se.on_rq = 0;
}
/*
@@ -2116,7 +2137,7 @@ static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
* A queue event has occurred, and we're going to schedule. In
* this case, we can save a useless back to back clock update.
*/
- if (rq->curr->se.on_rq && test_tsk_need_resched(rq->curr))
+ if (rq->curr->on_rq && test_tsk_need_resched(rq->curr))
rq->skip_clock_update = 1;
}
@@ -2162,6 +2183,11 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
*/
WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
!(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
+ #ifdef CONFIG_LOCKDEP
+ WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
+ lockdep_is_held(&task_rq(p)->lock)));
+ #endif
#endif
trace_sched_migrate_task(p, new_cpu);
@@ -2181,19 +2207,6 @@ struct migration_arg {
static int migration_cpu_stop(void *data);
- /*
- * The task's runqueue lock must be held.
- * Returns true if you have to wait for migration thread.
- */
- static bool migrate_task(struct task_struct *p, struct rq *rq)
- {
- /*
- * If the task is not on a runqueue (and not running), then
- * the next wake-up will properly place the task.
- */
- return p->se.on_rq || task_running(rq, p);
- }
/*
* wait_task_inactive - wait for a thread to unschedule.
*
@@ -2251,11 +2264,11 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
rq = task_rq_lock(p, &flags);
trace_sched_wait_task(p);
running = task_running(rq, p);
- on_rq = p->se.on_rq;
+ on_rq = p->on_rq;
ncsw = 0;
if (!match_state || p->state == match_state)
ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
- task_rq_unlock(rq, &flags);
+ task_rq_unlock(rq, p, &flags);
/*
* If it changed from the expected state, bail out now.
@@ -2330,7 +2343,7 @@ EXPORT_SYMBOL_GPL(kick_process);
#ifdef CONFIG_SMP
/*
- * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held.
+ * ->cpus_allowed is protected by both rq->lock and p->pi_lock
*/
static int select_fallback_rq(int cpu, struct task_struct *p)
{
@@ -2363,12 +2376,12 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
}
/*
- * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable.
+ * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
*/
static inline
- int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags)
+ int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
{
- int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags);
+ int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
/*
* In order not to call set_task_cpu() on a blocking task we need
@@ -2394,27 +2407,62 @@ static void update_avg(u64 *avg, u64 sample)
}
#endif
- static inline void ttwu_activate(struct task_struct *p, struct rq *rq,
+ static void
- bool is_sync, bool is_migrate, bool is_local,
+ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
- unsigned long en_flags)
{
+ #ifdef CONFIG_SCHEDSTATS
+ struct rq *rq = this_rq();
+ #ifdef CONFIG_SMP
+ int this_cpu = smp_processor_id();
+ if (cpu == this_cpu) {
+ schedstat_inc(rq, ttwu_local);
+ schedstat_inc(p, se.statistics.nr_wakeups_local);
+ } else {
+ struct sched_domain *sd;
+ schedstat_inc(p, se.statistics.nr_wakeups_remote);
+ rcu_read_lock();
+ for_each_domain(this_cpu, sd) {
+ if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
+ schedstat_inc(sd, ttwu_wake_remote);
+ break;
+ }
+ }
+ rcu_read_unlock();
+ }
+ #endif /* CONFIG_SMP */
+ schedstat_inc(rq, ttwu_count);
schedstat_inc(p, se.statistics.nr_wakeups);
- if (is_sync)
+ if (wake_flags & WF_SYNC)
schedstat_inc(p, se.statistics.nr_wakeups_sync);
- if (is_migrate)
+ if (cpu != task_cpu(p))
schedstat_inc(p, se.statistics.nr_wakeups_migrate);
- if (is_local)
- schedstat_inc(p, se.statistics.nr_wakeups_local);
- else
- schedstat_inc(p, se.statistics.nr_wakeups_remote);
+ #endif /* CONFIG_SCHEDSTATS */
+ }
+ static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
+ {
activate_task(rq, p, en_flags);
+ p->on_rq = 1;
+ /* if a worker is waking up, notify workqueue */
+ if (p->flags & PF_WQ_WORKER)
+ wq_worker_waking_up(p, cpu_of(rq));
}
- static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
+ /*
- int wake_flags, bool success)
+ * Mark the task runnable and perform wakeup-preemption.
+ */
+ static void
+ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
{
- trace_sched_wakeup(p, success);
+ trace_sched_wakeup(p, true);
check_preempt_curr(rq, p, wake_flags);
p->state = TASK_RUNNING;
@@ -2433,9 +2481,99 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
rq->idle_stamp = 0;
}
#endif
- /* if a worker is waking up, notify workqueue */
+ }
- if ((p->flags & PF_WQ_WORKER) && success)
- wq_worker_waking_up(p, cpu_of(rq));
+ static void
+ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
+ {
+ #ifdef CONFIG_SMP
+ if (p->sched_contributes_to_load)
+ rq->nr_uninterruptible--;
+ #endif
+ ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
+ ttwu_do_wakeup(rq, p, wake_flags);
+ }
+ /*
+ * Called in case the task @p isn't fully descheduled from its runqueue,
+ * in this case we must do a remote wakeup. Its a 'light' wakeup though,
+ * since all we need to do is flip p->state to TASK_RUNNING, since
+ * the task is still ->on_rq.
+ */
+ static int ttwu_remote(struct task_struct *p, int wake_flags)
+ {
+ struct rq *rq;
+ int ret = 0;
+ rq = __task_rq_lock(p);
+ if (p->on_rq) {
+ ttwu_do_wakeup(rq, p, wake_flags);
+ ret = 1;
+ }
+ __task_rq_unlock(rq);
+ return ret;
+ }
+ #ifdef CONFIG_SMP
+ static void sched_ttwu_pending(void)
+ {
+ struct rq *rq = this_rq();
+ struct task_struct *list = xchg(&rq->wake_list, NULL);
+ if (!list)
+ return;
+ raw_spin_lock(&rq->lock);
+ while (list) {
+ struct task_struct *p = list;
+ list = list->wake_entry;
+ ttwu_do_activate(rq, p, 0);
+ }
+ raw_spin_unlock(&rq->lock);
+ }
+ void scheduler_ipi(void)
+ {
+ sched_ttwu_pending();
+ }
+ static void ttwu_queue_remote(struct task_struct *p, int cpu)
+ {
+ struct rq *rq = cpu_rq(cpu);
+ struct task_struct *next = rq->wake_list;
+ for (;;) {
+ struct task_struct *old = next;
+ p->wake_entry = next;
+ next = cmpxchg(&rq->wake_list, old, p);
+ if (next == old)
+ break;
+ }
+ if (!next)
+ smp_send_reschedule(cpu);
+ }
+ #endif
+ static void ttwu_queue(struct task_struct *p, int cpu)
+ {
+ struct rq *rq = cpu_rq(cpu);
+ #if defined(CONFIG_SMP) && defined(CONFIG_SCHED_TTWU_QUEUE)
+ if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) {
+ ttwu_queue_remote(p, cpu);
+ return;
+ }
+ #endif
+ raw_spin_lock(&rq->lock);
+ ttwu_do_activate(rq, p, 0);
+ raw_spin_unlock(&rq->lock);
}
/**
...@@ -2453,92 +2591,64 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, ...@@ -2453,92 +2591,64 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
* Returns %true if @p was woken up, %false if it was already running * Returns %true if @p was woken up, %false if it was already running
* or @state didn't match @p's state. * or @state didn't match @p's state.
*/ */
static int try_to_wake_up(struct task_struct *p, unsigned int state, static int
int wake_flags) try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
{ {
int cpu, orig_cpu, this_cpu, success = 0;
unsigned long flags; unsigned long flags;
unsigned long en_flags = ENQUEUE_WAKEUP; int cpu, success = 0;
struct rq *rq;
this_cpu = get_cpu();
smp_wmb(); smp_wmb();
rq = task_rq_lock(p, &flags); raw_spin_lock_irqsave(&p->pi_lock, flags);
if (!(p->state & state)) if (!(p->state & state))
goto out; goto out;
if (p->se.on_rq) success = 1; /* we're going to change ->state */
goto out_running;
cpu = task_cpu(p); cpu = task_cpu(p);
orig_cpu = cpu;
#ifdef CONFIG_SMP if (p->on_rq && ttwu_remote(p, wake_flags))
if (unlikely(task_running(rq, p))) goto stat;
goto out_activate;
#ifdef CONFIG_SMP
/* /*
* In order to handle concurrent wakeups and release the rq->lock * If the owning (remote) cpu is still in the middle of schedule() with
* we put the task in TASK_WAKING state. * this task as prev, wait until its done referencing the task.
*
* First fix up the nr_uninterruptible count:
*/ */
if (task_contributes_to_load(p)) { while (p->on_cpu) {
if (likely(cpu_online(orig_cpu))) #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
rq->nr_uninterruptible--; /*
else * If called from interrupt context we could have landed in the
this_rq()->nr_uninterruptible--; * middle of schedule(), in this case we should take care not
* to spin on ->on_cpu if p is current, since that would
* deadlock.
*/
if (p == current) {
ttwu_queue(p, cpu);
goto stat;
} }
p->state = TASK_WAKING; #endif
cpu_relax();
if (p->sched_class->task_waking) {
p->sched_class->task_waking(rq, p);
en_flags |= ENQUEUE_WAKING;
} }
cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags);
if (cpu != orig_cpu)
set_task_cpu(p, cpu);
__task_rq_unlock(rq);
rq = cpu_rq(cpu);
raw_spin_lock(&rq->lock);
/* /*
* We migrated the task without holding either rq->lock, however * Pairs with the smp_wmb() in finish_lock_switch().
* since the task is not on the task list itself, nobody else
* will try and migrate the task, hence the rq should match the
* cpu we just moved it to.
*/ */
WARN_ON(task_cpu(p) != cpu); smp_rmb();
WARN_ON(p->state != TASK_WAKING);
#ifdef CONFIG_SCHEDSTATS p->sched_contributes_to_load = !!task_contributes_to_load(p);
schedstat_inc(rq, ttwu_count); p->state = TASK_WAKING;
if (cpu == this_cpu)
schedstat_inc(rq, ttwu_local);
else {
struct sched_domain *sd;
for_each_domain(this_cpu, sd) {
if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
schedstat_inc(sd, ttwu_wake_remote);
break;
}
}
}
#endif /* CONFIG_SCHEDSTATS */
out_activate: if (p->sched_class->task_waking)
p->sched_class->task_waking(p);
cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
if (task_cpu(p) != cpu)
set_task_cpu(p, cpu);
#endif /* CONFIG_SMP */ #endif /* CONFIG_SMP */
ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu,
cpu == this_cpu, en_flags); ttwu_queue(p, cpu);
success = 1; stat:
out_running: ttwu_stat(p, cpu, wake_flags);
ttwu_post_activation(p, rq, wake_flags, success);
out: out:
task_rq_unlock(rq, &flags); raw_spin_unlock_irqrestore(&p->pi_lock, flags);
put_cpu();
return success; return success;
} }
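The while (p->on_cpu) loop above waits for the CPU that is still context-switching away from @p, and the smp_rmb() that follows pairs with the smp_wmb() in finish_lock_switch() so that everything the old CPU wrote before clearing ->on_cpu is visible to the waker. A user-space analogue of that handshake, expressed with C11 release/acquire ordering (switcher()/waker() and the field names are invented; not kernel code):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static int task_state;              /* stands in for the fields ttwu() reads */
static atomic_int on_cpu = 1;       /* stands in for p->on_cpu */

static void *switcher(void *arg)
{
    (void)arg;
    task_state = 42;                /* updates made while the task was "on cpu" */
    /* Release store: pairs with the acquire fence in the waker, the way the
     * smp_wmb() in finish_lock_switch() pairs with the smp_rmb() above. */
    atomic_store_explicit(&on_cpu, 0, memory_order_release);
    return NULL;
}

static void *waker(void *arg)
{
    (void)arg;
    while (atomic_load_explicit(&on_cpu, memory_order_relaxed))
        ;                           /* the cpu_relax() spin in the kernel */
    atomic_thread_fence(memory_order_acquire);
    printf("observed state %d\n", task_state);  /* guaranteed to be 42 */
    return NULL;
}

int main(void)
{
    pthread_t a, b;

    pthread_create(&a, NULL, waker, NULL);
    pthread_create(&b, NULL, switcher, NULL);
    pthread_join(a, NULL);
    pthread_join(b, NULL);
    return 0;
}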
...@@ -2549,29 +2659,32 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, ...@@ -2549,29 +2659,32 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
* *
* Put @p on the run-queue if it's not already there. The caller must * Put @p on the run-queue if it's not already there. The caller must
* ensure that this_rq() is locked, @p is bound to this_rq() and not * ensure that this_rq() is locked, @p is bound to this_rq() and not
* the current task. this_rq() stays locked over invocation. * the current task.
*/ */
static void try_to_wake_up_local(struct task_struct *p) static void try_to_wake_up_local(struct task_struct *p)
{ {
struct rq *rq = task_rq(p); struct rq *rq = task_rq(p);
bool success = false;
BUG_ON(rq != this_rq()); BUG_ON(rq != this_rq());
BUG_ON(p == current); BUG_ON(p == current);
lockdep_assert_held(&rq->lock); lockdep_assert_held(&rq->lock);
if (!raw_spin_trylock(&p->pi_lock)) {
raw_spin_unlock(&rq->lock);
raw_spin_lock(&p->pi_lock);
raw_spin_lock(&rq->lock);
}
if (!(p->state & TASK_NORMAL)) if (!(p->state & TASK_NORMAL))
return; goto out;
if (!p->se.on_rq) { if (!p->on_rq)
if (likely(!task_running(rq, p))) { ttwu_activate(rq, p, ENQUEUE_WAKEUP);
schedstat_inc(rq, ttwu_count);
schedstat_inc(rq, ttwu_local); ttwu_do_wakeup(rq, p, 0);
} ttwu_stat(p, smp_processor_id(), 0);
ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP); out:
success = true; raw_spin_unlock(&p->pi_lock);
}
ttwu_post_activation(p, rq, 0, success);
} }
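try_to_wake_up_local() is entered with rq->lock held but must also take p->pi_lock, whose documented order is pi_lock before rq->lock; hence the trylock, and on failure the rq->lock is dropped and both locks are reacquired in the correct order (after which the protected state has to be rechecked, which the TASK_NORMAL test does). A generic sketch of that trylock-or-back-off pattern with pthread mutexes (lock names are illustrative only):

#include <pthread.h>

static pthread_mutex_t pi_lock_ish = PTHREAD_MUTEX_INITIALIZER;  /* outer lock */
static pthread_mutex_t rq_lock_ish = PTHREAD_MUTEX_INITIALIZER;  /* inner lock */

/* Called with rq_lock_ish held; returns with both locks held without ever
 * acquiring them in the wrong order. */
static void take_outer_while_holding_inner(void)
{
    if (pthread_mutex_trylock(&pi_lock_ish) != 0) {
        /* Taking it outright here would invert the ordering, so back off
         * and reacquire both locks in the documented order. */
        pthread_mutex_unlock(&rq_lock_ish);
        pthread_mutex_lock(&pi_lock_ish);
        pthread_mutex_lock(&rq_lock_ish);
    }
}

int main(void)
{
    pthread_mutex_lock(&rq_lock_ish);
    take_outer_while_holding_inner();
    /* If the inner lock was dropped above, any state it protected must be
     * revalidated here -- try_to_wake_up_local() rechecks p->state. */
    pthread_mutex_unlock(&rq_lock_ish);
    pthread_mutex_unlock(&pi_lock_ish);
    return 0;
}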
/** /**
...@@ -2604,19 +2717,21 @@ int wake_up_state(struct task_struct *p, unsigned int state) ...@@ -2604,19 +2717,21 @@ int wake_up_state(struct task_struct *p, unsigned int state)
*/ */
static void __sched_fork(struct task_struct *p) static void __sched_fork(struct task_struct *p)
{ {
p->on_rq = 0;
p->se.on_rq = 0;
p->se.exec_start = 0; p->se.exec_start = 0;
p->se.sum_exec_runtime = 0; p->se.sum_exec_runtime = 0;
p->se.prev_sum_exec_runtime = 0; p->se.prev_sum_exec_runtime = 0;
p->se.nr_migrations = 0; p->se.nr_migrations = 0;
p->se.vruntime = 0; p->se.vruntime = 0;
INIT_LIST_HEAD(&p->se.group_node);
#ifdef CONFIG_SCHEDSTATS #ifdef CONFIG_SCHEDSTATS
memset(&p->se.statistics, 0, sizeof(p->se.statistics)); memset(&p->se.statistics, 0, sizeof(p->se.statistics));
#endif #endif
INIT_LIST_HEAD(&p->rt.run_list); INIT_LIST_HEAD(&p->rt.run_list);
p->se.on_rq = 0;
INIT_LIST_HEAD(&p->se.group_node);
#ifdef CONFIG_PREEMPT_NOTIFIERS #ifdef CONFIG_PREEMPT_NOTIFIERS
INIT_HLIST_HEAD(&p->preempt_notifiers); INIT_HLIST_HEAD(&p->preempt_notifiers);
...@@ -2626,8 +2741,9 @@ static void __sched_fork(struct task_struct *p) ...@@ -2626,8 +2741,9 @@ static void __sched_fork(struct task_struct *p)
/* /*
* fork()/clone()-time setup: * fork()/clone()-time setup:
*/ */
void sched_fork(struct task_struct *p, int clone_flags) void sched_fork(struct task_struct *p)
{ {
unsigned long flags;
int cpu = get_cpu(); int cpu = get_cpu();
__sched_fork(p); __sched_fork(p);
...@@ -2678,16 +2794,16 @@ void sched_fork(struct task_struct *p, int clone_flags) ...@@ -2678,16 +2794,16 @@ void sched_fork(struct task_struct *p, int clone_flags)
* *
* Silence PROVE_RCU. * Silence PROVE_RCU.
*/ */
rcu_read_lock(); raw_spin_lock_irqsave(&p->pi_lock, flags);
set_task_cpu(p, cpu); set_task_cpu(p, cpu);
rcu_read_unlock(); raw_spin_unlock_irqrestore(&p->pi_lock, flags);
#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
if (likely(sched_info_on())) if (likely(sched_info_on()))
memset(&p->sched_info, 0, sizeof(p->sched_info)); memset(&p->sched_info, 0, sizeof(p->sched_info));
#endif #endif
#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) #if defined(CONFIG_SMP)
p->oncpu = 0; p->on_cpu = 0;
#endif #endif
#ifdef CONFIG_PREEMPT #ifdef CONFIG_PREEMPT
/* Want to start with kernel preemption disabled. */ /* Want to start with kernel preemption disabled. */
...@@ -2707,41 +2823,31 @@ void sched_fork(struct task_struct *p, int clone_flags) ...@@ -2707,41 +2823,31 @@ void sched_fork(struct task_struct *p, int clone_flags)
* that must be done for every newly created context, then puts the task * that must be done for every newly created context, then puts the task
* on the runqueue and wakes it. * on the runqueue and wakes it.
*/ */
void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) void wake_up_new_task(struct task_struct *p)
{ {
unsigned long flags; unsigned long flags;
struct rq *rq; struct rq *rq;
int cpu __maybe_unused = get_cpu();
raw_spin_lock_irqsave(&p->pi_lock, flags);
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
rq = task_rq_lock(p, &flags);
p->state = TASK_WAKING;
/* /*
* Fork balancing, do it here and not earlier because: * Fork balancing, do it here and not earlier because:
* - cpus_allowed can change in the fork path * - cpus_allowed can change in the fork path
* - any previously selected cpu might disappear through hotplug * - any previously selected cpu might disappear through hotplug
*
* We set TASK_WAKING so that select_task_rq() can drop rq->lock
* without people poking at ->cpus_allowed.
*/ */
cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0); set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0));
set_task_cpu(p, cpu);
p->state = TASK_RUNNING;
task_rq_unlock(rq, &flags);
#endif #endif
rq = task_rq_lock(p, &flags); rq = __task_rq_lock(p);
activate_task(rq, p, 0); activate_task(rq, p, 0);
trace_sched_wakeup_new(p, 1); p->on_rq = 1;
trace_sched_wakeup_new(p, true);
check_preempt_curr(rq, p, WF_FORK); check_preempt_curr(rq, p, WF_FORK);
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
if (p->sched_class->task_woken) if (p->sched_class->task_woken)
p->sched_class->task_woken(rq, p); p->sched_class->task_woken(rq, p);
#endif #endif
task_rq_unlock(rq, &flags); task_rq_unlock(rq, p, &flags);
put_cpu();
} }
#ifdef CONFIG_PREEMPT_NOTIFIERS #ifdef CONFIG_PREEMPT_NOTIFIERS
...@@ -3450,27 +3556,22 @@ void sched_exec(void) ...@@ -3450,27 +3556,22 @@ void sched_exec(void)
{ {
struct task_struct *p = current; struct task_struct *p = current;
unsigned long flags; unsigned long flags;
struct rq *rq;
int dest_cpu; int dest_cpu;
rq = task_rq_lock(p, &flags); raw_spin_lock_irqsave(&p->pi_lock, flags);
dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0); dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0);
if (dest_cpu == smp_processor_id()) if (dest_cpu == smp_processor_id())
goto unlock; goto unlock;
/* if (likely(cpu_active(dest_cpu))) {
* select_task_rq() can race against ->cpus_allowed
*/
if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
likely(cpu_active(dest_cpu)) && migrate_task(p, rq)) {
struct migration_arg arg = { p, dest_cpu }; struct migration_arg arg = { p, dest_cpu };
task_rq_unlock(rq, &flags); raw_spin_unlock_irqrestore(&p->pi_lock, flags);
stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
return; return;
} }
unlock: unlock:
task_rq_unlock(rq, &flags); raw_spin_unlock_irqrestore(&p->pi_lock, flags);
} }
#endif #endif
...@@ -3507,7 +3608,7 @@ unsigned long long task_delta_exec(struct task_struct *p) ...@@ -3507,7 +3608,7 @@ unsigned long long task_delta_exec(struct task_struct *p)
rq = task_rq_lock(p, &flags); rq = task_rq_lock(p, &flags);
ns = do_task_delta_exec(p, rq); ns = do_task_delta_exec(p, rq);
task_rq_unlock(rq, &flags); task_rq_unlock(rq, p, &flags);
return ns; return ns;
} }
...@@ -3525,7 +3626,7 @@ unsigned long long task_sched_runtime(struct task_struct *p) ...@@ -3525,7 +3626,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
rq = task_rq_lock(p, &flags); rq = task_rq_lock(p, &flags);
ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
task_rq_unlock(rq, &flags); task_rq_unlock(rq, p, &flags);
return ns; return ns;
} }
...@@ -3549,7 +3650,7 @@ unsigned long long thread_group_sched_runtime(struct task_struct *p) ...@@ -3549,7 +3650,7 @@ unsigned long long thread_group_sched_runtime(struct task_struct *p)
rq = task_rq_lock(p, &flags); rq = task_rq_lock(p, &flags);
thread_group_cputime(p, &totals); thread_group_cputime(p, &totals);
ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq); ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
task_rq_unlock(rq, &flags); task_rq_unlock(rq, p, &flags);
return ns; return ns;
} }
...@@ -3903,9 +4004,6 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) ...@@ -3903,9 +4004,6 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
/* /*
* This function gets called by the timer code, with HZ frequency. * This function gets called by the timer code, with HZ frequency.
* We call it with interrupts disabled. * We call it with interrupts disabled.
*
* It also gets called by the fork code, when changing the parent's
* timeslices.
*/ */
void scheduler_tick(void) void scheduler_tick(void)
{ {
...@@ -4025,17 +4123,11 @@ static inline void schedule_debug(struct task_struct *prev) ...@@ -4025,17 +4123,11 @@ static inline void schedule_debug(struct task_struct *prev)
profile_hit(SCHED_PROFILING, __builtin_return_address(0)); profile_hit(SCHED_PROFILING, __builtin_return_address(0));
schedstat_inc(this_rq(), sched_count); schedstat_inc(this_rq(), sched_count);
#ifdef CONFIG_SCHEDSTATS
if (unlikely(prev->lock_depth >= 0)) {
schedstat_inc(this_rq(), rq_sched_info.bkl_count);
schedstat_inc(prev, sched_info.bkl_count);
}
#endif
} }
static void put_prev_task(struct rq *rq, struct task_struct *prev) static void put_prev_task(struct rq *rq, struct task_struct *prev)
{ {
if (prev->se.on_rq) if (prev->on_rq || rq->skip_clock_update < 0)
update_rq_clock(rq); update_rq_clock(rq);
prev->sched_class->put_prev_task(rq, prev); prev->sched_class->put_prev_task(rq, prev);
} }
...@@ -4097,11 +4189,13 @@ asmlinkage void __sched schedule(void) ...@@ -4097,11 +4189,13 @@ asmlinkage void __sched schedule(void)
if (unlikely(signal_pending_state(prev->state, prev))) { if (unlikely(signal_pending_state(prev->state, prev))) {
prev->state = TASK_RUNNING; prev->state = TASK_RUNNING;
} else { } else {
deactivate_task(rq, prev, DEQUEUE_SLEEP);
prev->on_rq = 0;
/* /*
* If a worker is going to sleep, notify and * If a worker went to sleep, notify and ask workqueue
* ask workqueue whether it wants to wake up a * whether it wants to wake up a task to maintain
* task to maintain concurrency. If so, wake * concurrency.
* up the task.
*/ */
if (prev->flags & PF_WQ_WORKER) { if (prev->flags & PF_WQ_WORKER) {
struct task_struct *to_wakeup; struct task_struct *to_wakeup;
...@@ -4110,11 +4204,10 @@ asmlinkage void __sched schedule(void) ...@@ -4110,11 +4204,10 @@ asmlinkage void __sched schedule(void)
if (to_wakeup) if (to_wakeup)
try_to_wake_up_local(to_wakeup); try_to_wake_up_local(to_wakeup);
} }
deactivate_task(rq, prev, DEQUEUE_SLEEP);
/* /*
* If we are going to sleep and we have plugged IO queued, make * If we are going to sleep and we have plugged IO
* sure to submit it to avoid deadlocks. * queued, make sure to submit it to avoid deadlocks.
*/ */
if (blk_needs_flush_plug(prev)) { if (blk_needs_flush_plug(prev)) {
raw_spin_unlock(&rq->lock); raw_spin_unlock(&rq->lock);
...@@ -4161,70 +4254,53 @@ asmlinkage void __sched schedule(void) ...@@ -4161,70 +4254,53 @@ asmlinkage void __sched schedule(void)
EXPORT_SYMBOL(schedule); EXPORT_SYMBOL(schedule);
#ifdef CONFIG_MUTEX_SPIN_ON_OWNER #ifdef CONFIG_MUTEX_SPIN_ON_OWNER
/*
* Look out! "owner" is an entirely speculative pointer static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
* access and not reliable.
*/
int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
{ {
unsigned int cpu; bool ret = false;
struct rq *rq;
if (!sched_feat(OWNER_SPIN)) rcu_read_lock();
return 0; if (lock->owner != owner)
goto fail;
#ifdef CONFIG_DEBUG_PAGEALLOC
/* /*
* Need to access the cpu field knowing that * Ensure we emit the owner->on_cpu dereference _after_ checking
* DEBUG_PAGEALLOC could have unmapped it if * that lock->owner still matches owner. If that fails, owner might
* the mutex owner just released it and exited. * point to free()d memory; if it still matches, the rcu_read_lock()
* ensures the memory stays valid.
*/ */
if (probe_kernel_address(&owner->cpu, cpu)) barrier();
return 0;
#else
cpu = owner->cpu;
#endif
/* ret = owner->on_cpu;
* Even if the access succeeded (likely case), fail:
* the cpu field may no longer be valid. rcu_read_unlock();
*/
if (cpu >= nr_cpumask_bits)
return 0;
/* return ret;
* We need to validate that we can do a }
* get_cpu() and that we have the percpu area.
/*
* Look out! "owner" is an entirely speculative pointer
* access and not reliable.
*/ */
if (!cpu_online(cpu)) int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
{
if (!sched_feat(OWNER_SPIN))
return 0; return 0;
rq = cpu_rq(cpu); while (owner_running(lock, owner)) {
if (need_resched())
for (;;) {
/*
* Owner changed, break to re-assess state.
*/
if (lock->owner != owner) {
/*
* If the lock has switched to a different owner,
* we likely have heavy contention. Return 0 to quit
* optimistic spinning and not contend further:
*/
if (lock->owner)
return 0; return 0;
break;
arch_mutex_cpu_relax();
} }
/* /*
* Is that owner really running on that cpu? * If the owner changed to another task there is likely
* heavy contention, stop spinning.
*/ */
if (task_thread_info(rq->curr) != owner || need_resched()) if (lock->owner)
return 0; return 0;
arch_mutex_cpu_relax();
}
return 1; return 1;
} }
#endif #endif
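owner_running() only dereferences the speculative owner pointer after re-checking lock->owner, with barrier() keeping the compiler from hoisting the ->on_cpu load above that check, and rcu_read_lock() keeping the task_struct from being freed in between. A stand-alone sketch of the check-then-dereference shape with C11 atomics (struct owner, struct lock_ish and spin_on_owner() are invented; this user-space version sidesteps the lifetime problem by never freeing the owner, so only the ordering part carries over):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct owner {
    atomic_int on_cpu;
};

struct lock_ish {
    _Atomic(struct owner *) owner;
};

static bool owner_still_running(struct lock_ish *lock, struct owner *owner)
{
    if (atomic_load(&lock->owner) != owner)
        return false;
    /* Compiler barrier, mirroring barrier() above: keep the on_cpu read
     * after the owner re-check. */
    atomic_signal_fence(memory_order_seq_cst);
    return atomic_load(&owner->on_cpu) != 0;
}

/* Spin while the snapshotted owner keeps running, as mutex_spin_on_owner()
 * does; 'need_resched_hint' stands in for need_resched(). */
static bool spin_on_owner(struct lock_ish *lock, struct owner *owner,
                          volatile int *need_resched_hint)
{
    while (owner_still_running(lock, owner)) {
        if (*need_resched_hint)
            return false;
    }
    /* Owner changed: a new owner means heavy contention, so give up;
     * a NULL owner means the lock was released, keep trying to take it. */
    return atomic_load(&lock->owner) == NULL;
}

int main(void)
{
    struct owner o = { .on_cpu = 0 };
    struct lock_ish lock = { .owner = &o };
    volatile int resched = 0;

    printf("keep spinning: %d\n", spin_on_owner(&lock, &o, &resched));
    return 0;
}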
...@@ -4684,19 +4760,18 @@ EXPORT_SYMBOL(sleep_on_timeout); ...@@ -4684,19 +4760,18 @@ EXPORT_SYMBOL(sleep_on_timeout);
*/ */
void rt_mutex_setprio(struct task_struct *p, int prio) void rt_mutex_setprio(struct task_struct *p, int prio)
{ {
unsigned long flags;
int oldprio, on_rq, running; int oldprio, on_rq, running;
struct rq *rq; struct rq *rq;
const struct sched_class *prev_class; const struct sched_class *prev_class;
BUG_ON(prio < 0 || prio > MAX_PRIO); BUG_ON(prio < 0 || prio > MAX_PRIO);
rq = task_rq_lock(p, &flags); rq = __task_rq_lock(p);
trace_sched_pi_setprio(p, prio); trace_sched_pi_setprio(p, prio);
oldprio = p->prio; oldprio = p->prio;
prev_class = p->sched_class; prev_class = p->sched_class;
on_rq = p->se.on_rq; on_rq = p->on_rq;
running = task_current(rq, p); running = task_current(rq, p);
if (on_rq) if (on_rq)
dequeue_task(rq, p, 0); dequeue_task(rq, p, 0);
...@@ -4716,7 +4791,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) ...@@ -4716,7 +4791,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
check_class_changed(rq, p, prev_class, oldprio); check_class_changed(rq, p, prev_class, oldprio);
task_rq_unlock(rq, &flags); __task_rq_unlock(rq);
} }
#endif #endif
...@@ -4744,7 +4819,7 @@ void set_user_nice(struct task_struct *p, long nice) ...@@ -4744,7 +4819,7 @@ void set_user_nice(struct task_struct *p, long nice)
p->static_prio = NICE_TO_PRIO(nice); p->static_prio = NICE_TO_PRIO(nice);
goto out_unlock; goto out_unlock;
} }
on_rq = p->se.on_rq; on_rq = p->on_rq;
if (on_rq) if (on_rq)
dequeue_task(rq, p, 0); dequeue_task(rq, p, 0);
...@@ -4764,7 +4839,7 @@ void set_user_nice(struct task_struct *p, long nice) ...@@ -4764,7 +4839,7 @@ void set_user_nice(struct task_struct *p, long nice)
resched_task(rq->curr); resched_task(rq->curr);
} }
out_unlock: out_unlock:
task_rq_unlock(rq, &flags); task_rq_unlock(rq, p, &flags);
} }
EXPORT_SYMBOL(set_user_nice); EXPORT_SYMBOL(set_user_nice);
...@@ -4878,8 +4953,6 @@ static struct task_struct *find_process_by_pid(pid_t pid) ...@@ -4878,8 +4953,6 @@ static struct task_struct *find_process_by_pid(pid_t pid)
static void static void
__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
{ {
BUG_ON(p->se.on_rq);
p->policy = policy; p->policy = policy;
p->rt_priority = prio; p->rt_priority = prio;
p->normal_prio = normal_prio(p); p->normal_prio = normal_prio(p);
...@@ -4994,20 +5067,17 @@ static int __sched_setscheduler(struct task_struct *p, int policy, ...@@ -4994,20 +5067,17 @@ static int __sched_setscheduler(struct task_struct *p, int policy,
/* /*
* make sure no PI-waiters arrive (or leave) while we are * make sure no PI-waiters arrive (or leave) while we are
* changing the priority of the task: * changing the priority of the task:
*/ *
raw_spin_lock_irqsave(&p->pi_lock, flags);
/*
* To be able to change p->policy safely, the appropriate * To be able to change p->policy safely, the appropriate
* runqueue lock must be held. * runqueue lock must be held.
*/ */
rq = __task_rq_lock(p); rq = task_rq_lock(p, &flags);
/* /*
* Changing the policy of the stop threads is a very bad idea * Changing the policy of the stop threads is a very bad idea
*/ */
if (p == rq->stop) { if (p == rq->stop) {
__task_rq_unlock(rq); task_rq_unlock(rq, p, &flags);
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
return -EINVAL; return -EINVAL;
} }
...@@ -5031,8 +5101,7 @@ static int __sched_setscheduler(struct task_struct *p, int policy, ...@@ -5031,8 +5101,7 @@ static int __sched_setscheduler(struct task_struct *p, int policy,
if (rt_bandwidth_enabled() && rt_policy(policy) && if (rt_bandwidth_enabled() && rt_policy(policy) &&
task_group(p)->rt_bandwidth.rt_runtime == 0 && task_group(p)->rt_bandwidth.rt_runtime == 0 &&
!task_group_is_autogroup(task_group(p))) { !task_group_is_autogroup(task_group(p))) {
__task_rq_unlock(rq); task_rq_unlock(rq, p, &flags);
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
return -EPERM; return -EPERM;
} }
} }
...@@ -5041,11 +5110,10 @@ static int __sched_setscheduler(struct task_struct *p, int policy, ...@@ -5041,11 +5110,10 @@ static int __sched_setscheduler(struct task_struct *p, int policy,
/* recheck policy now with rq lock held */ /* recheck policy now with rq lock held */
if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
policy = oldpolicy = -1; policy = oldpolicy = -1;
__task_rq_unlock(rq); task_rq_unlock(rq, p, &flags);
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
goto recheck; goto recheck;
} }
on_rq = p->se.on_rq; on_rq = p->on_rq;
running = task_current(rq, p); running = task_current(rq, p);
if (on_rq) if (on_rq)
deactivate_task(rq, p, 0); deactivate_task(rq, p, 0);
...@@ -5064,8 +5132,7 @@ static int __sched_setscheduler(struct task_struct *p, int policy, ...@@ -5064,8 +5132,7 @@ static int __sched_setscheduler(struct task_struct *p, int policy,
activate_task(rq, p, 0); activate_task(rq, p, 0);
check_class_changed(rq, p, prev_class, oldprio); check_class_changed(rq, p, prev_class, oldprio);
__task_rq_unlock(rq); task_rq_unlock(rq, p, &flags);
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
rt_mutex_adjust_pi(p); rt_mutex_adjust_pi(p);
...@@ -5316,7 +5383,6 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) ...@@ -5316,7 +5383,6 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
{ {
struct task_struct *p; struct task_struct *p;
unsigned long flags; unsigned long flags;
struct rq *rq;
int retval; int retval;
get_online_cpus(); get_online_cpus();
...@@ -5331,9 +5397,9 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) ...@@ -5331,9 +5397,9 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
if (retval) if (retval)
goto out_unlock; goto out_unlock;
rq = task_rq_lock(p, &flags); raw_spin_lock_irqsave(&p->pi_lock, flags);
cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
task_rq_unlock(rq, &flags); raw_spin_unlock_irqrestore(&p->pi_lock, flags);
out_unlock: out_unlock:
rcu_read_unlock(); rcu_read_unlock();
...@@ -5658,7 +5724,7 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, ...@@ -5658,7 +5724,7 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
rq = task_rq_lock(p, &flags); rq = task_rq_lock(p, &flags);
time_slice = p->sched_class->get_rr_interval(rq, p); time_slice = p->sched_class->get_rr_interval(rq, p);
task_rq_unlock(rq, &flags); task_rq_unlock(rq, p, &flags);
rcu_read_unlock(); rcu_read_unlock();
jiffies_to_timespec(time_slice, &t); jiffies_to_timespec(time_slice, &t);
...@@ -5776,17 +5842,14 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) ...@@ -5776,17 +5842,14 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
rcu_read_unlock(); rcu_read_unlock();
rq->curr = rq->idle = idle; rq->curr = rq->idle = idle;
#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) #if defined(CONFIG_SMP)
idle->oncpu = 1; idle->on_cpu = 1;
#endif #endif
raw_spin_unlock_irqrestore(&rq->lock, flags); raw_spin_unlock_irqrestore(&rq->lock, flags);
/* Set the preempt count _outside_ the spinlocks! */ /* Set the preempt count _outside_ the spinlocks! */
#if defined(CONFIG_PREEMPT)
task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
#else
task_thread_info(idle)->preempt_count = 0; task_thread_info(idle)->preempt_count = 0;
#endif
/* /*
* The idle tasks have their own, simple scheduling class: * The idle tasks have their own, simple scheduling class:
*/ */
...@@ -5881,26 +5944,17 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) ...@@ -5881,26 +5944,17 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
unsigned int dest_cpu; unsigned int dest_cpu;
int ret = 0; int ret = 0;
/*
* Serialize against TASK_WAKING so that ttwu() and wunt() can
* drop the rq->lock and still rely on ->cpus_allowed.
*/
again:
while (task_is_waking(p))
cpu_relax();
rq = task_rq_lock(p, &flags); rq = task_rq_lock(p, &flags);
if (task_is_waking(p)) {
task_rq_unlock(rq, &flags); if (cpumask_equal(&p->cpus_allowed, new_mask))
goto again; goto out;
}
if (!cpumask_intersects(new_mask, cpu_active_mask)) { if (!cpumask_intersects(new_mask, cpu_active_mask)) {
ret = -EINVAL; ret = -EINVAL;
goto out; goto out;
} }
if (unlikely((p->flags & PF_THREAD_BOUND) && p != current && if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) {
!cpumask_equal(&p->cpus_allowed, new_mask))) {
ret = -EINVAL; ret = -EINVAL;
goto out; goto out;
} }
...@@ -5917,16 +5971,16 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) ...@@ -5917,16 +5971,16 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
goto out; goto out;
dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
if (migrate_task(p, rq)) { if (p->on_rq) {
struct migration_arg arg = { p, dest_cpu }; struct migration_arg arg = { p, dest_cpu };
/* Need help from migration thread: drop lock and wait. */ /* Need help from migration thread: drop lock and wait. */
task_rq_unlock(rq, &flags); task_rq_unlock(rq, p, &flags);
stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
tlb_migrate_finish(p->mm); tlb_migrate_finish(p->mm);
return 0; return 0;
} }
out: out:
task_rq_unlock(rq, &flags); task_rq_unlock(rq, p, &flags);
return ret; return ret;
} }
...@@ -5954,6 +6008,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) ...@@ -5954,6 +6008,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
rq_src = cpu_rq(src_cpu); rq_src = cpu_rq(src_cpu);
rq_dest = cpu_rq(dest_cpu); rq_dest = cpu_rq(dest_cpu);
raw_spin_lock(&p->pi_lock);
double_rq_lock(rq_src, rq_dest); double_rq_lock(rq_src, rq_dest);
/* Already moved. */ /* Already moved. */
if (task_cpu(p) != src_cpu) if (task_cpu(p) != src_cpu)
...@@ -5966,7 +6021,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) ...@@ -5966,7 +6021,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
* If we're not on a rq, the next wake-up will ensure we're * If we're not on a rq, the next wake-up will ensure we're
* placed properly. * placed properly.
*/ */
if (p->se.on_rq) { if (p->on_rq) {
deactivate_task(rq_src, p, 0); deactivate_task(rq_src, p, 0);
set_task_cpu(p, dest_cpu); set_task_cpu(p, dest_cpu);
activate_task(rq_dest, p, 0); activate_task(rq_dest, p, 0);
...@@ -5976,6 +6031,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) ...@@ -5976,6 +6031,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
ret = 1; ret = 1;
fail: fail:
double_rq_unlock(rq_src, rq_dest); double_rq_unlock(rq_src, rq_dest);
raw_spin_unlock(&p->pi_lock);
return ret; return ret;
} }
...@@ -6316,6 +6372,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) ...@@ -6316,6 +6372,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
#ifdef CONFIG_HOTPLUG_CPU #ifdef CONFIG_HOTPLUG_CPU
case CPU_DYING: case CPU_DYING:
sched_ttwu_pending();
/* Update our root-domain */ /* Update our root-domain */
raw_spin_lock_irqsave(&rq->lock, flags); raw_spin_lock_irqsave(&rq->lock, flags);
if (rq->rd) { if (rq->rd) {
...@@ -6394,6 +6451,8 @@ early_initcall(migration_init); ...@@ -6394,6 +6451,8 @@ early_initcall(migration_init);
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
#ifdef CONFIG_SCHED_DEBUG #ifdef CONFIG_SCHED_DEBUG
static __read_mostly int sched_domain_debug_enabled; static __read_mostly int sched_domain_debug_enabled;
...@@ -6489,7 +6548,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, ...@@ -6489,7 +6548,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
static void sched_domain_debug(struct sched_domain *sd, int cpu) static void sched_domain_debug(struct sched_domain *sd, int cpu)
{ {
cpumask_var_t groupmask;
int level = 0; int level = 0;
if (!sched_domain_debug_enabled) if (!sched_domain_debug_enabled)
...@@ -6502,20 +6560,14 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) ...@@ -6502,20 +6560,14 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) {
printk(KERN_DEBUG "Cannot load-balance (out of memory)\n");
return;
}
for (;;) { for (;;) {
if (sched_domain_debug_one(sd, cpu, level, groupmask)) if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
break; break;
level++; level++;
sd = sd->parent; sd = sd->parent;
if (!sd) if (!sd)
break; break;
} }
free_cpumask_var(groupmask);
} }
#else /* !CONFIG_SCHED_DEBUG */ #else /* !CONFIG_SCHED_DEBUG */
# define sched_domain_debug(sd, cpu) do { } while (0) # define sched_domain_debug(sd, cpu) do { } while (0)
...@@ -6572,12 +6624,11 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) ...@@ -6572,12 +6624,11 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
return 1; return 1;
} }
static void free_rootdomain(struct root_domain *rd) static void free_rootdomain(struct rcu_head *rcu)
{ {
synchronize_sched(); struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
cpupri_cleanup(&rd->cpupri); cpupri_cleanup(&rd->cpupri);
free_cpumask_var(rd->rto_mask); free_cpumask_var(rd->rto_mask);
free_cpumask_var(rd->online); free_cpumask_var(rd->online);
free_cpumask_var(rd->span); free_cpumask_var(rd->span);
...@@ -6618,7 +6669,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) ...@@ -6618,7 +6669,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
raw_spin_unlock_irqrestore(&rq->lock, flags); raw_spin_unlock_irqrestore(&rq->lock, flags);
if (old_rd) if (old_rd)
free_rootdomain(old_rd); call_rcu_sched(&old_rd->rcu, free_rootdomain);
} }
static int init_rootdomain(struct root_domain *rd) static int init_rootdomain(struct root_domain *rd)
...@@ -6669,6 +6720,25 @@ static struct root_domain *alloc_rootdomain(void) ...@@ -6669,6 +6720,25 @@ static struct root_domain *alloc_rootdomain(void)
return rd; return rd;
} }
static void free_sched_domain(struct rcu_head *rcu)
{
struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
if (atomic_dec_and_test(&sd->groups->ref))
kfree(sd->groups);
kfree(sd);
}
static void destroy_sched_domain(struct sched_domain *sd, int cpu)
{
call_rcu(&sd->rcu, free_sched_domain);
}
static void destroy_sched_domains(struct sched_domain *sd, int cpu)
{
for (; sd; sd = sd->parent)
destroy_sched_domain(sd, cpu);
}
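free_rootdomain() and free_sched_domain() now take the rcu_head embedded in the object, recover the enclosing structure with container_of(), and are queued via call_rcu()/call_rcu_sched(), so the kfree() is deferred past a grace period instead of blocking in synchronize_sched(). A plain-C illustration of the embed-the-callback-head idiom (struct cb_head and root_domain_ish are stand-ins; the grace-period machinery itself is elided):

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

#define container_of(ptr, type, member) \
    ((type *)((char *)(ptr) - offsetof(type, member)))

struct cb_head {                        /* stands in for struct rcu_head */
    void (*func)(struct cb_head *);
};

struct root_domain_ish {
    int refcount;
    struct cb_head rcu;                 /* embedded, like rd->rcu / sd->rcu */
};

static void free_rootdomain_ish(struct cb_head *h)
{
    struct root_domain_ish *rd = container_of(h, struct root_domain_ish, rcu);

    printf("freeing rd, refcount was %d\n", rd->refcount);
    free(rd);
}

int main(void)
{
    struct root_domain_ish *rd = calloc(1, sizeof(*rd));

    if (!rd)
        return 1;
    /* call_rcu_sched(&rd->rcu, free_rootdomain) would queue this callback
     * and invoke it only after a grace period; here it runs immediately. */
    rd->rcu.func = free_rootdomain_ish;
    rd->rcu.func(&rd->rcu);
    return 0;
}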
/* /*
* Attach the domain 'sd' to 'cpu' as its base domain. Callers must * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
* hold the hotplug lock. * hold the hotplug lock.
...@@ -6679,9 +6749,6 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) ...@@ -6679,9 +6749,6 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
struct rq *rq = cpu_rq(cpu); struct rq *rq = cpu_rq(cpu);
struct sched_domain *tmp; struct sched_domain *tmp;
for (tmp = sd; tmp; tmp = tmp->parent)
tmp->span_weight = cpumask_weight(sched_domain_span(tmp));
/* Remove the sched domains which do not contribute to scheduling. */ /* Remove the sched domains which do not contribute to scheduling. */
for (tmp = sd; tmp; ) { for (tmp = sd; tmp; ) {
struct sched_domain *parent = tmp->parent; struct sched_domain *parent = tmp->parent;
...@@ -6692,12 +6759,15 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) ...@@ -6692,12 +6759,15 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
tmp->parent = parent->parent; tmp->parent = parent->parent;
if (parent->parent) if (parent->parent)
parent->parent->child = tmp; parent->parent->child = tmp;
destroy_sched_domain(parent, cpu);
} else } else
tmp = tmp->parent; tmp = tmp->parent;
} }
if (sd && sd_degenerate(sd)) { if (sd && sd_degenerate(sd)) {
tmp = sd;
sd = sd->parent; sd = sd->parent;
destroy_sched_domain(tmp, cpu);
if (sd) if (sd)
sd->child = NULL; sd->child = NULL;
} }
...@@ -6705,7 +6775,9 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) ...@@ -6705,7 +6775,9 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
sched_domain_debug(sd, cpu); sched_domain_debug(sd, cpu);
rq_attach_root(rq, rd); rq_attach_root(rq, rd);
tmp = rq->sd;
rcu_assign_pointer(rq->sd, sd); rcu_assign_pointer(rq->sd, sd);
destroy_sched_domains(tmp, cpu);
} }
/* cpus with isolated domains */ /* cpus with isolated domains */
...@@ -6721,56 +6793,6 @@ static int __init isolated_cpu_setup(char *str) ...@@ -6721,56 +6793,6 @@ static int __init isolated_cpu_setup(char *str)
__setup("isolcpus=", isolated_cpu_setup); __setup("isolcpus=", isolated_cpu_setup);
/*
* init_sched_build_groups takes the cpumask we wish to span, and a pointer
* to a function which identifies what group (along with sched group) a CPU
* belongs to. The return value of group_fn must be >= 0 and < nr_cpu_ids
* (because we keep track of groups covered with a struct cpumask).
*
* init_sched_build_groups will build a circular linked list of the groups
* covered by the given span, and will set each group's ->cpumask correctly,
* and ->cpu_power to 0.
*/
static void
init_sched_build_groups(const struct cpumask *span,
const struct cpumask *cpu_map,
int (*group_fn)(int cpu, const struct cpumask *cpu_map,
struct sched_group **sg,
struct cpumask *tmpmask),
struct cpumask *covered, struct cpumask *tmpmask)
{
struct sched_group *first = NULL, *last = NULL;
int i;
cpumask_clear(covered);
for_each_cpu(i, span) {
struct sched_group *sg;
int group = group_fn(i, cpu_map, &sg, tmpmask);
int j;
if (cpumask_test_cpu(i, covered))
continue;
cpumask_clear(sched_group_cpus(sg));
sg->cpu_power = 0;
for_each_cpu(j, span) {
if (group_fn(j, cpu_map, NULL, tmpmask) != group)
continue;
cpumask_set_cpu(j, covered);
cpumask_set_cpu(j, sched_group_cpus(sg));
}
if (!first)
first = sg;
if (last)
last->next = sg;
last = sg;
}
last->next = first;
}
#define SD_NODES_PER_DOMAIN 16 #define SD_NODES_PER_DOMAIN 16
#ifdef CONFIG_NUMA #ifdef CONFIG_NUMA
...@@ -6787,7 +6809,7 @@ init_sched_build_groups(const struct cpumask *span, ...@@ -6787,7 +6809,7 @@ init_sched_build_groups(const struct cpumask *span,
*/ */
static int find_next_best_node(int node, nodemask_t *used_nodes) static int find_next_best_node(int node, nodemask_t *used_nodes)
{ {
int i, n, val, min_val, best_node = 0; int i, n, val, min_val, best_node = -1;
min_val = INT_MAX; min_val = INT_MAX;
...@@ -6811,6 +6833,7 @@ static int find_next_best_node(int node, nodemask_t *used_nodes) ...@@ -6811,6 +6833,7 @@ static int find_next_best_node(int node, nodemask_t *used_nodes)
} }
} }
if (best_node != -1)
node_set(best_node, *used_nodes); node_set(best_node, *used_nodes);
return best_node; return best_node;
} }
...@@ -6837,315 +6860,130 @@ static void sched_domain_node_span(int node, struct cpumask *span) ...@@ -6837,315 +6860,130 @@ static void sched_domain_node_span(int node, struct cpumask *span)
for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
int next_node = find_next_best_node(node, &used_nodes); int next_node = find_next_best_node(node, &used_nodes);
if (next_node < 0)
break;
cpumask_or(span, span, cpumask_of_node(next_node)); cpumask_or(span, span, cpumask_of_node(next_node));
} }
} }
static const struct cpumask *cpu_node_mask(int cpu)
{
lockdep_assert_held(&sched_domains_mutex);
sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask);
return sched_domains_tmpmask;
}
static const struct cpumask *cpu_allnodes_mask(int cpu)
{
return cpu_possible_mask;
}
#endif /* CONFIG_NUMA */ #endif /* CONFIG_NUMA */
int sched_smt_power_savings = 0, sched_mc_power_savings = 0; static const struct cpumask *cpu_cpu_mask(int cpu)
{
return cpumask_of_node(cpu_to_node(cpu));
}
/* int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
* The cpus mask in sched_group and sched_domain hangs off the end.
*
* ( See the comments in include/linux/sched.h:struct sched_group
* and struct sched_domain. )
*/
struct static_sched_group {
struct sched_group sg;
DECLARE_BITMAP(cpus, CONFIG_NR_CPUS);
};
struct static_sched_domain { struct sd_data {
struct sched_domain sd; struct sched_domain **__percpu sd;
DECLARE_BITMAP(span, CONFIG_NR_CPUS); struct sched_group **__percpu sg;
}; };
struct s_data { struct s_data {
#ifdef CONFIG_NUMA struct sched_domain ** __percpu sd;
int sd_allnodes;
cpumask_var_t domainspan;
cpumask_var_t covered;
cpumask_var_t notcovered;
#endif
cpumask_var_t nodemask;
cpumask_var_t this_sibling_map;
cpumask_var_t this_core_map;
cpumask_var_t this_book_map;
cpumask_var_t send_covered;
cpumask_var_t tmpmask;
struct sched_group **sched_group_nodes;
struct root_domain *rd; struct root_domain *rd;
}; };
enum s_alloc { enum s_alloc {
sa_sched_groups = 0,
sa_rootdomain, sa_rootdomain,
sa_tmpmask, sa_sd,
sa_send_covered, sa_sd_storage,
sa_this_book_map,
sa_this_core_map,
sa_this_sibling_map,
sa_nodemask,
sa_sched_group_nodes,
#ifdef CONFIG_NUMA
sa_notcovered,
sa_covered,
sa_domainspan,
#endif
sa_none, sa_none,
}; };
/* struct sched_domain_topology_level;
* SMT sched-domains:
*/
#ifdef CONFIG_SCHED_SMT
static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains);
static DEFINE_PER_CPU(struct static_sched_group, sched_groups);
static int
cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map,
struct sched_group **sg, struct cpumask *unused)
{
if (sg)
*sg = &per_cpu(sched_groups, cpu).sg;
return cpu;
}
#endif /* CONFIG_SCHED_SMT */
/* typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
* multi-core sched-domains: typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
*/
#ifdef CONFIG_SCHED_MC
static DEFINE_PER_CPU(struct static_sched_domain, core_domains);
static DEFINE_PER_CPU(struct static_sched_group, sched_group_core);
static int struct sched_domain_topology_level {
cpu_to_core_group(int cpu, const struct cpumask *cpu_map, sched_domain_init_f init;
struct sched_group **sg, struct cpumask *mask) sched_domain_mask_f mask;
{ struct sd_data data;
int group; };
#ifdef CONFIG_SCHED_SMT
cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
group = cpumask_first(mask);
#else
group = cpu;
#endif
if (sg)
*sg = &per_cpu(sched_group_core, group).sg;
return group;
}
#endif /* CONFIG_SCHED_MC */
/* /*
* book sched-domains: * Assumes the sched_domain tree is fully constructed
*/ */
#ifdef CONFIG_SCHED_BOOK static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
static DEFINE_PER_CPU(struct static_sched_domain, book_domains);
static DEFINE_PER_CPU(struct static_sched_group, sched_group_book);
static int
cpu_to_book_group(int cpu, const struct cpumask *cpu_map,
struct sched_group **sg, struct cpumask *mask)
{ {
int group = cpu; struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
#ifdef CONFIG_SCHED_MC struct sched_domain *child = sd->child;
cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
group = cpumask_first(mask);
#elif defined(CONFIG_SCHED_SMT)
cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
group = cpumask_first(mask);
#endif
if (sg)
*sg = &per_cpu(sched_group_book, group).sg;
return group;
}
#endif /* CONFIG_SCHED_BOOK */
static DEFINE_PER_CPU(struct static_sched_domain, phys_domains); if (child)
static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys); cpu = cpumask_first(sched_domain_span(child));
static int
cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
struct sched_group **sg, struct cpumask *mask)
{
int group;
#ifdef CONFIG_SCHED_BOOK
cpumask_and(mask, cpu_book_mask(cpu), cpu_map);
group = cpumask_first(mask);
#elif defined(CONFIG_SCHED_MC)
cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
group = cpumask_first(mask);
#elif defined(CONFIG_SCHED_SMT)
cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
group = cpumask_first(mask);
#else
group = cpu;
#endif
if (sg) if (sg)
*sg = &per_cpu(sched_group_phys, group).sg; *sg = *per_cpu_ptr(sdd->sg, cpu);
return group;
return cpu;
} }
#ifdef CONFIG_NUMA
/* /*
* The init_sched_build_groups can't handle what we want to do with node * build_sched_groups takes the cpumask we wish to span, and a pointer
* groups, so roll our own. Now each node has its own list of groups which * to a function which identifies what group (along with sched group) a CPU
* gets dynamically allocated. * belongs to. The return value of group_fn must be >= 0 and < nr_cpu_ids
* (because we keep track of groups covered with a struct cpumask).
*
* build_sched_groups will build a circular linked list of the groups
* covered by the given span, and will set each group's ->cpumask correctly,
* and ->cpu_power to 0.
*/ */
static DEFINE_PER_CPU(struct static_sched_domain, node_domains); static void
static struct sched_group ***sched_group_nodes_bycpu; build_sched_groups(struct sched_domain *sd)
static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains);
static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes);
static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map,
struct sched_group **sg,
struct cpumask *nodemask)
{ {
int group; struct sched_group *first = NULL, *last = NULL;
struct sd_data *sdd = sd->private;
const struct cpumask *span = sched_domain_span(sd);
struct cpumask *covered;
int i;
cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map); lockdep_assert_held(&sched_domains_mutex);
group = cpumask_first(nodemask); covered = sched_domains_tmpmask;
if (sg) cpumask_clear(covered);
*sg = &per_cpu(sched_group_allnodes, group).sg;
return group;
}
static void init_numa_sched_groups_power(struct sched_group *group_head) for_each_cpu(i, span) {
{ struct sched_group *sg;
struct sched_group *sg = group_head; int group = get_group(i, sdd, &sg);
int j; int j;
if (!sg) if (cpumask_test_cpu(i, covered))
return;
do {
for_each_cpu(j, sched_group_cpus(sg)) {
struct sched_domain *sd;
sd = &per_cpu(phys_domains, j).sd;
if (j != group_first_cpu(sd->groups)) {
/*
* Only add "power" once for each
* physical package.
*/
continue; continue;
}
sg->cpu_power += sd->groups->cpu_power;
}
sg = sg->next;
} while (sg != group_head);
}
static int build_numa_sched_groups(struct s_data *d,
const struct cpumask *cpu_map, int num)
{
struct sched_domain *sd;
struct sched_group *sg, *prev;
int n, j;
cpumask_clear(d->covered);
cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map);
if (cpumask_empty(d->nodemask)) {
d->sched_group_nodes[num] = NULL;
goto out;
}
sched_domain_node_span(num, d->domainspan);
cpumask_and(d->domainspan, d->domainspan, cpu_map);
sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
GFP_KERNEL, num);
if (!sg) {
printk(KERN_WARNING "Can not alloc domain group for node %d\n",
num);
return -ENOMEM;
}
d->sched_group_nodes[num] = sg;
for_each_cpu(j, d->nodemask) {
sd = &per_cpu(node_domains, j).sd;
sd->groups = sg;
}
cpumask_clear(sched_group_cpus(sg));
sg->cpu_power = 0; sg->cpu_power = 0;
cpumask_copy(sched_group_cpus(sg), d->nodemask);
sg->next = sg;
cpumask_or(d->covered, d->covered, d->nodemask);
prev = sg;
for (j = 0; j < nr_node_ids; j++) {
n = (num + j) % nr_node_ids;
cpumask_complement(d->notcovered, d->covered);
cpumask_and(d->tmpmask, d->notcovered, cpu_map);
cpumask_and(d->tmpmask, d->tmpmask, d->domainspan);
if (cpumask_empty(d->tmpmask))
break;
cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n));
if (cpumask_empty(d->tmpmask))
continue;
sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
GFP_KERNEL, num);
if (!sg) {
printk(KERN_WARNING
"Can not alloc domain group for node %d\n", j);
return -ENOMEM;
}
sg->cpu_power = 0;
cpumask_copy(sched_group_cpus(sg), d->tmpmask);
sg->next = prev->next;
cpumask_or(d->covered, d->covered, d->tmpmask);
prev->next = sg;
prev = sg;
}
out:
return 0;
}
#endif /* CONFIG_NUMA */
#ifdef CONFIG_NUMA
/* Free memory allocated for various sched_group structures */
static void free_sched_groups(const struct cpumask *cpu_map,
struct cpumask *nodemask)
{
int cpu, i;
for_each_cpu(cpu, cpu_map) {
struct sched_group **sched_group_nodes
= sched_group_nodes_bycpu[cpu];
if (!sched_group_nodes)
continue;
for (i = 0; i < nr_node_ids; i++) { for_each_cpu(j, span) {
struct sched_group *oldsg, *sg = sched_group_nodes[i]; if (get_group(j, sdd, NULL) != group)
cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
if (cpumask_empty(nodemask))
continue; continue;
if (sg == NULL) cpumask_set_cpu(j, covered);
continue; cpumask_set_cpu(j, sched_group_cpus(sg));
sg = sg->next;
next_sg:
oldsg = sg;
sg = sg->next;
kfree(oldsg);
if (oldsg != sched_group_nodes[i])
goto next_sg;
} }
kfree(sched_group_nodes);
sched_group_nodes_bycpu[cpu] = NULL; if (!first)
first = sg;
if (last)
last->next = sg;
last = sg;
} }
last->next = first;
} }
#else /* !CONFIG_NUMA */
static void free_sched_groups(const struct cpumask *cpu_map,
struct cpumask *nodemask)
{
}
#endif /* CONFIG_NUMA */
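build_sched_groups() above covers the domain span once: each CPU is mapped to a representative group via get_group(), every uncovered CPU starts a new group that absorbs all CPUs with the same representative, and the groups are finally linked into a ring with last->next = first. A stand-alone sketch of that covering/ring-building step over plain arrays (NCPUS, group_id() and struct group are invented; the real code operates on cpumasks and struct sched_group):

#include <stdio.h>

#define NCPUS 8

struct group {
    struct group *next;
    int members[NCPUS];
    int nr;
};

/* Partition function: which "group id" a cpu belongs to (e.g. its core). */
static int group_id(int cpu)
{
    return cpu / 2;                 /* pretend 2 SMT siblings per core */
}

static struct group groups[NCPUS];

static struct group *build_groups(void)
{
    struct group *first = NULL, *last = NULL;
    int covered[NCPUS] = { 0 };

    for (int cpu = 0; cpu < NCPUS; cpu++) {
        struct group *g = &groups[group_id(cpu)];

        if (covered[cpu])
            continue;

        /* Pull every CPU with the same representative into this group. */
        for (int j = 0; j < NCPUS; j++) {
            if (group_id(j) != group_id(cpu))
                continue;
            covered[j] = 1;
            g->members[g->nr++] = j;
        }

        if (!first)
            first = g;
        if (last)
            last->next = g;
        last = g;
    }
    last->next = first;             /* close the ring, as the kernel does */
    return first;
}

int main(void)
{
    struct group *g = build_groups(), *start = g;

    do {
        printf("group:");
        for (int i = 0; i < g->nr; i++)
            printf(" %d", g->members[i]);
        printf("\n");
        g = g->next;
    } while (g != start);
    return 0;
}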
/* /*
* Initialize sched groups cpu_power. * Initialize sched groups cpu_power.
...@@ -7159,11 +6997,6 @@ static void free_sched_groups(const struct cpumask *cpu_map, ...@@ -7159,11 +6997,6 @@ static void free_sched_groups(const struct cpumask *cpu_map,
*/ */
static void init_sched_groups_power(int cpu, struct sched_domain *sd) static void init_sched_groups_power(int cpu, struct sched_domain *sd)
{ {
struct sched_domain *child;
struct sched_group *group;
long power;
int weight;
WARN_ON(!sd || !sd->groups); WARN_ON(!sd || !sd->groups);
if (cpu != group_first_cpu(sd->groups)) if (cpu != group_first_cpu(sd->groups))
...@@ -7171,36 +7004,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) ...@@ -7171,36 +7004,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups)); sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups));
child = sd->child; update_group_power(sd, cpu);
sd->groups->cpu_power = 0;
if (!child) {
power = SCHED_LOAD_SCALE;
weight = cpumask_weight(sched_domain_span(sd));
/*
* SMT siblings share the power of a single core.
* Usually multiple threads get a better yield out of
* that one core than a single thread would have,
* reflect that in sd->smt_gain.
*/
if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
power *= sd->smt_gain;
power /= weight;
power >>= SCHED_LOAD_SHIFT;
}
sd->groups->cpu_power += power;
return;
}
/*
* Add cpu_power of each child group to this groups cpu_power.
*/
group = child->groups;
do {
sd->groups->cpu_power += group->cpu_power;
group = group->next;
} while (group != child->groups);
} }
/* /*
...@@ -7214,15 +7018,15 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) ...@@ -7214,15 +7018,15 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
# define SD_INIT_NAME(sd, type) do { } while (0) # define SD_INIT_NAME(sd, type) do { } while (0)
#endif #endif
#define SD_INIT(sd, type) sd_init_##type(sd)
#define SD_INIT_FUNC(type) \ #define SD_INIT_FUNC(type) \
static noinline void sd_init_##type(struct sched_domain *sd) \ static noinline struct sched_domain * \
sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \
{ \ { \
memset(sd, 0, sizeof(*sd)); \ struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \
*sd = SD_##type##_INIT; \ *sd = SD_##type##_INIT; \
sd->level = SD_LV_##type; \
SD_INIT_NAME(sd, type); \ SD_INIT_NAME(sd, type); \
sd->private = &tl->data; \
return sd; \
} }
SD_INIT_FUNC(CPU) SD_INIT_FUNC(CPU)
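SD_INIT_FUNC() now stamps out one sd_init_<type>() per topology level: each generated function fetches the pre-allocated per-cpu sched_domain from tl->data, fills it from the SD_<type>_INIT template, and records the level's sd_data in sd->private. A tiny stand-alone example of the same macro-generated-initializer technique (the DOM_* names are invented and the tl->data wiring is omitted):

#include <stdio.h>

struct domain {
    const char *name;
    int flags;
};

#define DOM_CPU_INIT   (struct domain){ .flags = 0x1 }
#define DOM_NODE_INIT  (struct domain){ .flags = 0x2 }

/* One macro generates the whole family of init functions, as SD_INIT_FUNC()
 * does for sd_init_SIBLING/MC/BOOK/CPU/NODE/ALLNODES. */
#define DOM_INIT_FUNC(type)                                 \
static struct domain *dom_init_##type(struct domain *d)    \
{                                                           \
    *d = DOM_##type##_INIT;                                 \
    d->name = #type;                                        \
    return d;                                               \
}

DOM_INIT_FUNC(CPU)
DOM_INIT_FUNC(NODE)

int main(void)
{
    struct domain d;

    dom_init_CPU(&d);
    printf("%s flags=%#x\n", d.name, d.flags);
    dom_init_NODE(&d);
    printf("%s flags=%#x\n", d.name, d.flags);
    return 0;
}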
...@@ -7241,13 +7045,14 @@ SD_INIT_FUNC(CPU) ...@@ -7241,13 +7045,14 @@ SD_INIT_FUNC(CPU)
#endif #endif
static int default_relax_domain_level = -1; static int default_relax_domain_level = -1;
int sched_domain_level_max;
static int __init setup_relax_domain_level(char *str) static int __init setup_relax_domain_level(char *str)
{ {
unsigned long val; unsigned long val;
val = simple_strtoul(str, NULL, 0); val = simple_strtoul(str, NULL, 0);
if (val < SD_LV_MAX) if (val < sched_domain_level_max)
default_relax_domain_level = val; default_relax_domain_level = val;
return 1; return 1;
...@@ -7275,37 +7080,20 @@ static void set_domain_attribute(struct sched_domain *sd, ...@@ -7275,37 +7080,20 @@ static void set_domain_attribute(struct sched_domain *sd,
} }
} }
static void __sdt_free(const struct cpumask *cpu_map);
static int __sdt_alloc(const struct cpumask *cpu_map);
static void __free_domain_allocs(struct s_data *d, enum s_alloc what, static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
const struct cpumask *cpu_map) const struct cpumask *cpu_map)
{ {
switch (what) { switch (what) {
case sa_sched_groups:
free_sched_groups(cpu_map, d->tmpmask); /* fall through */
d->sched_group_nodes = NULL;
case sa_rootdomain: case sa_rootdomain:
free_rootdomain(d->rd); /* fall through */ if (!atomic_read(&d->rd->refcount))
case sa_tmpmask: free_rootdomain(&d->rd->rcu); /* fall through */
free_cpumask_var(d->tmpmask); /* fall through */ case sa_sd:
case sa_send_covered: free_percpu(d->sd); /* fall through */
free_cpumask_var(d->send_covered); /* fall through */ case sa_sd_storage:
case sa_this_book_map: __sdt_free(cpu_map); /* fall through */
free_cpumask_var(d->this_book_map); /* fall through */
case sa_this_core_map:
free_cpumask_var(d->this_core_map); /* fall through */
case sa_this_sibling_map:
free_cpumask_var(d->this_sibling_map); /* fall through */
case sa_nodemask:
free_cpumask_var(d->nodemask); /* fall through */
case sa_sched_group_nodes:
#ifdef CONFIG_NUMA
kfree(d->sched_group_nodes); /* fall through */
case sa_notcovered:
free_cpumask_var(d->notcovered); /* fall through */
case sa_covered:
free_cpumask_var(d->covered); /* fall through */
case sa_domainspan:
free_cpumask_var(d->domainspan); /* fall through */
#endif
case sa_none: case sa_none:
break; break;
} }
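__visit_domain_allocation_hell() returns how far allocation got (sa_rootdomain, sa_sd, sa_sd_storage or sa_none) and __free_domain_allocs() unwinds from exactly that point by falling through the switch cases, so any prefix of the allocations is released exactly once. A stand-alone sketch of that staged-unwind pattern (enum stage, setup() and unwind() are invented):

#include <stdlib.h>

enum stage { got_all, got_b, got_a, got_none };

struct ctx {
    void *a, *b, *c;
};

static void unwind(struct ctx *ctx, enum stage s)
{
    switch (s) {
    case got_all:
        free(ctx->c);   /* fall through */
    case got_b:
        free(ctx->b);   /* fall through */
    case got_a:
        free(ctx->a);   /* fall through */
    case got_none:
        break;
    }
}

/* Returns the last stage that succeeded, like the sa_* return values. */
static enum stage setup(struct ctx *ctx)
{
    if (!(ctx->a = malloc(16)))
        return got_none;
    if (!(ctx->b = malloc(16)))
        return got_a;
    if (!(ctx->c = malloc(16)))
        return got_b;
    return got_all;
}

int main(void)
{
    struct ctx ctx = { 0 };
    enum stage s = setup(&ctx);

    /* On full success the caller would go on to use ctx; here we just
     * unwind whatever was allocated, mirroring the error path. */
    unwind(&ctx, s);
    return 0;
}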
...@@ -7314,308 +7102,212 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what, ...@@ -7314,308 +7102,212 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
const struct cpumask *cpu_map) const struct cpumask *cpu_map)
{ {
#ifdef CONFIG_NUMA memset(d, 0, sizeof(*d));
if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL))
return sa_none; if (__sdt_alloc(cpu_map))
if (!alloc_cpumask_var(&d->covered, GFP_KERNEL)) return sa_sd_storage;
return sa_domainspan; d->sd = alloc_percpu(struct sched_domain *);
if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL)) if (!d->sd)
return sa_covered; return sa_sd_storage;
/* Allocate the per-node list of sched groups */
d->sched_group_nodes = kcalloc(nr_node_ids,
sizeof(struct sched_group *), GFP_KERNEL);
if (!d->sched_group_nodes) {
printk(KERN_WARNING "Can not alloc sched group node list\n");
return sa_notcovered;
}
sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes;
#endif
if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL))
return sa_sched_group_nodes;
if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL))
return sa_nodemask;
if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL))
return sa_this_sibling_map;
if (!alloc_cpumask_var(&d->this_book_map, GFP_KERNEL))
return sa_this_core_map;
if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
return sa_this_book_map;
if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
return sa_send_covered;
d->rd = alloc_rootdomain(); d->rd = alloc_rootdomain();
if (!d->rd) { if (!d->rd)
printk(KERN_WARNING "Cannot alloc root domain\n"); return sa_sd;
return sa_tmpmask;
}
return sa_rootdomain; return sa_rootdomain;
} }
static struct sched_domain *__build_numa_sched_domains(struct s_data *d, /*
const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i) * NULL the sd_data elements we've used to build the sched_domain and
* sched_group structure so that the subsequent __free_domain_allocs()
* will not free the data we're using.
*/
static void claim_allocations(int cpu, struct sched_domain *sd)
{ {
struct sched_domain *sd = NULL; struct sd_data *sdd = sd->private;
#ifdef CONFIG_NUMA struct sched_group *sg = sd->groups;
struct sched_domain *parent;
d->sd_allnodes = 0; WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
if (cpumask_weight(cpu_map) > *per_cpu_ptr(sdd->sd, cpu) = NULL;
SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) {
sd = &per_cpu(allnodes_domains, i).sd;
SD_INIT(sd, ALLNODES);
set_domain_attribute(sd, attr);
cpumask_copy(sched_domain_span(sd), cpu_map);
cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask);
d->sd_allnodes = 1;
}
parent = sd;
sd = &per_cpu(node_domains, i).sd; if (cpu == cpumask_first(sched_group_cpus(sg))) {
SD_INIT(sd, NODE); WARN_ON_ONCE(*per_cpu_ptr(sdd->sg, cpu) != sg);
set_domain_attribute(sd, attr); *per_cpu_ptr(sdd->sg, cpu) = NULL;
sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd)); }
sd->parent = parent;
if (parent)
parent->child = sd;
cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map);
#endif
return sd;
} }
static struct sched_domain *__build_cpu_sched_domain(struct s_data *d, #ifdef CONFIG_SCHED_SMT
const struct cpumask *cpu_map, struct sched_domain_attr *attr, static const struct cpumask *cpu_smt_mask(int cpu)
struct sched_domain *parent, int i)
{ {
struct sched_domain *sd; return topology_thread_cpumask(cpu);
sd = &per_cpu(phys_domains, i).sd;
SD_INIT(sd, CPU);
set_domain_attribute(sd, attr);
cpumask_copy(sched_domain_span(sd), d->nodemask);
sd->parent = parent;
if (parent)
parent->child = sd;
cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask);
return sd;
} }
#endif
static struct sched_domain *__build_book_sched_domain(struct s_data *d, /*
const struct cpumask *cpu_map, struct sched_domain_attr *attr, * Topology list, bottom-up.
struct sched_domain *parent, int i) */
{ static struct sched_domain_topology_level default_topology[] = {
struct sched_domain *sd = parent; #ifdef CONFIG_SCHED_SMT
{ sd_init_SIBLING, cpu_smt_mask, },
#endif
#ifdef CONFIG_SCHED_MC
{ sd_init_MC, cpu_coregroup_mask, },
#endif
#ifdef CONFIG_SCHED_BOOK #ifdef CONFIG_SCHED_BOOK
sd = &per_cpu(book_domains, i).sd; { sd_init_BOOK, cpu_book_mask, },
SD_INIT(sd, BOOK);
set_domain_attribute(sd, attr);
cpumask_and(sched_domain_span(sd), cpu_map, cpu_book_mask(i));
sd->parent = parent;
parent->child = sd;
cpu_to_book_group(i, cpu_map, &sd->groups, d->tmpmask);
#endif #endif
return sd; { sd_init_CPU, cpu_cpu_mask, },
} #ifdef CONFIG_NUMA
{ sd_init_NODE, cpu_node_mask, },
{ sd_init_ALLNODES, cpu_allnodes_mask, },
#endif
{ NULL, },
};
static struct sched_domain *__build_mc_sched_domain(struct s_data *d, static struct sched_domain_topology_level *sched_domain_topology = default_topology;
const struct cpumask *cpu_map, struct sched_domain_attr *attr,
struct sched_domain *parent, int i) static int __sdt_alloc(const struct cpumask *cpu_map)
{ {
struct sched_domain *sd = parent; struct sched_domain_topology_level *tl;
#ifdef CONFIG_SCHED_MC int j;
sd = &per_cpu(core_domains, i).sd;
SD_INIT(sd, MC); for (tl = sched_domain_topology; tl->init; tl++) {
set_domain_attribute(sd, attr); struct sd_data *sdd = &tl->data;
cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i));
sd->parent = parent; sdd->sd = alloc_percpu(struct sched_domain *);
parent->child = sd; if (!sdd->sd)
cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask); return -ENOMEM;
#endif
return sd; sdd->sg = alloc_percpu(struct sched_group *);
if (!sdd->sg)
return -ENOMEM;
for_each_cpu(j, cpu_map) {
struct sched_domain *sd;
struct sched_group *sg;
sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
GFP_KERNEL, cpu_to_node(j));
if (!sd)
return -ENOMEM;
*per_cpu_ptr(sdd->sd, j) = sd;
sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
GFP_KERNEL, cpu_to_node(j));
if (!sg)
return -ENOMEM;
*per_cpu_ptr(sdd->sg, j) = sg;
}
}
return 0;
} }
static struct sched_domain *__build_smt_sched_domain(struct s_data *d, static void __sdt_free(const struct cpumask *cpu_map)
const struct cpumask *cpu_map, struct sched_domain_attr *attr,
struct sched_domain *parent, int i)
{ {
struct sched_domain *sd = parent; struct sched_domain_topology_level *tl;
#ifdef CONFIG_SCHED_SMT int j;
sd = &per_cpu(cpu_domains, i).sd;
SD_INIT(sd, SIBLING); for (tl = sched_domain_topology; tl->init; tl++) {
set_domain_attribute(sd, attr); struct sd_data *sdd = &tl->data;
cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i));
sd->parent = parent; for_each_cpu(j, cpu_map) {
parent->child = sd; kfree(*per_cpu_ptr(sdd->sd, j));
cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask); kfree(*per_cpu_ptr(sdd->sg, j));
#endif }
return sd; free_percpu(sdd->sd);
free_percpu(sdd->sg);
}
} }
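__sdt_alloc() gives every topology level two per-cpu pointer arrays (sdd->sd and sdd->sg) and pre-allocates one sched_domain and one sched_group per CPU at each level, so the later domain/group wiring cannot fail midway; __sdt_free() releases the same storage. A user-space sketch of that layout with plain arrays standing in for alloc_percpu()/per_cpu_ptr() (struct sd_data_ish and the other names are invented):

#include <stdlib.h>

#define NCPUS 4

struct domain { int level; };
struct group  { int id; };

struct sd_data_ish {
    struct domain **sd;         /* one slot per cpu */
    struct group  **sg;         /* one slot per cpu */
};

static int sdt_alloc(struct sd_data_ish *sdd)
{
    sdd->sd = calloc(NCPUS, sizeof(*sdd->sd));
    sdd->sg = calloc(NCPUS, sizeof(*sdd->sg));
    if (!sdd->sd || !sdd->sg)
        return -1;

    for (int cpu = 0; cpu < NCPUS; cpu++) {
        sdd->sd[cpu] = calloc(1, sizeof(struct domain));
        sdd->sg[cpu] = calloc(1, sizeof(struct group));
        if (!sdd->sd[cpu] || !sdd->sg[cpu])
            return -1;          /* caller unwinds, as __sdt_free() does */
    }
    return 0;
}

static void sdt_free(struct sd_data_ish *sdd)
{
    for (int cpu = 0; cpu < NCPUS; cpu++) {
        if (sdd->sd)
            free(sdd->sd[cpu]);
        if (sdd->sg)
            free(sdd->sg[cpu]);
    }
    free(sdd->sd);
    free(sdd->sg);
}

int main(void)
{
    struct sd_data_ish sdd = { 0 };

    if (sdt_alloc(&sdd) == 0)
        sdd.sd[0]->level = 1;   /* use one slot, as get_group() would */
    sdt_free(&sdd);
    return 0;
}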
static void build_sched_groups(struct s_data *d, enum sched_domain_level l, struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
const struct cpumask *cpu_map, int cpu) struct s_data *d, const struct cpumask *cpu_map,
struct sched_domain_attr *attr, struct sched_domain *child,
int cpu)
{ {
switch (l) { struct sched_domain *sd = tl->init(tl, cpu);
#ifdef CONFIG_SCHED_SMT if (!sd)
case SD_LV_SIBLING: /* set up CPU (sibling) groups */ return child;
cpumask_and(d->this_sibling_map, cpu_map,
topology_thread_cpumask(cpu)); set_domain_attribute(sd, attr);
if (cpu == cpumask_first(d->this_sibling_map)) cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
init_sched_build_groups(d->this_sibling_map, cpu_map, if (child) {
&cpu_to_cpu_group, sd->level = child->level + 1;
d->send_covered, d->tmpmask); sched_domain_level_max = max(sched_domain_level_max, sd->level);
break; child->parent = sd;
#endif
#ifdef CONFIG_SCHED_MC
case SD_LV_MC: /* set up multi-core groups */
cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu));
if (cpu == cpumask_first(d->this_core_map))
init_sched_build_groups(d->this_core_map, cpu_map,
&cpu_to_core_group,
d->send_covered, d->tmpmask);
break;
#endif
#ifdef CONFIG_SCHED_BOOK
case SD_LV_BOOK: /* set up book groups */
cpumask_and(d->this_book_map, cpu_map, cpu_book_mask(cpu));
if (cpu == cpumask_first(d->this_book_map))
init_sched_build_groups(d->this_book_map, cpu_map,
&cpu_to_book_group,
d->send_covered, d->tmpmask);
break;
#endif
case SD_LV_CPU: /* set up physical groups */
cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map);
if (!cpumask_empty(d->nodemask))
init_sched_build_groups(d->nodemask, cpu_map,
&cpu_to_phys_group,
d->send_covered, d->tmpmask);
break;
#ifdef CONFIG_NUMA
case SD_LV_ALLNODES:
init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group,
d->send_covered, d->tmpmask);
break;
#endif
default:
break;
} }
sd->child = child;
return sd;
} }
/* /*
* Build sched domains for a given set of cpus and attach the sched domains * Build sched domains for a given set of cpus and attach the sched domains
* to the individual cpus * to the individual cpus
*/ */
static int __build_sched_domains(const struct cpumask *cpu_map, static int build_sched_domains(const struct cpumask *cpu_map,
struct sched_domain_attr *attr) struct sched_domain_attr *attr)
{ {
enum s_alloc alloc_state = sa_none; enum s_alloc alloc_state = sa_none;
struct s_data d;
struct sched_domain *sd; struct sched_domain *sd;
int i; struct s_data d;
#ifdef CONFIG_NUMA int i, ret = -ENOMEM;
d.sd_allnodes = 0;
#endif
alloc_state = __visit_domain_allocation_hell(&d, cpu_map); alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
if (alloc_state != sa_rootdomain) if (alloc_state != sa_rootdomain)
goto error; goto error;
alloc_state = sa_sched_groups;
/* /* Set up domains for cpus specified by the cpu_map. */
* Set up domains for cpus specified by the cpu_map.
*/
for_each_cpu(i, cpu_map) { for_each_cpu(i, cpu_map) {
cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)), struct sched_domain_topology_level *tl;
cpu_map);
sd = __build_numa_sched_domains(&d, cpu_map, attr, i); sd = NULL;
sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i); for (tl = sched_domain_topology; tl->init; tl++)
sd = __build_book_sched_domain(&d, cpu_map, attr, sd, i); sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i);
sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
}
for_each_cpu(i, cpu_map) { while (sd->child)
build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i); sd = sd->child;
build_sched_groups(&d, SD_LV_BOOK, cpu_map, i);
build_sched_groups(&d, SD_LV_MC, cpu_map, i);
}
/* Set up physical groups */ *per_cpu_ptr(d.sd, i) = sd;
for (i = 0; i < nr_node_ids; i++) }
build_sched_groups(&d, SD_LV_CPU, cpu_map, i);
#ifdef CONFIG_NUMA /* Build the groups for the domains */
/* Set up node groups */ for_each_cpu(i, cpu_map) {
if (d.sd_allnodes) for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0); sd->span_weight = cpumask_weight(sched_domain_span(sd));
get_group(i, sd->private, &sd->groups);
atomic_inc(&sd->groups->ref);
for (i = 0; i < nr_node_ids; i++) if (i != cpumask_first(sched_domain_span(sd)))
if (build_numa_sched_groups(&d, cpu_map, i)) continue;
goto error;
#endif
/* Calculate CPU power for physical packages and nodes */ build_sched_groups(sd);
#ifdef CONFIG_SCHED_SMT
for_each_cpu(i, cpu_map) {
sd = &per_cpu(cpu_domains, i).sd;
init_sched_groups_power(i, sd);
} }
#endif
#ifdef CONFIG_SCHED_MC
for_each_cpu(i, cpu_map) {
sd = &per_cpu(core_domains, i).sd;
init_sched_groups_power(i, sd);
}
#endif
#ifdef CONFIG_SCHED_BOOK
for_each_cpu(i, cpu_map) {
sd = &per_cpu(book_domains, i).sd;
init_sched_groups_power(i, sd);
} }
#endif
for_each_cpu(i, cpu_map) { /* Calculate CPU power for physical packages and nodes */
sd = &per_cpu(phys_domains, i).sd; for (i = nr_cpumask_bits-1; i >= 0; i--) {
if (!cpumask_test_cpu(i, cpu_map))
continue;
for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
claim_allocations(i, sd);
init_sched_groups_power(i, sd); init_sched_groups_power(i, sd);
} }
#ifdef CONFIG_NUMA
for (i = 0; i < nr_node_ids; i++)
init_numa_sched_groups_power(d.sched_group_nodes[i]);
if (d.sd_allnodes) {
struct sched_group *sg;
cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg,
d.tmpmask);
init_numa_sched_groups_power(sg);
} }
#endif
/* Attach the domains */ /* Attach the domains */
rcu_read_lock();
for_each_cpu(i, cpu_map) { for_each_cpu(i, cpu_map) {
#ifdef CONFIG_SCHED_SMT sd = *per_cpu_ptr(d.sd, i);
sd = &per_cpu(cpu_domains, i).sd;
#elif defined(CONFIG_SCHED_MC)
sd = &per_cpu(core_domains, i).sd;
#elif defined(CONFIG_SCHED_BOOK)
sd = &per_cpu(book_domains, i).sd;
#else
sd = &per_cpu(phys_domains, i).sd;
#endif
cpu_attach_domain(sd, d.rd, i); cpu_attach_domain(sd, d.rd, i);
} }
rcu_read_unlock();
d.sched_group_nodes = NULL; /* don't free this we still need it */ ret = 0;
__free_domain_allocs(&d, sa_tmpmask, cpu_map);
return 0;
error: error:
__free_domain_allocs(&d, alloc_state, cpu_map); __free_domain_allocs(&d, alloc_state, cpu_map);
return -ENOMEM; return ret;
}
static int build_sched_domains(const struct cpumask *cpu_map)
{
return __build_sched_domains(cpu_map, NULL);
} }
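The table-driven setup above replaces one hand-rolled function per topology level with a single loop over sched_domain_topology, so adding or removing a level only touches the table. A minimal, self-contained sketch of the same pattern follows; the simplified types and level names here are illustrative stand-ins, not the kernel's structures:

#include <stdio.h>

/* Hypothetical, simplified stand-ins for the kernel's topology-level table. */
struct level { const char *name; };

struct topology_level {
        struct level *(*init)(int cpu);    /* returns NULL if level is absent */
        const char  *(*mask)(int cpu);     /* which CPUs this level spans     */
};

static struct level smt = { "SMT" }, mc = { "MC" }, pkg = { "CPU" };

static struct level *init_smt(int cpu) { return &smt; }
static struct level *init_mc(int cpu)  { return &mc; }
static struct level *init_pkg(int cpu) { return &pkg; }

static const char *mask_smt(int cpu) { return "thread siblings"; }
static const char *mask_mc(int cpu)  { return "core siblings"; }
static const char *mask_pkg(int cpu) { return "whole package"; }

/* Bottom-up table terminated by a NULL init, like default_topology[]. */
static struct topology_level topology[] = {
        { init_smt, mask_smt },
        { init_mc,  mask_mc  },
        { init_pkg, mask_pkg },
        { NULL, NULL },
};

int main(void)
{
        struct topology_level *tl;
        int cpu = 0;

        /* Walk the table bottom-up, as build_sched_domains() does per CPU. */
        for (tl = topology; tl->init; tl++) {
                struct level *l = tl->init(cpu);
                if (!l)
                        continue;      /* level not present on this machine */
                printf("cpu%d: domain %-3s spans %s\n", cpu, l->name, tl->mask(cpu));
        }
        return 0;
}

Each iteration links the new domain above the previous one, exactly as build_sched_domain() chains child->parent in the real code.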
static cpumask_var_t *doms_cur; /* current sched domains */ static cpumask_var_t *doms_cur; /* current sched domains */
...@@ -7670,7 +7362,7 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) ...@@ -7670,7 +7362,7 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
* For now this just excludes isolated cpus, but could be used to * For now this just excludes isolated cpus, but could be used to
* exclude other special cases in the future. * exclude other special cases in the future.
*/ */
static int arch_init_sched_domains(const struct cpumask *cpu_map) static int init_sched_domains(const struct cpumask *cpu_map)
{ {
int err; int err;
...@@ -7681,32 +7373,24 @@ static int arch_init_sched_domains(const struct cpumask *cpu_map) ...@@ -7681,32 +7373,24 @@ static int arch_init_sched_domains(const struct cpumask *cpu_map)
doms_cur = &fallback_doms; doms_cur = &fallback_doms;
cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
dattr_cur = NULL; dattr_cur = NULL;
err = build_sched_domains(doms_cur[0]); err = build_sched_domains(doms_cur[0], NULL);
register_sched_domain_sysctl(); register_sched_domain_sysctl();
return err; return err;
} }
static void arch_destroy_sched_domains(const struct cpumask *cpu_map,
struct cpumask *tmpmask)
{
free_sched_groups(cpu_map, tmpmask);
}
/* /*
* Detach sched domains from a group of cpus specified in cpu_map * Detach sched domains from a group of cpus specified in cpu_map
* These cpus will now be attached to the NULL domain * These cpus will now be attached to the NULL domain
*/ */
static void detach_destroy_domains(const struct cpumask *cpu_map) static void detach_destroy_domains(const struct cpumask *cpu_map)
{ {
/* Save because hotplug lock held. */
static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS);
int i; int i;
rcu_read_lock();
for_each_cpu(i, cpu_map) for_each_cpu(i, cpu_map)
cpu_attach_domain(NULL, &def_root_domain, i); cpu_attach_domain(NULL, &def_root_domain, i);
synchronize_sched(); rcu_read_unlock();
arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask));
} }
/* handle null as "default" */ /* handle null as "default" */
...@@ -7795,8 +7479,7 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], ...@@ -7795,8 +7479,7 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
goto match2; goto match2;
} }
/* no match - add a new doms_new */ /* no match - add a new doms_new */
__build_sched_domains(doms_new[i], build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
dattr_new ? dattr_new + i : NULL);
match2: match2:
; ;
} }
...@@ -7815,7 +7498,7 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], ...@@ -7815,7 +7498,7 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
} }
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
static void arch_reinit_sched_domains(void) static void reinit_sched_domains(void)
{ {
get_online_cpus(); get_online_cpus();
...@@ -7848,7 +7531,7 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) ...@@ -7848,7 +7531,7 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
else else
sched_mc_power_savings = level; sched_mc_power_savings = level;
arch_reinit_sched_domains(); reinit_sched_domains();
return count; return count;
} }
...@@ -7967,14 +7650,9 @@ void __init sched_init_smp(void) ...@@ -7967,14 +7650,9 @@ void __init sched_init_smp(void)
alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
alloc_cpumask_var(&fallback_doms, GFP_KERNEL); alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
#if defined(CONFIG_NUMA)
sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
GFP_KERNEL);
BUG_ON(sched_group_nodes_bycpu == NULL);
#endif
get_online_cpus(); get_online_cpus();
mutex_lock(&sched_domains_mutex); mutex_lock(&sched_domains_mutex);
arch_init_sched_domains(cpu_active_mask); init_sched_domains(cpu_active_mask);
cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
if (cpumask_empty(non_isolated_cpus)) if (cpumask_empty(non_isolated_cpus))
cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
...@@ -8281,6 +7959,7 @@ void __init sched_init(void) ...@@ -8281,6 +7959,7 @@ void __init sched_init(void)
/* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
#ifdef CONFIG_NO_HZ #ifdef CONFIG_NO_HZ
zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT); alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
...@@ -8340,7 +8019,7 @@ static void normalize_task(struct rq *rq, struct task_struct *p) ...@@ -8340,7 +8019,7 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
int old_prio = p->prio; int old_prio = p->prio;
int on_rq; int on_rq;
on_rq = p->se.on_rq; on_rq = p->on_rq;
if (on_rq) if (on_rq)
deactivate_task(rq, p, 0); deactivate_task(rq, p, 0);
__setscheduler(rq, p, SCHED_NORMAL, 0); __setscheduler(rq, p, SCHED_NORMAL, 0);
...@@ -8553,7 +8232,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) ...@@ -8553,7 +8232,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
{ {
struct rt_rq *rt_rq; struct rt_rq *rt_rq;
struct sched_rt_entity *rt_se; struct sched_rt_entity *rt_se;
struct rq *rq;
int i; int i;
tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
...@@ -8567,8 +8245,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) ...@@ -8567,8 +8245,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
ktime_to_ns(def_rt_bandwidth.rt_period), 0); ktime_to_ns(def_rt_bandwidth.rt_period), 0);
for_each_possible_cpu(i) { for_each_possible_cpu(i) {
rq = cpu_rq(i);
rt_rq = kzalloc_node(sizeof(struct rt_rq), rt_rq = kzalloc_node(sizeof(struct rt_rq),
GFP_KERNEL, cpu_to_node(i)); GFP_KERNEL, cpu_to_node(i));
if (!rt_rq) if (!rt_rq)
...@@ -8683,7 +8359,7 @@ void sched_move_task(struct task_struct *tsk) ...@@ -8683,7 +8359,7 @@ void sched_move_task(struct task_struct *tsk)
rq = task_rq_lock(tsk, &flags); rq = task_rq_lock(tsk, &flags);
running = task_current(rq, tsk); running = task_current(rq, tsk);
on_rq = tsk->se.on_rq; on_rq = tsk->on_rq;
if (on_rq) if (on_rq)
dequeue_task(rq, tsk, 0); dequeue_task(rq, tsk, 0);
...@@ -8702,7 +8378,7 @@ void sched_move_task(struct task_struct *tsk) ...@@ -8702,7 +8378,7 @@ void sched_move_task(struct task_struct *tsk)
if (on_rq) if (on_rq)
enqueue_task(rq, tsk, 0); enqueue_task(rq, tsk, 0);
task_rq_unlock(rq, &flags); task_rq_unlock(rq, tsk, &flags);
} }
#endif /* CONFIG_CGROUP_SCHED */ #endif /* CONFIG_CGROUP_SCHED */
......
...@@ -152,7 +152,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) ...@@ -152,7 +152,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
read_lock_irqsave(&tasklist_lock, flags); read_lock_irqsave(&tasklist_lock, flags);
do_each_thread(g, p) { do_each_thread(g, p) {
if (!p->se.on_rq || task_cpu(p) != rq_cpu) if (!p->on_rq || task_cpu(p) != rq_cpu)
continue; continue;
print_task(m, rq, p); print_task(m, rq, p);
...@@ -296,9 +296,6 @@ static void print_cpu(struct seq_file *m, int cpu) ...@@ -296,9 +296,6 @@ static void print_cpu(struct seq_file *m, int cpu)
P(ttwu_count); P(ttwu_count);
P(ttwu_local); P(ttwu_local);
SEQ_printf(m, " .%-30s: %d\n", "bkl_count",
rq->rq_sched_info.bkl_count);
#undef P #undef P
#undef P64 #undef P64
#endif #endif
...@@ -441,7 +438,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) ...@@ -441,7 +438,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
P(se.statistics.wait_count); P(se.statistics.wait_count);
PN(se.statistics.iowait_sum); PN(se.statistics.iowait_sum);
P(se.statistics.iowait_count); P(se.statistics.iowait_count);
P(sched_info.bkl_count);
P(se.nr_migrations); P(se.nr_migrations);
P(se.statistics.nr_migrations_cold); P(se.statistics.nr_migrations_cold);
P(se.statistics.nr_failed_migrations_affine); P(se.statistics.nr_failed_migrations_affine);
......
...@@ -358,6 +358,10 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) ...@@ -358,6 +358,10 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
} }
cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime); cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
#ifndef CONFIG_64BIT
smp_wmb();
cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
#endif
} }
/* /*
...@@ -1340,6 +1344,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) ...@@ -1340,6 +1344,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
hrtick_update(rq); hrtick_update(rq);
} }
static void set_next_buddy(struct sched_entity *se);
/* /*
* The dequeue_task method is called before nr_running is * The dequeue_task method is called before nr_running is
* decreased. We remove the task from the rbtree and * decreased. We remove the task from the rbtree and
...@@ -1349,14 +1355,22 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) ...@@ -1349,14 +1355,22 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{ {
struct cfs_rq *cfs_rq; struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se; struct sched_entity *se = &p->se;
int task_sleep = flags & DEQUEUE_SLEEP;
for_each_sched_entity(se) { for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se); cfs_rq = cfs_rq_of(se);
dequeue_entity(cfs_rq, se, flags); dequeue_entity(cfs_rq, se, flags);
/* Don't dequeue parent if it has other entities besides us */ /* Don't dequeue parent if it has other entities besides us */
if (cfs_rq->load.weight) if (cfs_rq->load.weight) {
/*
* Bias pick_next to pick a task from this cfs_rq, as
* p is sleeping when it is within its sched_slice.
*/
if (task_sleep && parent_entity(se))
set_next_buddy(parent_entity(se));
break; break;
}
flags |= DEQUEUE_SLEEP; flags |= DEQUEUE_SLEEP;
} }
...@@ -1372,12 +1386,25 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) ...@@ -1372,12 +1386,25 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
static void task_waking_fair(struct rq *rq, struct task_struct *p) static void task_waking_fair(struct task_struct *p)
{ {
struct sched_entity *se = &p->se; struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq = cfs_rq_of(se); struct cfs_rq *cfs_rq = cfs_rq_of(se);
u64 min_vruntime;
se->vruntime -= cfs_rq->min_vruntime; #ifndef CONFIG_64BIT
u64 min_vruntime_copy;
do {
min_vruntime_copy = cfs_rq->min_vruntime_copy;
smp_rmb();
min_vruntime = cfs_rq->min_vruntime;
} while (min_vruntime != min_vruntime_copy);
#else
min_vruntime = cfs_rq->min_vruntime;
#endif
se->vruntime -= min_vruntime;
} }
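The min_vruntime_copy dance above exists because a 64-bit load is not atomic on 32-bit targets: the writer (update_min_vruntime() earlier in this patch) stores the value, issues smp_wmb(), then stores the copy; the reader retries until both reads agree. A minimal sketch of that protocol, using the kernel's barrier names but hypothetical helper names:

/* Writer side, cf. update_min_vruntime(). */
static void publish_min_vruntime(struct cfs_rq *cfs_rq, u64 new)
{
        cfs_rq->min_vruntime = new;
#ifndef CONFIG_64BIT
        smp_wmb();                              /* order value before copy */
        cfs_rq->min_vruntime_copy = new;
#endif
}

/* Reader side, cf. task_waking_fair(): retry until a consistent snapshot. */
static u64 read_min_vruntime(struct cfs_rq *cfs_rq)
{
        u64 v;
#ifndef CONFIG_64BIT
        u64 copy;
        do {
                copy = cfs_rq->min_vruntime_copy;
                smp_rmb();                      /* pairs with the smp_wmb() */
                v = cfs_rq->min_vruntime;
        } while (v != copy);
#else
        v = cfs_rq->min_vruntime;               /* 64-bit loads are atomic */
#endif
        return v;
}

This is what lets the new ttwu() path call task_waking_fair() without holding the remote rq->lock.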
#ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_FAIR_GROUP_SCHED
...@@ -1622,6 +1649,7 @@ static int select_idle_sibling(struct task_struct *p, int target) ...@@ -1622,6 +1649,7 @@ static int select_idle_sibling(struct task_struct *p, int target)
/* /*
* Otherwise, iterate the domains and find an eligible idle cpu. * Otherwise, iterate the domains and find an eligible idle cpu.
*/ */
rcu_read_lock();
for_each_domain(target, sd) { for_each_domain(target, sd) {
if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) if (!(sd->flags & SD_SHARE_PKG_RESOURCES))
break; break;
...@@ -1641,6 +1669,7 @@ static int select_idle_sibling(struct task_struct *p, int target) ...@@ -1641,6 +1669,7 @@ static int select_idle_sibling(struct task_struct *p, int target)
cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
break; break;
} }
rcu_read_unlock();
return target; return target;
} }
...@@ -1657,7 +1686,7 @@ static int select_idle_sibling(struct task_struct *p, int target) ...@@ -1657,7 +1686,7 @@ static int select_idle_sibling(struct task_struct *p, int target)
* preempt must be disabled. * preempt must be disabled.
*/ */
static int static int
select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_flags) select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
{ {
struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
int cpu = smp_processor_id(); int cpu = smp_processor_id();
...@@ -1673,6 +1702,7 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_ ...@@ -1673,6 +1702,7 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
new_cpu = prev_cpu; new_cpu = prev_cpu;
} }
rcu_read_lock();
for_each_domain(cpu, tmp) { for_each_domain(cpu, tmp) {
if (!(tmp->flags & SD_LOAD_BALANCE)) if (!(tmp->flags & SD_LOAD_BALANCE))
continue; continue;
...@@ -1723,9 +1753,10 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_ ...@@ -1723,9 +1753,10 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
if (affine_sd) { if (affine_sd) {
if (cpu == prev_cpu || wake_affine(affine_sd, p, sync)) if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
return select_idle_sibling(p, cpu); prev_cpu = cpu;
else
return select_idle_sibling(p, prev_cpu); new_cpu = select_idle_sibling(p, prev_cpu);
goto unlock;
} }
while (sd) { while (sd) {
...@@ -1766,6 +1797,8 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_ ...@@ -1766,6 +1797,8 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
} }
/* while loop will break here if sd == NULL */ /* while loop will break here if sd == NULL */
} }
unlock:
rcu_read_unlock();
return new_cpu; return new_cpu;
} }
...@@ -1789,10 +1822,7 @@ wakeup_gran(struct sched_entity *curr, struct sched_entity *se) ...@@ -1789,10 +1822,7 @@ wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
* This is especially important for buddies when the leftmost * This is especially important for buddies when the leftmost
* task is higher priority than the buddy. * task is higher priority than the buddy.
*/ */
if (unlikely(se->load.weight != NICE_0_LOAD)) return calc_delta_fair(gran, se);
gran = calc_delta_fair(gran, se);
return gran;
} }
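Dropping the explicit NICE_0_LOAD check is safe because calc_delta_fair() already returns the granularity unchanged for a nice-0 entity and otherwise scales it by NICE_0_LOAD / se->load.weight. A rough worked example, with NICE_0_LOAD taken as 1024 and the other weights purely illustrative:

        gran = sysctl_sched_wakeup_granularity          (say 1,000,000 ns)
        weight == 1024 (nice 0)        -> gran unchanged, ~1,000,000 ns
        weight == 2048 (hypothetical)  -> gran * 1024/2048 ~=   500,000 ns
        weight ==  512 (hypothetical)  -> gran * 1024/512  ~= 2,000,000 ns

Heavier wakees therefore see a smaller granularity and preempt more easily, which is the behaviour the old special case was merely short-circuiting.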
/* /*
...@@ -1826,26 +1856,26 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) ...@@ -1826,26 +1856,26 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
static void set_last_buddy(struct sched_entity *se) static void set_last_buddy(struct sched_entity *se)
{ {
if (likely(task_of(se)->policy != SCHED_IDLE)) { if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
return;
for_each_sched_entity(se) for_each_sched_entity(se)
cfs_rq_of(se)->last = se; cfs_rq_of(se)->last = se;
}
} }
static void set_next_buddy(struct sched_entity *se) static void set_next_buddy(struct sched_entity *se)
{ {
if (likely(task_of(se)->policy != SCHED_IDLE)) { if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
return;
for_each_sched_entity(se) for_each_sched_entity(se)
cfs_rq_of(se)->next = se; cfs_rq_of(se)->next = se;
}
} }
static void set_skip_buddy(struct sched_entity *se) static void set_skip_buddy(struct sched_entity *se)
{ {
if (likely(task_of(se)->policy != SCHED_IDLE)) {
for_each_sched_entity(se) for_each_sched_entity(se)
cfs_rq_of(se)->skip = se; cfs_rq_of(se)->skip = se;
}
} }
/* /*
...@@ -1857,12 +1887,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ ...@@ -1857,12 +1887,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
struct sched_entity *se = &curr->se, *pse = &p->se; struct sched_entity *se = &curr->se, *pse = &p->se;
struct cfs_rq *cfs_rq = task_cfs_rq(curr); struct cfs_rq *cfs_rq = task_cfs_rq(curr);
int scale = cfs_rq->nr_running >= sched_nr_latency; int scale = cfs_rq->nr_running >= sched_nr_latency;
int next_buddy_marked = 0;
if (unlikely(se == pse)) if (unlikely(se == pse))
return; return;
if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
set_next_buddy(pse); set_next_buddy(pse);
next_buddy_marked = 1;
}
/* /*
* We can come here with TIF_NEED_RESCHED already set from new task * We can come here with TIF_NEED_RESCHED already set from new task
...@@ -1890,8 +1923,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ ...@@ -1890,8 +1923,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
update_curr(cfs_rq); update_curr(cfs_rq);
find_matching_se(&se, &pse); find_matching_se(&se, &pse);
BUG_ON(!pse); BUG_ON(!pse);
if (wakeup_preempt_entity(se, pse) == 1) if (wakeup_preempt_entity(se, pse) == 1) {
/*
* Bias pick_next to pick the sched entity that is
* triggering this preemption.
*/
if (!next_buddy_marked)
set_next_buddy(pse);
goto preempt; goto preempt;
}
return; return;
...@@ -2102,7 +2142,7 @@ static unsigned long ...@@ -2102,7 +2142,7 @@ static unsigned long
balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
unsigned long max_load_move, struct sched_domain *sd, unsigned long max_load_move, struct sched_domain *sd,
enum cpu_idle_type idle, int *all_pinned, enum cpu_idle_type idle, int *all_pinned,
int *this_best_prio, struct cfs_rq *busiest_cfs_rq) struct cfs_rq *busiest_cfs_rq)
{ {
int loops = 0, pulled = 0; int loops = 0, pulled = 0;
long rem_load_move = max_load_move; long rem_load_move = max_load_move;
...@@ -2140,9 +2180,6 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, ...@@ -2140,9 +2180,6 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
*/ */
if (rem_load_move <= 0) if (rem_load_move <= 0)
break; break;
if (p->prio < *this_best_prio)
*this_best_prio = p->prio;
} }
out: out:
/* /*
...@@ -2202,7 +2239,7 @@ static unsigned long ...@@ -2202,7 +2239,7 @@ static unsigned long
load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
unsigned long max_load_move, unsigned long max_load_move,
struct sched_domain *sd, enum cpu_idle_type idle, struct sched_domain *sd, enum cpu_idle_type idle,
int *all_pinned, int *this_best_prio) int *all_pinned)
{ {
long rem_load_move = max_load_move; long rem_load_move = max_load_move;
int busiest_cpu = cpu_of(busiest); int busiest_cpu = cpu_of(busiest);
...@@ -2227,7 +2264,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, ...@@ -2227,7 +2264,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
rem_load = div_u64(rem_load, busiest_h_load + 1); rem_load = div_u64(rem_load, busiest_h_load + 1);
moved_load = balance_tasks(this_rq, this_cpu, busiest, moved_load = balance_tasks(this_rq, this_cpu, busiest,
rem_load, sd, idle, all_pinned, this_best_prio, rem_load, sd, idle, all_pinned,
busiest_cfs_rq); busiest_cfs_rq);
if (!moved_load) if (!moved_load)
...@@ -2253,11 +2290,11 @@ static unsigned long ...@@ -2253,11 +2290,11 @@ static unsigned long
load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
unsigned long max_load_move, unsigned long max_load_move,
struct sched_domain *sd, enum cpu_idle_type idle, struct sched_domain *sd, enum cpu_idle_type idle,
int *all_pinned, int *this_best_prio) int *all_pinned)
{ {
return balance_tasks(this_rq, this_cpu, busiest, return balance_tasks(this_rq, this_cpu, busiest,
max_load_move, sd, idle, all_pinned, max_load_move, sd, idle, all_pinned,
this_best_prio, &busiest->cfs); &busiest->cfs);
} }
#endif #endif
...@@ -2274,12 +2311,11 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, ...@@ -2274,12 +2311,11 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
int *all_pinned) int *all_pinned)
{ {
unsigned long total_load_moved = 0, load_moved; unsigned long total_load_moved = 0, load_moved;
int this_best_prio = this_rq->curr->prio;
do { do {
load_moved = load_balance_fair(this_rq, this_cpu, busiest, load_moved = load_balance_fair(this_rq, this_cpu, busiest,
max_load_move - total_load_moved, max_load_move - total_load_moved,
sd, idle, all_pinned, &this_best_prio); sd, idle, all_pinned);
total_load_moved += load_moved; total_load_moved += load_moved;
...@@ -2648,7 +2684,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) ...@@ -2648,7 +2684,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
/* /*
* Only siblings can have significantly less than SCHED_LOAD_SCALE * Only siblings can have significantly less than SCHED_LOAD_SCALE
*/ */
if (sd->level != SD_LV_SIBLING) if (!(sd->flags & SD_SHARE_CPUPOWER))
return 0; return 0;
/* /*
...@@ -3465,6 +3501,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq) ...@@ -3465,6 +3501,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
raw_spin_unlock(&this_rq->lock); raw_spin_unlock(&this_rq->lock);
update_shares(this_cpu); update_shares(this_cpu);
rcu_read_lock();
for_each_domain(this_cpu, sd) { for_each_domain(this_cpu, sd) {
unsigned long interval; unsigned long interval;
int balance = 1; int balance = 1;
...@@ -3486,6 +3523,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq) ...@@ -3486,6 +3523,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
break; break;
} }
} }
rcu_read_unlock();
raw_spin_lock(&this_rq->lock); raw_spin_lock(&this_rq->lock);
...@@ -3534,6 +3572,7 @@ static int active_load_balance_cpu_stop(void *data) ...@@ -3534,6 +3572,7 @@ static int active_load_balance_cpu_stop(void *data)
double_lock_balance(busiest_rq, target_rq); double_lock_balance(busiest_rq, target_rq);
/* Search for an sd spanning us and the target CPU. */ /* Search for an sd spanning us and the target CPU. */
rcu_read_lock();
for_each_domain(target_cpu, sd) { for_each_domain(target_cpu, sd) {
if ((sd->flags & SD_LOAD_BALANCE) && if ((sd->flags & SD_LOAD_BALANCE) &&
cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
...@@ -3549,6 +3588,7 @@ static int active_load_balance_cpu_stop(void *data) ...@@ -3549,6 +3588,7 @@ static int active_load_balance_cpu_stop(void *data)
else else
schedstat_inc(sd, alb_failed); schedstat_inc(sd, alb_failed);
} }
rcu_read_unlock();
double_unlock_balance(busiest_rq, target_rq); double_unlock_balance(busiest_rq, target_rq);
out_unlock: out_unlock:
busiest_rq->active_balance = 0; busiest_rq->active_balance = 0;
...@@ -3675,6 +3715,7 @@ static int find_new_ilb(int cpu) ...@@ -3675,6 +3715,7 @@ static int find_new_ilb(int cpu)
{ {
struct sched_domain *sd; struct sched_domain *sd;
struct sched_group *ilb_group; struct sched_group *ilb_group;
int ilb = nr_cpu_ids;
/* /*
* Have idle load balancer selection from semi-idle packages only * Have idle load balancer selection from semi-idle packages only
...@@ -3690,20 +3731,25 @@ static int find_new_ilb(int cpu) ...@@ -3690,20 +3731,25 @@ static int find_new_ilb(int cpu)
if (cpumask_weight(nohz.idle_cpus_mask) < 2) if (cpumask_weight(nohz.idle_cpus_mask) < 2)
goto out_done; goto out_done;
rcu_read_lock();
for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
ilb_group = sd->groups; ilb_group = sd->groups;
do { do {
if (is_semi_idle_group(ilb_group)) if (is_semi_idle_group(ilb_group)) {
return cpumask_first(nohz.grp_idle_mask); ilb = cpumask_first(nohz.grp_idle_mask);
goto unlock;
}
ilb_group = ilb_group->next; ilb_group = ilb_group->next;
} while (ilb_group != sd->groups); } while (ilb_group != sd->groups);
} }
unlock:
rcu_read_unlock();
out_done: out_done:
return nr_cpu_ids; return ilb;
} }
#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ #else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
static inline int find_new_ilb(int call_cpu) static inline int find_new_ilb(int call_cpu)
...@@ -3848,6 +3894,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) ...@@ -3848,6 +3894,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
update_shares(cpu); update_shares(cpu);
rcu_read_lock();
for_each_domain(cpu, sd) { for_each_domain(cpu, sd) {
if (!(sd->flags & SD_LOAD_BALANCE)) if (!(sd->flags & SD_LOAD_BALANCE))
continue; continue;
...@@ -3893,6 +3940,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) ...@@ -3893,6 +3940,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
if (!balance) if (!balance)
break; break;
} }
rcu_read_unlock();
/* /*
* next_balance will be updated only when there is a need. * next_balance will be updated only when there is a need.
......
...@@ -64,3 +64,9 @@ SCHED_FEAT(OWNER_SPIN, 1) ...@@ -64,3 +64,9 @@ SCHED_FEAT(OWNER_SPIN, 1)
* Decrement CPU power based on irq activity * Decrement CPU power based on irq activity
*/ */
SCHED_FEAT(NONIRQ_POWER, 1) SCHED_FEAT(NONIRQ_POWER, 1)
/*
* Queue remote wakeups on the target CPU and process them
* using the scheduler IPI. Reduces rq->lock contention/bounces.
*/
SCHED_FEAT(TTWU_QUEUE, 1)
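The idea behind TTWU_QUEUE is that, instead of the waker taking the remote CPU's rq->lock to finish the wakeup, it appends the task to a per-CPU wake list and sends the scheduler IPI; the target CPU then completes the wakeups locally from scheduler_ipi(). A very loose userspace sketch of that shape follows; the names and types are hypothetical stand-ins, not the kernel's implementation:

#include <stdatomic.h>
#include <stdio.h>

/* Hypothetical stand-ins: one wake list per CPU, tasks pushed lock-free. */
struct task {
        const char *name;
        struct task *wake_next;
};

struct cpu_rq {
        _Atomic(struct task *) wake_list;       /* like rq->wake_list */
};

/* Waker side: push the task, then (conceptually) send the scheduler IPI. */
static void ttwu_queue_remote(struct cpu_rq *rq, struct task *p)
{
        struct task *old = atomic_load(&rq->wake_list);
        do {
                p->wake_next = old;
        } while (!atomic_compare_exchange_weak(&rq->wake_list, &old, p));
        /* smp_send_reschedule(cpu) would go here in the kernel. */
}

/* Target CPU, from its scheduler IPI: detach the whole list and activate. */
static void drain_wake_list(struct cpu_rq *rq)
{
        struct task *p = atomic_exchange(&rq->wake_list, (struct task *)NULL);

        while (p) {
                struct task *next = p->wake_next;
                printf("activating %s locally, no remote rq->lock bounce\n",
                       p->name);
                p = next;
        }
}

int main(void)
{
        struct cpu_rq rq = { .wake_list = NULL };
        struct task a = { .name = "taskA" }, b = { .name = "taskB" };

        ttwu_queue_remote(&rq, &a);
        ttwu_queue_remote(&rq, &b);
        drain_wake_list(&rq);           /* what scheduler_ipi() triggers */
        return 0;
}

This is why the architecture hunks earlier in the merge add a scheduler_ipi() call to each IPI_RESCHEDULE handler: the reschedule IPI now carries real work.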
...@@ -7,7 +7,7 @@ ...@@ -7,7 +7,7 @@
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
static int static int
select_task_rq_idle(struct rq *rq, struct task_struct *p, int sd_flag, int flags) select_task_rq_idle(struct task_struct *p, int sd_flag, int flags)
{ {
return task_cpu(p); /* IDLE tasks as never migrated */ return task_cpu(p); /* IDLE tasks as never migrated */
} }
......
...@@ -183,6 +183,14 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq) ...@@ -183,6 +183,14 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period); return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
} }
typedef struct task_group *rt_rq_iter_t;
#define for_each_rt_rq(rt_rq, iter, rq) \
for (iter = list_entry_rcu(task_groups.next, typeof(*iter), list); \
(&iter->list != &task_groups) && \
(rt_rq = iter->rt_rq[cpu_of(rq)]); \
iter = list_entry_rcu(iter->list.next, typeof(*iter), list))
static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
{ {
list_add_rcu(&rt_rq->leaf_rt_rq_list, list_add_rcu(&rt_rq->leaf_rt_rq_list,
...@@ -288,6 +296,11 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq) ...@@ -288,6 +296,11 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
return ktime_to_ns(def_rt_bandwidth.rt_period); return ktime_to_ns(def_rt_bandwidth.rt_period);
} }
typedef struct rt_rq *rt_rq_iter_t;
#define for_each_rt_rq(rt_rq, iter, rq) \
for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
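The two for_each_rt_rq() definitions above give callers one iteration idiom regardless of CONFIG_RT_GROUP_SCHED: with group scheduling it walks every task group's rt_rq for the given runqueue under RCU, without it the single rq->rt is visited once. The usage pattern, as the later __disable_runtime() and print_rt_stats() hunks in this patch show (fragment only, types come from sched_rt.c):

        rt_rq_iter_t iter;
        struct rt_rq *rt_rq;

        for_each_rt_rq(rt_rq, iter, rq) {
                /* operate on each rt_rq that belongs to this rq */
        }

Unlike the old for_each_leaf_rt_rq(), this also covers rt_rqs that currently have no runnable tasks, which is what fixes the runtime leak mentioned in the urgent branch.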
static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
{ {
} }
...@@ -402,12 +415,13 @@ static int do_balance_runtime(struct rt_rq *rt_rq) ...@@ -402,12 +415,13 @@ static int do_balance_runtime(struct rt_rq *rt_rq)
static void __disable_runtime(struct rq *rq) static void __disable_runtime(struct rq *rq)
{ {
struct root_domain *rd = rq->rd; struct root_domain *rd = rq->rd;
rt_rq_iter_t iter;
struct rt_rq *rt_rq; struct rt_rq *rt_rq;
if (unlikely(!scheduler_running)) if (unlikely(!scheduler_running))
return; return;
for_each_leaf_rt_rq(rt_rq, rq) { for_each_rt_rq(rt_rq, iter, rq) {
struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
s64 want; s64 want;
int i; int i;
...@@ -487,6 +501,7 @@ static void disable_runtime(struct rq *rq) ...@@ -487,6 +501,7 @@ static void disable_runtime(struct rq *rq)
static void __enable_runtime(struct rq *rq) static void __enable_runtime(struct rq *rq)
{ {
rt_rq_iter_t iter;
struct rt_rq *rt_rq; struct rt_rq *rt_rq;
if (unlikely(!scheduler_running)) if (unlikely(!scheduler_running))
...@@ -495,7 +510,7 @@ static void __enable_runtime(struct rq *rq) ...@@ -495,7 +510,7 @@ static void __enable_runtime(struct rq *rq)
/* /*
* Reset each runqueue's bandwidth settings * Reset each runqueue's bandwidth settings
*/ */
for_each_leaf_rt_rq(rt_rq, rq) { for_each_rt_rq(rt_rq, iter, rq) {
struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
raw_spin_lock(&rt_b->rt_runtime_lock); raw_spin_lock(&rt_b->rt_runtime_lock);
...@@ -562,6 +577,13 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) ...@@ -562,6 +577,13 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) { if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
rt_rq->rt_throttled = 0; rt_rq->rt_throttled = 0;
enqueue = 1; enqueue = 1;
/*
* Force a clock update if the CPU was idle,
* lest wakeup -> unthrottle time accumulate.
*/
if (rt_rq->rt_nr_running && rq->curr == rq->idle)
rq->skip_clock_update = -1;
} }
if (rt_rq->rt_time || rt_rq->rt_nr_running) if (rt_rq->rt_time || rt_rq->rt_nr_running)
idle = 0; idle = 0;
...@@ -977,13 +999,23 @@ static void yield_task_rt(struct rq *rq) ...@@ -977,13 +999,23 @@ static void yield_task_rt(struct rq *rq)
static int find_lowest_rq(struct task_struct *task); static int find_lowest_rq(struct task_struct *task);
static int static int
select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags) select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
{ {
struct task_struct *curr;
struct rq *rq;
int cpu;
if (sd_flag != SD_BALANCE_WAKE) if (sd_flag != SD_BALANCE_WAKE)
return smp_processor_id(); return smp_processor_id();
cpu = task_cpu(p);
rq = cpu_rq(cpu);
rcu_read_lock();
curr = ACCESS_ONCE(rq->curr); /* unlocked access */
/* /*
* If the current task is an RT task, then * If the current task on @p's runqueue is an RT task, then
* try to see if we can wake this RT task up on another * try to see if we can wake this RT task up on another
* runqueue. Otherwise simply start this RT task * runqueue. Otherwise simply start this RT task
* on its current runqueue. * on its current runqueue.
...@@ -997,21 +1029,25 @@ select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags) ...@@ -997,21 +1029,25 @@ select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags)
* lock? * lock?
* *
* For equal prio tasks, we just let the scheduler sort it out. * For equal prio tasks, we just let the scheduler sort it out.
*
* Otherwise, just let it ride on the affined RQ and the
* post-schedule router will push the preempted task away
*
* This test is optimistic, if we get it wrong the load-balancer
* will have to sort it out.
*/ */
if (unlikely(rt_task(rq->curr)) && if (curr && unlikely(rt_task(curr)) &&
(rq->curr->rt.nr_cpus_allowed < 2 || (curr->rt.nr_cpus_allowed < 2 ||
rq->curr->prio < p->prio) && curr->prio < p->prio) &&
(p->rt.nr_cpus_allowed > 1)) { (p->rt.nr_cpus_allowed > 1)) {
int cpu = find_lowest_rq(p); int target = find_lowest_rq(p);
return (cpu == -1) ? task_cpu(p) : cpu; if (target != -1)
cpu = target;
} }
rcu_read_unlock();
/* return cpu;
* Otherwise, just let it ride on the affined RQ and the
* post-schedule router will push the preempted task away
*/
return task_cpu(p);
} }
static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
...@@ -1136,7 +1172,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) ...@@ -1136,7 +1172,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
* The previous task needs to be made eligible for pushing * The previous task needs to be made eligible for pushing
* if it is still active * if it is still active
*/ */
if (p->se.on_rq && p->rt.nr_cpus_allowed > 1) if (on_rt_rq(&p->rt) && p->rt.nr_cpus_allowed > 1)
enqueue_pushable_task(rq, p); enqueue_pushable_task(rq, p);
} }
...@@ -1287,7 +1323,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) ...@@ -1287,7 +1323,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
!cpumask_test_cpu(lowest_rq->cpu, !cpumask_test_cpu(lowest_rq->cpu,
&task->cpus_allowed) || &task->cpus_allowed) ||
task_running(rq, task) || task_running(rq, task) ||
!task->se.on_rq)) { !task->on_rq)) {
raw_spin_unlock(&lowest_rq->lock); raw_spin_unlock(&lowest_rq->lock);
lowest_rq = NULL; lowest_rq = NULL;
...@@ -1321,7 +1357,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq) ...@@ -1321,7 +1357,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq)
BUG_ON(task_current(rq, p)); BUG_ON(task_current(rq, p));
BUG_ON(p->rt.nr_cpus_allowed <= 1); BUG_ON(p->rt.nr_cpus_allowed <= 1);
BUG_ON(!p->se.on_rq); BUG_ON(!p->on_rq);
BUG_ON(!rt_task(p)); BUG_ON(!rt_task(p));
return p; return p;
...@@ -1467,7 +1503,7 @@ static int pull_rt_task(struct rq *this_rq) ...@@ -1467,7 +1503,7 @@ static int pull_rt_task(struct rq *this_rq)
*/ */
if (p && (p->prio < this_rq->rt.highest_prio.curr)) { if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
WARN_ON(p == src_rq->curr); WARN_ON(p == src_rq->curr);
WARN_ON(!p->se.on_rq); WARN_ON(!p->on_rq);
/* /*
* There's a chance that p is higher in priority * There's a chance that p is higher in priority
...@@ -1538,7 +1574,7 @@ static void set_cpus_allowed_rt(struct task_struct *p, ...@@ -1538,7 +1574,7 @@ static void set_cpus_allowed_rt(struct task_struct *p,
* Update the migration status of the RQ if we have an RT task * Update the migration status of the RQ if we have an RT task
* which is running AND changing its weight value. * which is running AND changing its weight value.
*/ */
if (p->se.on_rq && (weight != p->rt.nr_cpus_allowed)) { if (p->on_rq && (weight != p->rt.nr_cpus_allowed)) {
struct rq *rq = task_rq(p); struct rq *rq = task_rq(p);
if (!task_current(rq, p)) { if (!task_current(rq, p)) {
...@@ -1608,7 +1644,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p) ...@@ -1608,7 +1644,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
* we may need to handle the pulling of RT tasks * we may need to handle the pulling of RT tasks
* now. * now.
*/ */
if (p->se.on_rq && !rq->rt.rt_nr_running) if (p->on_rq && !rq->rt.rt_nr_running)
pull_rt_task(rq); pull_rt_task(rq);
} }
...@@ -1638,7 +1674,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) ...@@ -1638,7 +1674,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
* If that current running task is also an RT task * If that current running task is also an RT task
* then see if we can move to another run queue. * then see if we can move to another run queue.
*/ */
if (p->se.on_rq && rq->curr != p) { if (p->on_rq && rq->curr != p) {
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
if (rq->rt.overloaded && push_rt_task(rq) && if (rq->rt.overloaded && push_rt_task(rq) &&
/* Don't resched if we changed runqueues */ /* Don't resched if we changed runqueues */
...@@ -1657,7 +1693,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) ...@@ -1657,7 +1693,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
static void static void
prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
{ {
if (!p->se.on_rq) if (!p->on_rq)
return; return;
if (rq->curr == p) { if (rq->curr == p) {
...@@ -1796,10 +1832,11 @@ extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq); ...@@ -1796,10 +1832,11 @@ extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
static void print_rt_stats(struct seq_file *m, int cpu) static void print_rt_stats(struct seq_file *m, int cpu)
{ {
rt_rq_iter_t iter;
struct rt_rq *rt_rq; struct rt_rq *rt_rq;
rcu_read_lock(); rcu_read_lock();
for_each_leaf_rt_rq(rt_rq, cpu_rq(cpu)) for_each_rt_rq(rt_rq, iter, cpu_rq(cpu))
print_rt_rq(m, cpu, rt_rq); print_rt_rq(m, cpu, rt_rq);
rcu_read_unlock(); rcu_read_unlock();
} }
......
...@@ -9,8 +9,7 @@ ...@@ -9,8 +9,7 @@
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
static int static int
select_task_rq_stop(struct rq *rq, struct task_struct *p, select_task_rq_stop(struct task_struct *p, int sd_flag, int flags)
int sd_flag, int flags)
{ {
return task_cpu(p); /* stop tasks as never migrate */ return task_cpu(p); /* stop tasks as never migrate */
} }
...@@ -26,7 +25,7 @@ static struct task_struct *pick_next_task_stop(struct rq *rq) ...@@ -26,7 +25,7 @@ static struct task_struct *pick_next_task_stop(struct rq *rq)
{ {
struct task_struct *stop = rq->stop; struct task_struct *stop = rq->stop;
if (stop && stop->se.on_rq) if (stop && stop->on_rq)
return stop; return stop;
return NULL; return NULL;
......
...@@ -53,7 +53,6 @@ const char *reserved_field_names[] = { ...@@ -53,7 +53,6 @@ const char *reserved_field_names[] = {
"common_preempt_count", "common_preempt_count",
"common_pid", "common_pid",
"common_tgid", "common_tgid",
"common_lock_depth",
FIELD_STRING_IP, FIELD_STRING_IP,
FIELD_STRING_RETIP, FIELD_STRING_RETIP,
FIELD_STRING_FUNC, FIELD_STRING_FUNC,
......
...@@ -63,7 +63,6 @@ The format file for the sched_wakep event defines the following fields ...@@ -63,7 +63,6 @@ The format file for the sched_wakep event defines the following fields
field:unsigned char common_flags; field:unsigned char common_flags;
field:unsigned char common_preempt_count; field:unsigned char common_preempt_count;
field:int common_pid; field:int common_pid;
field:int common_lock_depth;
field:char comm[TASK_COMM_LEN]; field:char comm[TASK_COMM_LEN];
field:pid_t pid; field:pid_t pid;
......
...@@ -463,7 +463,6 @@ The format file for the sched_wakep event defines the following fields ...@@ -463,7 +463,6 @@ The format file for the sched_wakep event defines the following fields
field:unsigned char common_flags; field:unsigned char common_flags;
field:unsigned char common_preempt_count; field:unsigned char common_preempt_count;
field:int common_pid; field:int common_pid;
field:int common_lock_depth;
field:char comm[TASK_COMM_LEN]; field:char comm[TASK_COMM_LEN];
field:pid_t pid; field:pid_t pid;
......