Commit bc4016f4 authored by Linus Torvalds's avatar Linus Torvalds

Merge branch 'sched-core-for-linus' of...

Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (29 commits)
  sched: Export account_system_vtime()
  sched: Call tick_check_idle before __irq_enter
  sched: Remove irq time from available CPU power
  sched: Do not account irq time to current task
  x86: Add IRQ_TIME_ACCOUNTING
  sched: Add IRQ_TIME_ACCOUNTING, finer accounting of irq time
  sched: Add a PF flag for ksoftirqd identification
  sched: Consolidate account_system_vtime extern declaration
  sched: Fix softirq time accounting
  sched: Drop group_capacity to 1 only if local group has extra capacity
  sched: Force balancing on newidle balance if local group has capacity
  sched: Set group_imb only a task can be pulled from the busiest cpu
  sched: Do not consider SCHED_IDLE tasks to be cache hot
  sched: Drop all load weight manipulation for RT tasks
  sched: Create special class for stop/migrate work
  sched: Unindent labels
  sched: Comment updates: fix default latency and granularity numbers
  tracing/sched: Add sched_pi_setprio tracepoint
  sched: Give CPU bound RT tasks preference
  sched: Try not to migrate higher priority RT tasks
  ...
parents 5d70f79b b7dadc38
......@@ -14,25 +14,39 @@ to /proc/cpuinfo.
identifier (rather than the kernel's). The actual value is
architecture and platform dependent.
3) /sys/devices/system/cpu/cpuX/topology/thread_siblings:
3) /sys/devices/system/cpu/cpuX/topology/book_id:
the book ID of cpuX. Typically it is the hardware platform's
identifier (rather than the kernel's). The actual value is
architecture and platform dependent.
4) /sys/devices/system/cpu/cpuX/topology/thread_siblings:
internel kernel map of cpuX's hardware threads within the same
core as cpuX
4) /sys/devices/system/cpu/cpuX/topology/core_siblings:
5) /sys/devices/system/cpu/cpuX/topology/core_siblings:
internal kernel map of cpuX's hardware threads within the same
physical_package_id.
6) /sys/devices/system/cpu/cpuX/topology/book_siblings:
internal kernel map of cpuX's hardware threads within the same
book_id.
To implement it in an architecture-neutral way, a new source file,
drivers/base/topology.c, is to export the 4 attributes.
drivers/base/topology.c, is to export the 4 or 6 attributes. The two book
related sysfs files will only be created if CONFIG_SCHED_BOOK is selected.
For an architecture to support this feature, it must define some of
these macros in include/asm-XXX/topology.h:
#define topology_physical_package_id(cpu)
#define topology_core_id(cpu)
#define topology_book_id(cpu)
#define topology_thread_cpumask(cpu)
#define topology_core_cpumask(cpu)
#define topology_book_cpumask(cpu)
The type of **_id is int.
The type of siblings is (const) struct cpumask *.
......@@ -45,6 +59,9 @@ not defined by include/asm-XXX/topology.h:
3) thread_siblings: just the given CPU
4) core_siblings: just the given CPU
For architectures that don't support books (CONFIG_SCHED_BOOK) there are no
default definitions for topology_book_id() and topology_book_cpumask().
Additionally, CPU topology information is provided under
/sys/devices/system/cpu and includes these files. The internal
source for the output is in brackets ("[]").
......
......@@ -2435,6 +2435,10 @@ and is between 256 and 4096 characters. It is defined in the file
disables clocksource verification at runtime.
Used to enable high-resolution timer mode on older
hardware, and in virtualized environment.
[x86] noirqtime: Do not use TSC to do irq accounting.
Used to run time disable IRQ_TIME_ACCOUNTING on any
platforms where RDTSC is slow and this accounting
can add overhead.
turbografx.map[2|3]= [HW,JOY]
TurboGraFX parallel port interface
......
......@@ -272,10 +272,6 @@ void cpu_idle_wait(void);
void default_idle(void);
#ifdef CONFIG_VIRT_CPU_ACCOUNTING
extern void account_system_vtime(struct task_struct *);
#endif
#endif /* __KERNEL__ */
#endif /* __ASSEMBLY__ */
......
......@@ -542,10 +542,6 @@ extern void reloc_got2(unsigned long);
#define PTRRELOC(x) ((typeof(x)) add_reloc_offset((unsigned long)(x)))
#ifdef CONFIG_VIRT_CPU_ACCOUNTING
extern void account_system_vtime(struct task_struct *);
#endif
extern struct dentry *powerpc_debugfs_root;
#endif /* __KERNEL__ */
......
......@@ -199,6 +199,13 @@ config HOTPLUG_CPU
can be controlled through /sys/devices/system/cpu/cpu#.
Say N if you want to disable CPU hotplug.
config SCHED_BOOK
bool "Book scheduler support"
depends on SMP
help
Book scheduler support improves the CPU scheduler's decision making
when dealing with machines that have several books.
config MATHEMU
bool "IEEE FPU emulation"
depends on MARCH_G5
......
......@@ -97,7 +97,6 @@ static inline void restore_access_regs(unsigned int *acrs)
extern void account_vtime(struct task_struct *, struct task_struct *);
extern void account_tick_vtime(struct task_struct *);
extern void account_system_vtime(struct task_struct *);
#ifdef CONFIG_PFAULT
extern void pfault_irq_init(void);
......
......@@ -3,15 +3,32 @@
#include <linux/cpumask.h>
#define mc_capable() (1)
const struct cpumask *cpu_coregroup_mask(unsigned int cpu);
extern unsigned char cpu_core_id[NR_CPUS];
extern cpumask_t cpu_core_map[NR_CPUS];
static inline const struct cpumask *cpu_coregroup_mask(unsigned int cpu)
{
return &cpu_core_map[cpu];
}
#define topology_core_id(cpu) (cpu_core_id[cpu])
#define topology_core_cpumask(cpu) (&cpu_core_map[cpu])
#define mc_capable() (1)
#ifdef CONFIG_SCHED_BOOK
extern unsigned char cpu_book_id[NR_CPUS];
extern cpumask_t cpu_book_map[NR_CPUS];
static inline const struct cpumask *cpu_book_mask(unsigned int cpu)
{
return &cpu_book_map[cpu];
}
#define topology_book_id(cpu) (cpu_book_id[cpu])
#define topology_book_cpumask(cpu) (&cpu_book_map[cpu])
#endif /* CONFIG_SCHED_BOOK */
int topology_set_cpu_management(int fc);
void topology_schedule_update(void);
......@@ -30,6 +47,8 @@ static inline void s390_init_cpu_topology(void)
};
#endif
#define SD_BOOK_INIT SD_CPU_INIT
#include <asm-generic/topology.h>
#endif /* _ASM_S390_TOPOLOGY_H */
......@@ -57,8 +57,8 @@ struct tl_info {
union tl_entry tle[0];
};
struct core_info {
struct core_info *next;
struct mask_info {
struct mask_info *next;
unsigned char id;
cpumask_t mask;
};
......@@ -66,7 +66,6 @@ struct core_info {
static int topology_enabled;
static void topology_work_fn(struct work_struct *work);
static struct tl_info *tl_info;
static struct core_info core_info;
static int machine_has_topology;
static struct timer_list topology_timer;
static void set_topology_timer(void);
......@@ -74,38 +73,37 @@ static DECLARE_WORK(topology_work, topology_work_fn);
/* topology_lock protects the core linked list */
static DEFINE_SPINLOCK(topology_lock);
static struct mask_info core_info;
cpumask_t cpu_core_map[NR_CPUS];
unsigned char cpu_core_id[NR_CPUS];
static cpumask_t cpu_coregroup_map(unsigned int cpu)
#ifdef CONFIG_SCHED_BOOK
static struct mask_info book_info;
cpumask_t cpu_book_map[NR_CPUS];
unsigned char cpu_book_id[NR_CPUS];
#endif
static cpumask_t cpu_group_map(struct mask_info *info, unsigned int cpu)
{
struct core_info *core = &core_info;
unsigned long flags;
cpumask_t mask;
cpus_clear(mask);
if (!topology_enabled || !machine_has_topology)
return cpu_possible_map;
spin_lock_irqsave(&topology_lock, flags);
while (core) {
if (cpu_isset(cpu, core->mask)) {
mask = core->mask;
while (info) {
if (cpu_isset(cpu, info->mask)) {
mask = info->mask;
break;
}
core = core->next;
info = info->next;
}
spin_unlock_irqrestore(&topology_lock, flags);
if (cpus_empty(mask))
mask = cpumask_of_cpu(cpu);
return mask;
}
const struct cpumask *cpu_coregroup_mask(unsigned int cpu)
{
return &cpu_core_map[cpu];
}
static void add_cpus_to_core(struct tl_cpu *tl_cpu, struct core_info *core)
static void add_cpus_to_mask(struct tl_cpu *tl_cpu, struct mask_info *book,
struct mask_info *core)
{
unsigned int cpu;
......@@ -117,23 +115,35 @@ static void add_cpus_to_core(struct tl_cpu *tl_cpu, struct core_info *core)
rcpu = CPU_BITS - 1 - cpu + tl_cpu->origin;
for_each_present_cpu(lcpu) {
if (cpu_logical_map(lcpu) == rcpu) {
cpu_set(lcpu, core->mask);
cpu_core_id[lcpu] = core->id;
smp_cpu_polarization[lcpu] = tl_cpu->pp;
}
if (cpu_logical_map(lcpu) != rcpu)
continue;
#ifdef CONFIG_SCHED_BOOK
cpu_set(lcpu, book->mask);
cpu_book_id[lcpu] = book->id;
#endif
cpu_set(lcpu, core->mask);
cpu_core_id[lcpu] = core->id;
smp_cpu_polarization[lcpu] = tl_cpu->pp;
}
}
}
static void clear_cores(void)
static void clear_masks(void)
{
struct core_info *core = &core_info;
struct mask_info *info;
while (core) {
cpus_clear(core->mask);
core = core->next;
info = &core_info;
while (info) {
cpus_clear(info->mask);
info = info->next;
}
#ifdef CONFIG_SCHED_BOOK
info = &book_info;
while (info) {
cpus_clear(info->mask);
info = info->next;
}
#endif
}
static union tl_entry *next_tle(union tl_entry *tle)
......@@ -146,29 +156,36 @@ static union tl_entry *next_tle(union tl_entry *tle)
static void tl_to_cores(struct tl_info *info)
{
#ifdef CONFIG_SCHED_BOOK
struct mask_info *book = &book_info;
#else
struct mask_info *book = NULL;
#endif
struct mask_info *core = &core_info;
union tl_entry *tle, *end;
struct core_info *core = &core_info;
spin_lock_irq(&topology_lock);
clear_cores();
clear_masks();
tle = info->tle;
end = (union tl_entry *)((unsigned long)info + info->length);
while (tle < end) {
switch (tle->nl) {
case 5:
case 4:
case 3:
#ifdef CONFIG_SCHED_BOOK
case 2:
book = book->next;
book->id = tle->container.id;
break;
#endif
case 1:
core = core->next;
core->id = tle->container.id;
break;
case 0:
add_cpus_to_core(&tle->cpu, core);
add_cpus_to_mask(&tle->cpu, book, core);
break;
default:
clear_cores();
clear_masks();
machine_has_topology = 0;
goto out;
}
......@@ -221,10 +238,29 @@ int topology_set_cpu_management(int fc)
static void update_cpu_core_map(void)
{
unsigned long flags;
int cpu;
for_each_possible_cpu(cpu)
cpu_core_map[cpu] = cpu_coregroup_map(cpu);
spin_lock_irqsave(&topology_lock, flags);
for_each_possible_cpu(cpu) {
cpu_core_map[cpu] = cpu_group_map(&core_info, cpu);
#ifdef CONFIG_SCHED_BOOK
cpu_book_map[cpu] = cpu_group_map(&book_info, cpu);
#endif
}
spin_unlock_irqrestore(&topology_lock, flags);
}
static void store_topology(struct tl_info *info)
{
#ifdef CONFIG_SCHED_BOOK
int rc;
rc = stsi(info, 15, 1, 3);
if (rc != -ENOSYS)
return;
#endif
stsi(info, 15, 1, 2);
}
int arch_update_cpu_topology(void)
......@@ -238,7 +274,7 @@ int arch_update_cpu_topology(void)
topology_update_polarization_simple();
return 0;
}
stsi(info, 15, 1, 2);
store_topology(info);
tl_to_cores(info);
update_cpu_core_map();
for_each_online_cpu(cpu) {
......@@ -299,12 +335,24 @@ static int __init init_topology_update(void)
}
__initcall(init_topology_update);
static void alloc_masks(struct tl_info *info, struct mask_info *mask, int offset)
{
int i, nr_masks;
nr_masks = info->mag[NR_MAG - offset];
for (i = 0; i < info->mnest - offset; i++)
nr_masks *= info->mag[NR_MAG - offset - 1 - i];
nr_masks = max(nr_masks, 1);
for (i = 0; i < nr_masks; i++) {
mask->next = alloc_bootmem(sizeof(struct mask_info));
mask = mask->next;
}
}
void __init s390_init_cpu_topology(void)
{
unsigned long long facility_bits;
struct tl_info *info;
struct core_info *core;
int nr_cores;
int i;
if (stfle(&facility_bits, 1) <= 0)
......@@ -315,25 +363,13 @@ void __init s390_init_cpu_topology(void)
tl_info = alloc_bootmem_pages(PAGE_SIZE);
info = tl_info;
stsi(info, 15, 1, 2);
nr_cores = info->mag[NR_MAG - 2];
for (i = 0; i < info->mnest - 2; i++)
nr_cores *= info->mag[NR_MAG - 3 - i];
store_topology(info);
pr_info("The CPU configuration topology of the machine is:");
for (i = 0; i < NR_MAG; i++)
printk(" %d", info->mag[i]);
printk(" / %d\n", info->mnest);
core = &core_info;
for (i = 0; i < nr_cores; i++) {
core->next = alloc_bootmem(sizeof(struct core_info));
core = core->next;
if (!core)
goto error;
}
return;
error:
machine_has_topology = 0;
alloc_masks(info, &core_info, 2);
#ifdef CONFIG_SCHED_BOOK
alloc_masks(info, &book_info, 3);
#endif
}
......@@ -799,6 +799,17 @@ config SCHED_MC
making when dealing with multi-core CPU chips at a cost of slightly
increased overhead in some places. If unsure say N here.
config IRQ_TIME_ACCOUNTING
bool "Fine granularity task level IRQ time accounting"
default n
---help---
Select this option to enable fine granularity task irq time
accounting. This is done by reading a timestamp on each
transitions between softirq and hardirq state, so there can be a
small performance impact.
If in doubt, say N here.
source "kernel/Kconfig.preempt"
config X86_UP_APIC
......
......@@ -104,10 +104,14 @@ int __init notsc_setup(char *str)
__setup("notsc", notsc_setup);
static int no_sched_irq_time;
static int __init tsc_setup(char *str)
{
if (!strcmp(str, "reliable"))
tsc_clocksource_reliable = 1;
if (!strncmp(str, "noirqtime", 9))
no_sched_irq_time = 1;
return 1;
}
......@@ -801,6 +805,7 @@ void mark_tsc_unstable(char *reason)
if (!tsc_unstable) {
tsc_unstable = 1;
sched_clock_stable = 0;
disable_sched_clock_irqtime();
printk(KERN_INFO "Marking TSC unstable due to %s\n", reason);
/* Change only the rating, when not registered */
if (clocksource_tsc.mult)
......@@ -987,6 +992,9 @@ void __init tsc_init(void)
/* now allow native_sched_clock() to use rdtsc */
tsc_disabled = 0;
if (!no_sched_irq_time)
enable_sched_clock_irqtime();
lpj = ((u64)tsc_khz * 1000);
do_div(lpj, HZ);
lpj_fine = lpj;
......
......@@ -45,7 +45,8 @@ static ssize_t show_##name(struct sys_device *dev, \
return sprintf(buf, "%d\n", topology_##name(cpu)); \
}
#if defined(topology_thread_cpumask) || defined(topology_core_cpumask)
#if defined(topology_thread_cpumask) || defined(topology_core_cpumask) || \
defined(topology_book_cpumask)
static ssize_t show_cpumap(int type, const struct cpumask *mask, char *buf)
{
ptrdiff_t len = PTR_ALIGN(buf + PAGE_SIZE - 1, PAGE_SIZE) - buf;
......@@ -114,6 +115,14 @@ define_siblings_show_func(core_cpumask);
define_one_ro_named(core_siblings, show_core_cpumask);
define_one_ro_named(core_siblings_list, show_core_cpumask_list);
#ifdef CONFIG_SCHED_BOOK
define_id_show_func(book_id);
define_one_ro(book_id);
define_siblings_show_func(book_cpumask);
define_one_ro_named(book_siblings, show_book_cpumask);
define_one_ro_named(book_siblings_list, show_book_cpumask_list);
#endif
static struct attribute *default_attrs[] = {
&attr_physical_package_id.attr,
&attr_core_id.attr,
......@@ -121,6 +130,11 @@ static struct attribute *default_attrs[] = {
&attr_thread_siblings_list.attr,
&attr_core_siblings.attr,
&attr_core_siblings_list.attr,
#ifdef CONFIG_SCHED_BOOK
&attr_book_id.attr,
&attr_book_siblings.attr,
&attr_book_siblings_list.attr,
#endif
NULL
};
......
......@@ -64,6 +64,8 @@
#define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT)
#define NMI_OFFSET (1UL << NMI_SHIFT)
#define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET)
#ifndef PREEMPT_ACTIVE
#define PREEMPT_ACTIVE_BITS 1
#define PREEMPT_ACTIVE_SHIFT (NMI_SHIFT + NMI_BITS)
......@@ -82,10 +84,13 @@
/*
* Are we doing bottom half or hardware interrupt processing?
* Are we in a softirq context? Interrupt context?
* in_softirq - Are we currently processing softirq or have bh disabled?
* in_serving_softirq - Are we currently processing softirq?
*/
#define in_irq() (hardirq_count())
#define in_softirq() (softirq_count())
#define in_interrupt() (irq_count())
#define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET)
/*
* Are we in NMI context?
......@@ -132,10 +137,12 @@ extern void synchronize_irq(unsigned int irq);
struct task_struct;
#ifndef CONFIG_VIRT_CPU_ACCOUNTING
#if !defined(CONFIG_VIRT_CPU_ACCOUNTING) && !defined(CONFIG_IRQ_TIME_ACCOUNTING)
static inline void account_system_vtime(struct task_struct *tsk)
{
}
#else
extern void account_system_vtime(struct task_struct *tsk);
#endif
#if defined(CONFIG_NO_HZ)
......
......@@ -875,6 +875,7 @@ enum sched_domain_level {
SD_LV_NONE = 0,
SD_LV_SIBLING,
SD_LV_MC,
SD_LV_BOOK,
SD_LV_CPU,
SD_LV_NODE,
SD_LV_ALLNODES,
......@@ -1690,8 +1691,7 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *
/*
* Per process flags
*/
#define PF_ALIGNWARN 0x00000001 /* Print alignment warning msgs */
/* Not implemented yet, only for 486*/
#define PF_KSOFTIRQD 0x00000001 /* I am ksoftirqd */
#define PF_STARTING 0x00000002 /* being created */
#define PF_EXITING 0x00000004 /* getting shut down */
#define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */
......@@ -1837,6 +1837,19 @@ extern void sched_clock_idle_sleep_event(void);
extern void sched_clock_idle_wakeup_event(u64 delta_ns);
#endif
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
/*
* An i/f to runtime opt-in for irq time accounting based off of sched_clock.
* The reason for this explicit opt-in is not to have perf penalty with
* slow sched_clocks.
*/
extern void enable_sched_clock_irqtime(void);
extern void disable_sched_clock_irqtime(void);
#else
static inline void enable_sched_clock_irqtime(void) {}
static inline void disable_sched_clock_irqtime(void) {}
#endif
extern unsigned long long
task_sched_runtime(struct task_struct *task);
extern unsigned long long thread_group_sched_runtime(struct task_struct *task);
......@@ -2378,9 +2391,9 @@ extern int __cond_resched_lock(spinlock_t *lock);
extern int __cond_resched_softirq(void);
#define cond_resched_softirq() ({ \
__might_sleep(__FILE__, __LINE__, SOFTIRQ_OFFSET); \
__cond_resched_softirq(); \
#define cond_resched_softirq() ({ \
__might_sleep(__FILE__, __LINE__, SOFTIRQ_DISABLE_OFFSET); \
__cond_resched_softirq(); \
})
/*
......
......@@ -201,6 +201,12 @@ int arch_update_cpu_topology(void);
.balance_interval = 64, \
}
#ifdef CONFIG_SCHED_BOOK
#ifndef SD_BOOK_INIT
#error Please define an appropriate SD_BOOK_INIT in include/asm/topology.h!!!
#endif
#endif /* CONFIG_SCHED_BOOK */
#ifdef CONFIG_NUMA
#ifndef SD_NODE_INIT
#error Please define an appropriate SD_NODE_INIT in include/asm/topology.h!!!
......
......@@ -362,6 +362,35 @@ TRACE_EVENT(sched_stat_runtime,
(unsigned long long)__entry->vruntime)
);
/*
* Tracepoint for showing priority inheritance modifying a tasks
* priority.
*/
TRACE_EVENT(sched_pi_setprio,
TP_PROTO(struct task_struct *tsk, int newprio),
TP_ARGS(tsk, newprio),
TP_STRUCT__entry(
__array( char, comm, TASK_COMM_LEN )
__field( pid_t, pid )
__field( int, oldprio )
__field( int, newprio )
),
TP_fast_assign(
memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
__entry->pid = tsk->pid;
__entry->oldprio = tsk->prio;
__entry->newprio = newprio;
),
TP_printk("comm=%s pid=%d oldprio=%d newprio=%d",
__entry->comm, __entry->pid,
__entry->oldprio, __entry->newprio)
);
#endif /* _TRACE_SCHED_H */
/* This part must be outside protection */
......
This diff is collapsed.
......@@ -25,7 +25,7 @@
/*
* Targeted preemption latency for CPU-bound tasks:
* (default: 5ms * (1 + ilog(ncpus)), units: nanoseconds)
* (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
*
* NOTE: this latency value is not the same as the concept of
* 'timeslice length' - timeslices in CFS are of variable length
......@@ -52,7 +52,7 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling
/*
* Minimal preemption granularity for CPU-bound tasks:
* (default: 2 msec * (1 + ilog(ncpus)), units: nanoseconds)
* (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
*/
unsigned int sysctl_sched_min_granularity = 750000ULL;
unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
......@@ -519,7 +519,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
static void update_curr(struct cfs_rq *cfs_rq)
{
struct sched_entity *curr = cfs_rq->curr;
u64 now = rq_of(cfs_rq)->clock;
u64 now = rq_of(cfs_rq)->clock_task;
unsigned long delta_exec;
if (unlikely(!curr))
......@@ -602,7 +602,7 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
/*
* We are starting a new run period:
*/
se->exec_start = rq_of(cfs_rq)->clock;
se->exec_start = rq_of(cfs_rq)->clock_task;
}
/**************************************************
......@@ -1764,6 +1764,10 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
set_task_cpu(p, this_cpu);
activate_task(this_rq, p, 0);
check_preempt_curr(this_rq, p, 0);
/* re-arm NEWIDLE balancing when moving tasks */
src_rq->avg_idle = this_rq->avg_idle = 2*sysctl_sched_migration_cost;
this_rq->idle_stamp = 0;
}
/*
......@@ -1798,7 +1802,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
* 2) too many balance attempts have failed.
*/
tsk_cache_hot = task_hot(p, rq->clock, sd);
tsk_cache_hot = task_hot(p, rq->clock_task, sd);
if (!tsk_cache_hot ||
sd->nr_balance_failed > sd->cache_nice_tries) {
#ifdef CONFIG_SCHEDSTATS
......@@ -2030,12 +2034,14 @@ struct sd_lb_stats {
unsigned long this_load;
unsigned long this_load_per_task;
unsigned long this_nr_running;
unsigned long this_has_capacity;
/* Statistics of the busiest group */
unsigned long max_load;
unsigned long busiest_load_per_task;
unsigned long busiest_nr_running;
unsigned long busiest_group_capacity;
unsigned long busiest_has_capacity;
int group_imb; /* Is there imbalance in this sd */
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
......@@ -2058,6 +2064,7 @@ struct sg_lb_stats {
unsigned long sum_weighted_load; /* Weighted load of group's tasks */
unsigned long group_capacity;
int group_imb; /* Is there an imbalance in the group ? */
int group_has_capacity; /* Is there extra capacity in the group? */
};
/**
......@@ -2268,7 +2275,13 @@ unsigned long scale_rt_power(int cpu)
u64 total, available;
total = sched_avg_period() + (rq->clock - rq->age_stamp);
available = total - rq->rt_avg;
if (unlikely(total < rq->rt_avg)) {
/* Ensures that power won't end up being negative */
available = 0;
} else {
available = total - rq->rt_avg;
}
if (unlikely((s64)total < SCHED_LOAD_SCALE))
total = SCHED_LOAD_SCALE;
......@@ -2378,7 +2391,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
int local_group, const struct cpumask *cpus,
int *balance, struct sg_lb_stats *sgs)
{
unsigned long load, max_cpu_load, min_cpu_load;
unsigned long load, max_cpu_load, min_cpu_load, max_nr_running;
int i;
unsigned int balance_cpu = -1, first_idle_cpu = 0;
unsigned long avg_load_per_task = 0;
......@@ -2389,6 +2402,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
/* Tally up the load of all CPUs in the group */
max_cpu_load = 0;
min_cpu_load = ~0UL;
max_nr_running = 0;
for_each_cpu_and(i, sched_group_cpus(group), cpus) {
struct rq *rq = cpu_rq(i);
......@@ -2406,8 +2420,10 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
load = target_load(i, load_idx);
} else {
load = source_load(i, load_idx);
if (load > max_cpu_load)
if (load > max_cpu_load) {
max_cpu_load = load;
max_nr_running = rq->nr_running;
}
if (min_cpu_load > load)
min_cpu_load = load;
}
......@@ -2447,13 +2463,15 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
if (sgs->sum_nr_running)
avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task && max_nr_running > 1)
sgs->group_imb = 1;
sgs->group_capacity =
DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
if (!sgs->group_capacity)
sgs->group_capacity = fix_small_capacity(sd, group);
if (sgs->group_capacity > sgs->sum_nr_running)
sgs->group_has_capacity = 1;
}
/**
......@@ -2542,9 +2560,14 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
/*
* In case the child domain prefers tasks go to siblings
* first, lower the sg capacity to one so that we'll try
* and move all the excess tasks away.
* and move all the excess tasks away. We lower the capacity
* of a group only if the local group has the capacity to fit
* these excess tasks, i.e. nr_running < group_capacity. The
* extra check prevents the case where you always pull from the
* heaviest group when it is already under-utilized (possible
* with a large weight task outweighs the tasks on the system).
*/
if (prefer_sibling)
if (prefer_sibling && !local_group && sds->this_has_capacity)
sgs.group_capacity = min(sgs.group_capacity, 1UL);
if (local_group) {
......@@ -2552,12 +2575,14 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
sds->this = sg;
sds->this_nr_running = sgs.sum_nr_running;
sds->this_load_per_task = sgs.sum_weighted_load;
sds->this_has_capacity = sgs.group_has_capacity;
} else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) {
sds->max_load = sgs.avg_load;
sds->busiest = sg;
sds->busiest_nr_running = sgs.sum_nr_running;
sds->busiest_group_capacity = sgs.group_capacity;
sds->busiest_load_per_task = sgs.sum_weighted_load;
sds->busiest_has_capacity = sgs.group_has_capacity;
sds->group_imb = sgs.group_imb;
}
......@@ -2754,6 +2779,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
return fix_small_imbalance(sds, this_cpu, imbalance);
}
/******* find_busiest_group() helpers end here *********************/
/**
......@@ -2805,6 +2831,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
* 4) This group is more busy than the avg busieness at this
* sched_domain.
* 5) The imbalance is within the specified limit.
*
* Note: when doing newidle balance, if the local group has excess
* capacity (i.e. nr_running < group_capacity) and the busiest group
* does not have any capacity, we force a load balance to pull tasks
* to the local group. In this case, we skip past checks 3, 4 and 5.
*/
if (!(*balance))
goto ret;
......@@ -2816,6 +2847,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
if (!sds.busiest || sds.busiest_nr_running == 0)
goto out_balanced;
/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
!sds.busiest_has_capacity)
goto force_balance;
if (sds.this_load >= sds.max_load)
goto out_balanced;
......@@ -2827,6 +2863,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
goto out_balanced;
force_balance:
/* Looks like there is an imbalance. Compute it */
calculate_imbalance(&sds, this_cpu, imbalance);
return sds.busiest;
......@@ -3031,7 +3068,14 @@ static int load_balance(int this_cpu, struct rq *this_rq,
if (!ld_moved) {
schedstat_inc(sd, lb_failed[idle]);
sd->nr_balance_failed++;
/*
* Increment the failure counter only on periodic balance.
* We do not want newidle balance, which can be very
* frequent, pollute the failure counter causing
* excessive cache_hot migrations and active balances.
*/
if (idle != CPU_NEWLY_IDLE)
sd->nr_balance_failed++;
if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest),
this_cpu)) {
......@@ -3153,10 +3197,8 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
interval = msecs_to_jiffies(sd->balance_interval);
if (time_after(next_balance, sd->last_balance + interval))
next_balance = sd->last_balance + interval;
if (pulled_task) {
this_rq->idle_stamp = 0;
if (pulled_task)
break;
}
}
raw_spin_lock(&this_rq->lock);
......
......@@ -61,3 +61,8 @@ SCHED_FEAT(ASYM_EFF_LOAD, 1)
* release the lock. Decreases scheduling overhead.
*/
SCHED_FEAT(OWNER_SPIN, 1)
/*
* Decrement CPU power based on irq activity
*/
SCHED_FEAT(NONIRQ_POWER, 1)
......@@ -609,7 +609,7 @@ static void update_curr_rt(struct rq *rq)
if (!task_has_rt_policy(curr))
return;
delta_exec = rq->clock - curr->se.exec_start;
delta_exec = rq->clock_task - curr->se.exec_start;
if (unlikely((s64)delta_exec < 0))
delta_exec = 0;
......@@ -618,7 +618,7 @@ static void update_curr_rt(struct rq *rq)
curr->se.sum_exec_runtime += delta_exec;
account_group_exec_runtime(curr, delta_exec);
curr->se.exec_start = rq->clock;
curr->se.exec_start = rq->clock_task;
cpuacct_charge(curr, delta_exec);
sched_rt_avg_update(rq, delta_exec);
......@@ -960,18 +960,19 @@ select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags)
* runqueue. Otherwise simply start this RT task
* on its current runqueue.
*
* We want to avoid overloading runqueues. Even if
* the RT task is of higher priority than the current RT task.
* RT tasks behave differently than other tasks. If
* one gets preempted, we try to push it off to another queue.
* So trying to keep a preempting RT task on the same
* cache hot CPU will force the running RT task to
* a cold CPU. So we waste all the cache for the lower
* RT task in hopes of saving some of a RT task
* that is just being woken and probably will have
* cold cache anyway.
* We want to avoid overloading runqueues. If the woken
* task is a higher priority, then it will stay on this CPU
* and the lower prio task should be moved to another CPU.
* Even though this will probably make the lower prio task
* lose its cache, we do not want to bounce a higher task
* around just because it gave up its CPU, perhaps for a
* lock?
*
* For equal prio tasks, we just let the scheduler sort it out.
*/
if (unlikely(rt_task(rq->curr)) &&
(rq->curr->rt.nr_cpus_allowed < 2 ||
rq->curr->prio < p->prio) &&
(p->rt.nr_cpus_allowed > 1)) {
int cpu = find_lowest_rq(p);
......@@ -1074,7 +1075,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
} while (rt_rq);
p = rt_task_of(rt_se);
p->se.exec_start = rq->clock;
p->se.exec_start = rq->clock_task;
return p;
}
......@@ -1139,7 +1140,7 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
for_each_leaf_rt_rq(rt_rq, rq) {
array = &rt_rq->active;
idx = sched_find_first_bit(array->bitmap);
next_idx:
next_idx:
if (idx >= MAX_RT_PRIO)
continue;
if (next && next->prio < idx)
......@@ -1315,7 +1316,7 @@ static int push_rt_task(struct rq *rq)
if (!next_task)
return 0;
retry:
retry:
if (unlikely(next_task == rq->curr)) {
WARN_ON(1);
return 0;
......@@ -1463,7 +1464,7 @@ static int pull_rt_task(struct rq *this_rq)
* but possible)
*/
}
skip:
skip:
double_unlock_balance(this_rq, src_rq);
}
......@@ -1491,7 +1492,10 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
if (!task_running(rq, p) &&
!test_tsk_need_resched(rq->curr) &&
has_pushable_tasks(rq) &&
p->rt.nr_cpus_allowed > 1)
p->rt.nr_cpus_allowed > 1 &&
rt_task(rq->curr) &&
(rq->curr->rt.nr_cpus_allowed < 2 ||
rq->curr->prio < p->prio))
push_rt_tasks(rq);
}
......@@ -1709,7 +1713,7 @@ static void set_curr_task_rt(struct rq *rq)
{
struct task_struct *p = rq->curr;
p->se.exec_start = rq->clock;
p->se.exec_start = rq->clock_task;
/* The running task is never eligible for pushing */
dequeue_pushable_task(rq, p);
......
/*
* stop-task scheduling class.
*
* The stop task is the highest priority task in the system, it preempts
* everything and will be preempted by nothing.
*
* See kernel/stop_machine.c
*/
#ifdef CONFIG_SMP
static int
select_task_rq_stop(struct rq *rq, struct task_struct *p,
int sd_flag, int flags)
{
return task_cpu(p); /* stop tasks as never migrate */
}
#endif /* CONFIG_SMP */
static void
check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
{
resched_task(rq->curr); /* we preempt everything */
}
static struct task_struct *pick_next_task_stop(struct rq *rq)
{
struct task_struct *stop = rq->stop;
if (stop && stop->state == TASK_RUNNING)
return stop;
return NULL;
}
static void
enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
{
}
static void
dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags)
{
}
static void yield_task_stop(struct rq *rq)
{
BUG(); /* the stop task should never yield, its pointless. */
}
static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
{
}
static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)
{
}
static void set_curr_task_stop(struct rq *rq)
{
}
static void switched_to_stop(struct rq *rq, struct task_struct *p,
int running)
{
BUG(); /* its impossible to change to this class */
}
static void prio_changed_stop(struct rq *rq, struct task_struct *p,
int oldprio, int running)
{
BUG(); /* how!?, what priority? */
}
static unsigned int
get_rr_interval_stop(struct rq *rq, struct task_struct *task)
{
return 0;
}
/*
* Simple, special scheduling class for the per-CPU stop tasks:
*/
static const struct sched_class stop_sched_class = {
.next = &rt_sched_class,
.enqueue_task = enqueue_task_stop,
.dequeue_task = dequeue_task_stop,
.yield_task = yield_task_stop,
.check_preempt_curr = check_preempt_curr_stop,
.pick_next_task = pick_next_task_stop,
.put_prev_task = put_prev_task_stop,
#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_stop,
#endif
.set_curr_task = set_curr_task_stop,
.task_tick = task_tick_stop,
.get_rr_interval = get_rr_interval_stop,
.prio_changed = prio_changed_stop,
.switched_to = switched_to_stop,
/* no .task_new for stop tasks */
};
......@@ -76,12 +76,22 @@ void wakeup_softirqd(void)
wake_up_process(tsk);
}
/*
* preempt_count and SOFTIRQ_OFFSET usage:
* - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving
* softirq processing.
* - preempt_count is changed by SOFTIRQ_DISABLE_OFFSET (= 2 * SOFTIRQ_OFFSET)
* on local_bh_disable or local_bh_enable.
* This lets us distinguish between whether we are currently processing
* softirq and whether we just have bh disabled.
*/
/*
* This one is for softirq.c-internal use,
* where hardirqs are disabled legitimately:
*/
#ifdef CONFIG_TRACE_IRQFLAGS
static void __local_bh_disable(unsigned long ip)
static void __local_bh_disable(unsigned long ip, unsigned int cnt)
{
unsigned long flags;
......@@ -95,32 +105,43 @@ static void __local_bh_disable(unsigned long ip)
* We must manually increment preempt_count here and manually
* call the trace_preempt_off later.
*/
preempt_count() += SOFTIRQ_OFFSET;
preempt_count() += cnt;
/*
* Were softirqs turned off above:
*/
if (softirq_count() == SOFTIRQ_OFFSET)
if (softirq_count() == cnt)
trace_softirqs_off(ip);
raw_local_irq_restore(flags);
if (preempt_count() == SOFTIRQ_OFFSET)
if (preempt_count() == cnt)
trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
}
#else /* !CONFIG_TRACE_IRQFLAGS */
static inline void __local_bh_disable(unsigned long ip)
static inline void __local_bh_disable(unsigned long ip, unsigned int cnt)
{
add_preempt_count(SOFTIRQ_OFFSET);
add_preempt_count(cnt);
barrier();
}
#endif /* CONFIG_TRACE_IRQFLAGS */
void local_bh_disable(void)
{
__local_bh_disable((unsigned long)__builtin_return_address(0));
__local_bh_disable((unsigned long)__builtin_return_address(0),
SOFTIRQ_DISABLE_OFFSET);
}
EXPORT_SYMBOL(local_bh_disable);
static void __local_bh_enable(unsigned int cnt)
{
WARN_ON_ONCE(in_irq());
WARN_ON_ONCE(!irqs_disabled());
if (softirq_count() == cnt)
trace_softirqs_on((unsigned long)__builtin_return_address(0));
sub_preempt_count(cnt);
}
/*
* Special-case - softirqs can safely be enabled in
* cond_resched_softirq(), or by __do_softirq(),
......@@ -128,12 +149,7 @@ EXPORT_SYMBOL(local_bh_disable);
*/
void _local_bh_enable(void)
{
WARN_ON_ONCE(in_irq());
WARN_ON_ONCE(!irqs_disabled());
if (softirq_count() == SOFTIRQ_OFFSET)
trace_softirqs_on((unsigned long)__builtin_return_address(0));
sub_preempt_count(SOFTIRQ_OFFSET);
__local_bh_enable(SOFTIRQ_DISABLE_OFFSET);
}
EXPORT_SYMBOL(_local_bh_enable);
......@@ -147,13 +163,13 @@ static inline void _local_bh_enable_ip(unsigned long ip)
/*
* Are softirqs going to be turned on now:
*/
if (softirq_count() == SOFTIRQ_OFFSET)
if (softirq_count() == SOFTIRQ_DISABLE_OFFSET)
trace_softirqs_on(ip);
/*
* Keep preemption disabled until we are done with
* softirq processing:
*/
sub_preempt_count(SOFTIRQ_OFFSET - 1);
sub_preempt_count(SOFTIRQ_DISABLE_OFFSET - 1);
if (unlikely(!in_interrupt() && local_softirq_pending()))
do_softirq();
......@@ -198,7 +214,8 @@ asmlinkage void __do_softirq(void)
pending = local_softirq_pending();
account_system_vtime(current);
__local_bh_disable((unsigned long)__builtin_return_address(0));
__local_bh_disable((unsigned long)__builtin_return_address(0),
SOFTIRQ_OFFSET);
lockdep_softirq_enter();
cpu = smp_processor_id();
......@@ -245,7 +262,7 @@ asmlinkage void __do_softirq(void)
lockdep_softirq_exit();
account_system_vtime(current);
_local_bh_enable();
__local_bh_enable(SOFTIRQ_OFFSET);
}
#ifndef __ARCH_HAS_DO_SOFTIRQ
......@@ -279,10 +296,16 @@ void irq_enter(void)
rcu_irq_enter();
if (idle_cpu(cpu) && !in_interrupt()) {
__irq_enter();
/*
* Prevent raise_softirq from needlessly waking up ksoftirqd
* here, as softirq will be serviced on return from interrupt.
*/
local_bh_disable();
tick_check_idle(cpu);
} else
__irq_enter();
_local_bh_enable();
}
__irq_enter();
}
#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
......@@ -696,6 +719,7 @@ static int run_ksoftirqd(void * __bind_cpu)
{
set_current_state(TASK_INTERRUPTIBLE);
current->flags |= PF_KSOFTIRQD;
while (!kthread_should_stop()) {
preempt_disable();
if (!local_softirq_pending()) {
......
......@@ -287,11 +287,12 @@ static int cpu_stopper_thread(void *data)
goto repeat;
}
extern void sched_set_stop_task(int cpu, struct task_struct *stop);
/* manage stopper for a cpu, mostly lifted from sched migration thread mgmt */
static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
unsigned long action, void *hcpu)
{
struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
unsigned int cpu = (unsigned long)hcpu;
struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
struct task_struct *p;
......@@ -304,13 +305,13 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
cpu);
if (IS_ERR(p))
return NOTIFY_BAD;
sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
get_task_struct(p);
kthread_bind(p, cpu);
sched_set_stop_task(cpu, p);
stopper->thread = p;
break;
case CPU_ONLINE:
kthread_bind(stopper->thread, cpu);
/* strictly unnecessary, as first user will wake it */
wake_up_process(stopper->thread);
/* mark enabled */
......@@ -325,6 +326,7 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
{
struct cpu_stop_work *work;
sched_set_stop_task(cpu, NULL);
/* kill the stopper */
kthread_stop(stopper->thread);
/* drain remaining works */
......
......@@ -123,7 +123,7 @@ static int cls_cgroup_classify(struct sk_buff *skb, struct tcf_proto *tp,
* calls by looking at the number of nested bh disable calls because
* softirqs always disables bh.
*/
if (softirq_count() != SOFTIRQ_OFFSET) {
if (in_serving_softirq()) {
/* If there is an sk_classid we'll use that. */
if (!skb->sk)
return -1;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment