Commit f213a6c8 authored by Linus Torvalds

Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Ingo Molnar:
 "The main changes in this cycle were:

   - fix affine wakeups (Peter Zijlstra)

   - improve CPU onlining (and general bootup) scalability on systems
     with a ridiculous number (thousands) of CPUs (Peter Zijlstra)

   - sched/numa updates (Rik van Riel)

   - sched/deadline updates (Byungchul Park)

   - sched/cpufreq enhancements and related cleanups (Viresh Kumar)

   - sched/debug enhancements (Xie XiuQi)

   - various fixes"

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (27 commits)
  sched/debug: Optimize sched_domain sysctl generation
  sched/topology: Avoid pointless rebuild
  sched/topology, cpuset: Avoid spurious/wrong domain rebuilds
  sched/topology: Improve comments
  sched/topology: Fix memory leak in __sdt_alloc()
  sched/completion: Document that reinit_completion() must be called after complete_all()
  sched/autogroup: Fix error reporting printk text in autogroup_create()
  sched/fair: Fix wake_affine() for !NUMA_BALANCING
  sched/debug: Introduce task_state_to_char() helper function
  sched/debug: Show task state in /proc/sched_debug
  sched/debug: Use task_pid_nr_ns in /proc/$pid/sched
  sched/core: Remove unnecessary initialization init_idle_bootup_task()
  sched/deadline: Change return value of cpudl_find()
  sched/deadline: Make find_later_rq() choose a closer CPU in topology
  sched/numa: Scale scan period with tasks in group and shared/private
  sched/numa: Slow down scan rate if shared faults dominate
  sched/pelt: Fix false running accounting
  sched: Mark pick_next_task_dl() and build_sched_domain() as static
  sched/cpupri: Don't re-initialize 'struct cpupri'
  sched/deadline: Don't re-initialize 'struct cpudl'
  ...
parents 621bee34 bbdacdfe
...@@ -75,12 +75,6 @@ static inline const struct cpumask *cpumask_of_node(int node) ...@@ -75,12 +75,6 @@ static inline const struct cpumask *cpumask_of_node(int node)
extern void setup_node_to_cpumask_map(void); extern void setup_node_to_cpumask_map(void);
/*
* Returns the number of the node containing Node 'node'. This
* architecture is flat, so it is a pretty simple function!
*/
#define parent_node(node) (node)
#define pcibus_to_node(bus) __pcibus_to_node(bus) #define pcibus_to_node(bus) __pcibus_to_node(bus)
extern int __node_distance(int, int); extern int __node_distance(int, int);
......
...@@ -1408,12 +1408,13 @@ static const struct file_operations proc_fail_nth_operations = { ...@@ -1408,12 +1408,13 @@ static const struct file_operations proc_fail_nth_operations = {
static int sched_show(struct seq_file *m, void *v) static int sched_show(struct seq_file *m, void *v)
{ {
struct inode *inode = m->private; struct inode *inode = m->private;
struct pid_namespace *ns = inode->i_sb->s_fs_info;
struct task_struct *p; struct task_struct *p;
p = get_proc_task(inode); p = get_proc_task(inode);
if (!p) if (!p)
return -ESRCH; return -ESRCH;
proc_sched_show_task(p, m); proc_sched_show_task(p, ns, m);
put_task_struct(p); put_task_struct(p);
......
...@@ -1233,6 +1233,19 @@ static inline pid_t task_pgrp_nr(struct task_struct *tsk) ...@@ -1233,6 +1233,19 @@ static inline pid_t task_pgrp_nr(struct task_struct *tsk)
return task_pgrp_nr_ns(tsk, &init_pid_ns); return task_pgrp_nr_ns(tsk, &init_pid_ns);
} }
static inline char task_state_to_char(struct task_struct *task)
{
const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
unsigned long state = task->state;
state = state ? __ffs(state) + 1 : 0;
/* Make sure the string lines up properly with the number of task states: */
BUILD_BUG_ON(sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1);
return state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?';
}
/** /**
* is_global_init - check if a task structure is init. Since init * is_global_init - check if a task structure is init. Since init
* is free to have sub-threads we need to check tgid. * is free to have sub-threads we need to check tgid.
......
...@@ -6,6 +6,7 @@ ...@@ -6,6 +6,7 @@
*/ */
struct task_struct; struct task_struct;
struct pid_namespace;
extern void dump_cpu_task(int cpu); extern void dump_cpu_task(int cpu);
...@@ -34,7 +35,8 @@ extern void sched_show_task(struct task_struct *p); ...@@ -34,7 +35,8 @@ extern void sched_show_task(struct task_struct *p);
#ifdef CONFIG_SCHED_DEBUG #ifdef CONFIG_SCHED_DEBUG
struct seq_file; struct seq_file;
extern void proc_sched_show_task(struct task_struct *p, struct seq_file *m); extern void proc_sched_show_task(struct task_struct *p,
struct pid_namespace *ns, struct seq_file *m);
extern void proc_sched_set_task(struct task_struct *p); extern void proc_sched_set_task(struct task_struct *p);
#endif #endif
......
...@@ -30,7 +30,6 @@ extern int lockdep_tasklist_lock_is_held(void); ...@@ -30,7 +30,6 @@ extern int lockdep_tasklist_lock_is_held(void);
extern asmlinkage void schedule_tail(struct task_struct *prev); extern asmlinkage void schedule_tail(struct task_struct *prev);
extern void init_idle(struct task_struct *idle, int cpu); extern void init_idle(struct task_struct *idle, int cpu);
extern void init_idle_bootup_task(struct task_struct *idle);
extern int sched_fork(unsigned long clone_flags, struct task_struct *p); extern int sched_fork(unsigned long clone_flags, struct task_struct *p);
extern void sched_dead(struct task_struct *p); extern void sched_dead(struct task_struct *p);
......
...@@ -71,6 +71,14 @@ struct sched_domain_shared { ...@@ -71,6 +71,14 @@ struct sched_domain_shared {
atomic_t ref; atomic_t ref;
atomic_t nr_busy_cpus; atomic_t nr_busy_cpus;
int has_idle_cores; int has_idle_cores;
/*
* Some variables from the most recent sd_lb_stats for this domain,
* used by wake_affine().
*/
unsigned long nr_running;
unsigned long load;
unsigned long capacity;
}; };
struct sched_domain { struct sched_domain {
......
...@@ -430,7 +430,6 @@ static noinline void __ref rest_init(void) ...@@ -430,7 +430,6 @@ static noinline void __ref rest_init(void)
* The boot idle thread must execute schedule() * The boot idle thread must execute schedule()
* at least once to get things moving: * at least once to get things moving:
*/ */
init_idle_bootup_task(current);
schedule_preempt_disabled(); schedule_preempt_disabled();
/* Call into cpu_idle with preempt disabled */ /* Call into cpu_idle with preempt disabled */
cpu_startup_entry(CPUHP_ONLINE); cpu_startup_entry(CPUHP_ONLINE);
......
...@@ -2344,13 +2344,7 @@ void cpuset_update_active_cpus(void) ...@@ -2344,13 +2344,7 @@ void cpuset_update_active_cpus(void)
* We're inside cpu hotplug critical region which usually nests * We're inside cpu hotplug critical region which usually nests
* inside cgroup synchronization. Bounce actual hotplug processing * inside cgroup synchronization. Bounce actual hotplug processing
* to a work item to avoid reverse locking order. * to a work item to avoid reverse locking order.
*
* We still need to do partition_sched_domains() synchronously;
* otherwise, the scheduler will get confused and put tasks to the
* dead CPU. Fall back to the default single domain.
* cpuset_hotplug_workfn() will rebuild it as necessary.
*/ */
partition_sched_domains(1, NULL, NULL);
schedule_work(&cpuset_hotplug_work); schedule_work(&cpuset_hotplug_work);
} }
......
...@@ -71,7 +71,6 @@ static inline struct autogroup *autogroup_create(void) ...@@ -71,7 +71,6 @@ static inline struct autogroup *autogroup_create(void)
goto out_fail; goto out_fail;
tg = sched_create_group(&root_task_group); tg = sched_create_group(&root_task_group);
if (IS_ERR(tg)) if (IS_ERR(tg))
goto out_free; goto out_free;
...@@ -101,7 +100,7 @@ static inline struct autogroup *autogroup_create(void) ...@@ -101,7 +100,7 @@ static inline struct autogroup *autogroup_create(void)
out_fail: out_fail:
if (printk_ratelimit()) { if (printk_ratelimit()) {
printk(KERN_WARNING "autogroup_create: %s failure.\n", printk(KERN_WARNING "autogroup_create: %s failure.\n",
ag ? "sched_create_group()" : "kmalloc()"); ag ? "sched_create_group()" : "kzalloc()");
} }
return autogroup_kref_get(&autogroup_default); return autogroup_kref_get(&autogroup_default);
......
...@@ -47,6 +47,13 @@ EXPORT_SYMBOL(complete); ...@@ -47,6 +47,13 @@ EXPORT_SYMBOL(complete);
* *
* It may be assumed that this function implies a write memory barrier before * It may be assumed that this function implies a write memory barrier before
* changing the task state if and only if any tasks are woken up. * changing the task state if and only if any tasks are woken up.
*
* Since complete_all() sets the completion of @x permanently to done
* to allow multiple waiters to finish, a call to reinit_completion()
* must be used on @x if @x is to be used again. The code must make
* sure that all waiters have woken and finished before reinitializing
* @x. Also note that the function completion_done() can not be used
* to know if there are still waiters after complete_all() has been called.
*/ */
void complete_all(struct completion *x) void complete_all(struct completion *x)
{ {
...@@ -297,6 +304,7 @@ EXPORT_SYMBOL(try_wait_for_completion); ...@@ -297,6 +304,7 @@ EXPORT_SYMBOL(try_wait_for_completion);
* Return: 0 if there are waiters (wait_for_completion() in progress) * Return: 0 if there are waiters (wait_for_completion() in progress)
* 1 if there are no waiters. * 1 if there are no waiters.
* *
* Note, this will always return true if complete_all() was called on @x.
*/ */
bool completion_done(struct completion *x) bool completion_done(struct completion *x)
{ {
......
...@@ -5133,24 +5133,17 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, ...@@ -5133,24 +5133,17 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
return retval; return retval;
} }
static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
void sched_show_task(struct task_struct *p) void sched_show_task(struct task_struct *p)
{ {
unsigned long free = 0; unsigned long free = 0;
int ppid; int ppid;
unsigned long state = p->state;
/* Make sure the string lines up properly with the number of task states: */
BUILD_BUG_ON(sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1);
if (!try_get_task_stack(p)) if (!try_get_task_stack(p))
return; return;
if (state)
state = __ffs(state) + 1; printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p));
printk(KERN_INFO "%-15.15s %c", p->comm,
state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); if (p->state == TASK_RUNNING)
if (state == TASK_RUNNING)
printk(KERN_CONT " running task "); printk(KERN_CONT " running task ");
#ifdef CONFIG_DEBUG_STACK_USAGE #ifdef CONFIG_DEBUG_STACK_USAGE
free = stack_not_used(p); free = stack_not_used(p);
...@@ -5207,11 +5200,6 @@ void show_state_filter(unsigned long state_filter) ...@@ -5207,11 +5200,6 @@ void show_state_filter(unsigned long state_filter)
debug_show_all_locks(); debug_show_all_locks();
} }
void init_idle_bootup_task(struct task_struct *idle)
{
idle->sched_class = &idle_sched_class;
}
/** /**
* init_idle - set up an idle thread for a given CPU * init_idle - set up an idle thread for a given CPU
* @idle: task in question * @idle: task in question
...@@ -5468,7 +5456,7 @@ static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf) ...@@ -5468,7 +5456,7 @@ static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
*/ */
next = pick_next_task(rq, &fake_task, rf); next = pick_next_task(rq, &fake_task, rf);
BUG_ON(!next); BUG_ON(!next);
next->sched_class->put_prev_task(rq, next); put_prev_task(rq, next);
/* /*
* Rules for changing task_struct::cpus_allowed are holding * Rules for changing task_struct::cpus_allowed are holding
......
...@@ -119,29 +119,29 @@ static inline int cpudl_maximum(struct cpudl *cp) ...@@ -119,29 +119,29 @@ static inline int cpudl_maximum(struct cpudl *cp)
* @p: the task * @p: the task
* @later_mask: a mask to fill in with the selected CPUs (or NULL) * @later_mask: a mask to fill in with the selected CPUs (or NULL)
* *
* Returns: int - best CPU (heap maximum if suitable) * Returns: int - CPUs were found
*/ */
int cpudl_find(struct cpudl *cp, struct task_struct *p, int cpudl_find(struct cpudl *cp, struct task_struct *p,
struct cpumask *later_mask) struct cpumask *later_mask)
{ {
int best_cpu = -1;
const struct sched_dl_entity *dl_se = &p->dl; const struct sched_dl_entity *dl_se = &p->dl;
if (later_mask && if (later_mask &&
cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) { cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) {
best_cpu = cpumask_any(later_mask); return 1;
goto out; } else {
} else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) && int best_cpu = cpudl_maximum(cp);
WARN_ON(best_cpu != -1 && !cpu_present(best_cpu));
if (cpumask_test_cpu(best_cpu, &p->cpus_allowed) &&
dl_time_before(dl_se->deadline, cp->elements[0].dl)) { dl_time_before(dl_se->deadline, cp->elements[0].dl)) {
best_cpu = cpudl_maximum(cp);
if (later_mask) if (later_mask)
cpumask_set_cpu(best_cpu, later_mask); cpumask_set_cpu(best_cpu, later_mask);
}
out: return 1;
WARN_ON(best_cpu != -1 && !cpu_present(best_cpu)); }
}
return best_cpu; return 0;
} }
/* /*
...@@ -246,7 +246,6 @@ int cpudl_init(struct cpudl *cp) ...@@ -246,7 +246,6 @@ int cpudl_init(struct cpudl *cp)
{ {
int i; int i;
memset(cp, 0, sizeof(*cp));
raw_spin_lock_init(&cp->lock); raw_spin_lock_init(&cp->lock);
cp->size = 0; cp->size = 0;
......
...@@ -209,8 +209,6 @@ int cpupri_init(struct cpupri *cp) ...@@ -209,8 +209,6 @@ int cpupri_init(struct cpupri *cp)
{ {
int i; int i;
memset(cp, 0, sizeof(*cp));
for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
struct cpupri_vec *vec = &cp->pri_to_cpu[i]; struct cpupri_vec *vec = &cp->pri_to_cpu[i];
......
...@@ -1594,7 +1594,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) ...@@ -1594,7 +1594,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
* let's hope p can move out. * let's hope p can move out.
*/ */
if (rq->curr->nr_cpus_allowed == 1 || if (rq->curr->nr_cpus_allowed == 1 ||
cpudl_find(&rq->rd->cpudl, rq->curr, NULL) == -1) !cpudl_find(&rq->rd->cpudl, rq->curr, NULL))
return; return;
/* /*
...@@ -1602,7 +1602,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) ...@@ -1602,7 +1602,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
* see if it is pushed or pulled somewhere else. * see if it is pushed or pulled somewhere else.
*/ */
if (p->nr_cpus_allowed != 1 && if (p->nr_cpus_allowed != 1 &&
cpudl_find(&rq->rd->cpudl, p, NULL) != -1) cpudl_find(&rq->rd->cpudl, p, NULL))
return; return;
resched_curr(rq); resched_curr(rq);
...@@ -1655,7 +1655,7 @@ static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq, ...@@ -1655,7 +1655,7 @@ static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
return rb_entry(left, struct sched_dl_entity, rb_node); return rb_entry(left, struct sched_dl_entity, rb_node);
} }
struct task_struct * static struct task_struct *
pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{ {
struct sched_dl_entity *dl_se; struct sched_dl_entity *dl_se;
...@@ -1798,7 +1798,7 @@ static int find_later_rq(struct task_struct *task) ...@@ -1798,7 +1798,7 @@ static int find_later_rq(struct task_struct *task)
struct sched_domain *sd; struct sched_domain *sd;
struct cpumask *later_mask = this_cpu_cpumask_var_ptr(local_cpu_mask_dl); struct cpumask *later_mask = this_cpu_cpumask_var_ptr(local_cpu_mask_dl);
int this_cpu = smp_processor_id(); int this_cpu = smp_processor_id();
int best_cpu, cpu = task_cpu(task); int cpu = task_cpu(task);
/* Make sure the mask is initialized first */ /* Make sure the mask is initialized first */
if (unlikely(!later_mask)) if (unlikely(!later_mask))
...@@ -1811,17 +1811,14 @@ static int find_later_rq(struct task_struct *task) ...@@ -1811,17 +1811,14 @@ static int find_later_rq(struct task_struct *task)
* We have to consider system topology and task affinity * We have to consider system topology and task affinity
* first, then we can look for a suitable cpu. * first, then we can look for a suitable cpu.
*/ */
best_cpu = cpudl_find(&task_rq(task)->rd->cpudl, if (!cpudl_find(&task_rq(task)->rd->cpudl, task, later_mask))
task, later_mask);
if (best_cpu == -1)
return -1; return -1;
/* /*
* If we are here, some target has been found, * If we are here, some targets have been found, including
* the most suitable of which is cached in best_cpu. * the most suitable which is, among the runqueues where the
* This is, among the runqueues where the current tasks * current tasks have later deadlines than the task's one, the
* have later deadlines than the task's one, the rq * rq with the latest possible one.
* with the latest possible one.
* *
* Now we check how well this matches with task's * Now we check how well this matches with task's
* affinity and system topology. * affinity and system topology.
...@@ -1841,6 +1838,7 @@ static int find_later_rq(struct task_struct *task) ...@@ -1841,6 +1838,7 @@ static int find_later_rq(struct task_struct *task)
rcu_read_lock(); rcu_read_lock();
for_each_domain(cpu, sd) { for_each_domain(cpu, sd) {
if (sd->flags & SD_WAKE_AFFINE) { if (sd->flags & SD_WAKE_AFFINE) {
int best_cpu;
/* /*
* If possible, preempting this_cpu is * If possible, preempting this_cpu is
...@@ -1852,12 +1850,15 @@ static int find_later_rq(struct task_struct *task) ...@@ -1852,12 +1850,15 @@ static int find_later_rq(struct task_struct *task)
return this_cpu; return this_cpu;
} }
best_cpu = cpumask_first_and(later_mask,
sched_domain_span(sd));
/* /*
* Last chance: if best_cpu is valid and is * Last chance: if a cpu being in both later_mask
* in the mask, that becomes our choice. * and current sd span is valid, that becomes our
* choice. Of course, the latest possible cpu is
* already under consideration through later_mask.
*/ */
if (best_cpu < nr_cpu_ids && if (best_cpu < nr_cpu_ids) {
cpumask_test_cpu(best_cpu, sched_domain_span(sd))) {
rcu_read_unlock(); rcu_read_unlock();
return best_cpu; return best_cpu;
} }
......
...@@ -327,38 +327,78 @@ static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu) ...@@ -327,38 +327,78 @@ static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
return table; return table;
} }
static cpumask_var_t sd_sysctl_cpus;
static struct ctl_table_header *sd_sysctl_header; static struct ctl_table_header *sd_sysctl_header;
void register_sched_domain_sysctl(void) void register_sched_domain_sysctl(void)
{ {
int i, cpu_num = num_possible_cpus(); static struct ctl_table *cpu_entries;
struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); static struct ctl_table **cpu_idx;
char buf[32]; char buf[32];
int i;
if (!cpu_entries) {
cpu_entries = sd_alloc_ctl_entry(num_possible_cpus() + 1);
if (!cpu_entries)
return;
WARN_ON(sd_ctl_dir[0].child); WARN_ON(sd_ctl_dir[0].child);
sd_ctl_dir[0].child = entry; sd_ctl_dir[0].child = cpu_entries;
}
if (!cpu_idx) {
struct ctl_table *e = cpu_entries;
if (entry == NULL) cpu_idx = kcalloc(nr_cpu_ids, sizeof(struct ctl_table*), GFP_KERNEL);
if (!cpu_idx)
return; return;
/* deal with sparse possible map */
for_each_possible_cpu(i) { for_each_possible_cpu(i) {
cpu_idx[i] = e;
e++;
}
}
if (!cpumask_available(sd_sysctl_cpus)) {
if (!alloc_cpumask_var(&sd_sysctl_cpus, GFP_KERNEL))
return;
/* init to possible to not have holes in @cpu_entries */
cpumask_copy(sd_sysctl_cpus, cpu_possible_mask);
}
for_each_cpu(i, sd_sysctl_cpus) {
struct ctl_table *e = cpu_idx[i];
if (e->child)
sd_free_ctl_entry(&e->child);
if (!e->procname) {
snprintf(buf, 32, "cpu%d", i); snprintf(buf, 32, "cpu%d", i);
entry->procname = kstrdup(buf, GFP_KERNEL); e->procname = kstrdup(buf, GFP_KERNEL);
entry->mode = 0555; }
entry->child = sd_alloc_ctl_cpu_table(i); e->mode = 0555;
entry++; e->child = sd_alloc_ctl_cpu_table(i);
__cpumask_clear_cpu(i, sd_sysctl_cpus);
} }
WARN_ON(sd_sysctl_header); WARN_ON(sd_sysctl_header);
sd_sysctl_header = register_sysctl_table(sd_ctl_root); sd_sysctl_header = register_sysctl_table(sd_ctl_root);
} }
void dirty_sched_domain_sysctl(int cpu)
{
if (cpumask_available(sd_sysctl_cpus))
__cpumask_set_cpu(cpu, sd_sysctl_cpus);
}
/* may be called multiple times per register */ /* may be called multiple times per register */
void unregister_sched_domain_sysctl(void) void unregister_sched_domain_sysctl(void)
{ {
unregister_sysctl_table(sd_sysctl_header); unregister_sysctl_table(sd_sysctl_header);
sd_sysctl_header = NULL; sd_sysctl_header = NULL;
if (sd_ctl_dir[0].child)
sd_free_ctl_entry(&sd_ctl_dir[0].child);
} }
#endif /* CONFIG_SYSCTL */ #endif /* CONFIG_SYSCTL */
#endif /* CONFIG_SMP */ #endif /* CONFIG_SMP */
...@@ -421,13 +461,15 @@ static char *task_group_path(struct task_group *tg) ...@@ -421,13 +461,15 @@ static char *task_group_path(struct task_group *tg)
} }
#endif #endif
static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
static void static void
print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
{ {
if (rq->curr == p) if (rq->curr == p)
SEQ_printf(m, "R"); SEQ_printf(m, ">R");
else else
SEQ_printf(m, " "); SEQ_printf(m, " %c", task_state_to_char(p));
SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ", SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ",
p->comm, task_pid_nr(p), p->comm, task_pid_nr(p),
...@@ -456,9 +498,9 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) ...@@ -456,9 +498,9 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
SEQ_printf(m, SEQ_printf(m,
"\nrunnable tasks:\n" "\nrunnable tasks:\n"
" task PID tree-key switches prio" " S task PID tree-key switches prio"
" wait-time sum-exec sum-sleep\n" " wait-time sum-exec sum-sleep\n"
"------------------------------------------------------" "-------------------------------------------------------"
"----------------------------------------------------\n"); "----------------------------------------------------\n");
rcu_read_lock(); rcu_read_lock();
...@@ -872,11 +914,12 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m) ...@@ -872,11 +914,12 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m)
#endif #endif
} }
void proc_sched_show_task(struct task_struct *p, struct seq_file *m) void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
struct seq_file *m)
{ {
unsigned long nr_switches; unsigned long nr_switches;
SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr(p), SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr_ns(p, ns),
get_nr_threads(p)); get_nr_threads(p));
SEQ_printf(m, SEQ_printf(m,
"---------------------------------------------------------" "---------------------------------------------------------"
......
...@@ -806,7 +806,7 @@ void post_init_entity_util_avg(struct sched_entity *se) ...@@ -806,7 +806,7 @@ void post_init_entity_util_avg(struct sched_entity *se)
/* /*
* For !fair tasks do: * For !fair tasks do:
* *
update_cfs_rq_load_avg(now, cfs_rq, false); update_cfs_rq_load_avg(now, cfs_rq);
attach_entity_load_avg(cfs_rq, se); attach_entity_load_avg(cfs_rq, se);
switched_from_fair(rq, p); switched_from_fair(rq, p);
* *
...@@ -1071,6 +1071,29 @@ unsigned int sysctl_numa_balancing_scan_size = 256; ...@@ -1071,6 +1071,29 @@ unsigned int sysctl_numa_balancing_scan_size = 256;
/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
unsigned int sysctl_numa_balancing_scan_delay = 1000; unsigned int sysctl_numa_balancing_scan_delay = 1000;
struct numa_group {
atomic_t refcount;
spinlock_t lock; /* nr_tasks, tasks */
int nr_tasks;
pid_t gid;
int active_nodes;
struct rcu_head rcu;
unsigned long total_faults;
unsigned long max_faults_cpu;
/*
* Faults_cpu is used to decide whether memory should move
* towards the CPU. As a consequence, these stats are weighted
* more by CPU use than by memory faults.
*/
unsigned long *faults_cpu;
unsigned long faults[0];
};
static inline unsigned long group_faults_priv(struct numa_group *ng);
static inline unsigned long group_faults_shared(struct numa_group *ng);
static unsigned int task_nr_scan_windows(struct task_struct *p) static unsigned int task_nr_scan_windows(struct task_struct *p)
{ {
unsigned long rss = 0; unsigned long rss = 0;
...@@ -1107,13 +1130,47 @@ static unsigned int task_scan_min(struct task_struct *p) ...@@ -1107,13 +1130,47 @@ static unsigned int task_scan_min(struct task_struct *p)
return max_t(unsigned int, floor, scan); return max_t(unsigned int, floor, scan);
} }
static unsigned int task_scan_start(struct task_struct *p)
{
unsigned long smin = task_scan_min(p);
unsigned long period = smin;
/* Scale the maximum scan period with the amount of shared memory. */
if (p->numa_group) {
struct numa_group *ng = p->numa_group;
unsigned long shared = group_faults_shared(ng);
unsigned long private = group_faults_priv(ng);
period *= atomic_read(&ng->refcount);
period *= shared + 1;
period /= private + shared + 1;
}
return max(smin, period);
}
static unsigned int task_scan_max(struct task_struct *p) static unsigned int task_scan_max(struct task_struct *p)
{ {
unsigned int smin = task_scan_min(p); unsigned long smin = task_scan_min(p);
unsigned int smax; unsigned long smax;
/* Watch for min being lower than max due to floor calculations */ /* Watch for min being lower than max due to floor calculations */
smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p); smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
/* Scale the maximum scan period with the amount of shared memory. */
if (p->numa_group) {
struct numa_group *ng = p->numa_group;
unsigned long shared = group_faults_shared(ng);
unsigned long private = group_faults_priv(ng);
unsigned long period = smax;
period *= atomic_read(&ng->refcount);
period *= shared + 1;
period /= private + shared + 1;
smax = max(smax, period);
}
return max(smin, smax); return max(smin, smax);
} }
...@@ -1129,26 +1186,6 @@ static void account_numa_dequeue(struct rq *rq, struct task_struct *p) ...@@ -1129,26 +1186,6 @@ static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p)); rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
} }
struct numa_group {
atomic_t refcount;
spinlock_t lock; /* nr_tasks, tasks */
int nr_tasks;
pid_t gid;
int active_nodes;
struct rcu_head rcu;
unsigned long total_faults;
unsigned long max_faults_cpu;
/*
* Faults_cpu is used to decide whether memory should move
* towards the CPU. As a consequence, these stats are weighted
* more by CPU use than by memory faults.
*/
unsigned long *faults_cpu;
unsigned long faults[0];
};
/* Shared or private faults. */ /* Shared or private faults. */
#define NR_NUMA_HINT_FAULT_TYPES 2 #define NR_NUMA_HINT_FAULT_TYPES 2
...@@ -1198,6 +1235,30 @@ static inline unsigned long group_faults_cpu(struct numa_group *group, int nid) ...@@ -1198,6 +1235,30 @@ static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)]; group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
} }
static inline unsigned long group_faults_priv(struct numa_group *ng)
{
unsigned long faults = 0;
int node;
for_each_online_node(node) {
faults += ng->faults[task_faults_idx(NUMA_MEM, node, 1)];
}
return faults;
}
static inline unsigned long group_faults_shared(struct numa_group *ng)
{
unsigned long faults = 0;
int node;
for_each_online_node(node) {
faults += ng->faults[task_faults_idx(NUMA_MEM, node, 0)];
}
return faults;
}
/* /*
* A node triggering more than 1/3 as many NUMA faults as the maximum is * A node triggering more than 1/3 as many NUMA faults as the maximum is
* considered part of a numa group's pseudo-interleaving set. Migrations * considered part of a numa group's pseudo-interleaving set. Migrations
...@@ -1378,7 +1439,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, ...@@ -1378,7 +1439,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4; group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
} }
static unsigned long weighted_cpuload(const int cpu); static unsigned long weighted_cpuload(struct rq *rq);
static unsigned long source_load(int cpu, int type); static unsigned long source_load(int cpu, int type);
static unsigned long target_load(int cpu, int type); static unsigned long target_load(int cpu, int type);
static unsigned long capacity_of(int cpu); static unsigned long capacity_of(int cpu);
...@@ -1409,7 +1470,7 @@ static void update_numa_stats(struct numa_stats *ns, int nid) ...@@ -1409,7 +1470,7 @@ static void update_numa_stats(struct numa_stats *ns, int nid)
struct rq *rq = cpu_rq(cpu); struct rq *rq = cpu_rq(cpu);
ns->nr_running += rq->nr_running; ns->nr_running += rq->nr_running;
ns->load += weighted_cpuload(cpu); ns->load += weighted_cpuload(rq);
ns->compute_capacity += capacity_of(cpu); ns->compute_capacity += capacity_of(cpu);
cpus++; cpus++;
...@@ -1808,7 +1869,7 @@ static int task_numa_migrate(struct task_struct *p) ...@@ -1808,7 +1869,7 @@ static int task_numa_migrate(struct task_struct *p)
* Reset the scan period if the task is being rescheduled on an * Reset the scan period if the task is being rescheduled on an
* alternative node to recheck if the tasks is now properly placed. * alternative node to recheck if the tasks is now properly placed.
*/ */
p->numa_scan_period = task_scan_min(p); p->numa_scan_period = task_scan_start(p);
if (env.best_task == NULL) { if (env.best_task == NULL) {
ret = migrate_task_to(p, env.best_cpu); ret = migrate_task_to(p, env.best_cpu);
...@@ -1892,7 +1953,7 @@ static void update_task_scan_period(struct task_struct *p, ...@@ -1892,7 +1953,7 @@ static void update_task_scan_period(struct task_struct *p,
unsigned long shared, unsigned long private) unsigned long shared, unsigned long private)
{ {
unsigned int period_slot; unsigned int period_slot;
int ratio; int lr_ratio, ps_ratio;
int diff; int diff;
unsigned long remote = p->numa_faults_locality[0]; unsigned long remote = p->numa_faults_locality[0];
...@@ -1922,25 +1983,36 @@ static void update_task_scan_period(struct task_struct *p, ...@@ -1922,25 +1983,36 @@ static void update_task_scan_period(struct task_struct *p,
* >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower) * >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
*/ */
period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS); period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote); lr_ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
if (ratio >= NUMA_PERIOD_THRESHOLD) { ps_ratio = (private * NUMA_PERIOD_SLOTS) / (private + shared);
int slot = ratio - NUMA_PERIOD_THRESHOLD;
if (ps_ratio >= NUMA_PERIOD_THRESHOLD) {
/*
* Most memory accesses are local. There is no need to
* do fast NUMA scanning, since memory is already local.
*/
int slot = ps_ratio - NUMA_PERIOD_THRESHOLD;
if (!slot)
slot = 1;
diff = slot * period_slot;
} else if (lr_ratio >= NUMA_PERIOD_THRESHOLD) {
/*
* Most memory accesses are shared with other tasks.
* There is no point in continuing fast NUMA scanning,
* since other tasks may just move the memory elsewhere.
*/
int slot = lr_ratio - NUMA_PERIOD_THRESHOLD;
if (!slot) if (!slot)
slot = 1; slot = 1;
diff = slot * period_slot; diff = slot * period_slot;
} else { } else {
diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
/* /*
* Scale scan rate increases based on sharing. There is an * Private memory faults exceed (SLOTS-THRESHOLD)/SLOTS,
* inverse relationship between the degree of sharing and * yet they are not on the local NUMA node. Speed up
* the adjustment made to the scanning period. Broadly * NUMA scanning to get the memory moved over.
* speaking the intent is that there is little point
* scanning faster if shared accesses dominate as it may
* simply bounce migrations uselessly
*/ */
ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared + 1)); int ratio = max(lr_ratio, ps_ratio);
diff = (diff * ratio) / NUMA_PERIOD_SLOTS; diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
} }
p->numa_scan_period = clamp(p->numa_scan_period + diff, p->numa_scan_period = clamp(p->numa_scan_period + diff,
...@@ -2448,7 +2520,7 @@ void task_numa_work(struct callback_head *work) ...@@ -2448,7 +2520,7 @@ void task_numa_work(struct callback_head *work)
if (p->numa_scan_period == 0) { if (p->numa_scan_period == 0) {
p->numa_scan_period_max = task_scan_max(p); p->numa_scan_period_max = task_scan_max(p);
p->numa_scan_period = task_scan_min(p); p->numa_scan_period = task_scan_start(p);
} }
next_scan = now + msecs_to_jiffies(p->numa_scan_period); next_scan = now + msecs_to_jiffies(p->numa_scan_period);
...@@ -2576,7 +2648,7 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr) ...@@ -2576,7 +2648,7 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
if (now > curr->node_stamp + period) { if (now > curr->node_stamp + period) {
if (!curr->node_stamp) if (!curr->node_stamp)
curr->numa_scan_period = task_scan_min(curr); curr->numa_scan_period = task_scan_start(curr);
curr->node_stamp += period; curr->node_stamp += period;
if (!time_before(jiffies, curr->mm->numa_next_scan)) { if (!time_before(jiffies, curr->mm->numa_next_scan)) {
...@@ -2586,59 +2658,6 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr) ...@@ -2586,59 +2658,6 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
} }
} }
/*
* Can a task be moved from prev_cpu to this_cpu without causing a load
* imbalance that would trigger the load balancer?
*/
static inline bool numa_wake_affine(struct sched_domain *sd,
struct task_struct *p, int this_cpu,
int prev_cpu, int sync)
{
struct numa_stats prev_load, this_load;
s64 this_eff_load, prev_eff_load;
update_numa_stats(&prev_load, cpu_to_node(prev_cpu));
update_numa_stats(&this_load, cpu_to_node(this_cpu));
/*
* If sync wakeup then subtract the (maximum possible)
* effect of the currently running task from the load
* of the current CPU:
*/
if (sync) {
unsigned long current_load = task_h_load(current);
if (this_load.load > current_load)
this_load.load -= current_load;
else
this_load.load = 0;
}
/*
* In low-load situations, where this_cpu's node is idle due to the
* sync cause above having dropped this_load.load to 0, move the task.
* Moving to an idle socket will not create a bad imbalance.
*
* Otherwise check if the nodes are near enough in load to allow this
* task to be woken on this_cpu's node.
*/
if (this_load.load > 0) {
unsigned long task_load = task_h_load(p);
this_eff_load = 100;
this_eff_load *= prev_load.compute_capacity;
prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
prev_eff_load *= this_load.compute_capacity;
this_eff_load *= this_load.load + task_load;
prev_eff_load *= prev_load.load - task_load;
return this_eff_load <= prev_eff_load;
}
return true;
}
#else #else
static void task_tick_numa(struct rq *rq, struct task_struct *curr) static void task_tick_numa(struct rq *rq, struct task_struct *curr)
{ {
...@@ -2652,14 +2671,6 @@ static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p) ...@@ -2652,14 +2671,6 @@ static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
{ {
} }
#ifdef CONFIG_SMP
/*
 * Variant used in the #else branch of CONFIG_NUMA_BALANCING: no per-node
 * statistics are consulted, the move is unconditionally reported as fine.
 */
static inline bool
numa_wake_affine(struct sched_domain *sd, struct task_struct *p,
		 int this_cpu, int prev_cpu, int sync)
{
	return true;
}
#endif /* !SMP */
#endif /* CONFIG_NUMA_BALANCING */ #endif /* CONFIG_NUMA_BALANCING */
static void static void
...@@ -2790,6 +2801,29 @@ static inline void update_cfs_shares(struct sched_entity *se) ...@@ -2790,6 +2801,29 @@ static inline void update_cfs_shares(struct sched_entity *se)
} }
#endif /* CONFIG_FAIR_GROUP_SCHED */ #endif /* CONFIG_FAIR_GROUP_SCHED */
/*
 * Tell cpufreq that utilization changed, but only when @cfs_rq is the
 * cfs_rq embedded in the runqueue of the CPU we are running on
 * (this_rq()->cfs). Any other cfs_rq is silently ignored.
 */
static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
{
	if (cfs_rq != &this_rq()->cfs)
		return;

	/*
	 * There are a few boundary cases this might miss but it should
	 * get called often enough that this should (hopefully) not be
	 * a real problem -- added to that it only calls on the local
	 * CPU, so if we enqueue remotely we'll miss an update, but
	 * the next tick/schedule should update.
	 *
	 * It will not get called when we go idle, because the idle
	 * thread is a different class (!fair), nor will the utilization
	 * number include things like RT tasks.
	 *
	 * As is, the util number is not freq-invariant (we'd have to
	 * implement arch_scale_freq_capacity() for that).
	 *
	 * See cpu_util().
	 */
	cpufreq_update_util(rq_of(cfs_rq), 0);
}
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
/* /*
* Approximate: * Approximate:
...@@ -2967,6 +3001,18 @@ ___update_load_avg(u64 now, int cpu, struct sched_avg *sa, ...@@ -2967,6 +3001,18 @@ ___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
sa->last_update_time += delta << 10; sa->last_update_time += delta << 10;
/*
* running is a subset of runnable (weight) so running can't be set if
* runnable is clear. But there are some corner cases where the current
* se has been already dequeued but cfs_rq->curr still points to it.
* This means that weight will be 0 but not running for a sched_entity
* but also for a cfs_rq if the latter becomes idle. As an example,
* this happens during idle_balance() which calls
* update_blocked_averages()
*/
if (!weight)
running = 0;
/* /*
* Now we know we crossed measurement unit boundaries. The *_avg * Now we know we crossed measurement unit boundaries. The *_avg
* accrues by two steps: * accrues by two steps:
...@@ -3276,29 +3322,6 @@ static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {} ...@@ -3276,29 +3322,6 @@ static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {}
#endif /* CONFIG_FAIR_GROUP_SCHED */ #endif /* CONFIG_FAIR_GROUP_SCHED */
/*
 * Tell cpufreq that utilization changed, but only when @cfs_rq is the
 * cfs_rq embedded in the runqueue of the CPU we are running on
 * (this_rq()->cfs). Any other cfs_rq is silently ignored.
 */
static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
{
	if (cfs_rq != &this_rq()->cfs)
		return;

	/*
	 * There are a few boundary cases this might miss but it should
	 * get called often enough that this should (hopefully) not be
	 * a real problem -- added to that it only calls on the local
	 * CPU, so if we enqueue remotely we'll miss an update, but
	 * the next tick/schedule should update.
	 *
	 * It will not get called when we go idle, because the idle
	 * thread is a different class (!fair), nor will the utilization
	 * number include things like RT tasks.
	 *
	 * As is, the util number is not freq-invariant (we'd have to
	 * implement arch_scale_freq_capacity() for that).
	 *
	 * See cpu_util().
	 */
	cpufreq_update_util(rq_of(cfs_rq), 0);
}
/* /*
* Unsigned subtract and clamp on underflow. * Unsigned subtract and clamp on underflow.
* *
...@@ -3320,7 +3343,6 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) ...@@ -3320,7 +3343,6 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
* update_cfs_rq_load_avg - update the cfs_rq's load/util averages * update_cfs_rq_load_avg - update the cfs_rq's load/util averages
* @now: current time, as per cfs_rq_clock_task() * @now: current time, as per cfs_rq_clock_task()
* @cfs_rq: cfs_rq to update * @cfs_rq: cfs_rq to update
* @update_freq: should we call cfs_rq_util_change() or will the call do so
* *
* The cfs_rq avg is the direct sum of all its entities (blocked and runnable) * The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
* avg. The immediate corollary is that all (fair) tasks must be attached, see * avg. The immediate corollary is that all (fair) tasks must be attached, see
...@@ -3334,7 +3356,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) ...@@ -3334,7 +3356,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
* call update_tg_load_avg() when this function returns true. * call update_tg_load_avg() when this function returns true.
*/ */
static inline int static inline int
update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
{ {
struct sched_avg *sa = &cfs_rq->avg; struct sched_avg *sa = &cfs_rq->avg;
int decayed, removed_load = 0, removed_util = 0; int decayed, removed_load = 0, removed_util = 0;
...@@ -3362,7 +3384,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) ...@@ -3362,7 +3384,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
cfs_rq->load_last_update_time_copy = sa->last_update_time; cfs_rq->load_last_update_time_copy = sa->last_update_time;
#endif #endif
if (update_freq && (decayed || removed_util)) if (decayed || removed_util)
cfs_rq_util_change(cfs_rq); cfs_rq_util_change(cfs_rq);
return decayed || removed_load; return decayed || removed_load;
...@@ -3390,7 +3412,7 @@ static inline void update_load_avg(struct sched_entity *se, int flags) ...@@ -3390,7 +3412,7 @@ static inline void update_load_avg(struct sched_entity *se, int flags)
if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
__update_load_avg_se(now, cpu, cfs_rq, se); __update_load_avg_se(now, cpu, cfs_rq, se);
decayed = update_cfs_rq_load_avg(now, cfs_rq, true); decayed = update_cfs_rq_load_avg(now, cfs_rq);
decayed |= propagate_entity_load_avg(se); decayed |= propagate_entity_load_avg(se);
if (decayed && (flags & UPDATE_TG)) if (decayed && (flags & UPDATE_TG))
...@@ -3534,7 +3556,7 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf); ...@@ -3534,7 +3556,7 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf);
#else /* CONFIG_SMP */ #else /* CONFIG_SMP */
static inline int static inline int
update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
{ {
return 0; return 0;
} }
...@@ -3544,7 +3566,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) ...@@ -3544,7 +3566,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
static inline void update_load_avg(struct sched_entity *se, int not_used1) static inline void update_load_avg(struct sched_entity *se, int not_used1)
{ {
cpufreq_update_util(rq_of(cfs_rq_of(se)), 0); cfs_rq_util_change(cfs_rq_of(se));
} }
static inline void static inline void
...@@ -5125,9 +5147,9 @@ static void cpu_load_update(struct rq *this_rq, unsigned long this_load, ...@@ -5125,9 +5147,9 @@ static void cpu_load_update(struct rq *this_rq, unsigned long this_load,
} }
/* Used instead of source_load when we know the type == 0 */ /* Used instead of source_load when we know the type == 0 */
static unsigned long weighted_cpuload(const int cpu) static unsigned long weighted_cpuload(struct rq *rq)
{ {
return cfs_rq_runnable_load_avg(&cpu_rq(cpu)->cfs); return cfs_rq_runnable_load_avg(&rq->cfs);
} }
#ifdef CONFIG_NO_HZ_COMMON #ifdef CONFIG_NO_HZ_COMMON
...@@ -5172,7 +5194,7 @@ static void cpu_load_update_idle(struct rq *this_rq) ...@@ -5172,7 +5194,7 @@ static void cpu_load_update_idle(struct rq *this_rq)
/* /*
* bail if there's load or we're actually up-to-date. * bail if there's load or we're actually up-to-date.
*/ */
if (weighted_cpuload(cpu_of(this_rq))) if (weighted_cpuload(this_rq))
return; return;
cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0); cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0);
...@@ -5193,7 +5215,7 @@ void cpu_load_update_nohz_start(void) ...@@ -5193,7 +5215,7 @@ void cpu_load_update_nohz_start(void)
* concurrently we'll exit nohz. And cpu_load write can race with * concurrently we'll exit nohz. And cpu_load write can race with
* cpu_load_update_idle() but both updater would be writing the same. * cpu_load_update_idle() but both updater would be writing the same.
*/ */
this_rq->cpu_load[0] = weighted_cpuload(cpu_of(this_rq)); this_rq->cpu_load[0] = weighted_cpuload(this_rq);
} }
/* /*
...@@ -5209,7 +5231,7 @@ void cpu_load_update_nohz_stop(void) ...@@ -5209,7 +5231,7 @@ void cpu_load_update_nohz_stop(void)
if (curr_jiffies == this_rq->last_load_update_tick) if (curr_jiffies == this_rq->last_load_update_tick)
return; return;
load = weighted_cpuload(cpu_of(this_rq)); load = weighted_cpuload(this_rq);
rq_lock(this_rq, &rf); rq_lock(this_rq, &rf);
update_rq_clock(this_rq); update_rq_clock(this_rq);
cpu_load_update_nohz(this_rq, curr_jiffies, load); cpu_load_update_nohz(this_rq, curr_jiffies, load);
...@@ -5235,7 +5257,7 @@ static void cpu_load_update_periodic(struct rq *this_rq, unsigned long load) ...@@ -5235,7 +5257,7 @@ static void cpu_load_update_periodic(struct rq *this_rq, unsigned long load)
*/ */
void cpu_load_update_active(struct rq *this_rq) void cpu_load_update_active(struct rq *this_rq)
{ {
unsigned long load = weighted_cpuload(cpu_of(this_rq)); unsigned long load = weighted_cpuload(this_rq);
if (tick_nohz_tick_stopped()) if (tick_nohz_tick_stopped())
cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load); cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load);
...@@ -5253,7 +5275,7 @@ void cpu_load_update_active(struct rq *this_rq) ...@@ -5253,7 +5275,7 @@ void cpu_load_update_active(struct rq *this_rq)
static unsigned long source_load(int cpu, int type) static unsigned long source_load(int cpu, int type)
{ {
struct rq *rq = cpu_rq(cpu); struct rq *rq = cpu_rq(cpu);
unsigned long total = weighted_cpuload(cpu); unsigned long total = weighted_cpuload(rq);
if (type == 0 || !sched_feat(LB_BIAS)) if (type == 0 || !sched_feat(LB_BIAS))
return total; return total;
...@@ -5268,7 +5290,7 @@ static unsigned long source_load(int cpu, int type) ...@@ -5268,7 +5290,7 @@ static unsigned long source_load(int cpu, int type)
static unsigned long target_load(int cpu, int type) static unsigned long target_load(int cpu, int type)
{ {
struct rq *rq = cpu_rq(cpu); struct rq *rq = cpu_rq(cpu);
unsigned long total = weighted_cpuload(cpu); unsigned long total = weighted_cpuload(rq);
if (type == 0 || !sched_feat(LB_BIAS)) if (type == 0 || !sched_feat(LB_BIAS))
return total; return total;
...@@ -5290,7 +5312,7 @@ static unsigned long cpu_avg_load_per_task(int cpu) ...@@ -5290,7 +5312,7 @@ static unsigned long cpu_avg_load_per_task(int cpu)
{ {
struct rq *rq = cpu_rq(cpu); struct rq *rq = cpu_rq(cpu);
unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running); unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
unsigned long load_avg = weighted_cpuload(cpu); unsigned long load_avg = weighted_cpuload(rq);
if (nr_running) if (nr_running)
return load_avg / nr_running; return load_avg / nr_running;
...@@ -5345,20 +5367,115 @@ static int wake_wide(struct task_struct *p) ...@@ -5345,20 +5367,115 @@ static int wake_wide(struct task_struct *p)
return 1; return 1;
} }
/*
 * Local snapshot of the per-LLC shared statistics, as filled in by
 * get_llc_stats().
 */
struct llc_stats {
	unsigned long	nr_running;	/* copy of sds->nr_running */
	unsigned long	load;		/* copy of sds->load */
	unsigned long	capacity;	/* copy of sds->capacity */
	int		has_capacity;	/* nr_running < sd_llc_size of the CPU */
};
/*
 * Copy the shared LLC statistics of @cpu into @stats.
 *
 * Returns false, leaving @stats untouched, when @cpu has no sd_llc_shared
 * pointer; true once all fields of @stats have been filled in.
 */
static bool get_llc_stats(struct llc_stats *stats, int cpu)
{
	struct sched_domain_shared *shared;

	shared = rcu_dereference(per_cpu(sd_llc_shared, cpu));
	if (shared) {
		stats->nr_running = READ_ONCE(shared->nr_running);
		stats->load = READ_ONCE(shared->load);
		stats->capacity = READ_ONCE(shared->capacity);

		/* Fewer runnable tasks than CPUs in the LLC: room left. */
		stats->has_capacity = stats->nr_running < per_cpu(sd_llc_size, cpu);

		return true;
	}

	return false;
}
/*
* Can a task be moved from prev_cpu to this_cpu without causing a load
* imbalance that would trigger the load balancer?
*
* Since we're running on 'stale' values, we might in fact create an imbalance
* but recomputing these values is expensive, as that'd mean iteration 2 cache
* domains worth of CPUs.
*/
static bool
wake_affine_llc(struct sched_domain *sd, struct task_struct *p,
int this_cpu, int prev_cpu, int sync)
{
struct llc_stats prev_stats, this_stats;
s64 this_eff_load, prev_eff_load;
unsigned long task_load;
if (!get_llc_stats(&prev_stats, prev_cpu) ||
!get_llc_stats(&this_stats, this_cpu))
return false;
/*
* If sync wakeup then subtract the (maximum possible)
* effect of the currently running task from the load
* of the current LLC.
*/
if (sync) {
unsigned long current_load = task_h_load(current);
/* in this case load hits 0 and this LLC is considered 'idle' */
if (current_load > this_stats.load)
return true;
this_stats.load -= current_load;
}
/*
* The has_capacity stuff is not SMT aware, but by trying to balance
* the nr_running on both ends we try and fill the domain at equal
* rates, thereby first consuming cores before siblings.
*/
/* if the old cache has capacity, stay there */
if (prev_stats.has_capacity && prev_stats.nr_running < this_stats.nr_running+1)
return false;
/* if this cache has capacity, come here */
if (this_stats.has_capacity && this_stats.nr_running < prev_stats.nr_running+1)
return true;
/*
* Check to see if we can move the load without causing too much
* imbalance.
*/
task_load = task_h_load(p);
this_eff_load = 100;
this_eff_load *= prev_stats.capacity;
prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
prev_eff_load *= this_stats.capacity;
this_eff_load *= this_stats.load + task_load;
prev_eff_load *= prev_stats.load - task_load;
return this_eff_load <= prev_eff_load;
}
static int wake_affine(struct sched_domain *sd, struct task_struct *p, static int wake_affine(struct sched_domain *sd, struct task_struct *p,
int prev_cpu, int sync) int prev_cpu, int sync)
{ {
int this_cpu = smp_processor_id(); int this_cpu = smp_processor_id();
bool affine = false; bool affine;
/* /*
* Common case: CPUs are in the same socket, and select_idle_sibling() * Default to no affine wakeups; wake_affine() should not effect a task
* will do its thing regardless of what we return: * placement the load-balancer feels inclined to undo. The conservative
* option is therefore to not move tasks when they wake up.
*/ */
if (cpus_share_cache(prev_cpu, this_cpu)) affine = false;
affine = true;
else /*
affine = numa_wake_affine(sd, p, this_cpu, prev_cpu, sync); * If the wakeup is across cache domains, try to evaluate if movement
* makes sense, otherwise rely on select_idle_sibling() to do
* placement inside the cache domain.
*/
if (!cpus_share_cache(prev_cpu, this_cpu))
affine = wake_affine_llc(sd, p, this_cpu, prev_cpu, sync);
schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts); schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
if (affine) { if (affine) {
...@@ -5550,7 +5667,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) ...@@ -5550,7 +5667,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
shallowest_idle_cpu = i; shallowest_idle_cpu = i;
} }
} else if (shallowest_idle_cpu == -1) { } else if (shallowest_idle_cpu == -1) {
load = weighted_cpuload(i); load = weighted_cpuload(cpu_rq(i));
if (load < min_load || (load == min_load && i == this_cpu)) { if (load < min_load || (load == min_load && i == this_cpu)) {
min_load = load; min_load = load;
least_loaded_cpu = i; least_loaded_cpu = i;
...@@ -6187,10 +6304,10 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf ...@@ -6187,10 +6304,10 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
int new_tasks; int new_tasks;
again: again:
#ifdef CONFIG_FAIR_GROUP_SCHED
if (!cfs_rq->nr_running) if (!cfs_rq->nr_running)
goto idle; goto idle;
#ifdef CONFIG_FAIR_GROUP_SCHED
if (prev->sched_class != &fair_sched_class) if (prev->sched_class != &fair_sched_class)
goto simple; goto simple;
...@@ -6220,12 +6337,18 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf ...@@ -6220,12 +6337,18 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
/* /*
* This call to check_cfs_rq_runtime() will do the * This call to check_cfs_rq_runtime() will do the
* throttle and dequeue its entity in the parent(s). * throttle and dequeue its entity in the parent(s).
* Therefore the 'simple' nr_running test will indeed * Therefore the nr_running test will indeed
* be correct. * be correct.
*/ */
if (unlikely(check_cfs_rq_runtime(cfs_rq))) if (unlikely(check_cfs_rq_runtime(cfs_rq))) {
cfs_rq = &rq->cfs;
if (!cfs_rq->nr_running)
goto idle;
goto simple; goto simple;
} }
}
se = pick_next_entity(cfs_rq, curr); se = pick_next_entity(cfs_rq, curr);
cfs_rq = group_cfs_rq(se); cfs_rq = group_cfs_rq(se);
...@@ -6264,12 +6387,8 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf ...@@ -6264,12 +6387,8 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
return p; return p;
simple: simple:
cfs_rq = &rq->cfs;
#endif #endif
if (!cfs_rq->nr_running)
goto idle;
put_prev_task(rq, prev); put_prev_task(rq, prev);
do { do {
...@@ -6917,7 +7036,7 @@ static void update_blocked_averages(int cpu) ...@@ -6917,7 +7036,7 @@ static void update_blocked_averages(int cpu)
if (throttled_hierarchy(cfs_rq)) if (throttled_hierarchy(cfs_rq))
continue; continue;
if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true)) if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq))
update_tg_load_avg(cfs_rq, 0); update_tg_load_avg(cfs_rq, 0);
/* Propagate pending load changes to the parent, if any: */ /* Propagate pending load changes to the parent, if any: */
...@@ -6990,7 +7109,7 @@ static inline void update_blocked_averages(int cpu) ...@@ -6990,7 +7109,7 @@ static inline void update_blocked_averages(int cpu)
rq_lock_irqsave(rq, &rf); rq_lock_irqsave(rq, &rf);
update_rq_clock(rq); update_rq_clock(rq);
update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true); update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
rq_unlock_irqrestore(rq, &rf); rq_unlock_irqrestore(rq, &rf);
} }
...@@ -7036,6 +7155,7 @@ struct sg_lb_stats { ...@@ -7036,6 +7155,7 @@ struct sg_lb_stats {
struct sd_lb_stats { struct sd_lb_stats {
struct sched_group *busiest; /* Busiest group in this sd */ struct sched_group *busiest; /* Busiest group in this sd */
struct sched_group *local; /* Local group in this sd */ struct sched_group *local; /* Local group in this sd */
unsigned long total_running;
unsigned long total_load; /* Total load of all groups in sd */ unsigned long total_load; /* Total load of all groups in sd */
unsigned long total_capacity; /* Total capacity of all groups in sd */ unsigned long total_capacity; /* Total capacity of all groups in sd */
unsigned long avg_load; /* Average load across all groups in sd */ unsigned long avg_load; /* Average load across all groups in sd */
...@@ -7055,6 +7175,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds) ...@@ -7055,6 +7175,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
*sds = (struct sd_lb_stats){ *sds = (struct sd_lb_stats){
.busiest = NULL, .busiest = NULL,
.local = NULL, .local = NULL,
.total_running = 0UL,
.total_load = 0UL, .total_load = 0UL,
.total_capacity = 0UL, .total_capacity = 0UL,
.busiest_stat = { .busiest_stat = {
...@@ -7363,7 +7484,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, ...@@ -7363,7 +7484,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->nr_numa_running += rq->nr_numa_running; sgs->nr_numa_running += rq->nr_numa_running;
sgs->nr_preferred_running += rq->nr_preferred_running; sgs->nr_preferred_running += rq->nr_preferred_running;
#endif #endif
sgs->sum_weighted_load += weighted_cpuload(i); sgs->sum_weighted_load += weighted_cpuload(rq);
/* /*
* No need to call idle_cpu() if nr_running is not 0 * No need to call idle_cpu() if nr_running is not 0
*/ */
...@@ -7490,6 +7611,7 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq) ...@@ -7490,6 +7611,7 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq)
*/ */
static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds) static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
{ {
struct sched_domain_shared *shared = env->sd->shared;
struct sched_domain *child = env->sd->child; struct sched_domain *child = env->sd->child;
struct sched_group *sg = env->sd->groups; struct sched_group *sg = env->sd->groups;
struct sg_lb_stats *local = &sds->local_stat; struct sg_lb_stats *local = &sds->local_stat;
...@@ -7546,6 +7668,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd ...@@ -7546,6 +7668,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
next_group: next_group:
/* Now, start updating sd_lb_stats */ /* Now, start updating sd_lb_stats */
sds->total_running += sgs->sum_nr_running;
sds->total_load += sgs->group_load; sds->total_load += sgs->group_load;
sds->total_capacity += sgs->group_capacity; sds->total_capacity += sgs->group_capacity;
...@@ -7561,6 +7684,21 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd ...@@ -7561,6 +7684,21 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
env->dst_rq->rd->overload = overload; env->dst_rq->rd->overload = overload;
} }
if (!shared)
return;
/*
* Since these are sums over groups they can contain some CPUs
* multiple times for the NUMA domains.
*
* Currently only wake_affine_llc() and find_busiest_group()
* use these numbers; only the latter is affected by this problem.
*
* XXX fix that.
*/
WRITE_ONCE(shared->nr_running, sds->total_running);
WRITE_ONCE(shared->load, sds->total_load);
WRITE_ONCE(shared->capacity, sds->total_capacity);
} }
/** /**
...@@ -7790,6 +7928,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) ...@@ -7790,6 +7928,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
if (!sds.busiest || busiest->sum_nr_running == 0) if (!sds.busiest || busiest->sum_nr_running == 0)
goto out_balanced; goto out_balanced;
/* XXX broken for overlapping NUMA groups */
sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load) sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
/ sds.total_capacity; / sds.total_capacity;
...@@ -7892,7 +8031,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, ...@@ -7892,7 +8031,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
capacity = capacity_of(i); capacity = capacity_of(i);
wl = weighted_cpuload(i); wl = weighted_cpuload(rq);
/* /*
* When comparing with imbalance, use weighted_cpuload() * When comparing with imbalance, use weighted_cpuload()
......
...@@ -1120,11 +1120,15 @@ extern int group_balance_cpu(struct sched_group *sg); ...@@ -1120,11 +1120,15 @@ extern int group_balance_cpu(struct sched_group *sg);
#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
void register_sched_domain_sysctl(void); void register_sched_domain_sysctl(void);
void dirty_sched_domain_sysctl(int cpu);
void unregister_sched_domain_sysctl(void); void unregister_sched_domain_sysctl(void);
#else #else
static inline void register_sched_domain_sysctl(void) static inline void register_sched_domain_sysctl(void)
{ {
} }
/* Empty stand-in for the #else case of the surrounding #if: does nothing. */
static inline void dirty_sched_domain_sysctl(int cpu) { }
static inline void unregister_sched_domain_sysctl(void) static inline void unregister_sched_domain_sysctl(void)
{ {
} }
......
...@@ -261,8 +261,6 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd) ...@@ -261,8 +261,6 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd)
static int init_rootdomain(struct root_domain *rd) static int init_rootdomain(struct root_domain *rd)
{ {
memset(rd, 0, sizeof(*rd));
if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL)) if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL))
goto out; goto out;
if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL)) if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL))
...@@ -311,7 +309,7 @@ static struct root_domain *alloc_rootdomain(void) ...@@ -311,7 +309,7 @@ static struct root_domain *alloc_rootdomain(void)
{ {
struct root_domain *rd; struct root_domain *rd;
rd = kmalloc(sizeof(*rd), GFP_KERNEL); rd = kzalloc(sizeof(*rd), GFP_KERNEL);
if (!rd) if (!rd)
return NULL; return NULL;
...@@ -337,6 +335,7 @@ static void free_sched_groups(struct sched_group *sg, int free_sgc) ...@@ -337,6 +335,7 @@ static void free_sched_groups(struct sched_group *sg, int free_sgc)
if (free_sgc && atomic_dec_and_test(&sg->sgc->ref)) if (free_sgc && atomic_dec_and_test(&sg->sgc->ref))
kfree(sg->sgc); kfree(sg->sgc);
if (atomic_dec_and_test(&sg->ref))
kfree(sg); kfree(sg);
sg = tmp; sg = tmp;
} while (sg != first); } while (sg != first);
...@@ -345,15 +344,12 @@ static void free_sched_groups(struct sched_group *sg, int free_sgc) ...@@ -345,15 +344,12 @@ static void free_sched_groups(struct sched_group *sg, int free_sgc)
static void destroy_sched_domain(struct sched_domain *sd) static void destroy_sched_domain(struct sched_domain *sd)
{ {
/* /*
* If its an overlapping domain it has private groups, iterate and * A normal sched domain may have multiple group references, an
* nuke them all. * overlapping domain, having private groups, only one. Iterate,
* dropping group/capacity references, freeing where none remain.
*/ */
if (sd->flags & SD_OVERLAP) {
free_sched_groups(sd->groups, 1); free_sched_groups(sd->groups, 1);
} else if (atomic_dec_and_test(&sd->groups->ref)) {
kfree(sd->groups->sgc);
kfree(sd->groups);
}
if (sd->shared && atomic_dec_and_test(&sd->shared->ref)) if (sd->shared && atomic_dec_and_test(&sd->shared->ref))
kfree(sd->shared); kfree(sd->shared);
kfree(sd); kfree(sd);
...@@ -463,6 +459,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) ...@@ -463,6 +459,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
rq_attach_root(rq, rd); rq_attach_root(rq, rd);
tmp = rq->sd; tmp = rq->sd;
rcu_assign_pointer(rq->sd, sd); rcu_assign_pointer(rq->sd, sd);
dirty_sched_domain_sysctl(cpu);
destroy_sched_domains(tmp); destroy_sched_domains(tmp);
update_top_cache_domain(cpu); update_top_cache_domain(cpu);
...@@ -670,6 +667,7 @@ build_group_from_child_sched_domain(struct sched_domain *sd, int cpu) ...@@ -670,6 +667,7 @@ build_group_from_child_sched_domain(struct sched_domain *sd, int cpu)
else else
cpumask_copy(sg_span, sched_domain_span(sd)); cpumask_copy(sg_span, sched_domain_span(sd));
atomic_inc(&sg->ref);
return sg; return sg;
} }
...@@ -1595,7 +1593,7 @@ static void __sdt_free(const struct cpumask *cpu_map) ...@@ -1595,7 +1593,7 @@ static void __sdt_free(const struct cpumask *cpu_map)
} }
} }
struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, static struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
const struct cpumask *cpu_map, struct sched_domain_attr *attr, const struct cpumask *cpu_map, struct sched_domain_attr *attr,
struct sched_domain *child, int cpu) struct sched_domain *child, int cpu)
{ {
...@@ -1854,7 +1852,17 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], ...@@ -1854,7 +1852,17 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
/* Let the architecture update CPU core mappings: */ /* Let the architecture update CPU core mappings: */
new_topology = arch_update_cpu_topology(); new_topology = arch_update_cpu_topology();
n = doms_new ? ndoms_new : 0; if (!doms_new) {
WARN_ON_ONCE(dattr_new);
n = 0;
doms_new = alloc_sched_domains(1);
if (doms_new) {
n = 1;
cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
}
} else {
n = ndoms_new;
}
/* Destroy deleted domains: */ /* Destroy deleted domains: */
for (i = 0; i < ndoms_cur; i++) { for (i = 0; i < ndoms_cur; i++) {
...@@ -1870,11 +1878,10 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], ...@@ -1870,11 +1878,10 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
} }
n = ndoms_cur; n = ndoms_cur;
if (doms_new == NULL) { if (!doms_new) {
n = 0; n = 0;
doms_new = &fallback_doms; doms_new = &fallback_doms;
cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
WARN_ON_ONCE(dattr_new);
} }
/* Build new domains: */ /* Build new domains: */
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment