Commit cca08cd6 authored by Linus Torvalds's avatar Linus Torvalds

Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Ingo Molnar:

 - introduce and use task_rcu_dereference()/try_get_task_struct() to fix
   and generalize task_struct handling (Oleg Nesterov)

 - do various per entity load tracking (PELT) fixes and optimizations
   (Peter Zijlstra)

 - cputime virt-steal time accounting enhancements/fixes (Wanpeng Li)

 - introduce consolidated cputime output file cpuacct.usage_all and
   related refactorings (Zhao Lei)

 - ... plus misc fixes and enhancements

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/core: Panic on scheduling while atomic bugs if kernel.panic_on_warn is set
  sched/cpuacct: Introduce cpuacct.usage_all to show all CPU stats together
  sched/cpuacct: Use loop to consolidate code in cpuacct_stats_show()
  sched/cpuacct: Merge cpuacct_usage_index and cpuacct_stat_index enums
  sched/fair: Rework throttle_count sync
  sched/core: Fix sched_getaffinity() return value kerneldoc comment
  sched/fair: Reorder cgroup creation code
  sched/fair: Apply more PELT fixes
  sched/fair: Fix PELT integrity for new tasks
  sched/cgroup: Fix cpu_cgroup_fork() handling
  sched/fair: Fix PELT integrity for new groups
  sched/fair: Fix and optimize the fork() path
  sched/cputime: Add steal time support to full dynticks CPU time accounting
  sched/cputime: Fix prev steal time accouting during CPU hotplug
  KVM: Fix steal clock warp during guest CPU hotplug
  sched/debug: Always show 'nr_migrations'
  sched/fair: Use task_rcu_dereference()
  sched/api: Introduce task_rcu_dereference() and try_get_task_struct()
  sched/idle: Optimize the generic idle loop
  sched/fair: Fix the wrong throttled clock time for cfs_rq_clock_task()
parents 7e4dc77b 748c7201
...@@ -301,8 +301,6 @@ static void kvm_register_steal_time(void) ...@@ -301,8 +301,6 @@ static void kvm_register_steal_time(void)
if (!has_steal_clock) if (!has_steal_clock)
return; return;
memset(st, 0, sizeof(*st));
wrmsrl(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED)); wrmsrl(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED));
pr_info("kvm-stealtime: cpu %d, msr %llx\n", pr_info("kvm-stealtime: cpu %d, msr %llx\n",
cpu, (unsigned long long) slow_virt_to_phys(st)); cpu, (unsigned long long) slow_virt_to_phys(st));
......
...@@ -219,9 +219,10 @@ extern void proc_sched_set_task(struct task_struct *p); ...@@ -219,9 +219,10 @@ extern void proc_sched_set_task(struct task_struct *p);
#define TASK_WAKING 256 #define TASK_WAKING 256
#define TASK_PARKED 512 #define TASK_PARKED 512
#define TASK_NOLOAD 1024 #define TASK_NOLOAD 1024
#define TASK_STATE_MAX 2048 #define TASK_NEW 2048
#define TASK_STATE_MAX 4096
#define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWPN" #define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWPNn"
extern char ___assert_task_state[1 - 2*!!( extern char ___assert_task_state[1 - 2*!!(
sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1)]; sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1)];
...@@ -2139,6 +2140,9 @@ static inline void put_task_struct(struct task_struct *t) ...@@ -2139,6 +2140,9 @@ static inline void put_task_struct(struct task_struct *t)
__put_task_struct(t); __put_task_struct(t);
} }
struct task_struct *task_rcu_dereference(struct task_struct **ptask);
struct task_struct *try_get_task_struct(struct task_struct **ptask);
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
extern void task_cputime(struct task_struct *t, extern void task_cputime(struct task_struct *t,
cputime_t *utime, cputime_t *stime); cputime_t *utime, cputime_t *stime);
......
...@@ -210,6 +210,82 @@ void release_task(struct task_struct *p) ...@@ -210,6 +210,82 @@ void release_task(struct task_struct *p)
goto repeat; goto repeat;
} }
/*
* Note that if this function returns a valid task_struct pointer (!NULL)
* task->usage must remain >0 for the duration of the RCU critical section.
*/
struct task_struct *task_rcu_dereference(struct task_struct **ptask)
{
struct sighand_struct *sighand;
struct task_struct *task;
/*
* We need to verify that release_task() was not called and thus
* delayed_put_task_struct() can't run and drop the last reference
* before rcu_read_unlock(). We check task->sighand != NULL,
* but we can read the already freed and reused memory.
*/
retry:
task = rcu_dereference(*ptask);
if (!task)
return NULL;
probe_kernel_address(&task->sighand, sighand);
/*
* Pairs with atomic_dec_and_test() in put_task_struct(). If this task
* was already freed we can not miss the preceding update of this
* pointer.
*/
smp_rmb();
if (unlikely(task != READ_ONCE(*ptask)))
goto retry;
/*
* We've re-checked that "task == *ptask", now we have two different
* cases:
*
* 1. This is actually the same task/task_struct. In this case
* sighand != NULL tells us it is still alive.
*
* 2. This is another task which got the same memory for task_struct.
* We can't know this of course, and we can not trust
* sighand != NULL.
*
* In this case we actually return a random value, but this is
* correct.
*
* If we return NULL - we can pretend that we actually noticed that
* *ptask was updated when the previous task has exited. Or pretend
* that probe_slab_address(&sighand) reads NULL.
*
* If we return the new task (because sighand is not NULL for any
* reason) - this is fine too. This (new) task can't go away before
* another gp pass.
*
* And note: We could even eliminate the false positive if re-read
* task->sighand once again to avoid the falsely NULL. But this case
* is very unlikely so we don't care.
*/
if (!sighand)
return NULL;
return task;
}
struct task_struct *try_get_task_struct(struct task_struct **ptask)
{
struct task_struct *task;
rcu_read_lock();
task = task_rcu_dereference(ptask);
if (task)
get_task_struct(task);
rcu_read_unlock();
return task;
}
/* /*
* Determine if a process group is "orphaned", according to the POSIX * Determine if a process group is "orphaned", according to the POSIX
* definition in 2.2.2.52. Orphaned process groups are not to be affected * definition in 2.2.2.52. Orphaned process groups are not to be affected
......
...@@ -2342,11 +2342,11 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) ...@@ -2342,11 +2342,11 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
__sched_fork(clone_flags, p); __sched_fork(clone_flags, p);
/* /*
* We mark the process as running here. This guarantees that * We mark the process as NEW here. This guarantees that
* nobody will actually run it, and a signal or other external * nobody will actually run it, and a signal or other external
* event cannot wake it up and insert it on the runqueue either. * event cannot wake it up and insert it on the runqueue either.
*/ */
p->state = TASK_RUNNING; p->state = TASK_NEW;
/* /*
* Make sure we do not leak PI boosting priority to the child. * Make sure we do not leak PI boosting priority to the child.
...@@ -2383,8 +2383,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) ...@@ -2383,8 +2383,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
p->sched_class = &fair_sched_class; p->sched_class = &fair_sched_class;
} }
if (p->sched_class->task_fork) init_entity_runnable_average(&p->se);
p->sched_class->task_fork(p);
/* /*
* The child is not yet in the pid-hash so no cgroup attach races, * The child is not yet in the pid-hash so no cgroup attach races,
...@@ -2394,7 +2393,13 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) ...@@ -2394,7 +2393,13 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
* Silence PROVE_RCU. * Silence PROVE_RCU.
*/ */
raw_spin_lock_irqsave(&p->pi_lock, flags); raw_spin_lock_irqsave(&p->pi_lock, flags);
set_task_cpu(p, cpu); /*
* We're setting the cpu for the first time, we don't migrate,
* so use __set_task_cpu().
*/
__set_task_cpu(p, cpu);
if (p->sched_class->task_fork)
p->sched_class->task_fork(p);
raw_spin_unlock_irqrestore(&p->pi_lock, flags); raw_spin_unlock_irqrestore(&p->pi_lock, flags);
#ifdef CONFIG_SCHED_INFO #ifdef CONFIG_SCHED_INFO
...@@ -2526,16 +2531,18 @@ void wake_up_new_task(struct task_struct *p) ...@@ -2526,16 +2531,18 @@ void wake_up_new_task(struct task_struct *p)
struct rq_flags rf; struct rq_flags rf;
struct rq *rq; struct rq *rq;
/* Initialize new task's runnable average */
init_entity_runnable_average(&p->se);
raw_spin_lock_irqsave(&p->pi_lock, rf.flags); raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
p->state = TASK_RUNNING;
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
/* /*
* Fork balancing, do it here and not earlier because: * Fork balancing, do it here and not earlier because:
* - cpus_allowed can change in the fork path * - cpus_allowed can change in the fork path
* - any previously selected cpu might disappear through hotplug * - any previously selected cpu might disappear through hotplug
*
* Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
* as we're not fully set-up yet.
*/ */
set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
#endif #endif
rq = __task_rq_lock(p, &rf); rq = __task_rq_lock(p, &rf);
post_init_entity_util_avg(&p->se); post_init_entity_util_avg(&p->se);
...@@ -3161,6 +3168,9 @@ static noinline void __schedule_bug(struct task_struct *prev) ...@@ -3161,6 +3168,9 @@ static noinline void __schedule_bug(struct task_struct *prev)
pr_cont("\n"); pr_cont("\n");
} }
#endif #endif
if (panic_on_warn)
panic("scheduling while atomic\n");
dump_stack(); dump_stack();
add_taint(TAINT_WARN, LOCKDEP_STILL_OK); add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
} }
...@@ -4752,7 +4762,8 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) ...@@ -4752,7 +4762,8 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
* @len: length in bytes of the bitmask pointed to by user_mask_ptr * @len: length in bytes of the bitmask pointed to by user_mask_ptr
* @user_mask_ptr: user-space pointer to hold the current cpu mask * @user_mask_ptr: user-space pointer to hold the current cpu mask
* *
* Return: 0 on success. An error code otherwise. * Return: size of CPU mask copied to user_mask_ptr on success. An
* error code otherwise.
*/ */
SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
unsigned long __user *, user_mask_ptr) unsigned long __user *, user_mask_ptr)
...@@ -7233,7 +7244,6 @@ static void sched_rq_cpu_starting(unsigned int cpu) ...@@ -7233,7 +7244,6 @@ static void sched_rq_cpu_starting(unsigned int cpu)
struct rq *rq = cpu_rq(cpu); struct rq *rq = cpu_rq(cpu);
rq->calc_load_update = calc_load_update; rq->calc_load_update = calc_load_update;
account_reset_rq(rq);
update_max_interval(); update_max_interval();
} }
...@@ -7713,6 +7723,8 @@ void sched_online_group(struct task_group *tg, struct task_group *parent) ...@@ -7713,6 +7723,8 @@ void sched_online_group(struct task_group *tg, struct task_group *parent)
INIT_LIST_HEAD(&tg->children); INIT_LIST_HEAD(&tg->children);
list_add_rcu(&tg->siblings, &parent->children); list_add_rcu(&tg->siblings, &parent->children);
spin_unlock_irqrestore(&task_group_lock, flags); spin_unlock_irqrestore(&task_group_lock, flags);
online_fair_sched_group(tg);
} }
/* rcu callback to free various structures associated with a task group */ /* rcu callback to free various structures associated with a task group */
...@@ -7741,27 +7753,9 @@ void sched_offline_group(struct task_group *tg) ...@@ -7741,27 +7753,9 @@ void sched_offline_group(struct task_group *tg)
spin_unlock_irqrestore(&task_group_lock, flags); spin_unlock_irqrestore(&task_group_lock, flags);
} }
/* change task's runqueue when it moves between groups. static void sched_change_group(struct task_struct *tsk, int type)
* The caller of this function should have put the task in its new group
* by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to
* reflect its new group.
*/
void sched_move_task(struct task_struct *tsk)
{ {
struct task_group *tg; struct task_group *tg;
int queued, running;
struct rq_flags rf;
struct rq *rq;
rq = task_rq_lock(tsk, &rf);
running = task_current(rq, tsk);
queued = task_on_rq_queued(tsk);
if (queued)
dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE);
if (unlikely(running))
put_prev_task(rq, tsk);
/* /*
* All callers are synchronized by task_rq_lock(); we do not use RCU * All callers are synchronized by task_rq_lock(); we do not use RCU
...@@ -7774,11 +7768,37 @@ void sched_move_task(struct task_struct *tsk) ...@@ -7774,11 +7768,37 @@ void sched_move_task(struct task_struct *tsk)
tsk->sched_task_group = tg; tsk->sched_task_group = tg;
#ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_FAIR_GROUP_SCHED
if (tsk->sched_class->task_move_group) if (tsk->sched_class->task_change_group)
tsk->sched_class->task_move_group(tsk); tsk->sched_class->task_change_group(tsk, type);
else else
#endif #endif
set_task_rq(tsk, task_cpu(tsk)); set_task_rq(tsk, task_cpu(tsk));
}
/*
* Change task's runqueue when it moves between groups.
*
* The caller of this function should have put the task in its new group by
* now. This function just updates tsk->se.cfs_rq and tsk->se.parent to reflect
* its new group.
*/
void sched_move_task(struct task_struct *tsk)
{
int queued, running;
struct rq_flags rf;
struct rq *rq;
rq = task_rq_lock(tsk, &rf);
running = task_current(rq, tsk);
queued = task_on_rq_queued(tsk);
if (queued)
dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE);
if (unlikely(running))
put_prev_task(rq, tsk);
sched_change_group(tsk, TASK_MOVE_GROUP);
if (unlikely(running)) if (unlikely(running))
tsk->sched_class->set_curr_task(rq); tsk->sched_class->set_curr_task(rq);
...@@ -8206,15 +8226,27 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) ...@@ -8206,15 +8226,27 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
sched_free_group(tg); sched_free_group(tg);
} }
/*
* This is called before wake_up_new_task(), therefore we really only
* have to set its group bits, all the other stuff does not apply.
*/
static void cpu_cgroup_fork(struct task_struct *task) static void cpu_cgroup_fork(struct task_struct *task)
{ {
sched_move_task(task); struct rq_flags rf;
struct rq *rq;
rq = task_rq_lock(task, &rf);
sched_change_group(task, TASK_SET_GROUP);
task_rq_unlock(rq, task, &rf);
} }
static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
{ {
struct task_struct *task; struct task_struct *task;
struct cgroup_subsys_state *css; struct cgroup_subsys_state *css;
int ret = 0;
cgroup_taskset_for_each(task, css, tset) { cgroup_taskset_for_each(task, css, tset) {
#ifdef CONFIG_RT_GROUP_SCHED #ifdef CONFIG_RT_GROUP_SCHED
...@@ -8225,8 +8257,24 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) ...@@ -8225,8 +8257,24 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
if (task->sched_class != &fair_sched_class) if (task->sched_class != &fair_sched_class)
return -EINVAL; return -EINVAL;
#endif #endif
/*
* Serialize against wake_up_new_task() such that if its
* running, we're sure to observe its full state.
*/
raw_spin_lock_irq(&task->pi_lock);
/*
* Avoid calling sched_move_task() before wake_up_new_task()
* has happened. This would lead to problems with PELT, due to
* move wanting to detach+attach while we're not attached yet.
*/
if (task->state == TASK_NEW)
ret = -EINVAL;
raw_spin_unlock_irq(&task->pi_lock);
if (ret)
break;
} }
return 0; return ret;
} }
static void cpu_cgroup_attach(struct cgroup_taskset *tset) static void cpu_cgroup_attach(struct cgroup_taskset *tset)
......
...@@ -25,15 +25,13 @@ enum cpuacct_stat_index { ...@@ -25,15 +25,13 @@ enum cpuacct_stat_index {
CPUACCT_STAT_NSTATS, CPUACCT_STAT_NSTATS,
}; };
enum cpuacct_usage_index { static const char * const cpuacct_stat_desc[] = {
CPUACCT_USAGE_USER, /* ... user mode */ [CPUACCT_STAT_USER] = "user",
CPUACCT_USAGE_SYSTEM, /* ... kernel mode */ [CPUACCT_STAT_SYSTEM] = "system",
CPUACCT_USAGE_NRUSAGE,
}; };
struct cpuacct_usage { struct cpuacct_usage {
u64 usages[CPUACCT_USAGE_NRUSAGE]; u64 usages[CPUACCT_STAT_NSTATS];
}; };
/* track cpu usage of a group of tasks and its child groups */ /* track cpu usage of a group of tasks and its child groups */
...@@ -108,16 +106,16 @@ static void cpuacct_css_free(struct cgroup_subsys_state *css) ...@@ -108,16 +106,16 @@ static void cpuacct_css_free(struct cgroup_subsys_state *css)
} }
static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu, static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu,
enum cpuacct_usage_index index) enum cpuacct_stat_index index)
{ {
struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
u64 data; u64 data;
/* /*
* We allow index == CPUACCT_USAGE_NRUSAGE here to read * We allow index == CPUACCT_STAT_NSTATS here to read
* the sum of suages. * the sum of suages.
*/ */
BUG_ON(index > CPUACCT_USAGE_NRUSAGE); BUG_ON(index > CPUACCT_STAT_NSTATS);
#ifndef CONFIG_64BIT #ifndef CONFIG_64BIT
/* /*
...@@ -126,11 +124,11 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu, ...@@ -126,11 +124,11 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu,
raw_spin_lock_irq(&cpu_rq(cpu)->lock); raw_spin_lock_irq(&cpu_rq(cpu)->lock);
#endif #endif
if (index == CPUACCT_USAGE_NRUSAGE) { if (index == CPUACCT_STAT_NSTATS) {
int i = 0; int i = 0;
data = 0; data = 0;
for (i = 0; i < CPUACCT_USAGE_NRUSAGE; i++) for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
data += cpuusage->usages[i]; data += cpuusage->usages[i];
} else { } else {
data = cpuusage->usages[index]; data = cpuusage->usages[index];
...@@ -155,7 +153,7 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) ...@@ -155,7 +153,7 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
raw_spin_lock_irq(&cpu_rq(cpu)->lock); raw_spin_lock_irq(&cpu_rq(cpu)->lock);
#endif #endif
for (i = 0; i < CPUACCT_USAGE_NRUSAGE; i++) for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
cpuusage->usages[i] = val; cpuusage->usages[i] = val;
#ifndef CONFIG_64BIT #ifndef CONFIG_64BIT
...@@ -165,7 +163,7 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) ...@@ -165,7 +163,7 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
/* return total cpu usage (in nanoseconds) of a group */ /* return total cpu usage (in nanoseconds) of a group */
static u64 __cpuusage_read(struct cgroup_subsys_state *css, static u64 __cpuusage_read(struct cgroup_subsys_state *css,
enum cpuacct_usage_index index) enum cpuacct_stat_index index)
{ {
struct cpuacct *ca = css_ca(css); struct cpuacct *ca = css_ca(css);
u64 totalcpuusage = 0; u64 totalcpuusage = 0;
...@@ -180,18 +178,18 @@ static u64 __cpuusage_read(struct cgroup_subsys_state *css, ...@@ -180,18 +178,18 @@ static u64 __cpuusage_read(struct cgroup_subsys_state *css,
static u64 cpuusage_user_read(struct cgroup_subsys_state *css, static u64 cpuusage_user_read(struct cgroup_subsys_state *css,
struct cftype *cft) struct cftype *cft)
{ {
return __cpuusage_read(css, CPUACCT_USAGE_USER); return __cpuusage_read(css, CPUACCT_STAT_USER);
} }
static u64 cpuusage_sys_read(struct cgroup_subsys_state *css, static u64 cpuusage_sys_read(struct cgroup_subsys_state *css,
struct cftype *cft) struct cftype *cft)
{ {
return __cpuusage_read(css, CPUACCT_USAGE_SYSTEM); return __cpuusage_read(css, CPUACCT_STAT_SYSTEM);
} }
static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft) static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft)
{ {
return __cpuusage_read(css, CPUACCT_USAGE_NRUSAGE); return __cpuusage_read(css, CPUACCT_STAT_NSTATS);
} }
static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft, static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft,
...@@ -213,7 +211,7 @@ static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft, ...@@ -213,7 +211,7 @@ static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft,
} }
static int __cpuacct_percpu_seq_show(struct seq_file *m, static int __cpuacct_percpu_seq_show(struct seq_file *m,
enum cpuacct_usage_index index) enum cpuacct_stat_index index)
{ {
struct cpuacct *ca = css_ca(seq_css(m)); struct cpuacct *ca = css_ca(seq_css(m));
u64 percpu; u64 percpu;
...@@ -229,48 +227,78 @@ static int __cpuacct_percpu_seq_show(struct seq_file *m, ...@@ -229,48 +227,78 @@ static int __cpuacct_percpu_seq_show(struct seq_file *m,
static int cpuacct_percpu_user_seq_show(struct seq_file *m, void *V) static int cpuacct_percpu_user_seq_show(struct seq_file *m, void *V)
{ {
return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_USER); return __cpuacct_percpu_seq_show(m, CPUACCT_STAT_USER);
} }
static int cpuacct_percpu_sys_seq_show(struct seq_file *m, void *V) static int cpuacct_percpu_sys_seq_show(struct seq_file *m, void *V)
{ {
return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_SYSTEM); return __cpuacct_percpu_seq_show(m, CPUACCT_STAT_SYSTEM);
} }
static int cpuacct_percpu_seq_show(struct seq_file *m, void *V) static int cpuacct_percpu_seq_show(struct seq_file *m, void *V)
{ {
return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_NRUSAGE); return __cpuacct_percpu_seq_show(m, CPUACCT_STAT_NSTATS);
} }
static const char * const cpuacct_stat_desc[] = { static int cpuacct_all_seq_show(struct seq_file *m, void *V)
[CPUACCT_STAT_USER] = "user", {
[CPUACCT_STAT_SYSTEM] = "system", struct cpuacct *ca = css_ca(seq_css(m));
}; int index;
int cpu;
seq_puts(m, "cpu");
for (index = 0; index < CPUACCT_STAT_NSTATS; index++)
seq_printf(m, " %s", cpuacct_stat_desc[index]);
seq_puts(m, "\n");
for_each_possible_cpu(cpu) {
struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
seq_printf(m, "%d", cpu);
for (index = 0; index < CPUACCT_STAT_NSTATS; index++) {
#ifndef CONFIG_64BIT
/*
* Take rq->lock to make 64-bit read safe on 32-bit
* platforms.
*/
raw_spin_lock_irq(&cpu_rq(cpu)->lock);
#endif
seq_printf(m, " %llu", cpuusage->usages[index]);
#ifndef CONFIG_64BIT
raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
#endif
}
seq_puts(m, "\n");
}
return 0;
}
static int cpuacct_stats_show(struct seq_file *sf, void *v) static int cpuacct_stats_show(struct seq_file *sf, void *v)
{ {
struct cpuacct *ca = css_ca(seq_css(sf)); struct cpuacct *ca = css_ca(seq_css(sf));
s64 val[CPUACCT_STAT_NSTATS];
int cpu; int cpu;
s64 val = 0; int stat;
memset(val, 0, sizeof(val));
for_each_possible_cpu(cpu) { for_each_possible_cpu(cpu) {
struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat;
val += kcpustat->cpustat[CPUTIME_USER];
val += kcpustat->cpustat[CPUTIME_NICE];
}
val = cputime64_to_clock_t(val);
seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_USER], val);
val = 0; val[CPUACCT_STAT_USER] += cpustat[CPUTIME_USER];
for_each_possible_cpu(cpu) { val[CPUACCT_STAT_USER] += cpustat[CPUTIME_NICE];
struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SYSTEM];
val += kcpustat->cpustat[CPUTIME_SYSTEM]; val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_IRQ];
val += kcpustat->cpustat[CPUTIME_IRQ]; val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SOFTIRQ];
val += kcpustat->cpustat[CPUTIME_SOFTIRQ];
} }
val = cputime64_to_clock_t(val); for (stat = 0; stat < CPUACCT_STAT_NSTATS; stat++) {
seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); seq_printf(sf, "%s %lld\n",
cpuacct_stat_desc[stat],
cputime64_to_clock_t(val[stat]));
}
return 0; return 0;
} }
...@@ -301,6 +329,10 @@ static struct cftype files[] = { ...@@ -301,6 +329,10 @@ static struct cftype files[] = {
.name = "usage_percpu_sys", .name = "usage_percpu_sys",
.seq_show = cpuacct_percpu_sys_seq_show, .seq_show = cpuacct_percpu_sys_seq_show,
}, },
{
.name = "usage_all",
.seq_show = cpuacct_all_seq_show,
},
{ {
.name = "stat", .name = "stat",
.seq_show = cpuacct_stats_show, .seq_show = cpuacct_stats_show,
...@@ -316,11 +348,11 @@ static struct cftype files[] = { ...@@ -316,11 +348,11 @@ static struct cftype files[] = {
void cpuacct_charge(struct task_struct *tsk, u64 cputime) void cpuacct_charge(struct task_struct *tsk, u64 cputime)
{ {
struct cpuacct *ca; struct cpuacct *ca;
int index = CPUACCT_USAGE_SYSTEM; int index = CPUACCT_STAT_SYSTEM;
struct pt_regs *regs = task_pt_regs(tsk); struct pt_regs *regs = task_pt_regs(tsk);
if (regs && user_mode(regs)) if (regs && user_mode(regs))
index = CPUACCT_USAGE_USER; index = CPUACCT_STAT_USER;
rcu_read_lock(); rcu_read_lock();
......
...@@ -257,7 +257,7 @@ void account_idle_time(cputime_t cputime) ...@@ -257,7 +257,7 @@ void account_idle_time(cputime_t cputime)
cpustat[CPUTIME_IDLE] += (__force u64) cputime; cpustat[CPUTIME_IDLE] += (__force u64) cputime;
} }
static __always_inline bool steal_account_process_tick(void) static __always_inline unsigned long steal_account_process_tick(unsigned long max_jiffies)
{ {
#ifdef CONFIG_PARAVIRT #ifdef CONFIG_PARAVIRT
if (static_key_false(&paravirt_steal_enabled)) { if (static_key_false(&paravirt_steal_enabled)) {
...@@ -272,14 +272,14 @@ static __always_inline bool steal_account_process_tick(void) ...@@ -272,14 +272,14 @@ static __always_inline bool steal_account_process_tick(void)
* time in jiffies. Lets cast the result to jiffies * time in jiffies. Lets cast the result to jiffies
* granularity and account the rest on the next rounds. * granularity and account the rest on the next rounds.
*/ */
steal_jiffies = nsecs_to_jiffies(steal); steal_jiffies = min(nsecs_to_jiffies(steal), max_jiffies);
this_rq()->prev_steal_time += jiffies_to_nsecs(steal_jiffies); this_rq()->prev_steal_time += jiffies_to_nsecs(steal_jiffies);
account_steal_time(jiffies_to_cputime(steal_jiffies)); account_steal_time(jiffies_to_cputime(steal_jiffies));
return steal_jiffies; return steal_jiffies;
} }
#endif #endif
return false; return 0;
} }
/* /*
...@@ -346,7 +346,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, ...@@ -346,7 +346,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
u64 cputime = (__force u64) cputime_one_jiffy; u64 cputime = (__force u64) cputime_one_jiffy;
u64 *cpustat = kcpustat_this_cpu->cpustat; u64 *cpustat = kcpustat_this_cpu->cpustat;
if (steal_account_process_tick()) if (steal_account_process_tick(ULONG_MAX))
return; return;
cputime *= ticks; cputime *= ticks;
...@@ -477,7 +477,7 @@ void account_process_tick(struct task_struct *p, int user_tick) ...@@ -477,7 +477,7 @@ void account_process_tick(struct task_struct *p, int user_tick)
return; return;
} }
if (steal_account_process_tick()) if (steal_account_process_tick(ULONG_MAX))
return; return;
if (user_tick) if (user_tick)
...@@ -681,12 +681,14 @@ static cputime_t vtime_delta(struct task_struct *tsk) ...@@ -681,12 +681,14 @@ static cputime_t vtime_delta(struct task_struct *tsk)
static cputime_t get_vtime_delta(struct task_struct *tsk) static cputime_t get_vtime_delta(struct task_struct *tsk)
{ {
unsigned long now = READ_ONCE(jiffies); unsigned long now = READ_ONCE(jiffies);
unsigned long delta = now - tsk->vtime_snap; unsigned long delta_jiffies, steal_jiffies;
delta_jiffies = now - tsk->vtime_snap;
steal_jiffies = steal_account_process_tick(delta_jiffies);
WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE); WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE);
tsk->vtime_snap = now; tsk->vtime_snap = now;
return jiffies_to_cputime(delta); return jiffies_to_cputime(delta_jiffies - steal_jiffies);
} }
static void __vtime_account_system(struct task_struct *tsk) static void __vtime_account_system(struct task_struct *tsk)
......
...@@ -879,9 +879,9 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) ...@@ -879,9 +879,9 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
nr_switches = p->nvcsw + p->nivcsw; nr_switches = p->nvcsw + p->nivcsw;
#ifdef CONFIG_SCHEDSTATS
P(se.nr_migrations); P(se.nr_migrations);
#ifdef CONFIG_SCHEDSTATS
if (schedstat_enabled()) { if (schedstat_enabled()) {
u64 avg_atom, avg_per_cpu; u64 avg_atom, avg_per_cpu;
......
...@@ -690,6 +690,11 @@ void init_entity_runnable_average(struct sched_entity *se) ...@@ -690,6 +690,11 @@ void init_entity_runnable_average(struct sched_entity *se)
/* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
} }
static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
static int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq);
static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force);
static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se);
/* /*
* With new tasks being created, their initial util_avgs are extrapolated * With new tasks being created, their initial util_avgs are extrapolated
* based on the cfs_rq's current util_avg: * based on the cfs_rq's current util_avg:
...@@ -720,6 +725,8 @@ void post_init_entity_util_avg(struct sched_entity *se) ...@@ -720,6 +725,8 @@ void post_init_entity_util_avg(struct sched_entity *se)
struct cfs_rq *cfs_rq = cfs_rq_of(se); struct cfs_rq *cfs_rq = cfs_rq_of(se);
struct sched_avg *sa = &se->avg; struct sched_avg *sa = &se->avg;
long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2; long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;
u64 now = cfs_rq_clock_task(cfs_rq);
int tg_update;
if (cap > 0) { if (cap > 0) {
if (cfs_rq->avg.util_avg != 0) { if (cfs_rq->avg.util_avg != 0) {
...@@ -733,16 +740,42 @@ void post_init_entity_util_avg(struct sched_entity *se) ...@@ -733,16 +740,42 @@ void post_init_entity_util_avg(struct sched_entity *se)
} }
sa->util_sum = sa->util_avg * LOAD_AVG_MAX; sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
} }
if (entity_is_task(se)) {
struct task_struct *p = task_of(se);
if (p->sched_class != &fair_sched_class) {
/*
* For !fair tasks do:
*
update_cfs_rq_load_avg(now, cfs_rq, false);
attach_entity_load_avg(cfs_rq, se);
switched_from_fair(rq, p);
*
* such that the next switched_to_fair() has the
* expected state.
*/
se->avg.last_update_time = now;
return;
}
}
tg_update = update_cfs_rq_load_avg(now, cfs_rq, false);
attach_entity_load_avg(cfs_rq, se);
if (tg_update)
update_tg_load_avg(cfs_rq, false);
} }
#else #else /* !CONFIG_SMP */
void init_entity_runnable_average(struct sched_entity *se) void init_entity_runnable_average(struct sched_entity *se)
{ {
} }
void post_init_entity_util_avg(struct sched_entity *se) void post_init_entity_util_avg(struct sched_entity *se)
{ {
} }
#endif static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
{
}
#endif /* CONFIG_SMP */
/* /*
* Update the current task's runtime statistics. * Update the current task's runtime statistics.
...@@ -1303,6 +1336,8 @@ static void task_numa_assign(struct task_numa_env *env, ...@@ -1303,6 +1336,8 @@ static void task_numa_assign(struct task_numa_env *env,
{ {
if (env->best_task) if (env->best_task)
put_task_struct(env->best_task); put_task_struct(env->best_task);
if (p)
get_task_struct(p);
env->best_task = p; env->best_task = p;
env->best_imp = imp; env->best_imp = imp;
...@@ -1370,31 +1405,11 @@ static void task_numa_compare(struct task_numa_env *env, ...@@ -1370,31 +1405,11 @@ static void task_numa_compare(struct task_numa_env *env,
long imp = env->p->numa_group ? groupimp : taskimp; long imp = env->p->numa_group ? groupimp : taskimp;
long moveimp = imp; long moveimp = imp;
int dist = env->dist; int dist = env->dist;
bool assigned = false;
rcu_read_lock(); rcu_read_lock();
cur = task_rcu_dereference(&dst_rq->curr);
raw_spin_lock_irq(&dst_rq->lock); if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))
cur = dst_rq->curr;
/*
* No need to move the exiting task or idle task.
*/
if ((cur->flags & PF_EXITING) || is_idle_task(cur))
cur = NULL; cur = NULL;
else {
/*
* The task_struct must be protected here to protect the
* p->numa_faults access in the task_weight since the
* numa_faults could already be freed in the following path:
* finish_task_switch()
* --> put_task_struct()
* --> __put_task_struct()
* --> task_numa_free()
*/
get_task_struct(cur);
}
raw_spin_unlock_irq(&dst_rq->lock);
/* /*
* Because we have preemption enabled we can get migrated around and * Because we have preemption enabled we can get migrated around and
...@@ -1477,7 +1492,6 @@ static void task_numa_compare(struct task_numa_env *env, ...@@ -1477,7 +1492,6 @@ static void task_numa_compare(struct task_numa_env *env,
*/ */
if (!load_too_imbalanced(src_load, dst_load, env)) { if (!load_too_imbalanced(src_load, dst_load, env)) {
imp = moveimp - 1; imp = moveimp - 1;
put_task_struct(cur);
cur = NULL; cur = NULL;
goto assign; goto assign;
} }
...@@ -1503,16 +1517,9 @@ static void task_numa_compare(struct task_numa_env *env, ...@@ -1503,16 +1517,9 @@ static void task_numa_compare(struct task_numa_env *env,
env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu); env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
assign: assign:
assigned = true;
task_numa_assign(env, cur, imp); task_numa_assign(env, cur, imp);
unlock: unlock:
rcu_read_unlock(); rcu_read_unlock();
/*
* The dst_rq->curr isn't assigned. The protection for task_struct is
* finished.
*/
if (cur && !assigned)
put_task_struct(cur);
} }
static void task_numa_find_cpu(struct task_numa_env *env, static void task_numa_find_cpu(struct task_numa_env *env,
...@@ -2866,8 +2873,6 @@ void set_task_rq_fair(struct sched_entity *se, ...@@ -2866,8 +2873,6 @@ void set_task_rq_fair(struct sched_entity *se,
static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
#endif /* CONFIG_FAIR_GROUP_SCHED */ #endif /* CONFIG_FAIR_GROUP_SCHED */
static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
{ {
struct rq *rq = rq_of(cfs_rq); struct rq *rq = rq_of(cfs_rq);
...@@ -2914,7 +2919,23 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) ...@@ -2914,7 +2919,23 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
WRITE_ONCE(*ptr, res); \ WRITE_ONCE(*ptr, res); \
} while (0) } while (0)
/* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */ /**
* update_cfs_rq_load_avg - update the cfs_rq's load/util averages
* @now: current time, as per cfs_rq_clock_task()
* @cfs_rq: cfs_rq to update
* @update_freq: should we call cfs_rq_util_change() or will the call do so
*
* The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
* avg. The immediate corollary is that all (fair) tasks must be attached, see
* post_init_entity_util_avg().
*
* cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
*
* Returns true if the load decayed or we removed utilization. It is expected
* that one calls update_tg_load_avg() on this condition, but after you've
* modified the cfs_rq avg (attach/detach), such that we propagate the new
* avg up.
*/
static inline int static inline int
update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
{ {
...@@ -2969,6 +2990,14 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg) ...@@ -2969,6 +2990,14 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg)
update_tg_load_avg(cfs_rq, 0); update_tg_load_avg(cfs_rq, 0);
} }
/**
* attach_entity_load_avg - attach this entity to its cfs_rq load avg
* @cfs_rq: cfs_rq to attach to
* @se: sched_entity to attach
*
* Must call update_cfs_rq_load_avg() before this, since we rely on
* cfs_rq->avg.last_update_time being current.
*/
static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{ {
if (!sched_feat(ATTACH_AGE_LOAD)) if (!sched_feat(ATTACH_AGE_LOAD))
...@@ -2977,6 +3006,8 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s ...@@ -2977,6 +3006,8 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
/* /*
* If we got migrated (either between CPUs or between cgroups) we'll * If we got migrated (either between CPUs or between cgroups) we'll
* have aged the average right before clearing @last_update_time. * have aged the average right before clearing @last_update_time.
*
* Or we're fresh through post_init_entity_util_avg().
*/ */
if (se->avg.last_update_time) { if (se->avg.last_update_time) {
__update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)), __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
...@@ -2998,6 +3029,14 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s ...@@ -2998,6 +3029,14 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
cfs_rq_util_change(cfs_rq); cfs_rq_util_change(cfs_rq);
} }
/**
* detach_entity_load_avg - detach this entity from its cfs_rq load avg
* @cfs_rq: cfs_rq to detach from
* @se: sched_entity to detach
*
* Must call update_cfs_rq_load_avg() before this, since we rely on
* cfs_rq->avg.last_update_time being current.
*/
static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{ {
__update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)), __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
...@@ -3082,11 +3121,14 @@ void remove_entity_load_avg(struct sched_entity *se) ...@@ -3082,11 +3121,14 @@ void remove_entity_load_avg(struct sched_entity *se)
u64 last_update_time; u64 last_update_time;
/* /*
* Newly created task or never used group entity should not be removed * tasks cannot exit without having gone through wake_up_new_task() ->
* from its (source) cfs_rq * post_init_entity_util_avg() which will have added things to the
* cfs_rq, so we can remove unconditionally.
*
* Similarly for groups, they will have passed through
* post_init_entity_util_avg() before unregister_sched_fair_group()
* calls this.
*/ */
if (se->avg.last_update_time == 0)
return;
last_update_time = cfs_rq_last_update_time(cfs_rq); last_update_time = cfs_rq_last_update_time(cfs_rq);
...@@ -3109,6 +3151,12 @@ static int idle_balance(struct rq *this_rq); ...@@ -3109,6 +3151,12 @@ static int idle_balance(struct rq *this_rq);
#else /* CONFIG_SMP */ #else /* CONFIG_SMP */
static inline int
update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
{
return 0;
}
static inline void update_load_avg(struct sched_entity *se, int not_used) static inline void update_load_avg(struct sched_entity *se, int not_used)
{ {
struct cfs_rq *cfs_rq = cfs_rq_of(se); struct cfs_rq *cfs_rq = cfs_rq_of(se);
...@@ -3698,7 +3746,7 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) ...@@ -3698,7 +3746,7 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
{ {
if (unlikely(cfs_rq->throttle_count)) if (unlikely(cfs_rq->throttle_count))
return cfs_rq->throttled_clock_task; return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time;
return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time; return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
} }
...@@ -3836,13 +3884,11 @@ static int tg_unthrottle_up(struct task_group *tg, void *data) ...@@ -3836,13 +3884,11 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
cfs_rq->throttle_count--; cfs_rq->throttle_count--;
#ifdef CONFIG_SMP
if (!cfs_rq->throttle_count) { if (!cfs_rq->throttle_count) {
/* adjust cfs_rq_clock_task() */ /* adjust cfs_rq_clock_task() */
cfs_rq->throttled_clock_task_time += rq_clock_task(rq) - cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
cfs_rq->throttled_clock_task; cfs_rq->throttled_clock_task;
} }
#endif
return 0; return 0;
} }
...@@ -4195,26 +4241,6 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq) ...@@ -4195,26 +4241,6 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
if (!cfs_bandwidth_used()) if (!cfs_bandwidth_used())
return; return;
/* Synchronize hierarchical throttle counter: */
if (unlikely(!cfs_rq->throttle_uptodate)) {
struct rq *rq = rq_of(cfs_rq);
struct cfs_rq *pcfs_rq;
struct task_group *tg;
cfs_rq->throttle_uptodate = 1;
/* Get closest up-to-date node, because leaves go first: */
for (tg = cfs_rq->tg->parent; tg; tg = tg->parent) {
pcfs_rq = tg->cfs_rq[cpu_of(rq)];
if (pcfs_rq->throttle_uptodate)
break;
}
if (tg) {
cfs_rq->throttle_count = pcfs_rq->throttle_count;
cfs_rq->throttled_clock_task = rq_clock_task(rq);
}
}
/* an active group must be handled by the update_curr()->put() path */ /* an active group must be handled by the update_curr()->put() path */
if (!cfs_rq->runtime_enabled || cfs_rq->curr) if (!cfs_rq->runtime_enabled || cfs_rq->curr)
return; return;
...@@ -4229,6 +4255,23 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq) ...@@ -4229,6 +4255,23 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
throttle_cfs_rq(cfs_rq); throttle_cfs_rq(cfs_rq);
} }
static void sync_throttle(struct task_group *tg, int cpu)
{
struct cfs_rq *pcfs_rq, *cfs_rq;
if (!cfs_bandwidth_used())
return;
if (!tg->parent)
return;
cfs_rq = tg->cfs_rq[cpu];
pcfs_rq = tg->parent->cfs_rq[cpu];
cfs_rq->throttle_count = pcfs_rq->throttle_count;
pcfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu));
}
/* conditionally throttle active cfs_rq's from put_prev_entity() */ /* conditionally throttle active cfs_rq's from put_prev_entity() */
static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{ {
...@@ -4368,6 +4411,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) ...@@ -4368,6 +4411,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {} static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; } static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
static inline void sync_throttle(struct task_group *tg, int cpu) {}
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
...@@ -8317,31 +8361,17 @@ static void task_fork_fair(struct task_struct *p) ...@@ -8317,31 +8361,17 @@ static void task_fork_fair(struct task_struct *p)
{ {
struct cfs_rq *cfs_rq; struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se, *curr; struct sched_entity *se = &p->se, *curr;
int this_cpu = smp_processor_id();
struct rq *rq = this_rq(); struct rq *rq = this_rq();
unsigned long flags;
raw_spin_lock_irqsave(&rq->lock, flags);
raw_spin_lock(&rq->lock);
update_rq_clock(rq); update_rq_clock(rq);
cfs_rq = task_cfs_rq(current); cfs_rq = task_cfs_rq(current);
curr = cfs_rq->curr; curr = cfs_rq->curr;
if (curr) {
/*
* Not only the cpu but also the task_group of the parent might have
* been changed after parent->se.parent,cfs_rq were copied to
* child->se.parent,cfs_rq. So call __set_task_cpu() to make those
* of child point to valid ones.
*/
rcu_read_lock();
__set_task_cpu(p, this_cpu);
rcu_read_unlock();
update_curr(cfs_rq); update_curr(cfs_rq);
if (curr)
se->vruntime = curr->vruntime; se->vruntime = curr->vruntime;
}
place_entity(cfs_rq, se, 1); place_entity(cfs_rq, se, 1);
if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) { if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
...@@ -8354,8 +8384,7 @@ static void task_fork_fair(struct task_struct *p) ...@@ -8354,8 +8384,7 @@ static void task_fork_fair(struct task_struct *p)
} }
se->vruntime -= cfs_rq->min_vruntime; se->vruntime -= cfs_rq->min_vruntime;
raw_spin_unlock(&rq->lock);
raw_spin_unlock_irqrestore(&rq->lock, flags);
} }
/* /*
...@@ -8411,6 +8440,8 @@ static void detach_task_cfs_rq(struct task_struct *p) ...@@ -8411,6 +8440,8 @@ static void detach_task_cfs_rq(struct task_struct *p)
{ {
struct sched_entity *se = &p->se; struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq = cfs_rq_of(se); struct cfs_rq *cfs_rq = cfs_rq_of(se);
u64 now = cfs_rq_clock_task(cfs_rq);
int tg_update;
if (!vruntime_normalized(p)) { if (!vruntime_normalized(p)) {
/* /*
...@@ -8422,13 +8453,18 @@ static void detach_task_cfs_rq(struct task_struct *p) ...@@ -8422,13 +8453,18 @@ static void detach_task_cfs_rq(struct task_struct *p)
} }
/* Catch up with the cfs_rq and remove our load when we leave */ /* Catch up with the cfs_rq and remove our load when we leave */
tg_update = update_cfs_rq_load_avg(now, cfs_rq, false);
detach_entity_load_avg(cfs_rq, se); detach_entity_load_avg(cfs_rq, se);
if (tg_update)
update_tg_load_avg(cfs_rq, false);
} }
static void attach_task_cfs_rq(struct task_struct *p) static void attach_task_cfs_rq(struct task_struct *p)
{ {
struct sched_entity *se = &p->se; struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq = cfs_rq_of(se); struct cfs_rq *cfs_rq = cfs_rq_of(se);
u64 now = cfs_rq_clock_task(cfs_rq);
int tg_update;
#ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_FAIR_GROUP_SCHED
/* /*
...@@ -8439,7 +8475,10 @@ static void attach_task_cfs_rq(struct task_struct *p) ...@@ -8439,7 +8475,10 @@ static void attach_task_cfs_rq(struct task_struct *p)
#endif #endif
/* Synchronize task with its cfs_rq */ /* Synchronize task with its cfs_rq */
tg_update = update_cfs_rq_load_avg(now, cfs_rq, false);
attach_entity_load_avg(cfs_rq, se); attach_entity_load_avg(cfs_rq, se);
if (tg_update)
update_tg_load_avg(cfs_rq, false);
if (!vruntime_normalized(p)) if (!vruntime_normalized(p))
se->vruntime += cfs_rq->min_vruntime; se->vruntime += cfs_rq->min_vruntime;
...@@ -8499,6 +8538,14 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) ...@@ -8499,6 +8538,14 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
} }
#ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_FAIR_GROUP_SCHED
static void task_set_group_fair(struct task_struct *p)
{
struct sched_entity *se = &p->se;
set_task_rq(p, task_cpu(p));
se->depth = se->parent ? se->parent->depth + 1 : 0;
}
static void task_move_group_fair(struct task_struct *p) static void task_move_group_fair(struct task_struct *p)
{ {
detach_task_cfs_rq(p); detach_task_cfs_rq(p);
...@@ -8511,6 +8558,19 @@ static void task_move_group_fair(struct task_struct *p) ...@@ -8511,6 +8558,19 @@ static void task_move_group_fair(struct task_struct *p)
attach_task_cfs_rq(p); attach_task_cfs_rq(p);
} }
static void task_change_group_fair(struct task_struct *p, int type)
{
switch (type) {
case TASK_SET_GROUP:
task_set_group_fair(p);
break;
case TASK_MOVE_GROUP:
task_move_group_fair(p);
break;
}
}
void free_fair_sched_group(struct task_group *tg) void free_fair_sched_group(struct task_group *tg)
{ {
int i; int i;
...@@ -8562,10 +8622,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) ...@@ -8562,10 +8622,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
init_cfs_rq(cfs_rq); init_cfs_rq(cfs_rq);
init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
init_entity_runnable_average(se); init_entity_runnable_average(se);
raw_spin_lock_irq(&rq->lock);
post_init_entity_util_avg(se);
raw_spin_unlock_irq(&rq->lock);
} }
return 1; return 1;
...@@ -8576,6 +8632,23 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) ...@@ -8576,6 +8632,23 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
return 0; return 0;
} }
void online_fair_sched_group(struct task_group *tg)
{
struct sched_entity *se;
struct rq *rq;
int i;
for_each_possible_cpu(i) {
rq = cpu_rq(i);
se = tg->se[i];
raw_spin_lock_irq(&rq->lock);
post_init_entity_util_avg(se);
sync_throttle(tg, i);
raw_spin_unlock_irq(&rq->lock);
}
}
void unregister_fair_sched_group(struct task_group *tg) void unregister_fair_sched_group(struct task_group *tg)
{ {
unsigned long flags; unsigned long flags;
...@@ -8680,6 +8753,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) ...@@ -8680,6 +8753,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
return 1; return 1;
} }
void online_fair_sched_group(struct task_group *tg) { }
void unregister_fair_sched_group(struct task_group *tg) { } void unregister_fair_sched_group(struct task_group *tg) { }
#endif /* CONFIG_FAIR_GROUP_SCHED */ #endif /* CONFIG_FAIR_GROUP_SCHED */
...@@ -8739,7 +8814,7 @@ const struct sched_class fair_sched_class = { ...@@ -8739,7 +8814,7 @@ const struct sched_class fair_sched_class = {
.update_curr = update_curr_fair, .update_curr = update_curr_fair,
#ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_FAIR_GROUP_SCHED
.task_move_group = task_move_group_fair, .task_change_group = task_change_group_fair,
#endif #endif
}; };
......
...@@ -201,6 +201,8 @@ static void cpuidle_idle_call(void) ...@@ -201,6 +201,8 @@ static void cpuidle_idle_call(void)
*/ */
static void cpu_idle_loop(void) static void cpu_idle_loop(void)
{ {
int cpu = smp_processor_id();
while (1) { while (1) {
/* /*
* If the arch has a polling bit, we maintain an invariant: * If the arch has a polling bit, we maintain an invariant:
...@@ -219,7 +221,7 @@ static void cpu_idle_loop(void) ...@@ -219,7 +221,7 @@ static void cpu_idle_loop(void)
check_pgt_cache(); check_pgt_cache();
rmb(); rmb();
if (cpu_is_offline(smp_processor_id())) { if (cpu_is_offline(cpu)) {
cpuhp_report_idle_dead(); cpuhp_report_idle_dead();
arch_cpu_idle_dead(); arch_cpu_idle_dead();
} }
......
...@@ -321,6 +321,7 @@ extern int tg_nop(struct task_group *tg, void *data); ...@@ -321,6 +321,7 @@ extern int tg_nop(struct task_group *tg, void *data);
extern void free_fair_sched_group(struct task_group *tg); extern void free_fair_sched_group(struct task_group *tg);
extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent); extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent);
extern void online_fair_sched_group(struct task_group *tg);
extern void unregister_fair_sched_group(struct task_group *tg); extern void unregister_fair_sched_group(struct task_group *tg);
extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
struct sched_entity *se, int cpu, struct sched_entity *se, int cpu,
...@@ -437,7 +438,7 @@ struct cfs_rq { ...@@ -437,7 +438,7 @@ struct cfs_rq {
u64 throttled_clock, throttled_clock_task; u64 throttled_clock, throttled_clock_task;
u64 throttled_clock_task_time; u64 throttled_clock_task_time;
int throttled, throttle_count, throttle_uptodate; int throttled, throttle_count;
struct list_head throttled_list; struct list_head throttled_list;
#endif /* CONFIG_CFS_BANDWIDTH */ #endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */ #endif /* CONFIG_FAIR_GROUP_SCHED */
...@@ -1246,8 +1247,11 @@ struct sched_class { ...@@ -1246,8 +1247,11 @@ struct sched_class {
void (*update_curr) (struct rq *rq); void (*update_curr) (struct rq *rq);
#define TASK_SET_GROUP 0
#define TASK_MOVE_GROUP 1
#ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_FAIR_GROUP_SCHED
void (*task_move_group) (struct task_struct *p); void (*task_change_group) (struct task_struct *p, int type);
#endif #endif
}; };
...@@ -1809,16 +1813,3 @@ static inline void cpufreq_trigger_update(u64 time) {} ...@@ -1809,16 +1813,3 @@ static inline void cpufreq_trigger_update(u64 time) {}
#else /* arch_scale_freq_capacity */ #else /* arch_scale_freq_capacity */
#define arch_scale_freq_invariant() (false) #define arch_scale_freq_invariant() (false)
#endif #endif
static inline void account_reset_rq(struct rq *rq)
{
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
rq->prev_irq_time = 0;
#endif
#ifdef CONFIG_PARAVIRT
rq->prev_steal_time = 0;
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
rq->prev_steal_time_rq = 0;
#endif
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment