Commit 4fde846a authored by Linus Torvalds

Merge branch 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler fixes from Thomas Gleixner:
 "This scheduler update provides:

   - The (hopefully) final fix for the vtime accounting issues which
     were around for quite some time

   - Use types known to user space in UAPI headers to unbreak user space
     builds

   - Make load balancing respect the current scheduling domain again
     instead of evaluating unrelated CPUs"

* 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/headers/uapi: Fix linux/sched/types.h userspace compilation errors
  sched/fair: Fix load_balance() affinity redo path
  sched/cputime: Accumulate vtime on top of nsec clocksource
  sched/cputime: Move the vtime task fields to their own struct
  sched/cputime: Rename vtime fields
  sched/cputime: Always set tsk->vtime_snap_whence after accounting vtime
  vtime, sched/cputime: Remove vtime_account_user()
  Revert "sched/cputime: Refactor the cputime_adjust() code"
parents c3931a87 242fc352
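
As background for the UAPI headers fix in the pull request above, here is a minimal sketch of the kind of plain user-space build that the exported linux/sched/types.h has to support once the headers are installed (for example via 'make headers_install'). The snippet is illustrative only and not part of this merge: with the kernel-internal u32/u64/s32 names the include fails to compile in user space, whereas the __u32/__u64/__s32 types from <linux/types.h> are always available there.

/*
 * Illustrative user-space program (not part of this commit): it only
 * builds against the installed UAPI headers once struct sched_attr
 * uses the __u32/__u64/__s32 types exported by <linux/types.h>.
 */
#include <stdio.h>
#include <linux/types.h>		/* __u32, __u64, __s32 */
#include <linux/sched/types.h>		/* struct sched_attr */

int main(void)
{
	struct sched_attr attr = {
		.size		= sizeof(struct sched_attr),
		.sched_policy	= 0,	/* SCHED_NORMAL */
		.sched_nice	= 0,
	};

	printf("sched_attr: %zu bytes, priority field %u\n",
	       sizeof(attr), attr.sched_priority);
	return 0;
}
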
@@ -170,9 +170,9 @@ extern struct cred init_cred;
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
 # define INIT_VTIME(tsk)					\
-	.vtime_seqcount = SEQCNT_ZERO(tsk.vtime_seqcount),	\
-	.vtime_snap = 0,					\
-	.vtime_snap_whence = VTIME_SYS,
+	.vtime.seqcount = SEQCNT_ZERO(tsk.vtime.seqcount),	\
+	.vtime.starttime = 0,					\
+	.vtime.state = VTIME_SYS,
 #else
 # define INIT_VTIME(tsk)
 #endif
@@ -223,6 +223,24 @@ struct task_cputime {
 #define prof_exp	stime
 #define sched_exp	sum_exec_runtime
 
+enum vtime_state {
+	/* Task is sleeping or running in a CPU with VTIME inactive: */
+	VTIME_INACTIVE = 0,
+	/* Task runs in userspace in a CPU with VTIME active: */
+	VTIME_USER,
+	/* Task runs in kernelspace in a CPU with VTIME active: */
+	VTIME_SYS,
+};
+
+struct vtime {
+	seqcount_t		seqcount;
+	unsigned long long	starttime;
+	enum vtime_state	state;
+	u64			utime;
+	u64			stime;
+	u64			gtime;
+};
+
 struct sched_info {
 #ifdef CONFIG_SCHED_INFO
 	/* Cumulative counters: */
@@ -688,16 +706,7 @@ struct task_struct {
 	u64				gtime;
 	struct prev_cputime		prev_cputime;
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
-	seqcount_t			vtime_seqcount;
-	unsigned long long		vtime_snap;
-	enum {
-		/* Task is sleeping or running in a CPU with VTIME inactive: */
-		VTIME_INACTIVE = 0,
-		/* Task runs in userspace in a CPU with VTIME active: */
-		VTIME_USER,
-		/* Task runs in kernelspace in a CPU with VTIME active: */
-		VTIME_SYS,
-	} vtime_snap_whence;
+	struct vtime			vtime;
 #endif
 #ifdef CONFIG_NO_HZ_FULL
@@ -67,19 +67,12 @@ static inline void vtime_account_system(struct task_struct *tsk) { }
 
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
 extern void arch_vtime_task_switch(struct task_struct *tsk);
-extern void vtime_account_user(struct task_struct *tsk);
 extern void vtime_user_enter(struct task_struct *tsk);
-
-static inline void vtime_user_exit(struct task_struct *tsk)
-{
-	vtime_account_user(tsk);
-}
-
+extern void vtime_user_exit(struct task_struct *tsk);
 extern void vtime_guest_enter(struct task_struct *tsk);
 extern void vtime_guest_exit(struct task_struct *tsk);
 extern void vtime_init_idle(struct task_struct *tsk, int cpu);
 #else /* !CONFIG_VIRT_CPU_ACCOUNTING_GEN */
-static inline void vtime_account_user(struct task_struct *tsk) { }
 static inline void vtime_user_enter(struct task_struct *tsk) { }
 static inline void vtime_user_exit(struct task_struct *tsk) { }
 static inline void vtime_guest_enter(struct task_struct *tsk) { }
@@ -54,21 +54,21 @@ struct sched_param {
  * available in the scheduling class file or in Documentation/.
  */
 struct sched_attr {
-	u32 size;
+	__u32 size;
 
-	u32 sched_policy;
-	u64 sched_flags;
+	__u32 sched_policy;
+	__u64 sched_flags;
 
 	/* SCHED_NORMAL, SCHED_BATCH */
-	s32 sched_nice;
+	__s32 sched_nice;
 
 	/* SCHED_FIFO, SCHED_RR */
-	u32 sched_priority;
+	__u32 sched_priority;
 
 	/* SCHED_DEADLINE */
-	u64 sched_runtime;
-	u64 sched_deadline;
-	u64 sched_period;
+	__u64 sched_runtime;
+	__u64 sched_deadline;
+	__u64 sched_period;
 };
 
 #endif /* _UAPI_LINUX_SCHED_TYPES_H */
@@ -1637,9 +1637,9 @@ static __latent_entropy struct task_struct *copy_process(
 	prev_cputime_init(&p->prev_cputime);
 
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
-	seqcount_init(&p->vtime_seqcount);
-	p->vtime_snap = 0;
-	p->vtime_snap_whence = VTIME_INACTIVE;
+	seqcount_init(&p->vtime.seqcount);
+	p->vtime.starttime = 0;
+	p->vtime.state = VTIME_INACTIVE;
 #endif
 
 #if defined(SPLIT_RSS_COUNTING)
@@ -611,17 +611,23 @@ static void cputime_adjust(struct task_cputime *curr,
 	utime = curr->utime;
 
 	/*
-	 * If either stime or both stime and utime are 0, assume all runtime is
-	 * userspace. Once a task gets some ticks, the monotonicy code at
-	 * 'update' will ensure things converge to the observed ratio.
+	 * If either stime or utime are 0, assume all runtime is userspace.
+	 * Once a task gets some ticks, the monotonicy code at 'update:'
+	 * will ensure things converge to the observed ratio.
 	 */
-	if (stime != 0) {
-		if (utime == 0)
-			stime = rtime;
-		else
-			stime = scale_stime(stime, rtime, stime + utime);
+	if (stime == 0) {
+		utime = rtime;
+		goto update;
 	}
 
+	if (utime == 0) {
+		stime = rtime;
+		goto update;
+	}
+
+	stime = scale_stime(stime, rtime, stime + utime);
+
+update:
 	/*
 	 * Make sure stime doesn't go backwards; this preserves monotonicity
 	 * for utime because rtime is monotonic.
@@ -673,20 +679,21 @@ void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
 #endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
 
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
-static u64 vtime_delta(struct task_struct *tsk)
+static u64 vtime_delta(struct vtime *vtime)
 {
-	unsigned long now = READ_ONCE(jiffies);
+	unsigned long long clock;
 
-	if (time_before(now, (unsigned long)tsk->vtime_snap))
+	clock = sched_clock_cpu(smp_processor_id());
+	if (clock < vtime->starttime)
 		return 0;
 
-	return jiffies_to_nsecs(now - tsk->vtime_snap);
+	return clock - vtime->starttime;
 }
 
-static u64 get_vtime_delta(struct task_struct *tsk)
+static u64 get_vtime_delta(struct vtime *vtime)
 {
-	unsigned long now = READ_ONCE(jiffies);
-	u64 delta, other;
+	u64 delta = vtime_delta(vtime);
+	u64 other;
 
 	/*
 	 * Unlike tick based timing, vtime based timing never has lost
@@ -695,104 +702,138 @@ static u64 get_vtime_delta(struct task_struct *tsk)
 	 * elapsed time. Limit account_other_time to prevent rounding
 	 * errors from causing elapsed vtime to go negative.
 	 */
-	delta = jiffies_to_nsecs(now - tsk->vtime_snap);
 	other = account_other_time(delta);
-	WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE);
-	tsk->vtime_snap = now;
+	WARN_ON_ONCE(vtime->state == VTIME_INACTIVE);
+	vtime->starttime += delta;
 
 	return delta - other;
 }
 
-static void __vtime_account_system(struct task_struct *tsk)
+static void __vtime_account_system(struct task_struct *tsk,
+				   struct vtime *vtime)
 {
-	account_system_time(tsk, irq_count(), get_vtime_delta(tsk));
+	vtime->stime += get_vtime_delta(vtime);
+	if (vtime->stime >= TICK_NSEC) {
+		account_system_time(tsk, irq_count(), vtime->stime);
+		vtime->stime = 0;
+	}
+}
+
+static void vtime_account_guest(struct task_struct *tsk,
+				struct vtime *vtime)
+{
+	vtime->gtime += get_vtime_delta(vtime);
+	if (vtime->gtime >= TICK_NSEC) {
+		account_guest_time(tsk, vtime->gtime);
+		vtime->gtime = 0;
+	}
 }
 
 void vtime_account_system(struct task_struct *tsk)
 {
-	if (!vtime_delta(tsk))
+	struct vtime *vtime = &tsk->vtime;
+
+	if (!vtime_delta(vtime))
 		return;
 
-	write_seqcount_begin(&tsk->vtime_seqcount);
-	__vtime_account_system(tsk);
-	write_seqcount_end(&tsk->vtime_seqcount);
-}
-
-void vtime_account_user(struct task_struct *tsk)
-{
-	write_seqcount_begin(&tsk->vtime_seqcount);
-	tsk->vtime_snap_whence = VTIME_SYS;
-	if (vtime_delta(tsk))
-		account_user_time(tsk, get_vtime_delta(tsk));
-	write_seqcount_end(&tsk->vtime_seqcount);
+	write_seqcount_begin(&vtime->seqcount);
+	/* We might have scheduled out from guest path */
+	if (current->flags & PF_VCPU)
+		vtime_account_guest(tsk, vtime);
+	else
+		__vtime_account_system(tsk, vtime);
+	write_seqcount_end(&vtime->seqcount);
 }
 
 void vtime_user_enter(struct task_struct *tsk)
 {
-	write_seqcount_begin(&tsk->vtime_seqcount);
-	if (vtime_delta(tsk))
-		__vtime_account_system(tsk);
-	tsk->vtime_snap_whence = VTIME_USER;
-	write_seqcount_end(&tsk->vtime_seqcount);
+	struct vtime *vtime = &tsk->vtime;
+
+	write_seqcount_begin(&vtime->seqcount);
+	__vtime_account_system(tsk, vtime);
+	vtime->state = VTIME_USER;
+	write_seqcount_end(&vtime->seqcount);
+}
+
+void vtime_user_exit(struct task_struct *tsk)
+{
+	struct vtime *vtime = &tsk->vtime;
+
+	write_seqcount_begin(&vtime->seqcount);
+	vtime->utime += get_vtime_delta(vtime);
+	if (vtime->utime >= TICK_NSEC) {
+		account_user_time(tsk, vtime->utime);
+		vtime->utime = 0;
+	}
+	vtime->state = VTIME_SYS;
+	write_seqcount_end(&vtime->seqcount);
 }
 
 void vtime_guest_enter(struct task_struct *tsk)
 {
+	struct vtime *vtime = &tsk->vtime;
+
 	/*
 	 * The flags must be updated under the lock with
-	 * the vtime_snap flush and update.
+	 * the vtime_starttime flush and update.
 	 * That enforces a right ordering and update sequence
 	 * synchronization against the reader (task_gtime())
 	 * that can thus safely catch up with a tickless delta.
 	 */
-	write_seqcount_begin(&tsk->vtime_seqcount);
-	if (vtime_delta(tsk))
-		__vtime_account_system(tsk);
+	write_seqcount_begin(&vtime->seqcount);
+	__vtime_account_system(tsk, vtime);
 	current->flags |= PF_VCPU;
-	write_seqcount_end(&tsk->vtime_seqcount);
+	write_seqcount_end(&vtime->seqcount);
 }
 EXPORT_SYMBOL_GPL(vtime_guest_enter);
 
 void vtime_guest_exit(struct task_struct *tsk)
 {
-	write_seqcount_begin(&tsk->vtime_seqcount);
-	__vtime_account_system(tsk);
+	struct vtime *vtime = &tsk->vtime;
+
+	write_seqcount_begin(&vtime->seqcount);
+	vtime_account_guest(tsk, vtime);
 	current->flags &= ~PF_VCPU;
-	write_seqcount_end(&tsk->vtime_seqcount);
+	write_seqcount_end(&vtime->seqcount);
 }
 EXPORT_SYMBOL_GPL(vtime_guest_exit);
 
 void vtime_account_idle(struct task_struct *tsk)
 {
-	account_idle_time(get_vtime_delta(tsk));
+	account_idle_time(get_vtime_delta(&tsk->vtime));
 }
 
 void arch_vtime_task_switch(struct task_struct *prev)
 {
-	write_seqcount_begin(&prev->vtime_seqcount);
-	prev->vtime_snap_whence = VTIME_INACTIVE;
-	write_seqcount_end(&prev->vtime_seqcount);
+	struct vtime *vtime = &prev->vtime;
 
-	write_seqcount_begin(&current->vtime_seqcount);
-	current->vtime_snap_whence = VTIME_SYS;
-	current->vtime_snap = jiffies;
-	write_seqcount_end(&current->vtime_seqcount);
+	write_seqcount_begin(&vtime->seqcount);
+	vtime->state = VTIME_INACTIVE;
+	write_seqcount_end(&vtime->seqcount);
+
+	vtime = &current->vtime;
+
+	write_seqcount_begin(&vtime->seqcount);
+	vtime->state = VTIME_SYS;
+	vtime->starttime = sched_clock_cpu(smp_processor_id());
+	write_seqcount_end(&vtime->seqcount);
 }
 
 void vtime_init_idle(struct task_struct *t, int cpu)
 {
+	struct vtime *vtime = &t->vtime;
 	unsigned long flags;
 
 	local_irq_save(flags);
-	write_seqcount_begin(&t->vtime_seqcount);
-	t->vtime_snap_whence = VTIME_SYS;
-	t->vtime_snap = jiffies;
-	write_seqcount_end(&t->vtime_seqcount);
+	write_seqcount_begin(&vtime->seqcount);
+	vtime->state = VTIME_SYS;
+	vtime->starttime = sched_clock_cpu(cpu);
+	write_seqcount_end(&vtime->seqcount);
 	local_irq_restore(flags);
 }
 
 u64 task_gtime(struct task_struct *t)
 {
+	struct vtime *vtime = &t->vtime;
 	unsigned int seq;
 	u64 gtime;
@@ -800,13 +841,13 @@ u64 task_gtime(struct task_struct *t)
 		return t->gtime;
 
 	do {
-		seq = read_seqcount_begin(&t->vtime_seqcount);
+		seq = read_seqcount_begin(&vtime->seqcount);
 
 		gtime = t->gtime;
-		if (t->vtime_snap_whence == VTIME_SYS && t->flags & PF_VCPU)
-			gtime += vtime_delta(t);
+		if (vtime->state == VTIME_SYS && t->flags & PF_VCPU)
+			gtime += vtime->gtime + vtime_delta(vtime);
 
-	} while (read_seqcount_retry(&t->vtime_seqcount, seq));
+	} while (read_seqcount_retry(&vtime->seqcount, seq));
 
 	return gtime;
 }
@@ -818,8 +859,9 @@ u64 task_gtime(struct task_struct *t)
  */
 void task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
 {
-	u64 delta;
+	struct vtime *vtime = &t->vtime;
 	unsigned int seq;
+	u64 delta;
 
 	if (!vtime_accounting_enabled()) {
 		*utime = t->utime;
@@ -828,25 +870,25 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
 	}
 
 	do {
-		seq = read_seqcount_begin(&t->vtime_seqcount);
+		seq = read_seqcount_begin(&vtime->seqcount);
 
 		*utime = t->utime;
 		*stime = t->stime;
 
 		/* Task is sleeping, nothing to add */
-		if (t->vtime_snap_whence == VTIME_INACTIVE || is_idle_task(t))
+		if (vtime->state == VTIME_INACTIVE || is_idle_task(t))
 			continue;
 
-		delta = vtime_delta(t);
+		delta = vtime_delta(vtime);
 
 		/*
 		 * Task runs either in user or kernel space, add pending nohz time to
 		 * the right place.
 		 */
-		if (t->vtime_snap_whence == VTIME_USER || t->flags & PF_VCPU)
-			*utime += delta;
-		else if (t->vtime_snap_whence == VTIME_SYS)
-			*stime += delta;
-	} while (read_seqcount_retry(&t->vtime_seqcount, seq));
+		if (vtime->state == VTIME_USER || t->flags & PF_VCPU)
+			*utime += vtime->utime + delta;
+		else if (vtime->state == VTIME_SYS)
+			*stime += vtime->stime + delta;
+	} while (read_seqcount_retry(&vtime->seqcount, seq));
 }
 #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */
@@ -6646,10 +6646,10 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 		 * our sched_group. We may want to revisit it if we couldn't
 		 * meet load balance goals by pulling other tasks on src_cpu.
 		 *
-		 * Also avoid computing new_dst_cpu if we have already computed
-		 * one in current iteration.
+		 * Avoid computing new_dst_cpu for NEWLY_IDLE or if we have
+		 * already computed one in current iteration.
 		 */
-		if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED))
+		if (env->idle == CPU_NEWLY_IDLE || (env->flags & LBF_DST_PINNED))
 			return 0;
 
 		/* Prevent to re-select dst_cpu via env's cpus */
@@ -8022,14 +8022,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 		.tasks		= LIST_HEAD_INIT(env.tasks),
 	};
 
-	/*
-	 * For NEWLY_IDLE load_balancing, we don't need to consider
-	 * other cpus in our group
-	 */
-	if (idle == CPU_NEWLY_IDLE)
-		env.dst_grpmask = NULL;
-
-	cpumask_copy(cpus, cpu_active_mask);
+	cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask);
 
 	schedstat_inc(sd->lb_count[idle]);
@@ -8151,7 +8144,15 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 		/* All tasks on this runqueue were pinned by CPU affinity */
 		if (unlikely(env.flags & LBF_ALL_PINNED)) {
 			cpumask_clear_cpu(cpu_of(busiest), cpus);
-			if (!cpumask_empty(cpus)) {
+			/*
+			 * Attempting to continue load balancing at the current
+			 * sched_domain level only makes sense if there are
+			 * active CPUs remaining as possible busiest CPUs to
+			 * pull load from which are not contained within the
+			 * destination group that is receiving any migrated
+			 * load.
+			 */
+			if (!cpumask_subset(cpus, env.dst_grpmask)) {
 				env.loop = 0;
 				env.loop_break = sched_nr_migrate_break;
 				goto redo;
@@ -8447,6 +8448,13 @@ static int active_load_balance_cpu_stop(void *data)
 		.src_cpu	= busiest_rq->cpu,
 		.src_rq		= busiest_rq,
 		.idle		= CPU_IDLE,
+		/*
+		 * can_migrate_task() doesn't need to compute new_dst_cpu
+		 * for active balancing. Since we have CPU_IDLE, but no
+		 * @dst_grpmask we need to make that test go away with lying
+		 * about DST_PINNED.
+		 */
+		.flags		= LBF_DST_PINNED,
 	};
 
 	schedstat_inc(sd->alb_count);
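
The cputime changes above switch vtime from jiffies snapshots to a sched_clock()-based starttime and accumulate utime/stime/gtime inside the new struct vtime, only handing them to the accounting core once at least TICK_NSEC worth of time has built up. Below is a minimal stand-alone sketch of that accumulate-and-flush pattern, with made-up names and HZ=1000 assumed, purely for illustration and not kernel code:

/*
 * Stand-alone sketch (illustrative names, not kernel API): deltas from a
 * ns-resolution clock are accumulated per state and flushed to the
 * (tick-granular) accounting code once a full tick's worth has built up.
 */
#include <stdint.h>
#include <stdio.h>

#define TICK_NSEC 1000000ULL	/* assume HZ=1000 for this sketch */

struct vtime_sketch {
	uint64_t starttime;	/* last snapshot of the ns clock */
	uint64_t stime;		/* accumulated, not yet flushed */
};

static void account_system_time_ns(uint64_t ns)
{
	printf("flushing %llu ns of system time\n", (unsigned long long)ns);
}

static void vtime_account_system_sketch(struct vtime_sketch *vt, uint64_t now)
{
	uint64_t delta = now > vt->starttime ? now - vt->starttime : 0;

	vt->starttime = now;
	vt->stime += delta;
	if (vt->stime >= TICK_NSEC) {	/* flush at tick granularity */
		account_system_time_ns(vt->stime);
		vt->stime = 0;
	}
}

int main(void)
{
	struct vtime_sketch vt = { .starttime = 0, .stime = 0 };

	vtime_account_system_sketch(&vt, 400000);	/* accumulates */
	vtime_account_system_sketch(&vt, 900000);	/* still below a tick */
	vtime_account_system_sketch(&vt, 1300000);	/* crosses TICK_NSEC, flushes */
	return 0;
}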