Commit f7f4e7fc authored by Linus Torvalds's avatar Linus Torvalds

Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Ingo Molnar:

 - power-aware scheduling improvements (Patrick Bellasi)

 - NUMA balancing improvements (Mel Gorman)

 - vCPU scheduling fixes (Rohit Jain)

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/fair: Update util_est before updating schedutil
  sched/cpufreq: Modify aggregate utilization to always include blocked FAIR utilization
  sched/deadline/Documentation: Add overrun signal and GRUB-PA documentation
  sched/core: Distinguish between idle_cpu() calls based on desired effect, introduce available_idle_cpu()
  sched/wait: Include <linux/wait.h> in <linux/swait.h>
  sched/numa: Stagger NUMA balancing scan periods for new threads
  sched/core: Don't schedule threads on pre-empted vCPUs
  sched/fair: Avoid calling sync_entity_load_avg() unnecessarily
  sched/fair: Rearrange select_task_rq_fair() to optimize it
parents d9b446e2 2539fc82
......@@ -49,7 +49,7 @@ CONTENTS
2.1 Main algorithm
------------------
SCHED_DEADLINE uses three parameters, named "runtime", "period", and
SCHED_DEADLINE [18] uses three parameters, named "runtime", "period", and
"deadline", to schedule tasks. A SCHED_DEADLINE task should receive
"runtime" microseconds of execution time every "period" microseconds, and
these "runtime" microseconds are available within "deadline" microseconds
......@@ -117,6 +117,10 @@ CONTENTS
scheduling deadline = scheduling deadline + period
remaining runtime = remaining runtime + runtime
The SCHED_FLAG_DL_OVERRUN flag in sched_attr's sched_flags field allows a task
to get informed about runtime overruns through the delivery of SIGXCPU
signals.
2.2 Bandwidth reclaiming
------------------------
......@@ -279,6 +283,19 @@ CONTENTS
running_bw is incremented.
2.3 Energy-aware scheduling
------------------------
When cpufreq's schedutil governor is selected, SCHED_DEADLINE implements the
GRUB-PA [19] algorithm, reducing the CPU operating frequency to the minimum
value that still allows to meet the deadlines. This behavior is currently
implemented only for ARM architectures.
A particular care must be taken in case the time needed for changing frequency
is of the same order of magnitude of the reservation period. In such cases,
setting a fixed CPU frequency results in a lower amount of deadline misses.
3. Scheduling Real-Time Tasks
=============================
......@@ -505,6 +522,12 @@ CONTENTS
17 - L. Abeni, G. Lipari, A. Parri, Y. Sun, Multicore CPU reclaiming: parallel
or sequential?. In Proceedings of the 31st Annual ACM Symposium on Applied
Computing, 2016.
18 - J. Lelli, C. Scordino, L. Abeni, D. Faggioli, Deadline scheduling in the
Linux kernel, Software: Practice and Experience, 46(6): 821-839, June
2016.
19 - C. Scordino, L. Abeni, J. Lelli, Energy-Aware Real-Time Scheduling in
the Linux Kernel, 33rd ACM/SIGAPP Symposium On Applied Computing (SAC
2018), Pau, France, April 2018.
4. Bandwidth management
......
......@@ -1512,6 +1512,7 @@ static inline int task_nice(const struct task_struct *p)
extern int can_nice(const struct task_struct *p, const int nice);
extern int task_curr(const struct task_struct *p);
extern int idle_cpu(int cpu);
extern int available_idle_cpu(int cpu);
extern int sched_setscheduler(struct task_struct *, int, const struct sched_param *);
extern int sched_setscheduler_nocheck(struct task_struct *, int, const struct sched_param *);
extern int sched_setattr(struct task_struct *, const struct sched_attr *);
......
......@@ -5,6 +5,7 @@
#include <linux/list.h>
#include <linux/stddef.h>
#include <linux/spinlock.h>
#include <linux/wait.h>
#include <asm/current.h>
/*
......
......@@ -2194,27 +2194,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
INIT_HLIST_HEAD(&p->preempt_notifiers);
#endif
#ifdef CONFIG_NUMA_BALANCING
if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
p->mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
p->mm->numa_scan_seq = 0;
}
if (clone_flags & CLONE_VM)
p->numa_preferred_nid = current->numa_preferred_nid;
else
p->numa_preferred_nid = -1;
p->node_stamp = 0ULL;
p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
p->numa_scan_period = sysctl_numa_balancing_scan_delay;
p->numa_work.next = &p->numa_work;
p->numa_faults = NULL;
p->last_task_numa_placement = 0;
p->last_sum_exec_runtime = 0;
p->numa_group = NULL;
#endif /* CONFIG_NUMA_BALANCING */
init_numa_balancing(clone_flags, p);
}
DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
......@@ -4049,6 +4029,23 @@ int idle_cpu(int cpu)
return 1;
}
/**
* available_idle_cpu - is a given CPU idle for enqueuing work.
* @cpu: the CPU in question.
*
* Return: 1 if the CPU is currently idle. 0 otherwise.
*/
int available_idle_cpu(int cpu)
{
if (!idle_cpu(cpu))
return 0;
if (vcpu_is_preempted(cpu))
return 0;
return 1;
}
/**
* idle_task - return the idle task for a given CPU.
* @cpu: the processor in question.
......
......@@ -183,22 +183,21 @@ static void sugov_get_util(struct sugov_cpu *sg_cpu)
static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu)
{
struct rq *rq = cpu_rq(sg_cpu->cpu);
unsigned long util;
if (rq->rt.rt_nr_running) {
util = sg_cpu->max;
} else {
util = sg_cpu->util_dl;
if (rq->cfs.h_nr_running)
util += sg_cpu->util_cfs;
}
if (rq->rt.rt_nr_running)
return sg_cpu->max;
/*
* Utilization required by DEADLINE must always be granted while, for
* FAIR, we use blocked utilization of IDLE CPUs as a mechanism to
* gracefully reduce the frequency when no tasks show up for longer
* periods of time.
*
* Ideally we would like to set util_dl as min/guaranteed freq and
* util_cfs + util_dl as requested freq. However, cpufreq is not yet
* ready for such an interface. So, we only do the latter for now.
*/
return min(util, sg_cpu->max);
return min(sg_cpu->max, (sg_cpu->util_dl + sg_cpu->util_cfs));
}
static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, unsigned int flags)
......
......@@ -1139,6 +1139,47 @@ static unsigned int task_scan_max(struct task_struct *p)
return max(smin, smax);
}
void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
{
int mm_users = 0;
struct mm_struct *mm = p->mm;
if (mm) {
mm_users = atomic_read(&mm->mm_users);
if (mm_users == 1) {
mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
mm->numa_scan_seq = 0;
}
}
p->node_stamp = 0;
p->numa_scan_seq = mm ? mm->numa_scan_seq : 0;
p->numa_scan_period = sysctl_numa_balancing_scan_delay;
p->numa_work.next = &p->numa_work;
p->numa_faults = NULL;
p->numa_group = NULL;
p->last_task_numa_placement = 0;
p->last_sum_exec_runtime = 0;
/* New address space, reset the preferred nid */
if (!(clone_flags & CLONE_VM)) {
p->numa_preferred_nid = -1;
return;
}
/*
* New thread, keep existing numa_preferred_nid which should be copied
* already by arch_dup_task_struct but stagger when scans start.
*/
if (mm) {
unsigned int delay;
delay = min_t(unsigned int, task_scan_max(current),
current->numa_scan_period * mm_users * NSEC_PER_MSEC);
delay += 2 * TICK_NSEC;
p->node_stamp = delay;
}
}
static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
{
rq->nr_numa_running += (p->numa_preferred_nid != -1);
......@@ -5344,6 +5385,14 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se;
/*
* The code below (indirectly) updates schedutil which looks at
* the cfs_rq utilization to select a frequency.
* Let's add the task's estimated utilization to the cfs_rq's
* estimated utilization, before we update schedutil.
*/
util_est_enqueue(&rq->cfs, p);
/*
* If in_iowait is set, the code below may not trigger any cpufreq
* utilization updates, so do it here explicitly with the IOWAIT flag
......@@ -5385,7 +5434,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (!se)
add_nr_running(rq, 1);
util_est_enqueue(&rq->cfs, p);
hrtick_update(rq);
}
......@@ -5858,8 +5906,8 @@ wake_affine_idle(int this_cpu, int prev_cpu, int sync)
* a cpufreq perspective, it's better to have higher utilisation
* on one CPU.
*/
if (idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu))
return idle_cpu(prev_cpu) ? prev_cpu : this_cpu;
if (available_idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu))
return available_idle_cpu(prev_cpu) ? prev_cpu : this_cpu;
if (sync && cpu_rq(this_cpu)->nr_running == 1)
return this_cpu;
......@@ -6102,7 +6150,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
/* Traverse only the allowed CPUs */
for_each_cpu_and(i, sched_group_span(group), &p->cpus_allowed) {
if (idle_cpu(i)) {
if (available_idle_cpu(i)) {
struct rq *rq = cpu_rq(i);
struct cpuidle_state *idle = idle_get_state(rq);
if (idle && idle->exit_latency < min_exit_latency) {
......@@ -6144,6 +6192,13 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed))
return prev_cpu;
/*
* We need task's util for capacity_spare_wake, sync it up to prev_cpu's
* last_update_time.
*/
if (!(sd_flag & SD_BALANCE_FORK))
sync_entity_load_avg(&p->se);
while (sd) {
struct sched_group *group;
struct sched_domain *tmp;
......@@ -6224,7 +6279,7 @@ void __update_idle_core(struct rq *rq)
if (cpu == core)
continue;
if (!idle_cpu(cpu))
if (!available_idle_cpu(cpu))
goto unlock;
}
......@@ -6256,7 +6311,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
for_each_cpu(cpu, cpu_smt_mask(core)) {
cpumask_clear_cpu(cpu, cpus);
if (!idle_cpu(cpu))
if (!available_idle_cpu(cpu))
idle = false;
}
......@@ -6285,7 +6340,7 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t
for_each_cpu(cpu, cpu_smt_mask(target)) {
if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
continue;
if (idle_cpu(cpu))
if (available_idle_cpu(cpu))
return cpu;
}
......@@ -6348,7 +6403,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
return -1;
if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
continue;
if (idle_cpu(cpu))
if (available_idle_cpu(cpu))
break;
}
......@@ -6368,13 +6423,13 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
struct sched_domain *sd;
int i, recent_used_cpu;
if (idle_cpu(target))
if (available_idle_cpu(target))
return target;
/*
* If the previous CPU is cache affine and idle, don't be stupid:
*/
if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev))
if (prev != target && cpus_share_cache(prev, target) && available_idle_cpu(prev))
return prev;
/* Check a recently used CPU as a potential idle candidate: */
......@@ -6382,7 +6437,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
if (recent_used_cpu != prev &&
recent_used_cpu != target &&
cpus_share_cache(recent_used_cpu, target) &&
idle_cpu(recent_used_cpu) &&
available_idle_cpu(recent_used_cpu) &&
cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) {
/*
* Replace recent_used_cpu with prev as it is a potential
......@@ -6558,7 +6613,7 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
static int
select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
{
struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
struct sched_domain *tmp, *sd = NULL;
int cpu = smp_processor_id();
int new_cpu = prev_cpu;
int want_affine = 0;
......@@ -6581,7 +6636,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
*/
if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
affine_sd = tmp;
if (cpu != prev_cpu)
new_cpu = wake_affine(tmp, p, cpu, prev_cpu, sync);
sd = NULL; /* Prefer wake_affine over balance flags */
break;
}
......@@ -6591,34 +6649,17 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
break;
}
if (affine_sd) {
sd = NULL; /* Prefer wake_affine over balance flags */
if (cpu == prev_cpu)
goto pick_cpu;
new_cpu = wake_affine(affine_sd, p, cpu, prev_cpu, sync);
}
if (sd && !(sd_flag & SD_BALANCE_FORK)) {
/*
* We're going to need the task's util for capacity_spare_wake
* in find_idlest_group. Sync it up to prev_cpu's
* last_update_time.
*/
sync_entity_load_avg(&p->se);
}
if (unlikely(sd)) {
/* Slow path */
new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
} else if (sd_flag & SD_BALANCE_WAKE) { /* XXX always ? */
/* Fast path */
if (!sd) {
pick_cpu:
if (sd_flag & SD_BALANCE_WAKE) { /* XXX always ? */
new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
if (want_affine)
current->recent_used_cpu = cpu;
}
} else {
new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
}
rcu_read_unlock();
return new_cpu;
......
......@@ -1069,6 +1069,12 @@ enum numa_faults_stats {
extern void sched_setnuma(struct task_struct *p, int node);
extern int migrate_task_to(struct task_struct *p, int cpu);
extern int migrate_swap(struct task_struct *, struct task_struct *);
extern void init_numa_balancing(unsigned long clone_flags, struct task_struct *p);
#else
static inline void
init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
{
}
#endif /* CONFIG_NUMA_BALANCING */
#ifdef CONFIG_SMP
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment