Commit b2e09f63 authored by Linus Torvalds

Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull more scheduler updates from Ingo Molnar:
 "Second round of scheduler changes:
   - try-to-wakeup and IPI reduction speedups, from Andy Lutomirski
   - continued power scheduling cleanups and refactorings, from Nicolas
     Pitre
   - misc fixes and enhancements"

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/deadline: Delete extraneous extern for to_ratio()
  sched/idle: Optimize try-to-wake-up IPI
  sched/idle: Simplify wake_up_idle_cpu()
  sched/idle: Clear polling before descheduling the idle thread
  sched, trace: Add a tracepoint for IPI-less remote wakeups
  cpuidle: Set polling in poll_idle
  sched: Remove redundant assignment to "rt_rq" in update_curr_rt(...)
  sched: Rename capacity related flags
  sched: Final power vs. capacity cleanups
  sched: Remove remaining dubious usage of "power"
  sched: Let 'struct sched_group_power' care about CPU capacity
  sched/fair: Disambiguate existing/remaining "capacity" usage
  sched/fair: Change "has_capacity" to "has_free_capacity"
  sched/fair: Remove "power" from 'struct numa_stats'
  sched: Fix signedness bug in yield_to()
  sched/fair: Use time_after() in record_wakee()
  sched/balancing: Reduce the rate of needless idle load balancing
  sched/fair: Fix unlocked reads of some cfs_b->quota/period
parents 3737a127 535560d8
arch/arm/kernel/topology.c
@@ -26,30 +26,30 @@
 #include <asm/topology.h>

 /*
- * cpu power scale management
+ * cpu capacity scale management
  */

 /*
- * cpu power table
+ * cpu capacity table
  * This per cpu data structure describes the relative capacity of each core.
  * On a heteregenous system, cores don't have the same computation capacity
- * and we reflect that difference in the cpu_power field so the scheduler can
- * take this difference into account during load balance. A per cpu structure
- * is preferred because each CPU updates its own cpu_power field during the
- * load balance except for idle cores. One idle core is selected to run the
- * rebalance_domains for all idle cores and the cpu_power can be updated
- * during this sequence.
+ * and we reflect that difference in the cpu_capacity field so the scheduler
+ * can take this difference into account during load balance. A per cpu
+ * structure is preferred because each CPU updates its own cpu_capacity field
+ * during the load balance except for idle cores. One idle core is selected
+ * to run the rebalance_domains for all idle cores and the cpu_capacity can be
+ * updated during this sequence.
  */
 static DEFINE_PER_CPU(unsigned long, cpu_scale);

-unsigned long arch_scale_freq_power(struct sched_domain *sd, int cpu)
+unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
 {
 	return per_cpu(cpu_scale, cpu);
 }

-static void set_power_scale(unsigned int cpu, unsigned long power)
+static void set_capacity_scale(unsigned int cpu, unsigned long capacity)
 {
-	per_cpu(cpu_scale, cpu) = power;
+	per_cpu(cpu_scale, cpu) = capacity;
 }

 #ifdef CONFIG_OF
@@ -62,11 +62,11 @@ struct cpu_efficiency {
  * Table of relative efficiency of each processors
  * The efficiency value must fit in 20bit and the final
  * cpu_scale value must be in the range
- *   0 < cpu_scale < 3*SCHED_POWER_SCALE/2
+ *   0 < cpu_scale < 3*SCHED_CAPACITY_SCALE/2
  * in order to return at most 1 when DIV_ROUND_CLOSEST
  * is used to compute the capacity of a CPU.
  * Processors that are not defined in the table,
- * use the default SCHED_POWER_SCALE value for cpu_scale.
+ * use the default SCHED_CAPACITY_SCALE value for cpu_scale.
  */
 static const struct cpu_efficiency table_efficiency[] = {
 	{"arm,cortex-a15", 3891},
@@ -83,9 +83,9 @@ static unsigned long middle_capacity = 1;
  * Iterate all CPUs' descriptor in DT and compute the efficiency
  * (as per table_efficiency). Also calculate a middle efficiency
  * as close as possible to (max{eff_i} - min{eff_i}) / 2
- * This is later used to scale the cpu_power field such that an
- * 'average' CPU is of middle power. Also see the comments near
- * table_efficiency[] and update_cpu_power().
+ * This is later used to scale the cpu_capacity field such that an
+ * 'average' CPU is of middle capacity. Also see the comments near
+ * table_efficiency[] and update_cpu_capacity().
  */
 static void __init parse_dt_topology(void)
 {
@@ -141,15 +141,15 @@ static void __init parse_dt_topology(void)
	 * cpu_scale because all CPUs have the same capacity. Otherwise, we
	 * compute a middle_capacity factor that will ensure that the capacity
	 * of an 'average' CPU of the system will be as close as possible to
-	 * SCHED_POWER_SCALE, which is the default value, but with the
+	 * SCHED_CAPACITY_SCALE, which is the default value, but with the
	 * constraint explained near table_efficiency[].
	 */
 	if (4*max_capacity < (3*(max_capacity + min_capacity)))
 		middle_capacity = (min_capacity + max_capacity)
-				>> (SCHED_POWER_SHIFT+1);
+				>> (SCHED_CAPACITY_SHIFT+1);
 	else
 		middle_capacity = ((max_capacity / 3)
-				>> (SCHED_POWER_SHIFT-1)) + 1;
+				>> (SCHED_CAPACITY_SHIFT-1)) + 1;
 }
@@ -158,20 +158,20 @@ static void __init parse_dt_topology(void)
  * boot. The update of all CPUs is in O(n^2) for heteregeneous system but the
  * function returns directly for SMP system.
  */
-static void update_cpu_power(unsigned int cpu)
+static void update_cpu_capacity(unsigned int cpu)
 {
 	if (!cpu_capacity(cpu))
 		return;

-	set_power_scale(cpu, cpu_capacity(cpu) / middle_capacity);
+	set_capacity_scale(cpu, cpu_capacity(cpu) / middle_capacity);

-	printk(KERN_INFO "CPU%u: update cpu_power %lu\n",
-		cpu, arch_scale_freq_power(NULL, cpu));
+	printk(KERN_INFO "CPU%u: update cpu_capacity %lu\n",
+		cpu, arch_scale_freq_capacity(NULL, cpu));
 }

 #else
 static inline void parse_dt_topology(void) {}
-static inline void update_cpu_power(unsigned int cpuid) {}
+static inline void update_cpu_capacity(unsigned int cpuid) {}
 #endif

 /*
@@ -267,7 +267,7 @@ void store_cpu_topology(unsigned int cpuid)
 	update_siblings_masks(cpuid);

-	update_cpu_power(cpuid);
+	update_cpu_capacity(cpuid);

 	printk(KERN_INFO "CPU%u: thread %d, cpu %d, socket %d, mpidr %x\n",
 		cpuid, cpu_topology[cpuid].thread_id,
@@ -297,7 +297,7 @@ void __init init_cpu_topology(void)
 {
 	unsigned int cpu;

-	/* init core mask and power*/
+	/* init core mask and capacity */
 	for_each_possible_cpu(cpu) {
 		struct cputopo_arm *cpu_topo = &(cpu_topology[cpu]);
@@ -307,7 +307,7 @@ void __init init_cpu_topology(void)
 		cpumask_clear(&cpu_topo->core_sibling);
 		cpumask_clear(&cpu_topo->thread_sibling);

-		set_power_scale(cpu, SCHED_POWER_SCALE);
+		set_capacity_scale(cpu, SCHED_CAPACITY_SCALE);
 	}
 	smp_wmb();
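To make the capacity arithmetic above concrete, here is a standalone userspace sketch (not kernel code) that runs the same formulas. The inputs are assumptions for illustration: a big.LITTLE pair of a Cortex-A15 (efficiency 3891) and a Cortex-A7 (efficiency 2048), both taken from table_efficiency[] above, and an assumed 1 GHz clock for each, with capacity computed as (clock-frequency >> 20) * efficiency as parse_dt_topology() does:

	#include <stdio.h>

	#define SCHED_CAPACITY_SHIFT	10
	#define SCHED_CAPACITY_SCALE	(1L << SCHED_CAPACITY_SHIFT)

	int main(void)
	{
		/* capacity = (clock-frequency >> 20) * efficiency */
		unsigned long max_capacity = (1000000000UL >> 20) * 3891;	/* A15 */
		unsigned long min_capacity = (1000000000UL >> 20) * 2048;	/* A7  */
		unsigned long middle_capacity;

		/* same branch structure as the hunk at @@ -141,15 above */
		if (4 * max_capacity < 3 * (max_capacity + min_capacity))
			middle_capacity = (min_capacity + max_capacity)
					>> (SCHED_CAPACITY_SHIFT + 1);
		else
			middle_capacity = ((max_capacity / 3)
					>> (SCHED_CAPACITY_SHIFT - 1)) + 1;

		/* prints: cpu_scale A15=1342 A7=706 -- one on each side of
		 * SCHED_CAPACITY_SCALE (1024), and both below 3*1024/2 = 1536,
		 * matching the constraint documented near table_efficiency[] */
		printf("cpu_scale A15=%lu A7=%lu\n",
		       max_capacity / middle_capacity,
		       min_capacity / middle_capacity);
		return 0;
	}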
arch/powerpc/kernel/smp.c
@@ -749,7 +749,7 @@ int setup_profiling_timer(unsigned int multiplier)
 /* cpumask of CPUs with asymetric SMT dependancy */
 static const int powerpc_smt_flags(void)
 {
-	int flags = SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES;
+	int flags = SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES;

 	if (cpu_has_feature(CPU_FTR_ASYM_SMT)) {
 		printk_once(KERN_INFO "Enabling Asymmetric SMT scheduling\n");
drivers/cpuidle/driver.c
@@ -187,8 +187,11 @@ static int poll_idle(struct cpuidle_device *dev,
 	t1 = ktime_get();
 	local_irq_enable();
-	while (!need_resched())
-		cpu_relax();
+	if (!current_set_polling_and_test()) {
+		while (!need_resched())
+			cpu_relax();
+	}
+	current_clr_polling();

 	t2 = ktime_get();
 	diff = ktime_to_us(ktime_sub(t2, t1));
include/linux/kvm_host.h
@@ -586,7 +586,7 @@ void mark_page_dirty(struct kvm *kvm, gfn_t gfn);

 void kvm_vcpu_block(struct kvm_vcpu *vcpu);
 void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
-bool kvm_vcpu_yield_to(struct kvm_vcpu *target);
+int kvm_vcpu_yield_to(struct kvm_vcpu *target);
 void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu);
 void kvm_load_guest_fpu(struct kvm_vcpu *vcpu);
 void kvm_put_guest_fpu(struct kvm_vcpu *vcpu);
include/linux/sched.h
@@ -847,10 +847,10 @@ enum cpu_idle_type {
 };

 /*
- * Increase resolution of cpu_power calculations
+ * Increase resolution of cpu_capacity calculations
  */
-#define SCHED_POWER_SHIFT	10
-#define SCHED_POWER_SCALE	(1L << SCHED_POWER_SHIFT)
+#define SCHED_CAPACITY_SHIFT	10
+#define SCHED_CAPACITY_SCALE	(1L << SCHED_CAPACITY_SHIFT)

 /*
  * sched-domains (multiprocessor balancing) declarations:
@@ -862,7 +862,7 @@ enum cpu_idle_type {
 #define SD_BALANCE_FORK		0x0008	/* Balance on fork, clone */
 #define SD_BALANCE_WAKE		0x0010	/* Balance on wakeup */
 #define SD_WAKE_AFFINE		0x0020	/* Wake task to waking CPU */
-#define SD_SHARE_CPUPOWER	0x0080	/* Domain members share cpu power */
+#define SD_SHARE_CPUCAPACITY	0x0080	/* Domain members share cpu power */
 #define SD_SHARE_POWERDOMAIN	0x0100	/* Domain members share power domain */
 #define SD_SHARE_PKG_RESOURCES	0x0200	/* Domain members share cpu pkg resources */
 #define SD_SERIALIZE		0x0400	/* Only a single load balancing instance */
@@ -874,7 +874,7 @@ enum cpu_idle_type {
 #ifdef CONFIG_SCHED_SMT
 static inline const int cpu_smt_flags(void)
 {
-	return SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES;
+	return SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES;
 }
 #endif
@@ -1006,7 +1006,7 @@ typedef const int (*sched_domain_flags_f)(void);
 struct sd_data {
 	struct sched_domain **__percpu sd;
 	struct sched_group **__percpu sg;
-	struct sched_group_power **__percpu sgp;
+	struct sched_group_capacity **__percpu sgc;
 };

 struct sched_domain_topology_level {
@@ -2173,7 +2173,7 @@ static inline void sched_autogroup_fork(struct signal_struct *sig) { }
 static inline void sched_autogroup_exit(struct signal_struct *sig) { }
 #endif

-extern bool yield_to(struct task_struct *p, bool preempt);
+extern int yield_to(struct task_struct *p, bool preempt);
 extern void set_user_nice(struct task_struct *p, long nice);
 extern int task_prio(const struct task_struct *p);

 /**
include/trace/events/sched.h
@@ -530,6 +530,26 @@ TRACE_EVENT(sched_swap_numa,
 			__entry->dst_pid, __entry->dst_tgid, __entry->dst_ngid,
 			__entry->dst_cpu, __entry->dst_nid)
 );
+
+/*
+ * Tracepoint for waking a polling cpu without an IPI.
+ */
+TRACE_EVENT(sched_wake_idle_without_ipi,
+
+	TP_PROTO(int cpu),
+
+	TP_ARGS(cpu),
+
+	TP_STRUCT__entry(
+		__field(	int,	cpu	)
+	),
+
+	TP_fast_assign(
+		__entry->cpu	= cpu;
+	),
+
+	TP_printk("cpu=%d", __entry->cpu)
+);
 #endif /* _TRACE_SCHED_H */

 /* This part must be outside protection */
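The wake-side code that fires this tracepoint lives in the collapsed kernel/sched/core.c diff below; what follows is a simplified sketch of its shape, not the verbatim change. It assumes the set_nr_and_not_polling() helper this series introduces (see the sketch after the kernel/sched/idle.c hunks):

	/* Sketch: if the remote idle task is polling on TIF_NEED_RESCHED,
	 * setting that flag is enough to wake it and the reschedule IPI is
	 * skipped, which is exactly when sched_wake_idle_without_ipi fires.
	 */
	static void wake_up_idle_cpu(int cpu)
	{
		struct rq *rq = cpu_rq(cpu);

		if (cpu == smp_processor_id())
			return;

		if (set_nr_and_not_polling(rq->idle))
			smp_send_reschedule(cpu);	/* target was not polling */
		else
			trace_sched_wake_idle_without_ipi(cpu);
	}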
kernel/sched/core.c: This diff is collapsed.
kernel/sched/deadline.c
@@ -57,8 +57,6 @@ void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime)
 	dl_b->dl_runtime = runtime;
 }

-extern unsigned long to_ratio(u64 period, u64 runtime);
-
 void init_dl_bw(struct dl_bw *dl_b)
 {
 	raw_spin_lock_init(&dl_b->lock);
kernel/sched/fair.c: This diff is collapsed.
kernel/sched/features.h
@@ -37,18 +37,18 @@ SCHED_FEAT(CACHE_HOT_BUDDY, true)
 SCHED_FEAT(WAKEUP_PREEMPTION, true)

 /*
- * Use arch dependent cpu power functions
+ * Use arch dependent cpu capacity functions
  */
-SCHED_FEAT(ARCH_POWER, true)
+SCHED_FEAT(ARCH_CAPACITY, true)

 SCHED_FEAT(HRTICK, false)
 SCHED_FEAT(DOUBLE_TICK, false)
 SCHED_FEAT(LB_BIAS, true)

 /*
- * Decrement CPU power based on time not spent running tasks
+ * Decrement CPU capacity based on time not spent running tasks
  */
-SCHED_FEAT(NONTASK_POWER, true)
+SCHED_FEAT(NONTASK_CAPACITY, true)

 /*
  * Queue remote wakeups on the target CPU and process them
kernel/sched/idle.c
@@ -12,6 +12,8 @@

 #include <trace/events/power.h>

+#include "sched.h"
+
 static int __read_mostly cpu_idle_force_poll;

 void cpu_idle_poll_ctrl(bool enable)
@@ -67,6 +69,10 @@ void __weak arch_cpu_idle(void)
  * cpuidle_idle_call - the main idle function
  *
  * NOTE: no locks or semaphores should be used here
+ *
+ * On archs that support TIF_POLLING_NRFLAG, is called with polling
+ * set, and it returns with polling set. If it ever stops polling, it
+ * must clear the polling bit.
  */
 static void cpuidle_idle_call(void)
 {
@@ -175,10 +181,22 @@ static void cpuidle_idle_call(void)

 /*
  * Generic idle loop implementation
+ *
+ * Called with polling cleared.
  */
 static void cpu_idle_loop(void)
 {
 	while (1) {
+		/*
+		 * If the arch has a polling bit, we maintain an invariant:
+		 *
+		 * Our polling bit is clear if we're not scheduled (i.e. if
+		 * rq->curr != rq->idle). This means that, if rq->idle has
+		 * the polling bit set, then setting need_resched is
+		 * guaranteed to cause the cpu to reschedule.
+		 */
+
+		__current_set_polling();
 		tick_nohz_idle_enter();

 		while (!need_resched()) {
@@ -218,6 +236,17 @@ static void cpu_idle_loop(void)
 		 */
 		preempt_set_need_resched();
 		tick_nohz_idle_exit();
+		__current_clr_polling();
+
+		/*
+		 * We promise to call sched_ttwu_pending and reschedule
+		 * if need_resched is set while polling is set. That
+		 * means that clearing polling needs to be visible
+		 * before doing these things.
+		 */
+		smp_mb__after_atomic();
+
+		sched_ttwu_pending();
 		schedule_preempt_disabled();
 	}
 }
@@ -239,7 +268,6 @@ void cpu_startup_entry(enum cpuhp_state state)
 	 */
 	boot_init_stack_canary();
 #endif
-	__current_set_polling();
 	arch_cpu_idle_prepare();
 	cpu_idle_loop();
 }
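The polling invariant and the smp_mb__after_atomic() above pair with an atomic set of TIF_NEED_RESCHED on the wake side. Here is a sketch of that helper; the real one sits in the collapsed kernel/sched/core.c diff, and this version open-codes its fetch-or as a cmpxchg() loop with the flags type assumed to be unsigned long:

	/* Sketch: atomically set TIF_NEED_RESCHED and report whether the
	 * task was NOT polling, i.e. whether the waker still needs to send
	 * an IPI. Doing both in one atomic op closes the race against the
	 * idle loop clearing its polling bit just before descheduling.
	 */
	static bool set_nr_and_not_polling(struct task_struct *p)
	{
		struct thread_info *ti = task_thread_info(p);
		unsigned long old, val = ACCESS_ONCE(ti->flags);

		for (;;) {
			old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED);
			if (old == val)
				break;
			val = old;
		}
		return !(old & _TIF_POLLING_NRFLAG);
	}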
kernel/sched/rt.c
@@ -918,7 +918,6 @@ static void update_curr_rt(struct rq *rq)
 {
 	struct task_struct *curr = rq->curr;
 	struct sched_rt_entity *rt_se = &curr->rt;
-	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
 	u64 delta_exec;

 	if (curr->sched_class != &rt_sched_class)
@@ -943,7 +942,7 @@ static void update_curr_rt(struct rq *rq)
 		return;

 	for_each_sched_rt_entity(rt_se) {
-		rt_rq = rt_rq_of_se(rt_se);
+		struct rt_rq *rt_rq = rt_rq_of_se(rt_se);

 		if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
 			raw_spin_lock(&rt_rq->rt_runtime_lock);
kernel/sched/sched.h
@@ -567,7 +567,7 @@ struct rq {
 	struct root_domain *rd;
 	struct sched_domain *sd;

-	unsigned long cpu_power;
+	unsigned long cpu_capacity;

 	unsigned char idle_balance;
 	/* For active balancing */
@@ -670,6 +670,8 @@ extern int migrate_swap(struct task_struct *, struct task_struct *);

 #ifdef CONFIG_SMP

+extern void sched_ttwu_pending(void);
+
 #define rcu_dereference_check_sched_domain(p) \
 	rcu_dereference_check((p), \
 			      lockdep_is_held(&sched_domains_mutex))
@@ -728,15 +730,15 @@ DECLARE_PER_CPU(struct sched_domain *, sd_numa);
 DECLARE_PER_CPU(struct sched_domain *, sd_busy);
 DECLARE_PER_CPU(struct sched_domain *, sd_asym);

-struct sched_group_power {
+struct sched_group_capacity {
 	atomic_t ref;
 	/*
-	 * CPU power of this group, SCHED_LOAD_SCALE being max power for a
-	 * single CPU.
+	 * CPU capacity of this group, SCHED_LOAD_SCALE being max capacity
+	 * for a single CPU.
 	 */
-	unsigned int power, power_orig;
+	unsigned int capacity, capacity_orig;
 	unsigned long next_update;
-	int imbalance; /* XXX unrelated to power but shared group state */
+	int imbalance; /* XXX unrelated to capacity but shared group state */
 	/*
 	 * Number of busy cpus in this group.
 	 */
@@ -750,7 +752,7 @@ struct sched_group {
 	atomic_t ref;

 	unsigned int group_weight;
-	struct sched_group_power *sgp;
+	struct sched_group_capacity *sgc;

 	/*
 	 * The CPUs this group covers.
@@ -773,7 +775,7 @@ static inline struct cpumask *sched_group_cpus(struct sched_group *sg)
  */
 static inline struct cpumask *sched_group_mask(struct sched_group *sg)
 {
-	return to_cpumask(sg->sgp->cpumask);
+	return to_cpumask(sg->sgc->cpumask);
 }

 /**
@@ -787,6 +789,10 @@ static inline unsigned int group_first_cpu(struct sched_group *group)
 extern int group_balance_cpu(struct sched_group *sg);

 #else

+static inline void sched_ttwu_pending(void) { }
+
 #endif /* CONFIG_SMP */

 #include "stats.h"
@@ -1167,7 +1173,7 @@ extern const struct sched_class idle_sched_class;

 #ifdef CONFIG_SMP

-extern void update_group_power(struct sched_domain *sd, int cpu);
+extern void update_group_capacity(struct sched_domain *sd, int cpu);

 extern void trigger_load_balance(struct rq *rq);
virt/kvm/kvm_main.c
@@ -1714,11 +1714,11 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
 EXPORT_SYMBOL_GPL(kvm_vcpu_kick);
 #endif /* !CONFIG_S390 */

-bool kvm_vcpu_yield_to(struct kvm_vcpu *target)
+int kvm_vcpu_yield_to(struct kvm_vcpu *target)
 {
 	struct pid *pid;
 	struct task_struct *task = NULL;
-	bool ret = false;
+	int ret = 0;

 	rcu_read_lock();
 	pid = rcu_dereference(target->pid);
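On the "sched: Fix signedness bug in yield_to()" change reflected in this hunk: yield_to() can return -ESRCH, and storing a negative errno in a bool silently collapses it to true, so a caller comparing the return value against zero can never see the failure. A minimal standalone illustration (ordinary C, not kernel code):

	#include <stdbool.h>
	#include <stdio.h>

	int main(void)
	{
		int err = -3;		/* stand-in for -ESRCH */
		bool as_bool = err;	/* any nonzero value converts to 1 */
		int as_int = err;	/* keeps its sign */

		/* prints "bool<0: 0, int<0: 1" -- the bool version loses
		 * the error, which is why the return type became int */
		printf("bool<0: %d, int<0: %d\n", as_bool < 0, as_int < 0);
		return 0;
	}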