Commit 42f52e1c authored by Linus Torvalds

Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Ingo Molnar:
 "The main changes are:

   - Migrate CPU-intense 'misfit' tasks on asymmetric capacity systems,
     to better utilize (much) faster 'big core' CPUs. (Morten Rasmussen,
     Valentin Schneider)

   - Topology handling improvements, in particular when CPU capacity
     changes and related load-balancing fixes/improvements (Morten
     Rasmussen)

   - ... plus misc other improvements, fixes and updates"

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (28 commits)
  sched/completions/Documentation: Add recommendation for dynamic and ONSTACK completions
  sched/completions/Documentation: Clean up the document some more
  sched/completions/Documentation: Fix a couple of punctuation nits
  cpu/SMT: State SMT is disabled even with nosmt and without "=force"
  sched/core: Fix comment regarding nr_iowait_cpu() and get_iowait_load()
  sched/fair: Remove setting task's se->runnable_weight during PELT update
  sched/fair: Disable LB_BIAS by default
  sched/pelt: Fix warning and clean up IRQ PELT config
  sched/topology: Make local variables static
  sched/debug: Use symbolic names for task state constants
  sched/numa: Remove unused numa_stats::nr_running field
  sched/numa: Remove unused code from update_numa_stats()
  sched/debug: Explicitly cast sched_feat() to bool
  sched/core: Disable SD_PREFER_SIBLING on asymmetric CPU capacity domains
  sched/fair: Don't move tasks to lower capacity CPUs unless necessary
  sched/fair: Set rq->rd->overload when misfit
  sched/fair: Wrap rq->rd->overload accesses with READ/WRITE_ONCE()
  sched/core: Change root_domain->overload type to int
  sched/fair: Change 'prefer_sibling' type to bool
  sched/fair: Kick nohz balance if rq->misfit_task_load
  ...
parents 0d1b82cd 11e13696
@@ -33,6 +33,9 @@ const struct cpumask *cpu_coregroup_mask(int cpu);
 /* Replace task scheduler's default cpu-invariant accounting */
 #define arch_scale_cpu_capacity topology_get_cpu_scale
 
+/* Enable topology flag updates */
+#define arch_update_cpu_topology topology_update_cpu_topology
+
 #else
 
 static inline void init_cpu_topology(void) { }
@@ -45,6 +45,9 @@ int pcibus_to_node(struct pci_bus *bus);
 /* Replace task scheduler's default cpu-invariant accounting */
 #define arch_scale_cpu_capacity topology_get_cpu_scale
 
+/* Enable topology flag updates */
+#define arch_update_cpu_topology topology_update_cpu_topology
+
 #include <asm-generic/topology.h>
 
 #endif /* _ASM_ARM_TOPOLOGY_H */
@@ -15,6 +15,7 @@
 #include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/sched/topology.h>
+#include <linux/cpuset.h>
 
 DEFINE_PER_CPU(unsigned long, freq_scale) = SCHED_CAPACITY_SCALE;
@@ -47,6 +48,9 @@ static ssize_t cpu_capacity_show(struct device *dev,
     return sprintf(buf, "%lu\n", topology_get_cpu_scale(NULL, cpu->dev.id));
 }
 
+static void update_topology_flags_workfn(struct work_struct *work);
+static DECLARE_WORK(update_topology_flags_work, update_topology_flags_workfn);
+
 static ssize_t cpu_capacity_store(struct device *dev,
                                   struct device_attribute *attr,
                                   const char *buf,
@@ -72,6 +76,8 @@ static ssize_t cpu_capacity_store(struct device *dev,
         topology_set_cpu_scale(i, new_capacity);
     mutex_unlock(&cpu_scale_mutex);
 
+    schedule_work(&update_topology_flags_work);
+
     return count;
 }
@@ -96,6 +102,25 @@ static int register_cpu_capacity_sysctl(void)
 }
 subsys_initcall(register_cpu_capacity_sysctl);
 
+static int update_topology;
+
+int topology_update_cpu_topology(void)
+{
+    return update_topology;
+}
+
+/*
+ * Updating the sched_domains can't be done directly from cpufreq callbacks
+ * due to locking, so queue the work for later.
+ */
+static void update_topology_flags_workfn(struct work_struct *work)
+{
+    update_topology = 1;
+    rebuild_sched_domains();
+    pr_debug("sched_domain hierarchy rebuilt, flags updated\n");
+    update_topology = 0;
+}
+
 static u32 capacity_scale;
 static u32 *raw_capacity;
@@ -201,6 +226,7 @@ init_cpu_capacity_callback(struct notifier_block *nb,
     if (cpumask_empty(cpus_to_visit)) {
         topology_normalize_cpu_scale();
+        schedule_work(&update_topology_flags_work);
         free_raw_capacity();
         pr_debug("cpu_capacity: parsing done\n");
         schedule_work(&parsing_done_work);
@@ -9,6 +9,7 @@
 #include <linux/percpu.h>
 
 void topology_normalize_cpu_scale(void);
+int topology_update_cpu_topology(void);
 
 struct device_node;
 bool topology_parse_cpu_capacity(struct device_node *cpu_node, int cpu);
@@ -23,10 +23,10 @@
 #define SD_BALANCE_FORK 0x0008 /* Balance on fork, clone */
 #define SD_BALANCE_WAKE 0x0010 /* Balance on wakeup */
 #define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */
-#define SD_ASYM_CPUCAPACITY 0x0040 /* Groups have different max cpu capacities */
-#define SD_SHARE_CPUCAPACITY 0x0080 /* Domain members share cpu capacity */
+#define SD_ASYM_CPUCAPACITY 0x0040 /* Domain members have different CPU capacities */
+#define SD_SHARE_CPUCAPACITY 0x0080 /* Domain members share CPU capacity */
 #define SD_SHARE_POWERDOMAIN 0x0100 /* Domain members share power domain */
-#define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */
+#define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share CPU pkg resources */
 #define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */
 #define SD_ASYM_PACKING 0x0800 /* Place busy groups earlier in the domain */
 #define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */
@@ -159,9 +159,14 @@ TRACE_EVENT(sched_switch,
 
         (__entry->prev_state & (TASK_REPORT_MAX - 1)) ?
         __print_flags(__entry->prev_state & (TASK_REPORT_MAX - 1), "|",
-                { 0x01, "S" }, { 0x02, "D" }, { 0x04, "T" },
-                { 0x08, "t" }, { 0x10, "X" }, { 0x20, "Z" },
-                { 0x40, "P" }, { 0x80, "I" }) :
+                { TASK_INTERRUPTIBLE, "S" },
+                { TASK_UNINTERRUPTIBLE, "D" },
+                { __TASK_STOPPED, "T" },
+                { __TASK_TRACED, "t" },
+                { EXIT_DEAD, "X" },
+                { EXIT_ZOMBIE, "Z" },
+                { TASK_PARKED, "P" },
+                { TASK_DEAD, "I" }) :
         "R",
 
         __entry->prev_state & TASK_REPORT_MAX ? "+" : "",
@@ -415,6 +415,11 @@ config IRQ_TIME_ACCOUNTING
 
       If in doubt, say N here.
 
+config HAVE_SCHED_AVG_IRQ
+    def_bool y
+    depends on IRQ_TIME_ACCOUNTING || PARAVIRT_TIME_ACCOUNTING
+    depends on SMP
+
 config BSD_PROCESS_ACCT
     bool "BSD Process Accounting"
     depends on MULTIUSER
@@ -383,6 +383,7 @@ void __init cpu_smt_disable(bool force)
         pr_info("SMT: Force disabled\n");
         cpu_smt_control = CPU_SMT_FORCE_DISABLED;
     } else {
+        pr_info("SMT: disabled\n");
         cpu_smt_control = CPU_SMT_DISABLED;
     }
 }
@@ -135,9 +135,8 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
      * In theory, the compile should just see 0 here, and optimize out the call
      * to sched_rt_avg_update. But I don't trust it...
      */
-#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
-    s64 steal = 0, irq_delta = 0;
-#endif
+    s64 __maybe_unused steal = 0, irq_delta = 0;
+
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
     irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
@@ -177,7 +176,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
     rq->clock_task += delta;
 
-#ifdef HAVE_SCHED_AVG_IRQ
+#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
     if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
         update_irq_load_avg(rq, irq_delta + steal);
 #endif
@@ -701,6 +700,7 @@ static void set_load_weight(struct task_struct *p, bool update_load)
     if (idle_policy(p->policy)) {
         load->weight = scale_load(WEIGHT_IDLEPRIO);
         load->inv_weight = WMULT_IDLEPRIO;
+        p->se.runnable_weight = load->weight;
         return;
     }
@@ -713,6 +713,7 @@ static void set_load_weight(struct task_struct *p, bool update_load)
     } else {
         load->weight = scale_load(sched_prio_to_weight[prio]);
         load->inv_weight = sched_prio_to_wmult[prio];
+        p->se.runnable_weight = load->weight;
     }
 }
@@ -2915,10 +2916,10 @@ unsigned long nr_iowait(void)
 }
 
 /*
- * Consumers of these two interfaces, like for example the cpufreq menu
- * governor are using nonsensical data. Boosting frequency for a CPU that has
- * IO-wait which might not even end up running the task when it does become
- * runnable.
+ * Consumers of these two interfaces, like for example the cpuidle menu
+ * governor, are using nonsensical data. Preferring shallow idle state selection
+ * for a CPU that has IO-wait which might not even end up running the task when
+ * it does become runnable.
  */
 unsigned long nr_iowait_cpu(int cpu)
@@ -39,7 +39,7 @@ SCHED_FEAT(WAKEUP_PREEMPTION, true)
 
 SCHED_FEAT(HRTICK, false)
 SCHED_FEAT(DOUBLE_TICK, false)
-SCHED_FEAT(LB_BIAS, true)
+SCHED_FEAT(LB_BIAS, false)
 
 /*
  * Decrement CPU capacity based on time not spent running tasks
@@ -269,9 +269,6 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runnable)
 
 int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
 {
-    if (entity_is_task(se))
-        se->runnable_weight = se->load.weight;
-
     if (___update_load_sum(now, cpu, &se->avg, 0, 0, 0)) {
         ___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
         return 1;
@@ -282,9 +279,6 @@ int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
 
 int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-    if (entity_is_task(se))
-        se->runnable_weight = se->load.weight;
-
     if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, !!se->on_rq,
                 cfs_rq->curr == se)) {
@@ -358,7 +352,7 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
     return 0;
 }
 
-#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
+#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
 /*
  * irq:
  *
@@ -6,7 +6,7 @@ int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq);
 int update_rt_rq_load_avg(u64 now, struct rq *rq, int running);
 int update_dl_rq_load_avg(u64 now, struct rq *rq, int running);
 
-#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
+#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
 int update_irq_load_avg(struct rq *rq, u64 running);
 #else
 static inline int
@@ -717,8 +717,12 @@ struct root_domain {
     cpumask_var_t span;
     cpumask_var_t online;
 
-    /* Indicate more than one runnable task for any CPU */
-    bool overload;
+    /*
+     * Indicate pullable load on at least one CPU, e.g:
+     *   - More than one runnable task
+     *   - Running task is misfit
+     */
+    int overload;
 
     /*
      * The bit corresponding to a CPU gets set here if such CPU has more
@@ -845,6 +849,8 @@ struct rq {
     unsigned char idle_balance;
 
+    unsigned long misfit_task_load;
+
     /* For active balancing */
     int active_balance;
     int push_cpu;
@@ -858,8 +864,7 @@ struct rq {
     struct sched_avg avg_rt;
     struct sched_avg avg_dl;
-#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
-#define HAVE_SCHED_AVG_IRQ
+#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
     struct sched_avg avg_irq;
 #endif
     u64 idle_stamp;
@@ -1188,6 +1193,7 @@ DECLARE_PER_CPU(int, sd_llc_id);
 DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
 DECLARE_PER_CPU(struct sched_domain *, sd_numa);
 DECLARE_PER_CPU(struct sched_domain *, sd_asym);
+extern struct static_key_false sched_asym_cpucapacity;
 
 struct sched_group_capacity {
     atomic_t ref;
@@ -1197,6 +1203,7 @@ struct sched_group_capacity {
      */
     unsigned long capacity;
     unsigned long min_capacity; /* Min per-CPU capacity in group */
+    unsigned long max_capacity; /* Max per-CPU capacity in group */
     unsigned long next_update;
     int imbalance; /* XXX unrelated to capacity but shared group state */
@@ -1396,7 +1403,7 @@ static const_debug __maybe_unused unsigned int sysctl_sched_features =
     0;
 #undef SCHED_FEAT
 
-#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
+#define sched_feat(x) !!(sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
 
 #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
@@ -1696,8 +1703,8 @@ static inline void add_nr_running(struct rq *rq, unsigned count)
 
     if (prev_nr < 2 && rq->nr_running >= 2) {
 #ifdef CONFIG_SMP
-        if (!rq->rd->overload)
-            rq->rd->overload = true;
+        if (!READ_ONCE(rq->rd->overload))
+            WRITE_ONCE(rq->rd->overload, 1);
 #endif
     }
@@ -2217,7 +2224,7 @@ static inline unsigned long cpu_util_rt(struct rq *rq)
 }
 #endif
 
-#ifdef HAVE_SCHED_AVG_IRQ
+#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
 static inline unsigned long cpu_util_irq(struct rq *rq)
 {
     return rq->avg_irq.util_avg;
@@ -7,8 +7,8 @@
 DEFINE_MUTEX(sched_domains_mutex);
 
 /* Protected by sched_domains_mutex: */
-cpumask_var_t sched_domains_tmpmask;
-cpumask_var_t sched_domains_tmpmask2;
+static cpumask_var_t sched_domains_tmpmask;
+static cpumask_var_t sched_domains_tmpmask2;
 
 #ifdef CONFIG_SCHED_DEBUG
@@ -398,6 +398,7 @@ DEFINE_PER_CPU(int, sd_llc_id);
 DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
 DEFINE_PER_CPU(struct sched_domain *, sd_numa);
 DEFINE_PER_CPU(struct sched_domain *, sd_asym);
+DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
 
 static void update_top_cache_domain(int cpu)
 {
@@ -692,6 +693,7 @@ static void init_overlap_sched_group(struct sched_domain *sd,
     sg_span = sched_group_span(sg);
     sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
     sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
+    sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;
 }
 
 static int
@@ -851,6 +853,7 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd)
     sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sched_group_span(sg));
     sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
+    sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;
 
     return sg;
 }
@@ -1061,7 +1064,6 @@ static struct cpumask ***sched_domains_numa_masks;
  *   SD_SHARE_PKG_RESOURCES - describes shared caches
  *   SD_NUMA                - describes NUMA topologies
  *   SD_SHARE_POWERDOMAIN   - describes shared power domain
- *   SD_ASYM_CPUCAPACITY    - describes mixed capacity topologies
  *
  * Odd one out, which beside describing the topology has a quirk also
  * prescribes the desired behaviour that goes along with it:
@@ -1073,13 +1075,12 @@ static struct cpumask ***sched_domains_numa_masks;
      SD_SHARE_PKG_RESOURCES | \
      SD_NUMA | \
      SD_ASYM_PACKING | \
-     SD_ASYM_CPUCAPACITY | \
      SD_SHARE_POWERDOMAIN)
 
 static struct sched_domain *
 sd_init(struct sched_domain_topology_level *tl,
     const struct cpumask *cpu_map,
-    struct sched_domain *child, int cpu)
+    struct sched_domain *child, int dflags, int cpu)
 {
     struct sd_data *sdd = &tl->data;
     struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
@@ -1100,6 +1101,9 @@ sd_init(struct sched_domain_topology_level *tl,
             "wrong sd_flags in topology description\n"))
         sd_flags &= ~TOPOLOGY_SD_FLAGS;
 
+    /* Apply detected topology flags */
+    sd_flags |= dflags;
+
     *sd = (struct sched_domain){
         .min_interval = sd_weight,
         .max_interval = 2*sd_weight,
@@ -1122,7 +1126,7 @@ sd_init(struct sched_domain_topology_level *tl,
                 | 0*SD_SHARE_CPUCAPACITY
                 | 0*SD_SHARE_PKG_RESOURCES
                 | 0*SD_SERIALIZE
-                | 0*SD_PREFER_SIBLING
+                | 1*SD_PREFER_SIBLING
                 | 0*SD_NUMA
                 | sd_flags
                 ,
@@ -1148,17 +1152,21 @@ sd_init(struct sched_domain_topology_level *tl,
     if (sd->flags & SD_ASYM_CPUCAPACITY) {
         struct sched_domain *t = sd;
 
+        /*
+         * Don't attempt to spread across CPUs of different capacities.
+         */
+        if (sd->child)
+            sd->child->flags &= ~SD_PREFER_SIBLING;
+
         for_each_lower_domain(t)
             t->flags |= SD_BALANCE_WAKE;
     }
 
     if (sd->flags & SD_SHARE_CPUCAPACITY) {
-        sd->flags |= SD_PREFER_SIBLING;
         sd->imbalance_pct = 110;
         sd->smt_gain = 1178; /* ~15% */
 
     } else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
-        sd->flags |= SD_PREFER_SIBLING;
         sd->imbalance_pct = 117;
         sd->cache_nice_tries = 1;
         sd->busy_idx = 2;
@@ -1169,6 +1177,7 @@ sd_init(struct sched_domain_topology_level *tl,
         sd->busy_idx = 3;
         sd->idle_idx = 2;
 
+        sd->flags &= ~SD_PREFER_SIBLING;
         sd->flags |= SD_SERIALIZE;
         if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
             sd->flags &= ~(SD_BALANCE_EXEC |
@@ -1178,7 +1187,6 @@ sd_init(struct sched_domain_topology_level *tl,
 #endif
     } else {
-        sd->flags |= SD_PREFER_SIBLING;
         sd->cache_nice_tries = 1;
         sd->busy_idx = 2;
         sd->idle_idx = 1;
@@ -1604,9 +1612,9 @@ static void __sdt_free(const struct cpumask *cpu_map)
 
 static struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
         const struct cpumask *cpu_map, struct sched_domain_attr *attr,
-        struct sched_domain *child, int cpu)
+        struct sched_domain *child, int dflags, int cpu)
 {
-    struct sched_domain *sd = sd_init(tl, cpu_map, child, cpu);
+    struct sched_domain *sd = sd_init(tl, cpu_map, child, dflags, cpu);
 
     if (child) {
         sd->level = child->level + 1;
@@ -1632,6 +1640,65 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
     return sd;
 }
 
+/*
+ * Find the sched_domain_topology_level where all CPU capacities are visible
+ * for all CPUs.
+ */
+static struct sched_domain_topology_level
+*asym_cpu_capacity_level(const struct cpumask *cpu_map)
+{
+    int i, j, asym_level = 0;
+    bool asym = false;
+    struct sched_domain_topology_level *tl, *asym_tl = NULL;
+    unsigned long cap;
+
+    /* Is there any asymmetry? */
+    cap = arch_scale_cpu_capacity(NULL, cpumask_first(cpu_map));
+
+    for_each_cpu(i, cpu_map) {
+        if (arch_scale_cpu_capacity(NULL, i) != cap) {
+            asym = true;
+            break;
+        }
+    }
+
+    if (!asym)
+        return NULL;
+
+    /*
+     * Examine topology from all CPU's point of views to detect the lowest
+     * sched_domain_topology_level where a highest capacity CPU is visible
+     * to everyone.
+     */
+    for_each_cpu(i, cpu_map) {
+        unsigned long max_capacity = arch_scale_cpu_capacity(NULL, i);
+        int tl_id = 0;
+
+        for_each_sd_topology(tl) {
+            if (tl_id < asym_level)
+                goto next_level;
+
+            for_each_cpu_and(j, tl->mask(i), cpu_map) {
+                unsigned long capacity;
+
+                capacity = arch_scale_cpu_capacity(NULL, j);
+
+                if (capacity <= max_capacity)
+                    continue;
+
+                max_capacity = capacity;
+                asym_level = tl_id;
+                asym_tl = tl;
+            }
+next_level:
+            tl_id++;
+        }
+    }
+
+    return asym_tl;
+}
+
 /*
  * Build sched domains for a given set of CPUs and attach the sched domains
  * to the individual CPUs
@@ -1644,18 +1711,30 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr)
     struct s_data d;
     struct rq *rq = NULL;
     int i, ret = -ENOMEM;
+    struct sched_domain_topology_level *tl_asym;
+    bool has_asym = false;
 
     alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
     if (alloc_state != sa_rootdomain)
         goto error;
 
+    tl_asym = asym_cpu_capacity_level(cpu_map);
+
     /* Set up domains for CPUs specified by the cpu_map: */
     for_each_cpu(i, cpu_map) {
         struct sched_domain_topology_level *tl;
 
         sd = NULL;
         for_each_sd_topology(tl) {
-            sd = build_sched_domain(tl, cpu_map, attr, sd, i);
+            int dflags = 0;
+
+            if (tl == tl_asym) {
+                dflags |= SD_ASYM_CPUCAPACITY;
+                has_asym = true;
+            }
+
+            sd = build_sched_domain(tl, cpu_map, attr, sd, dflags, i);
+
             if (tl == sched_domain_topology)
                 *per_cpu_ptr(d.sd, i) = sd;
             if (tl->flags & SDTL_OVERLAP)
@@ -1704,6 +1783,9 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr)
     }
     rcu_read_unlock();
 
+    if (has_asym)
+        static_branch_enable_cpuslocked(&sched_asym_cpucapacity);
+
     if (rq && sched_debug_enabled) {
         pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n",
             cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);