Commit 3e201463 authored by Linus Torvalds's avatar Linus Torvalds

Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Ingo Molnar:
 "The main updates in this cycle were:

   - Group balancing enhancements and cleanups (Brendan Jackman)

   - Move CPU isolation related functionality into its separate
     kernel/sched/isolation.c file, with related 'housekeeping_*()'
     namespace and nomenclature et al. (Frederic Weisbecker)

   - Improve the interactive/cpu-intense fairness calculation (Josef
     Bacik)

   - Improve the PELT code and related cleanups (Peter Zijlstra)

   - Improve the logic of pick_next_task_fair() (Uladzislau Rezki)

   - Improve the RT IPI based balancing logic (Steven Rostedt)

   - Various micro-optimizations:

   - better !CONFIG_SCHED_DEBUG optimizations (Patrick Bellasi)

   - better idle loop (Cheng Jian)

   - ... plus misc fixes, cleanups and updates"

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (54 commits)
  sched/core: Optimize sched_feat() for !CONFIG_SCHED_DEBUG builds
  sched/sysctl: Fix attributes of some extern declarations
  sched/isolation: Document isolcpus= boot parameter flags, mark it deprecated
  sched/isolation: Add basic isolcpus flags
  sched/isolation: Move isolcpus= handling to the housekeeping code
  sched/isolation: Handle the nohz_full= parameter
  sched/isolation: Introduce housekeeping flags
  sched/isolation: Split out new CONFIG_CPU_ISOLATION=y config from CONFIG_NO_HZ_FULL
  sched/isolation: Rename is_housekeeping_cpu() to housekeeping_cpu()
  sched/isolation: Use its own static key
  sched/isolation: Make the housekeeping cpumask private
  sched/isolation: Provide a dynamic off-case to housekeeping_any_cpu()
  sched/isolation, watchdog: Use housekeeping_cpumask() instead of ad-hoc version
  sched/isolation: Move housekeeping related code to its own file
  sched/idle: Micro-optimize the idle loop
  sched/isolcpus: Fix "isolcpus=" boot parameter handling when !CONFIG_CPUMASK_OFFSTACK
  x86/tsc: Append the 'tsc=' description for the 'tsc=unstable' boot parameter
  sched/rt: Simplify the IPI based RT balancing logic
  block/ioprio: Use a helper to check for RT prio
  sched/rt: Add a helper to test for a RT task
  ...
parents f2be8bd5 765cc3a4
......@@ -1730,20 +1730,33 @@
isapnp= [ISAPNP]
Format: <RDP>,<reset>,<pci_scan>,<verbosity>
isolcpus= [KNL,SMP] Isolate CPUs from the general scheduler.
The argument is a cpu list, as described above.
isolcpus= [KNL,SMP] Isolate a given set of CPUs from disturbance.
[Deprecated - use cpusets instead]
Format: [flag-list,]<cpu-list>
Specify one or more CPUs to isolate from disturbances
specified in the flag list (default: domain):
nohz
Disable the tick when a single task runs.
domain
Isolate from the general SMP balancing and scheduling
algorithms. Note that performing domain isolation this way
is irreversible: it's not possible to bring back a CPU to
the domains once isolated through isolcpus. It's strongly
advised to use cpusets instead to disable scheduler load
balancing through the "cpuset.sched_load_balance" file.
It offers a much more flexible interface where CPUs can
move in and out of an isolated set anytime.
You can move a process onto or off an "isolated" CPU via
the CPU affinity syscalls or cpuset.
<cpu number> begins at 0 and the maximum value is
"number of CPUs in system - 1".
The format of <cpu-list> is described above.
This option can be used to specify one or more CPUs
to isolate from the general SMP balancing and scheduling
algorithms. You can move a process onto or off an
"isolated" CPU via the CPU affinity syscalls or cpuset.
<cpu number> begins at 0 and the maximum value is
"number of CPUs in system - 1".
This option is the preferred way to isolate CPUs. The
alternative -- manually setting the CPU mask of all
tasks in the system -- can cause problems and
suboptimal load balancer performance.
iucv= [HW,NET]
......@@ -4209,6 +4222,9 @@
Used to run time disable IRQ_TIME_ACCOUNTING on any
platforms where RDTSC is slow and this accounting
can add overhead.
[x86] unstable: mark the TSC clocksource as unstable, this
marks the TSC unconditionally unstable at bootup and
avoids any further wobbles once the TSC watchdog notices.
turbografx.map[2|3]= [HW,JOY]
TurboGraFX parallel port interface
......
......@@ -18,6 +18,7 @@
#include <linux/cpufeature.h>
#include <linux/tick.h>
#include <linux/pm_qos.h>
#include <linux/sched/isolation.h>
#include "base.h"
......@@ -271,8 +272,16 @@ static ssize_t print_cpus_isolated(struct device *dev,
struct device_attribute *attr, char *buf)
{
int n = 0, len = PAGE_SIZE-2;
cpumask_var_t isolated;
n = scnprintf(buf, len, "%*pbl\n", cpumask_pr_args(cpu_isolated_map));
if (!alloc_cpumask_var(&isolated, GFP_KERNEL))
return -ENOMEM;
cpumask_andnot(isolated, cpu_possible_mask,
housekeeping_cpumask(HK_FLAG_DOMAIN));
n = scnprintf(buf, len, "%*pbl\n", cpumask_pr_args(isolated));
free_cpumask_var(isolated);
return n;
}
......
......@@ -40,7 +40,7 @@
#include <linux/tcp.h>
#include <linux/net_tstamp.h>
#include <linux/ptp_clock_kernel.h>
#include <linux/tick.h>
#include <linux/sched/isolation.h>
#include <asm/checksum.h>
#include <asm/homecache.h>
......@@ -2270,8 +2270,8 @@ static int __init tile_net_init_module(void)
tile_net_dev_init(name, mac);
if (!network_cpus_init())
cpumask_and(&network_cpus_map, housekeeping_cpumask(),
cpu_online_mask);
cpumask_and(&network_cpus_map,
housekeeping_cpumask(HK_FLAG_MISC), cpu_online_mask);
return 0;
}
......
......@@ -138,7 +138,7 @@ static const char * const task_state_array[] = {
static inline const char *get_task_state(struct task_struct *tsk)
{
BUILD_BUG_ON(1 + ilog2(TASK_REPORT_MAX) != ARRAY_SIZE(task_state_array));
return task_state_array[__get_task_state(tsk)];
return task_state_array[task_state_index(tsk)];
}
static inline int get_task_umask(struct task_struct *tsk)
......
......@@ -131,6 +131,11 @@ static inline unsigned int cpumask_first(const struct cpumask *srcp)
return 0;
}
static inline unsigned int cpumask_last(const struct cpumask *srcp)
{
return 0;
}
/* Valid inputs for n are -1 and 0. */
static inline unsigned int cpumask_next(int n, const struct cpumask *srcp)
{
......@@ -179,6 +184,17 @@ static inline unsigned int cpumask_first(const struct cpumask *srcp)
return find_first_bit(cpumask_bits(srcp), nr_cpumask_bits);
}
/**
* cpumask_last - get the last CPU in a cpumask
* @srcp: - the cpumask pointer
*
* Returns >= nr_cpumask_bits if no CPUs set.
*/
static inline unsigned int cpumask_last(const struct cpumask *srcp)
{
return find_last_bit(cpumask_bits(srcp), nr_cpumask_bits);
}
unsigned int cpumask_next(int n, const struct cpumask *srcp);
/**
......
......@@ -3,6 +3,7 @@
#define IOPRIO_H
#include <linux/sched.h>
#include <linux/sched/rt.h>
#include <linux/iocontext.h>
/*
......@@ -63,7 +64,7 @@ static inline int task_nice_ioclass(struct task_struct *task)
{
if (task->policy == SCHED_IDLE)
return IOPRIO_CLASS_IDLE;
else if (task->policy == SCHED_FIFO || task->policy == SCHED_RR)
else if (task_is_realtime(task))
return IOPRIO_CLASS_RT;
else
return IOPRIO_CLASS_BE;
......
......@@ -166,8 +166,6 @@ struct task_group;
/* Task command name length: */
#define TASK_COMM_LEN 16
extern cpumask_var_t cpu_isolated_map;
extern void scheduler_tick(void);
#define MAX_SCHEDULE_TIMEOUT LONG_MAX
......@@ -332,9 +330,11 @@ struct load_weight {
struct sched_avg {
u64 last_update_time;
u64 load_sum;
u64 runnable_load_sum;
u32 util_sum;
u32 period_contrib;
unsigned long load_avg;
unsigned long runnable_load_avg;
unsigned long util_avg;
};
......@@ -377,6 +377,7 @@ struct sched_statistics {
struct sched_entity {
/* For load-balancing: */
struct load_weight load;
unsigned long runnable_weight;
struct rb_node run_node;
struct list_head group_node;
unsigned int on_rq;
......@@ -472,10 +473,10 @@ struct sched_dl_entity {
* conditions between the inactive timer handler and the wakeup
* code.
*/
int dl_throttled;
int dl_boosted;
int dl_yielded;
int dl_non_contending;
int dl_throttled : 1;
int dl_boosted : 1;
int dl_yielded : 1;
int dl_non_contending : 1;
/*
* Bandwidth enforcement timer. Each -deadline task has its
......@@ -1246,7 +1247,7 @@ static inline pid_t task_pgrp_nr(struct task_struct *tsk)
#define TASK_REPORT_IDLE (TASK_REPORT + 1)
#define TASK_REPORT_MAX (TASK_REPORT_IDLE << 1)
static inline unsigned int __get_task_state(struct task_struct *tsk)
static inline unsigned int task_state_index(struct task_struct *tsk)
{
unsigned int tsk_state = READ_ONCE(tsk->state);
unsigned int state = (tsk_state | tsk->exit_state) & TASK_REPORT;
......@@ -1259,7 +1260,7 @@ static inline unsigned int __get_task_state(struct task_struct *tsk)
return fls(state);
}
static inline char __task_state_to_char(unsigned int state)
static inline char task_index_to_char(unsigned int state)
{
static const char state_char[] = "RSDTtXZPI";
......@@ -1270,7 +1271,7 @@ static inline char __task_state_to_char(unsigned int state)
static inline char task_state_to_char(struct task_struct *tsk)
{
return __task_state_to_char(__get_task_state(tsk));
return task_index_to_char(task_state_index(tsk));
}
/**
......
#ifndef _LINUX_SCHED_ISOLATION_H
#define _LINUX_SCHED_ISOLATION_H
#include <linux/cpumask.h>
#include <linux/init.h>
#include <linux/tick.h>
enum hk_flags {
HK_FLAG_TIMER = 1,
HK_FLAG_RCU = (1 << 1),
HK_FLAG_MISC = (1 << 2),
HK_FLAG_SCHED = (1 << 3),
HK_FLAG_TICK = (1 << 4),
HK_FLAG_DOMAIN = (1 << 5),
};
#ifdef CONFIG_CPU_ISOLATION
DECLARE_STATIC_KEY_FALSE(housekeeping_overriden);
extern int housekeeping_any_cpu(enum hk_flags flags);
extern const struct cpumask *housekeeping_cpumask(enum hk_flags flags);
extern void housekeeping_affine(struct task_struct *t, enum hk_flags flags);
extern bool housekeeping_test_cpu(int cpu, enum hk_flags flags);
extern void __init housekeeping_init(void);
#else
static inline int housekeeping_any_cpu(enum hk_flags flags)
{
return smp_processor_id();
}
static inline const struct cpumask *housekeeping_cpumask(enum hk_flags flags)
{
return cpu_possible_mask;
}
static inline void housekeeping_affine(struct task_struct *t,
enum hk_flags flags) { }
static inline void housekeeping_init(void) { }
#endif /* CONFIG_CPU_ISOLATION */
static inline bool housekeeping_cpu(int cpu, enum hk_flags flags)
{
#ifdef CONFIG_CPU_ISOLATION
if (static_branch_unlikely(&housekeeping_overriden))
return housekeeping_test_cpu(cpu, flags);
#endif
return true;
}
#endif /* _LINUX_SCHED_ISOLATION_H */
......@@ -18,6 +18,17 @@ static inline int rt_task(struct task_struct *p)
return rt_prio(p->prio);
}
static inline bool task_is_realtime(struct task_struct *tsk)
{
int policy = tsk->policy;
if (policy == SCHED_FIFO || policy == SCHED_RR)
return true;
if (policy == SCHED_DEADLINE)
return true;
return false;
}
#ifdef CONFIG_RT_MUTEXES
/*
* Must hold either p->pi_lock or task_rq(p)->lock.
......
......@@ -38,9 +38,9 @@ extern unsigned int sysctl_numa_balancing_scan_period_max;
extern unsigned int sysctl_numa_balancing_scan_size;
#ifdef CONFIG_SCHED_DEBUG
extern unsigned int sysctl_sched_migration_cost;
extern unsigned int sysctl_sched_nr_migrate;
extern unsigned int sysctl_sched_time_avg;
extern __read_mostly unsigned int sysctl_sched_migration_cost;
extern __read_mostly unsigned int sysctl_sched_nr_migrate;
extern __read_mostly unsigned int sysctl_sched_time_avg;
int sched_proc_update_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length,
......
......@@ -138,7 +138,6 @@ static inline u64 get_cpu_iowait_time_us(int cpu, u64 *unused) { return -1; }
#ifdef CONFIG_NO_HZ_FULL
extern bool tick_nohz_full_running;
extern cpumask_var_t tick_nohz_full_mask;
extern cpumask_var_t housekeeping_mask;
static inline bool tick_nohz_full_enabled(void)
{
......@@ -162,11 +161,6 @@ static inline void tick_nohz_full_add_cpus_to(struct cpumask *mask)
cpumask_or(mask, mask, tick_nohz_full_mask);
}
static inline int housekeeping_any_cpu(void)
{
return cpumask_any_and(housekeeping_mask, cpu_online_mask);
}
extern void tick_nohz_dep_set(enum tick_dep_bits bit);
extern void tick_nohz_dep_clear(enum tick_dep_bits bit);
extern void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit);
......@@ -235,11 +229,8 @@ static inline void tick_dep_clear_signal(struct signal_struct *signal,
extern void tick_nohz_full_kick_cpu(int cpu);
extern void __tick_nohz_task_switch(void);
extern void __init tick_nohz_full_setup(cpumask_var_t cpumask);
#else
static inline int housekeeping_any_cpu(void)
{
return smp_processor_id();
}
static inline bool tick_nohz_full_enabled(void) { return false; }
static inline bool tick_nohz_full_cpu(int cpu) { return false; }
static inline void tick_nohz_full_add_cpus_to(struct cpumask *mask) { }
......@@ -259,35 +250,9 @@ static inline void tick_dep_clear_signal(struct signal_struct *signal,
static inline void tick_nohz_full_kick_cpu(int cpu) { }
static inline void __tick_nohz_task_switch(void) { }
static inline void tick_nohz_full_setup(cpumask_var_t cpumask) { }
#endif
static inline const struct cpumask *housekeeping_cpumask(void)
{
#ifdef CONFIG_NO_HZ_FULL
if (tick_nohz_full_enabled())
return housekeeping_mask;
#endif
return cpu_possible_mask;
}
static inline bool is_housekeeping_cpu(int cpu)
{
#ifdef CONFIG_NO_HZ_FULL
if (tick_nohz_full_enabled())
return cpumask_test_cpu(cpu, housekeeping_mask);
#endif
return true;
}
static inline void housekeeping_affine(struct task_struct *t)
{
#ifdef CONFIG_NO_HZ_FULL
if (tick_nohz_full_enabled())
set_cpus_allowed_ptr(t, housekeeping_mask);
#endif
}
static inline void tick_nohz_task_switch(void)
{
if (tick_nohz_full_enabled())
......
......@@ -118,7 +118,7 @@ static inline long __trace_sched_switch_state(bool preempt, struct task_struct *
if (preempt)
return TASK_STATE_MAX;
return __get_task_state(p);
return task_state_index(p);
}
#endif /* CREATE_TRACE_POINTS */
......
......@@ -472,6 +472,13 @@ config TASK_IO_ACCOUNTING
endmenu # "CPU/Task time and stats accounting"
config CPU_ISOLATION
bool "CPU isolation"
help
Make sure that CPUs running critical tasks are not disturbed by
any source of "noise" such as unbound workqueues, timers, kthreads...
Unbound jobs get offloaded to housekeeping CPUs.
source "kernel/rcu/Kconfig"
config BUILD_BIN2C
......
......@@ -46,6 +46,7 @@
#include <linux/cgroup.h>
#include <linux/efi.h>
#include <linux/tick.h>
#include <linux/sched/isolation.h>
#include <linux/interrupt.h>
#include <linux/taskstats_kern.h>
#include <linux/delayacct.h>
......@@ -606,6 +607,7 @@ asmlinkage __visible void __init start_kernel(void)
early_irq_init();
init_IRQ();
tick_init();
housekeeping_init();
rcu_init_nohz();
init_timers();
hrtimers_init();
......
......@@ -57,7 +57,7 @@
#include <linux/backing-dev.h>
#include <linux/sort.h>
#include <linux/oom.h>
#include <linux/sched/isolation.h>
#include <linux/uaccess.h>
#include <linux/atomic.h>
#include <linux/mutex.h>
......@@ -656,7 +656,6 @@ static int generate_sched_domains(cpumask_var_t **domains,
int csn; /* how many cpuset ptrs in csa so far */
int i, j, k; /* indices for partition finding loops */
cpumask_var_t *doms; /* resulting partition; i.e. sched domains */
cpumask_var_t non_isolated_cpus; /* load balanced CPUs */
struct sched_domain_attr *dattr; /* attributes for custom domains */
int ndoms = 0; /* number of sched domains in result */
int nslot; /* next empty doms[] struct cpumask slot */
......@@ -666,10 +665,6 @@ static int generate_sched_domains(cpumask_var_t **domains,
dattr = NULL;
csa = NULL;
if (!alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL))
goto done;
cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
/* Special case for the 99% of systems with one, full, sched domain */
if (is_sched_load_balance(&top_cpuset)) {
ndoms = 1;
......@@ -683,7 +678,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
update_domain_attr_tree(dattr, &top_cpuset);
}
cpumask_and(doms[0], top_cpuset.effective_cpus,
non_isolated_cpus);
housekeeping_cpumask(HK_FLAG_DOMAIN));
goto done;
}
......@@ -707,7 +702,8 @@ static int generate_sched_domains(cpumask_var_t **domains,
*/
if (!cpumask_empty(cp->cpus_allowed) &&
!(is_sched_load_balance(cp) &&
cpumask_intersects(cp->cpus_allowed, non_isolated_cpus)))
cpumask_intersects(cp->cpus_allowed,
housekeeping_cpumask(HK_FLAG_DOMAIN))))
continue;
if (is_sched_load_balance(cp))
......@@ -789,7 +785,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
if (apn == b->pn) {
cpumask_or(dp, dp, b->effective_cpus);
cpumask_and(dp, dp, non_isolated_cpus);
cpumask_and(dp, dp, housekeeping_cpumask(HK_FLAG_DOMAIN));
if (dattr)
update_domain_attr_tree(dattr + nslot, b);
......@@ -802,7 +798,6 @@ static int generate_sched_domains(cpumask_var_t **domains,
BUG_ON(nslot != ndoms);
done:
free_cpumask_var(non_isolated_cpus);
kfree(csa);
/*
......
......@@ -29,6 +29,7 @@
#include <linux/oom.h>
#include <linux/sched/debug.h>
#include <linux/smpboot.h>
#include <linux/sched/isolation.h>
#include <uapi/linux/sched/types.h>
#include "../time/tick-internal.h"
......@@ -2587,7 +2588,7 @@ static void rcu_bind_gp_kthread(void)
if (!tick_nohz_full_enabled())
return;
housekeeping_affine(current);
housekeeping_affine(current, HK_FLAG_RCU);
}
/* Record the current task on dyntick-idle entry. */
......
......@@ -51,6 +51,7 @@
#include <linux/kthread.h>
#include <linux/tick.h>
#include <linux/rcupdate_wait.h>
#include <linux/sched/isolation.h>
#define CREATE_TRACE_POINTS
......@@ -714,7 +715,7 @@ static int __noreturn rcu_tasks_kthread(void *arg)
LIST_HEAD(rcu_tasks_holdouts);
/* Run on housekeeping CPUs by default. Sysadm can move if desired. */
housekeeping_affine(current);
housekeeping_affine(current, HK_FLAG_RCU);
/*
* Each pass through the following loop makes one check for
......
......@@ -27,3 +27,4 @@ obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
obj-$(CONFIG_CPU_FREQ) += cpufreq.o
obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o
obj-$(CONFIG_MEMBARRIER) += membarrier.o
obj-$(CONFIG_CPU_ISOLATION) += isolation.o
......@@ -26,6 +26,7 @@
#include <linux/profile.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/sched/isolation.h>
#include <asm/switch_to.h>
#include <asm/tlb.h>
......@@ -42,18 +43,21 @@
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL)
/*
* Debugging: various feature bits
*
* If SCHED_DEBUG is disabled, each compilation unit has its own copy of
* sysctl_sched_features, defined in sched.h, to allow constants propagation
* at compile time and compiler optimization based on features default.
*/
#define SCHED_FEAT(name, enabled) \
(1UL << __SCHED_FEAT_##name) * enabled |
const_debug unsigned int sysctl_sched_features =
#include "features.h"
0;
#undef SCHED_FEAT
#endif
/*
* Number of tasks to iterate in a single balance run.
......@@ -83,9 +87,6 @@ __read_mostly int scheduler_running;
*/
int sysctl_sched_rt_runtime = 950000;
/* CPUs with isolated domains */
cpumask_var_t cpu_isolated_map;
/*
* __task_rq_lock - lock the rq @p resides on.
*/
......@@ -525,7 +526,7 @@ int get_nohz_timer_target(void)
int i, cpu = smp_processor_id();
struct sched_domain *sd;
if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu))
if (!idle_cpu(cpu) && housekeeping_cpu(cpu, HK_FLAG_TIMER))
return cpu;
rcu_read_lock();
......@@ -534,15 +535,15 @@ int get_nohz_timer_target(void)
if (cpu == i)
continue;
if (!idle_cpu(i) && is_housekeeping_cpu(i)) {
if (!idle_cpu(i) && housekeeping_cpu(i, HK_FLAG_TIMER)) {
cpu = i;
goto unlock;
}
}
}
if (!is_housekeeping_cpu(cpu))
cpu = housekeeping_any_cpu();
if (!housekeeping_cpu(cpu, HK_FLAG_TIMER))
cpu = housekeeping_any_cpu(HK_FLAG_TIMER);
unlock:
rcu_read_unlock();
return cpu;
......@@ -732,7 +733,7 @@ int tg_nop(struct task_group *tg, void *data)
}
#endif
static void set_load_weight(struct task_struct *p)
static void set_load_weight(struct task_struct *p, bool update_load)
{
int prio = p->static_prio - MAX_RT_PRIO;
struct load_weight *load = &p->se.load;
......@@ -746,8 +747,16 @@ static void set_load_weight(struct task_struct *p)
return;
}
load->weight = scale_load(sched_prio_to_weight[prio]);
load->inv_weight = sched_prio_to_wmult[prio];
/*
* SCHED_OTHER tasks have to update their load when changing their
* weight
*/
if (update_load && p->sched_class == &fair_sched_class) {
reweight_task(p, prio);
} else {
load->weight = scale_load(sched_prio_to_weight[prio]);
load->inv_weight = sched_prio_to_wmult[prio];
}
}
static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
......@@ -2357,7 +2366,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
p->static_prio = NICE_TO_PRIO(0);
p->prio = p->normal_prio = __normal_prio(p);
set_load_weight(p);
set_load_weight(p, false);
/*
* We don't need the reset flag anymore after the fork. It has
......@@ -3804,7 +3813,7 @@ void set_user_nice(struct task_struct *p, long nice)
put_prev_task(rq, p);
p->static_prio = NICE_TO_PRIO(nice);
set_load_weight(p);
set_load_weight(p, true);
old_prio = p->prio;
p->prio = effective_prio(p);
delta = p->prio - old_prio;
......@@ -3961,7 +3970,7 @@ static void __setscheduler_params(struct task_struct *p,
*/
p->rt_priority = attr->sched_priority;
p->normal_prio = normal_prio(p);
set_load_weight(p);
set_load_weight(p, true);
}
/* Actually do priority change: must hold pi & rq lock. */
......@@ -5727,10 +5736,6 @@ static inline void sched_init_smt(void) { }
void __init sched_init_smp(void)
{
cpumask_var_t non_isolated_cpus;
alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
sched_init_numa();
/*
......@@ -5740,16 +5745,12 @@ void __init sched_init_smp(void)
*/
mutex_lock(&sched_domains_mutex);
sched_init_domains(cpu_active_mask);
cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
if (cpumask_empty(non_isolated_cpus))
cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
mutex_unlock(&sched_domains_mutex);
/* Move init over to a non-isolated CPU */
if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0)
BUG();
sched_init_granularity();
free_cpumask_var(non_isolated_cpus);
init_sched_rt_class();
init_sched_dl_class();
......@@ -5934,7 +5935,7 @@ void __init sched_init(void)
atomic_set(&rq->nr_iowait, 0);
}
set_load_weight(&init_task);
set_load_weight(&init_task, false);
/*
* The boot idle thread does lazy MMU switching as well:
......@@ -5953,9 +5954,6 @@ void __init sched_init(void)
calc_load_update = jiffies + LOAD_FREQ;
#ifdef CONFIG_SMP
/* May be allocated at isolcpus cmdline parse time */
if (cpu_isolated_map == NULL)
zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
idle_thread_set_boot_cpu();
set_cpu_rq_start_time(smp_processor_id());
#endif
......
......@@ -243,7 +243,7 @@ static void task_non_contending(struct task_struct *p)
if (p->state == TASK_DEAD)
sub_rq_bw(p->dl.dl_bw, &rq->dl);
raw_spin_lock(&dl_b->lock);
__dl_clear(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
__dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
__dl_clear_params(p);
raw_spin_unlock(&dl_b->lock);
}
......@@ -1210,7 +1210,7 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer)
}
raw_spin_lock(&dl_b->lock);
__dl_clear(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
__dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
raw_spin_unlock(&dl_b->lock);
__dl_clear_params(p);
......@@ -1365,6 +1365,10 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se,
update_dl_entity(dl_se, pi_se);
} else if (flags & ENQUEUE_REPLENISH) {
replenish_dl_entity(dl_se, pi_se);
} else if ((flags & ENQUEUE_RESTORE) &&
dl_time_before(dl_se->deadline,
rq_clock(rq_of_dl_rq(dl_rq_of_se(dl_se))))) {
setup_new_dl_entity(dl_se);
}
__enqueue_dl_entity(dl_se);
......@@ -2167,7 +2171,7 @@ static void set_cpus_allowed_dl(struct task_struct *p,
* until we complete the update.
*/
raw_spin_lock(&src_dl_b->lock);
__dl_clear(src_dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
__dl_sub(src_dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
raw_spin_unlock(&src_dl_b->lock);
}
......@@ -2256,13 +2260,6 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
return;
}
/*
* If p is boosted we already updated its params in
* rt_mutex_setprio()->enqueue_task(..., ENQUEUE_REPLENISH),
* p's deadline being now already after rq_clock(rq).
*/
if (dl_time_before(p->dl.deadline, rq_clock(rq)))
setup_new_dl_entity(&p->dl);
if (rq->curr != p) {
#ifdef CONFIG_SMP
......@@ -2452,7 +2449,7 @@ int sched_dl_overflow(struct task_struct *p, int policy,
if (dl_policy(policy) && !task_has_dl_policy(p) &&
!__dl_overflow(dl_b, cpus, 0, new_bw)) {
if (hrtimer_active(&p->dl.inactive_timer))
__dl_clear(dl_b, p->dl.dl_bw, cpus);
__dl_sub(dl_b, p->dl.dl_bw, cpus);
__dl_add(dl_b, new_bw, cpus);
err = 0;
} else if (dl_policy(policy) && task_has_dl_policy(p) &&
......@@ -2464,7 +2461,7 @@ int sched_dl_overflow(struct task_struct *p, int policy,
* But this would require to set the task's "inactive
* timer" when the task is not inactive.
*/
__dl_clear(dl_b, p->dl.dl_bw, cpus);
__dl_sub(dl_b, p->dl.dl_bw, cpus);
__dl_add(dl_b, new_bw, cpus);
dl_change_utilization(p, new_bw);
err = 0;
......
......@@ -441,9 +441,11 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
P_SCHEDSTAT(se->statistics.wait_count);
}
P(se->load.weight);
P(se->runnable_weight);
#ifdef CONFIG_SMP
P(se->avg.load_avg);
P(se->avg.util_avg);
P(se->avg.runnable_load_avg);
#endif
#undef PN_SCHEDSTAT
......@@ -558,16 +560,19 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
#ifdef CONFIG_SMP
SEQ_printf(m, " .%-30s: %ld\n", "runnable_weight", cfs_rq->runnable_weight);
SEQ_printf(m, " .%-30s: %lu\n", "load_avg",
cfs_rq->avg.load_avg);
SEQ_printf(m, " .%-30s: %lu\n", "runnable_load_avg",
cfs_rq->runnable_load_avg);
cfs_rq->avg.runnable_load_avg);
SEQ_printf(m, " .%-30s: %lu\n", "util_avg",
cfs_rq->avg.util_avg);
SEQ_printf(m, " .%-30s: %ld\n", "removed_load_avg",
atomic_long_read(&cfs_rq->removed_load_avg));
SEQ_printf(m, " .%-30s: %ld\n", "removed_util_avg",
atomic_long_read(&cfs_rq->removed_util_avg));
SEQ_printf(m, " .%-30s: %ld\n", "removed.load_avg",
cfs_rq->removed.load_avg);
SEQ_printf(m, " .%-30s: %ld\n", "removed.util_avg",
cfs_rq->removed.util_avg);
SEQ_printf(m, " .%-30s: %ld\n", "removed.runnable_sum",
cfs_rq->removed.runnable_sum);
#ifdef CONFIG_FAIR_GROUP_SCHED
SEQ_printf(m, " .%-30s: %lu\n", "tg_load_avg_contrib",
cfs_rq->tg_load_avg_contrib);
......@@ -1004,10 +1009,13 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
"nr_involuntary_switches", (long long)p->nivcsw);
P(se.load.weight);
P(se.runnable_weight);
#ifdef CONFIG_SMP
P(se.avg.load_sum);
P(se.avg.runnable_load_sum);
P(se.avg.util_sum);
P(se.avg.load_avg);
P(se.avg.runnable_load_avg);
P(se.avg.util_avg);
P(se.avg.last_update_time);
#endif
......
......@@ -33,6 +33,7 @@
#include <linux/mempolicy.h>
#include <linux/migrate.h>
#include <linux/task_work.h>
#include <linux/sched/isolation.h>
#include <trace/events/sched.h>
......@@ -717,13 +718,8 @@ void init_entity_runnable_average(struct sched_entity *se)
{
struct sched_avg *sa = &se->avg;
sa->last_update_time = 0;
/*
* sched_avg's period_contrib should be strictly less then 1024, so
* we give it 1023 to make sure it is almost a period (1024us), and
* will definitely be update (after enqueue).
*/
sa->period_contrib = 1023;
memset(sa, 0, sizeof(*sa));
/*
* Tasks are intialized with full load to be seen as heavy tasks until
* they get a chance to stabilize to their real load level.
......@@ -731,13 +727,10 @@ void init_entity_runnable_average(struct sched_entity *se)
* nothing has been attached to the task group yet.
*/
if (entity_is_task(se))
sa->load_avg = scale_load_down(se->load.weight);
sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
/*
* At this point, util_avg won't be used in select_task_rq_fair anyway
*/
sa->util_avg = 0;
sa->util_sum = 0;
sa->runnable_load_avg = sa->load_avg = scale_load_down(se->load.weight);
se->runnable_weight = se->load.weight;
/* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
}
......@@ -785,7 +778,6 @@ void post_init_entity_util_avg(struct sched_entity *se)
} else {
sa->util_avg = cap;
}
sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
}
if (entity_is_task(se)) {
......@@ -2026,7 +2018,7 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
delta = runtime - p->last_sum_exec_runtime;
*period = now - p->last_task_numa_placement;
} else {
delta = p->se.avg.load_sum / p->se.load.weight;
delta = p->se.avg.load_sum;
*period = LOAD_AVG_MAX;
}
......@@ -2693,18 +2685,226 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
cfs_rq->nr_running--;
}
/*
* Signed add and clamp on underflow.
*
* Explicitly do a load-store to ensure the intermediate value never hits
* memory. This allows lockless observations without ever seeing the negative
* values.
*/
#define add_positive(_ptr, _val) do { \
typeof(_ptr) ptr = (_ptr); \
typeof(_val) val = (_val); \
typeof(*ptr) res, var = READ_ONCE(*ptr); \
\
res = var + val; \
\
if (val < 0 && res > var) \
res = 0; \
\
WRITE_ONCE(*ptr, res); \
} while (0)
/*
* Unsigned subtract and clamp on underflow.
*
* Explicitly do a load-store to ensure the intermediate value never hits
* memory. This allows lockless observations without ever seeing the negative
* values.
*/
#define sub_positive(_ptr, _val) do { \
typeof(_ptr) ptr = (_ptr); \
typeof(*ptr) val = (_val); \
typeof(*ptr) res, var = READ_ONCE(*ptr); \
res = var - val; \
if (res > var) \
res = 0; \
WRITE_ONCE(*ptr, res); \
} while (0)
#ifdef CONFIG_SMP
/*
* XXX we want to get rid of these helpers and use the full load resolution.
*/
static inline long se_weight(struct sched_entity *se)
{
return scale_load_down(se->load.weight);
}
static inline long se_runnable(struct sched_entity *se)
{
return scale_load_down(se->runnable_weight);
}
static inline void
enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
cfs_rq->runnable_weight += se->runnable_weight;
cfs_rq->avg.runnable_load_avg += se->avg.runnable_load_avg;
cfs_rq->avg.runnable_load_sum += se_runnable(se) * se->avg.runnable_load_sum;
}
static inline void
dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
cfs_rq->runnable_weight -= se->runnable_weight;
sub_positive(&cfs_rq->avg.runnable_load_avg, se->avg.runnable_load_avg);
sub_positive(&cfs_rq->avg.runnable_load_sum,
se_runnable(se) * se->avg.runnable_load_sum);
}
static inline void
enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
cfs_rq->avg.load_avg += se->avg.load_avg;
cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum;
}
static inline void
dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum);
}
#else
static inline void
enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
static inline void
dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
static inline void
enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
static inline void
dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
#endif
static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
unsigned long weight, unsigned long runnable)
{
if (se->on_rq) {
/* commit outstanding execution time */
if (cfs_rq->curr == se)
update_curr(cfs_rq);
account_entity_dequeue(cfs_rq, se);
dequeue_runnable_load_avg(cfs_rq, se);
}
dequeue_load_avg(cfs_rq, se);
se->runnable_weight = runnable;
update_load_set(&se->load, weight);
#ifdef CONFIG_SMP
do {
u32 divider = LOAD_AVG_MAX - 1024 + se->avg.period_contrib;
se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
se->avg.runnable_load_avg =
div_u64(se_runnable(se) * se->avg.runnable_load_sum, divider);
} while (0);
#endif
enqueue_load_avg(cfs_rq, se);
if (se->on_rq) {
account_entity_enqueue(cfs_rq, se);
enqueue_runnable_load_avg(cfs_rq, se);
}
}
void reweight_task(struct task_struct *p, int prio)
{
struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq = cfs_rq_of(se);
struct load_weight *load = &se->load;
unsigned long weight = scale_load(sched_prio_to_weight[prio]);
reweight_entity(cfs_rq, se, weight, weight);
load->inv_weight = sched_prio_to_wmult[prio];
}
#ifdef CONFIG_FAIR_GROUP_SCHED
# ifdef CONFIG_SMP
static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
/*
* All this does is approximate the hierarchical proportion which includes that
* global sum we all love to hate.
*
* That is, the weight of a group entity, is the proportional share of the
* group weight based on the group runqueue weights. That is:
*
* tg->weight * grq->load.weight
* ge->load.weight = ----------------------------- (1)
* \Sum grq->load.weight
*
* Now, because computing that sum is prohibitively expensive to compute (been
* there, done that) we approximate it with this average stuff. The average
* moves slower and therefore the approximation is cheaper and more stable.
*
* So instead of the above, we substitute:
*
* grq->load.weight -> grq->avg.load_avg (2)
*
* which yields the following:
*
* tg->weight * grq->avg.load_avg
* ge->load.weight = ------------------------------ (3)
* tg->load_avg
*
* Where: tg->load_avg ~= \Sum grq->avg.load_avg
*
* That is shares_avg, and it is right (given the approximation (2)).
*
* The problem with it is that because the average is slow -- it was designed
* to be exactly that of course -- this leads to transients in boundary
* conditions. In specific, the case where the group was idle and we start the
* one task. It takes time for our CPU's grq->avg.load_avg to build up,
* yielding bad latency etc..
*
* Now, in that special case (1) reduces to:
*
* tg->weight * grq->load.weight
* ge->load.weight = ----------------------------- = tg->weight (4)
* grp->load.weight
*
* That is, the sum collapses because all other CPUs are idle; the UP scenario.
*
* So what we do is modify our approximation (3) to approach (4) in the (near)
* UP case, like:
*
* ge->load.weight =
*
* tg->weight * grq->load.weight
* --------------------------------------------------- (5)
* tg->load_avg - grq->avg.load_avg + grq->load.weight
*
* But because grq->load.weight can drop to 0, resulting in a divide by zero,
* we need to use grq->avg.load_avg as its lower bound, which then gives:
*
*
* tg->weight * grq->load.weight
* ge->load.weight = ----------------------------- (6)
* tg_load_avg'
*
* Where:
*
* tg_load_avg' = tg->load_avg - grq->avg.load_avg +
* max(grq->load.weight, grq->avg.load_avg)
*
* And that is shares_weight and is icky. In the (near) UP case it approaches
* (4) while in the normal case it approaches (3). It consistently
* overestimates the ge->load.weight and therefore:
*
* \Sum ge->load.weight >= tg->weight
*
* hence icky!
*/
static long calc_group_shares(struct cfs_rq *cfs_rq)
{
long tg_weight, load, shares;
long tg_weight, tg_shares, load, shares;
struct task_group *tg = cfs_rq->tg;
/*
* This really should be: cfs_rq->avg.load_avg, but instead we use
* cfs_rq->load.weight, which is its upper bound. This helps ramp up
* the shares for small weight interactive tasks.
*/
load = scale_load_down(cfs_rq->load.weight);
tg_shares = READ_ONCE(tg->shares);
load = max(scale_load_down(cfs_rq->load.weight), cfs_rq->avg.load_avg);
tg_weight = atomic_long_read(&tg->load_avg);
......@@ -2712,7 +2912,7 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
tg_weight -= cfs_rq->tg_load_avg_contrib;
tg_weight += load;
shares = (tg->shares * load);
shares = (tg_shares * load);
if (tg_weight)
shares /= tg_weight;
......@@ -2728,63 +2928,86 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
* case no task is runnable on a CPU MIN_SHARES=2 should be returned
* instead of 0.
*/
if (shares < MIN_SHARES)
shares = MIN_SHARES;
if (shares > tg->shares)
shares = tg->shares;
return shares;
}
# else /* CONFIG_SMP */
static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
{
return tg->shares;
return clamp_t(long, shares, MIN_SHARES, tg_shares);
}
# endif /* CONFIG_SMP */
static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
unsigned long weight)
/*
* This calculates the effective runnable weight for a group entity based on
* the group entity weight calculated above.
*
* Because of the above approximation (2), our group entity weight is
* an load_avg based ratio (3). This means that it includes blocked load and
* does not represent the runnable weight.
*
* Approximate the group entity's runnable weight per ratio from the group
* runqueue:
*
* grq->avg.runnable_load_avg
* ge->runnable_weight = ge->load.weight * -------------------------- (7)
* grq->avg.load_avg
*
* However, analogous to above, since the avg numbers are slow, this leads to
* transients in the from-idle case. Instead we use:
*
* ge->runnable_weight = ge->load.weight *
*
* max(grq->avg.runnable_load_avg, grq->runnable_weight)
* ----------------------------------------------------- (8)
* max(grq->avg.load_avg, grq->load.weight)
*
* Where these max() serve both to use the 'instant' values to fix the slow
* from-idle and avoid the /0 on to-idle, similar to (6).
*/
static long calc_group_runnable(struct cfs_rq *cfs_rq, long shares)
{
if (se->on_rq) {
/* commit outstanding execution time */
if (cfs_rq->curr == se)
update_curr(cfs_rq);
account_entity_dequeue(cfs_rq, se);
}
long runnable, load_avg;
update_load_set(&se->load, weight);
load_avg = max(cfs_rq->avg.load_avg,
scale_load_down(cfs_rq->load.weight));
if (se->on_rq)
account_entity_enqueue(cfs_rq, se);
runnable = max(cfs_rq->avg.runnable_load_avg,
scale_load_down(cfs_rq->runnable_weight));
runnable *= shares;
if (load_avg)
runnable /= load_avg;
return clamp_t(long, runnable, MIN_SHARES, shares);
}
# endif /* CONFIG_SMP */
static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
static void update_cfs_shares(struct sched_entity *se)
/*
* Recomputes the group entity based on the current state of its group
* runqueue.
*/
static void update_cfs_group(struct sched_entity *se)
{
struct cfs_rq *cfs_rq = group_cfs_rq(se);
struct task_group *tg;
long shares;
struct cfs_rq *gcfs_rq = group_cfs_rq(se);
long shares, runnable;
if (!cfs_rq)
if (!gcfs_rq)
return;
if (throttled_hierarchy(cfs_rq))
if (throttled_hierarchy(gcfs_rq))
return;
tg = cfs_rq->tg;
#ifndef CONFIG_SMP
if (likely(se->load.weight == tg->shares))
runnable = shares = READ_ONCE(gcfs_rq->tg->shares);
if (likely(se->load.weight == shares))
return;
#else
shares = calc_group_shares(gcfs_rq);
runnable = calc_group_runnable(gcfs_rq, shares);
#endif
shares = calc_cfs_shares(cfs_rq, tg);
reweight_entity(cfs_rq_of(se), se, shares);
reweight_entity(cfs_rq_of(se), se, shares, runnable);
}
#else /* CONFIG_FAIR_GROUP_SCHED */
static inline void update_cfs_shares(struct sched_entity *se)
static inline void update_cfs_group(struct sched_entity *se)
{
}
#endif /* CONFIG_FAIR_GROUP_SCHED */
......@@ -2893,7 +3116,7 @@ static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
*/
static __always_inline u32
accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
unsigned long weight, int running, struct cfs_rq *cfs_rq)
unsigned long load, unsigned long runnable, int running)
{
unsigned long scale_freq, scale_cpu;
u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */
......@@ -2910,10 +3133,8 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
*/
if (periods) {
sa->load_sum = decay_load(sa->load_sum, periods);
if (cfs_rq) {
cfs_rq->runnable_load_sum =
decay_load(cfs_rq->runnable_load_sum, periods);
}
sa->runnable_load_sum =
decay_load(sa->runnable_load_sum, periods);
sa->util_sum = decay_load((u64)(sa->util_sum), periods);
/*
......@@ -2926,11 +3147,10 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
sa->period_contrib = delta;
contrib = cap_scale(contrib, scale_freq);
if (weight) {
sa->load_sum += weight * contrib;
if (cfs_rq)
cfs_rq->runnable_load_sum += weight * contrib;
}
if (load)
sa->load_sum += load * contrib;
if (runnable)
sa->runnable_load_sum += runnable * contrib;
if (running)
sa->util_sum += contrib * scale_cpu;
......@@ -2966,8 +3186,8 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
* = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
*/
static __always_inline int
___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
unsigned long weight, int running, struct cfs_rq *cfs_rq)
___update_load_sum(u64 now, int cpu, struct sched_avg *sa,
unsigned long load, unsigned long runnable, int running)
{
u64 delta;
......@@ -3000,8 +3220,8 @@ ___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
* this happens during idle_balance() which calls
* update_blocked_averages()
*/
if (!weight)
running = 0;
if (!load)
runnable = running = 0;
/*
* Now we know we crossed measurement unit boundaries. The *_avg
......@@ -3010,63 +3230,96 @@ ___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
* Step 1: accumulate *_sum since last_update_time. If we haven't
* crossed period boundaries, finish.
*/
if (!accumulate_sum(delta, cpu, sa, weight, running, cfs_rq))
if (!accumulate_sum(delta, cpu, sa, load, runnable, running))
return 0;
return 1;
}
static __always_inline void
___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runnable)
{
u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
/*
* Step 2: update *_avg.
*/
sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX - 1024 + sa->period_contrib);
if (cfs_rq) {
cfs_rq->runnable_load_avg =
div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX - 1024 + sa->period_contrib);
}
sa->util_avg = sa->util_sum / (LOAD_AVG_MAX - 1024 + sa->period_contrib);
return 1;
sa->load_avg = div_u64(load * sa->load_sum, divider);
sa->runnable_load_avg = div_u64(runnable * sa->runnable_load_sum, divider);
sa->util_avg = sa->util_sum / divider;
}
/*
* sched_entity:
*
* task:
* se_runnable() == se_weight()
*
* group: [ see update_cfs_group() ]
* se_weight() = tg->weight * grq->load_avg / tg->load_avg
* se_runnable() = se_weight(se) * grq->runnable_load_avg / grq->load_avg
*
* load_sum := runnable_sum
* load_avg = se_weight(se) * runnable_avg
*
* runnable_load_sum := runnable_sum
* runnable_load_avg = se_runnable(se) * runnable_avg
*
* XXX collapse load_sum and runnable_load_sum
*
* cfq_rs:
*
* load_sum = \Sum se_weight(se) * se->avg.load_sum
* load_avg = \Sum se->avg.load_avg
*
* runnable_load_sum = \Sum se_runnable(se) * se->avg.runnable_load_sum
* runnable_load_avg = \Sum se->avg.runable_load_avg
*/
static int
__update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
{
return ___update_load_avg(now, cpu, &se->avg, 0, 0, NULL);
if (entity_is_task(se))
se->runnable_weight = se->load.weight;
if (___update_load_sum(now, cpu, &se->avg, 0, 0, 0)) {
___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
return 1;
}
return 0;
}
static int
__update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se)
{
return ___update_load_avg(now, cpu, &se->avg,
se->on_rq * scale_load_down(se->load.weight),
cfs_rq->curr == se, NULL);
if (entity_is_task(se))
se->runnable_weight = se->load.weight;
if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, !!se->on_rq,
cfs_rq->curr == se)) {
___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
return 1;
}
return 0;
}
static int
__update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)
{
return ___update_load_avg(now, cpu, &cfs_rq->avg,
scale_load_down(cfs_rq->load.weight),
cfs_rq->curr != NULL, cfs_rq);
}
if (___update_load_sum(now, cpu, &cfs_rq->avg,
scale_load_down(cfs_rq->load.weight),
scale_load_down(cfs_rq->runnable_weight),
cfs_rq->curr != NULL)) {
/*
* Signed add and clamp on underflow.
*
* Explicitly do a load-store to ensure the intermediate value never hits
* memory. This allows lockless observations without ever seeing the negative
* values.
*/
#define add_positive(_ptr, _val) do { \
typeof(_ptr) ptr = (_ptr); \
typeof(_val) val = (_val); \
typeof(*ptr) res, var = READ_ONCE(*ptr); \
\
res = var + val; \
\
if (val < 0 && res > var) \
res = 0; \
\
WRITE_ONCE(*ptr, res); \
} while (0)
___update_load_avg(&cfs_rq->avg, 1, 1);
return 1;
}
return 0;
}
#ifdef CONFIG_FAIR_GROUP_SCHED
/**
......@@ -3149,11 +3402,77 @@ void set_task_rq_fair(struct sched_entity *se,
se->avg.last_update_time = n_last_update_time;
}
/* Take into account change of utilization of a child task group */
/*
* When on migration a sched_entity joins/leaves the PELT hierarchy, we need to
* propagate its contribution. The key to this propagation is the invariant
* that for each group:
*
* ge->avg == grq->avg (1)
*
* _IFF_ we look at the pure running and runnable sums. Because they
* represent the very same entity, just at different points in the hierarchy.
*
*
* Per the above update_tg_cfs_util() is trivial (and still 'wrong') and
* simply copies the running sum over.
*
* However, update_tg_cfs_runnable() is more complex. So we have:
*
* ge->avg.load_avg = ge->load.weight * ge->avg.runnable_avg (2)
*
* And since, like util, the runnable part should be directly transferable,
* the following would _appear_ to be the straight forward approach:
*
* grq->avg.load_avg = grq->load.weight * grq->avg.running_avg (3)
*
* And per (1) we have:
*
* ge->avg.running_avg == grq->avg.running_avg
*
* Which gives:
*
* ge->load.weight * grq->avg.load_avg
* ge->avg.load_avg = ----------------------------------- (4)
* grq->load.weight
*
* Except that is wrong!
*
* Because while for entities historical weight is not important and we
* really only care about our future and therefore can consider a pure
* runnable sum, runqueues can NOT do this.
*
* We specifically want runqueues to have a load_avg that includes
* historical weights. Those represent the blocked load, the load we expect
* to (shortly) return to us. This only works by keeping the weights as
* integral part of the sum. We therefore cannot decompose as per (3).
*
* OK, so what then?
*
*
* Another way to look at things is:
*
* grq->avg.load_avg = \Sum se->avg.load_avg
*
* Therefore, per (2):
*
* grq->avg.load_avg = \Sum se->load.weight * se->avg.runnable_avg
*
* And the very thing we're propagating is a change in that sum (someone
* joined/left). So we can easily know the runnable change, which would be, per
* (2) the already tracked se->load_avg divided by the corresponding
* se->weight.
*
* Basically (4) but in differential form:
*
* d(runnable_avg) += se->avg.load_avg / se->load.weight
* (5)
* ge->avg.load_avg += ge->load.weight * d(runnable_avg)
*/
static inline void
update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se)
update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
{
struct cfs_rq *gcfs_rq = group_cfs_rq(se);
long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
/* Nothing to update */
......@@ -3169,102 +3488,65 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se)
cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
}
/* Take into account change of load of a child task group */
static inline void
update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se)
update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
{
struct cfs_rq *gcfs_rq = group_cfs_rq(se);
long delta, load = gcfs_rq->avg.load_avg;
long runnable_sum = gcfs_rq->prop_runnable_sum;
long runnable_load_avg, load_avg;
s64 runnable_load_sum, load_sum;
/*
* If the load of group cfs_rq is null, the load of the
* sched_entity will also be null so we can skip the formula
*/
if (load) {
long tg_load;
if (!runnable_sum)
return;
/* Get tg's load and ensure tg_load > 0 */
tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1;
gcfs_rq->prop_runnable_sum = 0;
/* Ensure tg_load >= load and updated with current load*/
tg_load -= gcfs_rq->tg_load_avg_contrib;
tg_load += load;
load_sum = (s64)se_weight(se) * runnable_sum;
load_avg = div_s64(load_sum, LOAD_AVG_MAX);
/*
* We need to compute a correction term in the case that the
* task group is consuming more CPU than a task of equal
* weight. A task with a weight equals to tg->shares will have
* a load less or equal to scale_load_down(tg->shares).
* Similarly, the sched_entities that represent the task group
* at parent level, can't have a load higher than
* scale_load_down(tg->shares). And the Sum of sched_entities'
* load must be <= scale_load_down(tg->shares).
*/
if (tg_load > scale_load_down(gcfs_rq->tg->shares)) {
/* scale gcfs_rq's load into tg's shares*/
load *= scale_load_down(gcfs_rq->tg->shares);
load /= tg_load;
}
}
add_positive(&se->avg.load_sum, runnable_sum);
add_positive(&se->avg.load_avg, load_avg);
delta = load - se->avg.load_avg;
add_positive(&cfs_rq->avg.load_avg, load_avg);
add_positive(&cfs_rq->avg.load_sum, load_sum);
/* Nothing to update */
if (!delta)
return;
/* Set new sched_entity's load */
se->avg.load_avg = load;
se->avg.load_sum = se->avg.load_avg * LOAD_AVG_MAX;
runnable_load_sum = (s64)se_runnable(se) * runnable_sum;
runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX);
/* Update parent cfs_rq load */
add_positive(&cfs_rq->avg.load_avg, delta);
cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * LOAD_AVG_MAX;
add_positive(&se->avg.runnable_load_sum, runnable_sum);
add_positive(&se->avg.runnable_load_avg, runnable_load_avg);
/*
* If the sched_entity is already enqueued, we also have to update the
* runnable load avg.
*/
if (se->on_rq) {
/* Update parent cfs_rq runnable_load_avg */
add_positive(&cfs_rq->runnable_load_avg, delta);
cfs_rq->runnable_load_sum = cfs_rq->runnable_load_avg * LOAD_AVG_MAX;
add_positive(&cfs_rq->avg.runnable_load_avg, runnable_load_avg);
add_positive(&cfs_rq->avg.runnable_load_sum, runnable_load_sum);
}
}
static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq)
{
cfs_rq->propagate_avg = 1;
}
static inline int test_and_clear_tg_cfs_propagate(struct sched_entity *se)
static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
{
struct cfs_rq *cfs_rq = group_cfs_rq(se);
if (!cfs_rq->propagate_avg)
return 0;
cfs_rq->propagate_avg = 0;
return 1;
cfs_rq->propagate = 1;
cfs_rq->prop_runnable_sum += runnable_sum;
}
/* Update task and its cfs_rq load average */
static inline int propagate_entity_load_avg(struct sched_entity *se)
{
struct cfs_rq *cfs_rq;
struct cfs_rq *cfs_rq, *gcfs_rq;
if (entity_is_task(se))
return 0;
if (!test_and_clear_tg_cfs_propagate(se))
gcfs_rq = group_cfs_rq(se);
if (!gcfs_rq->propagate)
return 0;
gcfs_rq->propagate = 0;
cfs_rq = cfs_rq_of(se);
set_tg_cfs_propagate(cfs_rq);
add_tg_cfs_propagate(cfs_rq, gcfs_rq->prop_runnable_sum);
update_tg_cfs_util(cfs_rq, se);
update_tg_cfs_load(cfs_rq, se);
update_tg_cfs_util(cfs_rq, se, gcfs_rq);
update_tg_cfs_runnable(cfs_rq, se, gcfs_rq);
return 1;
}
......@@ -3288,7 +3570,7 @@ static inline bool skip_blocked_update(struct sched_entity *se)
* If there is a pending propagation, we have to update the load and
* the utilization of the sched_entity:
*/
if (gcfs_rq->propagate_avg)
if (gcfs_rq->propagate)
return false;
/*
......@@ -3308,27 +3590,10 @@ static inline int propagate_entity_load_avg(struct sched_entity *se)
return 0;
}
static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {}
static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) {}
#endif /* CONFIG_FAIR_GROUP_SCHED */
/*
* Unsigned subtract and clamp on underflow.
*
* Explicitly do a load-store to ensure the intermediate value never hits
* memory. This allows lockless observations without ever seeing the negative
* values.
*/
#define sub_positive(_ptr, _val) do { \
typeof(_ptr) ptr = (_ptr); \
typeof(*ptr) val = (_val); \
typeof(*ptr) res, var = READ_ONCE(*ptr); \
res = var - val; \
if (res > var) \
res = 0; \
WRITE_ONCE(*ptr, res); \
} while (0)
/**
* update_cfs_rq_load_avg - update the cfs_rq's load/util averages
* @now: current time, as per cfs_rq_clock_task()
......@@ -3348,65 +3613,45 @@ static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {}
static inline int
update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
{
unsigned long removed_load = 0, removed_util = 0, removed_runnable_sum = 0;
struct sched_avg *sa = &cfs_rq->avg;
int decayed, removed_load = 0, removed_util = 0;
int decayed = 0;
if (atomic_long_read(&cfs_rq->removed_load_avg)) {
s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
if (cfs_rq->removed.nr) {
unsigned long r;
u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
raw_spin_lock(&cfs_rq->removed.lock);
swap(cfs_rq->removed.util_avg, removed_util);
swap(cfs_rq->removed.load_avg, removed_load);
swap(cfs_rq->removed.runnable_sum, removed_runnable_sum);
cfs_rq->removed.nr = 0;
raw_spin_unlock(&cfs_rq->removed.lock);
r = removed_load;
sub_positive(&sa->load_avg, r);
sub_positive(&sa->load_sum, r * LOAD_AVG_MAX);
removed_load = 1;
set_tg_cfs_propagate(cfs_rq);
}
sub_positive(&sa->load_sum, r * divider);
if (atomic_long_read(&cfs_rq->removed_util_avg)) {
long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
r = removed_util;
sub_positive(&sa->util_avg, r);
sub_positive(&sa->util_sum, r * LOAD_AVG_MAX);
removed_util = 1;
set_tg_cfs_propagate(cfs_rq);
sub_positive(&sa->util_sum, r * divider);
add_tg_cfs_propagate(cfs_rq, -(long)removed_runnable_sum);
decayed = 1;
}
decayed = __update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq);
decayed |= __update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq);
#ifndef CONFIG_64BIT
smp_wmb();
cfs_rq->load_last_update_time_copy = sa->last_update_time;
#endif
if (decayed || removed_util)
if (decayed)
cfs_rq_util_change(cfs_rq);
return decayed || removed_load;
}
/*
* Optional action to be done while updating the load average
*/
#define UPDATE_TG 0x1
#define SKIP_AGE_LOAD 0x2
/* Update task and its cfs_rq load average */
static inline void update_load_avg(struct sched_entity *se, int flags)
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
u64 now = cfs_rq_clock_task(cfs_rq);
struct rq *rq = rq_of(cfs_rq);
int cpu = cpu_of(rq);
int decayed;
/*
* Track task load average for carrying it to new CPU after migrated, and
* track group sched_entity load average for task_h_load calc in migration
*/
if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
__update_load_avg_se(now, cpu, cfs_rq, se);
decayed = update_cfs_rq_load_avg(now, cfs_rq);
decayed |= propagate_entity_load_avg(se);
if (decayed && (flags & UPDATE_TG))
update_tg_load_avg(cfs_rq, 0);
return decayed;
}
/**
......@@ -3419,12 +3664,39 @@ static inline void update_load_avg(struct sched_entity *se, int flags)
*/
static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
/*
* When we attach the @se to the @cfs_rq, we must align the decay
* window because without that, really weird and wonderful things can
* happen.
*
* XXX illustrate
*/
se->avg.last_update_time = cfs_rq->avg.last_update_time;
cfs_rq->avg.load_avg += se->avg.load_avg;
cfs_rq->avg.load_sum += se->avg.load_sum;
se->avg.period_contrib = cfs_rq->avg.period_contrib;
/*
* Hell(o) Nasty stuff.. we need to recompute _sum based on the new
* period_contrib. This isn't strictly correct, but since we're
* entirely outside of the PELT hierarchy, nobody cares if we truncate
* _sum a little.
*/
se->avg.util_sum = se->avg.util_avg * divider;
se->avg.load_sum = divider;
if (se_weight(se)) {
se->avg.load_sum =
div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se));
}
se->avg.runnable_load_sum = se->avg.load_sum;
enqueue_load_avg(cfs_rq, se);
cfs_rq->avg.util_avg += se->avg.util_avg;
cfs_rq->avg.util_sum += se->avg.util_sum;
set_tg_cfs_propagate(cfs_rq);
add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);
cfs_rq_util_change(cfs_rq);
}
......@@ -3439,39 +3711,47 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
*/
static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
dequeue_load_avg(cfs_rq, se);
sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
set_tg_cfs_propagate(cfs_rq);
add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
cfs_rq_util_change(cfs_rq);
}
/* Add the load generated by se into cfs_rq's load average */
static inline void
enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
/*
* Optional action to be done while updating the load average
*/
#define UPDATE_TG 0x1
#define SKIP_AGE_LOAD 0x2
#define DO_ATTACH 0x4
/* Update task and its cfs_rq load average */
static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
struct sched_avg *sa = &se->avg;
u64 now = cfs_rq_clock_task(cfs_rq);
struct rq *rq = rq_of(cfs_rq);
int cpu = cpu_of(rq);
int decayed;
/*
* Track task load average for carrying it to new CPU after migrated, and
* track group sched_entity load average for task_h_load calc in migration
*/
if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
__update_load_avg_se(now, cpu, cfs_rq, se);
cfs_rq->runnable_load_avg += sa->load_avg;
cfs_rq->runnable_load_sum += sa->load_sum;
decayed = update_cfs_rq_load_avg(now, cfs_rq);
decayed |= propagate_entity_load_avg(se);
if (!se->avg.last_update_time && (flags & DO_ATTACH)) {
if (!sa->last_update_time) {
attach_entity_load_avg(cfs_rq, se);
update_tg_load_avg(cfs_rq, 0);
}
}
/* Remove the runnable load generated by se from cfs_rq's runnable load average */
static inline void
dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
cfs_rq->runnable_load_avg =
max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0);
cfs_rq->runnable_load_sum =
max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0);
} else if (decayed && (flags & UPDATE_TG))
update_tg_load_avg(cfs_rq, 0);
}
#ifndef CONFIG_64BIT
......@@ -3515,6 +3795,7 @@ void sync_entity_load_avg(struct sched_entity *se)
void remove_entity_load_avg(struct sched_entity *se)
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
unsigned long flags;
/*
* tasks cannot exit without having gone through wake_up_new_task() ->
......@@ -3527,13 +3808,18 @@ void remove_entity_load_avg(struct sched_entity *se)
*/
sync_entity_load_avg(se);
atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
raw_spin_lock_irqsave(&cfs_rq->removed.lock, flags);
++cfs_rq->removed.nr;
cfs_rq->removed.util_avg += se->avg.util_avg;
cfs_rq->removed.load_avg += se->avg.load_avg;
cfs_rq->removed.runnable_sum += se->avg.load_sum; /* == runnable_sum */
raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags);
}
static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
{
return cfs_rq->runnable_load_avg;
return cfs_rq->avg.runnable_load_avg;
}
static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
......@@ -3553,16 +3839,13 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
#define UPDATE_TG 0x0
#define SKIP_AGE_LOAD 0x0
#define DO_ATTACH 0x0
static inline void update_load_avg(struct sched_entity *se, int not_used1)
static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1)
{
cfs_rq_util_change(cfs_rq_of(se));
cfs_rq_util_change(cfs_rq);
}
static inline void
enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
static inline void
dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
static inline void remove_entity_load_avg(struct sched_entity *se) {}
static inline void
......@@ -3707,9 +3990,9 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* its group cfs_rq
* - Add its new weight to cfs_rq->load.weight
*/
update_load_avg(se, UPDATE_TG);
enqueue_entity_load_avg(cfs_rq, se);
update_cfs_shares(se);
update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
update_cfs_group(se);
enqueue_runnable_load_avg(cfs_rq, se);
account_entity_enqueue(cfs_rq, se);
if (flags & ENQUEUE_WAKEUP)
......@@ -3791,8 +4074,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* - For group entity, update its weight to reflect the new share
* of its group cfs_rq.
*/
update_load_avg(se, UPDATE_TG);
dequeue_entity_load_avg(cfs_rq, se);
update_load_avg(cfs_rq, se, UPDATE_TG);
dequeue_runnable_load_avg(cfs_rq, se);
update_stats_dequeue(cfs_rq, se, flags);
......@@ -3815,7 +4098,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
/* return excess runtime on last dequeue */
return_cfs_rq_runtime(cfs_rq);
update_cfs_shares(se);
update_cfs_group(se);
/*
* Now advance min_vruntime if @se was the entity holding it back,
......@@ -3879,7 +4162,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
*/
update_stats_wait_end(cfs_rq, se);
__dequeue_entity(cfs_rq, se);
update_load_avg(se, UPDATE_TG);
update_load_avg(cfs_rq, se, UPDATE_TG);
}
update_stats_curr_start(cfs_rq, se);
......@@ -3981,7 +4264,7 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
/* Put 'current' back into the tree. */
__enqueue_entity(cfs_rq, prev);
/* in !on_rq case, update occurred at dequeue */
update_load_avg(prev, 0);
update_load_avg(cfs_rq, prev, 0);
}
cfs_rq->curr = NULL;
}
......@@ -3997,8 +4280,8 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
/*
* Ensure that runnable average is periodically updated.
*/
update_load_avg(curr, UPDATE_TG);
update_cfs_shares(curr);
update_load_avg(cfs_rq, curr, UPDATE_TG);
update_cfs_group(curr);
#ifdef CONFIG_SCHED_HRTICK
/*
......@@ -4915,8 +5198,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_throttled(cfs_rq))
break;
update_load_avg(se, UPDATE_TG);
update_cfs_shares(se);
update_load_avg(cfs_rq, se, UPDATE_TG);
update_cfs_group(se);
}
if (!se)
......@@ -4974,8 +5257,8 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_throttled(cfs_rq))
break;
update_load_avg(se, UPDATE_TG);
update_cfs_shares(se);
update_load_avg(cfs_rq, se, UPDATE_TG);
update_cfs_group(se);
}
if (!se)
......@@ -5449,6 +5732,8 @@ static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
/*
* find_idlest_group finds and returns the least busy CPU group within the
* domain.
*
* Assumes p is allowed on at least one CPU in sd.
*/
static struct sched_group *
find_idlest_group(struct sched_domain *sd, struct task_struct *p,
......@@ -5456,8 +5741,9 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
{
struct sched_group *idlest = NULL, *group = sd->groups;
struct sched_group *most_spare_sg = NULL;
unsigned long min_runnable_load = ULONG_MAX, this_runnable_load = 0;
unsigned long min_avg_load = ULONG_MAX, this_avg_load = 0;
unsigned long min_runnable_load = ULONG_MAX;
unsigned long this_runnable_load = ULONG_MAX;
unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX;
unsigned long most_spare = 0, this_spare = 0;
int load_idx = sd->forkexec_idx;
int imbalance_scale = 100 + (sd->imbalance_pct-100)/2;
......@@ -5578,10 +5864,10 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
}
/*
* find_idlest_cpu - find the idlest cpu among the cpus in group.
* find_idlest_group_cpu - find the idlest cpu among the cpus in group.
*/
static int
find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
{
unsigned long load, min_load = ULONG_MAX;
unsigned int min_exit_latency = UINT_MAX;
......@@ -5630,6 +5916,53 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
}
static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
int cpu, int prev_cpu, int sd_flag)
{
int new_cpu = cpu;
if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed))
return prev_cpu;
while (sd) {
struct sched_group *group;
struct sched_domain *tmp;
int weight;
if (!(sd->flags & sd_flag)) {
sd = sd->child;
continue;
}
group = find_idlest_group(sd, p, cpu, sd_flag);
if (!group) {
sd = sd->child;
continue;
}
new_cpu = find_idlest_group_cpu(group, p, cpu);
if (new_cpu == cpu) {
/* Now try balancing at a lower domain level of cpu */
sd = sd->child;
continue;
}
/* Now try balancing at a lower domain level of new_cpu */
cpu = new_cpu;
weight = sd->span_weight;
sd = NULL;
for_each_domain(cpu, tmp) {
if (weight <= tmp->span_weight)
break;
if (tmp->flags & sd_flag)
sd = tmp;
}
/* while loop will break here if sd == NULL */
}
return new_cpu;
}
#ifdef CONFIG_SCHED_SMT
static inline void set_idle_cores(int cpu, int val)
......@@ -5982,50 +6315,30 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
new_cpu = cpu;
}
if (sd && !(sd_flag & SD_BALANCE_FORK)) {
/*
* We're going to need the task's util for capacity_spare_wake
* in find_idlest_group. Sync it up to prev_cpu's
* last_update_time.
*/
sync_entity_load_avg(&p->se);
}
if (!sd) {
pick_cpu:
pick_cpu:
if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
} else while (sd) {
struct sched_group *group;
int weight;
if (!(sd->flags & sd_flag)) {
sd = sd->child;
continue;
}
group = find_idlest_group(sd, p, cpu, sd_flag);
if (!group) {
sd = sd->child;
continue;
}
new_cpu = find_idlest_cpu(group, p, cpu);
if (new_cpu == -1 || new_cpu == cpu) {
/* Now try balancing at a lower domain level of cpu */
sd = sd->child;
continue;
}
/* Now try balancing at a lower domain level of new_cpu */
cpu = new_cpu;
weight = sd->span_weight;
sd = NULL;
for_each_domain(cpu, tmp) {
if (weight <= tmp->span_weight)
break;
if (tmp->flags & sd_flag)
sd = tmp;
}
/* while loop will break here if sd == NULL */
} else {
new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
}
rcu_read_unlock();
return new_cpu;
}
static void detach_entity_cfs_rq(struct sched_entity *se);
/*
* Called immediately before a task is migrated to a new cpu; task_cpu(p) and
* cfs_rq_of(p) references at time of call are still valid and identify the
......@@ -6059,14 +6372,25 @@ static void migrate_task_rq_fair(struct task_struct *p)
se->vruntime -= min_vruntime;
}
/*
* We are supposed to update the task to "current" time, then its up to date
* and ready to go to new CPU/cfs_rq. But we have difficulty in getting
* what current time is, so simply throw away the out-of-date time. This
* will result in the wakee task is less decayed, but giving the wakee more
* load sounds not bad.
*/
remove_entity_load_avg(&p->se);
if (p->on_rq == TASK_ON_RQ_MIGRATING) {
/*
* In case of TASK_ON_RQ_MIGRATING we in fact hold the 'old'
* rq->lock and can modify state directly.
*/
lockdep_assert_held(&task_rq(p)->lock);
detach_entity_cfs_rq(&p->se);
} else {
/*
* We are supposed to update the task to "current" time, then
* its up to date and ready to go to new CPU/cfs_rq. But we
* have difficulty in getting what current time is, so simply
* throw away the out-of-date time. This will result in the
* wakee task is less decayed, but giving the wakee more load
* sounds not bad.
*/
remove_entity_load_avg(&p->se);
}
/* Tell new CPU we are migrated */
p->se.avg.last_update_time = 0;
......@@ -6334,10 +6658,7 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
set_next_entity(cfs_rq, se);
}
if (hrtick_enabled(rq))
hrtick_start_fair(rq, p);
return p;
goto done;
simple:
#endif
......@@ -6351,6 +6672,16 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
p = task_of(se);
done: __maybe_unused
#ifdef CONFIG_SMP
/*
* Move the next running task to the front of
* the list, so our cfs_tasks list becomes MRU
* one.
*/
list_move(&p->se.group_node, &rq->cfs_tasks);
#endif
if (hrtick_enabled(rq))
hrtick_start_fair(rq, p);
......@@ -6786,11 +7117,12 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
*/
static struct task_struct *detach_one_task(struct lb_env *env)
{
struct task_struct *p, *n;
struct task_struct *p;
lockdep_assert_held(&env->src_rq->lock);
list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
list_for_each_entry_reverse(p,
&env->src_rq->cfs_tasks, se.group_node) {
if (!can_migrate_task(p, env))
continue;
......@@ -6836,7 +7168,7 @@ static int detach_tasks(struct lb_env *env)
if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
break;
p = list_first_entry(tasks, struct task_struct, se.group_node);
p = list_last_entry(tasks, struct task_struct, se.group_node);
env->loop++;
/* We've more or less seen every task there is, call it quits */
......@@ -6886,7 +7218,7 @@ static int detach_tasks(struct lb_env *env)
continue;
next:
list_move_tail(&p->se.group_node, tasks);
list_move(&p->se.group_node, tasks);
}
/*
......@@ -6962,7 +7294,7 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
if (cfs_rq->avg.util_sum)
return false;
if (cfs_rq->runnable_load_sum)
if (cfs_rq->avg.runnable_load_sum)
return false;
return true;
......@@ -6994,7 +7326,7 @@ static void update_blocked_averages(int cpu)
/* Propagate pending load changes to the parent, if any: */
se = cfs_rq->tg->se[cpu];
if (se && !skip_blocked_update(se))
update_load_avg(se, 0);
update_load_avg(cfs_rq_of(se), se, 0);
/*
* There can be a lot of idle CPU cgroups. Don't let fully
......@@ -7875,8 +8207,11 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
if (busiest->group_type == group_imbalanced)
goto force_balance;
/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) &&
/*
* When dst_cpu is idle, prevent SMP nice and/or asymmetric group
* capacities from resulting in underutilization due to avg_load.
*/
if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) &&
busiest->group_no_capacity)
goto force_balance;
......@@ -8693,7 +9028,7 @@ void nohz_balance_enter_idle(int cpu)
return;
/* Spare idle load balancing on CPUs that don't want to be disturbed: */
if (!is_housekeeping_cpu(cpu))
if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
return;
if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
......@@ -9158,7 +9493,7 @@ static void propagate_entity_cfs_rq(struct sched_entity *se)
if (cfs_rq_throttled(cfs_rq))
break;
update_load_avg(se, UPDATE_TG);
update_load_avg(cfs_rq, se, UPDATE_TG);
}
}
#else
......@@ -9170,7 +9505,7 @@ static void detach_entity_cfs_rq(struct sched_entity *se)
struct cfs_rq *cfs_rq = cfs_rq_of(se);
/* Catch up with the cfs_rq and remove our load when we leave */
update_load_avg(se, 0);
update_load_avg(cfs_rq, se, 0);
detach_entity_load_avg(cfs_rq, se);
update_tg_load_avg(cfs_rq, false);
propagate_entity_cfs_rq(se);
......@@ -9189,7 +9524,7 @@ static void attach_entity_cfs_rq(struct sched_entity *se)
#endif
/* Synchronize entity with its cfs_rq */
update_load_avg(se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
attach_entity_load_avg(cfs_rq, se);
update_tg_load_avg(cfs_rq, false);
propagate_entity_cfs_rq(se);
......@@ -9271,11 +9606,7 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
#endif
#ifdef CONFIG_SMP
#ifdef CONFIG_FAIR_GROUP_SCHED
cfs_rq->propagate_avg = 0;
#endif
atomic_long_set(&cfs_rq->removed_load_avg, 0);
atomic_long_set(&cfs_rq->removed_util_avg, 0);
raw_spin_lock_init(&cfs_rq->removed.lock);
#endif
}
......@@ -9473,8 +9804,8 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
for_each_sched_entity(se) {
update_load_avg(se, UPDATE_TG);
update_cfs_shares(se);
update_load_avg(cfs_rq_of(se), se, UPDATE_TG);
update_cfs_group(se);
}
rq_unlock_irqrestore(rq, &rf);
}
......
......@@ -209,6 +209,7 @@ static void cpuidle_idle_call(void)
*/
static void do_idle(void)
{
int cpu = smp_processor_id();
/*
* If the arch has a polling bit, we maintain an invariant:
*
......@@ -219,14 +220,13 @@ static void do_idle(void)
*/
__current_set_polling();
quiet_vmstat();
tick_nohz_idle_enter();
while (!need_resched()) {
check_pgt_cache();
rmb();
if (cpu_is_offline(smp_processor_id())) {
if (cpu_is_offline(cpu)) {
cpuhp_report_idle_dead();
arch_cpu_idle_dead();
}
......
/*
* Housekeeping management. Manage the targets for routine code that can run on
* any CPU: unbound workqueues, timers, kthreads and any offloadable work.
*
* Copyright (C) 2017 Red Hat, Inc., Frederic Weisbecker
*
*/
#include <linux/sched/isolation.h>
#include <linux/tick.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/static_key.h>
#include <linux/ctype.h>
DEFINE_STATIC_KEY_FALSE(housekeeping_overriden);
EXPORT_SYMBOL_GPL(housekeeping_overriden);
static cpumask_var_t housekeeping_mask;
static unsigned int housekeeping_flags;
int housekeeping_any_cpu(enum hk_flags flags)
{
if (static_branch_unlikely(&housekeeping_overriden))
if (housekeeping_flags & flags)
return cpumask_any_and(housekeeping_mask, cpu_online_mask);
return smp_processor_id();
}
EXPORT_SYMBOL_GPL(housekeeping_any_cpu);
const struct cpumask *housekeeping_cpumask(enum hk_flags flags)
{
if (static_branch_unlikely(&housekeeping_overriden))
if (housekeeping_flags & flags)
return housekeeping_mask;
return cpu_possible_mask;
}
EXPORT_SYMBOL_GPL(housekeeping_cpumask);
void housekeeping_affine(struct task_struct *t, enum hk_flags flags)
{
if (static_branch_unlikely(&housekeeping_overriden))
if (housekeeping_flags & flags)
set_cpus_allowed_ptr(t, housekeeping_mask);
}
EXPORT_SYMBOL_GPL(housekeeping_affine);
bool housekeeping_test_cpu(int cpu, enum hk_flags flags)
{
if (static_branch_unlikely(&housekeeping_overriden))
if (housekeeping_flags & flags)
return cpumask_test_cpu(cpu, housekeeping_mask);
return true;
}
EXPORT_SYMBOL_GPL(housekeeping_test_cpu);
void __init housekeeping_init(void)
{
if (!housekeeping_flags)
return;
static_branch_enable(&housekeeping_overriden);
/* We need at least one CPU to handle housekeeping work */
WARN_ON_ONCE(cpumask_empty(housekeeping_mask));
}
static int __init housekeeping_setup(char *str, enum hk_flags flags)
{
cpumask_var_t non_housekeeping_mask;
int err;
alloc_bootmem_cpumask_var(&non_housekeeping_mask);
err = cpulist_parse(str, non_housekeeping_mask);
if (err < 0 || cpumask_last(non_housekeeping_mask) >= nr_cpu_ids) {
pr_warn("Housekeeping: nohz_full= or isolcpus= incorrect CPU range\n");
free_bootmem_cpumask_var(non_housekeeping_mask);
return 0;
}
if (!housekeeping_flags) {
alloc_bootmem_cpumask_var(&housekeeping_mask);
cpumask_andnot(housekeeping_mask,
cpu_possible_mask, non_housekeeping_mask);
if (cpumask_empty(housekeeping_mask))
cpumask_set_cpu(smp_processor_id(), housekeeping_mask);
} else {
cpumask_var_t tmp;
alloc_bootmem_cpumask_var(&tmp);
cpumask_andnot(tmp, cpu_possible_mask, non_housekeeping_mask);
if (!cpumask_equal(tmp, housekeeping_mask)) {
pr_warn("Housekeeping: nohz_full= must match isolcpus=\n");
free_bootmem_cpumask_var(tmp);
free_bootmem_cpumask_var(non_housekeeping_mask);
return 0;
}
free_bootmem_cpumask_var(tmp);
}
if ((flags & HK_FLAG_TICK) && !(housekeeping_flags & HK_FLAG_TICK)) {
if (IS_ENABLED(CONFIG_NO_HZ_FULL)) {
tick_nohz_full_setup(non_housekeeping_mask);
} else {
pr_warn("Housekeeping: nohz unsupported."
" Build with CONFIG_NO_HZ_FULL\n");
free_bootmem_cpumask_var(non_housekeeping_mask);
return 0;
}
}
housekeeping_flags |= flags;
free_bootmem_cpumask_var(non_housekeeping_mask);
return 1;
}
static int __init housekeeping_nohz_full_setup(char *str)
{
unsigned int flags;
flags = HK_FLAG_TICK | HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC;
return housekeeping_setup(str, flags);
}
__setup("nohz_full=", housekeeping_nohz_full_setup);
static int __init housekeeping_isolcpus_setup(char *str)
{
unsigned int flags = 0;
while (isalpha(*str)) {
if (!strncmp(str, "nohz,", 5)) {
str += 5;
flags |= HK_FLAG_TICK;
continue;
}
if (!strncmp(str, "domain,", 7)) {
str += 7;
flags |= HK_FLAG_DOMAIN;
continue;
}
pr_warn("isolcpus: Error, unknown flag\n");
return 0;
}
/* Default behaviour for isolcpus without flags */
if (!flags)
flags |= HK_FLAG_DOMAIN;
return housekeeping_setup(str, flags);
}
__setup("isolcpus=", housekeeping_isolcpus_setup);
......@@ -74,10 +74,6 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
raw_spin_unlock(&rt_b->rt_runtime_lock);
}
#if defined(CONFIG_SMP) && defined(HAVE_RT_PUSH_IPI)
static void push_irq_work_func(struct irq_work *work);
#endif
void init_rt_rq(struct rt_rq *rt_rq)
{
struct rt_prio_array *array;
......@@ -97,13 +93,6 @@ void init_rt_rq(struct rt_rq *rt_rq)
rt_rq->rt_nr_migratory = 0;
rt_rq->overloaded = 0;
plist_head_init(&rt_rq->pushable_tasks);
#ifdef HAVE_RT_PUSH_IPI
rt_rq->push_flags = 0;
rt_rq->push_cpu = nr_cpu_ids;
raw_spin_lock_init(&rt_rq->push_lock);
init_irq_work(&rt_rq->push_work, push_irq_work_func);
#endif
#endif /* CONFIG_SMP */
/* We start is dequeued state, because no RT tasks are queued */
rt_rq->rt_queued = 0;
......@@ -1876,241 +1865,166 @@ static void push_rt_tasks(struct rq *rq)
}
#ifdef HAVE_RT_PUSH_IPI
/*
* The search for the next cpu always starts at rq->cpu and ends
* when we reach rq->cpu again. It will never return rq->cpu.
* This returns the next cpu to check, or nr_cpu_ids if the loop
* is complete.
* When a high priority task schedules out from a CPU and a lower priority
* task is scheduled in, a check is made to see if there's any RT tasks
* on other CPUs that are waiting to run because a higher priority RT task
* is currently running on its CPU. In this case, the CPU with multiple RT
* tasks queued on it (overloaded) needs to be notified that a CPU has opened
* up that may be able to run one of its non-running queued RT tasks.
*
* All CPUs with overloaded RT tasks need to be notified as there is currently
* no way to know which of these CPUs have the highest priority task waiting
* to run. Instead of trying to take a spinlock on each of these CPUs,
* which has shown to cause large latency when done on machines with many
* CPUs, sending an IPI to the CPUs to have them push off the overloaded
* RT tasks waiting to run.
*
* Just sending an IPI to each of the CPUs is also an issue, as on large
* count CPU machines, this can cause an IPI storm on a CPU, especially
* if its the only CPU with multiple RT tasks queued, and a large number
* of CPUs scheduling a lower priority task at the same time.
*
* Each root domain has its own irq work function that can iterate over
* all CPUs with RT overloaded tasks. Since all CPUs with overloaded RT
* tassk must be checked if there's one or many CPUs that are lowering
* their priority, there's a single irq work iterator that will try to
* push off RT tasks that are waiting to run.
*
* When a CPU schedules a lower priority task, it will kick off the
* irq work iterator that will jump to each CPU with overloaded RT tasks.
* As it only takes the first CPU that schedules a lower priority task
* to start the process, the rto_start variable is incremented and if
* the atomic result is one, then that CPU will try to take the rto_lock.
* This prevents high contention on the lock as the process handles all
* CPUs scheduling lower priority tasks.
*
* All CPUs that are scheduling a lower priority task will increment the
* rt_loop_next variable. This will make sure that the irq work iterator
* checks all RT overloaded CPUs whenever a CPU schedules a new lower
* priority task, even if the iterator is in the middle of a scan. Incrementing
* the rt_loop_next will cause the iterator to perform another scan.
*
* rq->rt.push_cpu holds the last cpu returned by this function,
* or if this is the first instance, it must hold rq->cpu.
*/
static int rto_next_cpu(struct rq *rq)
{
int prev_cpu = rq->rt.push_cpu;
struct root_domain *rd = rq->rd;
int next;
int cpu;
cpu = cpumask_next(prev_cpu, rq->rd->rto_mask);
/*
* If the previous cpu is less than the rq's CPU, then it already
* passed the end of the mask, and has started from the beginning.
* We end if the next CPU is greater or equal to rq's CPU.
* When starting the IPI RT pushing, the rto_cpu is set to -1,
* rt_next_cpu() will simply return the first CPU found in
* the rto_mask.
*
* If rto_next_cpu() is called with rto_cpu is a valid cpu, it
* will return the next CPU found in the rto_mask.
*
* If there are no more CPUs left in the rto_mask, then a check is made
* against rto_loop and rto_loop_next. rto_loop is only updated with
* the rto_lock held, but any CPU may increment the rto_loop_next
* without any locking.
*/
if (prev_cpu < rq->cpu) {
if (cpu >= rq->cpu)
return nr_cpu_ids;
for (;;) {
} else if (cpu >= nr_cpu_ids) {
/*
* We passed the end of the mask, start at the beginning.
* If the result is greater or equal to the rq's CPU, then
* the loop is finished.
*/
cpu = cpumask_first(rq->rd->rto_mask);
if (cpu >= rq->cpu)
return nr_cpu_ids;
}
rq->rt.push_cpu = cpu;
/* When rto_cpu is -1 this acts like cpumask_first() */
cpu = cpumask_next(rd->rto_cpu, rd->rto_mask);
/* Return cpu to let the caller know if the loop is finished or not */
return cpu;
}
rd->rto_cpu = cpu;
static int find_next_push_cpu(struct rq *rq)
{
struct rq *next_rq;
int cpu;
if (cpu < nr_cpu_ids)
return cpu;
while (1) {
cpu = rto_next_cpu(rq);
if (cpu >= nr_cpu_ids)
break;
next_rq = cpu_rq(cpu);
rd->rto_cpu = -1;
/*
* ACQUIRE ensures we see the @rto_mask changes
* made prior to the @next value observed.
*
* Matches WMB in rt_set_overload().
*/
next = atomic_read_acquire(&rd->rto_loop_next);
/* Make sure the next rq can push to this rq */
if (next_rq->rt.highest_prio.next < rq->rt.highest_prio.curr)
if (rd->rto_loop == next)
break;
rd->rto_loop = next;
}
return cpu;
return -1;
}
#define RT_PUSH_IPI_EXECUTING 1
#define RT_PUSH_IPI_RESTART 2
static inline bool rto_start_trylock(atomic_t *v)
{
return !atomic_cmpxchg_acquire(v, 0, 1);
}
/*
* When a high priority task schedules out from a CPU and a lower priority
* task is scheduled in, a check is made to see if there's any RT tasks
* on other CPUs that are waiting to run because a higher priority RT task
* is currently running on its CPU. In this case, the CPU with multiple RT
* tasks queued on it (overloaded) needs to be notified that a CPU has opened
* up that may be able to run one of its non-running queued RT tasks.
*
* On large CPU boxes, there's the case that several CPUs could schedule
* a lower priority task at the same time, in which case it will look for
* any overloaded CPUs that it could pull a task from. To do this, the runqueue
* lock must be taken from that overloaded CPU. Having 10s of CPUs all fighting
* for a single overloaded CPU's runqueue lock can produce a large latency.
* (This has actually been observed on large boxes running cyclictest).
* Instead of taking the runqueue lock of the overloaded CPU, each of the
* CPUs that scheduled a lower priority task simply sends an IPI to the
* overloaded CPU. An IPI is much cheaper than taking an runqueue lock with
* lots of contention. The overloaded CPU will look to push its non-running
* RT task off, and if it does, it can then ignore the other IPIs coming
* in, and just pass those IPIs off to any other overloaded CPU.
*
* When a CPU schedules a lower priority task, it only sends an IPI to
* the "next" CPU that has overloaded RT tasks. This prevents IPI storms,
* as having 10 CPUs scheduling lower priority tasks and 10 CPUs with
* RT overloaded tasks, would cause 100 IPIs to go out at once.
*
* The overloaded RT CPU, when receiving an IPI, will try to push off its
* overloaded RT tasks and then send an IPI to the next CPU that has
* overloaded RT tasks. This stops when all CPUs with overloaded RT tasks
* have completed. Just because a CPU may have pushed off its own overloaded
* RT task does not mean it should stop sending the IPI around to other
* overloaded CPUs. There may be another RT task waiting to run on one of
* those CPUs that are of higher priority than the one that was just
* pushed.
*
* An optimization that could possibly be made is to make a CPU array similar
* to the cpupri array mask of all running RT tasks, but for the overloaded
* case, then the IPI could be sent to only the CPU with the highest priority
* RT task waiting, and that CPU could send off further IPIs to the CPU with
* the next highest waiting task. Since the overloaded case is much less likely
* to happen, the complexity of this implementation may not be worth it.
* Instead, just send an IPI around to all overloaded CPUs.
*
* The rq->rt.push_flags holds the status of the IPI that is going around.
* A run queue can only send out a single IPI at a time. The possible flags
* for rq->rt.push_flags are:
*
* (None or zero): No IPI is going around for the current rq
* RT_PUSH_IPI_EXECUTING: An IPI for the rq is being passed around
* RT_PUSH_IPI_RESTART: The priority of the running task for the rq
* has changed, and the IPI should restart
* circulating the overloaded CPUs again.
*
* rq->rt.push_cpu contains the CPU that is being sent the IPI. It is updated
* before sending to the next CPU.
*
* Instead of having all CPUs that schedule a lower priority task send
* an IPI to the same "first" CPU in the RT overload mask, they send it
* to the next overloaded CPU after their own CPU. This helps distribute
* the work when there's more than one overloaded CPU and multiple CPUs
* scheduling in lower priority tasks.
*
* When a rq schedules a lower priority task than what was currently
* running, the next CPU with overloaded RT tasks is examined first.
* That is, if CPU 1 and 5 are overloaded, and CPU 3 schedules a lower
* priority task, it will send an IPI first to CPU 5, then CPU 5 will
* send to CPU 1 if it is still overloaded. CPU 1 will clear the
* rq->rt.push_flags if RT_PUSH_IPI_RESTART is not set.
*
* The first CPU to notice IPI_RESTART is set, will clear that flag and then
* send an IPI to the next overloaded CPU after the rq->cpu and not the next
* CPU after push_cpu. That is, if CPU 1, 4 and 5 are overloaded when CPU 3
* schedules a lower priority task, and the IPI_RESTART gets set while the
* handling is being done on CPU 5, it will clear the flag and send it back to
* CPU 4 instead of CPU 1.
*
* Note, the above logic can be disabled by turning off the sched_feature
* RT_PUSH_IPI. Then the rq lock of the overloaded CPU will simply be
* taken by the CPU requesting a pull and the waiting RT task will be pulled
* by that CPU. This may be fine for machines with few CPUs.
*/
static void tell_cpu_to_push(struct rq *rq)
static inline void rto_start_unlock(atomic_t *v)
{
int cpu;
atomic_set_release(v, 0);
}
if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) {
raw_spin_lock(&rq->rt.push_lock);
/* Make sure it's still executing */
if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) {
/*
* Tell the IPI to restart the loop as things have
* changed since it started.
*/
rq->rt.push_flags |= RT_PUSH_IPI_RESTART;
raw_spin_unlock(&rq->rt.push_lock);
return;
}
raw_spin_unlock(&rq->rt.push_lock);
}
static void tell_cpu_to_push(struct rq *rq)
{
int cpu = -1;
/* When here, there's no IPI going around */
/* Keep the loop going if the IPI is currently active */
atomic_inc(&rq->rd->rto_loop_next);
rq->rt.push_cpu = rq->cpu;
cpu = find_next_push_cpu(rq);
if (cpu >= nr_cpu_ids)
/* Only one CPU can initiate a loop at a time */
if (!rto_start_trylock(&rq->rd->rto_loop_start))
return;
rq->rt.push_flags = RT_PUSH_IPI_EXECUTING;
raw_spin_lock(&rq->rd->rto_lock);
/*
* The rto_cpu is updated under the lock, if it has a valid cpu
* then the IPI is still running and will continue due to the
* update to loop_next, and nothing needs to be done here.
* Otherwise it is finishing up and an ipi needs to be sent.
*/
if (rq->rd->rto_cpu < 0)
cpu = rto_next_cpu(rq);
irq_work_queue_on(&rq->rt.push_work, cpu);
raw_spin_unlock(&rq->rd->rto_lock);
rto_start_unlock(&rq->rd->rto_loop_start);
if (cpu >= 0)
irq_work_queue_on(&rq->rd->rto_push_work, cpu);
}
/* Called from hardirq context */
static void try_to_push_tasks(void *arg)
void rto_push_irq_work_func(struct irq_work *work)
{
struct rt_rq *rt_rq = arg;
struct rq *rq, *src_rq;
int this_cpu;
struct rq *rq;
int cpu;
this_cpu = rt_rq->push_cpu;
rq = this_rq();
/* Paranoid check */
BUG_ON(this_cpu != smp_processor_id());
rq = cpu_rq(this_cpu);
src_rq = rq_of_rt_rq(rt_rq);
again:
/*
* We do not need to grab the lock to check for has_pushable_tasks.
* When it gets updated, a check is made if a push is possible.
*/
if (has_pushable_tasks(rq)) {
raw_spin_lock(&rq->lock);
push_rt_task(rq);
push_rt_tasks(rq);
raw_spin_unlock(&rq->lock);
}
/* Pass the IPI to the next rt overloaded queue */
raw_spin_lock(&rt_rq->push_lock);
/*
* If the source queue changed since the IPI went out,
* we need to restart the search from that CPU again.
*/
if (rt_rq->push_flags & RT_PUSH_IPI_RESTART) {
rt_rq->push_flags &= ~RT_PUSH_IPI_RESTART;
rt_rq->push_cpu = src_rq->cpu;
}
raw_spin_lock(&rq->rd->rto_lock);
cpu = find_next_push_cpu(src_rq);
/* Pass the IPI to the next rt overloaded queue */
cpu = rto_next_cpu(rq);
if (cpu >= nr_cpu_ids)
rt_rq->push_flags &= ~RT_PUSH_IPI_EXECUTING;
raw_spin_unlock(&rt_rq->push_lock);
raw_spin_unlock(&rq->rd->rto_lock);
if (cpu >= nr_cpu_ids)
if (cpu < 0)
return;
/*
* It is possible that a restart caused this CPU to be
* chosen again. Don't bother with an IPI, just see if we
* have more to push.
*/
if (unlikely(cpu == rq->cpu))
goto again;
/* Try the next RT overloaded CPU */
irq_work_queue_on(&rt_rq->push_work, cpu);
}
static void push_irq_work_func(struct irq_work *work)
{
struct rt_rq *rt_rq = container_of(work, struct rt_rq, push_work);
try_to_push_tasks(rt_rq);
irq_work_queue_on(&rq->rd->rto_push_work, cpu);
}
#endif /* HAVE_RT_PUSH_IPI */
......
......@@ -227,7 +227,7 @@ struct dl_bw {
static inline void __dl_update(struct dl_bw *dl_b, s64 bw);
static inline
void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw, int cpus)
void __dl_sub(struct dl_bw *dl_b, u64 tsk_bw, int cpus)
{
dl_b->total_bw -= tsk_bw;
__dl_update(dl_b, (s32)tsk_bw / cpus);
......@@ -256,7 +256,6 @@ extern int sched_dl_overflow(struct task_struct *p, int policy,
extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr);
extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr);
extern bool __checkparam_dl(const struct sched_attr *attr);
extern void __dl_clear_params(struct task_struct *p);
extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr);
extern int dl_task_can_attach(struct task_struct *p,
const struct cpumask *cs_cpus_allowed);
......@@ -419,6 +418,7 @@ struct cfs_bandwidth { };
/* CFS-related fields in a runqueue */
struct cfs_rq {
struct load_weight load;
unsigned long runnable_weight;
unsigned int nr_running, h_nr_running;
u64 exec_clock;
......@@ -444,18 +444,22 @@ struct cfs_rq {
* CFS load tracking
*/
struct sched_avg avg;
u64 runnable_load_sum;
unsigned long runnable_load_avg;
#ifdef CONFIG_FAIR_GROUP_SCHED
unsigned long tg_load_avg_contrib;
unsigned long propagate_avg;
#endif
atomic_long_t removed_load_avg, removed_util_avg;
#ifndef CONFIG_64BIT
u64 load_last_update_time_copy;
#endif
struct {
raw_spinlock_t lock ____cacheline_aligned;
int nr;
unsigned long load_avg;
unsigned long util_avg;
unsigned long runnable_sum;
} removed;
#ifdef CONFIG_FAIR_GROUP_SCHED
unsigned long tg_load_avg_contrib;
long propagate;
long prop_runnable_sum;
/*
* h_load = weight * f(tg)
*
......@@ -502,7 +506,7 @@ static inline int rt_bandwidth_enabled(void)
}
/* RT IPI pull logic requires IRQ_WORK */
#ifdef CONFIG_IRQ_WORK
#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_SMP)
# define HAVE_RT_PUSH_IPI
#endif
......@@ -524,12 +528,6 @@ struct rt_rq {
unsigned long rt_nr_total;
int overloaded;
struct plist_head pushable_tasks;
#ifdef HAVE_RT_PUSH_IPI
int push_flags;
int push_cpu;
struct irq_work push_work;
raw_spinlock_t push_lock;
#endif
#endif /* CONFIG_SMP */
int rt_queued;
......@@ -638,6 +636,19 @@ struct root_domain {
struct dl_bw dl_bw;
struct cpudl cpudl;
#ifdef HAVE_RT_PUSH_IPI
/*
* For IPI pull requests, loop across the rto_mask.
*/
struct irq_work rto_push_work;
raw_spinlock_t rto_lock;
/* These are only updated and read within rto_lock */
int rto_loop;
int rto_cpu;
/* These atomics are updated outside of a lock */
atomic_t rto_loop_next;
atomic_t rto_loop_start;
#endif
/*
* The "RT overload" flag: it gets set if a CPU has more than
* one runnable RT task.
......@@ -655,6 +666,9 @@ extern void init_defrootdomain(void);
extern int sched_init_domains(const struct cpumask *cpu_map);
extern void rq_attach_root(struct rq *rq, struct root_domain *rd);
#ifdef HAVE_RT_PUSH_IPI
extern void rto_push_irq_work_func(struct irq_work *work);
#endif
#endif /* CONFIG_SMP */
/*
......@@ -1219,8 +1233,6 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
# define const_debug const
#endif
extern const_debug unsigned int sysctl_sched_features;
#define SCHED_FEAT(name, enabled) \
__SCHED_FEAT_##name ,
......@@ -1232,6 +1244,13 @@ enum {
#undef SCHED_FEAT
#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL)
/*
* To support run-time toggling of sched features, all the translation units
* (but core.c) reference the sysctl_sched_features defined in core.c.
*/
extern const_debug unsigned int sysctl_sched_features;
#define SCHED_FEAT(name, enabled) \
static __always_inline bool static_branch_##name(struct static_key *key) \
{ \
......@@ -1239,13 +1258,27 @@ static __always_inline bool static_branch_##name(struct static_key *key) \
}
#include "features.h"
#undef SCHED_FEAT
extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
#define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x]))
#else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */
/*
* Each translation unit has its own copy of sysctl_sched_features to allow
* constants propagation at compile time and compiler optimization based on
* features default.
*/
#define SCHED_FEAT(name, enabled) \
(1UL << __SCHED_FEAT_##name) * enabled |
static const_debug __maybe_unused unsigned int sysctl_sched_features =
#include "features.h"
0;
#undef SCHED_FEAT
#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
extern struct static_key_false sched_numa_balancing;
......@@ -1530,6 +1563,8 @@ extern void init_sched_dl_class(void);
extern void init_sched_rt_class(void);
extern void init_sched_fair_class(void);
extern void reweight_task(struct task_struct *p, int prio);
extern void resched_curr(struct rq *rq);
extern void resched_cpu(int cpu);
......
......@@ -4,6 +4,7 @@
*/
#include <linux/sched.h>
#include <linux/mutex.h>
#include <linux/sched/isolation.h>
#include "sched.h"
......@@ -269,6 +270,12 @@ static int init_rootdomain(struct root_domain *rd)
if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
goto free_dlo_mask;
#ifdef HAVE_RT_PUSH_IPI
rd->rto_cpu = -1;
raw_spin_lock_init(&rd->rto_lock);
init_irq_work(&rd->rto_push_work, rto_push_irq_work_func);
#endif
init_dl_bw(&rd->dl_bw);
if (cpudl_init(&rd->cpudl) != 0)
goto free_rto_mask;
......@@ -464,21 +471,6 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
update_top_cache_domain(cpu);
}
/* Setup the mask of CPUs configured for isolated domains */
static int __init isolated_cpu_setup(char *str)
{
int ret;
alloc_bootmem_cpumask_var(&cpu_isolated_map);
ret = cpulist_parse(str, cpu_isolated_map);
if (ret) {
pr_err("sched: Error, all isolcpus= values must be between 0 and %u\n", nr_cpu_ids);
return 0;
}
return 1;
}
__setup("isolcpus=", isolated_cpu_setup);
struct s_data {
struct sched_domain ** __percpu sd;
struct root_domain *rd;
......@@ -1158,6 +1150,7 @@ sd_init(struct sched_domain_topology_level *tl,
sd->smt_gain = 1178; /* ~15% */
} else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
sd->flags |= SD_PREFER_SIBLING;
sd->imbalance_pct = 117;
sd->cache_nice_tries = 1;
sd->busy_idx = 2;
......@@ -1332,6 +1325,10 @@ void sched_init_numa(void)
if (!sched_domains_numa_distance)
return;
/* Includes NUMA identity node at level 0. */
sched_domains_numa_distance[level++] = curr_distance;
sched_domains_numa_levels = level;
/*
* O(nr_nodes^2) deduplicating selection sort -- in order to find the
* unique distances in the node_distance() table.
......@@ -1379,8 +1376,7 @@ void sched_init_numa(void)
return;
/*
* 'level' contains the number of unique distances, excluding the
* identity distance node_distance(i,i).
* 'level' contains the number of unique distances
*
* The sched_domains_numa_distance[] array includes the actual distance
* numbers.
......@@ -1441,10 +1437,19 @@ void sched_init_numa(void)
for (i = 0; sched_domain_topology[i].mask; i++)
tl[i] = sched_domain_topology[i];
/*
* Add the NUMA identity distance, aka single NODE.
*/
tl[i++] = (struct sched_domain_topology_level){
.mask = sd_numa_mask,
.numa_level = 0,
SD_INIT_NAME(NODE)
};
/*
* .. and append 'j' levels of NUMA goodness.
*/
for (j = 0; j < level; i++, j++) {
for (j = 1; j < level; i++, j++) {
tl[i] = (struct sched_domain_topology_level){
.mask = sd_numa_mask,
.sd_flags = cpu_numa_flags,
......@@ -1774,7 +1779,7 @@ int sched_init_domains(const struct cpumask *cpu_map)
doms_cur = alloc_sched_domains(ndoms_cur);
if (!doms_cur)
doms_cur = &fallback_doms;
cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
cpumask_and(doms_cur[0], cpu_map, housekeeping_cpumask(HK_FLAG_DOMAIN));
err = build_sched_domains(doms_cur[0], NULL);
register_sched_domain_sysctl();
......@@ -1857,7 +1862,8 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
doms_new = alloc_sched_domains(1);
if (doms_new) {
n = 1;
cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
cpumask_and(doms_new[0], cpu_active_mask,
housekeeping_cpumask(HK_FLAG_DOMAIN));
}
} else {
n = ndoms_new;
......@@ -1880,7 +1886,8 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
if (!doms_new) {
n = 0;
doms_new = &fallback_doms;
cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
cpumask_and(doms_new[0], cpu_active_mask,
housekeeping_cpumask(HK_FLAG_DOMAIN));
}
/* Build new domains: */
......
......@@ -27,6 +27,7 @@
#include <linux/irq_work.h>
#include <linux/posix-timers.h>
#include <linux/context_tracking.h>
#include <linux/mm.h>
#include <asm/irq_regs.h>
......@@ -165,7 +166,6 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
#ifdef CONFIG_NO_HZ_FULL
cpumask_var_t tick_nohz_full_mask;
cpumask_var_t housekeeping_mask;
bool tick_nohz_full_running;
static atomic_t tick_dep_mask;
......@@ -385,20 +385,13 @@ void __tick_nohz_task_switch(void)
local_irq_restore(flags);
}
/* Parse the boot-time nohz CPU list from the kernel parameters. */
static int __init tick_nohz_full_setup(char *str)
/* Get the boot-time nohz CPU list from the kernel parameters. */
void __init tick_nohz_full_setup(cpumask_var_t cpumask)
{
alloc_bootmem_cpumask_var(&tick_nohz_full_mask);
if (cpulist_parse(str, tick_nohz_full_mask) < 0) {
pr_warn("NO_HZ: Incorrect nohz_full cpumask\n");
free_bootmem_cpumask_var(tick_nohz_full_mask);
return 1;
}
cpumask_copy(tick_nohz_full_mask, cpumask);
tick_nohz_full_running = true;
return 1;
}
__setup("nohz_full=", tick_nohz_full_setup);
static int tick_nohz_cpu_down(unsigned int cpu)
{
......@@ -437,13 +430,6 @@ void __init tick_nohz_init(void)
return;
}
if (!alloc_cpumask_var(&housekeeping_mask, GFP_KERNEL)) {
WARN(1, "NO_HZ: Can't allocate not-full dynticks cpumask\n");
cpumask_clear(tick_nohz_full_mask);
tick_nohz_full_running = false;
return;
}
/*
* Full dynticks uses irq work to drive the tick rescheduling on safe
* locking contexts. But then we need irq work to raise its own
......@@ -452,7 +438,6 @@ void __init tick_nohz_init(void)
if (!arch_irq_work_has_interrupt()) {
pr_warn("NO_HZ: Can't run full dynticks because arch doesn't support irq work self-IPIs\n");
cpumask_clear(tick_nohz_full_mask);
cpumask_copy(housekeeping_mask, cpu_possible_mask);
tick_nohz_full_running = false;
return;
}
......@@ -465,9 +450,6 @@ void __init tick_nohz_init(void)
cpumask_clear_cpu(cpu, tick_nohz_full_mask);
}
cpumask_andnot(housekeeping_mask,
cpu_possible_mask, tick_nohz_full_mask);
for_each_cpu(cpu, tick_nohz_full_mask)
context_tracking_cpu_set(cpu);
......@@ -477,12 +459,6 @@ void __init tick_nohz_init(void)
WARN_ON(ret < 0);
pr_info("NO_HZ: Full dynticks CPUs: %*pbl.\n",
cpumask_pr_args(tick_nohz_full_mask));
/*
* We need at least one CPU to handle housekeeping work such
* as timekeeping, unbound timers, workqueues, ...
*/
WARN_ON_ONCE(cpumask_empty(housekeeping_mask));
}
#endif
......@@ -787,6 +763,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
if (!ts->tick_stopped) {
calc_load_nohz_start();
cpu_load_update_nohz_start();
quiet_vmstat();
ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
ts->tick_stopped = 1;
......
......@@ -921,8 +921,8 @@ static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter,
trace_assign_type(field, iter->ent);
T = __task_state_to_char(field->next_state);
S = __task_state_to_char(field->prev_state);
T = task_index_to_char(field->next_state);
S = task_index_to_char(field->prev_state);
trace_find_cmdline(field->next_pid, comm);
trace_seq_printf(&iter->seq,
" %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n",
......@@ -957,8 +957,8 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S)
trace_assign_type(field, iter->ent);
if (!S)
S = __task_state_to_char(field->prev_state);
T = __task_state_to_char(field->next_state);
S = task_index_to_char(field->prev_state);
T = task_index_to_char(field->next_state);
trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n",
field->prev_pid,
field->prev_prio,
......@@ -993,8 +993,8 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S)
trace_assign_type(field, iter->ent);
if (!S)
S = __task_state_to_char(field->prev_state);
T = __task_state_to_char(field->next_state);
S = task_index_to_char(field->prev_state);
T = task_index_to_char(field->next_state);
SEQ_PUT_HEX_FIELD(s, field->prev_pid);
SEQ_PUT_HEX_FIELD(s, field->prev_prio);
......
......@@ -398,10 +398,10 @@ tracing_sched_switch_trace(struct trace_array *tr,
entry = ring_buffer_event_data(event);
entry->prev_pid = prev->pid;
entry->prev_prio = prev->prio;
entry->prev_state = __get_task_state(prev);
entry->prev_state = task_state_index(prev);
entry->next_pid = next->pid;
entry->next_prio = next->prio;
entry->next_state = __get_task_state(next);
entry->next_state = task_state_index(next);
entry->next_cpu = task_cpu(next);
if (!call_filter_check_discard(call, entry, buffer, event))
......@@ -426,10 +426,10 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
entry = ring_buffer_event_data(event);
entry->prev_pid = curr->pid;
entry->prev_prio = curr->prio;
entry->prev_state = __get_task_state(curr);
entry->prev_state = task_state_index(curr);
entry->next_pid = wakee->pid;
entry->next_prio = wakee->prio;
entry->next_state = __get_task_state(wakee);
entry->next_state = task_state_index(wakee);
entry->next_cpu = task_cpu(wakee);
if (!call_filter_check_discard(call, entry, buffer, event))
......
......@@ -25,6 +25,7 @@
#include <linux/workqueue.h>
#include <linux/sched/clock.h>
#include <linux/sched/debug.h>
#include <linux/sched/isolation.h>
#include <asm/irq_regs.h>
#include <linux/kvm_para.h>
......@@ -774,15 +775,11 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write,
void __init lockup_detector_init(void)
{
#ifdef CONFIG_NO_HZ_FULL
if (tick_nohz_full_enabled()) {
if (tick_nohz_full_enabled())
pr_info("Disabling watchdog on nohz_full cores by default\n");
cpumask_copy(&watchdog_cpumask, housekeeping_mask);
} else
cpumask_copy(&watchdog_cpumask, cpu_possible_mask);
#else
cpumask_copy(&watchdog_cpumask, cpu_possible_mask);
#endif
cpumask_copy(&watchdog_cpumask,
housekeeping_cpumask(HK_FLAG_TIMER));
if (!watchdog_nmi_probe())
nmi_watchdog_available = true;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment