Commit 3e201463 authored by Linus Torvalds

Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Ingo Molnar:
 "The main updates in this cycle were:

   - Group balancing enhancements and cleanups (Brendan Jackman)

   - Move CPU isolation related functionality into its separate
     kernel/sched/isolation.c file, with related 'housekeeping_*()'
     namespace and nomenclature et al. (Frederic Weisbecker)

   - Improve the interactive/cpu-intense fairness calculation (Josef
     Bacik)

   - Improve the PELT code and related cleanups (Peter Zijlstra)

   - Improve the logic of pick_next_task_fair() (Uladzislau Rezki)

   - Improve the RT IPI based balancing logic (Steven Rostedt)

   - Various micro-optimizations:

     - better !CONFIG_SCHED_DEBUG optimizations (Patrick Bellasi)

     - better idle loop (Cheng Jian)

   - ... plus misc fixes, cleanups and updates"

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (54 commits)
  sched/core: Optimize sched_feat() for !CONFIG_SCHED_DEBUG builds
  sched/sysctl: Fix attributes of some extern declarations
  sched/isolation: Document isolcpus= boot parameter flags, mark it deprecated
  sched/isolation: Add basic isolcpus flags
  sched/isolation: Move isolcpus= handling to the housekeeping code
  sched/isolation: Handle the nohz_full= parameter
  sched/isolation: Introduce housekeeping flags
  sched/isolation: Split out new CONFIG_CPU_ISOLATION=y config from CONFIG_NO_HZ_FULL
  sched/isolation: Rename is_housekeeping_cpu() to housekeeping_cpu()
  sched/isolation: Use its own static key
  sched/isolation: Make the housekeeping cpumask private
  sched/isolation: Provide a dynamic off-case to housekeeping_any_cpu()
  sched/isolation, watchdog: Use housekeeping_cpumask() instead of ad-hoc version
  sched/isolation: Move housekeeping related code to its own file
  sched/idle: Micro-optimize the idle loop
  sched/isolcpus: Fix "isolcpus=" boot parameter handling when !CONFIG_CPUMASK_OFFSTACK
  x86/tsc: Append the 'tsc=' description for the 'tsc=unstable' boot parameter
  sched/rt: Simplify the IPI based RT balancing logic
  block/ioprio: Use a helper to check for RT prio
  sched/rt: Add a helper to test for a RT task
  ...
parents f2be8bd5 765cc3a4
......@@ -1730,20 +1730,33 @@
isapnp= [ISAPNP]
Format: <RDP>,<reset>,<pci_scan>,<verbosity>
isolcpus= [KNL,SMP] Isolate CPUs from the general scheduler.
The argument is a cpu list, as described above.
isolcpus= [KNL,SMP] Isolate a given set of CPUs from disturbance.
[Deprecated - use cpusets instead]
Format: [flag-list,]<cpu-list>
Specify one or more CPUs to isolate from disturbances
specified in the flag list (default: domain):
nohz
Disable the tick when a single task runs.
domain
Isolate from the general SMP balancing and scheduling
algorithms. Note that performing domain isolation this way
is irreversible: it's not possible to bring back a CPU to
the domains once isolated through isolcpus. It's strongly
advised to use cpusets instead to disable scheduler load
balancing through the "cpuset.sched_load_balance" file.
It offers a much more flexible interface where CPUs can
move in and out of an isolated set anytime.
You can move a process onto or off an "isolated" CPU via
the CPU affinity syscalls or cpuset.
<cpu number> begins at 0 and the maximum value is
"number of CPUs in system - 1".
The format of <cpu-list> is described above.
This option can be used to specify one or more CPUs
to isolate from the general SMP balancing and scheduling
algorithms. You can move a process onto or off an
"isolated" CPU via the CPU affinity syscalls or cpuset.
<cpu number> begins at 0 and the maximum value is
"number of CPUs in system - 1".
This option is the preferred way to isolate CPUs. The
alternative -- manually setting the CPU mask of all
tasks in the system -- can cause problems and
suboptimal load balancer performance.
iucv= [HW,NET]
......@@ -4209,6 +4222,9 @@
Used to run time disable IRQ_TIME_ACCOUNTING on any
platforms where RDTSC is slow and this accounting
can add overhead.
[x86] unstable: mark the TSC clocksource as unstable; this
marks the TSC unconditionally unstable at bootup and
avoids any further wobbles once the TSC watchdog notices.
turbografx.map[2|3]= [HW,JOY]
TurboGraFX parallel port interface
......
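As a concrete illustration of the new isolcpus= syntax documented above (the CPU range 2-7 is purely an example value):

    isolcpus=2-7
            Legacy form, equivalent to the "domain" flag: CPUs 2-7 are
            removed from the scheduler domains.

    isolcpus=nohz,domain,2-7
            Same domain isolation, and in addition the tick is stopped
            on CPUs 2-7 whenever a single task runs there.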
......@@ -18,6 +18,7 @@
#include <linux/cpufeature.h>
#include <linux/tick.h>
#include <linux/pm_qos.h>
#include <linux/sched/isolation.h>
#include "base.h"
......@@ -271,8 +272,16 @@ static ssize_t print_cpus_isolated(struct device *dev,
struct device_attribute *attr, char *buf)
{
int n = 0, len = PAGE_SIZE-2;
cpumask_var_t isolated;
n = scnprintf(buf, len, "%*pbl\n", cpumask_pr_args(cpu_isolated_map));
if (!alloc_cpumask_var(&isolated, GFP_KERNEL))
return -ENOMEM;
cpumask_andnot(isolated, cpu_possible_mask,
housekeeping_cpumask(HK_FLAG_DOMAIN));
n = scnprintf(buf, len, "%*pbl\n", cpumask_pr_args(isolated));
free_cpumask_var(isolated);
return n;
}
......
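A note on the hunk above: print_cpus_isolated() is the show handler behind the read-only "isolated" attribute under /sys/devices/system/cpu (assuming the usual sysfs layout), so that file now reports cpu_possible_mask with the HK_FLAG_DOMAIN housekeeping CPUs masked out instead of dumping the removed cpu_isolated_map. For a plain isolcpus=<cpu-list> boot setting the user-visible output should be unchanged.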
......@@ -40,7 +40,7 @@
#include <linux/tcp.h>
#include <linux/net_tstamp.h>
#include <linux/ptp_clock_kernel.h>
#include <linux/tick.h>
#include <linux/sched/isolation.h>
#include <asm/checksum.h>
#include <asm/homecache.h>
......@@ -2270,8 +2270,8 @@ static int __init tile_net_init_module(void)
tile_net_dev_init(name, mac);
if (!network_cpus_init())
cpumask_and(&network_cpus_map, housekeeping_cpumask(),
cpu_online_mask);
cpumask_and(&network_cpus_map,
housekeeping_cpumask(HK_FLAG_MISC), cpu_online_mask);
return 0;
}
......
......@@ -138,7 +138,7 @@ static const char * const task_state_array[] = {
static inline const char *get_task_state(struct task_struct *tsk)
{
BUILD_BUG_ON(1 + ilog2(TASK_REPORT_MAX) != ARRAY_SIZE(task_state_array));
return task_state_array[__get_task_state(tsk)];
return task_state_array[task_state_index(tsk)];
}
static inline int get_task_umask(struct task_struct *tsk)
......
......@@ -131,6 +131,11 @@ static inline unsigned int cpumask_first(const struct cpumask *srcp)
return 0;
}
static inline unsigned int cpumask_last(const struct cpumask *srcp)
{
return 0;
}
/* Valid inputs for n are -1 and 0. */
static inline unsigned int cpumask_next(int n, const struct cpumask *srcp)
{
......@@ -179,6 +184,17 @@ static inline unsigned int cpumask_first(const struct cpumask *srcp)
return find_first_bit(cpumask_bits(srcp), nr_cpumask_bits);
}
/**
* cpumask_last - get the last CPU in a cpumask
* @srcp: - the cpumask pointer
*
* Returns >= nr_cpumask_bits if no CPUs set.
*/
static inline unsigned int cpumask_last(const struct cpumask *srcp)
{
return find_last_bit(cpumask_bits(srcp), nr_cpumask_bits);
}
unsigned int cpumask_next(int n, const struct cpumask *srcp);
/**
......
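The new cpumask_last() helper above is what the housekeeping boot-parameter code later in this series uses to reject out-of-range CPU lists. A minimal sketch of that pattern, with an invented function name:

    #include <linux/cpumask.h>
    #include <linux/errno.h>

    /* Reject an empty mask or one naming CPUs beyond nr_cpu_ids. */
    static int validate_parsed_cpulist(const struct cpumask *mask)
    {
            /* cpumask_last() returns >= nr_cpumask_bits when no CPU is set */
            if (cpumask_empty(mask) || cpumask_last(mask) >= nr_cpu_ids)
                    return -EINVAL;
            return 0;
    }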
......@@ -3,6 +3,7 @@
#define IOPRIO_H
#include <linux/sched.h>
#include <linux/sched/rt.h>
#include <linux/iocontext.h>
/*
......@@ -63,7 +64,7 @@ static inline int task_nice_ioclass(struct task_struct *task)
{
if (task->policy == SCHED_IDLE)
return IOPRIO_CLASS_IDLE;
else if (task->policy == SCHED_FIFO || task->policy == SCHED_RR)
else if (task_is_realtime(task))
return IOPRIO_CLASS_RT;
else
return IOPRIO_CLASS_BE;
......
......@@ -166,8 +166,6 @@ struct task_group;
/* Task command name length: */
#define TASK_COMM_LEN 16
extern cpumask_var_t cpu_isolated_map;
extern void scheduler_tick(void);
#define MAX_SCHEDULE_TIMEOUT LONG_MAX
......@@ -332,9 +330,11 @@ struct load_weight {
struct sched_avg {
u64 last_update_time;
u64 load_sum;
u64 runnable_load_sum;
u32 util_sum;
u32 period_contrib;
unsigned long load_avg;
unsigned long runnable_load_avg;
unsigned long util_avg;
};
......@@ -377,6 +377,7 @@ struct sched_statistics {
struct sched_entity {
/* For load-balancing: */
struct load_weight load;
unsigned long runnable_weight;
struct rb_node run_node;
struct list_head group_node;
unsigned int on_rq;
......@@ -472,10 +473,10 @@ struct sched_dl_entity {
* conditions between the inactive timer handler and the wakeup
* code.
*/
int dl_throttled;
int dl_boosted;
int dl_yielded;
int dl_non_contending;
int dl_throttled : 1;
int dl_boosted : 1;
int dl_yielded : 1;
int dl_non_contending : 1;
/*
* Bandwidth enforcement timer. Each -deadline task has its
......@@ -1246,7 +1247,7 @@ static inline pid_t task_pgrp_nr(struct task_struct *tsk)
#define TASK_REPORT_IDLE (TASK_REPORT + 1)
#define TASK_REPORT_MAX (TASK_REPORT_IDLE << 1)
static inline unsigned int __get_task_state(struct task_struct *tsk)
static inline unsigned int task_state_index(struct task_struct *tsk)
{
unsigned int tsk_state = READ_ONCE(tsk->state);
unsigned int state = (tsk_state | tsk->exit_state) & TASK_REPORT;
......@@ -1259,7 +1260,7 @@ static inline unsigned int __get_task_state(struct task_struct *tsk)
return fls(state);
}
static inline char __task_state_to_char(unsigned int state)
static inline char task_index_to_char(unsigned int state)
{
static const char state_char[] = "RSDTtXZPI";
......@@ -1270,7 +1271,7 @@ static inline char __task_state_to_char(unsigned int state)
static inline char task_state_to_char(struct task_struct *tsk)
{
return __task_state_to_char(__get_task_state(tsk));
return task_index_to_char(task_state_index(tsk));
}
/**
......
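To make the rename above concrete: task_state_index() maps a task to a small index (0 for a running task) and task_index_to_char() maps that index into the "RSDTtXZPI" table, which is how the /proc and tracing hunks later in this diff consume them. A hedged sketch, with helper names invented for the example:

    #include <linux/sched.h>

    /* Record the compact state index at sample time... */
    static unsigned int sample_task_state(struct task_struct *p)
    {
            return task_state_index(p);     /* 0 == running */
    }

    /* ...and turn it into the familiar one-letter code later. */
    static char state_letter(unsigned int idx)
    {
            return task_index_to_char(idx); /* 'R', 'S', 'D', 'T', ... */
    }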
#ifndef _LINUX_SCHED_ISOLATION_H
#define _LINUX_SCHED_ISOLATION_H
#include <linux/cpumask.h>
#include <linux/init.h>
#include <linux/tick.h>
enum hk_flags {
HK_FLAG_TIMER = 1,
HK_FLAG_RCU = (1 << 1),
HK_FLAG_MISC = (1 << 2),
HK_FLAG_SCHED = (1 << 3),
HK_FLAG_TICK = (1 << 4),
HK_FLAG_DOMAIN = (1 << 5),
};
#ifdef CONFIG_CPU_ISOLATION
DECLARE_STATIC_KEY_FALSE(housekeeping_overriden);
extern int housekeeping_any_cpu(enum hk_flags flags);
extern const struct cpumask *housekeeping_cpumask(enum hk_flags flags);
extern void housekeeping_affine(struct task_struct *t, enum hk_flags flags);
extern bool housekeeping_test_cpu(int cpu, enum hk_flags flags);
extern void __init housekeeping_init(void);
#else
static inline int housekeeping_any_cpu(enum hk_flags flags)
{
return smp_processor_id();
}
static inline const struct cpumask *housekeeping_cpumask(enum hk_flags flags)
{
return cpu_possible_mask;
}
static inline void housekeeping_affine(struct task_struct *t,
enum hk_flags flags) { }
static inline void housekeeping_init(void) { }
#endif /* CONFIG_CPU_ISOLATION */
static inline bool housekeeping_cpu(int cpu, enum hk_flags flags)
{
#ifdef CONFIG_CPU_ISOLATION
if (static_branch_unlikely(&housekeeping_overriden))
return housekeeping_test_cpu(cpu, flags);
#endif
return true;
}
#endif /* _LINUX_SCHED_ISOLATION_H */
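A rough sketch of how kernel code is expected to consume this header, modelled on the call sites converted later in this diff; the function below is invented for illustration, and each flag names the housekeeping duty the caller cares about:

    #include <linux/printk.h>
    #include <linux/sched.h>
    #include <linux/sched/isolation.h>

    static void place_background_work(struct task_struct *kthread)
    {
            int cpu;

            /* Pin a kthread to the housekeeping CPUs, as the RCU kthreads now do. */
            housekeeping_affine(kthread, HK_FLAG_RCU);

            /* Pick an online housekeeping CPU, e.g. as a nohz timer target. */
            cpu = housekeeping_any_cpu(HK_FLAG_TIMER);

            /* Per-CPU test, mirroring get_nohz_timer_target() further down. */
            if (housekeeping_cpu(cpu, HK_FLAG_TIMER))
                    pr_debug("CPU %d can take housekeeping timer work\n", cpu);
    }

When CONFIG_CPU_ISOLATION is disabled, or no boot parameter enables the housekeeping_overriden static key, all of these collapse to "every CPU is a housekeeping CPU", as the inline fallbacks above show.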
......@@ -18,6 +18,17 @@ static inline int rt_task(struct task_struct *p)
return rt_prio(p->prio);
}
static inline bool task_is_realtime(struct task_struct *tsk)
{
int policy = tsk->policy;
if (policy == SCHED_FIFO || policy == SCHED_RR)
return true;
if (policy == SCHED_DEADLINE)
return true;
return false;
}
#ifdef CONFIG_RT_MUTEXES
/*
* Must hold either p->pi_lock or task_rq(p)->lock.
......
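The new task_is_realtime() helper centralizes the policy test that block/ioprio used to open-code, and extends it to SCHED_DEADLINE. Illustrative use, with an invented caller name:

    #include <linux/sched.h>
    #include <linux/sched/rt.h>

    /* True for SCHED_FIFO, SCHED_RR and SCHED_DEADLINE tasks. */
    static bool wants_low_latency(struct task_struct *p)
    {
            return task_is_realtime(p);
    }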
......@@ -38,9 +38,9 @@ extern unsigned int sysctl_numa_balancing_scan_period_max;
extern unsigned int sysctl_numa_balancing_scan_size;
#ifdef CONFIG_SCHED_DEBUG
extern unsigned int sysctl_sched_migration_cost;
extern unsigned int sysctl_sched_nr_migrate;
extern unsigned int sysctl_sched_time_avg;
extern __read_mostly unsigned int sysctl_sched_migration_cost;
extern __read_mostly unsigned int sysctl_sched_nr_migrate;
extern __read_mostly unsigned int sysctl_sched_time_avg;
int sched_proc_update_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length,
......
......@@ -138,7 +138,6 @@ static inline u64 get_cpu_iowait_time_us(int cpu, u64 *unused) { return -1; }
#ifdef CONFIG_NO_HZ_FULL
extern bool tick_nohz_full_running;
extern cpumask_var_t tick_nohz_full_mask;
extern cpumask_var_t housekeeping_mask;
static inline bool tick_nohz_full_enabled(void)
{
......@@ -162,11 +161,6 @@ static inline void tick_nohz_full_add_cpus_to(struct cpumask *mask)
cpumask_or(mask, mask, tick_nohz_full_mask);
}
static inline int housekeeping_any_cpu(void)
{
return cpumask_any_and(housekeeping_mask, cpu_online_mask);
}
extern void tick_nohz_dep_set(enum tick_dep_bits bit);
extern void tick_nohz_dep_clear(enum tick_dep_bits bit);
extern void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit);
......@@ -235,11 +229,8 @@ static inline void tick_dep_clear_signal(struct signal_struct *signal,
extern void tick_nohz_full_kick_cpu(int cpu);
extern void __tick_nohz_task_switch(void);
extern void __init tick_nohz_full_setup(cpumask_var_t cpumask);
#else
static inline int housekeeping_any_cpu(void)
{
return smp_processor_id();
}
static inline bool tick_nohz_full_enabled(void) { return false; }
static inline bool tick_nohz_full_cpu(int cpu) { return false; }
static inline void tick_nohz_full_add_cpus_to(struct cpumask *mask) { }
......@@ -259,35 +250,9 @@ static inline void tick_dep_clear_signal(struct signal_struct *signal,
static inline void tick_nohz_full_kick_cpu(int cpu) { }
static inline void __tick_nohz_task_switch(void) { }
static inline void tick_nohz_full_setup(cpumask_var_t cpumask) { }
#endif
static inline const struct cpumask *housekeeping_cpumask(void)
{
#ifdef CONFIG_NO_HZ_FULL
if (tick_nohz_full_enabled())
return housekeeping_mask;
#endif
return cpu_possible_mask;
}
static inline bool is_housekeeping_cpu(int cpu)
{
#ifdef CONFIG_NO_HZ_FULL
if (tick_nohz_full_enabled())
return cpumask_test_cpu(cpu, housekeeping_mask);
#endif
return true;
}
static inline void housekeeping_affine(struct task_struct *t)
{
#ifdef CONFIG_NO_HZ_FULL
if (tick_nohz_full_enabled())
set_cpus_allowed_ptr(t, housekeeping_mask);
#endif
}
static inline void tick_nohz_task_switch(void)
{
if (tick_nohz_full_enabled())
......
......@@ -118,7 +118,7 @@ static inline long __trace_sched_switch_state(bool preempt, struct task_struct *
if (preempt)
return TASK_STATE_MAX;
return __get_task_state(p);
return task_state_index(p);
}
#endif /* CREATE_TRACE_POINTS */
......
......@@ -472,6 +472,13 @@ config TASK_IO_ACCOUNTING
endmenu # "CPU/Task time and stats accounting"
config CPU_ISOLATION
bool "CPU isolation"
help
Make sure that CPUs running critical tasks are not disturbed by
any source of "noise" such as unbound workqueues, timers, kthreads...
Unbound jobs get offloaded to housekeeping CPUs.
source "kernel/rcu/Kconfig"
config BUILD_BIN2C
......
......@@ -46,6 +46,7 @@
#include <linux/cgroup.h>
#include <linux/efi.h>
#include <linux/tick.h>
#include <linux/sched/isolation.h>
#include <linux/interrupt.h>
#include <linux/taskstats_kern.h>
#include <linux/delayacct.h>
......@@ -606,6 +607,7 @@ asmlinkage __visible void __init start_kernel(void)
early_irq_init();
init_IRQ();
tick_init();
housekeeping_init();
rcu_init_nohz();
init_timers();
hrtimers_init();
......
......@@ -57,7 +57,7 @@
#include <linux/backing-dev.h>
#include <linux/sort.h>
#include <linux/oom.h>
#include <linux/sched/isolation.h>
#include <linux/uaccess.h>
#include <linux/atomic.h>
#include <linux/mutex.h>
......@@ -656,7 +656,6 @@ static int generate_sched_domains(cpumask_var_t **domains,
int csn; /* how many cpuset ptrs in csa so far */
int i, j, k; /* indices for partition finding loops */
cpumask_var_t *doms; /* resulting partition; i.e. sched domains */
cpumask_var_t non_isolated_cpus; /* load balanced CPUs */
struct sched_domain_attr *dattr; /* attributes for custom domains */
int ndoms = 0; /* number of sched domains in result */
int nslot; /* next empty doms[] struct cpumask slot */
......@@ -666,10 +665,6 @@ static int generate_sched_domains(cpumask_var_t **domains,
dattr = NULL;
csa = NULL;
if (!alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL))
goto done;
cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
/* Special case for the 99% of systems with one, full, sched domain */
if (is_sched_load_balance(&top_cpuset)) {
ndoms = 1;
......@@ -683,7 +678,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
update_domain_attr_tree(dattr, &top_cpuset);
}
cpumask_and(doms[0], top_cpuset.effective_cpus,
non_isolated_cpus);
housekeeping_cpumask(HK_FLAG_DOMAIN));
goto done;
}
......@@ -707,7 +702,8 @@ static int generate_sched_domains(cpumask_var_t **domains,
*/
if (!cpumask_empty(cp->cpus_allowed) &&
!(is_sched_load_balance(cp) &&
cpumask_intersects(cp->cpus_allowed, non_isolated_cpus)))
cpumask_intersects(cp->cpus_allowed,
housekeeping_cpumask(HK_FLAG_DOMAIN))))
continue;
if (is_sched_load_balance(cp))
......@@ -789,7 +785,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
if (apn == b->pn) {
cpumask_or(dp, dp, b->effective_cpus);
cpumask_and(dp, dp, non_isolated_cpus);
cpumask_and(dp, dp, housekeeping_cpumask(HK_FLAG_DOMAIN));
if (dattr)
update_domain_attr_tree(dattr + nslot, b);
......@@ -802,7 +798,6 @@ static int generate_sched_domains(cpumask_var_t **domains,
BUG_ON(nslot != ndoms);
done:
free_cpumask_var(non_isolated_cpus);
kfree(csa);
/*
......
......@@ -29,6 +29,7 @@
#include <linux/oom.h>
#include <linux/sched/debug.h>
#include <linux/smpboot.h>
#include <linux/sched/isolation.h>
#include <uapi/linux/sched/types.h>
#include "../time/tick-internal.h"
......@@ -2587,7 +2588,7 @@ static void rcu_bind_gp_kthread(void)
if (!tick_nohz_full_enabled())
return;
housekeeping_affine(current);
housekeeping_affine(current, HK_FLAG_RCU);
}
/* Record the current task on dyntick-idle entry. */
......
......@@ -51,6 +51,7 @@
#include <linux/kthread.h>
#include <linux/tick.h>
#include <linux/rcupdate_wait.h>
#include <linux/sched/isolation.h>
#define CREATE_TRACE_POINTS
......@@ -714,7 +715,7 @@ static int __noreturn rcu_tasks_kthread(void *arg)
LIST_HEAD(rcu_tasks_holdouts);
/* Run on housekeeping CPUs by default. Sysadm can move if desired. */
housekeeping_affine(current);
housekeeping_affine(current, HK_FLAG_RCU);
/*
* Each pass through the following loop makes one check for
......
......@@ -27,3 +27,4 @@ obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
obj-$(CONFIG_CPU_FREQ) += cpufreq.o
obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o
obj-$(CONFIG_MEMBARRIER) += membarrier.o
obj-$(CONFIG_CPU_ISOLATION) += isolation.o
......@@ -26,6 +26,7 @@
#include <linux/profile.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/sched/isolation.h>
#include <asm/switch_to.h>
#include <asm/tlb.h>
......@@ -42,18 +43,21 @@
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL)
/*
* Debugging: various feature bits
*
* If SCHED_DEBUG is disabled, each compilation unit has its own copy of
* sysctl_sched_features, defined in sched.h, to allow constants propagation
* at compile time and compiler optimization based on features default.
*/
#define SCHED_FEAT(name, enabled) \
(1UL << __SCHED_FEAT_##name) * enabled |
const_debug unsigned int sysctl_sched_features =
#include "features.h"
0;
#undef SCHED_FEAT
#endif
/*
* Number of tasks to iterate in a single balance run.
......@@ -83,9 +87,6 @@ __read_mostly int scheduler_running;
*/
int sysctl_sched_rt_runtime = 950000;
/* CPUs with isolated domains */
cpumask_var_t cpu_isolated_map;
/*
* __task_rq_lock - lock the rq @p resides on.
*/
......@@ -525,7 +526,7 @@ int get_nohz_timer_target(void)
int i, cpu = smp_processor_id();
struct sched_domain *sd;
if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu))
if (!idle_cpu(cpu) && housekeeping_cpu(cpu, HK_FLAG_TIMER))
return cpu;
rcu_read_lock();
......@@ -534,15 +535,15 @@ int get_nohz_timer_target(void)
if (cpu == i)
continue;
if (!idle_cpu(i) && is_housekeeping_cpu(i)) {
if (!idle_cpu(i) && housekeeping_cpu(i, HK_FLAG_TIMER)) {
cpu = i;
goto unlock;
}
}
}
if (!is_housekeeping_cpu(cpu))
cpu = housekeeping_any_cpu();
if (!housekeeping_cpu(cpu, HK_FLAG_TIMER))
cpu = housekeeping_any_cpu(HK_FLAG_TIMER);
unlock:
rcu_read_unlock();
return cpu;
......@@ -732,7 +733,7 @@ int tg_nop(struct task_group *tg, void *data)
}
#endif
static void set_load_weight(struct task_struct *p)
static void set_load_weight(struct task_struct *p, bool update_load)
{
int prio = p->static_prio - MAX_RT_PRIO;
struct load_weight *load = &p->se.load;
......@@ -746,8 +747,16 @@ static void set_load_weight(struct task_struct *p)
return;
}
load->weight = scale_load(sched_prio_to_weight[prio]);
load->inv_weight = sched_prio_to_wmult[prio];
/*
* SCHED_OTHER tasks have to update their load when changing their
* weight
*/
if (update_load && p->sched_class == &fair_sched_class) {
reweight_task(p, prio);
} else {
load->weight = scale_load(sched_prio_to_weight[prio]);
load->inv_weight = sched_prio_to_wmult[prio];
}
}
static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
......@@ -2357,7 +2366,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
p->static_prio = NICE_TO_PRIO(0);
p->prio = p->normal_prio = __normal_prio(p);
set_load_weight(p);
set_load_weight(p, false);
/*
* We don't need the reset flag anymore after the fork. It has
......@@ -3804,7 +3813,7 @@ void set_user_nice(struct task_struct *p, long nice)
put_prev_task(rq, p);
p->static_prio = NICE_TO_PRIO(nice);
set_load_weight(p);
set_load_weight(p, true);
old_prio = p->prio;
p->prio = effective_prio(p);
delta = p->prio - old_prio;
......@@ -3961,7 +3970,7 @@ static void __setscheduler_params(struct task_struct *p,
*/
p->rt_priority = attr->sched_priority;
p->normal_prio = normal_prio(p);
set_load_weight(p);
set_load_weight(p, true);
}
/* Actually do priority change: must hold pi & rq lock. */
......@@ -5727,10 +5736,6 @@ static inline void sched_init_smt(void) { }
void __init sched_init_smp(void)
{
cpumask_var_t non_isolated_cpus;
alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
sched_init_numa();
/*
......@@ -5740,16 +5745,12 @@ void __init sched_init_smp(void)
*/
mutex_lock(&sched_domains_mutex);
sched_init_domains(cpu_active_mask);
cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
if (cpumask_empty(non_isolated_cpus))
cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
mutex_unlock(&sched_domains_mutex);
/* Move init over to a non-isolated CPU */
if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0)
BUG();
sched_init_granularity();
free_cpumask_var(non_isolated_cpus);
init_sched_rt_class();
init_sched_dl_class();
......@@ -5934,7 +5935,7 @@ void __init sched_init(void)
atomic_set(&rq->nr_iowait, 0);
}
set_load_weight(&init_task);
set_load_weight(&init_task, false);
/*
* The boot idle thread does lazy MMU switching as well:
......@@ -5953,9 +5954,6 @@ void __init sched_init(void)
calc_load_update = jiffies + LOAD_FREQ;
#ifdef CONFIG_SMP
/* May be allocated at isolcpus cmdline parse time */
if (cpu_isolated_map == NULL)
zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
idle_thread_set_boot_cpu();
set_cpu_rq_start_time(smp_processor_id());
#endif
......
......@@ -243,7 +243,7 @@ static void task_non_contending(struct task_struct *p)
if (p->state == TASK_DEAD)
sub_rq_bw(p->dl.dl_bw, &rq->dl);
raw_spin_lock(&dl_b->lock);
__dl_clear(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
__dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
__dl_clear_params(p);
raw_spin_unlock(&dl_b->lock);
}
......@@ -1210,7 +1210,7 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer)
}
raw_spin_lock(&dl_b->lock);
__dl_clear(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
__dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
raw_spin_unlock(&dl_b->lock);
__dl_clear_params(p);
......@@ -1365,6 +1365,10 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se,
update_dl_entity(dl_se, pi_se);
} else if (flags & ENQUEUE_REPLENISH) {
replenish_dl_entity(dl_se, pi_se);
} else if ((flags & ENQUEUE_RESTORE) &&
dl_time_before(dl_se->deadline,
rq_clock(rq_of_dl_rq(dl_rq_of_se(dl_se))))) {
setup_new_dl_entity(dl_se);
}
__enqueue_dl_entity(dl_se);
......@@ -2167,7 +2171,7 @@ static void set_cpus_allowed_dl(struct task_struct *p,
* until we complete the update.
*/
raw_spin_lock(&src_dl_b->lock);
__dl_clear(src_dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
__dl_sub(src_dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
raw_spin_unlock(&src_dl_b->lock);
}
......@@ -2256,13 +2260,6 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
return;
}
/*
* If p is boosted we already updated its params in
* rt_mutex_setprio()->enqueue_task(..., ENQUEUE_REPLENISH),
* p's deadline being now already after rq_clock(rq).
*/
if (dl_time_before(p->dl.deadline, rq_clock(rq)))
setup_new_dl_entity(&p->dl);
if (rq->curr != p) {
#ifdef CONFIG_SMP
......@@ -2452,7 +2449,7 @@ int sched_dl_overflow(struct task_struct *p, int policy,
if (dl_policy(policy) && !task_has_dl_policy(p) &&
!__dl_overflow(dl_b, cpus, 0, new_bw)) {
if (hrtimer_active(&p->dl.inactive_timer))
__dl_clear(dl_b, p->dl.dl_bw, cpus);
__dl_sub(dl_b, p->dl.dl_bw, cpus);
__dl_add(dl_b, new_bw, cpus);
err = 0;
} else if (dl_policy(policy) && task_has_dl_policy(p) &&
......@@ -2464,7 +2461,7 @@ int sched_dl_overflow(struct task_struct *p, int policy,
* But this would require to set the task's "inactive
* timer" when the task is not inactive.
*/
__dl_clear(dl_b, p->dl.dl_bw, cpus);
__dl_sub(dl_b, p->dl.dl_bw, cpus);
__dl_add(dl_b, new_bw, cpus);
dl_change_utilization(p, new_bw);
err = 0;
......
......@@ -441,9 +441,11 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
P_SCHEDSTAT(se->statistics.wait_count);
}
P(se->load.weight);
P(se->runnable_weight);
#ifdef CONFIG_SMP
P(se->avg.load_avg);
P(se->avg.util_avg);
P(se->avg.runnable_load_avg);
#endif
#undef PN_SCHEDSTAT
......@@ -558,16 +560,19 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
#ifdef CONFIG_SMP
SEQ_printf(m, " .%-30s: %ld\n", "runnable_weight", cfs_rq->runnable_weight);
SEQ_printf(m, " .%-30s: %lu\n", "load_avg",
cfs_rq->avg.load_avg);
SEQ_printf(m, " .%-30s: %lu\n", "runnable_load_avg",
cfs_rq->runnable_load_avg);
cfs_rq->avg.runnable_load_avg);
SEQ_printf(m, " .%-30s: %lu\n", "util_avg",
cfs_rq->avg.util_avg);
SEQ_printf(m, " .%-30s: %ld\n", "removed_load_avg",
atomic_long_read(&cfs_rq->removed_load_avg));
SEQ_printf(m, " .%-30s: %ld\n", "removed_util_avg",
atomic_long_read(&cfs_rq->removed_util_avg));
SEQ_printf(m, " .%-30s: %ld\n", "removed.load_avg",
cfs_rq->removed.load_avg);
SEQ_printf(m, " .%-30s: %ld\n", "removed.util_avg",
cfs_rq->removed.util_avg);
SEQ_printf(m, " .%-30s: %ld\n", "removed.runnable_sum",
cfs_rq->removed.runnable_sum);
#ifdef CONFIG_FAIR_GROUP_SCHED
SEQ_printf(m, " .%-30s: %lu\n", "tg_load_avg_contrib",
cfs_rq->tg_load_avg_contrib);
......@@ -1004,10 +1009,13 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
"nr_involuntary_switches", (long long)p->nivcsw);
P(se.load.weight);
P(se.runnable_weight);
#ifdef CONFIG_SMP
P(se.avg.load_sum);
P(se.avg.runnable_load_sum);
P(se.avg.util_sum);
P(se.avg.load_avg);
P(se.avg.runnable_load_avg);
P(se.avg.util_avg);
P(se.avg.last_update_time);
#endif
......
......@@ -209,6 +209,7 @@ static void cpuidle_idle_call(void)
*/
static void do_idle(void)
{
int cpu = smp_processor_id();
/*
* If the arch has a polling bit, we maintain an invariant:
*
......@@ -219,14 +220,13 @@ static void do_idle(void)
*/
__current_set_polling();
quiet_vmstat();
tick_nohz_idle_enter();
while (!need_resched()) {
check_pgt_cache();
rmb();
if (cpu_is_offline(smp_processor_id())) {
if (cpu_is_offline(cpu)) {
cpuhp_report_idle_dead();
arch_cpu_idle_dead();
}
......
/*
* Housekeeping management. Manage the targets for routine code that can run on
* any CPU: unbound workqueues, timers, kthreads and any offloadable work.
*
* Copyright (C) 2017 Red Hat, Inc., Frederic Weisbecker
*
*/
#include <linux/sched/isolation.h>
#include <linux/tick.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/static_key.h>
#include <linux/ctype.h>
DEFINE_STATIC_KEY_FALSE(housekeeping_overriden);
EXPORT_SYMBOL_GPL(housekeeping_overriden);
static cpumask_var_t housekeeping_mask;
static unsigned int housekeeping_flags;
int housekeeping_any_cpu(enum hk_flags flags)
{
if (static_branch_unlikely(&housekeeping_overriden))
if (housekeeping_flags & flags)
return cpumask_any_and(housekeeping_mask, cpu_online_mask);
return smp_processor_id();
}
EXPORT_SYMBOL_GPL(housekeeping_any_cpu);
const struct cpumask *housekeeping_cpumask(enum hk_flags flags)
{
if (static_branch_unlikely(&housekeeping_overriden))
if (housekeeping_flags & flags)
return housekeeping_mask;
return cpu_possible_mask;
}
EXPORT_SYMBOL_GPL(housekeeping_cpumask);
void housekeeping_affine(struct task_struct *t, enum hk_flags flags)
{
if (static_branch_unlikely(&housekeeping_overriden))
if (housekeeping_flags & flags)
set_cpus_allowed_ptr(t, housekeeping_mask);
}
EXPORT_SYMBOL_GPL(housekeeping_affine);
bool housekeeping_test_cpu(int cpu, enum hk_flags flags)
{
if (static_branch_unlikely(&housekeeping_overriden))
if (housekeeping_flags & flags)
return cpumask_test_cpu(cpu, housekeeping_mask);
return true;
}
EXPORT_SYMBOL_GPL(housekeeping_test_cpu);
void __init housekeeping_init(void)
{
if (!housekeeping_flags)
return;
static_branch_enable(&housekeeping_overriden);
/* We need at least one CPU to handle housekeeping work */
WARN_ON_ONCE(cpumask_empty(housekeeping_mask));
}
static int __init housekeeping_setup(char *str, enum hk_flags flags)
{
cpumask_var_t non_housekeeping_mask;
int err;
alloc_bootmem_cpumask_var(&non_housekeeping_mask);
err = cpulist_parse(str, non_housekeeping_mask);
if (err < 0 || cpumask_last(non_housekeeping_mask) >= nr_cpu_ids) {
pr_warn("Housekeeping: nohz_full= or isolcpus= incorrect CPU range\n");
free_bootmem_cpumask_var(non_housekeeping_mask);
return 0;
}
if (!housekeeping_flags) {
alloc_bootmem_cpumask_var(&housekeeping_mask);
cpumask_andnot(housekeeping_mask,
cpu_possible_mask, non_housekeeping_mask);
if (cpumask_empty(housekeeping_mask))
cpumask_set_cpu(smp_processor_id(), housekeeping_mask);
} else {
cpumask_var_t tmp;
alloc_bootmem_cpumask_var(&tmp);
cpumask_andnot(tmp, cpu_possible_mask, non_housekeeping_mask);
if (!cpumask_equal(tmp, housekeeping_mask)) {
pr_warn("Housekeeping: nohz_full= must match isolcpus=\n");
free_bootmem_cpumask_var(tmp);
free_bootmem_cpumask_var(non_housekeeping_mask);
return 0;
}
free_bootmem_cpumask_var(tmp);
}
if ((flags & HK_FLAG_TICK) && !(housekeeping_flags & HK_FLAG_TICK)) {
if (IS_ENABLED(CONFIG_NO_HZ_FULL)) {
tick_nohz_full_setup(non_housekeeping_mask);
} else {
pr_warn("Housekeeping: nohz unsupported."
" Build with CONFIG_NO_HZ_FULL\n");
free_bootmem_cpumask_var(non_housekeeping_mask);
return 0;
}
}
housekeeping_flags |= flags;
free_bootmem_cpumask_var(non_housekeeping_mask);
return 1;
}
static int __init housekeeping_nohz_full_setup(char *str)
{
unsigned int flags;
flags = HK_FLAG_TICK | HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC;
return housekeeping_setup(str, flags);
}
__setup("nohz_full=", housekeeping_nohz_full_setup);
static int __init housekeeping_isolcpus_setup(char *str)
{
unsigned int flags = 0;
while (isalpha(*str)) {
if (!strncmp(str, "nohz,", 5)) {
str += 5;
flags |= HK_FLAG_TICK;
continue;
}
if (!strncmp(str, "domain,", 7)) {
str += 7;
flags |= HK_FLAG_DOMAIN;
continue;
}
pr_warn("isolcpus: Error, unknown flag\n");
return 0;
}
/* Default behaviour for isolcpus without flags */
if (!flags)
flags |= HK_FLAG_DOMAIN;
return housekeeping_setup(str, flags);
}
__setup("isolcpus=", housekeeping_isolcpus_setup);
......@@ -227,7 +227,7 @@ struct dl_bw {
static inline void __dl_update(struct dl_bw *dl_b, s64 bw);
static inline
void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw, int cpus)
void __dl_sub(struct dl_bw *dl_b, u64 tsk_bw, int cpus)
{
dl_b->total_bw -= tsk_bw;
__dl_update(dl_b, (s32)tsk_bw / cpus);
......@@ -256,7 +256,6 @@ extern int sched_dl_overflow(struct task_struct *p, int policy,
extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr);
extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr);
extern bool __checkparam_dl(const struct sched_attr *attr);
extern void __dl_clear_params(struct task_struct *p);
extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr);
extern int dl_task_can_attach(struct task_struct *p,
const struct cpumask *cs_cpus_allowed);
......@@ -419,6 +418,7 @@ struct cfs_bandwidth { };
/* CFS-related fields in a runqueue */
struct cfs_rq {
struct load_weight load;
unsigned long runnable_weight;
unsigned int nr_running, h_nr_running;
u64 exec_clock;
......@@ -444,18 +444,22 @@ struct cfs_rq {
* CFS load tracking
*/
struct sched_avg avg;
u64 runnable_load_sum;
unsigned long runnable_load_avg;
#ifdef CONFIG_FAIR_GROUP_SCHED
unsigned long tg_load_avg_contrib;
unsigned long propagate_avg;
#endif
atomic_long_t removed_load_avg, removed_util_avg;
#ifndef CONFIG_64BIT
u64 load_last_update_time_copy;
#endif
struct {
raw_spinlock_t lock ____cacheline_aligned;
int nr;
unsigned long load_avg;
unsigned long util_avg;
unsigned long runnable_sum;
} removed;
#ifdef CONFIG_FAIR_GROUP_SCHED
unsigned long tg_load_avg_contrib;
long propagate;
long prop_runnable_sum;
/*
* h_load = weight * f(tg)
*
......@@ -502,7 +506,7 @@ static inline int rt_bandwidth_enabled(void)
}
/* RT IPI pull logic requires IRQ_WORK */
#ifdef CONFIG_IRQ_WORK
#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_SMP)
# define HAVE_RT_PUSH_IPI
#endif
......@@ -524,12 +528,6 @@ struct rt_rq {
unsigned long rt_nr_total;
int overloaded;
struct plist_head pushable_tasks;
#ifdef HAVE_RT_PUSH_IPI
int push_flags;
int push_cpu;
struct irq_work push_work;
raw_spinlock_t push_lock;
#endif
#endif /* CONFIG_SMP */
int rt_queued;
......@@ -638,6 +636,19 @@ struct root_domain {
struct dl_bw dl_bw;
struct cpudl cpudl;
#ifdef HAVE_RT_PUSH_IPI
/*
* For IPI pull requests, loop across the rto_mask.
*/
struct irq_work rto_push_work;
raw_spinlock_t rto_lock;
/* These are only updated and read within rto_lock */
int rto_loop;
int rto_cpu;
/* These atomics are updated outside of a lock */
atomic_t rto_loop_next;
atomic_t rto_loop_start;
#endif
/*
* The "RT overload" flag: it gets set if a CPU has more than
* one runnable RT task.
......@@ -655,6 +666,9 @@ extern void init_defrootdomain(void);
extern int sched_init_domains(const struct cpumask *cpu_map);
extern void rq_attach_root(struct rq *rq, struct root_domain *rd);
#ifdef HAVE_RT_PUSH_IPI
extern void rto_push_irq_work_func(struct irq_work *work);
#endif
#endif /* CONFIG_SMP */
/*
......@@ -1219,8 +1233,6 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
# define const_debug const
#endif
extern const_debug unsigned int sysctl_sched_features;
#define SCHED_FEAT(name, enabled) \
__SCHED_FEAT_##name ,
......@@ -1232,6 +1244,13 @@ enum {
#undef SCHED_FEAT
#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL)
/*
* To support run-time toggling of sched features, all the translation units
* (but core.c) reference the sysctl_sched_features defined in core.c.
*/
extern const_debug unsigned int sysctl_sched_features;
#define SCHED_FEAT(name, enabled) \
static __always_inline bool static_branch_##name(struct static_key *key) \
{ \
......@@ -1239,13 +1258,27 @@ static __always_inline bool static_branch_##name(struct static_key *key) \
}
#include "features.h"
#undef SCHED_FEAT
extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
#define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x]))
#else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */
/*
* Each translation unit has its own copy of sysctl_sched_features to allow
* constants propagation at compile time and compiler optimization based on
* features default.
*/
#define SCHED_FEAT(name, enabled) \
(1UL << __SCHED_FEAT_##name) * enabled |
static const_debug __maybe_unused unsigned int sysctl_sched_features =
#include "features.h"
0;
#undef SCHED_FEAT
#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
extern struct static_key_false sched_numa_balancing;
......@@ -1530,6 +1563,8 @@ extern void init_sched_dl_class(void);
extern void init_sched_rt_class(void);
extern void init_sched_fair_class(void);
extern void reweight_task(struct task_struct *p, int prio);
extern void resched_curr(struct rq *rq);
extern void resched_cpu(int cpu);
......
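Worth spelling out for the sched_feat() rework in the hunks above: with SCHED_DEBUG and jump labels, sched_feat(x) remains a static-branch test against sched_feat_keys[]; without SCHED_DEBUG, every translation unit now gets its own const copy of sysctl_sched_features, so for a feature such as HRTICK the test reduces to (sysctl_sched_features & (1UL << __SCHED_FEAT_HRTICK)) on a compile-time constant and disabled-feature code can be optimized away. This is the "!CONFIG_SCHED_DEBUG optimizations" item from the merge message.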
......@@ -4,6 +4,7 @@
*/
#include <linux/sched.h>
#include <linux/mutex.h>
#include <linux/sched/isolation.h>
#include "sched.h"
......@@ -269,6 +270,12 @@ static int init_rootdomain(struct root_domain *rd)
if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
goto free_dlo_mask;
#ifdef HAVE_RT_PUSH_IPI
rd->rto_cpu = -1;
raw_spin_lock_init(&rd->rto_lock);
init_irq_work(&rd->rto_push_work, rto_push_irq_work_func);
#endif
init_dl_bw(&rd->dl_bw);
if (cpudl_init(&rd->cpudl) != 0)
goto free_rto_mask;
......@@ -464,21 +471,6 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
update_top_cache_domain(cpu);
}
/* Setup the mask of CPUs configured for isolated domains */
static int __init isolated_cpu_setup(char *str)
{
int ret;
alloc_bootmem_cpumask_var(&cpu_isolated_map);
ret = cpulist_parse(str, cpu_isolated_map);
if (ret) {
pr_err("sched: Error, all isolcpus= values must be between 0 and %u\n", nr_cpu_ids);
return 0;
}
return 1;
}
__setup("isolcpus=", isolated_cpu_setup);
struct s_data {
struct sched_domain ** __percpu sd;
struct root_domain *rd;
......@@ -1158,6 +1150,7 @@ sd_init(struct sched_domain_topology_level *tl,
sd->smt_gain = 1178; /* ~15% */
} else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
sd->flags |= SD_PREFER_SIBLING;
sd->imbalance_pct = 117;
sd->cache_nice_tries = 1;
sd->busy_idx = 2;
......@@ -1332,6 +1325,10 @@ void sched_init_numa(void)
if (!sched_domains_numa_distance)
return;
/* Includes NUMA identity node at level 0. */
sched_domains_numa_distance[level++] = curr_distance;
sched_domains_numa_levels = level;
/*
* O(nr_nodes^2) deduplicating selection sort -- in order to find the
* unique distances in the node_distance() table.
......@@ -1379,8 +1376,7 @@ void sched_init_numa(void)
return;
/*
* 'level' contains the number of unique distances, excluding the
* identity distance node_distance(i,i).
* 'level' contains the number of unique distances
*
* The sched_domains_numa_distance[] array includes the actual distance
* numbers.
......@@ -1441,10 +1437,19 @@ void sched_init_numa(void)
for (i = 0; sched_domain_topology[i].mask; i++)
tl[i] = sched_domain_topology[i];
/*
* Add the NUMA identity distance, aka single NODE.
*/
tl[i++] = (struct sched_domain_topology_level){
.mask = sd_numa_mask,
.numa_level = 0,
SD_INIT_NAME(NODE)
};
/*
* .. and append 'j' levels of NUMA goodness.
*/
for (j = 0; j < level; i++, j++) {
for (j = 1; j < level; i++, j++) {
tl[i] = (struct sched_domain_topology_level){
.mask = sd_numa_mask,
.sd_flags = cpu_numa_flags,
......@@ -1774,7 +1779,7 @@ int sched_init_domains(const struct cpumask *cpu_map)
doms_cur = alloc_sched_domains(ndoms_cur);
if (!doms_cur)
doms_cur = &fallback_doms;
cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
cpumask_and(doms_cur[0], cpu_map, housekeeping_cpumask(HK_FLAG_DOMAIN));
err = build_sched_domains(doms_cur[0], NULL);
register_sched_domain_sysctl();
......@@ -1857,7 +1862,8 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
doms_new = alloc_sched_domains(1);
if (doms_new) {
n = 1;
cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
cpumask_and(doms_new[0], cpu_active_mask,
housekeeping_cpumask(HK_FLAG_DOMAIN));
}
} else {
n = ndoms_new;
......@@ -1880,7 +1886,8 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
if (!doms_new) {
n = 0;
doms_new = &fallback_doms;
cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
cpumask_and(doms_new[0], cpu_active_mask,
housekeeping_cpumask(HK_FLAG_DOMAIN));
}
/* Build new domains: */
......
......@@ -27,6 +27,7 @@
#include <linux/irq_work.h>
#include <linux/posix-timers.h>
#include <linux/context_tracking.h>
#include <linux/mm.h>
#include <asm/irq_regs.h>
......@@ -165,7 +166,6 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
#ifdef CONFIG_NO_HZ_FULL
cpumask_var_t tick_nohz_full_mask;
cpumask_var_t housekeeping_mask;
bool tick_nohz_full_running;
static atomic_t tick_dep_mask;
......@@ -385,20 +385,13 @@ void __tick_nohz_task_switch(void)
local_irq_restore(flags);
}
/* Parse the boot-time nohz CPU list from the kernel parameters. */
static int __init tick_nohz_full_setup(char *str)
/* Get the boot-time nohz CPU list from the kernel parameters. */
void __init tick_nohz_full_setup(cpumask_var_t cpumask)
{
alloc_bootmem_cpumask_var(&tick_nohz_full_mask);
if (cpulist_parse(str, tick_nohz_full_mask) < 0) {
pr_warn("NO_HZ: Incorrect nohz_full cpumask\n");
free_bootmem_cpumask_var(tick_nohz_full_mask);
return 1;
}
cpumask_copy(tick_nohz_full_mask, cpumask);
tick_nohz_full_running = true;
return 1;
}
__setup("nohz_full=", tick_nohz_full_setup);
static int tick_nohz_cpu_down(unsigned int cpu)
{
......@@ -437,13 +430,6 @@ void __init tick_nohz_init(void)
return;
}
if (!alloc_cpumask_var(&housekeeping_mask, GFP_KERNEL)) {
WARN(1, "NO_HZ: Can't allocate not-full dynticks cpumask\n");
cpumask_clear(tick_nohz_full_mask);
tick_nohz_full_running = false;
return;
}
/*
* Full dynticks uses irq work to drive the tick rescheduling on safe
* locking contexts. But then we need irq work to raise its own
......@@ -452,7 +438,6 @@ void __init tick_nohz_init(void)
if (!arch_irq_work_has_interrupt()) {
pr_warn("NO_HZ: Can't run full dynticks because arch doesn't support irq work self-IPIs\n");
cpumask_clear(tick_nohz_full_mask);
cpumask_copy(housekeeping_mask, cpu_possible_mask);
tick_nohz_full_running = false;
return;
}
......@@ -465,9 +450,6 @@ void __init tick_nohz_init(void)
cpumask_clear_cpu(cpu, tick_nohz_full_mask);
}
cpumask_andnot(housekeeping_mask,
cpu_possible_mask, tick_nohz_full_mask);
for_each_cpu(cpu, tick_nohz_full_mask)
context_tracking_cpu_set(cpu);
......@@ -477,12 +459,6 @@ void __init tick_nohz_init(void)
WARN_ON(ret < 0);
pr_info("NO_HZ: Full dynticks CPUs: %*pbl.\n",
cpumask_pr_args(tick_nohz_full_mask));
/*
* We need at least one CPU to handle housekeeping work such
* as timekeeping, unbound timers, workqueues, ...
*/
WARN_ON_ONCE(cpumask_empty(housekeeping_mask));
}
#endif
......@@ -787,6 +763,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
if (!ts->tick_stopped) {
calc_load_nohz_start();
cpu_load_update_nohz_start();
quiet_vmstat();
ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
ts->tick_stopped = 1;
......
......@@ -921,8 +921,8 @@ static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter,
trace_assign_type(field, iter->ent);
T = __task_state_to_char(field->next_state);
S = __task_state_to_char(field->prev_state);
T = task_index_to_char(field->next_state);
S = task_index_to_char(field->prev_state);
trace_find_cmdline(field->next_pid, comm);
trace_seq_printf(&iter->seq,
" %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n",
......@@ -957,8 +957,8 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S)
trace_assign_type(field, iter->ent);
if (!S)
S = __task_state_to_char(field->prev_state);
T = __task_state_to_char(field->next_state);
S = task_index_to_char(field->prev_state);
T = task_index_to_char(field->next_state);
trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n",
field->prev_pid,
field->prev_prio,
......@@ -993,8 +993,8 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S)
trace_assign_type(field, iter->ent);
if (!S)
S = __task_state_to_char(field->prev_state);
T = __task_state_to_char(field->next_state);
S = task_index_to_char(field->prev_state);
T = task_index_to_char(field->next_state);
SEQ_PUT_HEX_FIELD(s, field->prev_pid);
SEQ_PUT_HEX_FIELD(s, field->prev_prio);
......
......@@ -398,10 +398,10 @@ tracing_sched_switch_trace(struct trace_array *tr,
entry = ring_buffer_event_data(event);
entry->prev_pid = prev->pid;
entry->prev_prio = prev->prio;
entry->prev_state = __get_task_state(prev);
entry->prev_state = task_state_index(prev);
entry->next_pid = next->pid;
entry->next_prio = next->prio;
entry->next_state = __get_task_state(next);
entry->next_state = task_state_index(next);
entry->next_cpu = task_cpu(next);
if (!call_filter_check_discard(call, entry, buffer, event))
......@@ -426,10 +426,10 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
entry = ring_buffer_event_data(event);
entry->prev_pid = curr->pid;
entry->prev_prio = curr->prio;
entry->prev_state = __get_task_state(curr);
entry->prev_state = task_state_index(curr);
entry->next_pid = wakee->pid;
entry->next_prio = wakee->prio;
entry->next_state = __get_task_state(wakee);
entry->next_state = task_state_index(wakee);
entry->next_cpu = task_cpu(wakee);
if (!call_filter_check_discard(call, entry, buffer, event))
......
......@@ -25,6 +25,7 @@
#include <linux/workqueue.h>
#include <linux/sched/clock.h>
#include <linux/sched/debug.h>
#include <linux/sched/isolation.h>
#include <asm/irq_regs.h>
#include <linux/kvm_para.h>
......@@ -774,15 +775,11 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write,
void __init lockup_detector_init(void)
{
#ifdef CONFIG_NO_HZ_FULL
if (tick_nohz_full_enabled()) {
if (tick_nohz_full_enabled())
pr_info("Disabling watchdog on nohz_full cores by default\n");
cpumask_copy(&watchdog_cpumask, housekeeping_mask);
} else
cpumask_copy(&watchdog_cpumask, cpu_possible_mask);
#else
cpumask_copy(&watchdog_cpumask, cpu_possible_mask);
#endif
cpumask_copy(&watchdog_cpumask,
housekeeping_cpumask(HK_FLAG_TIMER));
if (!watchdog_nmi_probe())
nmi_watchdog_available = true;
......