Commit d479c5a1 authored by Linus Torvalds

Merge tag 'sched-core-2020-06-02' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Ingo Molnar:
 "The changes in this cycle are:

   - Optimize the task wakeup CPU selection logic, to improve
     scalability and reduce wakeup latency spikes

   - PELT enhancements

   - CFS bandwidth handling fixes

   - Optimize the wakeup path by removing rq->wake_list and replacing it
     with ->ttwu_pending

   - Optimize IPI cross-calls by making flush_smp_call_function_queue()
     process sync callbacks first.

   - Misc fixes and enhancements"

* tag 'sched-core-2020-06-02' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (36 commits)
  irq_work: Define irq_work_single() on !CONFIG_IRQ_WORK too
  sched/headers: Split out open-coded prototypes into kernel/sched/smp.h
  sched: Replace rq::wake_list
  sched: Add rq::ttwu_pending
  irq_work, smp: Allow irq_work on call_single_queue
  smp: Optimize send_call_function_single_ipi()
  smp: Move irq_work_run() out of flush_smp_call_function_queue()
  smp: Optimize flush_smp_call_function_queue()
  sched: Fix smp_call_function_single_async() usage for ILB
  sched/core: Offload wakee task activation if it the wakee is descheduling
  sched/core: Optimize ttwu() spinning on p->on_cpu
  sched: Defend cfs and rt bandwidth quota against overflow
  sched/cpuacct: Fix charge cpuacct.usage_sys
  sched/fair: Replace zero-length array with flexible-array
  sched/pelt: Sync util/runnable_sum with PELT window when propagating
  sched/cpuacct: Use __this_cpu_add() instead of this_cpu_ptr()
  sched/fair: Optimize enqueue_task_fair()
  sched: Make scheduler_ipi inline
  sched: Clean up scheduler_ipi()
  sched/core: Simplify sched_init()
  ...
parents f6aee505 25de110d
...@@ -167,7 +167,6 @@ static void pnv_smp_cpu_kill_self(void) ...@@ -167,7 +167,6 @@ static void pnv_smp_cpu_kill_self(void)
/* Standard hot unplug procedure */ /* Standard hot unplug procedure */
idle_task_exit(); idle_task_exit();
current->active_mm = NULL; /* for sanity */
cpu = smp_processor_id(); cpu = smp_processor_id();
DBG("CPU%d offline\n", cpu); DBG("CPU%d offline\n", cpu);
generic_set_cpu_dead(cpu); generic_set_cpu_dead(cpu);
......
...@@ -13,6 +13,8 @@ ...@@ -13,6 +13,8 @@
* busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed * busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed
*/ */
/* flags share CSD_FLAG_ space */
#define IRQ_WORK_PENDING BIT(0) #define IRQ_WORK_PENDING BIT(0)
#define IRQ_WORK_BUSY BIT(1) #define IRQ_WORK_BUSY BIT(1)
...@@ -23,9 +25,12 @@ ...@@ -23,9 +25,12 @@
#define IRQ_WORK_CLAIMED (IRQ_WORK_PENDING | IRQ_WORK_BUSY) #define IRQ_WORK_CLAIMED (IRQ_WORK_PENDING | IRQ_WORK_BUSY)
/*
* structure shares layout with single_call_data_t.
*/
struct irq_work { struct irq_work {
atomic_t flags;
struct llist_node llnode; struct llist_node llnode;
atomic_t flags;
void (*func)(struct irq_work *); void (*func)(struct irq_work *);
}; };
...@@ -53,9 +58,11 @@ void irq_work_sync(struct irq_work *work); ...@@ -53,9 +58,11 @@ void irq_work_sync(struct irq_work *work);
void irq_work_run(void); void irq_work_run(void);
bool irq_work_needs_cpu(void); bool irq_work_needs_cpu(void);
void irq_work_single(void *arg);
#else #else
static inline bool irq_work_needs_cpu(void) { return false; } static inline bool irq_work_needs_cpu(void) { return false; }
static inline void irq_work_run(void) { } static inline void irq_work_run(void) { }
static inline void irq_work_single(void *arg) { }
#endif #endif
#endif /* _LINUX_IRQ_WORK_H */ #endif /* _LINUX_IRQ_WORK_H */
...@@ -654,6 +654,7 @@ struct task_struct { ...@@ -654,6 +654,7 @@ struct task_struct {
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
struct llist_node wake_entry; struct llist_node wake_entry;
unsigned int wake_entry_type;
int on_cpu; int on_cpu;
#ifdef CONFIG_THREAD_INFO_IN_TASK #ifdef CONFIG_THREAD_INFO_IN_TASK
/* Current CPU: */ /* Current CPU: */
...@@ -1730,7 +1731,15 @@ extern char *__get_task_comm(char *to, size_t len, struct task_struct *tsk); ...@@ -1730,7 +1731,15 @@ extern char *__get_task_comm(char *to, size_t len, struct task_struct *tsk);
}) })
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
void scheduler_ipi(void); static __always_inline void scheduler_ipi(void)
{
/*
* Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
* TIF_NEED_RESCHED remotely (for the first time) will also send
* this IPI.
*/
preempt_fold_need_resched();
}
extern unsigned long wait_task_inactive(struct task_struct *, long match_state); extern unsigned long wait_task_inactive(struct task_struct *, long match_state);
#else #else
static inline void scheduler_ipi(void) { } static inline void scheduler_ipi(void) { }
......
...@@ -49,6 +49,8 @@ static inline void mmdrop(struct mm_struct *mm) ...@@ -49,6 +49,8 @@ static inline void mmdrop(struct mm_struct *mm)
__mmdrop(mm); __mmdrop(mm);
} }
void mmdrop(struct mm_struct *mm);
/* /*
* This has to be called after a get_task_mm()/mmget_not_zero() * This has to be called after a get_task_mm()/mmget_not_zero()
* followed by taking the mmap_sem for writing before modifying the * followed by taking the mmap_sem for writing before modifying the
......
...@@ -11,21 +11,20 @@ ...@@ -11,21 +11,20 @@
*/ */
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
#define SD_LOAD_BALANCE 0x0001 /* Do load balancing on this domain. */ #define SD_BALANCE_NEWIDLE 0x0001 /* Balance when about to become idle */
#define SD_BALANCE_NEWIDLE 0x0002 /* Balance when about to become idle */ #define SD_BALANCE_EXEC 0x0002 /* Balance on exec */
#define SD_BALANCE_EXEC 0x0004 /* Balance on exec */ #define SD_BALANCE_FORK 0x0004 /* Balance on fork, clone */
#define SD_BALANCE_FORK 0x0008 /* Balance on fork, clone */ #define SD_BALANCE_WAKE 0x0008 /* Balance on wakeup */
#define SD_BALANCE_WAKE 0x0010 /* Balance on wakeup */ #define SD_WAKE_AFFINE 0x0010 /* Wake task to waking CPU */
#define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */ #define SD_ASYM_CPUCAPACITY 0x0020 /* Domain members have different CPU capacities */
#define SD_ASYM_CPUCAPACITY 0x0040 /* Domain members have different CPU capacities */ #define SD_SHARE_CPUCAPACITY 0x0040 /* Domain members share CPU capacity */
#define SD_SHARE_CPUCAPACITY 0x0080 /* Domain members share CPU capacity */ #define SD_SHARE_POWERDOMAIN 0x0080 /* Domain members share power domain */
#define SD_SHARE_POWERDOMAIN 0x0100 /* Domain members share power domain */ #define SD_SHARE_PKG_RESOURCES 0x0100 /* Domain members share CPU pkg resources */
#define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share CPU pkg resources */ #define SD_SERIALIZE 0x0200 /* Only a single load balancing instance */
#define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */ #define SD_ASYM_PACKING 0x0400 /* Place busy groups earlier in the domain */
#define SD_ASYM_PACKING 0x0800 /* Place busy groups earlier in the domain */ #define SD_PREFER_SIBLING 0x0800 /* Prefer to place tasks in a sibling domain */
#define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */ #define SD_OVERLAP 0x1000 /* sched_domains of this level overlap */
#define SD_OVERLAP 0x2000 /* sched_domains of this level overlap */ #define SD_NUMA 0x2000 /* cross-node balancing */
#define SD_NUMA 0x4000 /* cross-node balancing */
#ifdef CONFIG_SCHED_SMT #ifdef CONFIG_SCHED_SMT
static inline int cpu_smt_flags(void) static inline int cpu_smt_flags(void)
......
...@@ -16,17 +16,39 @@ ...@@ -16,17 +16,39 @@
typedef void (*smp_call_func_t)(void *info); typedef void (*smp_call_func_t)(void *info);
typedef bool (*smp_cond_func_t)(int cpu, void *info); typedef bool (*smp_cond_func_t)(int cpu, void *info);
/*
 * Values stored in the flags word of a call-single-data entry.
 *
 * Bit 0 (CSD_FLAG_LOCK) is a flag bit; the CSD_TYPE_* values occupy the
 * bits selected by CSD_FLAG_TYPE_MASK (0xF0) and identify the kind of
 * entry. The irq_work flags (IRQ_WORK_PENDING = BIT(0),
 * IRQ_WORK_BUSY = BIT(1)) share this same flag space, and
 * irq_work_claim() ORs CSD_TYPE_IRQ_WORK into an irq_work's flags.
 */
enum {
CSD_FLAG_LOCK = 0x01,
/* IRQ_WORK_flags */
/* Entry types; extract with (flags & CSD_FLAG_TYPE_MASK). */
CSD_TYPE_ASYNC = 0x00,
CSD_TYPE_SYNC = 0x10,
CSD_TYPE_IRQ_WORK = 0x20,
CSD_TYPE_TTWU = 0x30,
/* Mask covering the type field above. */
CSD_FLAG_TYPE_MASK = 0xF0,
};
/*
* structure shares (partial) layout with struct irq_work
*/
struct __call_single_data { struct __call_single_data {
struct llist_node llist; struct llist_node llist;
unsigned int flags;
smp_call_func_t func; smp_call_func_t func;
void *info; void *info;
unsigned int flags;
}; };
/* Use __aligned() to avoid to use 2 cache lines for 1 csd */ /* Use __aligned() to avoid to use 2 cache lines for 1 csd */
typedef struct __call_single_data call_single_data_t typedef struct __call_single_data call_single_data_t
__aligned(sizeof(struct __call_single_data)); __aligned(sizeof(struct __call_single_data));
/*
* Enqueue a llist_node on the call_single_queue; be very careful, read
* flush_smp_call_function_queue() in detail.
*/
extern void __smp_call_single_queue(int cpu, struct llist_node *node);
/* total number of cpus in this system (may exceed NR_CPUS) */ /* total number of cpus in this system (may exceed NR_CPUS) */
extern unsigned int total_cpus; extern unsigned int total_cpus;
......
...@@ -9,23 +9,10 @@ ...@@ -9,23 +9,10 @@
#include <asm/current.h> #include <asm/current.h>
/* /*
* BROKEN wait-queues. * Simple waitqueues are semantically very different to regular wait queues
* * (wait.h). The most important difference is that the simple waitqueue allows
* These "simple" wait-queues are broken garbage, and should never be * for deterministic behaviour -- IOW it has strictly bounded IRQ and lock hold
* used. The comments below claim that they are "similar" to regular * times.
* wait-queues, but the semantics are actually completely different, and
* every single user we have ever had has been buggy (or pointless).
*
* A "swake_up_one()" only wakes up _one_ waiter, which is not at all what
* "wake_up()" does, and has led to problems. In other cases, it has
* been fine, because there's only ever one waiter (kvm), but in that
* case gthe whole "simple" wait-queue is just pointless to begin with,
* since there is no "queue". Use "wake_up_process()" with a direct
* pointer instead.
*
* While these are very similar to regular wait queues (wait.h) the most
* important difference is that the simple waitqueue allows for deterministic
* behaviour -- IOW it has strictly bounded IRQ and lock hold times.
* *
* Mainly, this is accomplished by two things. Firstly not allowing swake_up_all * Mainly, this is accomplished by two things. Firstly not allowing swake_up_all
* from IRQ disabled, and dropping the lock upon every wakeup, giving a higher * from IRQ disabled, and dropping the lock upon every wakeup, giving a higher
...@@ -39,7 +26,7 @@ ...@@ -39,7 +26,7 @@
* sleeper state. * sleeper state.
* *
* - the !exclusive mode; because that leads to O(n) wakeups, everything is * - the !exclusive mode; because that leads to O(n) wakeups, everything is
* exclusive. * exclusive. As such swake_up_one will only ever awake _one_ waiter.
* *
* - custom wake callback functions; because you cannot give any guarantees * - custom wake callback functions; because you cannot give any guarantees
* about random code. This also allows swait to be used in RT, such that * about random code. This also allows swait to be used in RT, such that
......
...@@ -3,6 +3,7 @@ ...@@ -3,6 +3,7 @@
* *
* This code is licenced under the GPL. * This code is licenced under the GPL.
*/ */
#include <linux/sched/mm.h>
#include <linux/proc_fs.h> #include <linux/proc_fs.h>
#include <linux/smp.h> #include <linux/smp.h>
#include <linux/init.h> #include <linux/init.h>
...@@ -564,6 +565,21 @@ static int bringup_cpu(unsigned int cpu) ...@@ -564,6 +565,21 @@ static int bringup_cpu(unsigned int cpu)
return bringup_wait_for_ap(cpu); return bringup_wait_for_ap(cpu);
} }
/*
 * Teardown callback of the CPUHP_BRINGUP_CPU hotplug step.
 *
 * Points the idle task of @cpu back at init_mm (if it is not already
 * there) and drops the mm that was recorded in its active_mm.
 *
 * Always returns 0.
 */
static int finish_cpu(unsigned int cpu)
{
	struct task_struct *idle_task = idle_thread_get(cpu);
	struct mm_struct *prev_mm = idle_task->active_mm;

	/*
	 * idle_task_exit() has already switched over to &init_mm; what is
	 * left to do here is tidy up the stale active_mm state.
	 */
	if (prev_mm != &init_mm)
		idle_task->active_mm = &init_mm;

	mmdrop(prev_mm);

	return 0;
}
/* /*
* Hotplug state machine related functions * Hotplug state machine related functions
*/ */
...@@ -1549,7 +1565,7 @@ static struct cpuhp_step cpuhp_hp_states[] = { ...@@ -1549,7 +1565,7 @@ static struct cpuhp_step cpuhp_hp_states[] = {
[CPUHP_BRINGUP_CPU] = { [CPUHP_BRINGUP_CPU] = {
.name = "cpu:bringup", .name = "cpu:bringup",
.startup.single = bringup_cpu, .startup.single = bringup_cpu,
.teardown.single = NULL, .teardown.single = finish_cpu,
.cant_stop = true, .cant_stop = true,
}, },
/* Final state before CPU kills itself */ /* Final state before CPU kills itself */
......
...@@ -708,8 +708,12 @@ void __noreturn do_exit(long code) ...@@ -708,8 +708,12 @@ void __noreturn do_exit(long code)
struct task_struct *tsk = current; struct task_struct *tsk = current;
int group_dead; int group_dead;
profile_task_exit(tsk); /*
kcov_task_exit(tsk); * We can get here from a kernel oops, sometimes with preemption off.
* Start by checking for critical errors.
* Then fix up important state like USER_DS and preemption.
* Then do everything else.
*/
WARN_ON(blk_needs_flush_plug(tsk)); WARN_ON(blk_needs_flush_plug(tsk));
...@@ -727,6 +731,16 @@ void __noreturn do_exit(long code) ...@@ -727,6 +731,16 @@ void __noreturn do_exit(long code)
*/ */
set_fs(USER_DS); set_fs(USER_DS);
if (unlikely(in_atomic())) {
pr_info("note: %s[%d] exited with preempt_count %d\n",
current->comm, task_pid_nr(current),
preempt_count());
preempt_count_set(PREEMPT_ENABLED);
}
profile_task_exit(tsk);
kcov_task_exit(tsk);
ptrace_event(PTRACE_EVENT_EXIT, code); ptrace_event(PTRACE_EVENT_EXIT, code);
validate_creds_for_do_exit(tsk); validate_creds_for_do_exit(tsk);
...@@ -744,13 +758,6 @@ void __noreturn do_exit(long code) ...@@ -744,13 +758,6 @@ void __noreturn do_exit(long code)
exit_signals(tsk); /* sets PF_EXITING */ exit_signals(tsk); /* sets PF_EXITING */
if (unlikely(in_atomic())) {
pr_info("note: %s[%d] exited with preempt_count %d\n",
current->comm, task_pid_nr(current),
preempt_count());
preempt_count_set(PREEMPT_ENABLED);
}
/* sync mm's RSS info before statistics gathering */ /* sync mm's RSS info before statistics gathering */
if (tsk->mm) if (tsk->mm)
sync_mm_rss(tsk->mm); sync_mm_rss(tsk->mm);
......
...@@ -31,7 +31,7 @@ static bool irq_work_claim(struct irq_work *work) ...@@ -31,7 +31,7 @@ static bool irq_work_claim(struct irq_work *work)
{ {
int oflags; int oflags;
oflags = atomic_fetch_or(IRQ_WORK_CLAIMED, &work->flags); oflags = atomic_fetch_or(IRQ_WORK_CLAIMED | CSD_TYPE_IRQ_WORK, &work->flags);
/* /*
* If the work is already pending, no need to raise the IPI. * If the work is already pending, no need to raise the IPI.
* The pairing atomic_fetch_andnot() in irq_work_run() makes sure * The pairing atomic_fetch_andnot() in irq_work_run() makes sure
...@@ -102,8 +102,7 @@ bool irq_work_queue_on(struct irq_work *work, int cpu) ...@@ -102,8 +102,7 @@ bool irq_work_queue_on(struct irq_work *work, int cpu)
if (cpu != smp_processor_id()) { if (cpu != smp_processor_id()) {
/* Arch remote IPI send/receive backend aren't NMI safe */ /* Arch remote IPI send/receive backend aren't NMI safe */
WARN_ON_ONCE(in_nmi()); WARN_ON_ONCE(in_nmi());
if (llist_add(&work->llnode, &per_cpu(raised_list, cpu))) __smp_call_single_queue(cpu, &work->llnode);
arch_send_call_function_single_ipi(cpu);
} else { } else {
__irq_work_queue_local(work); __irq_work_queue_local(work);
} }
...@@ -131,6 +130,31 @@ bool irq_work_needs_cpu(void) ...@@ -131,6 +130,31 @@ bool irq_work_needs_cpu(void)
return true; return true;
} }
/*
 * irq_work_single - run a single irq_work item.
 * @arg: the struct irq_work to run, passed as an untyped pointer.
 *
 * Clears IRQ_WORK_PENDING, calls work->func(work) between the lockdep
 * enter/exit hooks, then attempts to clear IRQ_WORK_BUSY. This is the
 * per-entry body used by irq_work_run_list().
 */
void irq_work_single(void *arg)
{
struct irq_work *work = arg;
int flags;
/*
* Clear the PENDING bit, after this point the @work
* can be re-used.
* Make it immediately visible so that other CPUs trying
* to claim that work don't rely on us to handle their data
* while we are in the middle of the func.
*/
flags = atomic_fetch_andnot(IRQ_WORK_PENDING, &work->flags);
lockdep_irq_work_enter(work);
work->func(work);
lockdep_irq_work_exit(work);
/*
* Clear the BUSY bit and return to the free state if
* no-one else claimed it meanwhile.
*/
/*
 * flags holds the value fetched before PENDING was cleared; mask
 * PENDING out so it matches the word as this function left it.
 */
flags &= ~IRQ_WORK_PENDING;
/*
 * The result is deliberately ignored: if the exchange fails, the work
 * was claimed again and its new state must be left in place.
 */
(void)atomic_cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY);
}
static void irq_work_run_list(struct llist_head *list) static void irq_work_run_list(struct llist_head *list)
{ {
struct irq_work *work, *tmp; struct irq_work *work, *tmp;
...@@ -142,27 +166,8 @@ static void irq_work_run_list(struct llist_head *list) ...@@ -142,27 +166,8 @@ static void irq_work_run_list(struct llist_head *list)
return; return;
llnode = llist_del_all(list); llnode = llist_del_all(list);
llist_for_each_entry_safe(work, tmp, llnode, llnode) { llist_for_each_entry_safe(work, tmp, llnode, llnode)
int flags; irq_work_single(work);
/*
* Clear the PENDING bit, after this point the @work
* can be re-used.
* Make it immediately visible so that other CPUs trying
* to claim that work don't rely on us to handle their data
* while we are in the middle of the func.
*/
flags = atomic_fetch_andnot(IRQ_WORK_PENDING, &work->flags);
lockdep_irq_work_enter(work);
work->func(work);
lockdep_irq_work_exit(work);
/*
* Clear the BUSY bit and return to the free state if
* no-one else claimed it meanwhile.
*/
flags &= ~IRQ_WORK_PENDING;
(void)atomic_cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY);
}
} }
/* /*
......
This diff is collapsed.
...@@ -5,6 +5,7 @@ ...@@ -5,6 +5,7 @@
* Based on the work by Paul Menage (menage@google.com) and Balbir Singh * Based on the work by Paul Menage (menage@google.com) and Balbir Singh
* (balbir@in.ibm.com). * (balbir@in.ibm.com).
*/ */
#include <asm/irq_regs.h>
#include "sched.h" #include "sched.h"
/* Time spent by the tasks of the CPU accounting group executing in ... */ /* Time spent by the tasks of the CPU accounting group executing in ... */
...@@ -339,7 +340,7 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime) ...@@ -339,7 +340,7 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime)
{ {
struct cpuacct *ca; struct cpuacct *ca;
int index = CPUACCT_STAT_SYSTEM; int index = CPUACCT_STAT_SYSTEM;
struct pt_regs *regs = task_pt_regs(tsk); struct pt_regs *regs = get_irq_regs() ? : task_pt_regs(tsk);
if (regs && user_mode(regs)) if (regs && user_mode(regs))
index = CPUACCT_STAT_USER; index = CPUACCT_STAT_USER;
...@@ -347,7 +348,7 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime) ...@@ -347,7 +348,7 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime)
rcu_read_lock(); rcu_read_lock();
for (ca = task_ca(tsk); ca; ca = parent_ca(ca)) for (ca = task_ca(tsk); ca; ca = parent_ca(ca))
this_cpu_ptr(ca->cpuusage)->usages[index] += cputime; __this_cpu_add(ca->cpuusage->usages[index], cputime);
rcu_read_unlock(); rcu_read_unlock();
} }
...@@ -363,7 +364,7 @@ void cpuacct_account_field(struct task_struct *tsk, int index, u64 val) ...@@ -363,7 +364,7 @@ void cpuacct_account_field(struct task_struct *tsk, int index, u64 val)
rcu_read_lock(); rcu_read_lock();
for (ca = task_ca(tsk); ca != &root_cpuacct; ca = parent_ca(ca)) for (ca = task_ca(tsk); ca != &root_cpuacct; ca = parent_ca(ca))
this_cpu_ptr(ca->cpustat)->cpustat[index] += val; __this_cpu_add(ca->cpustat->cpustat[index], val);
rcu_read_unlock(); rcu_read_unlock();
} }
......
...@@ -258,7 +258,7 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd) ...@@ -258,7 +258,7 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
set_table_entry(&table[2], "busy_factor", &sd->busy_factor, sizeof(int), 0644, proc_dointvec_minmax); set_table_entry(&table[2], "busy_factor", &sd->busy_factor, sizeof(int), 0644, proc_dointvec_minmax);
set_table_entry(&table[3], "imbalance_pct", &sd->imbalance_pct, sizeof(int), 0644, proc_dointvec_minmax); set_table_entry(&table[3], "imbalance_pct", &sd->imbalance_pct, sizeof(int), 0644, proc_dointvec_minmax);
set_table_entry(&table[4], "cache_nice_tries", &sd->cache_nice_tries, sizeof(int), 0644, proc_dointvec_minmax); set_table_entry(&table[4], "cache_nice_tries", &sd->cache_nice_tries, sizeof(int), 0644, proc_dointvec_minmax);
set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0644, proc_dointvec_minmax); set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0444, proc_dointvec_minmax);
set_table_entry(&table[6], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax); set_table_entry(&table[6], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax);
set_table_entry(&table[7], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring); set_table_entry(&table[7], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring);
/* &table[8] is terminator */ /* &table[8] is terminator */
...@@ -437,7 +437,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) ...@@ -437,7 +437,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
else else
SEQ_printf(m, " %c", task_state_to_char(p)); SEQ_printf(m, " %c", task_state_to_char(p));
SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ", SEQ_printf(m, " %15s %5d %9Ld.%06ld %9Ld %5d ",
p->comm, task_pid_nr(p), p->comm, task_pid_nr(p),
SPLIT_NS(p->se.vruntime), SPLIT_NS(p->se.vruntime),
(long long)(p->nvcsw + p->nivcsw), (long long)(p->nvcsw + p->nivcsw),
...@@ -464,10 +464,10 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) ...@@ -464,10 +464,10 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
SEQ_printf(m, "\n"); SEQ_printf(m, "\n");
SEQ_printf(m, "runnable tasks:\n"); SEQ_printf(m, "runnable tasks:\n");
SEQ_printf(m, " S task PID tree-key switches prio" SEQ_printf(m, " S task PID tree-key switches prio"
" wait-time sum-exec sum-sleep\n"); " wait-time sum-exec sum-sleep\n");
SEQ_printf(m, "-------------------------------------------------------" SEQ_printf(m, "-------------------------------------------------------"
"----------------------------------------------------\n"); "------------------------------------------------------\n");
rcu_read_lock(); rcu_read_lock();
for_each_process_thread(g, p) { for_each_process_thread(g, p) {
...@@ -638,7 +638,6 @@ do { \ ...@@ -638,7 +638,6 @@ do { \
P(nr_running); P(nr_running);
P(nr_switches); P(nr_switches);
P(nr_load_updates);
P(nr_uninterruptible); P(nr_uninterruptible);
PN(next_balance); PN(next_balance);
SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr))); SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr)));
......
This diff is collapsed.
...@@ -289,7 +289,11 @@ static void do_idle(void) ...@@ -289,7 +289,11 @@ static void do_idle(void)
*/ */
smp_mb__after_atomic(); smp_mb__after_atomic();
sched_ttwu_pending(); /*
* RCU relies on this call to be done outside of an RCU read-side
* critical section.
*/
flush_smp_call_function_from_idle();
schedule_idle(); schedule_idle();
if (unlikely(klp_patch_pending(current))) if (unlikely(klp_patch_pending(current)))
......
...@@ -237,6 +237,30 @@ ___update_load_sum(u64 now, struct sched_avg *sa, ...@@ -237,6 +237,30 @@ ___update_load_sum(u64 now, struct sched_avg *sa,
return 1; return 1;
} }
/*
* When syncing *_avg with *_sum, we must take into account the current
* position in the PELT segment otherwise the remaining part of the segment
* will be considered as idle time whereas it's not yet elapsed and this will
* generate unwanted oscillation in the range [1002..1024[.
*
* The max value of *_sum varies with the position in the time segment and is
* equal to:
*
* LOAD_AVG_MAX*y + sa->period_contrib
*
* which can be simplified into:
*
* LOAD_AVG_MAX - 1024 + sa->period_contrib
*
* because LOAD_AVG_MAX*y == LOAD_AVG_MAX-1024
*
* The same care must be taken when a sched entity is added, updated or
* removed from a cfs_rq and we need to update sched_avg. Scheduler entities
* and the cfs rq, to which they are attached, have the same position in the
* time segment because they use the same clock. This means that we can use
* the period_contrib of cfs_rq when updating the sched_avg of a sched_entity
* if it's more convenient.
*/
static __always_inline void static __always_inline void
___update_load_avg(struct sched_avg *sa, unsigned long load) ___update_load_avg(struct sched_avg *sa, unsigned long load)
{ {
......
...@@ -9,6 +9,8 @@ ...@@ -9,6 +9,8 @@
int sched_rr_timeslice = RR_TIMESLICE; int sched_rr_timeslice = RR_TIMESLICE;
int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE; int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
/* More than 4 hours if BW_SHIFT equals 20. */
static const u64 max_rt_runtime = MAX_BW;
static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
...@@ -2585,6 +2587,12 @@ static int tg_set_rt_bandwidth(struct task_group *tg, ...@@ -2585,6 +2587,12 @@ static int tg_set_rt_bandwidth(struct task_group *tg,
if (rt_period == 0) if (rt_period == 0)
return -EINVAL; return -EINVAL;
/*
* Bound quota to defend quota against overflow during bandwidth shift.
*/
if (rt_runtime != RUNTIME_INF && rt_runtime > max_rt_runtime)
return -EINVAL;
mutex_lock(&rt_constraints_mutex); mutex_lock(&rt_constraints_mutex);
err = __rt_schedulable(tg, rt_period, rt_runtime); err = __rt_schedulable(tg, rt_period, rt_runtime);
if (err) if (err)
...@@ -2702,7 +2710,9 @@ static int sched_rt_global_validate(void) ...@@ -2702,7 +2710,9 @@ static int sched_rt_global_validate(void)
return -EINVAL; return -EINVAL;
if ((sysctl_sched_rt_runtime != RUNTIME_INF) && if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
(sysctl_sched_rt_runtime > sysctl_sched_rt_period)) ((sysctl_sched_rt_runtime > sysctl_sched_rt_period) ||
((u64)sysctl_sched_rt_runtime *
NSEC_PER_USEC > max_rt_runtime)))
return -EINVAL; return -EINVAL;
return 0; return 0;
......
...@@ -349,7 +349,6 @@ struct cfs_bandwidth { ...@@ -349,7 +349,6 @@ struct cfs_bandwidth {
u8 idle; u8 idle;
u8 period_active; u8 period_active;
u8 distribute_running;
u8 slack_started; u8 slack_started;
struct hrtimer period_timer; struct hrtimer period_timer;
struct hrtimer slack_timer; struct hrtimer slack_timer;
...@@ -890,12 +889,15 @@ struct rq { ...@@ -890,12 +889,15 @@ struct rq {
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
unsigned long last_blocked_load_update_tick; unsigned long last_blocked_load_update_tick;
unsigned int has_blocked_load; unsigned int has_blocked_load;
call_single_data_t nohz_csd;
#endif /* CONFIG_SMP */ #endif /* CONFIG_SMP */
unsigned int nohz_tick_stopped; unsigned int nohz_tick_stopped;
atomic_t nohz_flags; atomic_t nohz_flags;
#endif /* CONFIG_NO_HZ_COMMON */ #endif /* CONFIG_NO_HZ_COMMON */
unsigned long nr_load_updates; #ifdef CONFIG_SMP
unsigned int ttwu_pending;
#endif
u64 nr_switches; u64 nr_switches;
#ifdef CONFIG_UCLAMP_TASK #ifdef CONFIG_UCLAMP_TASK
...@@ -951,6 +953,7 @@ struct rq { ...@@ -951,6 +953,7 @@ struct rq {
struct callback_head *balance_callback; struct callback_head *balance_callback;
unsigned char nohz_idle_balance;
unsigned char idle_balance; unsigned char idle_balance;
unsigned long misfit_task_load; unsigned long misfit_task_load;
...@@ -979,7 +982,7 @@ struct rq { ...@@ -979,7 +982,7 @@ struct rq {
/* This is used to determine avg_idle's max value */ /* This is used to determine avg_idle's max value */
u64 max_idle_balance_cost; u64 max_idle_balance_cost;
#endif #endif /* CONFIG_SMP */
#ifdef CONFIG_IRQ_TIME_ACCOUNTING #ifdef CONFIG_IRQ_TIME_ACCOUNTING
u64 prev_irq_time; u64 prev_irq_time;
...@@ -1020,10 +1023,6 @@ struct rq { ...@@ -1020,10 +1023,6 @@ struct rq {
unsigned int ttwu_local; unsigned int ttwu_local;
#endif #endif
#ifdef CONFIG_SMP
struct llist_head wake_list;
#endif
#ifdef CONFIG_CPU_IDLE #ifdef CONFIG_CPU_IDLE
/* Must be inspected within a rcu lock section */ /* Must be inspected within a rcu lock section */
struct cpuidle_state *idle_state; struct cpuidle_state *idle_state;
...@@ -1367,8 +1366,6 @@ queue_balance_callback(struct rq *rq, ...@@ -1367,8 +1366,6 @@ queue_balance_callback(struct rq *rq,
rq->balance_callback = head; rq->balance_callback = head;
} }
extern void sched_ttwu_pending(void);
#define rcu_dereference_check_sched_domain(p) \ #define rcu_dereference_check_sched_domain(p) \
rcu_dereference_check((p), \ rcu_dereference_check((p), \
lockdep_is_held(&sched_domains_mutex)) lockdep_is_held(&sched_domains_mutex))
...@@ -1461,7 +1458,7 @@ struct sched_group { ...@@ -1461,7 +1458,7 @@ struct sched_group {
* by attaching extra space to the end of the structure, * by attaching extra space to the end of the structure,
* depending on how many CPUs the kernel has booted up with) * depending on how many CPUs the kernel has booted up with)
*/ */
unsigned long cpumask[0]; unsigned long cpumask[];
}; };
static inline struct cpumask *sched_group_span(struct sched_group *sg) static inline struct cpumask *sched_group_span(struct sched_group *sg)
...@@ -1504,15 +1501,11 @@ static inline void unregister_sched_domain_sysctl(void) ...@@ -1504,15 +1501,11 @@ static inline void unregister_sched_domain_sysctl(void)
} }
#endif #endif
extern int newidle_balance(struct rq *this_rq, struct rq_flags *rf); extern void flush_smp_call_function_from_idle(void);
#else
static inline void sched_ttwu_pending(void) { }
static inline int newidle_balance(struct rq *this_rq, struct rq_flags *rf) { return 0; } #else /* !CONFIG_SMP: */
static inline void flush_smp_call_function_from_idle(void) { }
#endif /* CONFIG_SMP */ #endif
#include "stats.h" #include "stats.h"
#include "autogroup.h" #include "autogroup.h"
...@@ -1688,7 +1681,8 @@ static inline int task_on_rq_migrating(struct task_struct *p) ...@@ -1688,7 +1681,8 @@ static inline int task_on_rq_migrating(struct task_struct *p)
*/ */
#define WF_SYNC 0x01 /* Waker goes to sleep after wakeup */ #define WF_SYNC 0x01 /* Waker goes to sleep after wakeup */
#define WF_FORK 0x02 /* Child wakeup after fork */ #define WF_FORK 0x02 /* Child wakeup after fork */
#define WF_MIGRATED 0x4 /* Internal use, task got migrated */ #define WF_MIGRATED 0x04 /* Internal use, task got migrated */
#define WF_ON_RQ 0x08 /* Wakee is on_rq */
/* /*
* To aid in avoiding the subversion of "niceness" due to uneven distribution * To aid in avoiding the subversion of "niceness" due to uneven distribution
...@@ -1918,6 +1912,8 @@ extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se); ...@@ -1918,6 +1912,8 @@ extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se);
#define BW_SHIFT 20 #define BW_SHIFT 20
#define BW_UNIT (1 << BW_SHIFT) #define BW_UNIT (1 << BW_SHIFT)
#define RATIO_SHIFT 8 #define RATIO_SHIFT 8
#define MAX_BW_BITS (64 - BW_SHIFT)
#define MAX_BW ((1ULL << MAX_BW_BITS) - 1)
unsigned long to_ratio(u64 period, u64 runtime); unsigned long to_ratio(u64 period, u64 runtime);
extern void init_entity_runnable_average(struct sched_entity *se); extern void init_entity_runnable_average(struct sched_entity *se);
......
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Scheduler internal SMP callback types and methods between the scheduler
* and other internal parts of the core kernel:
*/
extern void sched_ttwu_pending(void *arg);
extern void send_call_function_single_ipi(int cpu);
...@@ -33,14 +33,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, ...@@ -33,14 +33,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
cpumask_clear(groupmask); cpumask_clear(groupmask);
printk(KERN_DEBUG "%*s domain-%d: ", level, "", level); printk(KERN_DEBUG "%*s domain-%d: ", level, "", level);
if (!(sd->flags & SD_LOAD_BALANCE)) {
printk("does not load-balance\n");
if (sd->parent)
printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent");
return -1;
}
printk(KERN_CONT "span=%*pbl level=%s\n", printk(KERN_CONT "span=%*pbl level=%s\n",
cpumask_pr_args(sched_domain_span(sd)), sd->name); cpumask_pr_args(sched_domain_span(sd)), sd->name);
...@@ -151,8 +143,7 @@ static int sd_degenerate(struct sched_domain *sd) ...@@ -151,8 +143,7 @@ static int sd_degenerate(struct sched_domain *sd)
return 1; return 1;
/* Following flags need at least 2 groups */ /* Following flags need at least 2 groups */
if (sd->flags & (SD_LOAD_BALANCE | if (sd->flags & (SD_BALANCE_NEWIDLE |
SD_BALANCE_NEWIDLE |
SD_BALANCE_FORK | SD_BALANCE_FORK |
SD_BALANCE_EXEC | SD_BALANCE_EXEC |
SD_SHARE_CPUCAPACITY | SD_SHARE_CPUCAPACITY |
...@@ -183,15 +174,14 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) ...@@ -183,15 +174,14 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
/* Flags needing groups don't count if only 1 group in parent */ /* Flags needing groups don't count if only 1 group in parent */
if (parent->groups == parent->groups->next) { if (parent->groups == parent->groups->next) {
pflags &= ~(SD_LOAD_BALANCE | pflags &= ~(SD_BALANCE_NEWIDLE |
SD_BALANCE_NEWIDLE | SD_BALANCE_FORK |
SD_BALANCE_FORK | SD_BALANCE_EXEC |
SD_BALANCE_EXEC | SD_ASYM_CPUCAPACITY |
SD_ASYM_CPUCAPACITY | SD_SHARE_CPUCAPACITY |
SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES |
SD_SHARE_PKG_RESOURCES | SD_PREFER_SIBLING |
SD_PREFER_SIBLING | SD_SHARE_POWERDOMAIN);
SD_SHARE_POWERDOMAIN);
if (nr_node_ids == 1) if (nr_node_ids == 1)
pflags &= ~SD_SERIALIZE; pflags &= ~SD_SERIALIZE;
} }
...@@ -1351,8 +1341,7 @@ sd_init(struct sched_domain_topology_level *tl, ...@@ -1351,8 +1341,7 @@ sd_init(struct sched_domain_topology_level *tl,
.cache_nice_tries = 0, .cache_nice_tries = 0,
.flags = 1*SD_LOAD_BALANCE .flags = 1*SD_BALANCE_NEWIDLE
| 1*SD_BALANCE_NEWIDLE
| 1*SD_BALANCE_EXEC | 1*SD_BALANCE_EXEC
| 1*SD_BALANCE_FORK | 1*SD_BALANCE_FORK
| 0*SD_BALANCE_WAKE | 0*SD_BALANCE_WAKE
......
...@@ -22,11 +22,9 @@ ...@@ -22,11 +22,9 @@
#include <linux/hypervisor.h> #include <linux/hypervisor.h>
#include "smpboot.h" #include "smpboot.h"
#include "sched/smp.h"
enum { #define CSD_TYPE(_csd) ((_csd)->flags & CSD_FLAG_TYPE_MASK)
CSD_FLAG_LOCK = 0x01,
CSD_FLAG_SYNCHRONOUS = 0x02,
};
struct call_function_data { struct call_function_data {
call_single_data_t __percpu *csd; call_single_data_t __percpu *csd;
...@@ -84,6 +82,7 @@ int smpcfd_dying_cpu(unsigned int cpu) ...@@ -84,6 +82,7 @@ int smpcfd_dying_cpu(unsigned int cpu)
* still pending. * still pending.
*/ */
flush_smp_call_function_queue(false); flush_smp_call_function_queue(false);
irq_work_run();
return 0; return 0;
} }
...@@ -134,15 +133,33 @@ static __always_inline void csd_unlock(call_single_data_t *csd) ...@@ -134,15 +133,33 @@ static __always_inline void csd_unlock(call_single_data_t *csd)
static DEFINE_PER_CPU_SHARED_ALIGNED(call_single_data_t, csd_data); static DEFINE_PER_CPU_SHARED_ALIGNED(call_single_data_t, csd_data);
/*
 * Add @node to @cpu's call_single_queue and, when llist_add() returns
 * true, send that CPU a call-function-single IPI.
 */
void __smp_call_single_queue(int cpu, struct llist_node *node)
{
	struct llist_head *queue = &per_cpu(call_single_queue, cpu);

	/*
	 * Ordering note: the list addition should be visible before the
	 * IPI handler pulls the entry off the list, which follows from
	 * the normal cache coherency rules.
	 *
	 * If an architecture allows IPIs to go out of order with respect
	 * to the cache coherency protocol, the arch code has to add
	 * sufficient synchronisation so that IPIs appear to obey cache
	 * coherency WRT locking and barrier primitives. Generic code
	 * isn't really equipped to do the right thing there.
	 *
	 * Only kick the target when llist_add() returns true; per the
	 * llist kernel-doc that means the queue was empty before this add.
	 */
	if (!llist_add(node, queue))
		return;

	send_call_function_single_ipi(cpu);
}
/* /*
* Insert a previously allocated call_single_data_t element * Insert a previously allocated call_single_data_t element
* for execution on the given CPU. data must already have * for execution on the given CPU. data must already have
* ->func, ->info, and ->flags set. * ->func, ->info, and ->flags set.
*/ */
static int generic_exec_single(int cpu, call_single_data_t *csd, static int generic_exec_single(int cpu, call_single_data_t *csd)
smp_call_func_t func, void *info)
{ {
if (cpu == smp_processor_id()) { if (cpu == smp_processor_id()) {
smp_call_func_t func = csd->func;
void *info = csd->info;
unsigned long flags; unsigned long flags;
/* /*
...@@ -156,28 +173,12 @@ static int generic_exec_single(int cpu, call_single_data_t *csd, ...@@ -156,28 +173,12 @@ static int generic_exec_single(int cpu, call_single_data_t *csd,
return 0; return 0;
} }
if ((unsigned)cpu >= nr_cpu_ids || !cpu_online(cpu)) { if ((unsigned)cpu >= nr_cpu_ids || !cpu_online(cpu)) {
csd_unlock(csd); csd_unlock(csd);
return -ENXIO; return -ENXIO;
} }
csd->func = func; __smp_call_single_queue(cpu, &csd->llist);
csd->info = info;
/*
* The list addition should be visible before sending the IPI
* handler locks the list to pull the entry off it because of
* normal cache coherency rules implied by spinlocks.
*
* If IPIs can go out of order to the cache coherency protocol
* in an architecture, sufficient synchronisation should be added
* to arch code to make it appear to obey cache coherency WRT
* locking and barrier primitives. Generic code isn't really
* equipped to do the right thing...
*/
if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu)))
arch_send_call_function_single_ipi(cpu);
return 0; return 0;
} }
...@@ -209,9 +210,9 @@ void generic_smp_call_function_single_interrupt(void) ...@@ -209,9 +210,9 @@ void generic_smp_call_function_single_interrupt(void)
*/ */
static void flush_smp_call_function_queue(bool warn_cpu_offline) static void flush_smp_call_function_queue(bool warn_cpu_offline)
{ {
struct llist_head *head;
struct llist_node *entry;
call_single_data_t *csd, *csd_next; call_single_data_t *csd, *csd_next;
struct llist_node *entry, *prev;
struct llist_head *head;
static bool warned; static bool warned;
lockdep_assert_irqs_disabled(); lockdep_assert_irqs_disabled();
...@@ -230,32 +231,99 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) ...@@ -230,32 +231,99 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
* We don't have to use the _safe() variant here * We don't have to use the _safe() variant here
* because we are not invoking the IPI handlers yet. * because we are not invoking the IPI handlers yet.
*/ */
llist_for_each_entry(csd, entry, llist) llist_for_each_entry(csd, entry, llist) {
pr_warn("IPI callback %pS sent to offline CPU\n", switch (CSD_TYPE(csd)) {
csd->func); case CSD_TYPE_ASYNC:
case CSD_TYPE_SYNC:
case CSD_TYPE_IRQ_WORK:
pr_warn("IPI callback %pS sent to offline CPU\n",
csd->func);
break;
case CSD_TYPE_TTWU:
pr_warn("IPI task-wakeup sent to offline CPU\n");
break;
default:
pr_warn("IPI callback, unknown type %d, sent to offline CPU\n",
CSD_TYPE(csd));
break;
}
}
} }
/*
* First; run all SYNC callbacks, people are waiting for us.
*/
prev = NULL;
llist_for_each_entry_safe(csd, csd_next, entry, llist) { llist_for_each_entry_safe(csd, csd_next, entry, llist) {
smp_call_func_t func = csd->func;
void *info = csd->info;
/* Do we wait until *after* callback? */ /* Do we wait until *after* callback? */
if (csd->flags & CSD_FLAG_SYNCHRONOUS) { if (CSD_TYPE(csd) == CSD_TYPE_SYNC) {
smp_call_func_t func = csd->func;
void *info = csd->info;
if (prev) {
prev->next = &csd_next->llist;
} else {
entry = &csd_next->llist;
}
func(info); func(info);
csd_unlock(csd); csd_unlock(csd);
} else { } else {
csd_unlock(csd); prev = &csd->llist;
func(info);
} }
} }
if (!entry)
return;
/* /*
* Handle irq works queued remotely by irq_work_queue_on(). * Second; run all !SYNC callbacks.
* Smp functions above are typically synchronous so they
* better run first since some other CPUs may be busy waiting
* for them.
*/ */
irq_work_run(); prev = NULL;
llist_for_each_entry_safe(csd, csd_next, entry, llist) {
int type = CSD_TYPE(csd);
if (type != CSD_TYPE_TTWU) {
if (prev) {
prev->next = &csd_next->llist;
} else {
entry = &csd_next->llist;
}
if (type == CSD_TYPE_ASYNC) {
smp_call_func_t func = csd->func;
void *info = csd->info;
csd_unlock(csd);
func(info);
} else if (type == CSD_TYPE_IRQ_WORK) {
irq_work_single(csd);
}
} else {
prev = &csd->llist;
}
}
/*
* Third; only CSD_TYPE_TTWU is left, issue those.
*/
if (entry)
sched_ttwu_pending(entry);
}
void flush_smp_call_function_from_idle(void)
{
unsigned long flags;
if (llist_empty(this_cpu_ptr(&call_single_queue)))
return;
local_irq_save(flags);
flush_smp_call_function_queue(true);
local_irq_restore(flags);
} }
/* /*
...@@ -271,7 +339,7 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info, ...@@ -271,7 +339,7 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
{ {
call_single_data_t *csd; call_single_data_t *csd;
call_single_data_t csd_stack = { call_single_data_t csd_stack = {
.flags = CSD_FLAG_LOCK | CSD_FLAG_SYNCHRONOUS, .flags = CSD_FLAG_LOCK | CSD_TYPE_SYNC,
}; };
int this_cpu; int this_cpu;
int err; int err;
...@@ -305,7 +373,10 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info, ...@@ -305,7 +373,10 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
csd_lock(csd); csd_lock(csd);
} }
err = generic_exec_single(cpu, csd, func, info); csd->func = func;
csd->info = info;
err = generic_exec_single(cpu, csd);
if (wait) if (wait)
csd_lock_wait(csd); csd_lock_wait(csd);
...@@ -351,7 +422,7 @@ int smp_call_function_single_async(int cpu, call_single_data_t *csd) ...@@ -351,7 +422,7 @@ int smp_call_function_single_async(int cpu, call_single_data_t *csd)
csd->flags = CSD_FLAG_LOCK; csd->flags = CSD_FLAG_LOCK;
smp_wmb(); smp_wmb();
err = generic_exec_single(cpu, csd, csd->func, csd->info); err = generic_exec_single(cpu, csd);
out: out:
preempt_enable(); preempt_enable();
...@@ -466,7 +537,7 @@ static void smp_call_function_many_cond(const struct cpumask *mask, ...@@ -466,7 +537,7 @@ static void smp_call_function_many_cond(const struct cpumask *mask,
csd_lock(csd); csd_lock(csd);
if (wait) if (wait)
csd->flags |= CSD_FLAG_SYNCHRONOUS; csd->flags |= CSD_TYPE_SYNC;
csd->func = func; csd->func = func;
csd->info = info; csd->info = info;
if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu))) if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu)))
...@@ -598,6 +669,24 @@ void __init smp_init(void) ...@@ -598,6 +669,24 @@ void __init smp_init(void)
{ {
int num_nodes, num_cpus; int num_nodes, num_cpus;
/*
* Ensure struct irq_work layout matches so that
* flush_smp_call_function_queue() can do horrible things.
*/
BUILD_BUG_ON(offsetof(struct irq_work, llnode) !=
offsetof(struct __call_single_data, llist));
BUILD_BUG_ON(offsetof(struct irq_work, func) !=
offsetof(struct __call_single_data, func));
BUILD_BUG_ON(offsetof(struct irq_work, flags) !=
offsetof(struct __call_single_data, flags));
/*
* Assert the CSD_TYPE_TTWU layout is similar enough
* for task_struct to be on the @call_single_queue.
*/
BUILD_BUG_ON(offsetof(struct task_struct, wake_entry_type) - offsetof(struct task_struct, wake_entry) !=
offsetof(struct __call_single_data, flags) - offsetof(struct __call_single_data, llist));
idle_threads_init(); idle_threads_init();
cpuhp_threads_init(); cpuhp_threads_init();
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment