Commit 394f4528 authored by Ingo Molnar


Merge branch 'rcu/next' of git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-2.6-rcu into core/rcu
parents 90a8a73c 3c2dcf2a
CONFIG_RCU_TRACE debugfs Files and Formats
The rcutree and rcutiny implementations of RCU provide debugfs trace
output that summarizes counters and state. This information is useful for
debugging RCU itself, and can sometimes also help to debug abuses of RCU.
The following sections describe the debugfs files and formats, first
for rcutree and next for rcutiny.
CONFIG_TREE_RCU and CONFIG_TREE_PREEMPT_RCU debugfs Files and Formats
These implementations of RCU provide five debugfs files under the
top-level directory RCU: rcu/rcudata (which displays fields in struct
rcu_data), rcu/rcudata.csv (which is a .csv spreadsheet version of
rcu/rcudata), rcu/rcugp (which displays grace-period counters),
rcu/rcuhier (which displays the struct rcu_node hierarchy), and
rcu/rcu_pending (which displays counts of the reasons that the
rcu_pending() function decided that there was core RCU work to do).
The output of "cat rcu/rcudata" looks as follows:
@@ -130,7 +134,8 @@ o "ci" is the number of RCU callbacks that have been invoked for
been registered in absence of CPU-hotplug activity.
o "co" is the number of RCU callbacks that have been orphaned due to o "co" is the number of RCU callbacks that have been orphaned due to
this CPU going offline. this CPU going offline. These orphaned callbacks have been moved
to an arbitrarily chosen online CPU.
o "ca" is the number of RCU callbacks that have been adopted due to o "ca" is the number of RCU callbacks that have been adopted due to
other CPUs going offline. Note that ci+co-ca+ql is the number of other CPUs going offline. Note that ci+co-ca+ql is the number of
@@ -168,12 +173,12 @@ o "gpnum" is the number of grace periods that have started. It is
The output of "cat rcu/rcuhier" looks as follows, with very long lines: The output of "cat rcu/rcuhier" looks as follows, with very long lines:
c=6902 g=6903 s=2 jfq=3 j=72c7 nfqs=13142/nfqsng=0(13142) fqlh=6 oqlen=0 c=6902 g=6903 s=2 jfq=3 j=72c7 nfqs=13142/nfqsng=0(13142) fqlh=6
1/1 .>. 0:127 ^0 1/1 .>. 0:127 ^0
3/3 .>. 0:35 ^0 0/0 .>. 36:71 ^1 0/0 .>. 72:107 ^2 0/0 .>. 108:127 ^3 3/3 .>. 0:35 ^0 0/0 .>. 36:71 ^1 0/0 .>. 72:107 ^2 0/0 .>. 108:127 ^3
3/3f .>. 0:5 ^0 2/3 .>. 6:11 ^1 0/0 .>. 12:17 ^2 0/0 .>. 18:23 ^3 0/0 .>. 24:29 ^4 0/0 .>. 30:35 ^5 0/0 .>. 36:41 ^0 0/0 .>. 42:47 ^1 0/0 .>. 48:53 ^2 0/0 .>. 54:59 ^3 0/0 .>. 60:65 ^4 0/0 .>. 66:71 ^5 0/0 .>. 72:77 ^0 0/0 .>. 78:83 ^1 0/0 .>. 84:89 ^2 0/0 .>. 90:95 ^3 0/0 .>. 96:101 ^4 0/0 .>. 102:107 ^5 0/0 .>. 108:113 ^0 0/0 .>. 114:119 ^1 0/0 .>. 120:125 ^2 0/0 .>. 126:127 ^3 3/3f .>. 0:5 ^0 2/3 .>. 6:11 ^1 0/0 .>. 12:17 ^2 0/0 .>. 18:23 ^3 0/0 .>. 24:29 ^4 0/0 .>. 30:35 ^5 0/0 .>. 36:41 ^0 0/0 .>. 42:47 ^1 0/0 .>. 48:53 ^2 0/0 .>. 54:59 ^3 0/0 .>. 60:65 ^4 0/0 .>. 66:71 ^5 0/0 .>. 72:77 ^0 0/0 .>. 78:83 ^1 0/0 .>. 84:89 ^2 0/0 .>. 90:95 ^3 0/0 .>. 96:101 ^4 0/0 .>. 102:107 ^5 0/0 .>. 108:113 ^0 0/0 .>. 114:119 ^1 0/0 .>. 120:125 ^2 0/0 .>. 126:127 ^3
rcu_bh: rcu_bh:
c=-226 g=-226 s=1 jfq=-5701 j=72c7 nfqs=88/nfqsng=0(88) fqlh=0 oqlen=0 c=-226 g=-226 s=1 jfq=-5701 j=72c7 nfqs=88/nfqsng=0(88) fqlh=0
0/1 .>. 0:127 ^0 0/1 .>. 0:127 ^0
0/3 .>. 0:35 ^0 0/0 .>. 36:71 ^1 0/0 .>. 72:107 ^2 0/0 .>. 108:127 ^3 0/3 .>. 0:35 ^0 0/0 .>. 36:71 ^1 0/0 .>. 72:107 ^2 0/0 .>. 108:127 ^3
0/3f .>. 0:5 ^0 0/3 .>. 6:11 ^1 0/0 .>. 12:17 ^2 0/0 .>. 18:23 ^3 0/0 .>. 24:29 ^4 0/0 .>. 30:35 ^5 0/0 .>. 36:41 ^0 0/0 .>. 42:47 ^1 0/0 .>. 48:53 ^2 0/0 .>. 54:59 ^3 0/0 .>. 60:65 ^4 0/0 .>. 66:71 ^5 0/0 .>. 72:77 ^0 0/0 .>. 78:83 ^1 0/0 .>. 84:89 ^2 0/0 .>. 90:95 ^3 0/0 .>. 96:101 ^4 0/0 .>. 102:107 ^5 0/0 .>. 108:113 ^0 0/0 .>. 114:119 ^1 0/0 .>. 120:125 ^2 0/0 .>. 126:127 ^3 0/3f .>. 0:5 ^0 0/3 .>. 6:11 ^1 0/0 .>. 12:17 ^2 0/0 .>. 18:23 ^3 0/0 .>. 24:29 ^4 0/0 .>. 30:35 ^5 0/0 .>. 36:41 ^0 0/0 .>. 42:47 ^1 0/0 .>. 48:53 ^2 0/0 .>. 54:59 ^3 0/0 .>. 60:65 ^4 0/0 .>. 66:71 ^5 0/0 .>. 72:77 ^0 0/0 .>. 78:83 ^1 0/0 .>. 84:89 ^2 0/0 .>. 90:95 ^3 0/0 .>. 96:101 ^4 0/0 .>. 102:107 ^5 0/0 .>. 108:113 ^0 0/0 .>. 114:119 ^1 0/0 .>. 120:125 ^2 0/0 .>. 126:127 ^3
@@ -212,11 +217,6 @@ o "fqlh" is the number of calls to force_quiescent_state() that
exited immediately (without even being counted in nfqs above)
due to contention on ->fqslock.
o "oqlen" is the number of callbacks on the "orphan" callback
list. RCU callbacks are placed on this list by CPUs going
offline, and are "adopted" either by the CPU helping the outgoing
CPU or by the next rcu_barrier*() call, whichever comes first.
o Each element of the form "1/1 0:127 ^0" represents one struct
rcu_node. Each line represents one level of the hierarchy, from
root to leaves. It is best to think of the rcu_data structures
@@ -326,3 +326,115 @@ o "nn" is the number of times that this CPU needed nothing. Alert
readers will note that the rcu "nn" number for a given CPU very
closely matches the rcu_bh "np" number for that same CPU. This
is due to short-circuit evaluation in rcu_pending().
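The short-circuit effect described above can be sketched in a few lines of C. This is purely illustrative: the helper and counter names below are hypothetical and do not correspond to the actual code in kernel/rcutree.c.

/*
 * Illustrative sketch only; names are hypothetical and do not match
 * kernel/rcutree.c.  Once one test fires, the later tests (and their
 * counters) are never reached, which is why the rcu "nn" count tends
 * to track the rcu_bh "np" count on the same CPU.
 */
struct rcu_pending_counts {
	unsigned long n_cbs_ready;	/* callbacks awaiting invocation */
	unsigned long n_qs_needed;	/* core RCU wants a QS from us */
	unsigned long n_needed_nothing;	/* the "nn" counter */
};

static int callbacks_ready(int cpu);		/* hypothetical */
static int quiescent_state_needed(int cpu);	/* hypothetical */

static int rcu_pending_sketch(struct rcu_pending_counts *cp, int cpu)
{
	if (callbacks_ready(cpu)) {
		cp->n_cbs_ready++;
		return 1;
	}
	if (quiescent_state_needed(cpu)) {
		cp->n_qs_needed++;
		return 1;
	}
	cp->n_needed_nothing++;
	return 0;
}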
CONFIG_TINY_RCU and CONFIG_TINY_PREEMPT_RCU debugfs Files and Formats
These implementations of RCU provide a single debugfs file under the
top-level directory RCU, namely rcu/rcudata, which displays fields in
rcu_bh_ctrlblk, rcu_sched_ctrlblk and, for CONFIG_TINY_PREEMPT_RCU,
rcu_preempt_ctrlblk.
The output of "cat rcu/rcudata" is as follows:
rcu_preempt: qlen=24 gp=1097669 g197/p197/c197 tasks=...
ttb=. btg=no ntb=184 neb=0 nnb=183 j=01f7 bt=0274
normal balk: nt=1097669 gt=0 bt=371 b=0 ny=25073378 nos=0
exp balk: bt=0 nos=0
rcu_sched: qlen: 0
rcu_bh: qlen: 0
This is split into rcu_preempt, rcu_sched, and rcu_bh sections, with the
rcu_preempt section appearing only in CONFIG_TINY_PREEMPT_RCU builds.
The last three lines of the rcu_preempt section appear only in
CONFIG_RCU_BOOST kernel builds. The fields are as follows:
o "qlen" is the number of RCU callbacks currently waiting either
for an RCU grace period or waiting to be invoked. This is the
only field present for rcu_sched and rcu_bh, due to the
short-circuiting of grace periods in those two cases.
o "gp" is the number of grace periods that have completed.
o "g197/p197/c197" displays the grace-period state, with the
"g" number being the number of grace periods that have started
(mod 256), the "p" number being the number of grace periods
that the CPU has responded to (also mod 256), and the "c"
number being the number of grace periods that have completed
(once again mode 256).
Why have both "gp" and "g"? Because the data flowing into
"gp" is only present in a CONFIG_RCU_TRACE kernel.
o "tasks" is a set of bits. The first bit is "T" if there are
currently tasks that have recently blocked within an RCU
read-side critical section, the second bit is "N" if any of the
aforementioned tasks are blocking the current RCU grace period,
and the third bit is "E" if any of the aforementioned tasks are
blocking the current expedited grace period. Each bit is "."
if the corresponding condition does not hold.
o "ttb" is a single bit. It is "B" if any of the blocked tasks
need to be priority boosted and "." otherwise.
o "btg" indicates whether boosting has been carried out during
the current grace period, with "exp" indicating that boosting
is in progress for an expedited grace period, "no" indicating
that boosting has not yet started for a normal grace period,
"begun" indicating that boosting has bebug for a normal grace
period, and "done" indicating that boosting has completed for
a normal grace period.
o "ntb" is the total number of tasks subjected to RCU priority boosting
periods since boot.
o "neb" is the number of expedited grace periods that have had
to resort to RCU priority boosting since boot.
o "nnb" is the number of normal grace periods that have had
to resort to RCU priority boosting since boot.
o "j" is the low-order 12 bits of the jiffies counter in hexadecimal.
o "bt" is the low-order 12 bits of the value that the jiffies counter
will have at the next time that boosting is scheduled to begin.
o In the line beginning with "normal balk", the fields are as follows:
o "nt" is the number of times that the system balked from
boosting because there were no blocked tasks to boost.
Note that the system will balk from boosting even if the
grace period is overdue when the currently running task
is looping within an RCU read-side critical section.
There is no point in boosting in this case, because
boosting a running task won't make it run any faster.
o "gt" is the number of times that the system balked
from boosting because, although there were blocked tasks,
none of them were preventing the current grace period
from completing.
o "bt" is the number of times that the system balked
from boosting because boosting was already in progress.
o "b" is the number of times that the system balked from
boosting because boosting had already completed for
the grace period in question.
o "ny" is the number of times that the system balked from
boosting because it was not yet time to start boosting
the grace period in question.
o "nos" is the number of times that the system balked from
boosting for inexplicable ("not otherwise specified")
reasons. This can actually happen due to races involving
increments of the jiffies counter.
o In the line beginning with "exp balk", the fields are as follows:
o "bt" is the number of times that the system balked from
boosting because there were no blocked tasks to boost.
o "nos" is the number of times that the system balked from
boosting for inexplicable ("not otherwise specified")
reasons.
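As promised in the "g197/p197/c197" item above, here is a sketch of how counters kept mod 256 can be compared safely despite wraparound. The helper below is hypothetical, not taken from the kernel; it simply does the subtraction in the counter's own width.

/*
 * Hypothetical helper: how many grace periods is this CPU behind,
 * given "g" and "p" counters kept mod 256?  Doing the subtraction
 * in unsigned char arithmetic makes wraparound harmless: g=3 just
 * after wrap minus p=250 yields 9, not a negative number.
 */
static unsigned char gps_behind(unsigned char g, unsigned char p)
{
	return (unsigned char)(g - p);
}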
@@ -83,6 +83,12 @@ extern struct group_info init_groups;
*/
# define CAP_INIT_BSET CAP_FULL_SET
#ifdef CONFIG_RCU_BOOST
#define INIT_TASK_RCU_BOOST() \
.rcu_boost_mutex = NULL,
#else
#define INIT_TASK_RCU_BOOST()
#endif
#ifdef CONFIG_TREE_PREEMPT_RCU
#define INIT_TASK_RCU_TREE_PREEMPT() \
	.rcu_blocked_node = NULL,
@@ -94,7 +100,8 @@ extern struct group_info init_groups;
	.rcu_read_lock_nesting = 0, \
	.rcu_read_unlock_special = 0, \
	.rcu_node_entry = LIST_HEAD_INIT(tsk.rcu_node_entry), \
	INIT_TASK_RCU_TREE_PREEMPT() \
	INIT_TASK_RCU_BOOST()
#else
#define INIT_TASK_RCU_PREEMPT(tsk)
#endif
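For reference, with CONFIG_TINY_PREEMPT_RCU=y and CONFIG_RCU_BOOST=y (the combination the new Kconfig text later in this diff permits), INIT_TASK_RCU_PREEMPT(tsk) would expand roughly as follows. This is a hand-written sketch of the expansion, not compiler output:

/* Approximate expansion of INIT_TASK_RCU_PREEMPT(tsk) under
 * CONFIG_TINY_PREEMPT_RCU=y, CONFIG_RCU_BOOST=y; the tree-specific
 * INIT_TASK_RCU_TREE_PREEMPT() contributes nothing here. */
	.rcu_read_lock_nesting = 0,
	.rcu_read_unlock_special = 0,
	.rcu_node_entry = LIST_HEAD_INIT(tsk.rcu_node_entry),
	.rcu_boost_mutex = NULL,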
@@ -241,11 +241,6 @@ static inline void list_splice_init_rcu(struct list_head *list,
#define list_first_entry_rcu(ptr, type, member) \
	list_entry_rcu((ptr)->next, type, member)
#define __list_for_each_rcu(pos, head) \
for (pos = rcu_dereference_raw(list_next_rcu(head)); \
pos != (head); \
pos = rcu_dereference_raw(list_next_rcu((pos))))
/**
 * list_for_each_entry_rcu - iterate over rcu list of given type
 * @pos: the type * to use as a loop cursor.
@@ -47,6 +47,8 @@
extern int rcutorture_runnable; /* for sysctl */
#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
#define UINT_CMP_GE(a, b) (UINT_MAX / 2 >= (a) - (b))
#define UINT_CMP_LT(a, b) (UINT_MAX / 2 < (a) - (b))
#define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b))
#define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b))
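These comparison macros are wraparound-safe: for free-running counters, the unsigned subtraction (a) - (b) lands in the lower half of the type's range exactly when a is at or after b in sequence. A small standalone demonstration (userspace C, illustrative only):

#include <limits.h>
#include <stdio.h>

#define UINT_CMP_GE(a, b) (UINT_MAX / 2 >= (a) - (b))
#define UINT_CMP_LT(a, b) (UINT_MAX / 2 < (a) - (b))

int main(void)
{
	unsigned int before = UINT_MAX - 1;	/* counter just before wrap */
	unsigned int after = before + 3;	/* wraps around to 1 */

	/* "after" is numerically smaller, but later in sequence. */
	printf("%d\n", UINT_CMP_GE(after, before));	/* prints 1 */
	printf("%d\n", UINT_CMP_LT(before, after));	/* prints 1 */
	return 0;
}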
@@ -66,7 +68,6 @@ extern void call_rcu_sched(struct rcu_head *head,
extern void synchronize_sched(void);
extern void rcu_barrier_bh(void);
extern void rcu_barrier_sched(void);
extern void synchronize_sched_expedited(void);
extern int sched_expedited_torture_stats(char *page);
static inline void __rcu_read_lock_bh(void)
@@ -118,7 +119,6 @@ static inline int rcu_preempt_depth(void)
#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
/* Internal to kernel */
extern void rcu_init(void);
extern void rcu_sched_qs(int cpu);
extern void rcu_bh_qs(int cpu);
extern void rcu_check_callbacks(int cpu, int user);
@@ -27,7 +27,9 @@
#include <linux/cache.h>
static inline void rcu_init(void)
{
}
#ifdef CONFIG_TINY_RCU
@@ -58,6 +60,11 @@ static inline void synchronize_rcu_bh_expedited(void)
	synchronize_sched();
}
static inline void synchronize_sched_expedited(void)
{
synchronize_sched();
}
#ifdef CONFIG_TINY_RCU
static inline void rcu_preempt_note_context_switch(void)
@@ -125,16 +132,12 @@ static inline void rcu_cpu_stall_reset(void)
}
#ifdef CONFIG_DEBUG_LOCK_ALLOC
extern int rcu_scheduler_active __read_mostly;
extern void rcu_scheduler_starting(void);
#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
static inline void rcu_scheduler_starting(void)
{
}
#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
#endif /* __LINUX_RCUTINY_H */
@@ -30,6 +30,7 @@
#ifndef __LINUX_RCUTREE_H
#define __LINUX_RCUTREE_H
extern void rcu_init(void);
extern void rcu_note_context_switch(int cpu);
extern int rcu_needs_cpu(int cpu);
extern void rcu_cpu_stall_reset(void);
@@ -47,6 +48,7 @@ static inline void exit_rcu(void)
#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
extern void synchronize_rcu_bh(void);
extern void synchronize_sched_expedited(void);
extern void synchronize_rcu_expedited(void);
static inline void synchronize_rcu_bh_expedited(void)
@@ -1229,6 +1229,9 @@ struct task_struct {
#ifdef CONFIG_TREE_PREEMPT_RCU
	struct rcu_node *rcu_blocked_node;
#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
#ifdef CONFIG_RCU_BOOST
struct rt_mutex *rcu_boost_mutex;
#endif /* #ifdef CONFIG_RCU_BOOST */
#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
	struct sched_info sched_info;
@@ -1759,7 +1762,8 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *
#ifdef CONFIG_PREEMPT_RCU
#define RCU_READ_UNLOCK_BLOCKED (1 << 0) /* blocked while in RCU read-side. */
#define RCU_READ_UNLOCK_BOOSTED (1 << 1) /* boosted while in RCU read-side. */
#define RCU_READ_UNLOCK_NEED_QS (1 << 2) /* RCU core needs CPU response. */
static inline void rcu_copy_process(struct task_struct *p)
{
@@ -1767,7 +1771,10 @@ static inline void rcu_copy_process(struct task_struct *p)
	p->rcu_read_unlock_special = 0;
#ifdef CONFIG_TREE_PREEMPT_RCU
	p->rcu_blocked_node = NULL;
#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
#ifdef CONFIG_RCU_BOOST
p->rcu_boost_mutex = NULL;
#endif /* #ifdef CONFIG_RCU_BOOST */
	INIT_LIST_HEAD(&p->rcu_node_entry);
}
@@ -393,7 +393,6 @@ config PREEMPT_RCU
config RCU_TRACE
	bool "Enable tracing for RCU"
depends on TREE_RCU || TREE_PREEMPT_RCU
	help
	  This option provides tracing in RCU which presents stats
	  in debugfs for debugging RCU implementation.
@@ -459,6 +458,60 @@ config TREE_RCU_TRACE
	  TREE_PREEMPT_RCU implementations, permitting Makefile to
	  trivially select kernel/rcutree_trace.c.
config RCU_BOOST
bool "Enable RCU priority boosting"
depends on RT_MUTEXES && TINY_PREEMPT_RCU
default n
help
This option boosts the priority of preempted RCU readers that
block the current preemptible RCU grace period for too long.
This option also prevents heavy loads from blocking RCU
callback invocation for all flavors of RCU.
	  Say Y here if you are working with real-time apps or heavy loads.
	  Say N here if you are unsure.
config RCU_BOOST_PRIO
int "Real-time priority to boost RCU readers to"
range 1 99
depends on RCU_BOOST
default 1
help
This option specifies the real-time priority to which preempted
RCU readers are to be boosted. If you are working with CPU-bound
	  real-time applications, you should specify a priority higher than
the highest-priority CPU-bound application.
Specify the real-time priority, or take the default if unsure.
config RCU_BOOST_DELAY
int "Milliseconds to delay boosting after RCU grace-period start"
range 0 3000
depends on RCU_BOOST
default 500
help
This option specifies the time to wait after the beginning of
a given grace period before priority-boosting preempted RCU
readers blocking that grace period. Note that any RCU reader
blocking an expedited RCU grace period is boosted immediately.
Accept the default if unsure.
config SRCU_SYNCHRONIZE_DELAY
int "Microseconds to delay before waiting for readers"
range 0 20
default 10
help
This option controls how long SRCU delays before entering its
loop waiting on SRCU readers. The purpose of this loop is
to avoid the unconditional context-switch penalty that would
otherwise be incurred if there was an active SRCU reader,
in a manner similar to adaptive locking schemes. This should
be set to be a bit longer than the common-case SRCU read-side
critical-section overhead.
Accept the default if unsure.
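The strategy this option tunes -- optimistically spin for a few microseconds in case the reader is about to finish, then fall back to sleeping -- is implemented in __synchronize_srcu(); the srcu.c hunk near the end of this diff shows the actual code, which boils down to:

	/* Spin-then-sleep wait tuned by CONFIG_SRCU_SYNCHRONIZE_DELAY. */
	if (srcu_readers_active_idx(sp, idx))
		udelay(CONFIG_SRCU_SYNCHRONIZE_DELAY);	/* optimistic spin */
	while (srcu_readers_active_idx(sp, idx))
		schedule_timeout_interruptible(1);	/* then poll at 1/HZ */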
endmenu # "RCU Subsystem"
config IKCONFIG
@@ -36,31 +36,16 @@
#include <linux/time.h>
#include <linux/cpu.h>
/* Controls for rcu_kthread() kthread, replacing RCU_SOFTIRQ used previously. */
static struct task_struct *rcu_kthread_task;
static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq);
static unsigned long have_rcu_kthread_work;
static void invoke_rcu_kthread(void);
/* Definition for rcupdate control block. */
static struct rcu_ctrlblk rcu_sched_ctrlblk = {
.donetail = &rcu_sched_ctrlblk.rcucblist,
.curtail = &rcu_sched_ctrlblk.rcucblist,
};
static struct rcu_ctrlblk rcu_bh_ctrlblk = {
.donetail = &rcu_bh_ctrlblk.rcucblist,
.curtail = &rcu_bh_ctrlblk.rcucblist,
};
#ifdef CONFIG_DEBUG_LOCK_ALLOC
int rcu_scheduler_active __read_mostly;
EXPORT_SYMBOL_GPL(rcu_scheduler_active);
#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
/* Forward declarations for rcutiny_plugin.h. */
struct rcu_ctrlblk;
static void rcu_process_callbacks(struct rcu_ctrlblk *rcp);
static int rcu_kthread(void *arg);
static void __call_rcu(struct rcu_head *head,
		       void (*func)(struct rcu_head *rcu),
		       struct rcu_ctrlblk *rcp);
@@ -123,7 +108,7 @@ void rcu_sched_qs(int cpu)
{
	if (rcu_qsctr_help(&rcu_sched_ctrlblk) +
	    rcu_qsctr_help(&rcu_bh_ctrlblk))
		invoke_rcu_kthread();
}
/*
@@ -132,7 +117,7 @@ void rcu_sched_qs(int cpu)
void rcu_bh_qs(int cpu)
{
	if (rcu_qsctr_help(&rcu_bh_ctrlblk))
		invoke_rcu_kthread();
}
/*
@@ -152,13 +137,14 @@ void rcu_check_callbacks(int cpu, int user)
}
/*
 * Invoke the RCU callbacks on the specified rcu_ctrlblk structure
 * whose grace period has elapsed.
 */
static void rcu_process_callbacks(struct rcu_ctrlblk *rcp)
{
	struct rcu_head *next, *list;
	unsigned long flags;
RCU_TRACE(int cb_count = 0);
	/* If no RCU callbacks ready to invoke, just return. */
	if (&rcp->rcucblist == rcp->donetail)
@@ -180,19 +166,58 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
		next = list->next;
		prefetch(next);
		debug_rcu_head_unqueue(list);
local_bh_disable();
		list->func(list);
local_bh_enable();
		list = next;
RCU_TRACE(cb_count++);
	}
RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count));
}
/*
 * This kthread invokes RCU callbacks whose grace periods have
 * elapsed.  It is awakened as needed, and takes the place of the
 * RCU_SOFTIRQ that was used previously for this purpose.
 * This is a kthread, but it is never stopped, at least not until
 * the system goes down.
 */
static int rcu_kthread(void *arg)
{
	unsigned long work;
	unsigned long morework;
	unsigned long flags;
for (;;) {
wait_event(rcu_kthread_wq, have_rcu_kthread_work != 0);
morework = rcu_boost();
local_irq_save(flags);
work = have_rcu_kthread_work;
have_rcu_kthread_work = morework;
local_irq_restore(flags);
if (work) {
rcu_process_callbacks(&rcu_sched_ctrlblk);
rcu_process_callbacks(&rcu_bh_ctrlblk);
rcu_preempt_process_callbacks();
}
schedule_timeout_interruptible(1); /* Leave CPU for others. */
}
return 0; /* Not reached, but needed to shut gcc up. */
}
/*
* Wake up rcu_kthread() to process callbacks now eligible for invocation
* or to boost readers.
*/
static void invoke_rcu_kthread(void)
{
unsigned long flags;
local_irq_save(flags);
have_rcu_kthread_work = 1;
wake_up(&rcu_kthread_wq);
local_irq_restore(flags);
}
/*
@@ -230,6 +255,7 @@ static void __call_rcu(struct rcu_head *head,
	local_irq_save(flags);
	*rcp->curtail = head;
	rcp->curtail = &head->next;
RCU_TRACE(rcp->qlen++);
	local_irq_restore(flags);
}
@@ -282,7 +308,16 @@ void rcu_barrier_sched(void)
}
EXPORT_SYMBOL_GPL(rcu_barrier_sched);
/*
 * Spawn the kthread that invokes RCU callbacks.
 */
static int __init rcu_spawn_kthreads(void)
{
	struct sched_param sp;

	rcu_kthread_task = kthread_run(rcu_kthread, NULL, "rcu_kthread");
	sp.sched_priority = RCU_BOOST_PRIO;
	sched_setscheduler_nocheck(rcu_kthread_task, SCHED_FIFO, &sp);
	return 0;
}
early_initcall(rcu_spawn_kthreads);
@@ -67,9 +67,6 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
	.gpnum = -300, \
	.completed = -300, \
	.onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname.onofflock), \
.orphan_cbs_list = NULL, \
.orphan_cbs_tail = &structname.orphan_cbs_list, \
.orphan_qlen = 0, \
	.fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname.fqslock), \
	.n_force_qs = 0, \
	.n_force_qs_ngp = 0, \
@@ -620,9 +617,17 @@ static void __init check_cpu_stall_init(void)
static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
{
	if (rdp->gpnum != rnp->gpnum) {
		/*
		 * If the current grace period is waiting for this CPU,
		 * set up to detect a quiescent state, otherwise don't
		 * go looking for one.
		 */
		rdp->gpnum = rnp->gpnum;
if (rnp->qsmask & rdp->grpmask) {
rdp->qs_pending = 1;
rdp->passed_quiesc = 0;
} else
rdp->qs_pending = 0;
	}
}
@@ -681,6 +686,24 @@ __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat
		/* Remember that we saw this grace-period completion. */
		rdp->completed = rnp->completed;
/*
* If we were in an extended quiescent state, we may have
* missed some grace periods that other CPUs handled on
* our behalf. Catch up with this state to avoid noting
* spurious new grace periods. If another grace period
* has started, then rnp->gpnum will have advanced, so
* we will detect this later on.
*/
if (ULONG_CMP_LT(rdp->gpnum, rdp->completed))
rdp->gpnum = rdp->completed;
/*
* If RCU does not need a quiescent state from this CPU,
* then make sure that this CPU doesn't go looking for one.
*/
if ((rnp->qsmask & rdp->grpmask) == 0)
rdp->qs_pending = 0;
	}
}
@@ -984,53 +1007,31 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
#ifdef CONFIG_HOTPLUG_CPU
/*
 * Move a dying CPU's RCU callbacks to online CPU's callback list.
 * Synchronization is not required because this function executes
 * in stop_machine() context.
 */
static void rcu_send_cbs_to_online(struct rcu_state *rsp)
{
	int i;
	/* current DYING CPU is cleared in the cpu_online_mask */
	int receive_cpu = cpumask_any(cpu_online_mask);
	struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
	struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu);

	if (rdp->nxtlist == NULL)
		return;  /* irqs disabled, so comparison is stable. */

	*receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist;
	receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
	receive_rdp->qlen += rdp->qlen;
	receive_rdp->n_cbs_adopted += rdp->qlen;
	rdp->n_cbs_orphaned += rdp->qlen;

	rdp->nxtlist = NULL;
	for (i = 0; i < RCU_NEXT_SIZE; i++)
		rdp->nxttail[i] = &rdp->nxtlist;
	rdp->qlen = 0;
}
/*
* Adopt previously orphaned RCU callbacks.
*/
static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
{
unsigned long flags;
struct rcu_data *rdp;
raw_spin_lock_irqsave(&rsp->onofflock, flags);
rdp = this_cpu_ptr(rsp->rda);
if (rsp->orphan_cbs_list == NULL) {
raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
return;
}
*rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list;
rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_tail;
rdp->qlen += rsp->orphan_qlen;
rdp->n_cbs_adopted += rsp->orphan_qlen;
rsp->orphan_cbs_list = NULL;
rsp->orphan_cbs_tail = &rsp->orphan_cbs_list;
rsp->orphan_qlen = 0;
raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
}
/*
@@ -1081,8 +1082,6 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
	raw_spin_unlock_irqrestore(&rnp->lock, flags);
	if (need_report & RCU_OFL_TASKS_EXP_GP)
		rcu_report_exp_rnp(rsp, rnp);
rcu_adopt_orphan_cbs(rsp);
}
/*
@@ -1100,11 +1099,7 @@ static void rcu_offline_cpu(int cpu)
#else /* #ifdef CONFIG_HOTPLUG_CPU */
static void rcu_send_cbs_to_online(struct rcu_state *rsp)
{
}
@@ -1440,22 +1435,11 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
	 */
	local_irq_save(flags);
	rdp = this_cpu_ptr(rsp->rda);
rcu_process_gp_end(rsp, rdp);
check_for_new_grace_period(rsp, rdp);
	/* Add the callback to our list. */
	*rdp->nxttail[RCU_NEXT_TAIL] = head;
	rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
/* Start a new grace period if one not already started. */
if (!rcu_gp_in_progress(rsp)) {
unsigned long nestflag;
struct rcu_node *rnp_root = rcu_get_root(rsp);
raw_spin_lock_irqsave(&rnp_root->lock, nestflag);
rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. */
}
	/*
	 * Force the grace period if too many callbacks or too long waiting.
	 * Enforce hysteresis, and don't invoke force_quiescent_state()
@@ -1464,12 +1448,27 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
	 * is the only one waiting for a grace period to complete.
	 */
	if (unlikely(++rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) {
		/* Are we ignoring a completed grace period? */
		rcu_process_gp_end(rsp, rdp);
		check_for_new_grace_period(rsp, rdp);

		/* Start a new grace period if one not already started. */
if (!rcu_gp_in_progress(rsp)) {
unsigned long nestflag;
struct rcu_node *rnp_root = rcu_get_root(rsp);
raw_spin_lock_irqsave(&rnp_root->lock, nestflag);
			rcu_start_gp(rsp, nestflag);  /* releases rnp_root->lock */
} else {
/* Give the grace period a kick. */
rdp->blimit = LONG_MAX;
if (rsp->n_force_qs == rdp->n_force_qs_snap &&
*rdp->nxttail[RCU_DONE_TAIL] != head)
force_quiescent_state(rsp, 0);
rdp->n_force_qs_snap = rsp->n_force_qs;
rdp->qlen_last_fqs_check = rdp->qlen;
}
	} else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies))
		force_quiescent_state(rsp, 1);
	local_irq_restore(flags);
@@ -1699,13 +1698,12 @@ static void _rcu_barrier(struct rcu_state *rsp,
	 * decrement rcu_barrier_cpu_count -- otherwise the first CPU
	 * might complete its grace period before all of the other CPUs
	 * did their increment, causing this function to return too
	 * early.  Note that on_each_cpu() disables irqs, which prevents
* any CPUs from coming online or going offline until each online
* CPU has queued its RCU-barrier callback.
	 */
	atomic_set(&rcu_barrier_cpu_count, 1);
preempt_disable(); /* stop CPU_DYING from filling orphan_cbs_list */
rcu_adopt_orphan_cbs(rsp);
	on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1);
preempt_enable(); /* CPU_DYING can again fill orphan_cbs_list */
	if (atomic_dec_and_test(&rcu_barrier_cpu_count))
		complete(&rcu_barrier_completion);
	wait_for_completion(&rcu_barrier_completion);
@@ -1831,18 +1829,13 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
	case CPU_DYING:
	case CPU_DYING_FROZEN:
		/*
		 * The whole machine is "stopped" except this CPU, so we can
		 * touch any data without introducing corruption.  We send the
		 * dying CPU's callbacks to an arbitrarily chosen online CPU.
		 */
		rcu_send_cbs_to_online(&rcu_bh_state);
		rcu_send_cbs_to_online(&rcu_sched_state);
		rcu_preempt_send_cbs_to_online();
		break;
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
@@ -1880,8 +1873,9 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
{
	int i;

	for (i = NUM_RCU_LVLS - 1; i > 0; i--)
		rsp->levelspread[i] = CONFIG_RCU_FANOUT;
	rsp->levelspread[0] = RCU_FANOUT_LEAF;
}
#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */
static void __init rcu_init_levelspread(struct rcu_state *rsp)
@@ -31,46 +31,51 @@
/*
 * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT.
 * In theory, it should be possible to add more levels straightforwardly.
 * In practice, this did work well going from three levels to four.
 * Of course, your mileage may vary.
 */
#define MAX_RCU_LVLS 4
#if CONFIG_RCU_FANOUT > 16
#define RCU_FANOUT_LEAF       16
#else /* #if CONFIG_RCU_FANOUT > 16 */
#define RCU_FANOUT_LEAF       (CONFIG_RCU_FANOUT)
#endif /* #else #if CONFIG_RCU_FANOUT > 16 */
#define RCU_FANOUT_1	      (RCU_FANOUT_LEAF)
#define RCU_FANOUT_2	      (RCU_FANOUT_1 * CONFIG_RCU_FANOUT)
#define RCU_FANOUT_3	      (RCU_FANOUT_2 * CONFIG_RCU_FANOUT)
#define RCU_FANOUT_4	      (RCU_FANOUT_3 * CONFIG_RCU_FANOUT)

#if NR_CPUS <= RCU_FANOUT_1
#  define NUM_RCU_LVLS	      1
#  define NUM_RCU_LVL_0	      1
#  define NUM_RCU_LVL_1	      (NR_CPUS)
#  define NUM_RCU_LVL_2	      0
#  define NUM_RCU_LVL_3	      0
#  define NUM_RCU_LVL_4	      0
#elif NR_CPUS <= RCU_FANOUT_2
#  define NUM_RCU_LVLS	      2
#  define NUM_RCU_LVL_0	      1
#  define NUM_RCU_LVL_1	      DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
#  define NUM_RCU_LVL_2	      (NR_CPUS)
#  define NUM_RCU_LVL_3	      0
#  define NUM_RCU_LVL_4	      0
#elif NR_CPUS <= RCU_FANOUT_3
#  define NUM_RCU_LVLS	      3
#  define NUM_RCU_LVL_0	      1
#  define NUM_RCU_LVL_1	      DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
#  define NUM_RCU_LVL_2	      DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
#  define NUM_RCU_LVL_3	      (NR_CPUS)
#  define NUM_RCU_LVL_4	      0
#elif NR_CPUS <= RCU_FANOUT_4
#  define NUM_RCU_LVLS	      4
#  define NUM_RCU_LVL_0	      1
#  define NUM_RCU_LVL_1	      DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3)
#  define NUM_RCU_LVL_2	      DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
#  define NUM_RCU_LVL_3	      DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
#  define NUM_RCU_LVL_4	      (NR_CPUS)
#else
# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
#endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */

#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4)
#define NUM_RCU_NODES (RCU_SUM - NR_CPUS)
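As a worked example of these formulas, assume CONFIG_RCU_FANOUT=16 and NR_CPUS=4096 (values chosen purely for illustration):

/*
 * CONFIG_RCU_FANOUT=16, NR_CPUS=4096 (illustrative values):
 *
 *   RCU_FANOUT_LEAF = 16            (16 is not > 16, so no cap applies)
 *   RCU_FANOUT_1 = 16
 *   RCU_FANOUT_2 = 16 * 16  = 256
 *   RCU_FANOUT_3 = 256 * 16 = 4096
 *
 * NR_CPUS <= RCU_FANOUT_3, so NUM_RCU_LVLS = 3 and:
 *
 *   NUM_RCU_LVL_0 = 1                           (the root rcu_node)
 *   NUM_RCU_LVL_1 = DIV_ROUND_UP(4096, 256) = 16
 *   NUM_RCU_LVL_2 = DIV_ROUND_UP(4096, 16)  = 256
 *   NUM_RCU_LVL_3 = 4096                        (one rcu_data per CPU)
 *
 * RCU_SUM = 1 + 16 + 256 + 4096 = 4369, so
 * NUM_RCU_NODES = 4369 - 4096 = 273 rcu_node structures.
 */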
@@ -203,8 +208,8 @@ struct rcu_data {
	long qlen_last_fqs_check;
					/* qlen at last check for QS forcing */
	unsigned long n_cbs_invoked;	/* count of RCU cbs invoked. */
	unsigned long n_cbs_orphaned;	/* RCU cbs orphaned by dying CPU */
	unsigned long n_cbs_adopted;	/* RCU cbs adopted from dying CPU */
	unsigned long n_force_qs_snap;
					/* did other CPU force QS recently? */
	long blimit;			/* Upper limit on a processed batch */
@@ -309,15 +314,7 @@ struct rcu_state {
	/* End of fields guarded by root rcu_node's lock. */

	raw_spinlock_t onofflock;	/* exclude on/offline and */
					/* starting new GP. */
/* protects the following */
/* orphan_cbs fields. */
struct rcu_head *orphan_cbs_list; /* list of rcu_head structs */
/* orphaned by all CPUs in */
/* a given leaf rcu_node */
/* going offline. */
struct rcu_head **orphan_cbs_tail; /* And tail pointer. */
long orphan_qlen; /* Number of orphaned cbs. */
	raw_spinlock_t fqslock;		/* Only one task forcing */
					/* quiescent states. */
	unsigned long jiffies_force_qs;	/* Time at which to invoke */
@@ -390,7 +387,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp);
static int rcu_preempt_pending(int cpu);
static int rcu_preempt_needs_cpu(int cpu);
static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
static void rcu_preempt_send_cbs_to_online(void);
static void __init __rcu_init_preempt(void);
static void rcu_needs_cpu_flush(void);
@@ -25,6 +25,7 @@
 */
#include <linux/delay.h>
#include <linux/stop_machine.h>
/*
 * Check the RCU kernel configuration parameters and print informative
@@ -773,11 +774,11 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
}
/*
 * Move preemptable RCU's callbacks from dying CPU to other online CPU.
 */
static void rcu_preempt_send_cbs_to_online(void)
{
	rcu_send_cbs_to_online(&rcu_preempt_state);
}
/*
@@ -1001,7 +1002,7 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
/*
 * Because there is no preemptable RCU, there are no callbacks to move.
 */
static void rcu_preempt_send_cbs_to_online(void)
{
}
@@ -1014,6 +1015,132 @@ static void __init __rcu_init_preempt(void)
#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
#ifndef CONFIG_SMP
void synchronize_sched_expedited(void)
{
cond_resched();
}
EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
#else /* #ifndef CONFIG_SMP */
static atomic_t sync_sched_expedited_started = ATOMIC_INIT(0);
static atomic_t sync_sched_expedited_done = ATOMIC_INIT(0);
static int synchronize_sched_expedited_cpu_stop(void *data)
{
/*
* There must be a full memory barrier on each affected CPU
* between the time that try_stop_cpus() is called and the
* time that it returns.
*
* In the current initial implementation of cpu_stop, the
* above condition is already met when the control reaches
* this point and the following smp_mb() is not strictly
* necessary. Do smp_mb() anyway for documentation and
* robustness against future implementation changes.
*/
smp_mb(); /* See above comment block. */
return 0;
}
/*
* Wait for an rcu-sched grace period to elapse, but use "big hammer"
* approach to force grace period to end quickly. This consumes
* significant time on all CPUs, and is thus not recommended for
* any sort of common-case code.
*
* Note that it is illegal to call this function while holding any
* lock that is acquired by a CPU-hotplug notifier. Failing to
* observe this restriction will result in deadlock.
*
* This implementation can be thought of as an application of ticket
* locking to RCU, with sync_sched_expedited_started and
* sync_sched_expedited_done taking on the roles of the halves
* of the ticket-lock word. Each task atomically increments
* sync_sched_expedited_started upon entry, snapshotting the old value,
* then attempts to stop all the CPUs. If this succeeds, then each
* CPU will have executed a context switch, resulting in an RCU-sched
* grace period. We are then done, so we use atomic_cmpxchg() to
* update sync_sched_expedited_done to match our snapshot -- but
* only if someone else has not already advanced past our snapshot.
*
* On the other hand, if try_stop_cpus() fails, we check the value
* of sync_sched_expedited_done. If it has advanced past our
* initial snapshot, then someone else must have forced a grace period
* some time after we took our snapshot. In this case, our work is
* done for us, and we can simply return. Otherwise, we try again,
* but keep our initial snapshot for purposes of checking for someone
* doing our work for us.
*
* If we fail too many times in a row, we fall back to synchronize_sched().
*/
void synchronize_sched_expedited(void)
{
int firstsnap, s, snap, trycount = 0;
/* Note that atomic_inc_return() implies full memory barrier. */
firstsnap = snap = atomic_inc_return(&sync_sched_expedited_started);
get_online_cpus();
/*
* Each pass through the following loop attempts to force a
* context switch on each CPU.
*/
while (try_stop_cpus(cpu_online_mask,
synchronize_sched_expedited_cpu_stop,
NULL) == -EAGAIN) {
put_online_cpus();
/* No joy, try again later. Or just synchronize_sched(). */
if (trycount++ < 10)
udelay(trycount * num_online_cpus());
else {
synchronize_sched();
return;
}
/* Check to see if someone else did our work for us. */
s = atomic_read(&sync_sched_expedited_done);
if (UINT_CMP_GE((unsigned)s, (unsigned)firstsnap)) {
smp_mb(); /* ensure test happens before caller kfree */
return;
}
/*
* Refetching sync_sched_expedited_started allows later
* callers to piggyback on our grace period. We subtract
* 1 to get the same token that the last incrementer got.
* We retry after they started, so our grace period works
* for them, and they started after our first try, so their
* grace period works for us.
*/
get_online_cpus();
snap = atomic_read(&sync_sched_expedited_started) - 1;
smp_mb(); /* ensure read is before try_stop_cpus(). */
}
/*
* Everyone up to our most recent fetch is covered by our grace
* period. Update the counter, but only if our work is still
* relevant -- which it won't be if someone who started later
* than we did beat us to the punch.
*/
do {
s = atomic_read(&sync_sched_expedited_done);
if (UINT_CMP_GE((unsigned)s, (unsigned)snap)) {
smp_mb(); /* ensure test happens before caller kfree */
break;
}
} while (atomic_cmpxchg(&sync_sched_expedited_done, s, snap) != s);
put_online_cpus();
}
EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
#endif /* #else #ifndef CONFIG_SMP */
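The started/done pairing documented above behaves like the two halves of a ticket lock. The following userspace sketch (C11 atomics, hypothetical names) condenses the idea; it is slightly simplified and is not the kernel implementation -- for instance, the kernel subtracts one when refreshing its snapshot and falls back to synchronize_sched() after repeated failures:

#include <limits.h>
#include <stdatomic.h>

#define UINT_CMP_GE(a, b) (UINT_MAX / 2 >= (a) - (b))

static atomic_uint started;	/* ticket taken on entry */
static atomic_uint done;	/* last snapshot known complete */

int try_grace_period(void);	/* hypothetical stand-in for try_stop_cpus() */

void expedite_sketch(void)
{
	unsigned int s;
	unsigned int firstsnap = atomic_fetch_add(&started, 1) + 1;
	unsigned int snap = firstsnap;

	while (!try_grace_period()) {
		/* Someone who finished after we arrived covers us. */
		if (UINT_CMP_GE(atomic_load(&done), firstsnap))
			return;
		/* Refresh snap so later arrivals can piggyback on us. */
		snap = atomic_load(&started);
	}
	/* Success: advance "done" to our snapshot, never backwards. */
	do {
		s = atomic_load(&done);
		if (UINT_CMP_GE(s, snap))
			break;	/* a later finisher already advanced it */
	} while (!atomic_compare_exchange_weak(&done, &s, snap));
}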
#if !defined(CONFIG_RCU_FAST_NO_HZ)
/*
@@ -166,13 +166,13 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
	gpnum = rsp->gpnum;
	seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x "
"nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld\n", "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n",
		   rsp->completed, gpnum, rsp->signaled,
		   (long)(rsp->jiffies_force_qs - jiffies),
		   (int)(jiffies & 0xffff),
		   rsp->n_force_qs, rsp->n_force_qs_ngp,
		   rsp->n_force_qs - rsp->n_force_qs_ngp,
		   rsp->n_force_qs_lh);
	for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) {
		if (rnp->level != level) {
			seq_puts(m, "\n");
@@ -300,7 +300,7 @@ static const struct file_operations rcu_pending_fops = {
static struct dentry *rcudir;

static int __init rcutree_trace_init(void)
{
	struct dentry *retval;
@@ -337,14 +337,14 @@ static int __init rcuclassic_trace_init(void)
	return 1;
}

static void __exit rcutree_trace_cleanup(void)
{
	debugfs_remove_recursive(rcudir);
}

module_init(rcutree_trace_init);
module_exit(rcutree_trace_cleanup);
MODULE_AUTHOR("Paul E. McKenney");
MODULE_DESCRIPTION("Read-Copy Update tracing for hierarchical implementation");
@@ -9534,72 +9534,3 @@ struct cgroup_subsys cpuacct_subsys = {
};
#endif /* CONFIG_CGROUP_CPUACCT */
#ifndef CONFIG_SMP
void synchronize_sched_expedited(void)
{
barrier();
}
EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
#else /* #ifndef CONFIG_SMP */
static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0);
static int synchronize_sched_expedited_cpu_stop(void *data)
{
/*
* There must be a full memory barrier on each affected CPU
* between the time that try_stop_cpus() is called and the
* time that it returns.
*
* In the current initial implementation of cpu_stop, the
* above condition is already met when the control reaches
* this point and the following smp_mb() is not strictly
* necessary. Do smp_mb() anyway for documentation and
* robustness against future implementation changes.
*/
smp_mb(); /* See above comment block. */
return 0;
}
/*
* Wait for an rcu-sched grace period to elapse, but use "big hammer"
* approach to force grace period to end quickly. This consumes
* significant time on all CPUs, and is thus not recommended for
* any sort of common-case code.
*
* Note that it is illegal to call this function while holding any
* lock that is acquired by a CPU-hotplug notifier. Failing to
* observe this restriction will result in deadlock.
*/
void synchronize_sched_expedited(void)
{
int snap, trycount = 0;
smp_mb(); /* ensure prior mod happens before capturing snap. */
snap = atomic_read(&synchronize_sched_expedited_count) + 1;
get_online_cpus();
while (try_stop_cpus(cpu_online_mask,
synchronize_sched_expedited_cpu_stop,
NULL) == -EAGAIN) {
put_online_cpus();
if (trycount++ < 10)
udelay(trycount * num_online_cpus());
else {
synchronize_sched();
return;
}
if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) {
smp_mb(); /* ensure test happens before caller kfree */
return;
}
get_online_cpus();
}
atomic_inc(&synchronize_sched_expedited_count);
smp_mb__after_atomic_inc(); /* ensure post-GP actions seen after GP. */
put_online_cpus();
}
EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
#endif /* #else #ifndef CONFIG_SMP */
@@ -31,6 +31,7 @@
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/smp.h>
#include <linux/delay.h>
#include <linux/srcu.h>

static int init_srcu_struct_fields(struct srcu_struct *sp)
@@ -203,9 +204,14 @@ static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
	 * all srcu_read_lock() calls using the old counters have completed.
	 * Their corresponding critical sections might well be still
	 * executing, but the srcu_read_lock() primitives themselves
	 * will have finished executing.  We initially give readers
* an arbitrarily chosen 10 microseconds to get out of their
* SRCU read-side critical sections, then loop waiting 1/HZ
* seconds per iteration.
	 */
if (srcu_readers_active_idx(sp, idx))
udelay(CONFIG_SRCU_SYNCHRONIZE_DELAY);
	while (srcu_readers_active_idx(sp, idx))
		schedule_timeout_interruptible(1);
......