Commit d5f177d3 authored by Paul E. McKenney's avatar Paul E. McKenney

rcu-tasks: Add an RCU Tasks Trace to simplify protection of tracing hooks

Because RCU does not watch exception early-entry/late-exit, idle-loop,
or CPU-hotplug execution, protection of tracing and BPF operations is
needlessly complicated.  This commit therefore adds a variant of
Tasks RCU that:

o	Has explicit read-side markers to allow finite grace periods in
	the face of in-kernel loops for PREEMPT=n builds.  These markers
	are rcu_read_lock_trace() and rcu_read_unlock_trace().

o	Protects code in the idle loop, exception entry/exit, and
	CPU-hotplug code paths.  In this respect, RCU-tasks trace is
	similar to SRCU, but with lighter-weight readers.

o	Avoids expensive read-side instruction, having overhead similar
	to that of Preemptible RCU.

There are of course downsides:

o	The grace-period code can send IPIs to CPUs, even when those
	CPUs are in the idle loop or in nohz_full userspace.  This is
	mitigated by later commits.

o	It is necessary to scan the full tasklist, much as for Tasks RCU.

o	There is a single callback queue guarded by a single lock,
	again, much as for Tasks RCU.  However, those early use cases
	that request multiple grace periods in quick succession are
	expected to do so from a single task, which makes the single
	lock almost irrelevant.  If needed, multiple callback queues
	can be provided using any number of schemes.

Perhaps most important, this variant of RCU does not affect the vanilla
flavors, rcu_preempt and rcu_sched.  The fact that RCU Tasks Trace
readers can operate from idle, offline, and exception entry/exit in no
way enables rcu_preempt and rcu_sched readers to do so.

The memory ordering was outlined here:
https://lore.kernel.org/lkml/20200319034030.GX3199@paulmck-ThinkPad-P72/

This effort benefited greatly from off-list discussions of BPF
requirements with Alexei Starovoitov and Andrii Nakryiko.  At least
some of the on-list discussions are captured in the Link: tags below.
In addition, KCSAN was quite helpful in finding some early bugs.

Link: https://lore.kernel.org/lkml/20200219150744.428764577@infradead.org/
Link: https://lore.kernel.org/lkml/87mu8p797b.fsf@nanos.tec.linutronix.de/
Link: https://lore.kernel.org/lkml/20200225221305.605144982@linutronix.de/
Cc: Alexei Starovoitov <alexei.starovoitov@gmail.com>
Cc: Andrii Nakryiko <andriin@fb.com>
[ paulmck: Apply feedback from Steve Rostedt and Joel Fernandes. ]
[ paulmck: Decrement trc_n_readers_need_end upon IPI failure. ]
[ paulmck: Fix locking issue reported by rcutorture. ]
Signed-off-by: default avatarPaul E. McKenney <paulmck@kernel.org>
parent d01aa263
/* SPDX-License-Identifier: GPL-2.0+ */
/*
* Read-Copy Update mechanism for mutual exclusion, adapted for tracing.
*
* Copyright (C) 2020 Paul E. McKenney.
*/
#ifndef __LINUX_RCUPDATE_TRACE_H
#define __LINUX_RCUPDATE_TRACE_H
#include <linux/sched.h>
#include <linux/rcupdate.h>
#ifdef CONFIG_DEBUG_LOCK_ALLOC
extern struct lockdep_map rcu_trace_lock_map;
static inline int rcu_read_lock_trace_held(void)
{
return lock_is_held(&rcu_trace_lock_map);
}
#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
static inline int rcu_read_lock_trace_held(void)
{
return 1;
}
#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
#ifdef CONFIG_TASKS_TRACE_RCU
void rcu_read_unlock_trace_special(struct task_struct *t);
/**
* rcu_read_lock_trace - mark beginning of RCU-trace read-side critical section
*
* When synchronize_rcu_trace() is invoked by one task, then that task
* is guaranteed to block until all other tasks exit their read-side
* critical sections. Similarly, if call_rcu_trace() is invoked on one
* task while other tasks are within RCU read-side critical sections,
* invocation of the corresponding RCU callback is deferred until after
* the all the other tasks exit their critical sections.
*
* For more details, please see the documentation for rcu_read_lock().
*/
static inline void rcu_read_lock_trace(void)
{
struct task_struct *t = current;
WRITE_ONCE(t->trc_reader_nesting, READ_ONCE(t->trc_reader_nesting) + 1);
rcu_lock_acquire(&rcu_trace_lock_map);
}
/**
* rcu_read_unlock_trace - mark end of RCU-trace read-side critical section
*
* Pairs with a preceding call to rcu_read_lock_trace(), and nesting is
* allowed. Invoking a rcu_read_unlock_trace() when there is no matching
* rcu_read_lock_trace() is verboten, and will result in lockdep complaints.
*
* For more details, please see the documentation for rcu_read_unlock().
*/
static inline void rcu_read_unlock_trace(void)
{
int nesting;
struct task_struct *t = current;
rcu_lock_release(&rcu_trace_lock_map);
nesting = READ_ONCE(t->trc_reader_nesting) - 1;
WRITE_ONCE(t->trc_reader_nesting, nesting);
if (likely(!READ_ONCE(t->trc_reader_need_end)) || nesting)
return; // We assume shallow reader nesting.
rcu_read_unlock_trace_special(t);
}
void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func);
void synchronize_rcu_tasks_trace(void);
void rcu_barrier_tasks_trace(void);
#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
#endif /* __LINUX_RCUPDATE_TRACE_H */
......@@ -724,6 +724,14 @@ struct task_struct {
struct list_head rcu_tasks_holdout_list;
#endif /* #ifdef CONFIG_TASKS_RCU */
#ifdef CONFIG_TASKS_TRACE_RCU
int trc_reader_nesting;
int trc_ipi_to_cpu;
bool trc_reader_need_end;
bool trc_reader_checked;
struct list_head trc_holdout_list;
#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
struct sched_info sched_info;
struct list_head tasks;
......
......@@ -141,6 +141,10 @@ struct task_struct init_task
.rcu_tasks_holdout_list = LIST_HEAD_INIT(init_task.rcu_tasks_holdout_list),
.rcu_tasks_idle_cpu = -1,
#endif
#ifdef CONFIG_TASKS_TRACE_RCU
.trc_reader_nesting = 0,
.trc_holdout_list = LIST_HEAD_INIT(init_task.trc_holdout_list),
#endif
#ifdef CONFIG_CPUSETS
.mems_allowed_seq = SEQCNT_ZERO(init_task.mems_allowed_seq),
#endif
......
......@@ -1683,6 +1683,10 @@ static inline void rcu_copy_process(struct task_struct *p)
INIT_LIST_HEAD(&p->rcu_tasks_holdout_list);
p->rcu_tasks_idle_cpu = -1;
#endif /* #ifdef CONFIG_TASKS_RCU */
#ifdef CONFIG_TASKS_TRACE_RCU
p->trc_reader_nesting = 0;
INIT_LIST_HEAD(&p->trc_holdout_list);
#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
}
struct pid *pidfd_pid(const struct file *file)
......
......@@ -71,7 +71,7 @@ config TREE_SRCU
This option selects the full-fledged version of SRCU.
config TASKS_RCU_GENERIC
def_bool TASKS_RCU || TASKS_RUDE_RCU
def_bool TASKS_RCU || TASKS_RUDE_RCU || TASKS_TRACE_RCU
select SRCU
help
This option enables generic infrastructure code supporting
......@@ -93,6 +93,15 @@ config TASKS_RUDE_RCU
switches on all online CPUs, including idle ones, so use
with caution.
config TASKS_TRACE_RCU
def_bool 0
help
This option enables a task-based RCU implementation that uses
explicit rcu_read_lock_trace() read-side markers, and allows
these readers to appear in the idle loop as well as on the CPU
hotplug code paths. It can force IPIs on online CPUs, including
idle ones, so use with caution.
config RCU_STALL_COMMON
def_bool TREE_RCU
help
......
This diff is collapsed.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment