Commit 9621fbee authored by Kalesh Singh's avatar Kalesh Singh Committed by Paul E. McKenney

rcu: Move expedited grace period (GP) work to RT kthread_worker

Enabling CONFIG_RCU_BOOST did not reduce RCU expedited grace-period
latency because its workqueues run at SCHED_OTHER, and thus can be
delayed by normal processes.  This commit avoids these delays by moving
the expedited GP work items to a real-time-priority kthread_worker.

This option is controlled by CONFIG_RCU_EXP_KTHREAD and disabled by
default on PREEMPT_RT=y kernels which disable expedited grace periods
after boot by unconditionally setting rcupdate.rcu_normal_after_boot=1.

The results were evaluated on arm64 Android devices (6GB ram) running
5.10 kernel, and capturing trace data in critical user-level code.

The table below shows the resulting order-of-magnitude improvements
in synchronize_rcu_expedited() latency:

------------------------------------------------------------------------
|                          |   workqueues  |  kthread_worker |  Diff   |
------------------------------------------------------------------------
| Count                    |          725  |            688  |         |
------------------------------------------------------------------------
| Min Duration       (ns)  |          326  |            447  |  37.12% |
------------------------------------------------------------------------
| Q1                 (ns)  |       39,428  |         38,971  |  -1.16% |
------------------------------------------------------------------------
| Q2 - Median        (ns)  |       98,225  |         69,743  | -29.00% |
------------------------------------------------------------------------
| Q3                 (ns)  |      342,122  |        126,638  | -62.98% |
------------------------------------------------------------------------
| Max Duration       (ns)  |  372,766,967  |      2,329,671  | -99.38% |
------------------------------------------------------------------------
| Avg Duration       (ns)  |    2,746,353  |        151,242  | -94.49% |
------------------------------------------------------------------------
| Standard Deviation (ns)  |   19,327,765  |        294,408  |         |
------------------------------------------------------------------------

The below table show the range of maximums/minimums for
synchronize_rcu_expedited() latency from all experiments:

------------------------------------------------------------------------
|                          |   workqueues  |  kthread_worker |  Diff   |
------------------------------------------------------------------------
| Total No. of Experiments |           25  |             23  |         |
------------------------------------------------------------------------
| Largest  Maximum   (ns)  |  372,766,967  |      2,329,671  | -99.38% |
------------------------------------------------------------------------
| Smallest Maximum   (ns)  |       38,819  |         86,954  | 124.00% |
------------------------------------------------------------------------
| Range of Maximums  (ns)  |  372,728,148  |      2,242,717  |         |
------------------------------------------------------------------------
| Largest  Minimum   (ns)  |       88,623  |         27,588  | -68.87% |
------------------------------------------------------------------------
| Smallest Minimum   (ns)  |          326  |            447  |  37.12% |
------------------------------------------------------------------------
| Range of Minimums  (ns)  |       88,297  |         27,141  |         |
------------------------------------------------------------------------

Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Reported-by: default avatarTim Murray <timmurray@google.com>
Reported-by: default avatarWei Wang <wvw@google.com>
Tested-by: default avatarKyle Lin <kylelin@google.com>
Tested-by: default avatarChunwei Lu <chunweilu@google.com>
Tested-by: default avatarLulu Wang <luluw@google.com>
Signed-off-by: default avatarKalesh Singh <kaleshsingh@google.com>
Signed-off-by: default avatarPaul E. McKenney <paulmck@kernel.org>
parent 28b3ae42
...@@ -195,6 +195,20 @@ config RCU_BOOST_DELAY ...@@ -195,6 +195,20 @@ config RCU_BOOST_DELAY
Accept the default if unsure. Accept the default if unsure.
config RCU_EXP_KTHREAD
bool "Perform RCU expedited work in a real-time kthread"
depends on RCU_BOOST && RCU_EXPERT
default !PREEMPT_RT && NR_CPUS <= 32
help
Use this option to further reduce the latencies of expedited
grace periods at the expense of being more disruptive.
This option is disabled by default on PREEMPT_RT=y kernels which
disable expedited grace periods after boot by unconditionally
setting rcupdate.rcu_normal_after_boot=1.
Accept the default if unsure.
config RCU_NOCB_CPU config RCU_NOCB_CPU
bool "Offload RCU callback processing from boot-selected CPUs" bool "Offload RCU callback processing from boot-selected CPUs"
depends on TREE_RCU depends on TREE_RCU
......
...@@ -536,7 +536,12 @@ int rcu_get_gp_kthreads_prio(void); ...@@ -536,7 +536,12 @@ int rcu_get_gp_kthreads_prio(void);
void rcu_fwd_progress_check(unsigned long j); void rcu_fwd_progress_check(unsigned long j);
void rcu_force_quiescent_state(void); void rcu_force_quiescent_state(void);
extern struct workqueue_struct *rcu_gp_wq; extern struct workqueue_struct *rcu_gp_wq;
#ifdef CONFIG_RCU_EXP_KTHREAD
extern struct kthread_worker *rcu_exp_gp_kworker;
extern struct kthread_worker *rcu_exp_par_gp_kworker;
#else /* !CONFIG_RCU_EXP_KTHREAD */
extern struct workqueue_struct *rcu_par_gp_wq; extern struct workqueue_struct *rcu_par_gp_wq;
#endif /* CONFIG_RCU_EXP_KTHREAD */
#endif /* #else #ifdef CONFIG_TINY_RCU */ #endif /* #else #ifdef CONFIG_TINY_RCU */
#ifdef CONFIG_RCU_NOCB_CPU #ifdef CONFIG_RCU_NOCB_CPU
......
...@@ -4471,6 +4471,51 @@ static int rcu_pm_notify(struct notifier_block *self, ...@@ -4471,6 +4471,51 @@ static int rcu_pm_notify(struct notifier_block *self,
return NOTIFY_OK; return NOTIFY_OK;
} }
#ifdef CONFIG_RCU_EXP_KTHREAD
struct kthread_worker *rcu_exp_gp_kworker;
struct kthread_worker *rcu_exp_par_gp_kworker;
static void __init rcu_start_exp_gp_kworkers(void)
{
const char *par_gp_kworker_name = "rcu_exp_par_gp_kthread_worker";
const char *gp_kworker_name = "rcu_exp_gp_kthread_worker";
struct sched_param param = { .sched_priority = kthread_prio };
rcu_exp_gp_kworker = kthread_create_worker(0, gp_kworker_name);
if (IS_ERR_OR_NULL(rcu_exp_gp_kworker)) {
pr_err("Failed to create %s!\n", gp_kworker_name);
return;
}
rcu_exp_par_gp_kworker = kthread_create_worker(0, par_gp_kworker_name);
if (IS_ERR_OR_NULL(rcu_exp_par_gp_kworker)) {
pr_err("Failed to create %s!\n", par_gp_kworker_name);
kthread_destroy_worker(rcu_exp_gp_kworker);
return;
}
sched_setscheduler_nocheck(rcu_exp_gp_kworker->task, SCHED_FIFO, &param);
sched_setscheduler_nocheck(rcu_exp_par_gp_kworker->task, SCHED_FIFO,
&param);
}
static inline void rcu_alloc_par_gp_wq(void)
{
}
#else /* !CONFIG_RCU_EXP_KTHREAD */
struct workqueue_struct *rcu_par_gp_wq;
static void __init rcu_start_exp_gp_kworkers(void)
{
}
static inline void rcu_alloc_par_gp_wq(void)
{
rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0);
WARN_ON(!rcu_par_gp_wq);
}
#endif /* CONFIG_RCU_EXP_KTHREAD */
/* /*
* Spawn the kthreads that handle RCU's grace periods. * Spawn the kthreads that handle RCU's grace periods.
*/ */
...@@ -4500,6 +4545,8 @@ static int __init rcu_spawn_gp_kthread(void) ...@@ -4500,6 +4545,8 @@ static int __init rcu_spawn_gp_kthread(void)
rcu_spawn_nocb_kthreads(); rcu_spawn_nocb_kthreads();
rcu_spawn_boost_kthreads(); rcu_spawn_boost_kthreads();
rcu_spawn_core_kthreads(); rcu_spawn_core_kthreads();
/* Create kthread worker for expedited GPs */
rcu_start_exp_gp_kworkers();
return 0; return 0;
} }
early_initcall(rcu_spawn_gp_kthread); early_initcall(rcu_spawn_gp_kthread);
...@@ -4745,7 +4792,6 @@ static void __init rcu_dump_rcu_node_tree(void) ...@@ -4745,7 +4792,6 @@ static void __init rcu_dump_rcu_node_tree(void)
} }
struct workqueue_struct *rcu_gp_wq; struct workqueue_struct *rcu_gp_wq;
struct workqueue_struct *rcu_par_gp_wq;
static void __init kfree_rcu_batch_init(void) static void __init kfree_rcu_batch_init(void)
{ {
...@@ -4811,8 +4857,7 @@ void __init rcu_init(void) ...@@ -4811,8 +4857,7 @@ void __init rcu_init(void)
/* Create workqueue for Tree SRCU and for expedited GPs. */ /* Create workqueue for Tree SRCU and for expedited GPs. */
rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0); rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0);
WARN_ON(!rcu_gp_wq); WARN_ON(!rcu_gp_wq);
rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0); rcu_alloc_par_gp_wq();
WARN_ON(!rcu_par_gp_wq);
/* Fill in default value for rcutree.qovld boot parameter. */ /* Fill in default value for rcutree.qovld boot parameter. */
/* -After- the rcu_node ->lock fields are initialized! */ /* -After- the rcu_node ->lock fields are initialized! */
......
...@@ -10,6 +10,7 @@ ...@@ -10,6 +10,7 @@
*/ */
#include <linux/cache.h> #include <linux/cache.h>
#include <linux/kthread.h>
#include <linux/spinlock.h> #include <linux/spinlock.h>
#include <linux/rtmutex.h> #include <linux/rtmutex.h>
#include <linux/threads.h> #include <linux/threads.h>
...@@ -23,7 +24,11 @@ ...@@ -23,7 +24,11 @@
/* Communicate arguments to a workqueue handler. */ /* Communicate arguments to a workqueue handler. */
struct rcu_exp_work { struct rcu_exp_work {
unsigned long rew_s; unsigned long rew_s;
#ifdef CONFIG_RCU_EXP_KTHREAD
struct kthread_work rew_work;
#else
struct work_struct rew_work; struct work_struct rew_work;
#endif /* CONFIG_RCU_EXP_KTHREAD */
}; };
/* RCU's kthread states for tracing. */ /* RCU's kthread states for tracing. */
......
...@@ -334,15 +334,13 @@ static bool exp_funnel_lock(unsigned long s) ...@@ -334,15 +334,13 @@ static bool exp_funnel_lock(unsigned long s)
* Select the CPUs within the specified rcu_node that the upcoming * Select the CPUs within the specified rcu_node that the upcoming
* expedited grace period needs to wait for. * expedited grace period needs to wait for.
*/ */
static void sync_rcu_exp_select_node_cpus(struct work_struct *wp) static void __sync_rcu_exp_select_node_cpus(struct rcu_exp_work *rewp)
{ {
int cpu; int cpu;
unsigned long flags; unsigned long flags;
unsigned long mask_ofl_test; unsigned long mask_ofl_test;
unsigned long mask_ofl_ipi; unsigned long mask_ofl_ipi;
int ret; int ret;
struct rcu_exp_work *rewp =
container_of(wp, struct rcu_exp_work, rew_work);
struct rcu_node *rnp = container_of(rewp, struct rcu_node, rew); struct rcu_node *rnp = container_of(rewp, struct rcu_node, rew);
raw_spin_lock_irqsave_rcu_node(rnp, flags); raw_spin_lock_irqsave_rcu_node(rnp, flags);
...@@ -417,13 +415,119 @@ static void sync_rcu_exp_select_node_cpus(struct work_struct *wp) ...@@ -417,13 +415,119 @@ static void sync_rcu_exp_select_node_cpus(struct work_struct *wp)
rcu_report_exp_cpu_mult(rnp, mask_ofl_test, false); rcu_report_exp_cpu_mult(rnp, mask_ofl_test, false);
} }
static void rcu_exp_sel_wait_wake(unsigned long s);
#ifdef CONFIG_RCU_EXP_KTHREAD
static void sync_rcu_exp_select_node_cpus(struct kthread_work *wp)
{
struct rcu_exp_work *rewp =
container_of(wp, struct rcu_exp_work, rew_work);
__sync_rcu_exp_select_node_cpus(rewp);
}
static inline bool rcu_gp_par_worker_started(void)
{
return !!READ_ONCE(rcu_exp_par_gp_kworker);
}
static inline void sync_rcu_exp_select_cpus_queue_work(struct rcu_node *rnp)
{
kthread_init_work(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus);
/*
* Use rcu_exp_par_gp_kworker, because flushing a work item from
* another work item on the same kthread worker can result in
* deadlock.
*/
kthread_queue_work(rcu_exp_par_gp_kworker, &rnp->rew.rew_work);
}
static inline void sync_rcu_exp_select_cpus_flush_work(struct rcu_node *rnp)
{
kthread_flush_work(&rnp->rew.rew_work);
}
/*
* Work-queue handler to drive an expedited grace period forward.
*/
static void wait_rcu_exp_gp(struct kthread_work *wp)
{
struct rcu_exp_work *rewp;
rewp = container_of(wp, struct rcu_exp_work, rew_work);
rcu_exp_sel_wait_wake(rewp->rew_s);
}
static inline void synchronize_rcu_expedited_queue_work(struct rcu_exp_work *rew)
{
kthread_init_work(&rew->rew_work, wait_rcu_exp_gp);
kthread_queue_work(rcu_exp_gp_kworker, &rew->rew_work);
}
static inline void synchronize_rcu_expedited_destroy_work(struct rcu_exp_work *rew)
{
}
#else /* !CONFIG_RCU_EXP_KTHREAD */
static void sync_rcu_exp_select_node_cpus(struct work_struct *wp)
{
struct rcu_exp_work *rewp =
container_of(wp, struct rcu_exp_work, rew_work);
__sync_rcu_exp_select_node_cpus(rewp);
}
static inline bool rcu_gp_par_worker_started(void)
{
return !!READ_ONCE(rcu_par_gp_wq);
}
static inline void sync_rcu_exp_select_cpus_queue_work(struct rcu_node *rnp)
{
int cpu = find_next_bit(&rnp->ffmask, BITS_PER_LONG, -1);
INIT_WORK(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus);
/* If all offline, queue the work on an unbound CPU. */
if (unlikely(cpu > rnp->grphi - rnp->grplo))
cpu = WORK_CPU_UNBOUND;
else
cpu += rnp->grplo;
queue_work_on(cpu, rcu_par_gp_wq, &rnp->rew.rew_work);
}
static inline void sync_rcu_exp_select_cpus_flush_work(struct rcu_node *rnp)
{
flush_work(&rnp->rew.rew_work);
}
/*
* Work-queue handler to drive an expedited grace period forward.
*/
static void wait_rcu_exp_gp(struct work_struct *wp)
{
struct rcu_exp_work *rewp;
rewp = container_of(wp, struct rcu_exp_work, rew_work);
rcu_exp_sel_wait_wake(rewp->rew_s);
}
static inline void synchronize_rcu_expedited_queue_work(struct rcu_exp_work *rew)
{
INIT_WORK_ONSTACK(&rew->rew_work, wait_rcu_exp_gp);
queue_work(rcu_gp_wq, &rew->rew_work);
}
static inline void synchronize_rcu_expedited_destroy_work(struct rcu_exp_work *rew)
{
destroy_work_on_stack(&rew->rew_work);
}
#endif /* CONFIG_RCU_EXP_KTHREAD */
/* /*
* Select the nodes that the upcoming expedited grace period needs * Select the nodes that the upcoming expedited grace period needs
* to wait for. * to wait for.
*/ */
static void sync_rcu_exp_select_cpus(void) static void sync_rcu_exp_select_cpus(void)
{ {
int cpu;
struct rcu_node *rnp; struct rcu_node *rnp;
trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("reset")); trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("reset"));
...@@ -435,28 +539,21 @@ static void sync_rcu_exp_select_cpus(void) ...@@ -435,28 +539,21 @@ static void sync_rcu_exp_select_cpus(void)
rnp->exp_need_flush = false; rnp->exp_need_flush = false;
if (!READ_ONCE(rnp->expmask)) if (!READ_ONCE(rnp->expmask))
continue; /* Avoid early boot non-existent wq. */ continue; /* Avoid early boot non-existent wq. */
if (!READ_ONCE(rcu_par_gp_wq) || if (!rcu_gp_par_worker_started() ||
rcu_scheduler_active != RCU_SCHEDULER_RUNNING || rcu_scheduler_active != RCU_SCHEDULER_RUNNING ||
rcu_is_last_leaf_node(rnp)) { rcu_is_last_leaf_node(rnp)) {
/* No workqueues yet or last leaf, do direct call. */ /* No worker started yet or last leaf, do direct call. */
sync_rcu_exp_select_node_cpus(&rnp->rew.rew_work); sync_rcu_exp_select_node_cpus(&rnp->rew.rew_work);
continue; continue;
} }
INIT_WORK(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus); sync_rcu_exp_select_cpus_queue_work(rnp);
cpu = find_next_bit(&rnp->ffmask, BITS_PER_LONG, -1);
/* If all offline, queue the work on an unbound CPU. */
if (unlikely(cpu > rnp->grphi - rnp->grplo))
cpu = WORK_CPU_UNBOUND;
else
cpu += rnp->grplo;
queue_work_on(cpu, rcu_par_gp_wq, &rnp->rew.rew_work);
rnp->exp_need_flush = true; rnp->exp_need_flush = true;
} }
/* Wait for workqueue jobs (if any) to complete. */ /* Wait for jobs (if any) to complete. */
rcu_for_each_leaf_node(rnp) rcu_for_each_leaf_node(rnp)
if (rnp->exp_need_flush) if (rnp->exp_need_flush)
flush_work(&rnp->rew.rew_work); sync_rcu_exp_select_cpus_flush_work(rnp);
} }
/* /*
...@@ -622,17 +719,6 @@ static void rcu_exp_sel_wait_wake(unsigned long s) ...@@ -622,17 +719,6 @@ static void rcu_exp_sel_wait_wake(unsigned long s)
rcu_exp_wait_wake(s); rcu_exp_wait_wake(s);
} }
/*
* Work-queue handler to drive an expedited grace period forward.
*/
static void wait_rcu_exp_gp(struct work_struct *wp)
{
struct rcu_exp_work *rewp;
rewp = container_of(wp, struct rcu_exp_work, rew_work);
rcu_exp_sel_wait_wake(rewp->rew_s);
}
#ifdef CONFIG_PREEMPT_RCU #ifdef CONFIG_PREEMPT_RCU
/* /*
...@@ -848,20 +934,19 @@ void synchronize_rcu_expedited(void) ...@@ -848,20 +934,19 @@ void synchronize_rcu_expedited(void)
} else { } else {
/* Marshall arguments & schedule the expedited grace period. */ /* Marshall arguments & schedule the expedited grace period. */
rew.rew_s = s; rew.rew_s = s;
INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp); synchronize_rcu_expedited_queue_work(&rew);
queue_work(rcu_gp_wq, &rew.rew_work);
} }
/* Wait for expedited grace period to complete. */ /* Wait for expedited grace period to complete. */
rnp = rcu_get_root(); rnp = rcu_get_root();
wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3], wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3],
sync_exp_work_done(s)); sync_exp_work_done(s));
smp_mb(); /* Workqueue actions happen before return. */ smp_mb(); /* Work actions happen before return. */
/* Let the next expedited grace period start. */ /* Let the next expedited grace period start. */
mutex_unlock(&rcu_state.exp_mutex); mutex_unlock(&rcu_state.exp_mutex);
if (likely(!boottime)) if (likely(!boottime))
destroy_work_on_stack(&rew.rew_work); synchronize_rcu_expedited_destroy_work(&rew);
} }
EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment