Commit 1018016c authored by Jason Low, committed by Ingo Molnar

sched, timer: Replace spinlocks with atomics in thread_group_cputimer(), to improve scalability

While running a database workload, we found a scalability issue with itimers.

Much of the problem was caused by the thread_group_cputimer spinlock.
Each time we account for group system/user time, we need to obtain a
thread_group_cputimer's spinlock to update the timers. On larger systems
(such as a 16 socket machine), this caused more than 30% of total time
to be spent trying to obtain this kernel lock to update these group timer stats.

This patch converts the timers to 64-bit atomic variables and uses
atomic add to update them without a lock. With this patch, the percent
of total time spent updating thread group cputimer timers was reduced
from 30% down to less than 1%.

Note: On 32-bit systems using the generic 64-bit atomics, this causes
sample_group_cputimer() to take locks 3 times instead of just 1 time.
However, we tested this patch on a 32-bit ARM system using the
generic atomics and did not find the overhead to be much of an issue.
An explanation for why this isn't an issue is that 32-bit systems usually
have small numbers of CPUs, and cacheline contention from extra spinlocks
called periodically is not really apparent on smaller systems.
Signed-off-by: Jason Low <jason.low2@hp.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Aswin Chandramouleeswaran <aswin@hp.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Mike Galbraith <umgwanakikbuti@gmail.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Preeti U Murthy <preeti@linux.vnet.ibm.com>
Cc: Scott J Norton <scott.norton@hp.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Waiman Long <Waiman.Long@hp.com>
Link: http://lkml.kernel.org/r/1430251224-5764-4-git-send-email-jason.low2@hp.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
parent 7e5a2c17
...@@ -50,9 +50,10 @@ extern struct fs_struct init_fs; ...@@ -50,9 +50,10 @@ extern struct fs_struct init_fs;
.cpu_timers = INIT_CPU_TIMERS(sig.cpu_timers), \ .cpu_timers = INIT_CPU_TIMERS(sig.cpu_timers), \
.rlim = INIT_RLIMITS, \ .rlim = INIT_RLIMITS, \
.cputimer = { \ .cputimer = { \
.cputime = INIT_CPUTIME, \ .utime = ATOMIC64_INIT(0), \
.running = 0, \ .stime = ATOMIC64_INIT(0), \
.lock = __RAW_SPIN_LOCK_UNLOCKED(sig.cputimer.lock), \ .sum_exec_runtime = ATOMIC64_INIT(0), \
.running = 0 \
}, \ }, \
.cred_guard_mutex = \ .cred_guard_mutex = \
__MUTEX_INITIALIZER(sig.cred_guard_mutex), \ __MUTEX_INITIALIZER(sig.cred_guard_mutex), \
......
...@@ -598,9 +598,10 @@ struct task_cputime { ...@@ -598,9 +598,10 @@ struct task_cputime {
* used for thread group CPU timer calculations. * used for thread group CPU timer calculations.
*/ */
struct thread_group_cputimer { struct thread_group_cputimer {
struct task_cputime cputime; atomic64_t utime;
atomic64_t stime;
atomic64_t sum_exec_runtime;
int running; int running;
raw_spinlock_t lock;
}; };
#include <linux/rwsem.h> #include <linux/rwsem.h>
...@@ -2967,11 +2968,6 @@ static __always_inline bool need_resched(void) ...@@ -2967,11 +2968,6 @@ static __always_inline bool need_resched(void)
void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times); void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times);
void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times); void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times);
static inline void thread_group_cputime_init(struct signal_struct *sig)
{
raw_spin_lock_init(&sig->cputimer.lock);
}
/* /*
* Reevaluate whether the task has signals pending delivery. * Reevaluate whether the task has signals pending delivery.
* Wake the task if so. * Wake the task if so.
......
...@@ -1091,9 +1091,6 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig) ...@@ -1091,9 +1091,6 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
{ {
unsigned long cpu_limit; unsigned long cpu_limit;
/* Thread group counters. */
thread_group_cputime_init(sig);
cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
if (cpu_limit != RLIM_INFINITY) { if (cpu_limit != RLIM_INFINITY) {
sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit); sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit);
......
...@@ -174,7 +174,8 @@ static inline bool cputimer_running(struct task_struct *tsk) ...@@ -174,7 +174,8 @@ static inline bool cputimer_running(struct task_struct *tsk)
{ {
struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
if (!cputimer->running) /* Check if cputimer isn't running. This is accessed without locking. */
if (!READ_ONCE(cputimer->running))
return false; return false;
/* /*
...@@ -215,9 +216,7 @@ static inline void account_group_user_time(struct task_struct *tsk, ...@@ -215,9 +216,7 @@ static inline void account_group_user_time(struct task_struct *tsk,
if (!cputimer_running(tsk)) if (!cputimer_running(tsk))
return; return;
raw_spin_lock(&cputimer->lock); atomic64_add(cputime, &cputimer->utime);
cputimer->cputime.utime += cputime;
raw_spin_unlock(&cputimer->lock);
} }
/** /**
...@@ -238,9 +237,7 @@ static inline void account_group_system_time(struct task_struct *tsk, ...@@ -238,9 +237,7 @@ static inline void account_group_system_time(struct task_struct *tsk,
if (!cputimer_running(tsk)) if (!cputimer_running(tsk))
return; return;
raw_spin_lock(&cputimer->lock); atomic64_add(cputime, &cputimer->stime);
cputimer->cputime.stime += cputime;
raw_spin_unlock(&cputimer->lock);
} }
/** /**
...@@ -261,7 +258,5 @@ static inline void account_group_exec_runtime(struct task_struct *tsk, ...@@ -261,7 +258,5 @@ static inline void account_group_exec_runtime(struct task_struct *tsk,
if (!cputimer_running(tsk)) if (!cputimer_running(tsk))
return; return;
raw_spin_lock(&cputimer->lock); atomic64_add(ns, &cputimer->sum_exec_runtime);
cputimer->cputime.sum_exec_runtime += ns;
raw_spin_unlock(&cputimer->lock);
} }
...@@ -196,39 +196,62 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p, ...@@ -196,39 +196,62 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
return 0; return 0;
} }
static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b) /*
* Set cputime to sum_cputime if sum_cputime > cputime. Use cmpxchg
* to avoid race conditions with concurrent updates to cputime.
*/
static inline void __update_gt_cputime(atomic64_t *cputime, u64 sum_cputime)
{ {
if (b->utime > a->utime) u64 curr_cputime;
a->utime = b->utime; retry:
curr_cputime = atomic64_read(cputime);
if (sum_cputime > curr_cputime) {
if (atomic64_cmpxchg(cputime, curr_cputime, sum_cputime) != curr_cputime)
goto retry;
}
}
if (b->stime > a->stime) static void update_gt_cputime(struct thread_group_cputimer *cputimer, struct task_cputime *sum)
a->stime = b->stime; {
__update_gt_cputime(&cputimer->utime, sum->utime);
__update_gt_cputime(&cputimer->stime, sum->stime);
__update_gt_cputime(&cputimer->sum_exec_runtime, sum->sum_exec_runtime);
}
if (b->sum_exec_runtime > a->sum_exec_runtime) /* Sample thread_group_cputimer values in "cputimer", store results in "times". */
a->sum_exec_runtime = b->sum_exec_runtime; static inline void sample_group_cputimer(struct task_cputime *times,
struct thread_group_cputimer *cputimer)
{
times->utime = atomic64_read(&cputimer->utime);
times->stime = atomic64_read(&cputimer->stime);
times->sum_exec_runtime = atomic64_read(&cputimer->sum_exec_runtime);
} }
void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times) void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
{ {
struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
struct task_cputime sum; struct task_cputime sum;
unsigned long flags;
if (!cputimer->running) { /* Check if cputimer isn't running. This is accessed without locking. */
if (!READ_ONCE(cputimer->running)) {
/* /*
* The POSIX timer interface allows for absolute time expiry * The POSIX timer interface allows for absolute time expiry
* values through the TIMER_ABSTIME flag, therefore we have * values through the TIMER_ABSTIME flag, therefore we have
* to synchronize the timer to the clock every time we start * to synchronize the timer to the clock every time we start it.
* it.
*/ */
thread_group_cputime(tsk, &sum); thread_group_cputime(tsk, &sum);
raw_spin_lock_irqsave(&cputimer->lock, flags); update_gt_cputime(cputimer, &sum);
cputimer->running = 1;
update_gt_cputime(&cputimer->cputime, &sum); /*
} else * We're setting cputimer->running without a lock. Ensure
raw_spin_lock_irqsave(&cputimer->lock, flags); * this only gets written to in one operation. We set
*times = cputimer->cputime; * running after update_gt_cputime() as a small optimization,
raw_spin_unlock_irqrestore(&cputimer->lock, flags); * but barriers are not required because update_gt_cputime()
* can handle concurrent updates.
*/
WRITE_ONCE(cputimer->running, 1);
}
sample_group_cputimer(times, cputimer);
} }
/* /*
...@@ -582,7 +605,8 @@ bool posix_cpu_timers_can_stop_tick(struct task_struct *tsk) ...@@ -582,7 +605,8 @@ bool posix_cpu_timers_can_stop_tick(struct task_struct *tsk)
if (!task_cputime_zero(&tsk->cputime_expires)) if (!task_cputime_zero(&tsk->cputime_expires))
return false; return false;
if (tsk->signal->cputimer.running) /* Check if cputimer is running. This is accessed without locking. */
if (READ_ONCE(tsk->signal->cputimer.running))
return false; return false;
return true; return true;
...@@ -882,14 +906,12 @@ static void check_thread_timers(struct task_struct *tsk, ...@@ -882,14 +906,12 @@ static void check_thread_timers(struct task_struct *tsk,
} }
} }
static void stop_process_timers(struct signal_struct *sig) static inline void stop_process_timers(struct signal_struct *sig)
{ {
struct thread_group_cputimer *cputimer = &sig->cputimer; struct thread_group_cputimer *cputimer = &sig->cputimer;
unsigned long flags;
raw_spin_lock_irqsave(&cputimer->lock, flags); /* Turn off cputimer->running. This is done without locking. */
cputimer->running = 0; WRITE_ONCE(cputimer->running, 0);
raw_spin_unlock_irqrestore(&cputimer->lock, flags);
} }
static u32 onecputick; static u32 onecputick;
...@@ -1111,12 +1133,11 @@ static inline int fastpath_timer_check(struct task_struct *tsk) ...@@ -1111,12 +1133,11 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
} }
sig = tsk->signal; sig = tsk->signal;
if (sig->cputimer.running) { /* Check if cputimer is running. This is accessed without locking. */
if (READ_ONCE(sig->cputimer.running)) {
struct task_cputime group_sample; struct task_cputime group_sample;
raw_spin_lock(&sig->cputimer.lock); sample_group_cputimer(&group_sample, &sig->cputimer);
group_sample = sig->cputimer.cputime;
raw_spin_unlock(&sig->cputimer.lock);
if (task_cputime_expired(&group_sample, &sig->cputime_expires)) if (task_cputime_expired(&group_sample, &sig->cputime_expires))
return 1; return 1;
...@@ -1157,7 +1178,7 @@ void run_posix_cpu_timers(struct task_struct *tsk) ...@@ -1157,7 +1178,7 @@ void run_posix_cpu_timers(struct task_struct *tsk)
* If there are any active process wide timers (POSIX 1.b, itimers, * If there are any active process wide timers (POSIX 1.b, itimers,
* RLIMIT_CPU) cputimer must be running. * RLIMIT_CPU) cputimer must be running.
*/ */
if (tsk->signal->cputimer.running) if (READ_ONCE(tsk->signal->cputimer.running))
check_process_timers(tsk, &firing); check_process_timers(tsk, &firing);
/* /*
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment