Commit ba134539 authored by Russ Cox

runtime: faster entersyscall/exitsyscall

Replace cas with xadd in scheduler.
Suggested by Dmitriy in last code review.
Verified with Promela model.

When there's actual contention for the atomic word,
this avoids the looping that compare-and-swap requires.

benchmark                            old ns/op    new ns/op    delta
runtime_test.BenchmarkSyscall               32           26  -17.08%
runtime_test.BenchmarkSyscall-2            155           59  -61.81%
runtime_test.BenchmarkSyscall-3            112           52  -52.95%
runtime_test.BenchmarkSyscall-4             94           48  -48.57%
runtime_test.BenchmarkSyscallWork          871          872   +0.11%
runtime_test.BenchmarkSyscallWork-2        481          477   -0.83%
runtime_test.BenchmarkSyscallWork-3        338          335   -0.89%
runtime_test.BenchmarkSyscallWork-4        263          256   -2.66%

R=golang-dev, iant
CC=golang-dev
https://golang.org/cl/4800047
parent 7ce1a4bd
...@@ -773,7 +773,7 @@ runtime·gosched(void) ...@@ -773,7 +773,7 @@ runtime·gosched(void)
void void
runtime·entersyscall(void) runtime·entersyscall(void)
{ {
uint32 v, w; uint32 v;
if(runtime·sched.predawn) if(runtime·sched.predawn)
return; return;
...@@ -796,24 +796,14 @@ runtime·entersyscall(void) ...@@ -796,24 +796,14 @@ runtime·entersyscall(void)
// mcpu-- // mcpu--
// gwait not true // gwait not true
// waitstop && mcpu <= mcpumax not true // waitstop && mcpu <= mcpumax not true
// If we can do the same with a single atomic read/write, // If we can do the same with a single atomic add,
// then we can skip the locks. // then we can skip the locks.
for(;;) { v = runtime·xadd(&runtime·sched.atomic, -1<<mcpuShift);
v = runtime·sched.atomic; if(!atomic_gwaiting(v) && (!atomic_waitstop(v) || atomic_mcpu(v) > atomic_mcpumax(v)))
if(atomic_gwaiting(v)) return;
break;
if(atomic_waitstop(v) && atomic_mcpu(v)-1 <= atomic_mcpumax(v))
break;
w = v;
w += (-1<<mcpuShift);
if(runtime·cas(&runtime·sched.atomic, v, w))
return;
}
schedlock(); schedlock();
v = runtime·atomicload(&runtime·sched.atomic);
// atomic { mcpu--; }
v = runtime·xadd(&runtime·sched.atomic, (-1<<mcpuShift));
if(atomic_gwaiting(v)) { if(atomic_gwaiting(v)) {
matchmg(); matchmg();
v = runtime·atomicload(&runtime·sched.atomic); v = runtime·atomicload(&runtime·sched.atomic);
...@@ -837,43 +827,28 @@ runtime·entersyscall(void) ...@@ -837,43 +827,28 @@ runtime·entersyscall(void)
void void
runtime·exitsyscall(void) runtime·exitsyscall(void)
{ {
uint32 v, w; uint32 v;
if(runtime·sched.predawn) if(runtime·sched.predawn)
return; return;
// Fast path. // Fast path.
// If we can do the mcpu-- bookkeeping and // If we can do the mcpu++ bookkeeping and
// find that we still have mcpu <= mcpumax, then we can // find that we still have mcpu <= mcpumax, then we can
// start executing Go code immediately, without having to // start executing Go code immediately, without having to
// schedlock/schedunlock. // schedlock/schedunlock.
for(;;) { v = runtime·xadd(&runtime·sched.atomic, (1<<mcpuShift));
// If the profiler frequency needs updating, if(m->profilehz == runtime·sched.profilehz && atomic_mcpu(v) <= atomic_mcpumax(v)) {
// take the slow path. // There's a cpu for us, so we can run.
if(m->profilehz != runtime·sched.profilehz) g->status = Grunning;
break; // Garbage collector isn't running (since we are),
// so okay to clear gcstack.
v = runtime·sched.atomic; g->gcstack = nil;
if(atomic_mcpu(v) >= atomic_mcpumax(v)) return;
break;
w = v;
w += (1<<mcpuShift);
if(runtime·cas(&runtime·sched.atomic, v, w)) {
// There's a cpu for us, so we can run.
g->status = Grunning;
// Garbage collector isn't running (since we are),
// so okay to clear gcstack.
g->gcstack = nil;
return;
}
} }
schedlock(); schedlock();
// atomic { mcpu++; }
runtime·xadd(&runtime·sched.atomic, (1<<mcpuShift));
// Tell scheduler to put g back on the run queue: // Tell scheduler to put g back on the run queue:
// mostly equivalent to g->status = Grunning, // mostly equivalent to g->status = Grunning,
// but keeps the garbage collector from thinking // but keeps the garbage collector from thinking
......
...@@ -3,8 +3,9 @@ ...@@ -3,8 +3,9 @@
// license that can be found in the LICENSE file. // license that can be found in the LICENSE file.
/* /*
model for proc.c as of 2011/07/15. model for proc.c as of 2011/07/22.
takes 4300 seconds to explore 1128130 states
takes 4900 seconds to explore 1189070 states
with G=3, var_gomaxprocs=1 with G=3, var_gomaxprocs=1
on a Core i7 L640 2.13 GHz Lenovo X201s. on a Core i7 L640 2.13 GHz Lenovo X201s.
...@@ -329,33 +330,53 @@ inline schedule() { ...@@ -329,33 +330,53 @@ inline schedule() {
nextgandunlock() nextgandunlock()
} }
/*
* schedpend is > 0 if a goroutine is committed to
* entering the scheduler but has not yet done so.
* Just as we don't test for the undesirable conditions when a
* goroutine is in the scheduler, we don't test for them when
* a goroutine will be in the scheduler shortly.
* Modeling this state lets us replace mcpu cas loops with
* simpler mcpu atomic adds.
*/
byte schedpend;
/* /*
* entersyscall is like the C function. * entersyscall is like the C function.
*/ */
inline entersyscall() { inline entersyscall() {
bit willsched;
/* /*
* Fast path. Check all the conditions tested during schedlock/schedunlock * Fast path. Check all the conditions tested during schedlock/schedunlock
* below, and if we can get through the whole thing without stopping, run it * below, and if we can get through the whole thing without stopping, run it
* in one atomic cas-based step. * in one atomic cas-based step.
*/ */
atomic { atomic {
atomic_mcpu--;
if if
:: atomic_gwaiting -> :: atomic_gwaiting ->
skip skip
:: atomic_waitstop && atomic_mcpu-1 <= atomic_mcpumax -> :: atomic_waitstop && atomic_mcpu <= atomic_mcpumax ->
skip skip
:: else -> :: else ->
atomic_mcpu--;
goto Lreturn_entersyscall; goto Lreturn_entersyscall;
fi fi;
willsched = 1;
schedpend++;
} }
/* /*
* Normal path. * Normal path.
*/ */
schedlock() schedlock()
d_step { opt_dstep {
atomic_mcpu--; if
:: willsched ->
schedpend--;
willsched = 0
:: else
fi
} }
if if
:: atomic_gwaiting -> :: atomic_gwaiting ->
...@@ -382,11 +403,11 @@ inline exitsyscall() { ...@@ -382,11 +403,11 @@ inline exitsyscall() {
*/ */
atomic { atomic {
// omitted profilehz check // omitted profilehz check
atomic_mcpu++;
if if
:: atomic_mcpu >= atomic_mcpumax -> :: atomic_mcpu >= atomic_mcpumax ->
skip skip
:: else -> :: else ->
atomic_mcpu++;
goto Lreturn_exitsyscall goto Lreturn_exitsyscall
fi fi
} }
...@@ -396,7 +417,6 @@ inline exitsyscall() { ...@@ -396,7 +417,6 @@ inline exitsyscall() {
*/ */
schedlock(); schedlock();
d_step { d_step {
atomic_mcpu++;
if if
:: atomic_mcpu <= atomic_mcpumax -> :: atomic_mcpu <= atomic_mcpumax ->
skip skip
...@@ -497,10 +517,10 @@ active proctype monitor() { ...@@ -497,10 +517,10 @@ active proctype monitor() {
do do
// Should never have goroutines waiting with procs available. // Should never have goroutines waiting with procs available.
:: !sched_lock && gwait > 0 && atomic_mcpu < atomic_mcpumax -> :: !sched_lock && schedpend==0 && gwait > 0 && atomic_mcpu < atomic_mcpumax ->
assert 0 assert 0
// Should never have gc waiting for stop if things have already stopped. // Should never have gc waiting for stop if things have already stopped.
:: !sched_lock && atomic_waitstop && atomic_mcpu <= atomic_mcpumax -> :: !sched_lock && schedpend==0 && atomic_waitstop && atomic_mcpu <= atomic_mcpumax ->
assert 0 assert 0
od od
} }
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment