Commit 96824000 authored by Russ Cox

* comment, clean up scheduler

* rewrite lock implementation to be correct
  (tip: never assume that an algorithm you found
  in a linux man page is correct.)
* delete unneeded void* arg from clone fn
* replace Rendez with Note
* comment mal better
* use 6c -w, fix warnings
* mark all assembly functions 7

R=r
DELTA=828  (338 added, 221 deleted, 269 changed)
OCL=13884
CL=13886
parent 5adbacb8
@@ -49,10 +49,10 @@ clean:
 	rm -f *.$(O) *.a runtime.acid

 %.$O:	%.c
-	$(CC) $<
+	$(CC) -w $<

 sys_file.$O:	sys_file.c sys_types.h $(OS_H)
-	$(CC) -D$(GOARCH)_$(GOOS) $<
+	$(CC) -w -D$(GOARCH)_$(GOOS) $<

 %.$O:	%.s
 	$(AS) $<
...
@@ -48,6 +48,6 @@ struct stat {
 // Linux-specific system calls
 int64	futex(uint32*, int32, uint32, struct timespec*, uint32*, uint32);
-int64	clone(int32, void*, M*, G*, void(*)(void*), void*);
+int64	clone(int32, void*, M*, G*, void(*)(void));
 int64	select(int32, void*, void*, void*, void*);
@@ -487,7 +487,7 @@ sys·selectgo(Select *sel)
 	SudoG *sg;
 	G *gp;
-	byte *ae, *as;
+	byte *as;

 	if(xxx) {
 		prints("selectgo: sel=");
@@ -630,6 +630,8 @@ sys·selectgo(Select *sel)
 asynr:
 asyns:
 	throw("asyn");
+	return;	// compiler doesn't know throw doesn't return

 gotr:
 	// recv path to wakeup the sender (sg)
 	if(xxx) {
...
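A note on the dummy return added after throw("asyn"): 6c cannot know that throw never comes back, so without it control would appear to fall through into the gotr label. The sketch below illustrates the same issue in standard C, where C11's _Noreturn can tell the compiler what 6c is not told here (throw_ is a hypothetical stand-in, not the runtime's throw):

	#include <stdio.h>
	#include <stdlib.h>

	static _Noreturn void
	throw_(const char *s)	// hypothetical stand-in for the runtime's throw
	{
		fprintf(stderr, "throw: %s\n", s);
		abort();
	}

	static int
	classify(int x)
	{
		if(x < 0)
			throw_("negative");
		// No dummy "return" is needed here: _Noreturn tells the
		// compiler that throw_ cannot fall through to this point.
		return x % 2;
	}

	int
	main(void)
	{
		printf("%d\n", classify(4));	// prints 0
		return 0;
	}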
@@ -199,7 +199,6 @@ out:
 void
 sys·mapassign1(Hmap *m, ...)
 {
-	Link **ll;
 	byte *ak, *av;

 	ak = (byte*)&m + m->ko;
...
@@ -8,7 +8,6 @@
 void
 dump(byte *p, int32 n)
 {
-	uint32 v;
 	int32 i;

 	for(i=0; i<n; i++) {
...
@@ -9,28 +9,101 @@ typedef struct Sched Sched;
 M	m0;
 G	g0;	// idle goroutine for m0

-// Maximum number of os procs (M's) to kick off.
-// Can override with $gomaxprocs environment variable.
-// For now set to 1 (single-threaded), because not
-// everything is properly locked (e.g., chans) and because
-// Darwin's multithreading code isn't implemented.
-int32	gomaxprocs = 1;
-
 static	int32	debug	= 0;

+// Go scheduler
+//
+// The go scheduler's job is to match ready-to-run goroutines (`g's)
+// with waiting-for-work schedulers (`m's).  If there are ready gs
+// and no waiting ms, ready() will start a new m running in a new
+// OS thread, so that all ready gs can run simultaneously, up to a limit.
+// For now, ms never go away.
+//
+// The default maximum number of ms is one: go runs single-threaded.
+// This is because some locking details have to be worked out
+// (select in particular is not locked properly) and because the low-level
+// code hasn't been written yet for OS X.  Setting the environment
+// variable $gomaxprocs changes sched.mmax for now.
+//
+// Even a program that can run without deadlock in a single process
+// might use more ms if given the chance.  For example, the prime
+// sieve will use as many ms as there are primes (up to sched.mmax),
+// allowing different stages of the pipeline to execute in parallel.
+// We could revisit this choice, only kicking off new ms for blocking
+// system calls, but that would limit the amount of parallel computation
+// that go would try to do.
+//
+// In general, one could imagine all sorts of refinements to the
+// scheduler, but the goal now is just to get something working on
+// Linux and OS X.

 struct Sched {
-	G *runhead;
-	G *runtail;
-	int32 nwait;
-	int32 nready;
-	int32 ng;
-	int32 nm;
-	M *wait;
 	Lock;
+
+	G *gfree;	// available gs (status == Gdead)
+
+	G *ghead;	// gs waiting to run
+	G *gtail;
+	int32 gwait;	// number of gs waiting to run
+	int32 gcount;	// number of gs that are alive
+
+	M *mhead;	// ms waiting for work
+	int32 mwait;	// number of ms waiting for work
+	int32 mcount;	// number of ms that are alive
+	int32 mmax;	// max number of ms allowed
+
+	int32 predawn;	// running initialization, don't run new gs.
 };

 Sched sched;

+// Scheduling helpers.  Sched must be locked.
+static void gput(G*);	// put/get on ghead/gtail
+static G* gget(void);
+static void mput(M*);	// put/get on mhead
+static M* mget(void);
+static void gfput(G*);	// put/get on gfree
+static G* gfget(void);
+static void mnew(void);	// kick off new m
+static void readylocked(G*);	// ready, but sched is locked
+
+// Scheduler loop.
+static void scheduler(void);
+
+// Called before main·init_function.
+void
+schedinit(void)
+{
+	int32 n;
+	byte *p;
+
+	sched.mmax = 1;
+	p = getenv("gomaxprocs");
+	if(p != nil && (n = atoi(p)) != 0)
+		sched.mmax = n;
+	sched.mcount = 1;
+	sched.predawn = 1;
+}
+
+// Called after main·init_function; main·main is on ready queue.
+void
+m0init(void)
+{
+	int32 i;
+
+	// Let's go.
+	sched.predawn = 0;
+
+	// There's already one m (us).
+	// If main·init_function started other goroutines,
+	// kick off new ms to handle them, like ready
+	// would have, had it not been pre-dawn.
+	for(i=1; i<sched.gcount && i<sched.mmax; i++)
+		mnew();
+
+	scheduler();
+}
+
 void
 sys·goexit(void)
 {
@@ -39,23 +112,10 @@ sys·goexit(void)
 		sys·printint(g->goid);
 		prints("\n");
 	}
-	g->status = Gdead;
+	g->status = Gmoribund;
 	sys·gosched();
 }

-void
-schedinit(void)
-{
-	byte *p;
-	extern int32 getenvc(void);
-
-	p = getenv("gomaxprocs");
-	if(p && '0' <= *p && *p <= '9')
-		gomaxprocs = atoi(p);
-	sched.nm = 1;
-	sched.nwait = 1;
-}
-
 void
 sys·newproc(int32 siz, byte* fn, byte* arg0)
 {
@@ -71,22 +131,18 @@ sys·newproc(int32 siz, byte* fn, byte* arg0)
 	if(siz > 1024)
 		throw("sys·newproc: too many args");

-	// try to rip off an old goroutine
-	for(newg=allg; newg!=nil; newg=newg->alllink)
-		if(newg->status == Gdead)
-			break;
+	lock(&sched);

-	if(newg == nil) {
+	if((newg = gfget()) != nil){
+		newg->status = Gwaiting;
+		stk = newg->stack0;
+	}else{
 		newg = mal(sizeof(G));
 		stk = mal(4096);
 		newg->stack0 = stk;
 		newg->status = Gwaiting;
 		newg->alllink = allg;
 		allg = newg;
-	} else {
-		stk = newg->stack0;
-		newg->status = Gwaiting;
 	}

 	newg->stackguard = stk+160;
@@ -104,14 +160,13 @@ sys·newproc(int32 siz, byte* fn, byte* arg0)
 	newg->sched.SP = sp;
 	newg->sched.PC = fn;

-	lock(&sched);
-	sched.ng++;
+	sched.gcount++;
 	goidgen++;
 	newg->goid = goidgen;
+	readylocked(newg);
 	unlock(&sched);
-	ready(newg);

 	//prints(" goid=");
 	//sys·printint(newg->goid);
 	//prints("\n");
@@ -132,193 +187,248 @@ tracebackothers(G *me)
 	}
 }

-void newmach(void);
-
+// Put on `g' queue.  Sched must be locked.
 static void
-readylocked(G *g)
+gput(G *g)
 {
-	g->status = Grunnable;
-	if(sched.runhead == nil)
-		sched.runhead = g;
+	g->schedlink = nil;
+	if(sched.ghead == nil)
+		sched.ghead = g;
 	else
-		sched.runtail->runlink = g;
-	sched.runtail = g;
-	g->runlink = nil;
-	sched.nready++;
-	// Don't wake up another scheduler.
-	// This only gets called when we're
-	// about to reschedule anyway.
+		sched.gtail->schedlink = g;
+	sched.gtail = g;
+	sched.gwait++;
 }

-static Lock print;
+// Get from `g' queue.  Sched must be locked.
+static G*
+gget(void)
+{
+	G *g;
+
+	g = sched.ghead;
+	if(g){
+		sched.ghead = g->schedlink;
+		if(sched.ghead == nil)
+			sched.gtail = nil;
+		sched.gwait--;
+	}
+	return g;
+}
+
+// Put on `m' list.  Sched must be locked.
+static void
+mput(M *m)
+{
+	m->schedlink = sched.mhead;
+	sched.mhead = m;
+	sched.mwait++;
+}
+
+// Get from `m' list.  Sched must be locked.
+static M*
+mget(void)
+{
+	M *m;
+
+	m = sched.mhead;
+	if(m){
+		sched.mhead = m->schedlink;
+		sched.mwait--;
+	}
+	return m;
+}
+
+// Put on gfree list.  Sched must be locked.
+static void
+gfput(G *g)
+{
+	g->schedlink = sched.gfree;
+	sched.gfree = g;
+}
+
+// Get from gfree list.  Sched must be locked.
+static G*
+gfget(void)
+{
+	G *g;
+
+	g = sched.gfree;
+	if(g)
+		sched.gfree = g->schedlink;
+	return g;
+}

+// Mark g ready to run.
 void
 ready(G *g)
 {
-	M *mm;
-
-	// gp might be running on another scheduler.
-	// (E.g., it queued and then we decided to wake it up
-	// before it had a chance to sys·gosched().)
-	// Grabbing the runlock ensures that it is not running elsewhere.
-	// You can delete the if check, but don't delete the
-	// lock/unlock sequence (being able to grab the lock
-	// means the proc has gone to sleep).
-	lock(&g->runlock);
-	if(g->status == Grunnable || g->status == Grunning)
-		*(int32*)0x1023 = 0x1023;
+	// Wait for g to stop running (for example, it might
+	// have queued itself on a channel but not yet gotten
+	// a chance to call sys·gosched and actually go to sleep).
+	notesleep(&g->stopped);
+
 	lock(&sched);
-	g->status = Grunnable;
-	if(sched.runhead == nil)
-		sched.runhead = g;
-	else
-		sched.runtail->runlink = g;
-	sched.runtail = g;
-	g->runlink = nil;
-	unlock(&g->runlock);
-	sched.nready++;
-	if(sched.nready > sched.nwait)
-	if(gomaxprocs == 0 || sched.nm < gomaxprocs){
-		if(debug){
-			prints("new scheduler: ");
-			sys·printint(sched.nready);
-			prints(" > ");
-			sys·printint(sched.nwait);
-			prints("\n");
-		}
-		sched.nwait++;
-		newmach();
-	}
-	if(sched.wait){
-		mm = sched.wait;
-		sched.wait = mm->waitlink;
-		rwakeupandunlock(&mm->waitr);
-	}else
-		unlock(&sched);
+	readylocked(g);
+	unlock(&sched);
 }

-extern void p0(void), p1(void);
+// Mark g ready to run.  Sched is already locked,
+// and g is known not to be running right now
+// (i.e., ready has slept on g->stopped or the g was
+// just allocated in sys·newproc).
+static void
+readylocked(G *g)
+{
+	M *m;
+
+	// Mark runnable.
+	if(g->status == Grunnable || g->status == Grunning)
+		throw("bad g->status in ready");
+	g->status = Grunnable;
+
+	// Before we've gotten to main·main,
+	// only queue new gs, don't run them
+	// or try to allocate new ms for them.
+	// That includes main·main itself.
+	if(sched.predawn){
+		gput(g);
+	}
+
+	// Else if there's an m waiting, give it g.
+	else if((m = mget()) != nil){
+		m->nextg = g;
+		notewakeup(&m->havenextg);
+	}
+
+	// Else put g on queue, kicking off new m if needed.
+	else{
+		gput(g);
+		if(sched.mcount < sched.mmax)
+			mnew();
+	}
+}

-G*
-nextgoroutine(void)
+// Get the next goroutine that m should run.
+// Sched must be locked on entry, is unlocked on exit.
+static G*
+nextgandunlock(void)
 {
 	G *gp;

-	while((gp = sched.runhead) == nil){
-		if(debug){
-			prints("nextgoroutine runhead=nil ng=");
-			sys·printint(sched.ng);
-			prints("\n");
-		}
-		if(sched.ng == 0)
-			return nil;
-		m->waitlink = sched.wait;
-		m->waitr.l = &sched.Lock;
-		sched.wait = m;
-		sched.nwait++;
-		if(sched.nm == sched.nwait)
-			prints("all goroutines are asleep - deadlock!\n");
-		rsleep(&m->waitr);
-		sched.nwait--;
+	if((gp = gget()) != nil){
+		unlock(&sched);
+		return gp;
 	}
-	sched.nready--;
-	sched.runhead = gp->runlink;
+
+	mput(m);
+	if(sched.mcount == sched.mwait)
+		prints("warning: all goroutines are asleep - deadlock!\n");
+	m->nextg = nil;
+	noteclear(&m->havenextg);
+	unlock(&sched);
+	notesleep(&m->havenextg);
+	if((gp = m->nextg) == nil)
+		throw("bad m->nextg in nextgoroutine");
+	m->nextg = nil;
 	return gp;
 }

-void
+// Scheduler loop: find g to run, run it, repeat.
+static void
 scheduler(void)
 {
 	G* gp;

-	m->pid = getprocid();
-
-	gosave(&m->sched);
+	// Initialization.
+	m->procid = getprocid();
 	lock(&sched);

-	if(m->curg == nil){
-		// Brand new scheduler; nwait counts us.
-		// Not anymore.
-		sched.nwait--;
-	}else{
+	if(gosave(&m->sched)){
+		// Jumped here via gosave/gogo, so didn't
+		// execute lock(&sched) above.
+		lock(&sched);
+
+		// Just finished running m->curg.
 		gp = m->curg;
-		gp->m = nil;
+		gp->m = nil;	// for debugger
 		switch(gp->status){
+		case Grunnable:
 		case Gdead:
-			sched.ng--;
-			if(debug){
-				prints("sched: dead: ");
-				sys·printint(sched.ng);
-				prints("\n");
-			}
-			break;
+			// Shouldn't have been running!
+			throw("bad gp->status in sched");
 		case Grunning:
-			readylocked(gp);
+			gp->status = Grunnable;
+			gput(gp);
 			break;
-		case Grunnable:
-			// don't want to see this
-			*(int32*)0x456 = 0x234;
+		case Gmoribund:
+			gp->status = Gdead;
+			if(--sched.gcount == 0)
+				sys·exit(0);
 			break;
 		}
-		unlock(&gp->runlock);
+		notewakeup(&gp->stopped);
 	}

-	gp = nextgoroutine();
-	if(gp == nil) {
-//		prints("sched: no more work\n");
-		sys·exit(0);
-	}
-	unlock(&sched);
+	// Find (or wait for) g to run.  Unlocks sched.
+	gp = nextgandunlock();

-	lock(&gp->runlock);
+	noteclear(&gp->stopped);
 	gp->status = Grunning;
 	m->curg = gp;
-	gp->m = m;
+	gp->m = m;	// for debugger
 	g = gp;
 	gogo(&gp->sched);
 }

-void
-newmach(void)
-{
-	M *mm;
-	byte *stk, *stktop;
-	int64 ret;
-
-	sched.nm++;
-	if(!(sched.nm&(sched.nm-1))){
-		sys·printint(sched.nm);
-		prints(" threads\n");
-	}
-	mm = mal(sizeof(M)+sizeof(G)+1024+104);
-	sys·memclr((byte*)mm, sizeof(M));
-	mm->g0 = (G*)(mm+1);
-	sys·memclr((byte*)mm->g0, sizeof(G));
-	stk = (byte*)mm->g0 + 104;
-	stktop = stk + 1024;
-	mm->g0->stackguard = stk;
-	mm->g0->stackbase = stktop;
-	newosproc(mm, mm->g0, stktop, (void(*)(void*))scheduler, nil);
-}
-
-void
-gom0init(void)
-{
-	scheduler();
-}
-
+// Enter scheduler.  If g->status is Grunning,
+// re-queues g and runs everyone else who is waiting
+// before running g again.  If g->status is Gmoribund,
+// kills off g.
 void
 sys·gosched(void)
 {
 	if(gosave(&g->sched) == 0){
-		// (rsc) signal race here?
+		// TODO(rsc) signal race here?
+		// If a signal comes in between
+		// changing g and changing SP,
+		// growing the stack will fail.
 		g = m->g0;
 		gogo(&m->sched);
 	}
 }

+// Fork off a new m.  Sched must be locked.
+static void
+mnew(void)
+{
+	M *m;
+	G *g;
+	byte *stk, *stktop;
+
+	sched.mcount++;
+	if(debug){
+		sys·printint(sched.mcount);
+		prints(" threads\n");
+	}
+
+	// Allocate m, g, stack in one chunk.
+	// 1024 and 104 are the magic constants
+	// used in rt0_amd64.s when setting up g0.
+	m = mal(sizeof(M)+sizeof(G)+104+1024);
+	g = (G*)(m+1);
+	stk = (byte*)g + 104;
+	stktop = stk + 1024;
+
+	m->g0 = g;
+	g->stackguard = stk;
+	g->stackbase = stktop;
+	newosproc(m, g, stktop, scheduler);
+}
+
 //
 // the calling sequence for a routine that
 // needs N bytes stack, A args.
 //
 // N1 = (N+160 > 4096)? N+160: 0
...
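A note on the helpers above: gput/gget and mput/mget are intrusive lists threaded through the schedlink field, FIFO for the gs and LIFO for the ms, and every operation assumes the sched lock is held. The sketch below shows the same queue discipline in standalone, portable C (hypothetical G and Sched stand-ins, locking omitted); it is an illustration, not the runtime's code:

	#include <stdio.h>
	#include <stddef.h>

	/* Hypothetical stand-ins for the runtime's G and Sched. */
	typedef struct G G;
	struct G {
		int goid;
		G *schedlink;	/* intrusive link, as in the real struct G */
	};

	typedef struct {
		G *ghead;	/* gs waiting to run (FIFO) */
		G *gtail;
		int gwait;
	} Sched;

	/* Put g on the run queue: append at the tail so order is FIFO. */
	static void
	gput(Sched *s, G *g)
	{
		g->schedlink = NULL;
		if(s->ghead == NULL)
			s->ghead = g;
		else
			s->gtail->schedlink = g;
		s->gtail = g;
		s->gwait++;
	}

	/* Take the oldest waiting g, or NULL if the queue is empty. */
	static G*
	gget(Sched *s)
	{
		G *g = s->ghead;
		if(g){
			s->ghead = g->schedlink;
			if(s->ghead == NULL)
				s->gtail = NULL;
			s->gwait--;
		}
		return g;
	}

	int
	main(void)
	{
		Sched s = {0};
		G a = {1}, b = {2}, c = {3};
		G *g;

		gput(&s, &a); gput(&s, &b); gput(&s, &c);
		while((g = gget(&s)) != NULL)
			printf("run goroutine %d\n", g->goid);	/* 1, 2, 3 */
		return 0;
	}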
@@ -41,7 +41,7 @@ TEXT _rt0_amd64(SB),7,$-8
 	PUSHQ	$main·main(SB)		// entry
 	PUSHQ	$16			// arg size
 	CALL	sys·newproc(SB)
-	CALL	gom0init(SB)
+	CALL	m0init(SB)
 	POPQ	AX
 	POPQ	AX
...
@@ -191,7 +191,7 @@ sys·sleep(int64 ms)
 void
 lock(Lock *l)
 {
-	if(xadd(&l->key, 1) == 1)
+	if(cas(&l->key, 0, 1))
 		return;
 	unimplemented("lock wait");
 }
@@ -199,43 +199,33 @@ lock(Lock *l)
 void
 unlock(Lock *l)
 {
-	if(xadd(&l->key, -1) == 0)
+	if(cas(&l->key, 1, 0))
 		return;
 	unimplemented("unlock wakeup");
 }

 void
-rsleep(Rendez *r)
+noteclear(Note *n)
 {
-	unimplemented("rsleep");
-
-	// dumb implementation:
-	r->sleeping = 1;
-	unlock(r->l);
-	while(r->sleeping)
-		;
-	lock(r->l);
+	n->lock.key = 0;
+	lock(&n->lock);
 }

 void
-rwakeup(Rendez *r)
+notesleep(Note *n)
 {
-	unimplemented("rwakeup");
-
-	// dumb implementation:
-	r->sleeping = 0;
+	lock(&n->lock);
+	unlock(&n->lock);
 }

 void
-rwakeupandunlock(Rendez *r)
+notewakeup(Note *n)
 {
-	// dumb implementation:
-	rwakeup(r);
-	unlock(r->l);
+	unlock(&n->lock);
 }

 void
-newosproc(M *mm, G *gg, void *stk, void (*fn)(void*), void *arg)
+newosproc(M *mm, G *gg, void *stk, void (*fn)(void))
 {
 	unimplemented("newosproc");
 }
...
@@ -138,21 +138,19 @@ typedef struct sigaction {
 void
 sighandler(int32 sig, siginfo* info, void** context)
 {
-	int32 i;
-
 	if(sig < 0 || sig >= NSIG){
 		prints("Signal ");
 		sys·printint(sig);
 	}else{
 		prints(sigtab[sig].name);
 	}

 	struct sigcontext *sc = &(((struct ucontext *)context)->uc_mcontext);

 	prints("\nFaulting address: 0x");	sys·printpointer(info->si_addr);
 	prints("\npc: 0x");	sys·printpointer((void *)sc->rip);
 	prints("\n\n");

 	traceback((void *)sc->rip, (void *)sc->rsp, (void *)sc->r15);
 	tracebackothers((void*)sc->r15);
 	print_sigcontext(sc);
@@ -179,16 +177,14 @@ initsig(void)
 	}
 }

-// Linux futex.  The simple cases really are simple:
-//
-//	futex(addr, FUTEX_WAIT, val, duration, _, _)
-//		Inside the kernel, atomically check that *addr == val
-//		and go to sleep for at most duration.
-//
-//	futex(addr, FUTEX_WAKE, val, _, _, _)
-//		Wake up at least val procs sleeping on addr.
-//
-// (Of course, they have added more complicated things since then.)
+// Linux futex.
+//
+//	futexsleep(uint32 *addr, uint32 val)
+//	futexwakeup(uint32 *addr)
+//
+// Futexsleep atomically checks if *addr == val and if so, sleeps on addr.
+// Futexwakeup wakes up one thread sleeping on addr.
+// Futexsleep is allowed to wake up spuriously.

 enum
 {
@@ -199,10 +195,10 @@ enum
 	EAGAIN = 11,
 };

-// TODO(rsc) I tried using 1<<40 here but it woke up (-ETIMEDOUT).
+// TODO(rsc) I tried using 1<<40 here but futex woke up (-ETIMEDOUT).
 // I wonder if the timespec that gets to the kernel
 // actually has two 32-bit numbers in it, so that
 // a 64-bit 1<<40 ends up being 0 seconds,
 // 1<<8 nanoseconds.
 static struct timespec longtime =
 {
@@ -210,69 +206,106 @@ static struct timespec longtime =
 	0
 };

+// Atomically,
+//	if(*addr == val) sleep
+// Might be woken up spuriously; that's allowed.
 static void
-efutex(uint32 *addr, int32 op, int32 val, struct timespec *ts)
+futexsleep(uint32 *addr, uint32 val)
 {
 	int64 ret;

-again:
-	ret = futex(addr, op, val, ts, nil, 0);
+	ret = futex(addr, FUTEX_WAIT, val, &longtime, nil, 0);
+	if(ret >= 0 || ret == -EAGAIN || ret == -EINTR)
+		return;
+
+	prints("futexsleep addr=");
+	sys·printpointer(addr);
+	prints(" val=");
+	sys·printint(val);
+	prints(" returned ");
+	sys·printint(ret);
+	prints("\n");
+	*(int32*)0x1005 = 0x1005;
+}

-	// These happen when you use a debugger, among other times.
-	if(ret == -EAGAIN || ret == -EINTR){
-		// If we were sleeping, it's okay to wake up early.
-		if(op == FUTEX_WAIT)
-			return;
-		// If we were waking someone up, we don't know
-		// whether that succeeded, so wake someone else up too.
-		if(op == FUTEX_WAKE){
-			prints("futexwake ");
-			sys·printint(ret);
-			prints("\n");
-			goto again;
-		}
-	}
+// If any procs are sleeping on addr, wake up at least one.
+static void
+futexwakeup(uint32 *addr)
+{
+	int64 ret;
+
+	ret = futex(addr, FUTEX_WAKE, 1, nil, nil, 0);
+
+	if(ret >= 0)
+		return;

-	if(ret < 0){
-		prints("futex error addr=");
-		sys·printpointer(addr);
-		prints(" op=");
-		sys·printint(op);
-		prints(" val=");
-		sys·printint(val);
-		prints(" ts=");
-		sys·printpointer(ts);
-		prints(" returned ");
-		sys·printint(-ret);
-		prints("\n");
-		*(int32*)101 = 202;
-	}
+	// I don't know that futex wakeup can return
+	// EAGAIN or EINTR, but if it does, it would be
+	// safe to loop and call futex again.
+	prints("futexwakeup addr=");
+	sys·printpointer(addr);
+	prints(" returned ");
+	sys·printint(ret);
+	prints("\n");
+	*(int32*)0x1006 = 0x1006;
 }

-// Lock and unlock.
-// A zeroed Lock is unlocked (no need to initialize each lock).
-// The l->key is either 0 (unlocked), 1 (locked), or >=2 (contended).
+// Lock and unlock.
+//
+// The lock state is a single 32-bit word that holds
+// a 31-bit count of threads waiting for the lock
+// and a single bit (the low bit) saying whether the lock is held.
+// The uncontended case runs entirely in user space.
+// When contention is detected, we defer to the kernel (futex).
+//
+// A reminder: compare-and-swap cas(addr, old, new) does
+//	if(*addr == old) { *addr = new; return 1; }
+//	else return 0;
+// but atomically.

 void
 lock(Lock *l)
 {
 	uint32 v;

-	if(l->key != 0) *(int32*)0x1001 = 0x1001;
-	l->key = 1;
-	return;
-
-	for(;;){
-		// Try for lock.  If we incremented it from 0 to 1, we win.
-		if((v=xadd(&l->key, 1)) == 1)
-			return;
-
-		// We lose.  It was already >=1 and is now >=2.
-		// Use futex to atomically check that the value is still
-		// what we think it is and go to sleep.
-		efutex(&l->key, FUTEX_WAIT, v, &longtime);
-	}
+again:
+	v = l->key;
+	if((v&1) == 0){
+		if(cas(&l->key, v, v|1)){
+			// Lock wasn't held; we grabbed it.
+			return;
+		}
+		goto again;
+	}
+
+	// Lock was held; try to add ourselves to the waiter count.
+	if(!cas(&l->key, v, v+2))
+		goto again;
+
+	// We're accounted for, now sleep in the kernel.
+	//
+	// We avoid the obvious lock/unlock race because
+	// the kernel won't put us to sleep if l->key has
+	// changed underfoot and is no longer v+2.
+	//
+	// We only really care that (v&1) == 1 (the lock is held),
+	// and in fact there is a futex variant that could
+	// accomodate that check, but let's not get carried away.)
+	futexsleep(&l->key, v+2);
+
+	// We're awake: remove ourselves from the count.
+	for(;;){
+		v = l->key;
+		if(v < 2)
+			throw("bad lock key");
+		if(cas(&l->key, v, v-2))
+			break;
+	}
+
+	// Try for the lock again.
+	goto again;
 }

 void
@@ -280,68 +313,54 @@ unlock(Lock *l)
 {
 	uint32 v;

-	if(l->key != 1) *(int32*)0x1002 = 0x1002;
-	l->key = 0;
-	return;
-
-	// Unlock the lock.  If we decremented from 1 to 0, wasn't contended.
-	if((v=xadd(&l->key, -1)) == 0)
-		return;
-
-	// The lock was contended.  Mark it as unlocked and wake a waiter.
-	l->key = 0;
-	efutex(&l->key, FUTEX_WAKE, 1, nil);
+	// Atomically get value and clear lock bit.
+again:
+	v = l->key;
+	if((v&1) == 0)
+		throw("unlock of unlocked lock");
+	if(!cas(&l->key, v, v&~1))
+		goto again;
+
+	// If there were waiters, wake one.
+	if(v & ~1)
+		futexwakeup(&l->key);
 }

-// Sleep and wakeup (see description in runtime.h)
+// One-time notifications.
+//
+// Since the lock/unlock implementation already
+// takes care of sleeping in the kernel, we just reuse it.
+// (But it's a weird use, so it gets its own interface.)
+//
+// We use a lock to represent the event:
+// unlocked == event has happened.
+// Thus the lock starts out locked, and to wait for the
+// event you try to lock the lock.  To signal the event,
+// you unlock the lock.

 void
-rsleep(Rendez *r)
+noteclear(Note *n)
 {
-	// Record that we're about to go to sleep and drop the lock.
-	r->sleeping = 1;
-	unlock(r->l);
-
-	// Go to sleep if r->sleeping is still 1.
-	efutex(&r->sleeping, FUTEX_WAIT, 1, &longtime);
-
-	// Reacquire the lock.
-	lock(r->l);
+	n->lock.key = 0;	// memset(n, 0, sizeof *n)
+	lock(&n->lock);
 }

 void
-rwakeup(Rendez *r)
+notewakeup(Note *n)
 {
-	if(!r->sleeping)
-		return;
-
-	// Clear the sleeping flag in case sleeper
-	// is between unlock and futex.
-	r->sleeping = 0;
-
-	// Wake up if actually made it to sleep.
-	efutex(&r->sleeping, FUTEX_WAKE, 1, nil);
+	unlock(&n->lock);
 }

-// Like rwakeup(r), unlock(r->l), but drops the lock before
-// waking the other proc.  This reduces bouncing back and forth
-// in the scheduler: the first thing the other proc wants to do
-// is acquire r->l, so it helps to unlock it before we wake him.
 void
-rwakeupandunlock(Rendez *r)
+notesleep(Note *n)
 {
-	int32 wassleeping;
-
-	if(!r->sleeping){
-		unlock(r->l);
-		return;
-	}
-
-	r->sleeping = 0;
-	unlock(r->l);
-	efutex(&r->sleeping, FUTEX_WAKE, 1, nil);
+	lock(&n->lock);
+	unlock(&n->lock);	// Let other sleepers find out too.
 }

+// Clone, the Linux rfork.
 enum
 {
 	CLONE_VM = 0x100,
@@ -365,7 +384,7 @@ enum
 };

 void
-newosproc(M *mm, G *gg, void *stk, void (*fn)(void*), void *arg)
+newosproc(M *m, G *g, void *stk, void (*fn)(void))
 {
 	int64 ret;
 	int32 flags;
@@ -382,20 +401,18 @@ newosproc(M *mm, G *gg, void *stk, void (*fn)(void*), void *arg)
 	if(0){
 		prints("newosproc stk=");
 		sys·printpointer(stk);
-		prints(" mm=");
-		sys·printpointer(mm);
-		prints(" gg=");
-		sys·printpointer(gg);
+		prints(" m=");
+		sys·printpointer(m);
+		prints(" g=");
+		sys·printpointer(g);
 		prints(" fn=");
 		sys·printpointer(fn);
-		prints(" arg=");
-		sys·printpointer(arg);
 		prints(" clone=");
 		sys·printpointer(clone);
 		prints("\n");
 	}

-	ret = clone(flags, stk, mm, gg, fn, arg);
+	ret = clone(flags, stk, m, g, fn);
 	if(ret < 0)
 		*(int32*)123 = 123;
 }
...
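A note on the new lock above: the state word is (waiter count << 1) | held-bit, with cas for the fast path and the kernel futex for contention. The sketch below shows the same protocol in standalone Linux C; it is an illustration, not the runtime's code, assuming GCC's __sync builtin where the runtime uses its assembly cas, and calling futex(2) through syscall(2):

	// Sketch of the lock/unlock protocol described above, for Linux.
	#include <stdint.h>
	#include <linux/futex.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	typedef struct { uint32_t key; } Lock;	// a zeroed Lock is unlocked

	// cas(addr, old, new): if(*addr == old){ *addr = new; return 1; } else return 0;
	static int
	cas(uint32_t *addr, uint32_t old, uint32_t new)
	{
		return __sync_bool_compare_and_swap(addr, old, new);
	}

	static void
	lock(Lock *l)
	{
		uint32_t v;
	again:
		v = l->key;
		if((v&1) == 0){			// not held: try to grab it
			if(cas(&l->key, v, v|1))
				return;
			goto again;
		}
		if(!cas(&l->key, v, v+2))	// held: register as a waiter
			goto again;
		// The kernel rechecks l->key == v+2 before sleeping, which
		// closes the obvious missed-wakeup race; spurious wakeups are fine.
		syscall(SYS_futex, &l->key, FUTEX_WAIT, v+2, NULL, NULL, 0);
		for(;;){			// awake: deregister as a waiter
			v = l->key;
			if(cas(&l->key, v, v-2))
				break;
		}
		goto again;			// compete for the lock again
	}

	static void
	unlock(Lock *l)
	{
		uint32_t v;
	again:
		v = l->key;
		if(!cas(&l->key, v, v&~1u))	// clear the held bit
			goto again;
		if(v & ~1u)			// waiters present: wake one
			syscall(SYS_futex, &l->key, FUTEX_WAKE, 1, NULL, NULL, 0);
	}

	int
	main(void)
	{
		Lock l = {0};
		lock(&l);	// uncontended: one cas, no system call
		unlock(&l);
		return 0;
	}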
@@ -71,6 +71,7 @@ rnd(uint32 n, uint32 m)
 	return n;
 }

+// Convenient wrapper around mmap.
 static void*
 brk(uint32 n)
 {
@@ -81,12 +82,15 @@ brk(uint32 n)
 	return v;
 }

+// Allocate n bytes of memory.  Note that this gets used
+// to allocate new stack segments, so at each call to a function
+// you have to ask yourself "would it be okay to call mal recursively
+// right here?"  The answer is yes unless we're in the middle of
+// editing the malloc state in m->mem.
 void*
 mal(uint32 n)
 {
 	byte* v;
-	Mem *mem;

 	// round to keep everything 64-bit aligned
 	n = rnd(n, 8);
@@ -94,17 +98,19 @@ mal(uint32 n)
 	// be careful.  calling any function might invoke
 	// mal to allocate more stack.
 	if(n > NHUNK) {
+		// this call is okay - calling mal recursively
+		// won't change anything we depend on.
 		v = brk(n);
 	} else {
 		// allocate a new hunk if this one is too small
 		if(n > m->mem.nhunk) {
-			// better not to call brk here - it might grow the stack,
-			// causing a call to mal and the allocation of a
-			// new hunk behind our backs.  then we'd toss away
-			// almost all of that new hunk and replace it.
-			// that'd just be a memory leak - the code would still run.
+			// here we're in the middle of editing m->mem
+			// (we're about to overwrite m->mem.hunk),
+			// so we can't call brk - it might call mal to grow the
+			// stack, and the recursive call would allocate a new
+			// hunk, and then once brk returned we'd immediately
+			// overwrite that hunk with our own.
+			// (the net result would be a memory leak, not a crash.)
+			// so we have to call sys·mmap directly - it is written
+			// in assembly and tagged not to grow the stack.
 			m->mem.hunk =
 				sys·mmap(nil, NHUNK, PROT_READ|PROT_WRITE,
 					MAP_ANON|MAP_PRIVATE, 0, 0);
@@ -136,7 +142,7 @@ hashmap(Sigi *si, Sigs *ss)
 	byte *sname, *iname;
 	Map *m;

-	h = ((uint32)si + (uint32)ss) % nelem(hash);
+	h = ((uint32)(uint64)si + (uint32)(uint64)ss) % nelem(hash);
 	for(m=hash[h]; m!=nil; m=m->link) {
 		if(m->si == si && m->ss == ss) {
 			if(m->bad) {
@@ -301,9 +307,9 @@ enum
 	NANSIGN = 1<<31,
 };

-static	uint64	uvnan		= 0x7FF0000000000001;
-static	uint64	uvinf		= 0x7FF0000000000000;
-static	uint64	uvneginf	= 0xFFF0000000000000;
+static	uint64	uvnan		= 0x7FF0000000000001ULL;
+static	uint64	uvinf		= 0x7FF0000000000000ULL;
+static	uint64	uvneginf	= 0xFFF0000000000000ULL;

 static int32
 isInf(float64 d, int32 sign)
@@ -338,7 +344,7 @@ isNaN(float64 d)
 	uint64 x;

 	x = *(uint64*)&d;
-	return ((uint32)x>>32)==0x7FF00000 && !isInf(d, 0);
+	return (uint32)(x>>32)==0x7FF00000 && !isInf(d, 0);
 }

 static float64
@@ -424,7 +430,7 @@ modf(float64 d, float64 *ip)
 	return d - dd;
 }

-// func frexp(float64) (float64, int32); // break fp into exp,fract
+// func frexp(float64) (float64, int32); // break fp into exp,frac
 void
 sys·frexp(float64 din, float64 dou, int32 iou)
 {
@@ -432,7 +438,7 @@ sys·frexp(float64 din, float64 dou, int32 iou)
 	FLUSH(&dou);
 }

-//func ldexp(int32, float64) float64; // make fp from exp,fract
+//func ldexp(int32, float64) float64; // make fp from exp,frac
 void
 sys·ldexp(float64 din, int32 ein, float64 dou)
 {
@@ -441,7 +447,7 @@ sys·ldexp(float64 din, int32 ein, float64 dou)
 }

 //func modf(float64) (float64, float64); // break fp into double+double
-float64
+void
 sys·modf(float64 din, float64 integer, float64 fraction)
 {
 	fraction = modf(din, &integer);
@@ -593,6 +599,7 @@ out:
 	FLUSH(&s);
 }

+void
 check(void)
 {
 	int8 a;
@@ -638,18 +645,6 @@ check(void)
 	initsig();
 }

-uint32
-xadd(uint32 *val, uint32 delta)
-{
-	uint32 v;
-
-	for(;;){
-		v = *val;
-		if(cas(val, v, v+delta))
-			return v+delta;
-	}
-}
-
 /*
  * map and chan helpers for
  * dealing with unknown types
@@ -657,6 +652,7 @@ xadd(uint32 *val, uint32 delta)
 static uint64
 memhash(uint32 s, void *a)
 {
+	USED(s, a);
 	prints("memhash\n");
 	return 0x12345;
 }
@@ -718,6 +714,7 @@ memcopy(uint32 s, void *a, void *b)
 static uint64
 stringhash(uint32 s, string *a)
 {
+	USED(s, a);
 	prints("stringhash\n");
 	return 0x12345;
 }
@@ -725,18 +722,21 @@ stringhash(uint32 s, string *a)
 static uint32
 stringequal(uint32 s, string *a, string *b)
 {
+	USED(s);
 	return cmpstring(*a, *b) == 0;
 }

 static void
 stringprint(uint32 s, string *a)
 {
+	USED(s);
 	sys·printstring(*a);
 }

 static void
 stringcopy(uint32 s, string *a, string *b)
 {
+	USED(s);
 	if(b == nil) {
 		*a = nil;
 		return;
@@ -747,6 +747,7 @@ stringcopy(uint32 s, string *a, string *b)
 static uint64
 pointerhash(uint32 s, void **a)
 {
+	USED(s, a);
 	prints("pointerhash\n");
 	return 0x12345;
 }
@@ -754,6 +755,7 @@ pointerhash(uint32 s, void **a)
 static uint32
 pointerequal(uint32 s, void **a, void **b)
 {
+	USED(s, a, b);
 	prints("pointerequal\n");
 	return 0;
 }
@@ -761,12 +763,14 @@ pointerequal(uint32 s, void **a, void **b)
 static void
 pointerprint(uint32 s, void **a)
 {
+	USED(s, a);
 	prints("pointerprint\n");
 }

 static void
 pointercopy(uint32 s, void **a, void **b)
 {
+	USED(s);
 	if(b == nil) {
 		*a = nil;
 		return;
@@ -777,8 +781,8 @@ pointercopy(uint32 s, void **a, void **b)
 Alg
 algarray[3] =
 {
-	{ &memhash, &memequal, &memprint, &memcopy },	// 0
-	{ &stringhash, &stringequal, &stringprint, &stringcopy },	// 1
-//	{ &pointerhash, &pointerequal, &pointerprint, &pointercopy },	// 2
-	{ &memhash, &memequal, &memprint, &memcopy },	// 2 - treat pointers as ints
+	{ memhash, memequal, memprint, memcopy },	// 0
+	{ stringhash, stringequal, stringprint, stringcopy },	// 1
+//	{ pointerhash, pointerequal, pointerprint, pointercopy },	// 2
+	{ memhash, memequal, memprint, memcopy },	// 2 - treat pointers as ints
 };
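A note on mal above: it is a bump allocator that hands out 8-byte-aligned pieces of a current hunk and maps a fresh hunk (or a dedicated mapping, for big requests) when it runs dry. The sketch below shows that policy in standalone Linux C, with an assumed NHUNK and none of the runtime's stack-growth constraints; it is an illustration, not the runtime's code:

	#include <stdint.h>
	#include <sys/mman.h>

	enum { NHUNK = 128*1024 };	// assumed hunk size

	static unsigned char *hunk;	// current hunk
	static uint32_t nhunk;		// bytes left in it

	// Round n up to a multiple of m (m must be a power of two here).
	static uint32_t
	rnd(uint32_t n, uint32_t m)
	{
		return (n + m - 1) & ~(m - 1);
	}

	// Convenient wrapper around mmap, like brk above.
	static void*
	brk_(uint32_t n)
	{
		return mmap(NULL, n, PROT_READ|PROT_WRITE,
			MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
	}

	void*
	mal(uint32_t n)
	{
		unsigned char *v;

		n = rnd(n, 8);		// keep everything 64-bit aligned
		if(n > NHUNK)
			return brk_(n);	// big allocation: its own mapping
		if(n > nhunk){		// refill the hunk when it runs dry
			hunk = brk_(NHUNK);
			nhunk = NHUNK;
		}
		v = hunk;
		hunk += n;
		nhunk -= n;
		return v;		// never freed, by design
	}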
@@ -43,7 +43,7 @@ typedef struct M M;
 typedef	struct	Stktop		Stktop;
 typedef	struct	Alg		Alg;
 typedef	struct	Lock		Lock;
-typedef	struct	Rendez		Rendez;
+typedef	struct	Note		Note;
 typedef	struct	Mem		Mem;

 /*
@@ -62,6 +62,7 @@ enum
 	Grunnable,
 	Grunning,
 	Gwaiting,
+	Gmoribund,
 	Gdead,
 };
 enum
@@ -77,10 +78,9 @@ struct Lock
 {
 	uint32	key;
 };
-struct	Rendez
+struct	Note
 {
-	Lock*	l;
-	uint32	sleeping;	// someone is sleeping (Linux)
+	Lock	lock;
 };
 struct String
 {
@@ -124,8 +124,8 @@ struct G
 	int16	status;
 	int32	goid;
 	int32	selgen;		// valid sudog pointer
-	G*	runlink;
-	Lock	runlock;
+	G*	schedlink;
+	Note	stopped;
 	M*	m;		// for debuggers
 };
 struct Mem
@@ -147,9 +147,10 @@ struct M
 	byte*	moresp;
 	int32	siz1;
 	int32	siz2;
-	Rendez	waitr;
-	M*	waitlink;
-	int32	pid;		// for debuggers
+	Note	havenextg;
+	G*	nextg;
+	M*	schedlink;
+	int32	procid;		// for debuggers
 	Mem	mem;
 };
 struct Stktop
@@ -224,36 +225,34 @@ int32 write(int32, void*, int32);
 void	close(int32);
 int32	fstat(int32, void*);
 bool	cas(uint32*, uint32, uint32);
-uint32	xadd(uint32*, uint32);
 void	exit1(int32);
 void	ready(G*);
 byte*	getenv(int8*);
 int32	atoi(byte*);
-void	newosproc(M *mm, G *gg, void *stk, void (*fn)(void*), void *arg);
+void	newosproc(M *m, G *g, void *stk, void (*fn)(void));
 int32	getprocid(void);

 /*
  * mutual exclusion locks.  in the uncontended case,
  * as fast as spin locks (just a few user-level instructions),
  * but on the contention path they sleep in the kernel.
+ * a zeroed Lock is unlocked (no need to initialize each lock).
  */
 void	lock(Lock*);
 void	unlock(Lock*);
-void	lockinit(Lock*);

 /*
- * sleep and wakeup.
- * a Rendez is somewhere to sleep.  it is protected by the lock r->l.
- * the caller must acquire r->l, check the condition, and if the
- * condition is false, call rsleep.  rsleep will atomically drop the lock
- * and go to sleep.  a subsequent rwakeup (caller must hold r->l)
- * will wake up the guy who is rsleeping.  the lock keeps rsleep and
- * rwakeup from missing each other.
- * n.b. only one proc can rsleep on a given rendez at a time.
+ * sleep and wakeup on one-time events.
+ * before any calls to notesleep or notewakeup,
+ * must call noteclear to initialize the Note.
+ * then, any number of threads can call notesleep
+ * and exactly one thread can call notewakeup (once).
+ * once notewakeup has been called, all the notesleeps
+ * will return.  future notesleeps will return immediately.
 */
-void	rsleep(Rendez*);
-void	rwakeup(Rendez*);
-void	rwakeupandunlock(Rendez*);
+void	noteclear(Note*);
+void	notesleep(Note*);
+void	notewakeup(Note*);

 /*
  * low level go -called
...
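The notesleep/notewakeup contract above can be mimicked outside the runtime. The runtime reuses its futex lock, which relies on unlocking from a thread that is not the locker; a POSIX mutex does not permit that, so the portable sketch below uses a condition variable to get the same one-time-event semantics (illustration only, not the runtime's code):

	// One-time event with the noteclear/notesleep/notewakeup contract.
	#include <pthread.h>

	typedef struct {
		pthread_mutex_t mu;
		pthread_cond_t cv;
		int happened;
	} Note;

	void
	noteclear(Note *n)
	{
		pthread_mutex_init(&n->mu, NULL);
		pthread_cond_init(&n->cv, NULL);
		n->happened = 0;
	}

	// Called exactly once; releases every current and future sleeper.
	void
	notewakeup(Note *n)
	{
		pthread_mutex_lock(&n->mu);
		n->happened = 1;
		pthread_cond_broadcast(&n->cv);
		pthread_mutex_unlock(&n->mu);
	}

	// Returns once the event has happened; immediately if it already has.
	void
	notesleep(Note *n)
	{
		pthread_mutex_lock(&n->mu);
		while(!n->happened)
			pthread_cond_wait(&n->cv, &n->mu);
		pthread_mutex_unlock(&n->mu);
	}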
@@ -45,8 +45,6 @@ out:
 static void
 prbounds(int8* s, int32 a, int32 b, int32 c)
 {
-	int32 i;
-
 	prints(s);
 	prints(" ");
 	sys·printint(a);
@@ -115,7 +113,6 @@ strcmp(byte *s1, byte *s2)
 void
 sys·slicestring(string si, int32 lindex, int32 hindex, string so)
 {
-	string s, str;
 	int32 l;

 	if(si == nil)
@@ -154,8 +151,6 @@ sys·indexstring(string s, int32 i, byte b)
 void
 sys·intstring(int64 v, string s)
 {
-	int32 l;
-
 	s = mal(sizeof(s->len)+8);
 	s->len = runetochar(s->str, v);
 	FLUSH(&s);
...
@@ -7,21 +7,24 @@
 //

 // TODO(rsc): Either sys·exit or exit1 is wrong!
-TEXT sys·exit(SB),1,$-8
+// It looks like sys·exit is correct (exits the entire program)
+// and exit1 should be mimicking the OS X library routine
+// __bsdthread_terminate.
+TEXT sys·exit(SB),7,$-8
 	MOVL	8(SP), DI		// arg 1 exit status
 	MOVL	$(0x2000000+1), AX	// syscall entry
 	SYSCALL
 	CALL	notok(SB)
 	RET

-TEXT exit1(SB),1,$-8
+TEXT exit1(SB),7,$-8
 	MOVL	8(SP), DI		// arg 1 exit status
 	MOVL	$(0x2000000+1), AX	// syscall entry
 	SYSCALL
 	CALL	notok(SB)
 	RET

-TEXT sys·write(SB),1,$-8
+TEXT sys·write(SB),7,$-8
 	MOVL	8(SP), DI		// arg 1 fid
 	MOVQ	16(SP), SI		// arg 2 buf
 	MOVL	24(SP), DX		// arg 3 count
@@ -31,7 +34,7 @@ TEXT sys·write(SB),1,$-8
 	CALL	notok(SB)
 	RET

-TEXT open(SB),1,$-8
+TEXT open(SB),7,$-8
 	MOVQ	8(SP), DI
 	MOVL	16(SP), SI
 	MOVL	20(SP), DX
@@ -40,20 +43,20 @@ TEXT open(SB),1,$-8
 	SYSCALL
 	RET

-TEXT close(SB),1,$-8
+TEXT close(SB),7,$-8
 	MOVL	8(SP), DI
 	MOVL	$(0x2000000+6), AX	// syscall entry
 	SYSCALL
 	RET

-TEXT fstat(SB),1,$-8
+TEXT fstat(SB),7,$-8
 	MOVL	8(SP), DI
 	MOVQ	16(SP), SI
 	MOVL	$(0x2000000+339), AX	// syscall entry; really fstat64
 	SYSCALL
 	RET

-TEXT read(SB),1,$-8
+TEXT read(SB),7,$-8
 	MOVL	8(SP), DI
 	MOVQ	16(SP), SI
 	MOVL	24(SP), DX
@@ -61,7 +64,7 @@ TEXT read(SB),1,$-8
 	SYSCALL
 	RET

-TEXT write(SB),1,$-8
+TEXT write(SB),7,$-8
 	MOVL	8(SP), DI
 	MOVQ	16(SP), SI
 	MOVL	24(SP), DX
@@ -69,7 +72,7 @@ TEXT write(SB),1,$-8
 	SYSCALL
 	RET

-TEXT sys·sigaction(SB),1,$-8
+TEXT sys·sigaction(SB),7,$-8
 	MOVL	8(SP), DI		// arg 1 sig
 	MOVQ	16(SP), SI		// arg 2 act
 	MOVQ	24(SP), DX		// arg 3 oact
@@ -81,7 +84,7 @@ TEXT sys·sigaction(SB),1,$-8
 	CALL	notok(SB)
 	RET

-TEXT sigtramp(SB),1,$24
+TEXT sigtramp(SB),7,$24
 	MOVL	DX,0(SP)
 	MOVQ	CX,8(SP)
 	MOVQ	R8,16(SP)
@@ -101,7 +104,7 @@ TEXT sys·mmap(SB),7,$-8
 	CALL	notok(SB)
 	RET

-TEXT notok(SB),1,$-8
+TEXT notok(SB),7,$-8
 	MOVL	$0xf1, BP
 	MOVQ	BP, (BP)
 	RET
@@ -117,12 +120,12 @@ TEXT sys·memclr(SB),7,$-8
 	STOSQ
 	RET

-TEXT sys·getcallerpc+0(SB),1,$0
+TEXT sys·getcallerpc+0(SB),7,$0
 	MOVQ	x+0(FP),AX		// addr of first arg
 	MOVQ	-8(AX),AX		// get calling pc
 	RET

-TEXT sys·setcallerpc+0(SB),1,$0
+TEXT sys·setcallerpc+0(SB),7,$0
 	MOVQ	x+0(FP),AX		// addr of first arg
 	MOVQ	x+8(FP), BX
 	MOVQ	BX, -8(AX)		// set calling pc
...
@@ -6,19 +6,19 @@
 // System calls and other sys.stuff for AMD64, Linux
 //

-TEXT sys·exit(SB),1,$0-8
+TEXT sys·exit(SB),7,$0-8
 	MOVL	8(SP), DI
-	MOVL	$231, AX	// force all os threads to exit
+	MOVL	$231, AX	// exitgroup - force all os threads to exit
 	SYSCALL
 	RET

-TEXT exit1(SB),1,$0-8
+TEXT exit1(SB),7,$0-8
 	MOVL	8(SP), DI
-	MOVL	$60, AX	// exit the current os thread
+	MOVL	$60, AX	// exit - exit the current os thread
 	SYSCALL
 	RET

-TEXT open(SB),1,$0-16
+TEXT open(SB),7,$0-16
 	MOVQ	8(SP), DI
 	MOVL	16(SP), SI
 	MOVL	20(SP), DX
@@ -26,20 +26,20 @@ TEXT open(SB),1,$0-16
 	SYSCALL
 	RET

-TEXT close(SB),1,$0-8
+TEXT close(SB),7,$0-8
 	MOVL	8(SP), DI
 	MOVL	$3, AX			// syscall entry
 	SYSCALL
 	RET

-TEXT fstat(SB),1,$0-16
+TEXT fstat(SB),7,$0-16
 	MOVL	8(SP), DI
 	MOVQ	16(SP), SI
 	MOVL	$5, AX			// syscall entry
 	SYSCALL
 	RET

-TEXT read(SB),1,$0-24
+TEXT read(SB),7,$0-24
 	MOVL	8(SP), DI
 	MOVQ	16(SP), SI
 	MOVL	24(SP), DX
@@ -47,7 +47,7 @@ TEXT read(SB),1,$0-24
 	SYSCALL
 	RET

-TEXT write(SB),1,$0-24
+TEXT write(SB),7,$0-24
 	MOVL	8(SP), DI
 	MOVQ	16(SP), SI
 	MOVL	24(SP), DX
@@ -55,7 +55,7 @@ TEXT write(SB),1,$0-24
 	SYSCALL
 	RET

-TEXT sys·write(SB),1,$0-24
+TEXT sys·write(SB),7,$0-24
 	MOVL	8(SP), DI
 	MOVQ	16(SP), SI
 	MOVL	24(SP), DX
@@ -63,7 +63,7 @@ TEXT sys·write(SB),1,$0-24
 	SYSCALL
 	RET

-TEXT sys·rt_sigaction(SB),1,$0-32
+TEXT sys·rt_sigaction(SB),7,$0-32
 	MOVL	8(SP), DI
 	MOVQ	16(SP), SI
 	MOVQ	24(SP), DX
@@ -72,7 +72,7 @@ TEXT sys·rt_sigaction(SB),1,$0-32
 	SYSCALL
 	RET

-TEXT sigtramp(SB),1,$24-16
+TEXT sigtramp(SB),7,$24-16
 	MOVQ	DI,0(SP)
 	MOVQ	SI,8(SP)
 	MOVQ	DX,16(SP)
@@ -118,20 +118,20 @@ TEXT sys·memclr(SB),7,$0-16
 	STOSQ
 	RET

-TEXT sys·getcallerpc+0(SB),1,$0
+TEXT sys·getcallerpc+0(SB),7,$0
 	MOVQ	x+0(FP),AX		// addr of first arg
 	MOVQ	-8(AX),AX		// get calling pc
 	RET

-TEXT sys·setcallerpc+0(SB),1,$0
+TEXT sys·setcallerpc+0(SB),7,$0
 	MOVQ	x+0(FP),AX		// addr of first arg
 	MOVQ	x+8(FP), BX
 	MOVQ	BX, -8(AX)		// set calling pc
 	RET

 // int64 futex(int32 *uaddr, int32 op, int32 val,
 //	struct timespec *timeout, int32 *uaddr2, int32 val2);
-TEXT futex(SB),1,$0
+TEXT futex(SB),7,$0
 	MOVQ	8(SP), DI
 	MOVL	16(SP), SI
 	MOVL	20(SP), DX
@@ -142,17 +142,16 @@ TEXT futex(SB),1,$0
 	SYSCALL
 	RET

-// int64 clone(int32 flags, void *stack, M *m, G *g, void (*fn)(void*), void *arg);
+// int64 clone(int32 flags, void *stack, M *m, G *g, void (*fn)(void));
 TEXT clone(SB),7,$0
-	MOVL	8(SP), DI
-	MOVQ	16(SP), SI
+	MOVL	flags+8(SP), DI
+	MOVQ	stack+16(SP), SI

-	// Copy m, g, fn, arg off parent stack for use by child.
+	// Copy m, g, fn off parent stack for use by child.
 	// Careful: Linux system call clobbers CX and R11.
-	MOVQ	24(SP), R8
-	MOVQ	32(SP), R9
-	MOVQ	40(SP), R12
-	MOVQ	48(SP), R13
+	MOVQ	m+24(SP), R8
+	MOVQ	g+32(SP), R9
+	MOVQ	fn+40(SP), R12

 	MOVL	$56, AX
 	SYSCALL
@@ -162,21 +161,20 @@ TEXT clone(SB),7,$0
 	JEQ	2(PC)
 	RET

-	// In child, call fn(arg) on new stack
+	// In child, call fn on new stack
 	MOVQ	SI, SP
 	MOVQ	R8, R14	// m
 	MOVQ	R9, R15	// g
-	PUSHQ	R13
 	CALL	R12

 	// It shouldn't return.  If it does, exit
 	MOVL	$111, DI
 	MOVL	$60, AX
 	SYSCALL
 	JMP	-3(PC)	// keep exiting

 // int64 select(int32, void*, void*, void*, void*)
-TEXT select(SB),1,$0
+TEXT select(SB),7,$0
 	MOVL	8(SP), DI
 	MOVQ	16(SP), SI
 	MOVQ	24(SP), DX
@@ -187,14 +185,14 @@ TEXT select(SB),1,$0
 	RET

-TEXT getprocid(SB),1,$0
+// Linux allocates each thread its own pid, like Plan 9.
+// But the getpid() system call returns the pid of the
+// original thread (the one that exec started with),
+// no matter which thread asks.  This system call,
+// which Linux calls gettid, returns the actual pid of
+// the calling thread, not the fake one.
+//
+// int32 getprocid(void)
+TEXT getprocid(SB),7,$0
 	MOVL	$186, AX
 	SYSCALL
 	RET
...
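The getprocid comment is easy to verify from ordinary C: getpid(2) reports the process, while syscall 186 (gettid) reports the calling thread. A small demonstration, compiled with -pthread:

	#include <stdio.h>
	#include <pthread.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	static void*
	show(void *name)
	{
		// gettid has no libc wrapper on older systems; call it directly.
		printf("%s: getpid=%d gettid=%ld\n",
			(char*)name, getpid(), (long)syscall(SYS_gettid));
		return NULL;
	}

	int
	main(void)
	{
		pthread_t t;

		show("main");
		// The child prints the same getpid but a different gettid.
		pthread_create(&t, NULL, show, "child");
		pthread_join(t, NULL);
		return 0;
	}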