runtime: reduce frame size for runtime.cgocallback_gofunc

Tying preemption to stack splits means that we have to be able to complete the call to exitsyscall (inside cgocallbackg at least for now) without any stack split checks, meaning that the whole sequence has to work within 128 bytes of stack, unless we increase the size of the red zone. This CL frees up 24 bytes along that critical path on amd64. (The 32-bit systems have plenty of space because all their words are smaller.) R=dvyukov CC=golang-dev https://golang.org/cl/11676043

runtime: reduce frame size for runtime.cgocallback_gofunc
Tying preemption to stack splits means that we have to be able to complete the call to exitsyscall (inside cgocallbackg at least for now) without any stack split checks, meaning that the whole sequence has to work within 128 bytes of stack, unless we increase the size of the red zone. This CL frees up 24 bytes along that critical path on amd64. (The 32-bit systems have plenty of space because all their words are smaller.) R=dvyukov CC=golang-dev https://golang.org/cl/11676043
dba623b1 · Russ Cox · e97c8706 · dba623b1 · dba623b1 · dba623b1
Commit dba623b1 authored Jul 23, 2013 by Russ Cox
5 changed files
--- a/src/pkg/runtime/asm_386.s
+++ b/src/pkg/runtime/asm_386.s
@@ -524,7 +524,7 @@ TEXT runtime·cgocallback(SB),7,$12-12
 // cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize)
 // See cgocall.c for more details.
-TEXT runtime·cgocallback_gofunc(SB),7,$12-12
+TEXT runtime·cgocallback_gofunc(SB),7,$8-12
 	// If m is nil, Go did not create the current thread.
 	// Call needm to obtain one for temporary use.
 	// In this case, we're running on the thread stack, so there's
@@ -532,13 +532,12 @@ TEXT runtime·cgocallback_gofunc(SB),7,$12-12
 	// the linker analysis by using an indirect call through AX.
 	get_tls(CX)
 #ifdef GOOS_windows
+	MOVL	$0, BP
 	CMPL	CX, $0
-	JNE	3(PC)
+	JNE	2(PC)
-	PUSHL	$0
-	JMP needm
 #endif
 	MOVL	m(CX), BP
-	PUSHL	BP
+	MOVL	BP, 4(SP)
 	CMPL	BP, $0
 	JNE	havem
 needm:
@@ -552,55 +551,42 @@ havem:
 	// Save current m->g0->sched.sp on stack and then set it to SP.
 	// Save current sp in m->g0->sched.sp in preparation for
 	// switch back to m->curg stack.
+	// NOTE: unwindm knows that the saved g->sched.sp is at 0(SP).
 	MOVL	m_g0(BP), SI
-	PUSHL	(g_sched+gobuf_sp)(SI)
+	MOVL	(g_sched+gobuf_sp)(SI), AX
+	MOVL	AX, 0(SP)
 	MOVL	SP, (g_sched+gobuf_sp)(SI)
-	// Switch to m->curg stack and call runtime.cgocallbackg
+	// Switch to m->curg stack and call runtime.cgocallbackg.
-	// with the three arguments.  Because we are taking over
+	// Because we are taking over the execution of m->curg
-	// the execution of m->curg but *not* resuming what had
+	// but *not* resuming what had been running, we need to
-	// been running, we need to save that information (m->curg->sched)
+	// save that information (m->curg->sched) so we can restore it.
-	// so that we can restore it when we're done. 
 	// We can restore m->curg->sched.sp easily, because calling
 	// runtime.cgocallbackg leaves SP unchanged upon return.
 	// To save m->curg->sched.pc, we push it onto the stack.
 	// This has the added benefit that it looks to the traceback
 	// routine like cgocallbackg is going to return to that
-	// PC (because we defined cgocallbackg to have
+	// PC (because the frame we allocate below has the same
-	// a frame size of 12, the same amount that we use below),
+	// size as cgocallback_gofunc's frame declared above)
 	// so that the traceback will seamlessly trace back into
 	// the earlier calls.
-	MOVL	fn+0(FP), AX
+	//
-	MOVL	frame+4(FP), BX
+	// In the new goroutine, 0(SP) and 4(SP) are unused except
-	MOVL	framesize+8(FP), DX
+	// on Windows, where they are the SEH block.
 	MOVL	m_curg(BP), SI
 	MOVL	SI, g(CX)
 	MOVL	(g_sched+gobuf_sp)(SI), DI // prepare stack as DI
-	// Push gobuf.pc
 	MOVL	(g_sched+gobuf_pc)(SI), BP
-	SUBL	$4, DI
+	MOVL	BP, -4(DI)
-	MOVL	BP, 0(DI)
+	LEAL	-(4+8)(DI), SP
-	// Push arguments to cgocallbackg.
-	// Frame size here must match the frame size above plus the pushes
-	// to trick traceback routines into doing the right thing.
-	SUBL	$20, DI
-	MOVL	AX, 0(DI)
-	MOVL	BX, 4(DI)
-	MOVL	DX, 8(DI)
-	// Switch stack and make the call.
-	MOVL	DI, SP
 	CALL	runtime·cgocallbackg(SB)
 	// Restore g->sched (== m->curg->sched) from saved values.
 	get_tls(CX)
 	MOVL	g(CX), SI
-	MOVL	20(SP), BP
+	MOVL	8(SP), BP
 	MOVL	BP, (g_sched+gobuf_pc)(SI)
-	LEAL	(20+4)(SP), DI
+	LEAL	(8+4)(SP), DI
 	MOVL	DI, (g_sched+gobuf_sp)(SI)
 	// Switch back to m->g0's stack and restore m->g0->sched.sp.
@@ -610,11 +596,12 @@ havem:
 	MOVL	m_g0(BP), SI
 	MOVL	SI, g(CX)
 	MOVL	(g_sched+gobuf_sp)(SI), SP
-	POPL	(g_sched+gobuf_sp)(SI)
+	MOVL	0(SP), AX
+	MOVL	AX, (g_sched+gobuf_sp)(SI)
 	// If the m on entry was nil, we called needm above to borrow an m
 	// for the duration of the call. Since the call is over, return it with dropm.
-	POPL	BP
+	MOVL	8(SP), BP
 	CMPL	BP, $0
 	JNE 3(PC)
 	MOVL	$runtime·dropm(SB), AX

--- a/src/pkg/runtime/asm_amd64.s
+++ b/src/pkg/runtime/asm_amd64.s
@@ -563,7 +563,7 @@ TEXT runtime·cgocallback(SB),7,$24-24
 // cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize)
 // See cgocall.c for more details.
-TEXT runtime·cgocallback_gofunc(SB),7,$24-24
+TEXT runtime·cgocallback_gofunc(SB),7,$16-24
 	// If m is nil, Go did not create the current thread.
 	// Call needm to obtain one for temporary use.
 	// In this case, we're running on the thread stack, so there's
@@ -571,13 +571,12 @@ TEXT runtime·cgocallback_gofunc(SB),7,$24-24
 	// the linker analysis by using an indirect call through AX.
 	get_tls(CX)
 #ifdef GOOS_windows
+	MOVL	$0, BP
 	CMPQ	CX, $0
-	JNE	3(PC)
+	JNE	2(PC)
-	PUSHQ	$0
-	JMP	needm
 #endif
 	MOVQ	m(CX), BP
-	PUSHQ	BP
+	MOVQ	BP, 8(SP)
 	CMPQ	BP, $0
 	JNE	havem
 needm:
@@ -591,55 +590,42 @@ havem:
 	// Save current m->g0->sched.sp on stack and then set it to SP.
 	// Save current sp in m->g0->sched.sp in preparation for
 	// switch back to m->curg stack.
+	// NOTE: unwindm knows that the saved g->sched.sp is at 0(SP).
 	MOVQ	m_g0(BP), SI
-	PUSHQ	(g_sched+gobuf_sp)(SI)
+	MOVQ	(g_sched+gobuf_sp)(SI), AX
+	MOVQ	AX, 0(SP)
 	MOVQ	SP, (g_sched+gobuf_sp)(SI)
-	// Switch to m->curg stack and call runtime.cgocallbackg
+	// Switch to m->curg stack and call runtime.cgocallbackg.
-	// with the three arguments.  Because we are taking over
+	// Because we are taking over the execution of m->curg
-	// the execution of m->curg but *not* resuming what had
+	// but *not* resuming what had been running, we need to
-	// been running, we need to save that information (m->curg->sched)
+	// save that information (m->curg->sched) so we can restore it.
-	// so that we can restore it when we're done. 
 	// We can restore m->curg->sched.sp easily, because calling
 	// runtime.cgocallbackg leaves SP unchanged upon return.
 	// To save m->curg->sched.pc, we push it onto the stack.
 	// This has the added benefit that it looks to the traceback
 	// routine like cgocallbackg is going to return to that
-	// PC (because we defined cgocallbackg to have
+	// PC (because the frame we allocate below has the same
-	// a frame size of 24, the same amount that we use below),
+	// size as cgocallback_gofunc's frame declared above)
 	// so that the traceback will seamlessly trace back into
 	// the earlier calls.
-	MOVQ	fn+0(FP), AX
+	//
-	MOVQ	frame+8(FP), BX
+	// In the new goroutine, 0(SP) and 8(SP) are unused except
-	MOVQ	framesize+16(FP), DX
+	// on Windows, where they are the SEH block.
 	MOVQ	m_curg(BP), SI
 	MOVQ	SI, g(CX)
 	MOVQ	(g_sched+gobuf_sp)(SI), DI  // prepare stack as DI
-	// Push gobuf.pc
 	MOVQ	(g_sched+gobuf_pc)(SI), BP
-	SUBQ	$8, DI
+	MOVQ	BP, -8(DI)
-	MOVQ	BP, 0(DI)
+	LEAQ	-(8+16)(DI), SP
-	// Push arguments to cgocallbackg.
-	// Frame size here must match the frame size above plus the pushes
-	// to trick traceback routines into doing the right thing.
-	SUBQ	$40, DI
-	MOVQ	AX, 0(DI)
-	MOVQ	BX, 8(DI)
-	MOVQ	DX, 16(DI)
-	// Switch stack and make the call.
-	MOVQ	DI, SP
 	CALL	runtime·cgocallbackg(SB)
 	// Restore g->sched (== m->curg->sched) from saved values.
 	get_tls(CX)
 	MOVQ	g(CX), SI
-	MOVQ	40(SP), BP
+	MOVQ	16(SP), BP
 	MOVQ	BP, (g_sched+gobuf_pc)(SI)
-	LEAQ	(40+8)(SP), DI
+	LEAQ	(16+8)(SP), DI
 	MOVQ	DI, (g_sched+gobuf_sp)(SI)
 	// Switch back to m->g0's stack and restore m->g0->sched.sp.
@@ -649,11 +635,12 @@ havem:
 	MOVQ	m_g0(BP), SI
 	MOVQ	SI, g(CX)
 	MOVQ	(g_sched+gobuf_sp)(SI), SP
-	POPQ	(g_sched+gobuf_sp)(SI)
+	MOVQ	0(SP), AX
+	MOVQ	AX, (g_sched+gobuf_sp)(SI)
 	// If the m on entry was nil, we called needm above to borrow an m
 	// for the duration of the call. Since the call is over, return it with dropm.
-	POPQ	BP
+	MOVQ	8(SP), BP
 	CMPQ	BP, $0
 	JNE 3(PC)
 	MOVQ	$runtime·dropm(SB), AX

--- a/src/pkg/runtime/asm_arm.s
+++ b/src/pkg/runtime/asm_arm.s
@@ -331,7 +331,7 @@ TEXT runtime·cgocallback(SB),7,$12-12
 // cgocallback_gofunc(void (*fn)(void*), void *frame, uintptr framesize)
 // See cgocall.c for more details.
-TEXT	runtime·cgocallback_gofunc(SB),7,$12-12
+TEXT	runtime·cgocallback_gofunc(SB),7,$8-12
 	// Load m and g from thread-local storage.
 	MOVW	_cgo_load_gm(SB), R0
 	CMP	$0, R0
@@ -342,7 +342,7 @@ TEXT	runtime·cgocallback_gofunc(SB),7,$12-12
 	// In this case, we're running on the thread stack, so there's
 	// lots of space, but the linker doesn't know. Hide the call from
 	// the linker analysis by using an indirect call.
-	MOVW	m, savedm-12(SP)
+	MOVW	m, savedm-4(SP)
 	CMP	$0, m
 	B.NE havem
 	MOVW	$runtime·needm(SB), R0
@@ -353,51 +353,41 @@ havem:
 	// Save current m->g0->sched.sp on stack and then set it to SP.
 	// Save current sp in m->g0->sched.sp in preparation for
 	// switch back to m->curg stack.
+	// NOTE: unwindm knows that the saved g->sched.sp is at 4(R13) aka savedsp-8(SP).
 	MOVW	m_g0(m), R3
 	MOVW	(g_sched+gobuf_sp)(R3), R4
-	MOVW.W	R4, -4(R13)
+	MOVW	R4, savedsp-8(SP)
 	MOVW	R13, (g_sched+gobuf_sp)(R3)
-	// Switch to m->curg stack and call runtime.cgocallbackg
+	// Switch to m->curg stack and call runtime.cgocallbackg.
-	// with the three arguments.  Because we are taking over
+	// Because we are taking over the execution of m->curg
-	// the execution of m->curg but *not* resuming what had
+	// but *not* resuming what had been running, we need to
-	// been running, we need to save that information (m->curg->sched)
+	// save that information (m->curg->sched) so we can restore it.
-	// so that we can restore it when we're done. 
 	// We can restore m->curg->sched.sp easily, because calling
 	// runtime.cgocallbackg leaves SP unchanged upon return.
 	// To save m->curg->sched.pc, we push it onto the stack.
 	// This has the added benefit that it looks to the traceback
 	// routine like cgocallbackg is going to return to that
-	// PC (because we defined cgocallbackg to have
+	// PC (because the frame we allocate below has the same
-	// a frame size of 12, the same amount that we use below),
+	// size as cgocallback_gofunc's frame declared above)
 	// so that the traceback will seamlessly trace back into
 	// the earlier calls.
+	//
+	// In the new goroutine, -8(SP) and -4(SP) are unused.
 	MOVW	fn+4(FP), R0
 	MOVW	frame+8(FP), R1
 	MOVW	framesize+12(FP), R2
 	MOVW	m_curg(m), g
 	MOVW	(g_sched+gobuf_sp)(g), R4 // prepare stack as R4
-	// Push gobuf.pc
-	// Frame size here must match the frame size above plus the push
-	// to trick traceback routines into doing the right thing.
 	MOVW	(g_sched+gobuf_pc)(g), R5
-	MOVW.W	R5, -20(R4)
+	MOVW	R5, -12(R4)
+	MOVW	$-12(R4), R13
-	// Push arguments to cgocallbackg.
-	MOVW	R0, 4(R4)
-	MOVW	R1, 8(R4)
-	MOVW	R2, 12(R4)
-	// Switch stack and make the call.
-	MOVW	R4, R13
 	BL	runtime·cgocallbackg(SB)
 	// Restore g->sched (== m->curg->sched) from saved values.
 	MOVW	0(R13), R5
 	MOVW	R5, (g_sched+gobuf_pc)(g)
-	ADD	$(16+4), R13, R4
+	MOVW	$12(R13), R4
 	MOVW	R4, (g_sched+gobuf_sp)(g)
 	// Switch back to m->g0's stack and restore m->g0->sched.sp.
@@ -405,14 +395,12 @@ havem:
 	// so we do not have to restore it.)
 	MOVW	m_g0(m), g
 	MOVW	(g_sched+gobuf_sp)(g), R13
-	// POP R6
+	MOVW	savedsp-8(SP), R4
-	MOVW	0(R13), R6
+	MOVW	R4, (g_sched+gobuf_sp)(g)
-	ADD	$4, R13
-	MOVW	R6, (g_sched+gobuf_sp)(g)
 	// If the m on entry was nil, we called needm above to borrow an m
 	// for the duration of the call. Since the call is over, return it with dropm.
-	MOVW	savedm-12(SP), R6
+	MOVW	savedm-4(SP), R6
 	CMP	$0, R6
 	B.NE	3(PC)
 	MOVW	$runtime·dropm(SB), R0

--- a/src/pkg/runtime/cgocall.c
+++ b/src/pkg/runtime/cgocall.c
@@ -228,13 +228,25 @@ runtime·cfree(void *p)
 static FuncVal unwindmf = {unwindm};
+typedef struct CallbackArgs CallbackArgs;
+struct CallbackArgs
+{
+	FuncVal *fn;
+	void *arg;
+	uintptr argsize;
+};
+#define CBARGS (CallbackArgs*)((byte*)m->g0->sched.sp+(3+(thechar=='5'))*sizeof(void*))
 void
-runtime·cgocallbackg(FuncVal *fn, void *arg, uintptr argsize)
+runtime·cgocallbackg(void)
 {
 	Defer d;
+	CallbackArgs *cb;
 	if(m->racecall) {
-		reflect·call(fn, arg, argsize);
+		cb = CBARGS;
+		reflect·call(cb->fn, cb->arg, cb->argsize);
 		return;
 	}
@@ -261,7 +273,8 @@ runtime·cgocallbackg(FuncVal *fn, void *arg, uintptr argsize)
 		runtime·raceacquire(&cgosync);
 	// Invoke callback.
-	reflect·call(fn, arg, argsize);
+	cb = CBARGS;
+	reflect·call(cb->fn, cb->arg, cb->argsize);
 	if(raceenabled)
 		runtime·racereleasemerge(&cgosync);
@@ -286,9 +299,11 @@ unwindm(void)
 		runtime·throw("runtime: unwindm not implemented");
 	case '8':
 	case '6':
-	case '5':
 		m->g0->sched.sp = *(uintptr*)m->g0->sched.sp;
 		break;
+	case '5':
+		m->g0->sched.sp = *(uintptr*)((byte*)m->g0->sched.sp + 4);
+		break;
 	}
 }

--- a/src/pkg/runtime/proc.c
+++ b/src/pkg/runtime/proc.c
@@ -651,10 +651,10 @@ runtime·needm(byte x)
 	g->stackguard0 = g->stackguard;
 	// On windows/386, we need to put an SEH frame (two words)
-	// somewhere on the current stack. We are called
+	// somewhere on the current stack. We are called from cgocallback_gofunc
-	// from needm, and we know there is some available
+	// and we know that it will leave two unused words below m->curg->sched.sp.
-	// space one word into the argument frame. Use that.
+	// Use those.
-	m->seh = (SEH*)((uintptr*)&x + 1);
+	m->seh = (SEH*)((uintptr*)m->curg->sched.sp - 3);
 	// Initialize this thread to use the m.
 	runtime·asminit();