runtime: reduce frame size for runtime.cgocallback_gofunc

Tying preemption to stack splits means that we have to able to complete the call to exitsyscall (inside cgocallbackg at least for now) without any stack split checks, meaning that the whole sequence has to work within 128 bytes of stack, unless we increase the size of the red zone. This CL frees up 24 bytes along that critical path on amd64. (The 32-bit systems have plenty of space because all their words are smaller.) R=dvyukov CC=golang-dev https://golang.org/cl/11676043

runtime: reduce frame size for runtime.cgocallback_gofunc
Tying preemption to stack splits means that we have to able to complete the call to exitsyscall (inside cgocallbackg at least for now) without any stack split checks, meaning that the whole sequence has to work within 128 bytes of stack, unless we increase the size of the red zone. This CL frees up 24 bytes along that critical path on amd64. (The 32-bit systems have plenty of space because all their words are smaller.) R=dvyukov CC=golang-dev https://golang.org/cl/11676043
dba623b1 · Russ Cox · e97c8706 · dba623b1 · dba623b1 · dba623b1
Commit dba623b1 authored Jul 23, 2013 by Russ Cox
5 changed files
--- a/src/pkg/runtime/asm_386.s
+++ b/src/pkg/runtime/asm_386.s
@@ -524,7 +524,7 @@ TEXT runtime·cgocallback(SB),7,$12-12

 // cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize)
 // See cgocall.c for more details.
-TEXT runtime·cgocallback_gofunc(SB),7,$12-12
+TEXT runtime·cgocallback_gofunc(SB),7,$8-12
 	// If m is nil, Go did not create the current thread.
 	// Call needm to obtain one for temporary use.
 	// In this case, we're running on the thread stack, so there's
@@ -532,13 +532,12 @@ TEXT runtime·cgocallback_gofunc(SB),7,$12-12
 	// the linker analysis by using an indirect call through AX.
 	get_tls(CX)
 #ifdef GOOS_windows
+	MOVL	$0, BP
 	CMPL	CX, $0
-	JNE	3(PC)
-	PUSHL	$0
-	JMP needm
+	JNE	2(PC)
 #endif
 	MOVL	m(CX), BP
-	PUSHL	BP
+	MOVL	BP, 4(SP)
 	CMPL	BP, $0
 	JNE	havem
 needm:
@@ -552,55 +551,42 @@ havem:
 	// Save current m->g0->sched.sp on stack and then set it to SP.
 	// Save current sp in m->g0->sched.sp in preparation for
 	// switch back to m->curg stack.
+	// NOTE: unwindm knows that the saved g->sched.sp is at 0(SP).
 	MOVL	m_g0(BP), SI
-	PUSHL	(g_sched+gobuf_sp)(SI)
+	MOVL	(g_sched+gobuf_sp)(SI), AX
+	MOVL	AX, 0(SP)
 	MOVL	SP, (g_sched+gobuf_sp)(SI)

-	// Switch to m->curg stack and call runtime.cgocallbackg
-	// with the three arguments.  Because we are taking over
-	// the execution of m->curg but *not* resuming what had
-	// been running, we need to save that information (m->curg->sched)
-	// so that we can restore it when we're done. 
+	// Switch to m->curg stack and call runtime.cgocallbackg.
+	// Because we are taking over the execution of m->curg
+	// but *not* resuming what had been running, we need to
+	// save that information (m->curg->sched) so we can restore it.
 	// We can restore m->curg->sched.sp easily, because calling
 	// runtime.cgocallbackg leaves SP unchanged upon return.
 	// To save m->curg->sched.pc, we push it onto the stack.
 	// This has the added benefit that it looks to the traceback
 	// routine like cgocallbackg is going to return to that
-	// PC (because we defined cgocallbackg to have
-	// a frame size of 12, the same amount that we use below),
+	// PC (because the frame we allocate below has the same
+	// size as cgocallback_gofunc's frame declared above)
 	// so that the traceback will seamlessly trace back into
 	// the earlier calls.
-	MOVL	fn+0(FP), AX
-	MOVL	frame+4(FP), BX
-	MOVL	framesize+8(FP), DX
-
+	//
+	// In the new goroutine, 0(SP) and 4(SP) are unused except
+	// on Windows, where they are the SEH block.
 	MOVL	m_curg(BP), SI
 	MOVL	SI, g(CX)
-	MOVL	(g_sched+gobuf_sp)(SI), DI  // prepare stack as DI
-
-	// Push gobuf.pc
+	MOVL	(g_sched+gobuf_sp)(SI), DI // prepare stack as DI
 	MOVL	(g_sched+gobuf_pc)(SI), BP
-	SUBL	$4, DI
-	MOVL	BP, 0(DI)
-
-	// Push arguments to cgocallbackg.
-	// Frame size here must match the frame size above plus the pushes
-	// to trick traceback routines into doing the right thing.
-	SUBL	$20, DI
-	MOVL	AX, 0(DI)
-	MOVL	BX, 4(DI)
-	MOVL	DX, 8(DI)
-	
-	// Switch stack and make the call.
-	MOVL	DI, SP
+	MOVL	BP, -4(DI)
+	LEAL	-(4+8)(DI), SP
 	CALL	runtime·cgocallbackg(SB)

 	// Restore g->sched (== m->curg->sched) from saved values.
 	get_tls(CX)
 	MOVL	g(CX), SI
-	MOVL	20(SP), BP
+	MOVL	8(SP), BP
 	MOVL	BP, (g_sched+gobuf_pc)(SI)
-	LEAL	(20+4)(SP), DI
+	LEAL	(8+4)(SP), DI
 	MOVL	DI, (g_sched+gobuf_sp)(SI)

 	// Switch back to m->g0's stack and restore m->g0->sched.sp.
@@ -610,11 +596,12 @@ havem:
 	MOVL	m_g0(BP), SI
 	MOVL	SI, g(CX)
 	MOVL	(g_sched+gobuf_sp)(SI), SP
-	POPL	(g_sched+gobuf_sp)(SI)
+	MOVL	0(SP), AX
+	MOVL	AX, (g_sched+gobuf_sp)(SI)
 	
 	// If the m on entry was nil, we called needm above to borrow an m
 	// for the duration of the call. Since the call is over, return it with dropm.
-	POPL	BP
+	MOVL	8(SP), BP
 	CMPL	BP, $0
 	JNE 3(PC)
 	MOVL	$runtime·dropm(SB), AX

--- a/src/pkg/runtime/asm_amd64.s
+++ b/src/pkg/runtime/asm_amd64.s
@@ -563,7 +563,7 @@ TEXT runtime·cgocallback(SB),7,$24-24

 // cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize)
 // See cgocall.c for more details.
-TEXT runtime·cgocallback_gofunc(SB),7,$24-24
+TEXT runtime·cgocallback_gofunc(SB),7,$16-24
 	// If m is nil, Go did not create the current thread.
 	// Call needm to obtain one for temporary use.
 	// In this case, we're running on the thread stack, so there's
@@ -571,13 +571,12 @@ TEXT runtime·cgocallback_gofunc(SB),7,$24-24
 	// the linker analysis by using an indirect call through AX.
 	get_tls(CX)
 #ifdef GOOS_windows
+	MOVL	$0, BP
 	CMPQ	CX, $0
-	JNE	3(PC)
-	PUSHQ	$0
-	JMP	needm
+	JNE	2(PC)
 #endif
 	MOVQ	m(CX), BP
-	PUSHQ	BP
+	MOVQ	BP, 8(SP)
 	CMPQ	BP, $0
 	JNE	havem
 needm:
@@ -591,55 +590,42 @@ havem:
 	// Save current m->g0->sched.sp on stack and then set it to SP.
 	// Save current sp in m->g0->sched.sp in preparation for
 	// switch back to m->curg stack.
+	// NOTE: unwindm knows that the saved g->sched.sp is at 0(SP).
 	MOVQ	m_g0(BP), SI
-	PUSHQ	(g_sched+gobuf_sp)(SI)
+	MOVQ	(g_sched+gobuf_sp)(SI), AX
+	MOVQ	AX, 0(SP)
 	MOVQ	SP, (g_sched+gobuf_sp)(SI)

-	// Switch to m->curg stack and call runtime.cgocallbackg
-	// with the three arguments.  Because we are taking over
-	// the execution of m->curg but *not* resuming what had
-	// been running, we need to save that information (m->curg->sched)
-	// so that we can restore it when we're done. 
+	// Switch to m->curg stack and call runtime.cgocallbackg.
+	// Because we are taking over the execution of m->curg
+	// but *not* resuming what had been running, we need to
+	// save that information (m->curg->sched) so we can restore it.
 	// We can restore m->curg->sched.sp easily, because calling
 	// runtime.cgocallbackg leaves SP unchanged upon return.
 	// To save m->curg->sched.pc, we push it onto the stack.
 	// This has the added benefit that it looks to the traceback
 	// routine like cgocallbackg is going to return to that
-	// PC (because we defined cgocallbackg to have
-	// a frame size of 24, the same amount that we use below),
+	// PC (because the frame we allocate below has the same
+	// size as cgocallback_gofunc's frame declared above)
 	// so that the traceback will seamlessly trace back into
 	// the earlier calls.
-	MOVQ	fn+0(FP), AX
-	MOVQ	frame+8(FP), BX
-	MOVQ	framesize+16(FP), DX
-
+	//
+	// In the new goroutine, 0(SP) and 8(SP) are unused except
+	// on Windows, where they are the SEH block.
 	MOVQ	m_curg(BP), SI
 	MOVQ	SI, g(CX)
 	MOVQ	(g_sched+gobuf_sp)(SI), DI  // prepare stack as DI
-
-	// Push gobuf.pc
 	MOVQ	(g_sched+gobuf_pc)(SI), BP
-	SUBQ	$8, DI
-	MOVQ	BP, 0(DI)
-
-	// Push arguments to cgocallbackg.
-	// Frame size here must match the frame size above plus the pushes
-	// to trick traceback routines into doing the right thing.
-	SUBQ	$40, DI
-	MOVQ	AX, 0(DI)
-	MOVQ	BX, 8(DI)
-	MOVQ	DX, 16(DI)
-	
-	// Switch stack and make the call.
-	MOVQ	DI, SP
+	MOVQ	BP, -8(DI)
+	LEAQ	-(8+16)(DI), SP
 	CALL	runtime·cgocallbackg(SB)

 	// Restore g->sched (== m->curg->sched) from saved values.
 	get_tls(CX)
 	MOVQ	g(CX), SI
-	MOVQ	40(SP), BP
+	MOVQ	16(SP), BP
 	MOVQ	BP, (g_sched+gobuf_pc)(SI)
-	LEAQ	(40+8)(SP), DI
+	LEAQ	(16+8)(SP), DI
 	MOVQ	DI, (g_sched+gobuf_sp)(SI)

 	// Switch back to m->g0's stack and restore m->g0->sched.sp.
@@ -649,11 +635,12 @@ havem:
 	MOVQ	m_g0(BP), SI
 	MOVQ	SI, g(CX)
 	MOVQ	(g_sched+gobuf_sp)(SI), SP
-	POPQ	(g_sched+gobuf_sp)(SI)
+	MOVQ	0(SP), AX
+	MOVQ	AX, (g_sched+gobuf_sp)(SI)
 	
 	// If the m on entry was nil, we called needm above to borrow an m
 	// for the duration of the call. Since the call is over, return it with dropm.
-	POPQ	BP
+	MOVQ	8(SP), BP
 	CMPQ	BP, $0
 	JNE 3(PC)
 	MOVQ	$runtime·dropm(SB), AX

--- a/src/pkg/runtime/asm_arm.s
+++ b/src/pkg/runtime/asm_arm.s
@@ -331,7 +331,7 @@ TEXT runtime·cgocallback(SB),7,$12-12

 // cgocallback_gofunc(void (*fn)(void*), void *frame, uintptr framesize)
 // See cgocall.c for more details.
-TEXT	runtime·cgocallback_gofunc(SB),7,$12-12
+TEXT	runtime·cgocallback_gofunc(SB),7,$8-12
 	// Load m and g from thread-local storage.
 	MOVW	_cgo_load_gm(SB), R0
 	CMP	$0, R0
@@ -342,7 +342,7 @@ TEXT	runtime·cgocallback_gofunc(SB),7,$12-12
 	// In this case, we're running on the thread stack, so there's
 	// lots of space, but the linker doesn't know. Hide the call from
 	// the linker analysis by using an indirect call.
-	MOVW	m, savedm-12(SP)
+	MOVW	m, savedm-4(SP)
 	CMP	$0, m
 	B.NE havem
 	MOVW	$runtime·needm(SB), R0
@@ -353,51 +353,41 @@ havem:
 	// Save current m->g0->sched.sp on stack and then set it to SP.
 	// Save current sp in m->g0->sched.sp in preparation for
 	// switch back to m->curg stack.
+	// NOTE: unwindm knows that the saved g->sched.sp is at 4(R13) aka savedsp-8(SP).
 	MOVW	m_g0(m), R3
 	MOVW	(g_sched+gobuf_sp)(R3), R4
-	MOVW.W	R4, -4(R13)
+	MOVW	R4, savedsp-8(SP)
 	MOVW	R13, (g_sched+gobuf_sp)(R3)

-	// Switch to m->curg stack and call runtime.cgocallbackg
-	// with the three arguments.  Because we are taking over
-	// the execution of m->curg but *not* resuming what had
-	// been running, we need to save that information (m->curg->sched)
-	// so that we can restore it when we're done. 
+	// Switch to m->curg stack and call runtime.cgocallbackg.
+	// Because we are taking over the execution of m->curg
+	// but *not* resuming what had been running, we need to
+	// save that information (m->curg->sched) so we can restore it.
 	// We can restore m->curg->sched.sp easily, because calling
 	// runtime.cgocallbackg leaves SP unchanged upon return.
 	// To save m->curg->sched.pc, we push it onto the stack.
 	// This has the added benefit that it looks to the traceback
 	// routine like cgocallbackg is going to return to that
-	// PC (because we defined cgocallbackg to have
-	// a frame size of 12, the same amount that we use below),
+	// PC (because the frame we allocate below has the same
+	// size as cgocallback_gofunc's frame declared above)
 	// so that the traceback will seamlessly trace back into
 	// the earlier calls.
+	//
+	// In the new goroutine, -8(SP) and -4(SP) are unused.
 	MOVW	fn+4(FP), R0
 	MOVW	frame+8(FP), R1
 	MOVW	framesize+12(FP), R2
-
 	MOVW	m_curg(m), g
 	MOVW	(g_sched+gobuf_sp)(g), R4 // prepare stack as R4
-
-	// Push gobuf.pc
-	// Frame size here must match the frame size above plus the push
-	// to trick traceback routines into doing the right thing.
 	MOVW	(g_sched+gobuf_pc)(g), R5
-	MOVW.W	R5, -20(R4)
-
-	// Push arguments to cgocallbackg.
-	MOVW	R0, 4(R4)
-	MOVW	R1, 8(R4)
-	MOVW	R2, 12(R4)
-	
-	// Switch stack and make the call.
-	MOVW	R4, R13
+	MOVW	R5, -12(R4)
+	MOVW	$-12(R4), R13
 	BL	runtime·cgocallbackg(SB)

 	// Restore g->sched (== m->curg->sched) from saved values.
 	MOVW	0(R13), R5
 	MOVW	R5, (g_sched+gobuf_pc)(g)
-	ADD	$(16+4), R13, R4
+	MOVW	$12(R13), R4
 	MOVW	R4, (g_sched+gobuf_sp)(g)

 	// Switch back to m->g0's stack and restore m->g0->sched.sp.
@@ -405,14 +395,12 @@ havem:
 	// so we do not have to restore it.)
 	MOVW	m_g0(m), g
 	MOVW	(g_sched+gobuf_sp)(g), R13
-	// POP R6
-	MOVW	0(R13), R6
-	ADD	$4, R13
-	MOVW	R6, (g_sched+gobuf_sp)(g)
+	MOVW	savedsp-8(SP), R4
+	MOVW	R4, (g_sched+gobuf_sp)(g)

 	// If the m on entry was nil, we called needm above to borrow an m
 	// for the duration of the call. Since the call is over, return it with dropm.
-	MOVW	savedm-12(SP), R6
+	MOVW	savedm-4(SP), R6
 	CMP	$0, R6
 	B.NE	3(PC)
 	MOVW	$runtime·dropm(SB), R0

--- a/src/pkg/runtime/cgocall.c
+++ b/src/pkg/runtime/cgocall.c
@@ -228,13 +228,25 @@ runtime·cfree(void *p)

 static FuncVal unwindmf = {unwindm};

+typedef struct CallbackArgs CallbackArgs;
+struct CallbackArgs
+{
+	FuncVal *fn;
+	void *arg;
+	uintptr argsize;
+};
+
+#define CBARGS (CallbackArgs*)((byte*)m->g0->sched.sp+(3+(thechar=='5'))*sizeof(void*))
+
 void
-runtime·cgocallbackg(FuncVal *fn, void *arg, uintptr argsize)
+runtime·cgocallbackg(void)
 {
 	Defer d;
+	CallbackArgs *cb;

 	if(m->racecall) {
-		reflect·call(fn, arg, argsize);
+		cb = CBARGS;
+		reflect·call(cb->fn, cb->arg, cb->argsize);
 		return;
 	}

@@ -261,7 +273,8 @@ runtime·cgocallbackg(FuncVal *fn, void *arg, uintptr argsize)
 		runtime·raceacquire(&cgosync);

 	// Invoke callback.
-	reflect·call(fn, arg, argsize);
+	cb = CBARGS;
+	reflect·call(cb->fn, cb->arg, cb->argsize);

 	if(raceenabled)
 		runtime·racereleasemerge(&cgosync);
@@ -286,9 +299,11 @@ unwindm(void)
 		runtime·throw("runtime: unwindm not implemented");
 	case '8':
 	case '6':
-	case '5':
 		m->g0->sched.sp = *(uintptr*)m->g0->sched.sp;
 		break;
+	case '5':
+		m->g0->sched.sp = *(uintptr*)((byte*)m->g0->sched.sp + 4);
+		break;
 	}
 }


--- a/src/pkg/runtime/proc.c
+++ b/src/pkg/runtime/proc.c
@@ -651,10 +651,10 @@ runtime·needm(byte x)
 	g->stackguard0 = g->stackguard;

 	// On windows/386, we need to put an SEH frame (two words)
-	// somewhere on the current stack. We are called
-	// from needm, and we know there is some available
-	// space one word into the argument frame. Use that.
-	m->seh = (SEH*)((uintptr*)&x + 1);
+	// somewhere on the current stack. We are called from cgocallback_gofunc
+	// and we know that it will leave two unused words below m->curg->sched.sp.
+	// Use those.
+	m->seh = (SEH*)((uintptr*)m->curg->sched.sp - 3);

 	// Initialize this thread to use the m.
 	runtime·asminit();