Commit dba623b1 authored by Russ Cox

runtime: reduce frame size for runtime.cgocallback_gofunc

Tying preemption to stack splits means that we have to be able to
complete the call to exitsyscall (inside cgocallbackg at least for now)
without any stack split checks, meaning that the whole sequence
has to work within 128 bytes of stack, unless we increase the size
of the red zone. This CL frees up 24 bytes along that critical path
on amd64. (The 32-bit systems have plenty of space because all
their words are smaller.)

R=dvyukov
CC=golang-dev
https://golang.org/cl/11676043
parent e97c8706
...@@ -524,7 +524,7 @@ TEXT runtime·cgocallback(SB),7,$12-12 ...@@ -524,7 +524,7 @@ TEXT runtime·cgocallback(SB),7,$12-12
// cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize) // cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize)
// See cgocall.c for more details. // See cgocall.c for more details.
TEXT runtime·cgocallback_gofunc(SB),7,$12-12 TEXT runtime·cgocallback_gofunc(SB),7,$8-12
// If m is nil, Go did not create the current thread. // If m is nil, Go did not create the current thread.
// Call needm to obtain one for temporary use. // Call needm to obtain one for temporary use.
// In this case, we're running on the thread stack, so there's // In this case, we're running on the thread stack, so there's
...@@ -532,13 +532,12 @@ TEXT runtime·cgocallback_gofunc(SB),7,$12-12 ...@@ -532,13 +532,12 @@ TEXT runtime·cgocallback_gofunc(SB),7,$12-12
// the linker analysis by using an indirect call through AX. // the linker analysis by using an indirect call through AX.
get_tls(CX) get_tls(CX)
#ifdef GOOS_windows #ifdef GOOS_windows
MOVL $0, BP
CMPL CX, $0 CMPL CX, $0
JNE 3(PC) JNE 2(PC)
PUSHL $0
JMP needm
#endif #endif
MOVL m(CX), BP MOVL m(CX), BP
PUSHL BP MOVL BP, 4(SP)
CMPL BP, $0 CMPL BP, $0
JNE havem JNE havem
needm: needm:
...@@ -552,55 +551,42 @@ havem: ...@@ -552,55 +551,42 @@ havem:
// Save current m->g0->sched.sp on stack and then set it to SP. // Save current m->g0->sched.sp on stack and then set it to SP.
// Save current sp in m->g0->sched.sp in preparation for // Save current sp in m->g0->sched.sp in preparation for
// switch back to m->curg stack. // switch back to m->curg stack.
// NOTE: unwindm knows that the saved g->sched.sp is at 0(SP).
MOVL m_g0(BP), SI MOVL m_g0(BP), SI
PUSHL (g_sched+gobuf_sp)(SI) MOVL (g_sched+gobuf_sp)(SI), AX
MOVL AX, 0(SP)
MOVL SP, (g_sched+gobuf_sp)(SI) MOVL SP, (g_sched+gobuf_sp)(SI)
// Switch to m->curg stack and call runtime.cgocallbackg // Switch to m->curg stack and call runtime.cgocallbackg.
// with the three arguments. Because we are taking over // Because we are taking over the execution of m->curg
// the execution of m->curg but *not* resuming what had // but *not* resuming what had been running, we need to
// been running, we need to save that information (m->curg->sched) // save that information (m->curg->sched) so we can restore it.
// so that we can restore it when we're done.
// We can restore m->curg->sched.sp easily, because calling // We can restore m->curg->sched.sp easily, because calling
// runtime.cgocallbackg leaves SP unchanged upon return. // runtime.cgocallbackg leaves SP unchanged upon return.
// To save m->curg->sched.pc, we push it onto the stack. // To save m->curg->sched.pc, we push it onto the stack.
// This has the added benefit that it looks to the traceback // This has the added benefit that it looks to the traceback
// routine like cgocallbackg is going to return to that // routine like cgocallbackg is going to return to that
// PC (because we defined cgocallbackg to have // PC (because the frame we allocate below has the same
// a frame size of 12, the same amount that we use below), // size as cgocallback_gofunc's frame declared above)
// so that the traceback will seamlessly trace back into // so that the traceback will seamlessly trace back into
// the earlier calls. // the earlier calls.
MOVL fn+0(FP), AX //
MOVL frame+4(FP), BX // In the new goroutine, 0(SP) and 4(SP) are unused except
MOVL framesize+8(FP), DX // on Windows, where they are the SEH block.
MOVL m_curg(BP), SI MOVL m_curg(BP), SI
MOVL SI, g(CX) MOVL SI, g(CX)
MOVL (g_sched+gobuf_sp)(SI), DI // prepare stack as DI MOVL (g_sched+gobuf_sp)(SI), DI // prepare stack as DI
// Push gobuf.pc
MOVL (g_sched+gobuf_pc)(SI), BP MOVL (g_sched+gobuf_pc)(SI), BP
SUBL $4, DI MOVL BP, -4(DI)
MOVL BP, 0(DI) LEAL -(4+8)(DI), SP
// Push arguments to cgocallbackg.
// Frame size here must match the frame size above plus the pushes
// to trick traceback routines into doing the right thing.
SUBL $20, DI
MOVL AX, 0(DI)
MOVL BX, 4(DI)
MOVL DX, 8(DI)
// Switch stack and make the call.
MOVL DI, SP
CALL runtime·cgocallbackg(SB) CALL runtime·cgocallbackg(SB)
// Restore g->sched (== m->curg->sched) from saved values. // Restore g->sched (== m->curg->sched) from saved values.
get_tls(CX) get_tls(CX)
MOVL g(CX), SI MOVL g(CX), SI
MOVL 20(SP), BP MOVL 8(SP), BP
MOVL BP, (g_sched+gobuf_pc)(SI) MOVL BP, (g_sched+gobuf_pc)(SI)
LEAL (20+4)(SP), DI LEAL (8+4)(SP), DI
MOVL DI, (g_sched+gobuf_sp)(SI) MOVL DI, (g_sched+gobuf_sp)(SI)
// Switch back to m->g0's stack and restore m->g0->sched.sp. // Switch back to m->g0's stack and restore m->g0->sched.sp.
...@@ -610,11 +596,12 @@ havem: ...@@ -610,11 +596,12 @@ havem:
MOVL m_g0(BP), SI MOVL m_g0(BP), SI
MOVL SI, g(CX) MOVL SI, g(CX)
MOVL (g_sched+gobuf_sp)(SI), SP MOVL (g_sched+gobuf_sp)(SI), SP
POPL (g_sched+gobuf_sp)(SI) MOVL 0(SP), AX
MOVL AX, (g_sched+gobuf_sp)(SI)
// If the m on entry was nil, we called needm above to borrow an m // If the m on entry was nil, we called needm above to borrow an m
// for the duration of the call. Since the call is over, return it with dropm. // for the duration of the call. Since the call is over, return it with dropm.
POPL BP MOVL 8(SP), BP
CMPL BP, $0 CMPL BP, $0
JNE 3(PC) JNE 3(PC)
MOVL $runtime·dropm(SB), AX MOVL $runtime·dropm(SB), AX
......
...@@ -563,7 +563,7 @@ TEXT runtime·cgocallback(SB),7,$24-24 ...@@ -563,7 +563,7 @@ TEXT runtime·cgocallback(SB),7,$24-24
// cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize) // cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize)
// See cgocall.c for more details. // See cgocall.c for more details.
TEXT runtime·cgocallback_gofunc(SB),7,$24-24 TEXT runtime·cgocallback_gofunc(SB),7,$16-24
// If m is nil, Go did not create the current thread. // If m is nil, Go did not create the current thread.
// Call needm to obtain one for temporary use. // Call needm to obtain one for temporary use.
// In this case, we're running on the thread stack, so there's // In this case, we're running on the thread stack, so there's
...@@ -571,13 +571,12 @@ TEXT runtime·cgocallback_gofunc(SB),7,$24-24 ...@@ -571,13 +571,12 @@ TEXT runtime·cgocallback_gofunc(SB),7,$24-24
// the linker analysis by using an indirect call through AX. // the linker analysis by using an indirect call through AX.
get_tls(CX) get_tls(CX)
#ifdef GOOS_windows #ifdef GOOS_windows
MOVL $0, BP
CMPQ CX, $0 CMPQ CX, $0
JNE 3(PC) JNE 2(PC)
PUSHQ $0
JMP needm
#endif #endif
MOVQ m(CX), BP MOVQ m(CX), BP
PUSHQ BP MOVQ BP, 8(SP)
CMPQ BP, $0 CMPQ BP, $0
JNE havem JNE havem
needm: needm:
...@@ -591,55 +590,42 @@ havem: ...@@ -591,55 +590,42 @@ havem:
// Save current m->g0->sched.sp on stack and then set it to SP. // Save current m->g0->sched.sp on stack and then set it to SP.
// Save current sp in m->g0->sched.sp in preparation for // Save current sp in m->g0->sched.sp in preparation for
// switch back to m->curg stack. // switch back to m->curg stack.
// NOTE: unwindm knows that the saved g->sched.sp is at 0(SP).
MOVQ m_g0(BP), SI MOVQ m_g0(BP), SI
PUSHQ (g_sched+gobuf_sp)(SI) MOVQ (g_sched+gobuf_sp)(SI), AX
MOVQ AX, 0(SP)
MOVQ SP, (g_sched+gobuf_sp)(SI) MOVQ SP, (g_sched+gobuf_sp)(SI)
// Switch to m->curg stack and call runtime.cgocallbackg // Switch to m->curg stack and call runtime.cgocallbackg.
// with the three arguments. Because we are taking over // Because we are taking over the execution of m->curg
// the execution of m->curg but *not* resuming what had // but *not* resuming what had been running, we need to
// been running, we need to save that information (m->curg->sched) // save that information (m->curg->sched) so we can restore it.
// so that we can restore it when we're done.
// We can restore m->curg->sched.sp easily, because calling // We can restore m->curg->sched.sp easily, because calling
// runtime.cgocallbackg leaves SP unchanged upon return. // runtime.cgocallbackg leaves SP unchanged upon return.
// To save m->curg->sched.pc, we push it onto the stack. // To save m->curg->sched.pc, we push it onto the stack.
// This has the added benefit that it looks to the traceback // This has the added benefit that it looks to the traceback
// routine like cgocallbackg is going to return to that // routine like cgocallbackg is going to return to that
// PC (because we defined cgocallbackg to have // PC (because the frame we allocate below has the same
// a frame size of 24, the same amount that we use below), // size as cgocallback_gofunc's frame declared above)
// so that the traceback will seamlessly trace back into // so that the traceback will seamlessly trace back into
// the earlier calls. // the earlier calls.
MOVQ fn+0(FP), AX //
MOVQ frame+8(FP), BX // In the new goroutine, 0(SP) and 8(SP) are unused except
MOVQ framesize+16(FP), DX // on Windows, where they are the SEH block.
MOVQ m_curg(BP), SI MOVQ m_curg(BP), SI
MOVQ SI, g(CX) MOVQ SI, g(CX)
MOVQ (g_sched+gobuf_sp)(SI), DI // prepare stack as DI MOVQ (g_sched+gobuf_sp)(SI), DI // prepare stack as DI
// Push gobuf.pc
MOVQ (g_sched+gobuf_pc)(SI), BP MOVQ (g_sched+gobuf_pc)(SI), BP
SUBQ $8, DI MOVQ BP, -8(DI)
MOVQ BP, 0(DI) LEAQ -(8+16)(DI), SP
// Push arguments to cgocallbackg.
// Frame size here must match the frame size above plus the pushes
// to trick traceback routines into doing the right thing.
SUBQ $40, DI
MOVQ AX, 0(DI)
MOVQ BX, 8(DI)
MOVQ DX, 16(DI)
// Switch stack and make the call.
MOVQ DI, SP
CALL runtime·cgocallbackg(SB) CALL runtime·cgocallbackg(SB)
// Restore g->sched (== m->curg->sched) from saved values. // Restore g->sched (== m->curg->sched) from saved values.
get_tls(CX) get_tls(CX)
MOVQ g(CX), SI MOVQ g(CX), SI
MOVQ 40(SP), BP MOVQ 16(SP), BP
MOVQ BP, (g_sched+gobuf_pc)(SI) MOVQ BP, (g_sched+gobuf_pc)(SI)
LEAQ (40+8)(SP), DI LEAQ (16+8)(SP), DI
MOVQ DI, (g_sched+gobuf_sp)(SI) MOVQ DI, (g_sched+gobuf_sp)(SI)
// Switch back to m->g0's stack and restore m->g0->sched.sp. // Switch back to m->g0's stack and restore m->g0->sched.sp.
...@@ -649,11 +635,12 @@ havem: ...@@ -649,11 +635,12 @@ havem:
MOVQ m_g0(BP), SI MOVQ m_g0(BP), SI
MOVQ SI, g(CX) MOVQ SI, g(CX)
MOVQ (g_sched+gobuf_sp)(SI), SP MOVQ (g_sched+gobuf_sp)(SI), SP
POPQ (g_sched+gobuf_sp)(SI) MOVQ 0(SP), AX
MOVQ AX, (g_sched+gobuf_sp)(SI)
// If the m on entry was nil, we called needm above to borrow an m // If the m on entry was nil, we called needm above to borrow an m
// for the duration of the call. Since the call is over, return it with dropm. // for the duration of the call. Since the call is over, return it with dropm.
POPQ BP MOVQ 8(SP), BP
CMPQ BP, $0 CMPQ BP, $0
JNE 3(PC) JNE 3(PC)
MOVQ $runtime·dropm(SB), AX MOVQ $runtime·dropm(SB), AX
......
...@@ -331,7 +331,7 @@ TEXT runtime·cgocallback(SB),7,$12-12 ...@@ -331,7 +331,7 @@ TEXT runtime·cgocallback(SB),7,$12-12
// cgocallback_gofunc(void (*fn)(void*), void *frame, uintptr framesize) // cgocallback_gofunc(void (*fn)(void*), void *frame, uintptr framesize)
// See cgocall.c for more details. // See cgocall.c for more details.
TEXT runtime·cgocallback_gofunc(SB),7,$12-12 TEXT runtime·cgocallback_gofunc(SB),7,$8-12
// Load m and g from thread-local storage. // Load m and g from thread-local storage.
MOVW _cgo_load_gm(SB), R0 MOVW _cgo_load_gm(SB), R0
CMP $0, R0 CMP $0, R0
...@@ -342,7 +342,7 @@ TEXT runtime·cgocallback_gofunc(SB),7,$12-12 ...@@ -342,7 +342,7 @@ TEXT runtime·cgocallback_gofunc(SB),7,$12-12
// In this case, we're running on the thread stack, so there's // In this case, we're running on the thread stack, so there's
// lots of space, but the linker doesn't know. Hide the call from // lots of space, but the linker doesn't know. Hide the call from
// the linker analysis by using an indirect call. // the linker analysis by using an indirect call.
MOVW m, savedm-12(SP) MOVW m, savedm-4(SP)
CMP $0, m CMP $0, m
B.NE havem B.NE havem
MOVW $runtime·needm(SB), R0 MOVW $runtime·needm(SB), R0
...@@ -353,51 +353,41 @@ havem: ...@@ -353,51 +353,41 @@ havem:
// Save current m->g0->sched.sp on stack and then set it to SP. // Save current m->g0->sched.sp on stack and then set it to SP.
// Save current sp in m->g0->sched.sp in preparation for // Save current sp in m->g0->sched.sp in preparation for
// switch back to m->curg stack. // switch back to m->curg stack.
// NOTE: unwindm knows that the saved g->sched.sp is at 4(R13) aka savedsp-8(SP).
MOVW m_g0(m), R3 MOVW m_g0(m), R3
MOVW (g_sched+gobuf_sp)(R3), R4 MOVW (g_sched+gobuf_sp)(R3), R4
MOVW.W R4, -4(R13) MOVW R4, savedsp-8(SP)
MOVW R13, (g_sched+gobuf_sp)(R3) MOVW R13, (g_sched+gobuf_sp)(R3)
// Switch to m->curg stack and call runtime.cgocallbackg // Switch to m->curg stack and call runtime.cgocallbackg.
// with the three arguments. Because we are taking over // Because we are taking over the execution of m->curg
// the execution of m->curg but *not* resuming what had // but *not* resuming what had been running, we need to
// been running, we need to save that information (m->curg->sched) // save that information (m->curg->sched) so we can restore it.
// so that we can restore it when we're done.
// We can restore m->curg->sched.sp easily, because calling // We can restore m->curg->sched.sp easily, because calling
// runtime.cgocallbackg leaves SP unchanged upon return. // runtime.cgocallbackg leaves SP unchanged upon return.
// To save m->curg->sched.pc, we push it onto the stack. // To save m->curg->sched.pc, we push it onto the stack.
// This has the added benefit that it looks to the traceback // This has the added benefit that it looks to the traceback
// routine like cgocallbackg is going to return to that // routine like cgocallbackg is going to return to that
// PC (because we defined cgocallbackg to have // PC (because the frame we allocate below has the same
// a frame size of 12, the same amount that we use below), // size as cgocallback_gofunc's frame declared above)
// so that the traceback will seamlessly trace back into // so that the traceback will seamlessly trace back into
// the earlier calls. // the earlier calls.
//
// In the new goroutine, -8(SP) and -4(SP) are unused.
MOVW fn+4(FP), R0 MOVW fn+4(FP), R0
MOVW frame+8(FP), R1 MOVW frame+8(FP), R1
MOVW framesize+12(FP), R2 MOVW framesize+12(FP), R2
MOVW m_curg(m), g MOVW m_curg(m), g
MOVW (g_sched+gobuf_sp)(g), R4 // prepare stack as R4 MOVW (g_sched+gobuf_sp)(g), R4 // prepare stack as R4
// Push gobuf.pc
// Frame size here must match the frame size above plus the push
// to trick traceback routines into doing the right thing.
MOVW (g_sched+gobuf_pc)(g), R5 MOVW (g_sched+gobuf_pc)(g), R5
MOVW.W R5, -20(R4) MOVW R5, -12(R4)
MOVW $-12(R4), R13
// Push arguments to cgocallbackg.
MOVW R0, 4(R4)
MOVW R1, 8(R4)
MOVW R2, 12(R4)
// Switch stack and make the call.
MOVW R4, R13
BL runtime·cgocallbackg(SB) BL runtime·cgocallbackg(SB)
// Restore g->sched (== m->curg->sched) from saved values. // Restore g->sched (== m->curg->sched) from saved values.
MOVW 0(R13), R5 MOVW 0(R13), R5
MOVW R5, (g_sched+gobuf_pc)(g) MOVW R5, (g_sched+gobuf_pc)(g)
ADD $(16+4), R13, R4 MOVW $12(R13), R4
MOVW R4, (g_sched+gobuf_sp)(g) MOVW R4, (g_sched+gobuf_sp)(g)
// Switch back to m->g0's stack and restore m->g0->sched.sp. // Switch back to m->g0's stack and restore m->g0->sched.sp.
...@@ -405,14 +395,12 @@ havem: ...@@ -405,14 +395,12 @@ havem:
// so we do not have to restore it.) // so we do not have to restore it.)
MOVW m_g0(m), g MOVW m_g0(m), g
MOVW (g_sched+gobuf_sp)(g), R13 MOVW (g_sched+gobuf_sp)(g), R13
// POP R6 MOVW savedsp-8(SP), R4
MOVW 0(R13), R6 MOVW R4, (g_sched+gobuf_sp)(g)
ADD $4, R13
MOVW R6, (g_sched+gobuf_sp)(g)
// If the m on entry was nil, we called needm above to borrow an m // If the m on entry was nil, we called needm above to borrow an m
// for the duration of the call. Since the call is over, return it with dropm. // for the duration of the call. Since the call is over, return it with dropm.
MOVW savedm-12(SP), R6 MOVW savedm-4(SP), R6
CMP $0, R6 CMP $0, R6
B.NE 3(PC) B.NE 3(PC)
MOVW $runtime·dropm(SB), R0 MOVW $runtime·dropm(SB), R0
......
...@@ -228,13 +228,25 @@ runtime·cfree(void *p) ...@@ -228,13 +228,25 @@ runtime·cfree(void *p)
static FuncVal unwindmf = {unwindm}; static FuncVal unwindmf = {unwindm};
// CallbackArgs groups the three values that runtime·cgocallbackg
// forwards to reflect·call. cgocallbackg takes no parameters of its
// own; it obtains a pointer to this struct through CBARGS.
typedef struct CallbackArgs CallbackArgs;
struct CallbackArgs
{
// First argument passed to reflect·call.
FuncVal *fn;
// Second argument passed to reflect·call.
void *arg;
// Third argument passed to reflect·call.
uintptr argsize;
};
// CBARGS yields a CallbackArgs* computed from m->g0->sched.sp advanced
// by 3 pointer-sized words, or by 4 words when thechar is '5'.
// NOTE(review): presumably the skipped words are the slots that
// cgocallback_gofunc keeps above the saved g0 SP (saved sp, saved m,
// return PC, plus one extra word on '5') — confirm against the assembly.
#define CBARGS (CallbackArgs*)((byte*)m->g0->sched.sp+(3+(thechar=='5'))*sizeof(void*))
void void
runtime·cgocallbackg(FuncVal *fn, void *arg, uintptr argsize) runtime·cgocallbackg(void)
{ {
Defer d; Defer d;
CallbackArgs *cb;
if(m->racecall) { if(m->racecall) {
reflect·call(fn, arg, argsize); cb = CBARGS;
reflect·call(cb->fn, cb->arg, cb->argsize);
return; return;
} }
...@@ -261,7 +273,8 @@ runtime·cgocallbackg(FuncVal *fn, void *arg, uintptr argsize) ...@@ -261,7 +273,8 @@ runtime·cgocallbackg(FuncVal *fn, void *arg, uintptr argsize)
runtime·raceacquire(&cgosync); runtime·raceacquire(&cgosync);
// Invoke callback. // Invoke callback.
reflect·call(fn, arg, argsize); cb = CBARGS;
reflect·call(cb->fn, cb->arg, cb->argsize);
if(raceenabled) if(raceenabled)
runtime·racereleasemerge(&cgosync); runtime·racereleasemerge(&cgosync);
...@@ -286,9 +299,11 @@ unwindm(void) ...@@ -286,9 +299,11 @@ unwindm(void)
runtime·throw("runtime: unwindm not implemented"); runtime·throw("runtime: unwindm not implemented");
case '8': case '8':
case '6': case '6':
case '5':
m->g0->sched.sp = *(uintptr*)m->g0->sched.sp; m->g0->sched.sp = *(uintptr*)m->g0->sched.sp;
break; break;
case '5':
m->g0->sched.sp = *(uintptr*)((byte*)m->g0->sched.sp + 4);
break;
} }
} }
......
...@@ -651,10 +651,10 @@ runtime·needm(byte x) ...@@ -651,10 +651,10 @@ runtime·needm(byte x)
g->stackguard0 = g->stackguard; g->stackguard0 = g->stackguard;
// On windows/386, we need to put an SEH frame (two words) // On windows/386, we need to put an SEH frame (two words)
// somewhere on the current stack. We are called // somewhere on the current stack. We are called from cgocallback_gofunc
// from needm, and we know there is some available // and we know that it will leave two unused words below m->curg->sched.sp.
// space one word into the argument frame. Use that. // Use those.
m->seh = (SEH*)((uintptr*)&x + 1); m->seh = (SEH*)((uintptr*)m->curg->sched.sp - 3);
// Initialize this thread to use the m. // Initialize this thread to use the m.
runtime·asminit(); runtime·asminit();
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment