Commit 88f423ed authored by Josh Bleecher Snyder

cmd/internal/obj/x86: improve static branch prediction for wrapper prologue

Static branch prediction assumes that forward branches are not taken.
The existing wrapper prologue almost always takes the first forward
branch.
Move the rare case to the end of the function.

This CL is amd64 only. Other architectures will be done in separate CLs.

Updates #19042.

Package sort benchmarks:

SearchWrappers-8       104ns ± 2%   104ns ± 0%  -0.41%  (p=0.006 n=30+41)
SortString1K-8         128µs ± 1%   128µs ± 1%  -0.25%  (p=0.045 n=30+56)
SortString1K_Slice-8   117µs ± 1%   117µs ± 1%    ~     (p=0.855 n=30+59)
StableString1K-8      18.6µs ± 1%  18.6µs ± 1%    ~     (p=0.599 n=29+60)
SortInt1K-8           61.0µs ± 1%  56.5µs ± 1%  -7.36%  (p=0.000 n=29+58)
StableInt1K-8         74.6µs ± 1%  70.4µs ± 3%  -5.54%  (p=0.000 n=28+60)
StableInt1K_Slice-8   59.9µs ± 1%  58.3µs ± 4%  -2.64%  (p=0.000 n=29+60)
SortInt64K-8          6.02ms ± 2%  5.98ms ± 2%  -0.60%  (p=0.000 n=29+59)
SortInt64K_Slice-8    5.07ms ± 2%  5.05ms ± 2%  -0.38%  (p=0.006 n=30+58)
StableInt64K-8        6.41ms ± 1%  6.22ms ± 1%  -3.00%  (p=0.000 n=27+58)
Sort1e2-8             37.4µs ± 1%  37.1µs ± 1%  -0.91%  (p=0.000 n=30+57)
Stable1e2-8           74.8µs ± 1%  75.2µs ± 1%  +0.52%  (p=0.000 n=30+57)
Sort1e4-8             8.11ms ± 1%  8.01ms ± 1%  -1.20%  (p=0.000 n=30+59)
Stable1e4-8           24.3ms ± 1%  24.3ms ± 1%    ~     (p=0.157 n=30+60)
Sort1e6-8              1.25s ± 1%   1.23s ± 1%  -1.43%  (p=0.000 n=29+58)
Stable1e6-8            4.93s ± 1%   4.90s ± 1%  -0.56%  (p=0.000 n=29+59)
[Geo mean]             720µs        709µs       -1.52%

Assembly for sort.(*intPairs).Swap:

Before:

"".(*intPairs).Swap t=1 size=147 args=0x18 locals=0x8
	0x0000 00000 (<autogenerated>:1)	TEXT	"".(*intPairs).Swap(SB), $8-24
	0x0000 00000 (<autogenerated>:1)	MOVQ	(TLS), CX
	0x0009 00009 (<autogenerated>:1)	SUBQ	$8, SP
	0x000d 00013 (<autogenerated>:1)	MOVQ	BP, (SP)
	0x0011 00017 (<autogenerated>:1)	LEAQ	(SP), BP
	0x0015 00021 (<autogenerated>:1)	MOVQ	32(CX), BX
	0x0019 00025 (<autogenerated>:1)	TESTQ	BX, BX
	0x001c 00028 (<autogenerated>:1)	JEQ	43
	0x001e 00030 (<autogenerated>:1)	LEAQ	16(SP), DI
	0x0023 00035 (<autogenerated>:1)	CMPQ	(BX), DI
	0x0026 00038 (<autogenerated>:1)	JNE	43
	0x0028 00040 (<autogenerated>:1)	MOVQ	SP, (BX)
	0x002b 00043 (<autogenerated>:1)	NOP
	0x002b 00043 (<autogenerated>:1)	FUNCDATA	$0, gclocals·e6397a44f8e1b6e77d0f200b4fba5269(SB)
	0x002b 00043 (<autogenerated>:1)	FUNCDATA	$1, gclocals·69c1753bd5f81501d95132d08af04464(SB)
	0x002b 00043 (<autogenerated>:1)	MOVQ	""..this+16(FP), AX
	0x0030 00048 (<autogenerated>:1)	TESTQ	AX, AX
	0x0033 00051 (<autogenerated>:1)	JEQ	$0, 140
	0x0035 00053 (<autogenerated>:1)	MOVQ	(AX), CX
	0x0038 00056 (<autogenerated>:1)	MOVQ	8(AX), AX
	0x003c 00060 (<autogenerated>:1)	MOVQ	"".i+24(FP), DX
	0x0041 00065 (<autogenerated>:1)	CMPQ	DX, AX
	0x0044 00068 (<autogenerated>:1)	JCC	$0, 133
	0x0046 00070 (<autogenerated>:1)	SHLQ	$4, DX
	0x004a 00074 (<autogenerated>:1)	MOVQ	8(CX)(DX*1), BX
	0x004f 00079 (<autogenerated>:1)	MOVQ	(CX)(DX*1), SI
	0x0053 00083 (<autogenerated>:1)	MOVQ	"".j+32(FP), DI
	0x0058 00088 (<autogenerated>:1)	CMPQ	DI, AX
	0x005b 00091 (<autogenerated>:1)	JCC	$0, 133
	0x005d 00093 (<autogenerated>:1)	SHLQ	$4, DI
	0x0061 00097 (<autogenerated>:1)	MOVQ	8(CX)(DI*1), AX
	0x0066 00102 (<autogenerated>:1)	MOVQ	(CX)(DI*1), R8
	0x006a 00106 (<autogenerated>:1)	MOVQ	R8, (CX)(DX*1)
	0x006e 00110 (<autogenerated>:1)	MOVQ	AX, 8(CX)(DX*1)
	0x0073 00115 (<autogenerated>:1)	MOVQ	SI, (CX)(DI*1)
	0x0077 00119 (<autogenerated>:1)	MOVQ	BX, 8(CX)(DI*1)
	0x007c 00124 (<autogenerated>:1)	MOVQ	(SP), BP
	0x0080 00128 (<autogenerated>:1)	ADDQ	$8, SP
	0x0084 00132 (<autogenerated>:1)	RET
	0x0085 00133 (<autogenerated>:1)	PCDATA	$0, $1
	0x0085 00133 (<autogenerated>:1)	CALL	runtime.panicindex(SB)
	0x008a 00138 (<autogenerated>:1)	UNDEF
	0x008c 00140 (<autogenerated>:1)	PCDATA	$0, $1
	0x008c 00140 (<autogenerated>:1)	CALL	runtime.panicwrap(SB)
	0x0091 00145 (<autogenerated>:1)	UNDEF

After:

"".(*intPairs).Swap t=1 size=149 args=0x18 locals=0x8
	0x0000 00000 (<autogenerated>:1)	TEXT	"".(*intPairs).Swap(SB), $8-24
	0x0000 00000 (<autogenerated>:1)	MOVQ	(TLS), CX
	0x0009 00009 (<autogenerated>:1)	SUBQ	$8, SP
	0x000d 00013 (<autogenerated>:1)	MOVQ	BP, (SP)
	0x0011 00017 (<autogenerated>:1)	LEAQ	(SP), BP
	0x0015 00021 (<autogenerated>:1)	MOVQ	32(CX), BX
	0x0019 00025 (<autogenerated>:1)	TESTQ	BX, BX
	0x001c 00028 (<autogenerated>:1)	JNE	134
	0x001e 00030 (<autogenerated>:1)	NOP
	0x001e 00030 (<autogenerated>:1)	FUNCDATA	$0, gclocals·e6397a44f8e1b6e77d0f200b4fba5269(SB)
	0x001e 00030 (<autogenerated>:1)	FUNCDATA	$1, gclocals·69c1753bd5f81501d95132d08af04464(SB)
	0x001e 00030 (<autogenerated>:1)	MOVQ	""..this+16(FP), AX
	0x0023 00035 (<autogenerated>:1)	TESTQ	AX, AX
	0x0026 00038 (<autogenerated>:1)	JEQ	$0, 127
	0x0028 00040 (<autogenerated>:1)	MOVQ	(AX), CX
	0x002b 00043 (<autogenerated>:1)	MOVQ	8(AX), AX
	0x002f 00047 (<autogenerated>:1)	MOVQ	"".i+24(FP), DX
	0x0034 00052 (<autogenerated>:1)	CMPQ	DX, AX
	0x0037 00055 (<autogenerated>:1)	JCC	$0, 120
	0x0039 00057 (<autogenerated>:1)	SHLQ	$4, DX
	0x003d 00061 (<autogenerated>:1)	MOVQ	8(CX)(DX*1), BX
	0x0042 00066 (<autogenerated>:1)	MOVQ	(CX)(DX*1), SI
	0x0046 00070 (<autogenerated>:1)	MOVQ	"".j+32(FP), DI
	0x004b 00075 (<autogenerated>:1)	CMPQ	DI, AX
	0x004e 00078 (<autogenerated>:1)	JCC	$0, 120
	0x0050 00080 (<autogenerated>:1)	SHLQ	$4, DI
	0x0054 00084 (<autogenerated>:1)	MOVQ	8(CX)(DI*1), AX
	0x0059 00089 (<autogenerated>:1)	MOVQ	(CX)(DI*1), R8
	0x005d 00093 (<autogenerated>:1)	MOVQ	R8, (CX)(DX*1)
	0x0061 00097 (<autogenerated>:1)	MOVQ	AX, 8(CX)(DX*1)
	0x0066 00102 (<autogenerated>:1)	MOVQ	SI, (CX)(DI*1)
	0x006a 00106 (<autogenerated>:1)	MOVQ	BX, 8(CX)(DI*1)
	0x006f 00111 (<autogenerated>:1)	MOVQ	(SP), BP
	0x0073 00115 (<autogenerated>:1)	ADDQ	$8, SP
	0x0077 00119 (<autogenerated>:1)	RET
	0x0078 00120 (<autogenerated>:1)	PCDATA	$0, $1
	0x0078 00120 (<autogenerated>:1)	CALL	runtime.panicindex(SB)
	0x007d 00125 (<autogenerated>:1)	UNDEF
	0x007f 00127 (<autogenerated>:1)	PCDATA	$0, $1
	0x007f 00127 (<autogenerated>:1)	CALL	runtime.panicwrap(SB)
	0x0084 00132 (<autogenerated>:1)	UNDEF
	0x0086 00134 (<autogenerated>:1)	LEAQ	16(SP), DI
	0x008b 00139 (<autogenerated>:1)	CMPQ	(BX), DI
	0x008e 00142 (<autogenerated>:1)	JNE	30
	0x0090 00144 (<autogenerated>:1)	MOVQ	SP, (BX)
	0x0093 00147 (<autogenerated>:1)	JMP	30

Change-Id: Ie8c37f384bba10fbacaa754bb0a6b0a7e520ef01
Reviewed-on: https://go-review.googlesource.com/36893
Reviewed-by: Keith Randall <khr@golang.org>
parent f7f3514b
......@@ -748,16 +748,22 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym) {
//
// MOVQ g_panic(CX), BX
// TESTQ BX, BX
// JEQ end
// JNE checkargp
// end:
// NOP
// ... rest of function ...
// checkargp:
// LEAQ (autoffset+8)(SP), DI
// CMPQ panic_argp(BX), DI
// JNE end
// MOVQ SP, panic_argp(BX)
// end:
// NOP
// MOVQ SP, panic_argp(BX)
// JMP end
//
// The NOP is needed to give the jumps somewhere to land.
// It is a liblink NOP, not an x86 NOP: it encodes to 0 instruction bytes.
//
// The layout is chosen to help static branch prediction:
// Both conditional jumps are unlikely, so they are arranged to be forward jumps.
// MOVQ g_panic(CX), BX
p = obj.Appendp(ctxt, p)
......@@ -789,14 +795,23 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym) {
p.As = ATESTL
}
// JEQ end
p = obj.Appendp(ctxt, p)
p.As = AJEQ
p.To.Type = obj.TYPE_BRANCH
p1 := p
// JNE checkargp (checkargp to be resolved later)
jne := obj.Appendp(ctxt, p)
jne.As = AJNE
jne.To.Type = obj.TYPE_BRANCH
// end:
// NOP
end := obj.Appendp(ctxt, jne)
end.As = obj.ANOP
// Fast forward to end of function.
var last *obj.Prog
for last = end; last.Link != nil; last = last.Link {
}
// LEAQ (autoffset+8)(SP), DI
p = obj.Appendp(ctxt, p)
p = obj.Appendp(ctxt, last)
p.As = ALEAQ
p.From.Type = obj.TYPE_MEM
p.From.Reg = REG_SP
......@@ -807,6 +822,9 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym) {
p.As = ALEAL
}
// Set jne branch target.
jne.Pcond = p
// CMPQ panic_argp(BX), DI
p = obj.Appendp(ctxt, p)
p.As = ACMPQ
......@@ -830,7 +848,7 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym) {
p = obj.Appendp(ctxt, p)
p.As = AJNE
p.To.Type = obj.TYPE_BRANCH
p2 := p
p.Pcond = end
// MOVQ SP, panic_argp(BX)
p = obj.Appendp(ctxt, p)
......@@ -851,13 +869,14 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym) {
p.As = AMOVL
}
// NOP
// JMP end
p = obj.Appendp(ctxt, p)
p.As = obj.ANOP
p.As = obj.AJMP
p.To.Type = obj.TYPE_BRANCH
p.Pcond = end
// Set targets for jumps above to the NOP
p1.Pcond = p
p2.Pcond = p
// Reset p for following code.
p = end
}
for ; p != nil; p = p.Link {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment