Commit 7e0c11c3 authored by Josh Bleecher Snyder

cmd/6g, runtime: improve duffzero throughput

It is faster to execute

	MOVQ AX,(DI)
	MOVQ AX,8(DI)
	MOVQ AX,16(DI)
	MOVQ AX,24(DI)
	ADDQ $32,DI

than

	STOSQ
	STOSQ
	STOSQ
	STOSQ

However, in order to be able to jump into
the middle of a block of MOVQs, the call
site needs to pre-adjust DI.

If we're clearing a small area, the cost
of that DI pre-adjustment isn't repaid.

This CL switches the DUFFZERO implementation
to use a hybrid strategy, in which small
clears use STOSQ as before, but large clears
use mostly MOVQ/ADDQ blocks.
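
For illustration (a sketch, not part of the change; the helper name
preadjust is made up here, while the CL itself adds equivalent dzDI/dzOff
helpers to cmd/6g in the diff below): small clears need no adjustment
because they use only the trailing STOSQs, and other clears move DI back
so that the trailing MOVQs of a partially executed block land on the
intended words.

	package main

	import "fmt"

	// preadjust (illustrative name) returns how far DI must be moved back
	// before jumping into DUFFZERO to clear q 8-byte words.
	func preadjust(q int64) int64 {
		if q < 4 {
			return 0 // handled entirely by the trailing STOSQs
		}
		tail := q % 4 // words cleared by entering a MOVQ block part-way
		if tail == 0 {
			return 0 // whole blocks plus the STOSQ tail suffice
		}
		// The last `tail` MOVQs of a block store at displacements up to
		// 24(DI), so DI must start 8*(4-tail) bytes before the target.
		return -8 * (4 - tail)
	}

	func main() {
		// Clearing 9 words: one word from a block entered at its last MOVQ
		// (DI moved back 24 bytes), four from one full block, and four from
		// the trailing STOSQs.
		for _, q := range []int64{2, 8, 9, 11} {
			fmt.Printf("q=%2d words  DI pre-adjustment=%3d bytes\n", q, preadjust(q))
		}
	}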

benchmark                 old ns/op     new ns/op     delta
BenchmarkClearFat8        0.55          0.55          +0.00%
BenchmarkClearFat12       0.82          0.83          +1.22%
BenchmarkClearFat16       0.55          0.55          +0.00%
BenchmarkClearFat24       0.82          0.82          +0.00%
BenchmarkClearFat32       2.20          1.94          -11.82%
BenchmarkClearFat40       1.92          1.66          -13.54%
BenchmarkClearFat48       2.21          1.93          -12.67%
BenchmarkClearFat56       3.03          2.20          -27.39%
BenchmarkClearFat64       3.26          2.48          -23.93%
BenchmarkClearFat72       3.57          2.76          -22.69%
BenchmarkClearFat80       3.83          3.05          -20.37%
BenchmarkClearFat88       4.14          3.30          -20.29%
BenchmarkClearFat128      5.54          4.69          -15.34%
BenchmarkClearFat256      9.95          9.09          -8.64%
BenchmarkClearFat512      18.7          17.9          -4.28%
BenchmarkClearFat1024     36.2          35.4          -2.21%

Change-Id: Ic786406d9b3cab68d5a231688f9e66fcd1bd7103
Reviewed-on: https://go-review.googlesource.com/2585
Reviewed-by: Keith Randall <khr@golang.org>
parent 5ed90cbb
@@ -62,6 +62,55 @@ func defframe(ptxt *obj.Prog) {
 	zerorange(p, int64(frame), lo, hi, &ax)
 }
 
+// DUFFZERO consists of repeated blocks of 4 MOVs + ADD,
+// with 4 STOSQs at the very end.
+// The trailing STOSQs prevent the need for a DI preadjustment
+// for small numbers of words to clear.
+// See runtime/mkduff.go.
+const (
+	dzBlocks    = 31 // number of MOV/ADD blocks
+	dzBlockLen  = 4  // number of clears per block
+	dzBlockSize = 19 // size of instructions in a single block
+	dzMovSize   = 4  // size of single MOV instruction w/ offset
+	dzAddSize   = 4  // size of single ADD instruction
+	dzDIStep    = 8  // number of bytes cleared by each MOV instruction
+	dzTailLen   = 4  // number of final STOSQ instructions
+	dzTailSize  = 2  // size of single STOSQ instruction
+	dzSize      = dzBlocks*dzBlockSize + dzTailLen*dzTailSize // total size of DUFFZERO routine
+)
+
+// dzDI returns the pre-adjustment to DI for a call to DUFFZERO.
+// q is the number of words to zero.
+func dzDI(q int64) int64 {
+	if q < dzTailLen {
+		return 0
+	}
+	q -= dzTailLen
+	if q%dzBlockLen == 0 {
+		return 0
+	}
+	return -dzDIStep * (dzBlockLen - q%dzBlockLen)
+}
+
+// dzOff returns the offset for a jump into DUFFZERO.
+// q is the number of words to zero.
+func dzOff(q int64) int64 {
+	off := int64(dzSize)
+	if q < dzTailLen {
+		return off - q*dzTailSize
+	}
+	off -= dzTailLen * dzTailSize
+	q -= dzTailLen
+	blocks, steps := q/dzBlockLen, q%dzBlockLen
+	off -= dzBlockSize * blocks
+	if steps > 0 {
+		off -= dzAddSize + dzMovSize*steps
+	}
+	return off
+}
+
 func zerorange(p *obj.Prog, frame int64, lo int64, hi int64, ax *uint32) *obj.Prog {
 	cnt := hi - lo
 	if cnt == 0 {
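
Aside from the diff itself, the arithmetic above can be checked mechanically.
The following self-contained sketch re-declares the constants and the
dzDI/dzOff logic from this hunk so it runs standalone, models the instruction
sizes (a 3-byte MOVQ with no displacement, 4-byte displaced MOVQs and ADDQ,
2-byte STOSQ), and verifies that entering at dzOff(q) with DI pre-adjusted by
dzDI(q) clears exactly q consecutive words for every q the compiler can
request (1 through 128).

package main

import "fmt"

// Constants and helpers copied from the hunk above, so this file runs standalone.
const (
	dzBlocks    = 31
	dzBlockLen  = 4
	dzBlockSize = 19
	dzMovSize   = 4
	dzAddSize   = 4
	dzDIStep    = 8
	dzTailLen   = 4
	dzTailSize  = 2
	dzSize      = dzBlocks*dzBlockSize + dzTailLen*dzTailSize
)

func dzDI(q int64) int64 {
	if q < dzTailLen {
		return 0
	}
	q -= dzTailLen
	if q%dzBlockLen == 0 {
		return 0
	}
	return -dzDIStep * (dzBlockLen - q%dzBlockLen)
}

func dzOff(q int64) int64 {
	off := int64(dzSize)
	if q < dzTailLen {
		return off - q*dzTailSize
	}
	off -= dzTailLen * dzTailSize
	q -= dzTailLen
	blocks, steps := q/dzBlockLen, q%dzBlockLen
	off -= dzBlockSize * blocks
	if steps > 0 {
		off -= dzAddSize + dzMovSize*steps
	}
	return off
}

// simulate walks a model of the generated duffzero body, skipping every
// instruction that lies before byte offset entry, and records which 8-byte
// words get zeroed. The first MOVQ of a block has no displacement and encodes
// in 3 bytes; the displaced MOVQs and the ADDQ take 4 bytes each (hence
// dzBlockSize = 3+4+4+4+4 = 19), and a STOSQ takes 2 bytes.
func simulate(entry, di int64) map[int64]bool {
	cleared := make(map[int64]bool)
	pc := int64(0)
	for b := 0; b < dzBlocks; b++ {
		for i, size := range []int64{3, dzMovSize, dzMovSize, dzMovSize} {
			if pc >= entry { // MOVQ AX, (8*i)(DI)
				cleared[di+int64(i)*dzDIStep] = true
			}
			pc += size
		}
		if pc >= entry { // ADDQ $32, DI
			di += dzBlockLen * dzDIStep
		}
		pc += dzAddSize
	}
	for i := 0; i < dzTailLen; i++ { // trailing STOSQs
		if pc >= entry {
			cleared[di] = true
			di += dzDIStep
		}
		pc += dzTailSize
	}
	return cleared
}

func main() {
	for q := int64(1); q <= 128; q++ {
		// The region to be zeroed starts at address 0 in this model.
		cleared := simulate(dzOff(q), dzDI(q))
		ok := int64(len(cleared)) == q
		for w := int64(0); w < q; w++ {
			ok = ok && cleared[dzDIStep*w]
		}
		if !ok {
			fmt.Println("unexpected clear pattern for q =", q)
		}
	}
	fmt.Println("checked q = 1..128")
}
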
@@ -87,8 +136,9 @@ func zerorange(p *obj.Prog, frame int64, lo int64, hi int64, ax *uint32) *obj.Prog {
 			p = appendpp(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_AX, 0, obj.TYPE_MEM, x86.REG_SP, frame+lo+i)
 		}
 	} else if !gc.Nacl && (cnt <= int64(128*gc.Widthreg)) {
-		p = appendpp(p, leaptr, obj.TYPE_MEM, x86.REG_SP, frame+lo, obj.TYPE_REG, x86.REG_DI, 0)
-		p = appendpp(p, obj.ADUFFZERO, obj.TYPE_NONE, 0, 0, obj.TYPE_ADDR, 0, 2*(128-cnt/int64(gc.Widthreg)))
+		q := cnt / int64(gc.Widthreg)
+		p = appendpp(p, leaptr, obj.TYPE_MEM, x86.REG_SP, frame+lo+dzDI(q), obj.TYPE_REG, x86.REG_DI, 0)
+		p = appendpp(p, obj.ADUFFZERO, obj.TYPE_NONE, 0, 0, obj.TYPE_ADDR, 0, dzOff(q))
 		p.To.Sym = gc.Linksym(gc.Pkglookup("duffzero", gc.Runtimepkg))
 	} else {
 		p = appendpp(p, x86.AMOVQ, obj.TYPE_CONST, 0, cnt/int64(gc.Widthreg), obj.TYPE_REG, x86.REG_CX, 0)
@@ -562,12 +612,13 @@ func clearfat(nl *gc.Node) {
 		gins(x86.AREP, nil, nil)   // repeat
 		gins(x86.ASTOSQ, nil, nil) // STOQ AL,*(DI)+
 	} else {
+		if di := dzDI(q); di != 0 {
+			gconreg(addptr, di, x86.REG_DI)
+		}
 		p := gins(obj.ADUFFZERO, nil, nil)
 		p.To.Type = obj.TYPE_ADDR
 		p.To.Sym = gc.Linksym(gc.Pkglookup("duffzero", gc.Runtimepkg))
-		// 2 and 128 = magic constants: see ../../runtime/asm_amd64.s
-		p.To.Offset = 2 * (128 - q)
+		p.To.Offset = dzOff(q)
 	}
 	z := ax
...
@@ -5,130 +5,192 @@
 #include "textflag.h"
 
 TEXT runtime·duffzero(SB), NOSPLIT, $0-0
-	STOSQ
-	STOSQ
	(124 STOSQ instructions are removed in total; only the 4 trailing STOSQs below remain)
+	MOVQ AX,(DI)
+	MOVQ AX,8(DI)
+	MOVQ AX,16(DI)
+	MOVQ AX,24(DI)
+	ADDQ $32,DI
+
	(the five-instruction MOVQ/ADDQ block above is added 31 times in total, each followed by a blank line)
 	STOSQ
 	STOSQ
 	STOSQ
 	STOSQ
...
@@ -206,6 +206,24 @@ func BenchmarkClearFat32(b *testing.B) {
 		_ = x
 	}
 }
+func BenchmarkClearFat40(b *testing.B) {
+	for i := 0; i < b.N; i++ {
+		var x [40 / 4]uint32
+		_ = x
+	}
+}
+func BenchmarkClearFat48(b *testing.B) {
+	for i := 0; i < b.N; i++ {
+		var x [48 / 4]uint32
+		_ = x
+	}
+}
+func BenchmarkClearFat56(b *testing.B) {
+	for i := 0; i < b.N; i++ {
+		var x [56 / 4]uint32
+		_ = x
+	}
+}
 func BenchmarkClearFat64(b *testing.B) {
 	for i := 0; i < b.N; i++ {
 		var x [64 / 4]uint32
...
@@ -64,7 +64,15 @@ func zeroAMD64(w io.Writer) {
 	// DI: ptr to memory to be zeroed
 	// DI is updated as a side effect.
 	fmt.Fprintln(w, "TEXT runtime·duffzero(SB), NOSPLIT, $0-0")
-	for i := 0; i < 128; i++ {
+	for i := 0; i < 31; i++ {
+		fmt.Fprintln(w, "\tMOVQ\tAX,(DI)")
+		fmt.Fprintln(w, "\tMOVQ\tAX,8(DI)")
+		fmt.Fprintln(w, "\tMOVQ\tAX,16(DI)")
+		fmt.Fprintln(w, "\tMOVQ\tAX,24(DI)")
+		fmt.Fprintln(w, "\tADDQ\t$32,DI")
+		fmt.Fprintln(w)
+	}
+	for i := 0; i < 4; i++ {
 		fmt.Fprintln(w, "\tSTOSQ")
 	}
 	fmt.Fprintln(w, "\tRET")
...
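
For completeness, the generator loop above is easy to run on its own. A
minimal sketch, assuming only the standard library (the function name
zeroAMD64Body is made up here; it writes the routine to standard output
instead of going through the rest of mkduff.go):

package main

import (
	"fmt"
	"io"
	"os"
)

// zeroAMD64Body reproduces the duffzero loop from mkduff.go's zeroAMD64:
// 31 MOVQ/ADDQ blocks followed by 4 STOSQs and a RET.
func zeroAMD64Body(w io.Writer) {
	fmt.Fprintln(w, "TEXT runtime·duffzero(SB), NOSPLIT, $0-0")
	for i := 0; i < 31; i++ {
		fmt.Fprintln(w, "\tMOVQ\tAX,(DI)")
		fmt.Fprintln(w, "\tMOVQ\tAX,8(DI)")
		fmt.Fprintln(w, "\tMOVQ\tAX,16(DI)")
		fmt.Fprintln(w, "\tMOVQ\tAX,24(DI)")
		fmt.Fprintln(w, "\tADDQ\t$32,DI")
		fmt.Fprintln(w)
	}
	for i := 0; i < 4; i++ {
		fmt.Fprintln(w, "\tSTOSQ")
	}
	fmt.Fprintln(w, "\tRET")
}

func main() {
	zeroAMD64Body(os.Stdout)
}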