Commit b597e1ed authored by Ilya Tocar's avatar Ilya Tocar Committed by Russ Cox

runtime: speed up memclr with avx2 on amd64

Results are a bit noisy, but show good improvement (haswell)

name            old time/op    new time/op     delta
Memclr5-48        6.06ns ± 8%     5.65ns ± 8%    -6.81%  (p=0.000 n=20+20)
Memclr16-48       5.75ns ± 6%     5.71ns ± 6%      ~     (p=0.545 n=20+19)
Memclr64-48       6.54ns ± 5%     6.14ns ± 9%    -6.12%  (p=0.000 n=18+20)
Memclr256-48      10.1ns ±12%      9.9ns ±14%      ~     (p=0.285 n=20+19)
Memclr4096-48      104ns ± 8%       57ns ±15%   -44.98%  (p=0.000 n=20+20)
Memclr65536-48    2.45µs ± 5%     2.43µs ± 8%      ~     (p=0.665 n=16+20)
Memclr1M-48       58.7µs ±13%     56.4µs ±11%    -3.92%  (p=0.033 n=20+19)
Memclr4M-48        233µs ± 9%      234µs ± 9%      ~     (p=0.728 n=20+19)
Memclr8M-48        469µs ±11%      472µs ±16%      ~     (p=0.947 n=20+20)
Memclr16M-48       947µs ±10%      916µs ±10%      ~     (p=0.050 n=20+19)
Memclr64M-48      10.9ms ±10%      4.5ms ± 9%   -58.43%  (p=0.000 n=20+20)
GoMemclr5-48      3.80ns ±13%     3.38ns ± 6%   -11.02%  (p=0.000 n=20+20)
GoMemclr16-48     3.34ns ±15%     3.40ns ± 9%      ~     (p=0.351 n=20+20)
GoMemclr64-48     4.10ns ±15%     4.04ns ±10%      ~     (p=1.000 n=20+19)
GoMemclr256-48    7.75ns ±20%     7.88ns ± 9%      ~     (p=0.227 n=20+19)

name            old speed      new speed       delta
Memclr5-48       826MB/s ± 7%    886MB/s ± 8%    +7.32%  (p=0.000 n=20+20)
Memclr16-48     2.78GB/s ± 5%   2.81GB/s ± 6%      ~     (p=0.550 n=20+19)
Memclr64-48     9.79GB/s ± 5%  10.44GB/s ±10%    +6.64%  (p=0.000 n=18+20)
Memclr256-48    25.4GB/s ±14%   25.6GB/s ±12%      ~     (p=0.647 n=20+19)
Memclr4096-48   39.4GB/s ± 8%   72.0GB/s ±13%   +82.81%  (p=0.000 n=20+20)
Memclr65536-48  26.6GB/s ± 6%   27.0GB/s ± 9%      ~     (p=0.517 n=17+20)
Memclr1M-48     17.9GB/s ±12%   18.5GB/s ±11%      ~     (p=0.068 n=20+20)
Memclr4M-48     18.0GB/s ± 9%   17.8GB/s ±14%      ~     (p=0.547 n=20+20)
Memclr8M-48     17.9GB/s ±10%   17.8GB/s ±14%      ~     (p=0.947 n=20+20)
Memclr16M-48    17.8GB/s ± 9%   18.4GB/s ± 9%      ~     (p=0.050 n=20+19)
Memclr64M-48    6.19GB/s ±10%  14.87GB/s ± 9%  +140.11%  (p=0.000 n=20+20)
GoMemclr5-48    1.31GB/s ±10%   1.48GB/s ± 6%   +13.06%  (p=0.000 n=19+20)
GoMemclr16-48   4.81GB/s ±14%   4.71GB/s ± 8%      ~     (p=0.341 n=20+20)
GoMemclr64-48   15.7GB/s ±13%   15.8GB/s ±11%      ~     (p=0.967 n=20+19)

Change-Id: I393f3f20e2f31538d1b1dd70d6e5c201c106a095
Reviewed-on: https://go-review.googlesource.com/16773
Run-TryBot: Ilya Tocar <ilya.tocar@intel.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: default avatarKlaus Post <klauspost@gmail.com>
Reviewed-by: default avatarKeith Randall <khr@golang.org>
parent 98400426
......@@ -747,6 +747,7 @@ const (
AMOVNTHD
AMOVHDA
AVPCMPEQB
AVPXOR
AVPMOVMSKB
AVPAND
AVPTEST
......
......@@ -688,6 +688,7 @@ var Anames = []string{
"MOVNTHD",
"MOVHDA",
"VPCMPEQB",
"VPXOR",
"VPMOVMSKB",
"VPAND",
"VPTEST",
......
......@@ -1514,6 +1514,7 @@ var optab =
{AMOVNTHD, yxr_ml_vex, Pvex1, [23]uint8{0xe7}},
{AMOVHDA, yxmov_vex, Pvex1, [23]uint8{0x6f, 0x7f}},
{AVPCMPEQB, yxm_xm_xm, Pvex1, [23]uint8{0x74, 0x74}},
{AVPXOR, yxm_xm_xm, Pvex1, [23]uint8{0xef, 0xef}},
{AVPMOVMSKB, ymskb_vex, Pvex1, [23]uint8{0xd7}},
{AVPAND, yxm_xm_xm, Pvex1, [23]uint8{0xdb, 0xdb}},
{AVPBROADCASTB, yml_xr_vex, Pvex3, [23]uint8{0x78, 0x78}},
......
......@@ -36,8 +36,10 @@ tail:
JBE _65through128
CMPQ BX, $256
JBE _129through256
CMPB runtime·support_avx2(SB), $1
JE loop_preheader_avx2
// TODO: use branch table and BSR to make this just a single dispatch
// TODO: for really big clears, use MOVNTDQ.
// TODO: for really big clears, use MOVNTDQ, even without AVX2.
loop:
MOVOU X0, 0(DI)
......@@ -62,6 +64,57 @@ loop:
JAE loop
JMP tail
loop_preheader_avx2:
VPXOR X0, X0, X0
// For smaller sizes MOVNTDQ may be faster or slower depending on hardware.
// For larger sizes it is always faster, even on dual Xeons with 30M cache.
// TODO take into account actual LLC size. E. g. glibc uses LLC size/2.
CMPQ BX, $0x2000000
JAE loop_preheader_avx2_huge
loop_avx2:
MOVHDU X0, 0(DI)
MOVHDU X0, 32(DI)
MOVHDU X0, 64(DI)
MOVHDU X0, 96(DI)
SUBQ $128, BX
ADDQ $128, DI
CMPQ BX, $128
JAE loop_avx2
MOVHDU X0, -32(DI)(BX*1)
MOVHDU X0, -64(DI)(BX*1)
MOVHDU X0, -96(DI)(BX*1)
MOVHDU X0, -128(DI)(BX*1)
VZEROUPPER
RET
loop_preheader_avx2_huge:
// Align to 32 byte boundary
MOVHDU X0, 0(DI)
MOVQ DI, SI
ADDQ $32, DI
ANDQ $~31, DI
SUBQ DI, SI
ADDQ SI, BX
loop_avx2_huge:
MOVNTHD X0, 0(DI)
MOVNTHD X0, 32(DI)
MOVNTHD X0, 64(DI)
MOVNTHD X0, 96(DI)
SUBQ $128, BX
ADDQ $128, DI
CMPQ BX, $128
JAE loop_avx2_huge
// In the desciption of MOVNTDQ in [1]
// "... fencing operation implemented with the SFENCE or MFENCE instruction
// should be used in conjunction with MOVNTDQ instructions..."
// [1] 64-ia-32-architectures-software-developer-manual-325462.pdf
SFENCE
MOVHDU X0, -32(DI)(BX*1)
MOVHDU X0, -64(DI)(BX*1)
MOVHDU X0, -96(DI)(BX*1)
MOVHDU X0, -128(DI)(BX*1)
VZEROUPPER
RET
_1or2:
MOVB AX, (DI)
MOVB AX, -1(DI)(BX*1)
......
......@@ -196,6 +196,11 @@ func BenchmarkMemclr64(b *testing.B) { bmMemclr(b, 64) }
func BenchmarkMemclr256(b *testing.B) { bmMemclr(b, 256) }
func BenchmarkMemclr4096(b *testing.B) { bmMemclr(b, 4096) }
func BenchmarkMemclr65536(b *testing.B) { bmMemclr(b, 65536) }
func BenchmarkMemclr1M(b *testing.B) { bmMemclr(b, 1<<20) }
func BenchmarkMemclr4M(b *testing.B) { bmMemclr(b, 4<<20) }
func BenchmarkMemclr8M(b *testing.B) { bmMemclr(b, 8<<20) }
func BenchmarkMemclr16M(b *testing.B) { bmMemclr(b, 16<<20) }
func BenchmarkMemclr64M(b *testing.B) { bmMemclr(b, 64<<20) }
func bmGoMemclr(b *testing.B, n int) {
x := make([]byte, n)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment