Commit 967564be authored by Ilya Tocar's avatar Ilya Tocar Committed by Keith Randall

runtime: optimize string comparison on amd64

Use AVX2 if possible.
Results below (haswell):

name                            old time/op    new time/op     delta
CompareStringEqual-6              8.77ns ± 0%     8.63ns ± 1%   -1.58%        (p=0.000 n=20+19)
CompareStringIdentical-6          5.02ns ± 0%     5.02ns ± 0%     ~     (all samples are equal)
CompareStringSameLength-6         7.51ns ± 0%     7.51ns ± 0%     ~     (all samples are equal)
CompareStringDifferentLength-6    1.56ns ± 0%     1.56ns ± 0%     ~     (all samples are equal)
CompareStringBigUnaligned-6        124µs ± 1%      105µs ± 5%  -14.99%        (p=0.000 n=20+18)
CompareStringBig-6                 112µs ± 1%      103µs ± 0%   -7.87%        (p=0.000 n=20+17)

name                            old speed      new speed       delta
CompareStringBigUnaligned-6     8.48GB/s ± 1%   9.98GB/s ± 5%  +17.67%        (p=0.000 n=20+18)
CompareStringBig-6              9.37GB/s ± 1%  10.17GB/s ± 0%   +8.54%        (p=0.000 n=20+17)

Change-Id: I1c949626dd2aaf9f633e3c888a9df71c82eed7e1
Reviewed-on: https://go-review.googlesource.com/16481Reviewed-by: default avatarKeith Randall <khr@golang.org>
Run-TryBot: Ilya Tocar <ilya.tocar@intel.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: default avatarKlaus Post <klauspost@gmail.com>
parent 8e496f1d
...@@ -748,6 +748,7 @@ const ( ...@@ -748,6 +748,7 @@ const (
AMOVHDA AMOVHDA
AVPCMPEQB AVPCMPEQB
AVPMOVMSKB AVPMOVMSKB
AVPAND
// from 386 // from 386
AJCXZW AJCXZW
......
...@@ -689,6 +689,7 @@ var Anames = []string{ ...@@ -689,6 +689,7 @@ var Anames = []string{
"MOVHDA", "MOVHDA",
"VPCMPEQB", "VPCMPEQB",
"VPMOVMSKB", "VPMOVMSKB",
"VPAND",
"JCXZW", "JCXZW",
"FCMOVCC", "FCMOVCC",
"FCMOVCS", "FCMOVCS",
......
...@@ -1509,6 +1509,7 @@ var optab = ...@@ -1509,6 +1509,7 @@ var optab =
{AMOVHDA, yxmov_vex, Pvex1, [23]uint8{0x6f, 0x7f}}, {AMOVHDA, yxmov_vex, Pvex1, [23]uint8{0x6f, 0x7f}},
{AVPCMPEQB, yxm_xm_xm, Pvex1, [23]uint8{0x74, 0x74}}, {AVPCMPEQB, yxm_xm_xm, Pvex1, [23]uint8{0x74, 0x74}},
{AVPMOVMSKB, ymskb_vex, Pvex1, [23]uint8{0xd7}}, {AVPMOVMSKB, ymskb_vex, Pvex1, [23]uint8{0xd7}},
{AVPAND, yxm_xm_xm, Pvex1, [23]uint8{0xdb, 0xdb}},
{obj.AUSEFIELD, ynop, Px, [23]uint8{0, 0}}, {obj.AUSEFIELD, ynop, Px, [23]uint8{0, 0}},
{obj.ATYPE, nil, 0, [23]uint8{}}, {obj.ATYPE, nil, 0, [23]uint8{}},
{obj.AFUNCDATA, yfuncdata, Px, [23]uint8{0, 0}}, {obj.AFUNCDATA, yfuncdata, Px, [23]uint8{0, 0}},
......
...@@ -1416,6 +1416,10 @@ eq: ...@@ -1416,6 +1416,10 @@ eq:
TEXT runtime·memeqbody(SB),NOSPLIT,$0-0 TEXT runtime·memeqbody(SB),NOSPLIT,$0-0
CMPQ BX, $8 CMPQ BX, $8
JB small JB small
CMPQ BX, $64
JB bigloop
CMPB runtime·support_avx2(SB), $1
JE hugeloop_avx2
// 64 bytes at a time using xmm registers // 64 bytes at a time using xmm registers
hugeloop: hugeloop:
...@@ -1445,6 +1449,30 @@ hugeloop: ...@@ -1445,6 +1449,30 @@ hugeloop:
MOVB $0, (AX) MOVB $0, (AX)
RET RET
// 64 bytes at a time using ymm registers
hugeloop_avx2:
CMPQ BX, $64
JB bigloop_avx2
MOVHDU (SI), X0
MOVHDU (DI), X1
MOVHDU 32(SI), X2
MOVHDU 32(DI), X3
VPCMPEQB X1, X0, X4
VPCMPEQB X2, X3, X5
VPAND X4, X5, X6
VPMOVMSKB X6, DX
ADDQ $64, SI
ADDQ $64, DI
SUBQ $64, BX
CMPL DX, $0xffffffff
JEQ hugeloop_avx2
VZEROUPPER
MOVB $0, (AX)
RET
bigloop_avx2:
VZEROUPPER
// 8 bytes at a time using 64-bit register // 8 bytes at a time using 64-bit register
bigloop: bigloop:
CMPQ BX, $8 CMPQ BX, $8
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment