Commit 0f2ef0ad authored by Ilya Tocar

cmd/compile/internal/ssa: combine byte stores on amd64

On amd64 we optimize encoding/binary.BigEndian.PutUint{16,32,64}
into bswap + single store, but strangely enough not LittleEndian.PutUint{16,32}.
We have similar rules, but they use 64-bit shifts everywhere,
and fail for the 16/32-bit cases. Add rules that match LittleEndian.PutUint,
and relevant tests. Performance results:

LittleEndianPutUint16-6    1.43ns ± 0%    1.07ns ± 0%   -25.17%  (p=0.000 n=9+9)
LittleEndianPutUint32-6    2.14ns ± 0%    0.94ns ± 0%   -56.07%  (p=0.019 n=6+8)

LittleEndianPutUint16-6  1.40GB/s ± 0%  1.87GB/s ± 0%   +33.24%  (p=0.000 n=9+9)
LittleEndianPutUint32-6  1.87GB/s ± 0%  4.26GB/s ± 0%  +128.54%  (p=0.000 n=8+8)

Discovered while looking at ethereum_ethash from community benchmarks.

Change-Id: Id86d5443687ecddd2803edf3203dbdd1246f61fe
Reviewed-on: https://go-review.googlesource.com/95475
Run-TryBot: Ilya Tocar <ilya.tocar@intel.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
parent d7cd61ce
......@@ -338,6 +338,54 @@ var linuxAMD64Tests = []*asmTest{
`,
pos: []string{"\tMOVL\t\\(.*\\)\\(.*\\*1\\),"},
},
{
fn: `
func $(b []byte, v uint64) {
binary.LittleEndian.PutUint64(b, v)
}
`,
neg: []string{"SHRQ"},
},
{
fn: `
func $(b []byte, i int, v uint64) {
binary.LittleEndian.PutUint64(b[i:], v)
}
`,
neg: []string{"SHRQ"},
},
{
fn: `
func $(b []byte, v uint32) {
binary.LittleEndian.PutUint32(b, v)
}
`,
neg: []string{"SHRL", "SHRQ"},
},
{
fn: `
func $(b []byte, i int, v uint32) {
binary.LittleEndian.PutUint32(b[i:], v)
}
`,
neg: []string{"SHRL", "SHRQ"},
},
{
fn: `
func $(b []byte, v uint16) {
binary.LittleEndian.PutUint16(b, v)
}
`,
neg: []string{"SHRW", "SHRL", "SHRQ"},
},
{
fn: `
func $(b []byte, i int, v uint16) {
binary.LittleEndian.PutUint16(b[i:], v)
}
`,
neg: []string{"SHRW", "SHRL", "SHRQ"},
},
{
fn: `
func f6(b []byte) uint64 {
......
......@@ -2014,19 +2014,19 @@
-> (MOVQstoreidx1 [ValAndOff(a).Off()] {s} p (SHLQconst <i.Type> [2] i) (MOVQconst [ValAndOff(a).Val()&0xffffffff | ValAndOff(c).Val()<<32]) mem)
// Combine stores into larger (unaligned) stores.
(MOVBstore [i] {s} p (SHRQconst [8] w) x:(MOVBstore [i-1] {s} p w mem))
(MOVBstore [i] {s} p (SHR(W|L|Q)const [8] w) x:(MOVBstore [i-1] {s} p w mem))
&& x.Uses == 1
&& clobber(x)
-> (MOVWstore [i-1] {s} p w mem)
(MOVBstore [i] {s} p (SHRQconst [j] w) x:(MOVBstore [i-1] {s} p w0:(SHRQconst [j-8] w) mem))
(MOVBstore [i] {s} p (SHR(L|Q)const [j] w) x:(MOVBstore [i-1] {s} p w0:(SHR(L|Q)const [j-8] w) mem))
&& x.Uses == 1
&& clobber(x)
-> (MOVWstore [i-1] {s} p w0 mem)
(MOVWstore [i] {s} p (SHRQconst [16] w) x:(MOVWstore [i-2] {s} p w mem))
(MOVWstore [i] {s} p (SHR(L|Q)const [16] w) x:(MOVWstore [i-2] {s} p w mem))
&& x.Uses == 1
&& clobber(x)
-> (MOVLstore [i-2] {s} p w mem)
(MOVWstore [i] {s} p (SHRQconst [j] w) x:(MOVWstore [i-2] {s} p w0:(SHRQconst [j-16] w) mem))
(MOVWstore [i] {s} p (SHR(L|Q)const [j] w) x:(MOVWstore [i-2] {s} p w0:(SHR(L|Q)const [j-16] w) mem))
&& x.Uses == 1
&& clobber(x)
-> (MOVLstore [i-2] {s} p w0 mem)
......@@ -2039,19 +2039,19 @@
&& clobber(x)
-> (MOVQstore [i-4] {s} p w0 mem)
(MOVBstoreidx1 [i] {s} p idx (SHRQconst [8] w) x:(MOVBstoreidx1 [i-1] {s} p idx w mem))
(MOVBstoreidx1 [i] {s} p idx (SHR(W|L|Q)const [8] w) x:(MOVBstoreidx1 [i-1] {s} p idx w mem))
&& x.Uses == 1
&& clobber(x)
-> (MOVWstoreidx1 [i-1] {s} p idx w mem)
(MOVBstoreidx1 [i] {s} p idx (SHRQconst [j] w) x:(MOVBstoreidx1 [i-1] {s} p idx w0:(SHRQconst [j-8] w) mem))
(MOVBstoreidx1 [i] {s} p idx (SHR(L|Q)const [j] w) x:(MOVBstoreidx1 [i-1] {s} p idx w0:(SHR(L|Q)const [j-8] w) mem))
&& x.Uses == 1
&& clobber(x)
-> (MOVWstoreidx1 [i-1] {s} p idx w0 mem)
(MOVWstoreidx1 [i] {s} p idx (SHRQconst [16] w) x:(MOVWstoreidx1 [i-2] {s} p idx w mem))
(MOVWstoreidx1 [i] {s} p idx (SHR(L|Q)const [16] w) x:(MOVWstoreidx1 [i-2] {s} p idx w mem))
&& x.Uses == 1
&& clobber(x)
-> (MOVLstoreidx1 [i-2] {s} p idx w mem)
(MOVWstoreidx1 [i] {s} p idx (SHRQconst [j] w) x:(MOVWstoreidx1 [i-2] {s} p idx w0:(SHRQconst [j-16] w) mem))
(MOVWstoreidx1 [i] {s} p idx (SHR(L|Q)const [j] w) x:(MOVWstoreidx1 [i-2] {s} p idx w0:(SHR(L|Q)const [j-16] w) mem))
&& x.Uses == 1
&& clobber(x)
-> (MOVLstoreidx1 [i-2] {s} p idx w0 mem)
......@@ -2064,7 +2064,7 @@
&& clobber(x)
-> (MOVQstoreidx1 [i-4] {s} p idx w0 mem)
(MOVWstoreidx2 [i] {s} p idx (SHRQconst [16] w) x:(MOVWstoreidx2 [i-2] {s} p idx w mem))
(MOVWstoreidx2 [i] {s} p idx (SHR(L|Q)const [16] w) x:(MOVWstoreidx2 [i-2] {s} p idx w mem))
&& x.Uses == 1
&& clobber(x)
-> (MOVLstoreidx1 [i-2] {s} p (SHLQconst <idx.Type> [1] idx) w mem)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment