Commit ecd9e8a2 authored by Chad Rosier, committed by Cherry Zhang

cmd/compile/internal/ssa: combine zero stores into larger stores on arm64

This reduces the go tool binary on arm64 by 12k.

go1 results on Amberwing:
name                   old time/op    new time/op    delta
RegexpMatchEasy0_32       249ns ± 0%     249ns ± 0%    ~     (p=0.087 n=10+10)
RegexpMatchEasy0_1K       584ns ± 0%     584ns ± 0%    ~     (all equal)
RegexpMatchEasy1_32       246ns ± 0%     246ns ± 0%    ~     (p=1.000 n=10+10)
RegexpMatchEasy1_1K       806ns ± 0%     806ns ± 0%    ~     (p=0.706 n=10+9)
RegexpMatchMedium_32      314ns ± 0%     314ns ± 0%    ~     (all equal)
RegexpMatchMedium_1K     52.1µs ± 0%    52.1µs ± 0%    ~     (p=0.245 n=10+8)
RegexpMatchHard_32       2.75µs ± 1%    2.75µs ± 1%    ~     (p=0.690 n=10+10)
RegexpMatchHard_1K       78.9µs ± 0%    78.9µs ± 1%    ~     (p=0.295 n=9+9)
FmtFprintfEmpty          58.5ns ± 0%    58.5ns ± 0%    ~     (all equal)
FmtFprintfString          112ns ± 0%     112ns ± 0%    ~     (all equal)
FmtFprintfInt             117ns ± 0%     116ns ± 0%  -0.85%  (p=0.000 n=10+10)
FmtFprintfIntInt          181ns ± 0%     181ns ± 0%    ~     (all equal)
FmtFprintfPrefixedInt     222ns ± 0%     224ns ± 0%  +0.90%  (p=0.000 n=9+10)
FmtFprintfFloat           318ns ± 1%     322ns ± 0%    ~     (p=0.059 n=10+8)
FmtManyArgs               736ns ± 1%     735ns ± 0%    ~     (p=0.206 n=9+9)
Gzip                      437ms ± 0%     436ms ± 0%  -0.25%  (p=0.000 n=10+10)
HTTPClientServer         89.8µs ± 1%    90.2µs ± 2%    ~     (p=0.393 n=10+10)
JSONEncode               20.1ms ± 1%    20.2ms ± 1%    ~     (p=0.065 n=9+10)
JSONDecode               94.2ms ± 1%    93.9ms ± 1%  -0.42%  (p=0.043 n=10+10)
GobDecode                12.7ms ± 1%    12.8ms ± 2%  +0.94%  (p=0.019 n=10+10)
GobEncode                12.1ms ± 0%    12.1ms ± 0%    ~     (p=0.052 n=10+10)
Mandelbrot200            5.06ms ± 0%    5.05ms ± 0%  -0.04%  (p=0.000 n=9+10)
TimeParse                 450ns ± 3%     446ns ± 0%    ~     (p=0.238 n=10+9)
TimeFormat                485ns ± 1%     483ns ± 1%    ~     (p=0.073 n=10+10)
Template                 90.4ms ± 0%    90.7ms ± 0%  +0.29%  (p=0.000 n=8+10)
GoParse                  6.01ms ± 0%    6.03ms ± 0%  +0.35%  (p=0.000 n=10+10)
BinaryTree17              11.7s ± 0%     11.7s ± 0%    ~     (p=0.481 n=10+10)
Revcomp                   669ms ± 0%     669ms ± 0%    ~     (p=0.315 n=10+10)
Fannkuch11                3.40s ± 0%     3.37s ± 0%  -0.92%  (p=0.000 n=10+10)
[Geo mean]               67.9µs         67.9µs       +0.02%

name                   old speed      new speed      delta
RegexpMatchEasy0_32     128MB/s ± 0%   128MB/s ± 0%  -0.08%  (p=0.003 n=8+10)
RegexpMatchEasy0_1K    1.75GB/s ± 0%  1.75GB/s ± 0%    ~     (p=0.642 n=8+10)
RegexpMatchEasy1_32     130MB/s ± 0%   130MB/s ± 0%    ~     (p=0.690 n=10+9)
RegexpMatchEasy1_1K    1.27GB/s ± 0%  1.27GB/s ± 0%    ~     (p=0.661 n=10+9)
RegexpMatchMedium_32   3.18MB/s ± 0%  3.18MB/s ± 0%    ~     (all equal)
RegexpMatchMedium_1K   19.7MB/s ± 0%  19.6MB/s ± 0%    ~     (p=0.190 n=10+9)
RegexpMatchHard_32     11.6MB/s ± 0%  11.6MB/s ± 1%    ~     (p=0.669 n=10+10)
RegexpMatchHard_1K     13.0MB/s ± 0%  13.0MB/s ± 0%    ~     (p=0.718 n=9+9)
Gzip                   44.4MB/s ± 0%  44.5MB/s ± 0%  +0.24%  (p=0.000 n=10+10)
JSONEncode             96.5MB/s ± 1%  96.1MB/s ± 1%    ~     (p=0.065 n=9+10)
JSONDecode             20.6MB/s ± 1%  20.7MB/s ± 1%  +0.42%  (p=0.041 n=10+10)
GobDecode              60.6MB/s ± 1%  60.0MB/s ± 2%  -0.92%  (p=0.016 n=10+10)
GobEncode              63.4MB/s ± 0%  63.6MB/s ± 0%    ~     (p=0.055 n=10+10)
Template               21.5MB/s ± 0%  21.4MB/s ± 0%  -0.30%  (p=0.000 n=9+10)
GoParse                9.64MB/s ± 0%  9.61MB/s ± 0%  -0.36%  (p=0.000 n=10+10)
Revcomp                 380MB/s ± 0%   380MB/s ± 0%    ~     (p=0.323 n=10+10)
[Geo mean]             56.0MB/s       55.9MB/s       -0.07%

Change-Id: Ia732fa57fbcf4767d72382516d9f16705d177736
Reviewed-on: https://go-review.googlesource.com/96435
Run-TryBot: Cherry Zhang <cherryyz@google.com>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
parent 3a9e4440
...@@ -2971,6 +2971,232 @@ var linuxARM64Tests = []*asmTest{ ...@@ -2971,6 +2971,232 @@ var linuxARM64Tests = []*asmTest{
`, `,
pos: []string{"\tCSEL\t"}, pos: []string{"\tCSEL\t"},
}, },
// Check that zero stores are combined into larger stores
{
fn: `
func $(b []byte) {
_ = b[1] // early bounds check to guarantee safety of writes below
b[0] = 0
b[1] = 0
}
`,
pos: []string{"MOVH\tZR"},
neg: []string{"MOVB"},
},
{
fn: `
func $(b []byte) {
_ = b[1] // early bounds check to guarantee safety of writes below
b[1] = 0
b[0] = 0
}
`,
pos: []string{"MOVH\tZR"},
neg: []string{"MOVB"},
},
{
fn: `
func $(b []byte) {
_ = b[3] // early bounds check to guarantee safety of writes below
b[0] = 0
b[1] = 0
b[2] = 0
b[3] = 0
}
`,
pos: []string{"MOVW\tZR"},
neg: []string{"MOVB", "MOVH"},
},
{
fn: `
func $(b []byte) {
_ = b[3] // early bounds check to guarantee safety of writes below
b[2] = 0
b[3] = 0
b[1] = 0
b[0] = 0
}
`,
pos: []string{"MOVW\tZR"},
neg: []string{"MOVB", "MOVH"},
},
{
fn: `
func $(h []uint16) {
_ = h[1] // early bounds check to guarantee safety of writes below
h[0] = 0
h[1] = 0
}
`,
pos: []string{"MOVW\tZR"},
neg: []string{"MOVB", "MOVH"},
},
{
fn: `
func $(h []uint16) {
_ = h[1] // early bounds check to guarantee safety of writes below
h[1] = 0
h[0] = 0
}
`,
pos: []string{"MOVW\tZR"},
neg: []string{"MOVB", "MOVH"},
},
{
fn: `
func $(b []byte) {
_ = b[7] // early bounds check to guarantee safety of writes below
b[0] = 0
b[1] = 0
b[2] = 0
b[3] = 0
b[4] = 0
b[5] = 0
b[6] = 0
b[7] = 0
}
`,
pos: []string{"MOVD\tZR"},
neg: []string{"MOVB", "MOVH", "MOVW"},
},
{
fn: `
func $(h []uint16) {
_ = h[3] // early bounds check to guarantee safety of writes below
h[0] = 0
h[1] = 0
h[2] = 0
h[3] = 0
}
`,
pos: []string{"MOVD\tZR"},
neg: []string{"MOVB", "MOVH", "MOVW"},
},
{
fn: `
func $(h []uint16) {
_ = h[3] // early bounds check to guarantee safety of writes below
h[2] = 0
h[3] = 0
h[1] = 0
h[0] = 0
}
`,
pos: []string{"MOVD\tZR"},
neg: []string{"MOVB", "MOVH", "MOVW"},
},
{
fn: `
func $(w []uint32) {
_ = w[1] // early bounds check to guarantee safety of writes below
w[0] = 0
w[1] = 0
}
`,
pos: []string{"MOVD\tZR"},
neg: []string{"MOVB", "MOVH", "MOVW"},
},
{
fn: `
func $(w []uint32) {
_ = w[1] // early bounds check to guarantee safety of writes below
w[1] = 0
w[0] = 0
}
`,
pos: []string{"MOVD\tZR"},
neg: []string{"MOVB", "MOVH", "MOVW"},
},
{
fn: `
func $(b []byte) {
_ = b[15] // early bounds check to guarantee safety of writes below
b[0] = 0
b[1] = 0
b[2] = 0
b[3] = 0
b[4] = 0
b[5] = 0
b[6] = 0
b[7] = 0
b[8] = 0
b[9] = 0
b[10] = 0
b[11] = 0
b[12] = 0
b[13] = 0
b[15] = 0
b[14] = 0
}
`,
pos: []string{"STP"},
neg: []string{"MOVB", "MOVH", "MOVW"},
},
{
fn: `
func $(h []uint16) {
_ = h[7] // early bounds check to guarantee safety of writes below
h[0] = 0
h[1] = 0
h[2] = 0
h[3] = 0
h[4] = 0
h[5] = 0
h[6] = 0
h[7] = 0
}
`,
pos: []string{"STP"},
neg: []string{"MOVB", "MOVH"},
},
{
fn: `
func $(w []uint32) {
_ = w[3] // early bounds check to guarantee safety of writes below
w[0] = 0
w[1] = 0
w[2] = 0
w[3] = 0
}
`,
pos: []string{"STP"},
neg: []string{"MOVB", "MOVH"},
},
{
fn: `
func $(w []uint32) {
_ = w[3] // early bounds check to guarantee safety of writes below
w[1] = 0
w[0] = 0
w[3] = 0
w[2] = 0
}
`,
pos: []string{"STP"},
neg: []string{"MOVB", "MOVH"},
},
{
fn: `
func $(d []uint64) {
_ = d[1] // early bounds check to guarantee safety of writes below
d[0] = 0
d[1] = 0
}
`,
pos: []string{"STP"},
neg: []string{"MOVB", "MOVH"},
},
{
fn: `
func $(d []uint64) {
_ = d[1] // early bounds check to guarantee safety of writes below
d[1] = 0
d[0] = 0
}
`,
pos: []string{"STP"},
neg: []string{"MOVB", "MOVH"},
},
} }
var linuxMIPSTests = []*asmTest{ var linuxMIPSTests = []*asmTest{
......
...@@ -1439,6 +1439,36 @@ ...@@ -1439,6 +1439,36 @@
&& clobber(o4) && clobber(o5) && clobber(s0) && clobber(o4) && clobber(o5) && clobber(s0)
-> @mergePoint(b,x0,x1,x2,x3,x4,x5,x6,x7) (REV <t> (MOVDload <t> {s} (OffPtr <p.Type> [i0] p) mem)) -> @mergePoint(b,x0,x1,x2,x3,x4,x5,x6,x7) (REV <t> (MOVDload <t> {s} (OffPtr <p.Type> [i0] p) mem))
// Combine zero stores into larger (unaligned) stores.
//
// Each rule matches a zero store whose memory argument x is itself a zero
// store of the same width with the same aux symbol {s}, and replaces the pair
// with a single zero store of twice the width at the lower of the two
// offsets, min(i,j). Because the result op of one rule is the op matched by
// the next, the rules can chain: MOVB -> MOVH -> MOVW -> MOVD -> MOVQ.
//
// Conditions shared by all four rules:
//   x.Uses == 1                   the inner store x has exactly one use
//   areAdjacentOffsets(i,j,size)  i and j differ by exactly the store width,
//                                 in either order
//   is32Bit(min(i,j))             presumably keeps the merged offset within
//                                 the encodable range; confirm against is32Bit
//   isSamePtr(ptr0, ptr1)         presumably both stores address the same
//                                 base pointer; confirm against isSamePtr
//   clobber(x)                    NOTE(review): presumably marks x as dead so
//                                 it is not matched again; confirm

// Two adjacent 1-byte zero stores -> one MOVHstorezero.
(MOVBstorezero [i] {s} ptr0 x:(MOVBstorezero [j] {s} ptr1 mem))
&& x.Uses == 1
&& areAdjacentOffsets(i,j,1)
&& is32Bit(min(i,j))
&& isSamePtr(ptr0, ptr1)
&& clobber(x)
-> (MOVHstorezero [min(i,j)] {s} ptr0 mem)
// Two adjacent 2-byte zero stores -> one MOVWstorezero.
(MOVHstorezero [i] {s} ptr0 x:(MOVHstorezero [j] {s} ptr1 mem))
&& x.Uses == 1
&& areAdjacentOffsets(i,j,2)
&& is32Bit(min(i,j))
&& isSamePtr(ptr0, ptr1)
&& clobber(x)
-> (MOVWstorezero [min(i,j)] {s} ptr0 mem)
// Two adjacent 4-byte zero stores -> one MOVDstorezero.
(MOVWstorezero [i] {s} ptr0 x:(MOVWstorezero [j] {s} ptr1 mem))
&& x.Uses == 1
&& areAdjacentOffsets(i,j,4)
&& is32Bit(min(i,j))
&& isSamePtr(ptr0, ptr1)
&& clobber(x)
-> (MOVDstorezero [min(i,j)] {s} ptr0 mem)
// Two adjacent 8-byte zero stores -> one MOVQstorezero.
(MOVDstorezero [i] {s} ptr0 x:(MOVDstorezero [j] {s} ptr1 mem))
&& x.Uses == 1
&& areAdjacentOffsets(i,j,8)
&& is32Bit(min(i,j))
&& isSamePtr(ptr0, ptr1)
&& clobber(x)
-> (MOVQstorezero [min(i,j)] {s} ptr0 mem)
// FP simplification // FP simplification
(FNEGS (FMULS x y)) -> (FNMULS x y) (FNEGS (FMULS x y)) -> (FNMULS x y)
(FNEGD (FMULD x y)) -> (FNMULD x y) (FNEGD (FMULD x y)) -> (FNMULD x y)
......
...@@ -769,6 +769,10 @@ func overlap(offset1, size1, offset2, size2 int64) bool { ...@@ -769,6 +769,10 @@ func overlap(offset1, size1, offset2, size2 int64) bool {
return false return false
} }
// areAdjacentOffsets reports whether two accesses of width size located at
// off1 and off2 sit directly next to each other, i.e. the distance between
// the two offsets is exactly size, regardless of which one comes first.
func areAdjacentOffsets(off1, off2, size int64) bool {
	// A single signed gap covers both orderings: +size when off2 follows
	// off1, -size when off1 follows off2.
	gap := off2 - off1
	return gap == size || gap == -size
}
// check if value zeroes out upper 32-bit of 64-bit register. // check if value zeroes out upper 32-bit of 64-bit register.
// depth limits recursion depth. In AMD64.rules 3 is used as limit, // depth limits recursion depth. In AMD64.rules 3 is used as limit,
// because it catches same amount of cases as 4. // because it catches same amount of cases as 4.
......
...@@ -5941,6 +5941,35 @@ func rewriteValueARM64_OpARM64MOVBstorezero_0(v *Value) bool { ...@@ -5941,6 +5941,35 @@ func rewriteValueARM64_OpARM64MOVBstorezero_0(v *Value) bool {
v.AddArg(mem) v.AddArg(mem)
return true return true
} }
// match: (MOVBstorezero [i] {s} ptr0 x:(MOVBstorezero [j] {s} ptr1 mem))
// cond: x.Uses == 1 && areAdjacentOffsets(i,j,1) && is32Bit(min(i,j)) && isSamePtr(ptr0, ptr1) && clobber(x)
// result: (MOVHstorezero [min(i,j)] {s} ptr0 mem)
for {
i := v.AuxInt
s := v.Aux
_ = v.Args[1]
ptr0 := v.Args[0]
x := v.Args[1]
if x.Op != OpARM64MOVBstorezero {
break
}
j := x.AuxInt
if x.Aux != s {
break
}
_ = x.Args[1]
ptr1 := x.Args[0]
mem := x.Args[1]
if !(x.Uses == 1 && areAdjacentOffsets(i, j, 1) && is32Bit(min(i, j)) && isSamePtr(ptr0, ptr1) && clobber(x)) {
break
}
v.reset(OpARM64MOVHstorezero)
v.AuxInt = min(i, j)
v.Aux = s
v.AddArg(ptr0)
v.AddArg(mem)
return true
}
return false return false
} }
func rewriteValueARM64_OpARM64MOVDload_0(v *Value) bool { func rewriteValueARM64_OpARM64MOVDload_0(v *Value) bool {
...@@ -6205,6 +6234,35 @@ func rewriteValueARM64_OpARM64MOVDstorezero_0(v *Value) bool { ...@@ -6205,6 +6234,35 @@ func rewriteValueARM64_OpARM64MOVDstorezero_0(v *Value) bool {
v.AddArg(mem) v.AddArg(mem)
return true return true
} }
// match: (MOVDstorezero [i] {s} ptr0 x:(MOVDstorezero [j] {s} ptr1 mem))
// cond: x.Uses == 1 && areAdjacentOffsets(i,j,8) && is32Bit(min(i,j)) && isSamePtr(ptr0, ptr1) && clobber(x)
// result: (MOVQstorezero [min(i,j)] {s} ptr0 mem)
for {
i := v.AuxInt
s := v.Aux
_ = v.Args[1]
ptr0 := v.Args[0]
x := v.Args[1]
if x.Op != OpARM64MOVDstorezero {
break
}
j := x.AuxInt
if x.Aux != s {
break
}
_ = x.Args[1]
ptr1 := x.Args[0]
mem := x.Args[1]
if !(x.Uses == 1 && areAdjacentOffsets(i, j, 8) && is32Bit(min(i, j)) && isSamePtr(ptr0, ptr1) && clobber(x)) {
break
}
v.reset(OpARM64MOVQstorezero)
v.AuxInt = min(i, j)
v.Aux = s
v.AddArg(ptr0)
v.AddArg(mem)
return true
}
return false return false
} }
func rewriteValueARM64_OpARM64MOVHUload_0(v *Value) bool { func rewriteValueARM64_OpARM64MOVHUload_0(v *Value) bool {
...@@ -6747,6 +6805,35 @@ func rewriteValueARM64_OpARM64MOVHstorezero_0(v *Value) bool { ...@@ -6747,6 +6805,35 @@ func rewriteValueARM64_OpARM64MOVHstorezero_0(v *Value) bool {
v.AddArg(mem) v.AddArg(mem)
return true return true
} }
// match: (MOVHstorezero [i] {s} ptr0 x:(MOVHstorezero [j] {s} ptr1 mem))
// cond: x.Uses == 1 && areAdjacentOffsets(i,j,2) && is32Bit(min(i,j)) && isSamePtr(ptr0, ptr1) && clobber(x)
// result: (MOVWstorezero [min(i,j)] {s} ptr0 mem)
for {
i := v.AuxInt
s := v.Aux
_ = v.Args[1]
ptr0 := v.Args[0]
x := v.Args[1]
if x.Op != OpARM64MOVHstorezero {
break
}
j := x.AuxInt
if x.Aux != s {
break
}
_ = x.Args[1]
ptr1 := x.Args[0]
mem := x.Args[1]
if !(x.Uses == 1 && areAdjacentOffsets(i, j, 2) && is32Bit(min(i, j)) && isSamePtr(ptr0, ptr1) && clobber(x)) {
break
}
v.reset(OpARM64MOVWstorezero)
v.AuxInt = min(i, j)
v.Aux = s
v.AddArg(ptr0)
v.AddArg(mem)
return true
}
return false return false
} }
func rewriteValueARM64_OpARM64MOVQstorezero_0(v *Value) bool { func rewriteValueARM64_OpARM64MOVQstorezero_0(v *Value) bool {
...@@ -7379,6 +7466,35 @@ func rewriteValueARM64_OpARM64MOVWstorezero_0(v *Value) bool { ...@@ -7379,6 +7466,35 @@ func rewriteValueARM64_OpARM64MOVWstorezero_0(v *Value) bool {
v.AddArg(mem) v.AddArg(mem)
return true return true
} }
// match: (MOVWstorezero [i] {s} ptr0 x:(MOVWstorezero [j] {s} ptr1 mem))
// cond: x.Uses == 1 && areAdjacentOffsets(i,j,4) && is32Bit(min(i,j)) && isSamePtr(ptr0, ptr1) && clobber(x)
// result: (MOVDstorezero [min(i,j)] {s} ptr0 mem)
for {
i := v.AuxInt
s := v.Aux
_ = v.Args[1]
ptr0 := v.Args[0]
x := v.Args[1]
if x.Op != OpARM64MOVWstorezero {
break
}
j := x.AuxInt
if x.Aux != s {
break
}
_ = x.Args[1]
ptr1 := x.Args[0]
mem := x.Args[1]
if !(x.Uses == 1 && areAdjacentOffsets(i, j, 4) && is32Bit(min(i, j)) && isSamePtr(ptr0, ptr1) && clobber(x)) {
break
}
v.reset(OpARM64MOVDstorezero)
v.AuxInt = min(i, j)
v.Aux = s
v.AddArg(ptr0)
v.AddArg(mem)
return true
}
return false return false
} }
func rewriteValueARM64_OpARM64MUL_0(v *Value) bool { func rewriteValueARM64_OpARM64MUL_0(v *Value) bool {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment