Commit 09425840 authored by Balaram Makam's avatar Balaram Makam Committed by Cherry Zhang

cmd/compile: improve fractional word zeroing

This change improves fractional word zeroing by
using overlapping MOVDs for the fractions.

Performance of go1 benchmarks on Amberwing was all noise:
name                   old time/op    new time/op    delta
RegexpMatchEasy0_32       247ns ± 0%     246ns ± 0%  -0.40%  (p=0.008 n=5+5)
RegexpMatchEasy0_1K       581ns ± 0%     579ns ± 0%  -0.34%  (p=0.000 n=5+4)
RegexpMatchEasy1_32       244ns ± 0%     242ns ± 0%    ~     (p=0.079 n=4+5)
RegexpMatchEasy1_1K       804ns ± 0%     805ns ± 0%    ~     (p=0.238 n=5+4)
RegexpMatchMedium_32      313ns ± 0%     311ns ± 0%  -0.64%  (p=0.008 n=5+5)
RegexpMatchMedium_1K     52.2µs ± 0%    51.9µs ± 0%  -0.52%  (p=0.016 n=5+4)
RegexpMatchHard_32       2.75µs ± 0%    2.74µs ± 0%    ~     (p=0.603 n=5+5)
RegexpMatchHard_1K       78.8µs ± 0%    78.9µs ± 0%  +0.05%  (p=0.008 n=5+5)
FmtFprintfEmpty          58.6ns ± 0%    58.6ns ± 0%    ~     (p=0.159 n=5+5)
FmtFprintfString          118ns ± 0%     119ns ± 0%  +0.85%  (p=0.008 n=5+5)
FmtFprintfInt             119ns ± 0%     123ns ± 0%  +3.36%  (p=0.016 n=5+4)
FmtFprintfIntInt          192ns ± 0%     200ns ± 0%  +4.17%  (p=0.008 n=5+5)
FmtFprintfPrefixedInt     224ns ± 0%     209ns ± 0%  -6.70%  (p=0.008 n=5+5)
FmtFprintfFloat           335ns ± 0%     335ns ± 0%    ~     (all equal)
FmtManyArgs               775ns ± 0%     811ns ± 1%  +4.67%  (p=0.016 n=4+5)
Gzip                      437ms ± 0%     438ms ± 0%  +0.19%  (p=0.008 n=5+5)
HTTPClientServer         88.7µs ± 1%    90.3µs ± 1%  +1.75%  (p=0.016 n=5+5)
JSONEncode               20.1ms ± 1%    20.1ms ± 0%    ~     (p=1.000 n=5+5)
JSONDecode               94.7ms ± 1%    94.8ms ± 1%    ~     (p=0.548 n=5+5)
GobDecode                12.8ms ± 1%    12.8ms ± 1%    ~     (p=0.548 n=5+5)
GobEncode                12.1ms ± 0%    12.1ms ± 0%    ~     (p=0.151 n=5+5)
Mandelbrot200            5.37ms ± 0%    5.37ms ± 0%  -0.03%  (p=0.008 n=5+5)
TimeParse                 450ns ± 0%     451ns ± 1%    ~     (p=0.635 n=4+5)
TimeFormat                485ns ± 0%     484ns ± 0%    ~     (p=0.508 n=5+5)
Template                 90.4ms ± 0%    90.2ms ± 0%  -0.24%  (p=0.016 n=5+5)
GoParse                  5.98ms ± 0%    5.98ms ± 0%    ~     (p=1.000 n=5+5)
BinaryTree17              11.8s ± 0%     11.8s ± 0%    ~     (p=0.841 n=5+5)
Revcomp                   669ms ± 0%     669ms ± 0%    ~     (p=0.310 n=5+5)
Fannkuch11                3.28s ± 0%     3.34s ± 0%  +1.64%  (p=0.008 n=5+5)

name                   old speed      new speed      delta
RegexpMatchEasy0_32     129MB/s ± 0%   130MB/s ± 0%  +0.30%  (p=0.016 n=4+5)
RegexpMatchEasy0_1K    1.76GB/s ± 0%  1.77GB/s ± 0%  +0.27%  (p=0.016 n=5+4)
RegexpMatchEasy1_32     131MB/s ± 0%   132MB/s ± 0%  +0.71%  (p=0.016 n=4+5)
RegexpMatchEasy1_1K    1.27GB/s ± 0%  1.27GB/s ± 0%  -0.17%  (p=0.016 n=5+4)
RegexpMatchMedium_32   3.19MB/s ± 0%  3.21MB/s ± 0%  +0.63%  (p=0.008 n=5+5)
RegexpMatchMedium_1K   19.6MB/s ± 0%  19.7MB/s ± 0%  +0.52%  (p=0.016 n=5+4)
RegexpMatchHard_32     11.7MB/s ± 0%  11.7MB/s ± 0%    ~     (p=0.643 n=5+5)
RegexpMatchHard_1K     13.0MB/s ± 0%  13.0MB/s ± 0%    ~     (p=0.079 n=4+5)
Gzip                   44.4MB/s ± 0%  44.3MB/s ± 0%  -0.19%  (p=0.008 n=5+5)
JSONEncode             96.3MB/s ± 1%  96.4MB/s ± 0%    ~     (p=1.000 n=5+5)
JSONDecode             20.5MB/s ± 1%  20.5MB/s ± 1%    ~     (p=0.460 n=5+5)
GobDecode              60.1MB/s ± 1%  59.9MB/s ± 1%    ~     (p=0.548 n=5+5)
GobEncode              63.5MB/s ± 0%  63.7MB/s ± 0%    ~     (p=0.135 n=5+5)
Template               21.5MB/s ± 0%  21.5MB/s ± 0%  +0.24%  (p=0.016 n=5+5)
GoParse                9.68MB/s ± 0%  9.69MB/s ± 0%    ~     (p=0.786 n=5+5)
Revcomp                 380MB/s ± 0%   380MB/s ± 0%    ~     (p=0.310 n=5+5)
Change-Id: I596eee6421cdbad1a0189cdb9fe0628bba534eaf
Reviewed-on: https://go-review.googlesource.com/96775Reviewed-by: default avatarCherry Zhang <cherryyz@google.com>
Run-TryBot: Cherry Zhang <cherryyz@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
parent 413d8a83
......@@ -3245,6 +3245,24 @@ var linuxARM64Tests = []*asmTest{
pos: []string{"STP"},
neg: []string{"MOVB", "MOVH"},
},
{
fn: `
func $(a *[39]byte) {
*a = [39]byte{}
}
`,
pos: []string{"MOVD"},
neg: []string{"MOVB", "MOVH", "MOVW"},
},
{
fn: `
func $(a *[30]byte) {
*a = [30]byte{}
}
`,
pos: []string{"STP"},
neg: []string{"MOVB", "MOVH", "MOVW"},
},
}
var linuxMIPSTests = []*asmTest{
......
......@@ -399,10 +399,14 @@
(STP [0] ptr (MOVDconst [0]) (MOVDconst [0]) mem))))
// strip off fractional word zeroing
(Zero [s] ptr mem) && s%16 != 0 && s > 16 ->
(Zero [s-s%16]
(OffPtr <ptr.Type> ptr [s%16])
(Zero [s%16] ptr mem))
(Zero [s] ptr mem) && s%16 != 0 && s%16 <= 8 && s > 16 ->
(Zero [8]
(OffPtr <ptr.Type> ptr [s-8])
(Zero [s-s%16] ptr mem))
(Zero [s] ptr mem) && s%16 != 0 && s%16 > 8 && s > 16 ->
(Zero [16]
(OffPtr <ptr.Type> ptr [s-16])
(Zero [s-s%16] ptr mem))
// medium zeroing uses a duff device
// 4, 16, and 64 are magic constants, see runtime/mkduff.go
......
......@@ -18551,24 +18551,48 @@ func rewriteValueARM64_OpZero_20(v *Value) bool {
config := b.Func.Config
_ = config
// match: (Zero [s] ptr mem)
// cond: s%16 != 0 && s > 16
// result: (Zero [s-s%16] (OffPtr <ptr.Type> ptr [s%16]) (Zero [s%16] ptr mem))
// cond: s%16 != 0 && s%16 <= 8 && s > 16
// result: (Zero [8] (OffPtr <ptr.Type> ptr [s-8]) (Zero [s-s%16] ptr mem))
for {
s := v.AuxInt
_ = v.Args[1]
ptr := v.Args[0]
mem := v.Args[1]
if !(s%16 != 0 && s > 16) {
if !(s%16 != 0 && s%16 <= 8 && s > 16) {
break
}
v.reset(OpZero)
v.AuxInt = s - s%16
v.AuxInt = 8
v0 := b.NewValue0(v.Pos, OpOffPtr, ptr.Type)
v0.AuxInt = s - 8
v0.AddArg(ptr)
v.AddArg(v0)
v1 := b.NewValue0(v.Pos, OpZero, types.TypeMem)
v1.AuxInt = s - s%16
v1.AddArg(ptr)
v1.AddArg(mem)
v.AddArg(v1)
return true
}
// match: (Zero [s] ptr mem)
// cond: s%16 != 0 && s%16 > 8 && s > 16
// result: (Zero [16] (OffPtr <ptr.Type> ptr [s-16]) (Zero [s-s%16] ptr mem))
for {
s := v.AuxInt
_ = v.Args[1]
ptr := v.Args[0]
mem := v.Args[1]
if !(s%16 != 0 && s%16 > 8 && s > 16) {
break
}
v.reset(OpZero)
v.AuxInt = 16
v0 := b.NewValue0(v.Pos, OpOffPtr, ptr.Type)
v0.AuxInt = s % 16
v0.AuxInt = s - 16
v0.AddArg(ptr)
v.AddArg(v0)
v1 := b.NewValue0(v.Pos, OpZero, types.TypeMem)
v1.AuxInt = s % 16
v1.AuxInt = s - s%16
v1.AddArg(ptr)
v1.AddArg(mem)
v.AddArg(v1)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment