Commit 743117a8 authored by Martin Möhrmann's avatar Martin Möhrmann Committed by David Chase

cmd/compile: simplify slice/array range loops for some element sizes

In range loops over slices and arrays besides a variable to track the
index an extra variable containing the address of the current element
is used. To compute a pointer to the next element the elements size is
added to the address.

On 386 and amd64 an element of size 1, 2, 4 or 8 bytes can by copied
from an array using a MOV instruction with suitable addressing mode
that uses the start address of the array, the index of the element and
element size as scaling factor. Thereby, for arrays and slices with
suitable element size we can avoid keeping and incrementing an extra
variable to compute the next elements address.

Shrinks cmd/go by 4 kilobytes.

AMD64:
name                   old time/op    new time/op    delta
BinaryTree17              2.66s ± 7%     2.54s ± 0%  -4.53%  (p=0.000 n=10+8)
Fannkuch11                3.02s ± 1%     3.02s ± 1%    ~     (p=0.579 n=10+10)
FmtFprintfEmpty          45.6ns ± 1%    42.2ns ± 1%  -7.46%  (p=0.000 n=10+10)
FmtFprintfString         69.8ns ± 1%    70.4ns ± 1%  +0.84%  (p=0.041 n=10+10)
FmtFprintfInt            80.1ns ± 1%    79.0ns ± 1%  -1.35%  (p=0.000 n=10+10)
FmtFprintfIntInt          127ns ± 1%     125ns ± 1%  -1.00%  (p=0.007 n=10+9)
FmtFprintfPrefixedInt     158ns ± 2%     152ns ± 1%  -4.11%  (p=0.000 n=10+10)
FmtFprintfFloat           218ns ± 1%     214ns ± 1%  -1.61%  (p=0.000 n=10+10)
FmtManyArgs               508ns ± 1%     504ns ± 1%  -0.93%  (p=0.001 n=9+10)
GobDecode                6.76ms ± 1%    6.78ms ± 1%    ~     (p=0.353 n=10+10)
GobEncode                5.84ms ± 1%    5.77ms ± 1%  -1.31%  (p=0.000 n=10+9)
Gzip                      223ms ± 1%     218ms ± 1%  -2.39%  (p=0.000 n=10+10)
Gunzip                   40.3ms ± 1%    40.4ms ± 3%    ~     (p=0.796 n=10+10)
HTTPClientServer         73.5µs ± 0%    73.3µs ± 0%  -0.28%  (p=0.000 n=10+9)
JSONEncode               12.7ms ± 1%    12.6ms ± 8%    ~     (p=0.173 n=8+10)
JSONDecode               57.5ms ± 1%    56.1ms ± 2%  -2.40%  (p=0.000 n=10+10)
Mandelbrot200            3.80ms ± 1%    3.86ms ± 6%    ~     (p=0.579 n=10+10)
GoParse                  3.25ms ± 1%    3.23ms ± 1%    ~     (p=0.052 n=10+10)
RegexpMatchEasy0_32      74.4ns ± 1%    76.9ns ± 1%  +3.39%  (p=0.000 n=10+10)
RegexpMatchEasy0_1K       243ns ± 2%     248ns ± 1%  +1.86%  (p=0.000 n=10+8)
RegexpMatchEasy1_32      71.0ns ± 2%    72.8ns ± 1%  +2.55%  (p=0.000 n=10+10)
RegexpMatchEasy1_1K       370ns ± 1%     383ns ± 0%  +3.39%  (p=0.000 n=10+9)
RegexpMatchMedium_32      107ns ± 0%     113ns ± 1%  +5.33%  (p=0.000 n=6+10)
RegexpMatchMedium_1K     35.0µs ± 1%    36.0µs ± 1%  +3.13%  (p=0.000 n=10+10)
RegexpMatchHard_32       1.65µs ± 1%    1.69µs ± 1%  +2.23%  (p=0.000 n=10+9)
RegexpMatchHard_1K       49.8µs ± 1%    50.6µs ± 1%  +1.59%  (p=0.000 n=10+10)
Revcomp                   398ms ± 1%     396ms ± 1%  -0.51%  (p=0.043 n=10+10)
Template                 63.4ms ± 1%    60.8ms ± 0%  -4.11%  (p=0.000 n=10+9)
TimeParse                 318ns ± 1%     322ns ± 1%  +1.10%  (p=0.005 n=10+10)
TimeFormat                323ns ± 1%     336ns ± 1%  +4.15%  (p=0.000 n=10+10)

Updates: #15809.

Change-Id: I55915aaf6d26768e12247f8a8edf14e7630726d1
Reviewed-on: https://go-review.googlesource.com/38061
Run-TryBot: Martin Möhrmann <moehrmann@google.com>
Reviewed-by: default avatarKeith Randall <khr@golang.org>
parent af40cbe8
......@@ -7,6 +7,7 @@ package gc
import (
"cmd/compile/internal/types"
"cmd/internal/objabi"
"cmd/internal/sys"
"unicode/utf8"
)
......@@ -137,6 +138,22 @@ out:
decldepth--
}
func cheapComputableIndex(width int64) bool {
switch thearch.LinkArch.Family {
// MIPS does not have R+R addressing
// Arm64 may lack ability to generate this code in our assembler,
// but the architecture supports it.
case sys.PPC64, sys.S390X:
return width == 1
case sys.AMD64, sys.I386, sys.ARM64, sys.ARM:
switch width {
case 1, 2, 4, 8:
return true
}
}
return false
}
// walkrange transforms various forms of ORANGE into
// simpler forms. The result must be assigned back to n.
// Node n may also be modified in place, and may also be
......@@ -202,57 +219,76 @@ func walkrange(n *Node) *Node {
hv1 := temp(types.Types[TINT])
hn := temp(types.Types[TINT])
var hp *Node
init = append(init, nod(OAS, hv1, nil))
init = append(init, nod(OAS, hn, nod(OLEN, ha, nil)))
if v2 != nil {
hp = temp(types.NewPtr(n.Type.Elem()))
tmp := nod(OINDEX, ha, nodintconst(0))
tmp.SetBounded(true)
init = append(init, nod(OAS, hp, nod(OADDR, tmp, nil)))
}
n.Left = nod(OLT, hv1, hn)
n.Right = nod(OAS, hv1, nod(OADD, hv1, nodintconst(1)))
// for range ha { body }
if v1 == nil {
body = nil
} else if v2 == nil {
break
}
// for v1 := range ha { body }
if v2 == nil {
body = []*Node{nod(OAS, v1, hv1)}
} else { // for i,a := range thing { body }
if objabi.Preemptibleloops_enabled != 0 {
// Doing this transformation makes a bounds check removal less trivial; see #20711
// TODO enhance the preemption check insertion so that this transformation is not necessary.
ifGuard = nod(OIF, nil, nil)
ifGuard.Left = nod(OLT, hv1, hn)
translatedLoopOp = OFORUNTIL
}
break
}
// for v1, v2 := range ha { body }
if cheapComputableIndex(n.Type.Elem().Width) {
// v1, v2 = hv1, ha[hv1]
tmp := nod(OINDEX, ha, hv1)
tmp.SetBounded(true)
// Use OAS2 to correctly handle assignments
// of the form "v1, a[v1] := range".
a := nod(OAS2, nil, nil)
a.List.Set2(v1, v2)
a.Rlist.Set2(hv1, nod(OIND, hp, nil))
a.Rlist.Set2(hv1, tmp)
body = []*Node{a}
break
}
// Advance pointer as part of increment.
// We used to advance the pointer before executing the loop body,
// but doing so would make the pointer point past the end of the
// array during the final iteration, possibly causing another unrelated
// piece of memory not to be garbage collected until the loop finished.
// Advancing during the increment ensures that the pointer p only points
// pass the end of the array during the final "p++; i++; if(i >= len(x)) break;",
// after which p is dead, so it cannot confuse the collector.
tmp := nod(OADD, hp, nodintconst(t.Elem().Width))
tmp.Type = hp.Type
tmp.SetTypecheck(1)
tmp.Right.Type = types.Types[types.Tptr]
tmp.Right.SetTypecheck(1)
a = nod(OAS, hp, tmp)
a = typecheck(a, Etop)
n.Right.Ninit.Set1(a)
if objabi.Preemptibleloops_enabled != 0 {
// Doing this transformation makes a bounds check removal less trivial; see #20711
// TODO enhance the preemption check insertion so that this transformation is not necessary.
ifGuard = nod(OIF, nil, nil)
ifGuard.Left = nod(OLT, hv1, hn)
translatedLoopOp = OFORUNTIL
}
hp := temp(types.NewPtr(n.Type.Elem()))
tmp := nod(OINDEX, ha, nodintconst(0))
tmp.SetBounded(true)
init = append(init, nod(OAS, hp, nod(OADDR, tmp, nil)))
// Use OAS2 to correctly handle assignments
// of the form "v1, a[v1] := range".
a := nod(OAS2, nil, nil)
a.List.Set2(v1, v2)
a.Rlist.Set2(hv1, nod(OIND, hp, nil))
body = append(body, a)
// Advance pointer as part of increment.
// We used to advance the pointer before executing the loop body,
// but doing so would make the pointer point past the end of the
// array during the final iteration, possibly causing another unrelated
// piece of memory not to be garbage collected until the loop finished.
// Advancing during the increment ensures that the pointer p only points
// pass the end of the array during the final "p++; i++; if(i >= len(x)) break;",
// after which p is dead, so it cannot confuse the collector.
tmp = nod(OADD, hp, nodintconst(t.Elem().Width))
tmp.Type = hp.Type
tmp.SetTypecheck(1)
tmp.Right.Type = types.Types[types.Tptr]
tmp.Right.SetTypecheck(1)
a = nod(OAS, hp, tmp)
a = typecheck(a, Etop)
n.Right.Ninit.Set1(a)
case TMAP:
// orderstmt allocated the iterator for us.
// we only use a once, so no copy needed.
......
......@@ -463,22 +463,30 @@ func f29(b bool) {
}
// copy of array of pointers should die at end of range loop
var pstructarr [10]pstruct
var ptrarr [10]*int
// Struct size choosen to make pointer to element in pstructarr
// not computable by strength reduction.
type pstruct struct {
intp *int
_ [8]byte
}
func f30(b bool) {
// two live temps during print(p):
// the copy of ptrarr and the internal iterator pointer.
// two live temps during printintpointer(p):
// in the copy of p.intp and
// the internal iterator pointer if a pointer to pstruct in pstructarr
// can not be easily computed by strength reduction.
if b {
for _, p := range ptrarr {
printintpointer(p) // ERROR "live at call to printintpointer: .autotmp_[0-9]+ .autotmp_[0-9]+$"
for _, p := range pstructarr {
printintpointer(p.intp) // ERROR "live at call to printintpointer: .autotmp_[0-9]+ .autotmp_[0-9]+$"
}
}
for _, p := range ptrarr {
printintpointer(p) // ERROR "live at call to printintpointer: .autotmp_[0-9]+ .autotmp_[0-9]+$"
for _, p := range pstructarr {
printintpointer(p.intp) // ERROR "live at call to printintpointer: .autotmp_[0-9]+ .autotmp_[0-9]+$"
}
for _, p := range ptrarr {
printintpointer(p) // ERROR "live at call to printintpointer: .autotmp_[0-9]+ .autotmp_[0-9]+$"
for _, p := range pstructarr {
printintpointer(p.intp) // ERROR "live at call to printintpointer: .autotmp_[0-9]+ .autotmp_[0-9]+$"
}
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment