Commit 03e216f3 authored by Josh Bleecher Snyder's avatar Josh Bleecher Snyder

cmd/compile: re-enable in-place append optimization

CL 21891 was too clever in its attempts to avoid spills.
Storing newlen too early caused uses of append in the runtime
itself to receive an inconsistent view of a slice,
leading to corruption.

This CL makes the generate code much more similar to
the old backend. It spills more than before,
but those spills have been contained to the grow path.
It recalculates newlen unnecessarily on the fast path,
but that's measurably cheaper than spilling it.

CL 21891 caused runtime failures in 6 of 2000 runs
of net/http and crypto/x509 in my test setup.
This CL has gone 6000 runs without a failure.


Benchmarks going from master to this CL:

name                         old time/op  new time/op  delta
AppendInPlace/NoGrow/Byte-8   439ns ± 2%   436ns ± 2%  -0.72%  (p=0.001 n=28+27)
AppendInPlace/NoGrow/1Ptr-8   901ns ± 0%   856ns ± 0%  -4.95%  (p=0.000 n=26+29)
AppendInPlace/NoGrow/2Ptr-8  2.15µs ± 1%  1.95µs ± 0%  -9.07%  (p=0.000 n=28+30)
AppendInPlace/NoGrow/3Ptr-8  2.66µs ± 0%  2.45µs ± 0%  -7.93%  (p=0.000 n=29+26)
AppendInPlace/NoGrow/4Ptr-8  3.24µs ± 1%  3.02µs ± 1%  -6.75%  (p=0.000 n=28+30)
AppendInPlace/Grow/Byte-8     269ns ± 1%   271ns ± 1%  +0.84%  (p=0.000 n=30+29)
AppendInPlace/Grow/1Ptr-8     275ns ± 1%   280ns ± 1%  +1.75%  (p=0.000 n=30+30)
AppendInPlace/Grow/2Ptr-8     384ns ± 0%   391ns ± 0%  +1.94%  (p=0.000 n=27+30)
AppendInPlace/Grow/3Ptr-8     455ns ± 0%   462ns ± 0%  +1.43%  (p=0.000 n=29+29)
AppendInPlace/Grow/4Ptr-8     478ns ± 0%   479ns ± 0%  +0.23%  (p=0.000 n=30+27)


However, for the large no-grow cases, there is still more work to be done.
Going from this CL to the non-SSA backend:

name                         old time/op  new time/op  delta
AppendInPlace/NoGrow/Byte-8   436ns ± 2%   436ns ± 2%     ~     (p=0.967 n=27+29)
AppendInPlace/NoGrow/1Ptr-8   856ns ± 0%   884ns ± 0%   +3.28%  (p=0.000 n=29+26)
AppendInPlace/NoGrow/2Ptr-8  1.95µs ± 0%  1.56µs ± 0%  -20.28%  (p=0.000 n=30+29)
AppendInPlace/NoGrow/3Ptr-8  2.45µs ± 0%  1.89µs ± 0%  -22.88%  (p=0.000 n=26+28)
AppendInPlace/NoGrow/4Ptr-8  3.02µs ± 1%  2.56µs ± 1%  -15.35%  (p=0.000 n=30+28)
AppendInPlace/Grow/Byte-8     271ns ± 1%   283ns ± 1%   +4.56%  (p=0.000 n=29+29)
AppendInPlace/Grow/1Ptr-8     280ns ± 1%   288ns ± 1%   +2.99%  (p=0.000 n=30+30)
AppendInPlace/Grow/2Ptr-8     391ns ± 0%   409ns ± 0%   +4.66%  (p=0.000 n=30+29)
AppendInPlace/Grow/3Ptr-8     462ns ± 0%   481ns ± 0%   +4.13%  (p=0.000 n=29+30)
AppendInPlace/Grow/4Ptr-8     479ns ± 0%   502ns ± 0%   +4.81%  (p=0.000 n=27+26)


New generated code:

var x []byte

func a() {
	x = append(x, 1)
}


"".a t=1 size=208 args=0x0 locals=0x48
	0x0000 00000 (a.go:5)	TEXT	"".a(SB), $72-0
	0x0000 00000 (a.go:5)	MOVQ	(TLS), CX
	0x0009 00009 (a.go:5)	CMPQ	SP, 16(CX)
	0x000d 00013 (a.go:5)	JLS	190
	0x0013 00019 (a.go:5)	SUBQ	$72, SP
	0x0017 00023 (a.go:5)	FUNCDATA	$0, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
	0x0017 00023 (a.go:5)	FUNCDATA	$1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
	0x0017 00023 (a.go:6)	MOVQ	"".x+16(SB), CX
	0x001e 00030 (a.go:6)	MOVQ	"".x+8(SB), DX
	0x0025 00037 (a.go:6)	MOVQ	"".x(SB), BX
	0x002c 00044 (a.go:6)	LEAQ	1(DX), BP
	0x0030 00048 (a.go:6)	CMPQ	BP, CX
	0x0033 00051 (a.go:6)	JGT	$0, 73
	0x0035 00053 (a.go:6)	LEAQ	1(DX), AX
	0x0039 00057 (a.go:6)	MOVQ	AX, "".x+8(SB)
	0x0040 00064 (a.go:6)	MOVB	$1, (BX)(DX*1)
	0x0044 00068 (a.go:7)	ADDQ	$72, SP
	0x0048 00072 (a.go:7)	RET
	0x0049 00073 (a.go:6)	LEAQ	type.[]uint8(SB), AX
	0x0050 00080 (a.go:6)	MOVQ	AX, (SP)
	0x0054 00084 (a.go:6)	MOVQ	BX, 8(SP)
	0x0059 00089 (a.go:6)	MOVQ	DX, 16(SP)
	0x005e 00094 (a.go:6)	MOVQ	CX, 24(SP)
	0x0063 00099 (a.go:6)	MOVQ	BP, 32(SP)
	0x0068 00104 (a.go:6)	PCDATA	$0, $0
	0x0068 00104 (a.go:6)	CALL	runtime.growslice(SB)
	0x006d 00109 (a.go:6)	MOVQ	40(SP), CX
	0x0072 00114 (a.go:6)	MOVQ	48(SP), DX
	0x0077 00119 (a.go:6)	MOVQ	DX, "".autotmp_0+64(SP)
	0x007c 00124 (a.go:6)	MOVQ	56(SP), BX
	0x0081 00129 (a.go:6)	MOVQ	BX, "".x+16(SB)
	0x0088 00136 (a.go:6)	MOVL	runtime.writeBarrier(SB), AX
	0x008e 00142 (a.go:6)	TESTB	AL, AL
	0x0090 00144 (a.go:6)	JNE	$0, 162
	0x0092 00146 (a.go:6)	MOVQ	CX, "".x(SB)
	0x0099 00153 (a.go:6)	MOVQ	"".x(SB), BX
	0x00a0 00160 (a.go:6)	JMP	53
	0x00a2 00162 (a.go:6)	LEAQ	"".x(SB), BX
	0x00a9 00169 (a.go:6)	MOVQ	BX, (SP)
	0x00ad 00173 (a.go:6)	MOVQ	CX, 8(SP)
	0x00b2 00178 (a.go:6)	PCDATA	$0, $0
	0x00b2 00178 (a.go:6)	CALL	runtime.writebarrierptr(SB)
	0x00b7 00183 (a.go:6)	MOVQ	"".autotmp_0+64(SP), DX
	0x00bc 00188 (a.go:6)	JMP	153
	0x00be 00190 (a.go:6)	NOP
	0x00be 00190 (a.go:5)	CALL	runtime.morestack_noctxt(SB)
	0x00c3 00195 (a.go:5)	JMP	0


Fixes #14969 again

Change-Id: Ia50463b1f506011aad0718a4fef1d4738e43c32d
Reviewed-on: https://go-review.googlesource.com/22197
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: default avatarKeith Randall <khr@golang.org>
parent b5ddbb90
...@@ -338,6 +338,7 @@ var ( ...@@ -338,6 +338,7 @@ var (
// dummy nodes for temporary variables // dummy nodes for temporary variables
ptrVar = Node{Op: ONAME, Class: Pxxx, Sym: &Sym{Name: "ptr"}} ptrVar = Node{Op: ONAME, Class: Pxxx, Sym: &Sym{Name: "ptr"}}
lenVar = Node{Op: ONAME, Class: Pxxx, Sym: &Sym{Name: "len"}}
newlenVar = Node{Op: ONAME, Class: Pxxx, Sym: &Sym{Name: "newlen"}} newlenVar = Node{Op: ONAME, Class: Pxxx, Sym: &Sym{Name: "newlen"}}
capVar = Node{Op: ONAME, Class: Pxxx, Sym: &Sym{Name: "cap"}} capVar = Node{Op: ONAME, Class: Pxxx, Sym: &Sym{Name: "cap"}}
typVar = Node{Op: ONAME, Class: Pxxx, Sym: &Sym{Name: "typ"}} typVar = Node{Op: ONAME, Class: Pxxx, Sym: &Sym{Name: "typ"}}
...@@ -699,8 +700,7 @@ func (s *state) stmt(n *Node) { ...@@ -699,8 +700,7 @@ func (s *state) stmt(n *Node) {
// If the slice can be SSA'd, it'll be on the stack, // If the slice can be SSA'd, it'll be on the stack,
// so there will be no write barriers, // so there will be no write barriers,
// so there's no need to attempt to prevent them. // so there's no need to attempt to prevent them.
const doInPlaceAppend = false // issue 15246 if samesafeexpr(n.Left, rhs.List.First()) && !s.canSSA(n.Left) {
if doInPlaceAppend && samesafeexpr(n.Left, rhs.List.First()) && !s.canSSA(n.Left) {
s.append(rhs, true) s.append(rhs, true)
return return
} }
...@@ -2128,12 +2128,14 @@ func (s *state) append(n *Node, inplace bool) *ssa.Value { ...@@ -2128,12 +2128,14 @@ func (s *state) append(n *Node, inplace bool) *ssa.Value {
// a := &s // a := &s
// ptr, len, cap := s // ptr, len, cap := s
// newlen := len + 3 // newlen := len + 3
// *a.len = newlen // store newlen immediately to avoid a spill
// if newlen > cap { // if newlen > cap {
// newptr, _, newcap = growslice(ptr, len, cap, newlen) // newptr, len, newcap = growslice(ptr, len, cap, newlen)
// vardef(a) // if necessary, advise liveness we are writing a new a
// *a.cap = newcap // write before ptr to avoid a spill // *a.cap = newcap // write before ptr to avoid a spill
// *a.ptr = newptr // with write barrier // *a.ptr = newptr // with write barrier
// } // }
// newlen = len + 3 // recalculate to avoid a spill
// *a.len = newlen
// // with write barriers, if needed: // // with write barriers, if needed:
// *(ptr+len) = e1 // *(ptr+len) = e1
// *(ptr+len+1) = e2 // *(ptr+len+1) = e2
...@@ -2164,17 +2166,14 @@ func (s *state) append(n *Node, inplace bool) *ssa.Value { ...@@ -2164,17 +2166,14 @@ func (s *state) append(n *Node, inplace bool) *ssa.Value {
c := s.newValue1(ssa.OpSliceCap, Types[TINT], slice) c := s.newValue1(ssa.OpSliceCap, Types[TINT], slice)
nl := s.newValue2(s.ssaOp(OADD, Types[TINT]), Types[TINT], l, s.constInt(Types[TINT], nargs)) nl := s.newValue2(s.ssaOp(OADD, Types[TINT]), Types[TINT], l, s.constInt(Types[TINT], nargs))
if inplace {
lenaddr := s.newValue1I(ssa.OpOffPtr, pt, int64(Array_nel), addr)
s.vars[&memVar] = s.newValue3I(ssa.OpStore, ssa.TypeMem, s.config.IntSize, lenaddr, nl, s.mem())
}
cmp := s.newValue2(s.ssaOp(OGT, Types[TINT]), Types[TBOOL], nl, c) cmp := s.newValue2(s.ssaOp(OGT, Types[TINT]), Types[TBOOL], nl, c)
s.vars[&ptrVar] = p s.vars[&ptrVar] = p
if !inplace { if !inplace {
s.vars[&newlenVar] = nl s.vars[&newlenVar] = nl
s.vars[&capVar] = c s.vars[&capVar] = c
} else {
s.vars[&lenVar] = l
} }
b := s.endBlock() b := s.endBlock()
...@@ -2191,11 +2190,16 @@ func (s *state) append(n *Node, inplace bool) *ssa.Value { ...@@ -2191,11 +2190,16 @@ func (s *state) append(n *Node, inplace bool) *ssa.Value {
r := s.rtcall(growslice, true, []*Type{pt, Types[TINT], Types[TINT]}, taddr, p, l, c, nl) r := s.rtcall(growslice, true, []*Type{pt, Types[TINT], Types[TINT]}, taddr, p, l, c, nl)
if inplace { if inplace {
if sn.Op == ONAME {
// Tell liveness we're about to build a new slice
s.vars[&memVar] = s.newValue1A(ssa.OpVarDef, ssa.TypeMem, sn, s.mem())
}
capaddr := s.newValue1I(ssa.OpOffPtr, pt, int64(Array_cap), addr) capaddr := s.newValue1I(ssa.OpOffPtr, pt, int64(Array_cap), addr)
s.vars[&memVar] = s.newValue3I(ssa.OpStore, ssa.TypeMem, s.config.IntSize, capaddr, r[2], s.mem()) s.vars[&memVar] = s.newValue3I(ssa.OpStore, ssa.TypeMem, s.config.IntSize, capaddr, r[2], s.mem())
s.insertWBstore(pt, addr, r[0], n.Lineno, 0) s.insertWBstore(pt, addr, r[0], n.Lineno, 0)
// load the value we just stored to avoid having to spill it // load the value we just stored to avoid having to spill it
s.vars[&ptrVar] = s.newValue2(ssa.OpLoad, pt, addr, s.mem()) s.vars[&ptrVar] = s.newValue2(ssa.OpLoad, pt, addr, s.mem())
s.vars[&lenVar] = r[1] // avoid a spill in the fast path
} else { } else {
s.vars[&ptrVar] = r[0] s.vars[&ptrVar] = r[0]
s.vars[&newlenVar] = s.newValue2(s.ssaOp(OADD, Types[TINT]), Types[TINT], r[1], s.constInt(Types[TINT], nargs)) s.vars[&newlenVar] = s.newValue2(s.ssaOp(OADD, Types[TINT]), Types[TINT], r[1], s.constInt(Types[TINT], nargs))
...@@ -2208,6 +2212,13 @@ func (s *state) append(n *Node, inplace bool) *ssa.Value { ...@@ -2208,6 +2212,13 @@ func (s *state) append(n *Node, inplace bool) *ssa.Value {
// assign new elements to slots // assign new elements to slots
s.startBlock(assign) s.startBlock(assign)
if inplace {
l = s.variable(&lenVar, Types[TINT]) // generates phi for len
nl = s.newValue2(s.ssaOp(OADD, Types[TINT]), Types[TINT], l, s.constInt(Types[TINT], nargs))
lenaddr := s.newValue1I(ssa.OpOffPtr, pt, int64(Array_nel), addr)
s.vars[&memVar] = s.newValue3I(ssa.OpStore, ssa.TypeMem, s.config.IntSize, lenaddr, nl, s.mem())
}
// Evaluate args // Evaluate args
args := make([]*ssa.Value, 0, nargs) args := make([]*ssa.Value, 0, nargs)
store := make([]bool, 0, nargs) store := make([]bool, 0, nargs)
...@@ -2248,6 +2259,7 @@ func (s *state) append(n *Node, inplace bool) *ssa.Value { ...@@ -2248,6 +2259,7 @@ func (s *state) append(n *Node, inplace bool) *ssa.Value {
delete(s.vars, &ptrVar) delete(s.vars, &ptrVar)
if inplace { if inplace {
delete(s.vars, &lenVar)
return nil return nil
} }
delete(s.vars, &newlenVar) delete(s.vars, &newlenVar)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment