cmd/compile: move spills to loop exits when easy.

For call-free inner loops. Revised statistics: 85 inner loop spills sunk; 341 inner loop spills remaining; 1162 inner loop spills that were candidates for sinking ended up completely register allocated; 119 inner loop spills that could have been sunk were used in "shuffling" at the bottom of the loop; 1 inner loop spill was not sunk because the register assigned changed between def and exit. Understanding how to make an inner loop definition not be a candidate for from-memory shuffling (to force the shuffle code to choose some other value) should pick up some of the 119 other spills disqualified for this reason. Modified the stats printing based on feedback from Austin. Change-Id: If3fb9b5d5a028f42ccc36c4e3d9e0da39db5ca60 Reviewed-on: https://go-review.googlesource.com/21037 Reviewed-by: Keith Randall <khr@golang.org> Run-TryBot: David Chase <drchase@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org>

cmd/compile: move spills to loop exits when easy.
For call-free inner loops. Revised statistics: 85 inner loop spills sunk; 341 inner loop spills remaining; 1162 inner loop spills that were candidates for sinking ended up completely register allocated; 119 inner loop spills that could have been sunk were used in "shuffling" at the bottom of the loop; 1 inner loop spill was not sunk because the register assigned changed between def and exit. Understanding how to make an inner loop definition not be a candidate for from-memory shuffling (to force the shuffle code to choose some other value) should pick up some of the 119 other spills disqualified for this reason. Modified the stats printing based on feedback from Austin. Change-Id: If3fb9b5d5a028f42ccc36c4e3d9e0da39db5ca60 Reviewed-on: https://go-review.googlesource.com/21037 Reviewed-by: Keith Randall <khr@golang.org> Run-TryBot: David Chase <drchase@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org>
6b85a45e · David Chase · c4807d4c · 6b85a45e · 6b85a45e · 6b85a45e
Commit 6b85a45e authored Mar 21, 2016 by David Chase
3 changed files
--- a/src/cmd/compile/internal/ssa/likelyadjust.go
+++ b/src/cmd/compile/internal/ssa/likelyadjust.go
@@ -11,11 +11,24 @@ import (
 type loop struct {
 	header *Block // The header node of this (reducible) loop
 	outer  *loop  // loop containing this loop
-	// Next two fields not currently used, but cheap to maintain,
-	// and aid in computation of inner-ness and list of blocks.
-	nBlocks      int32 // Number of blocks in this loop but not within inner loops
-	isInner      bool  // True if never discovered to contain a loop
-	containsCall bool  // if any block in this loop or any loop it contains is a BlockCall or BlockDefer
+
+	// By default, children, exits, and depth are not initialized.
+	children []*loop  // loops nested directly within this loop. Initialized by assembleChildren().
+	exits    []*Block // exits records blocks reached by exits from this loop. Initialized by findExits().
+
+	// Loops aren't that common, so rather than force regalloc to keep
+	// a map or slice for its data, just put it here.
+	spills  []*Value // values spilled within this loop; regalloc appends to it and resets it to nil once the loop's last block is processed
+	scratch int32    // regalloc's count of this loop's blocks processed so far; reset to 0 once it reaches nBlocks
+
+	// Next three fields used by regalloc and/or
+	// aid in computation of inner-ness and list of blocks.
+	nBlocks int32 // Number of blocks in this loop but not within inner loops
+	depth   int16 // Nesting depth of the loop; 1 is outermost. Initialized by calculateDepths().
+	isInner bool  // True if never discovered to contain a loop
+
+	// register allocation uses this.
+	containsCall bool // if any block in this loop or any loop it contains is a BlockCall or BlockDefer
 }

 // outerinner records that outer contains inner
@@ -48,6 +61,9 @@ type loopnest struct {
 	po    []*Block
 	sdom  sparseTree
 	loops []*loop
+
+	// Record which of the lazily initialized fields have actually been initialized.
+	initializedChildren, initializedDepth, initializedExits bool
 }

 func min8(a, b int8) int8 {
@@ -295,6 +311,35 @@ func loopnestfor(f *Func) *loopnest {
 			innermost.nBlocks++
 		}
 	}
+
+	ln := &loopnest{f: f, b2l: b2l, po: po, sdom: sdom, loops: loops}
+
+	// Curious about the loopiness? "-d=ssa/likelyadjust/stats"
+	if f.pass.stats > 0 && len(loops) > 0 {
+		ln.assembleChildren()
+		ln.calculateDepths()
+		ln.findExits()
+
+		// Note stats for non-innermost loops are slightly flawed because
+		// they don't account for inner loop exits that span multiple levels.
+
+		for _, l := range loops {
+			x := len(l.exits)
+			cf := 0
+			if !l.containsCall {
+				cf = 1
+			}
+			inner := 0
+			if l.isInner {
+				inner++
+			}
+
+			f.logStat("loopstats:",
+				l.depth, "depth", x, "exits",
+				inner, "is_inner", cf, "is_callfree", l.nBlocks, "n_blocks")
+		}
+	}
+
 	if f.pass.debug > 1 && len(loops) > 0 {
 		fmt.Printf("Loops in %s:\n", f.Name)
 		for _, l := range loops {
@@ -314,5 +359,90 @@ func loopnestfor(f *Func) *loopnest {
 		}
 		fmt.Print("\n")
 	}
-	return &loopnest{f, b2l, po, sdom, loops}
+	return ln
+}
+
+// assembleChildren initializes the children field of each
+// loop in the nest.  Loop A is a child of loop B if A is
+// directly nested within B (based on the reducible-loops
+// detection above). Calls after the first are no-ops.
+func (ln *loopnest) assembleChildren() {
+	if ln.initializedChildren { // already built; appending again would duplicate entries
+		return
+	}
+	for _, l := range ln.loops {
+		if l.outer != nil { // loops with no outer loop are nobody's child
+			l.outer.children = append(l.outer.children, l)
+		}
+	}
+	ln.initializedChildren = true
+}
+
+// calculateDepths uses the children field of loops
+// to determine the nesting depth (outer=1) of each
+// loop.  This is helpful for finding exit edges. Calls after the first are no-ops.
+func (ln *loopnest) calculateDepths() {
+	if ln.initializedDepth { // depths already assigned
+		return
+	}
+	ln.assembleChildren() // setDepth walks the children slices, so they must be populated first
+	for _, l := range ln.loops {
+		if l.outer == nil { // start only at outermost loops; setDepth recurses into nested ones
+			l.setDepth(1)
+		}
+	}
+	ln.initializedDepth = true
+}
+
+// findExits uses loop depth information to find the
+// exits from a loop. Exit blocks are appended to each loop's exits field; calls after the first are no-ops.
+func (ln *loopnest) findExits() {
+	if ln.initializedExits {
+		return
+	}
+	ln.calculateDepths() // recordIfExit compares loop depths
+	b2l := ln.b2l
+	for _, b := range ln.po {
+		l := b2l[b.ID]
+		if l != nil && len(b.Succs) == 2 { // only blocks mapped to a loop and having exactly two successors are examined
+			sl := b2l[b.Succs[0].ID]
+			if recordIfExit(l, sl, b.Succs[0]) {
+				continue // NOTE(review): Succs[1] is not checked once Succs[0] is recorded as an exit — confirm intended
+			}
+			sl = b2l[b.Succs[1].ID]
+			if recordIfExit(l, sl, b.Succs[1]) {
+				continue
+			}
+		}
+	}
+	ln.initializedExits = true
+}
+
+// recordIfExit checks sl (the loop containing b) to see if it
+// is outside of loop l, and if so, records b as an exit block
+// from l and returns true. It returns false when b is in l or in a loop nested within l.
+func recordIfExit(l, sl *loop, b *Block) bool {
+	if sl != l {
+		if sl == nil || sl.depth <= l.depth { // b is in no loop, or in a different loop no deeper than l, so it cannot be inside l
+			l.exits = append(l.exits, b)
+			return true
+		}
+		// sl is not nil, and is deeper than l
+		// it's possible for this to be a goto into an irreducible loop made from gotos.
+		for sl.depth > l.depth { // walk outward to sl's enclosing loop at l's depth
+			sl = sl.outer
+		}
+		if sl != l { // that enclosing loop is not l, so b is not nested within l
+			l.exits = append(l.exits, b)
+			return true
+		}
+	}
+	return false
+}
+
+func (l *loop) setDepth(d int16) { // setDepth records d as l's nesting depth and recursively numbers the loops nested within l
+	l.depth = d
+	for _, c := range l.children {
+		c.setDepth(d + 1) // each directly nested loop is one level deeper
+	}
 }
--- a/src/cmd/compile/internal/ssa/regalloc.go
+++ b/src/cmd/compile/internal/ssa/regalloc.go
@@ -91,6 +91,13 @@
 // will have no use (so don't run deadcode after regalloc!).
 // TODO: maybe we should introduce these extra phis?

+// Additional not-quite-SSA output occurs when spills are sunk out
+// of loops to the targets of exit edges from the loop.  Before sinking,
+// there is one spill site (one StoreReg) targeting stack slot X; after
+// sinking there may be multiple spill sites targeting stack slot X,
+// with no phi functions at any join points reachable by the multiple
+// spill sites.
+
 package ssa

 import (
@@ -100,7 +107,8 @@ import (
 )

 const (
-	logSpills = iota
+	moveSpills = iota
+	logSpills
 	regDebug
 	stackDebug
 )
@@ -176,6 +184,7 @@ type valState struct {
 	uses              *use    // list of uses in this block
 	spill             *Value  // spilled copy of the Value
 	spillUsed         bool
+	spillUsedShuffle  bool     // true if used in shuffling, after ordinary uses
 	needReg           bool     // cached value of !v.Type.IsMemory() && !v.Type.IsVoid() && !.v.Type.IsFlags()
 	rematerializeable bool     // cached value of v.rematerializeable()
 	desired           register // register we want value to be in, if any
@@ -243,6 +252,15 @@ type regAllocState struct {
 	loopnest *loopnest
 }

+type spillToSink struct {
+	spill *Value // Spill instruction to move (a StoreReg)
+	dests int32  // Bitmask indicating exit blocks from loop in which spill/val is defined. 1<<i set means val is live into loop.exits[i]
+}
+
+func (sts *spillToSink) spilledValue() *Value {
+	return sts.spill.Args[0] // the spill is a StoreReg; its first argument is the value being stored
+}
+
 type endReg struct {
 	r register
 	v *Value // pre-regalloc value held in this register (TODO: can we use ID here?)
@@ -558,6 +576,22 @@ func (s *regAllocState) compatRegs(t Type) regMask {
 	return m &^ s.reserved()
 }

+// loopForBlock returns the loop containing block b,
+// provided that the loop is "interesting" for purposes
+// of improving register allocation (= is inner, and does
+// not contain a call). Otherwise it returns nil.
+func (s *regAllocState) loopForBlock(b *Block) *loop {
+	loop := s.loopnest.b2l[b.ID] // nil when b is not in any loop
+
+	// Minor for-the-time-being optimization: nothing happens
+	// unless a loop is both inner and call-free, therefore
+	// don't bother with other loops.
+	if loop != nil && (loop.containsCall || !loop.isInner) {
+		loop = nil
+	}
+	return loop
+}
+
 func (s *regAllocState) regalloc(f *Func) {
 	liveSet := f.newSparseSet(f.NumValues())
 	defer f.retSparseSet(liveSet)
@@ -566,12 +600,36 @@ func (s *regAllocState) regalloc(f *Func) {
 	var phiRegs []register
 	var args []*Value

+	// statistics
+	var nSpills int               // # of spills remaining
+	var nSpillsInner int          // # of spills remaining in inner loops
+	var nSpillsSunk int           // # of sunk spills remaining
+	var nSpillsChanged int        // # of sunk spills lost because of register use change
+	var nSpillsSunkUnused int     // # of spills not sunk because they were removed completely
+	var nSpillsNotSunkLateUse int // # of spills not sunk because of very late use (in shuffle)
+
 	if f.Entry != f.Blocks[0] {
 		f.Fatalf("entry block must be first")
 	}

+	// Get loop nest so that spills in inner loops can be
+	// tracked.  When the last block of a loop is processed,
+	// attempt to move spills out of the loop.
+	s.loopnest.findExits()
+
+	// Spills are moved from one block's slice of values to another's.
+	// This confuses register allocation if it occurs before it is
+	// complete, so candidates are recorded, then rechecked and
+	// moved after all allocation (register and stack) is complete.
+	// Because movement is only within a stack slot's lifetime, it
+	// is safe to do this.
+	var toSink []spillToSink
+	// Will be used to figure out live inputs to exit blocks of inner loops.
+	entryCandidates := newSparseMap(f.NumValues())
+
 	for _, b := range f.Blocks {
 		s.curBlock = b
+		loop := s.loopForBlock(b)

 		// Initialize liveSet and uses fields for this block.
 		// Walk backwards through the block doing liveness analysis.
@@ -751,6 +809,11 @@ func (s *regAllocState) regalloc(f *Func) {
 				s.setOrig(spill, v)
 				s.values[v.ID].spill = spill
 				s.values[v.ID].spillUsed = false
+				if loop != nil {
+					loop.spills = append(loop.spills, v)
+					nSpillsInner++
+				}
+				nSpills++
 			}

 			// Save the starting state for use by merge edges.
@@ -970,6 +1033,11 @@ func (s *regAllocState) regalloc(f *Func) {
 				s.setOrig(spill, v)
 				s.values[v.ID].spill = spill
 				s.values[v.ID].spillUsed = false
+				if loop != nil {
+					loop.spills = append(loop.spills, v)
+					nSpillsInner++
+				}
+				nSpills++
 			}
 		}

@@ -1079,6 +1147,69 @@ func (s *regAllocState) regalloc(f *Func) {
 			s.values[e.ID].spillUsed = true
 		}

+		// Keep track of values that are spilled in the loop, but whose spill
+		// is not used in the loop.  It may be possible to move ("sink") the
+		// spill out of the loop into one or more exit blocks.
+		if loop != nil {
+			loop.scratch++                    // increment count of blocks in this loop that have been processed
+			if loop.scratch == loop.nBlocks { // just processed last block of loop, if it is an inner loop.
+				// This check is redundant with code at the top of the loop.
+				// This is definitive; the one at the top of the loop is an optimization.
+				if loop.isInner && // Common case, easier, most likely to be profitable
+					!loop.containsCall && // Calls force spills, also lead to puzzling spill info.
+					len(loop.exits) <= 32 { // Almost no inner loops have more than 32 exits,
+					// and this allows use of a bitvector and a sparseMap.
+
+					// TODO: exit calculation is messed up for non-inner loops
+					// because of multilevel exits that are not part of the "exit"
+					// count.
+
+					// Compute the set of spill-movement candidates live at entry to exit blocks.
+					// isLoopSpillCandidate filters for
+					// (1) defined in appropriate loop
+					// (2) needs a register
+					// (3) spill not already used (in the loop)
+					// Condition (3) === "in a register at all loop exits"
+
+					entryCandidates.clear()
+
+					for whichExit, ss := range loop.exits {
+						// Start with live at end.
+						for _, li := range s.live[ss.ID] {
+							if s.isLoopSpillCandidate(loop, s.orig[li.ID]) {
+								entryCandidates.setBit(li.ID, uint(whichExit))
+							}
+						}
+						// Control can also be live.
+						if ss.Control != nil && s.isLoopSpillCandidate(loop, ss.Control) {
+							entryCandidates.setBit(ss.Control.ID, uint(whichExit))
+						}
+						// Walk backwards, filling in locally live values, removing those defined.
+						for i := len(ss.Values) - 1; i >= 0; i-- {
+							v := ss.Values[i]
+							entryCandidates.remove(v.ID) // Cannot be an issue, only keeps the sets smaller.
+							for _, a := range v.Args {
+								if s.isLoopSpillCandidate(loop, a) {
+									entryCandidates.setBit(a.ID, uint(whichExit))
+								}
+							}
+						}
+					}
+
+					for _, e := range loop.spills {
+						whichblocks := entryCandidates.get(e.ID)
+						oldSpill := s.values[e.ID].spill
+						if whichblocks != 0 && whichblocks != -1 { // -1 = not in map.
+							toSink = append(toSink, spillToSink{spill: oldSpill, dests: whichblocks})
+						}
+					}
+
+				} // loop is inner etc
+				loop.scratch = 0 // Don't leave a mess, just in case.
+				loop.spills = nil
+			} // if scratch == nBlocks
+		} // if loop is not nil
+
 		// Clear any final uses.
 		// All that is left should be the pseudo-uses added for values which
 		// are live at the end of b.
@@ -1110,9 +1241,16 @@ func (s *regAllocState) regalloc(f *Func) {
 			// Constants, SP, SB, ...
 			continue
 		}
+		loop := s.loopForBlock(spill.Block)
+		if loop != nil {
+			nSpillsInner--
+		}
+
 		spill.Args[0].Uses--
 		f.freeValue(spill)
+		nSpills--
 	}
+
 	for _, b := range f.Blocks {
 		i := 0
 		for _, v := range b.Values {
@@ -1127,12 +1265,153 @@ func (s *regAllocState) regalloc(f *Func) {
 		// Not important now because this is the last phase that manipulates Values
 	}

+	// Must clear these out before any potential recycling, though that's
+	// not currently implemented.
+	for i, ts := range toSink {
+		vsp := ts.spill
+		if vsp.Op == OpInvalid { // This spill was completely eliminated
+			toSink[i].spill = nil
+		}
+	}
+
 	// Anything that didn't get a register gets a stack location here.
 	// (StoreReg, stack-based phis, inputs, ...)
 	stacklive := stackalloc(s.f, s.spillLive)

 	// Fix up all merge edges.
 	s.shuffle(stacklive)
+
+	// Insert moved spills (that have not been marked invalid above)
+	// at start of appropriate block and remove the originals from their
+	// location within loops.  Notice that this can break SSA form;
+	// if a spill is sunk to multiple exits, there will be no phi for that
+	// spill at a join point downstream of those two exits, though the
+	// two spills will target the same stack slot.  Notice also that this
+	// takes place after stack allocation, so the stack allocator does
+	// not need to process these malformed flow graphs.
+sinking:
+	for _, ts := range toSink {
+		vsp := ts.spill
+		if vsp == nil { // This spill was completely eliminated
+			nSpillsSunkUnused++
+			continue sinking
+		}
+		e := ts.spilledValue()
+		if s.values[e.ID].spillUsedShuffle {
+			nSpillsNotSunkLateUse++
+			continue sinking
+		}
+
+		// move spills to a better (outside of loop) block.
+		// This would be costly if it occurred very often, but it doesn't.
+		b := vsp.Block
+		loop := s.loopnest.b2l[b.ID]
+		dests := ts.dests
+
+		// Pre-check to be sure that spilled value is still in expected register on all exits where live.
+	check_val_still_in_reg:
+		for i := uint(0); i < 32 && dests != 0; i++ {
+
+			if dests&(1<<i) == 0 {
+				continue
+			}
+			dests ^= 1 << i
+			d := loop.exits[i]
+			if len(d.Preds) > 1 {
+				panic("Should be impossible given critical edges removed")
+			}
+			p := d.Preds[0] // block in loop exiting to d.
+
+			endregs := s.endRegs[p.ID]
+			for _, regrec := range endregs {
+				if regrec.v == e && regrec.r != noRegister && regrec.c == e { // TODO: regrec.c != e implies different spill possible.
+					continue check_val_still_in_reg
+				}
+			}
+			// If here, the register assignment was lost down at least one exit and it can't be sunk
+			if s.f.pass.debug > moveSpills {
+				s.f.Config.Warnl(e.Line, "lost register assignment for spill %v in %v at exit %v to %v",
+					vsp, b, p, d)
+			}
+			nSpillsChanged++
+			continue sinking
+		}
+
+		nSpillsSunk++
+		nSpillsInner--
+		// don't update nSpills, since spill is only moved, and if it is duplicated, the spills-on-a-path is not increased.
+
+		dests = ts.dests
+
+		// remove vsp from b.Values
+		i := 0
+		for _, w := range b.Values {
+			if vsp == w {
+				continue
+			}
+			b.Values[i] = w
+			i++
+		}
+		b.Values = b.Values[:i]
+
+		for i := uint(0); i < 32 && dests != 0; i++ {
+
+			if dests&(1<<i) == 0 {
+				continue
+			}
+
+			dests ^= 1 << i
+
+			d := loop.exits[i]
+			vspnew := d.NewValue1(e.Line, OpStoreReg, e.Type, e)
+
+			if s.f.pass.debug > moveSpills {
+				s.f.Config.Warnl(e.Line, "moved spill %v in %v for %v to %v in %v",
+					vsp, b, e, vspnew, d)
+			}
+
+			f.setHome(vspnew, f.getHome(vsp.ID)) // copy stack home
+
+			// shuffle vspnew to the beginning of its block
+			copy(d.Values[1:], d.Values[0:len(d.Values)-1])
+			d.Values[0] = vspnew
+		}
+	}
+
+	if f.pass.stats > 0 {
+		f.logStat("spills_info",
+			nSpills, "spills", nSpillsInner, "inner_spills_remaining", nSpillsSunk, "inner_spills_sunk", nSpillsSunkUnused, "inner_spills_unused", nSpillsNotSunkLateUse, "inner_spills_shuffled", nSpillsChanged, "inner_spills_changed")
+	}
+}
+
+// isLoopSpillCandidate indicates whether the spill for v satisfies preliminary
+// spill-sinking conditions just after the last block of loop has been processed.
+// In particular, all of the following must hold:
+//   v needs a register.
+//   v's spill is not (YET) used.
+//   v's definition is within loop.
+// The spill may be used in the future, either by an outright use
+// in the code, or by shuffling code inserted after stack allocation.
+// Outright uses cause sinking; shuffling (within the loop) inhibits it.
+func (s *regAllocState) isLoopSpillCandidate(loop *loop, v *Value) bool {
+	return s.values[v.ID].needReg && !s.values[v.ID].spillUsed && s.loopnest.b2l[v.Block.ID] == loop // the three conditions above, in order
+}
+
+// lateSpillUse notes a late (after stack allocation) use of spill c.
+// This will inhibit spill sinking.
+func (s *regAllocState) lateSpillUse(c *Value) {
+	// TODO investigate why this is necessary.
+	// It appears that an outside-the-loop use of
+	// an otherwise sinkable spill makes the spill
+	// a candidate for shuffling, when it would not
+	// otherwise have been the case (spillUsed was not
+	// true when isLoopSpillCandidate was called, yet
+	// it was shuffled).  Such shuffling cuts the amount
+	// of spill sinking by more than half (in make.bash)
+	v := s.orig[c.ID] // presumably the pre-regalloc value that c is a copy of — TODO confirm against setOrig
+	if v != nil {
+		s.values[v.ID].spillUsedShuffle = true // checked later by the sinking loop, which skips such values
+	}
 }

 // shuffle fixes up all the merge edges (those going into blocks of indegree > 1).
@@ -1307,6 +1586,7 @@ func (e *edgeState) process() {
 		if _, isReg := loc.(*Register); isReg {
 			c = e.p.NewValue1(c.Line, OpCopy, c.Type, c)
 		} else {
+			e.s.lateSpillUse(c)
 			c = e.p.NewValue1(c.Line, OpLoadReg, c.Type, c)
 		}
 		e.set(r, vid, c, false)
@@ -1395,6 +1675,7 @@ func (e *edgeState) processDest(loc Location, vid ID, splice **Value) bool {
 			}
 		} else {
 			if dstReg {
+				e.s.lateSpillUse(c)
 				x = e.p.NewValue1(c.Line, OpLoadReg, c.Type, c)
 			} else {
 				// mem->mem. Use temp register.
@@ -1412,6 +1693,7 @@ func (e *edgeState) processDest(loc Location, vid ID, splice **Value) bool {
 				e.erase(loc)

 				r := e.findRegFor(c.Type)
+				e.s.lateSpillUse(c)
 				t := e.p.NewValue1(c.Line, OpLoadReg, c.Type, c)
 				e.set(r, vid, t, false)
 				x = e.p.NewValue1(c.Line, OpStoreReg, loc.(LocalSlot).Type, t)

--- a/src/cmd/compile/internal/ssa/sparsemap.go
+++ b/src/cmd/compile/internal/ssa/sparsemap.go
@@ -32,6 +32,8 @@ func (s *sparseMap) contains(k ID) bool {
 	return i < len(s.dense) && s.dense[i].key == k
 }

+// get returns the value for key k, or -1 if k does
+// not appear in the map.
 func (s *sparseMap) get(k ID) int32 {
 	i := s.sparse[k]
 	if i < len(s.dense) && s.dense[i].key == k {
@@ -50,6 +52,20 @@ func (s *sparseMap) set(k ID, v int32) {
 	s.sparse[k] = len(s.dense) - 1
 }

+// setBit sets the v'th bit of k's value, where 0 <= v < 32. It panics if v >= 32; if k is not yet in the map it is added with only that bit set.
+func (s *sparseMap) setBit(k ID, v uint) {
+	if v >= 32 {
+		panic("bit index too large.")
+	}
+	i := s.sparse[k]
+	if i < len(s.dense) && s.dense[i].key == k { // k already present: OR the bit into its existing value
+		s.dense[i].val |= 1 << v
+		return
+	}
+	s.dense = append(s.dense, sparseEntry{k, 1 << v}) // k absent: append a new entry and point sparse[k] at it
+	s.sparse[k] = len(s.dense) - 1
+}
+
 func (s *sparseMap) remove(k ID) {
 	i := s.sparse[k]
 	if i < len(s.dense) && s.dense[i].key == k {