Commit c4ef597c authored by Cherry Zhang

cmd/compile: redo writebarrier pass

SSA's writebarrier pass requires that WB store ops always be at the
end of a block. If we move write barrier insertion into SSA and
emit normal Store ops when building SSA, this requirement becomes
impractical -- it would create too many blocks for all the Store
ops.

Redo SSA's writebarrier pass to explicitly order values in store
order, so it no longer needs this requirement.
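
For illustration (this snippet is not from the CL), a heap pointer
assignment like the one below is the kind of store that becomes a WB
store op (OpStoreWB) and is later expanded by this pass:

    package p

    type T struct{ p *int }

    // The pointer store t.p = x is compiled with a write barrier.
    func set(t *T, x *int) {
        t.p = x
    }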

Updates #17583.
Fixes #19067.

Change-Id: I66e817e526affb7e13517d4245905300a90b7170
Reviewed-on: https://go-review.googlesource.com/36834
Run-TryBot: Cherry Zhang <cherryyz@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: David Chase <drchase@google.com>
parent 98061fa5
@@ -3438,14 +3438,6 @@ func (s *state) insertWBmove(t *Type, left, right *ssa.Value, rightIsVolatile bo
}
val.Aux = &ssa.ExternSymbol{Typ: Types[TUINTPTR], Sym: Linksym(typenamesym(t))}
s.vars[&memVar] = val
// WB ops will be expanded to branches at writebarrier phase.
// To make it easy, we put WB ops at the end of a block, so
// that it does not need to split a block into two parts when
// expanding WB ops.
b := s.f.NewBlock(ssa.BlockPlain)
s.endBlock().AddEdgeTo(b)
s.startBlock(b)
}
// insertWBstore inserts the assignment *left = right including a write barrier.
@@ -3466,14 +3458,6 @@ func (s *state) insertWBstore(t *Type, left, right *ssa.Value, skip skipMask) {
}
s.storeTypeScalars(t, left, right, skip)
s.storeTypePtrsWB(t, left, right)
// WB ops will be expanded to branches at writebarrier phase.
// To make it easy, we put WB ops at the end of a block, so
// that it does not need to split a block into two parts when
// expanding WB ops.
b := s.f.NewBlock(ssa.BlockPlain)
s.endBlock().AddEdgeTo(b)
s.startBlock(b)
}
// do *left = right for all scalar (non-pointer) parts of t.
@@ -227,163 +227,3 @@ func nilcheckelim2(f *Func) {
// more unnecessary nil checks. Would fix test/nilptr3_ssa.go:157.
}
}
// storeOrder orders values with respect to stores. That is,
// if v transitively depends on store s, v is ordered after s,
// otherwise v is ordered before s.
// Specifically, values are ordered like
// store1
// NilCheck that depends on store1
// other values that depend on store1
// store2
// NilCheck that depends on store2
// other values that depend on store2
// ...
// The order of non-store and non-NilCheck values is undefined
// (not necessarily dependency order). This should be cheaper
// than a full scheduling as done in schedule.go.
// Note that simple dependency order won't work: there is no
// dependency between NilChecks and values like IsNonNil.
// Auxiliary data structures are passed in as arguments, so
// that they can be allocated in the caller and be reused.
// This function takes care of resetting them.
func storeOrder(values []*Value, sset *sparseSet, storeNumber []int32) []*Value {
// find all stores
var stores []*Value // members of values that are store values
hasNilCheck := false
sset.clear() // sset is the set of stores that are used in other values
for _, v := range values {
if v.Type.IsMemory() {
stores = append(stores, v)
if v.Op == OpInitMem || v.Op == OpPhi {
continue
}
a := v.Args[len(v.Args)-1]
if v.Op == OpSelect1 {
a = a.Args[len(a.Args)-1]
}
sset.add(a.ID) // record that a is used
}
if v.Op == OpNilCheck {
hasNilCheck = true
}
}
if len(stores) == 0 || !hasNilCheck {
// there is no store or no nil check, so the order does not matter
return values
}
f := stores[0].Block.Func
// find last store, which is the one that is not used by other stores
var last *Value
for _, v := range stores {
if !sset.contains(v.ID) {
if last != nil {
f.Fatalf("two stores live simutaneously: %v and %v", v, last)
}
last = v
}
}
// We assign a store number to each value. Store number is the
// index of the latest store that this value transitively depends on.
// The i-th store in the current block gets store number 3*i. A nil
// check that depends on the i-th store gets store number 3*i+1.
// Other values that depend on the i-th store get store number 3*i+2.
// Special case: 0 -- unassigned, 1 or 2 -- the latest store it depends
// on is in the previous block (or no store at all, e.g. value is Const).
// First we assign the number to all stores by walking back the store chain,
// then assign the number to other values in DFS order.
count := make([]int32, 3*(len(stores)+1))
sset.clear() // reuse sparse set to ensure that a value is pushed to stack only once
for n, w := len(stores), last; n > 0; n-- {
storeNumber[w.ID] = int32(3 * n)
count[3*n]++
sset.add(w.ID)
if w.Op == OpInitMem || w.Op == OpPhi {
if n != 1 {
f.Fatalf("store order is wrong: there are stores before %v", w)
}
break
}
if w.Op == OpSelect1 {
w = w.Args[0]
}
w = w.Args[len(w.Args)-1]
}
var stack []*Value
for _, v := range values {
if sset.contains(v.ID) {
// in sset means v is a store, or already pushed to stack, or already assigned a store number
continue
}
stack = append(stack, v)
sset.add(v.ID)
for len(stack) > 0 {
w := stack[len(stack)-1]
if storeNumber[w.ID] != 0 {
stack = stack[:len(stack)-1]
continue
}
if w.Op == OpPhi {
// Phi value doesn't depend on a store in the current block.
// Do this early to avoid dependency cycle.
storeNumber[w.ID] = 2
count[2]++
stack = stack[:len(stack)-1]
continue
}
max := int32(0) // latest store dependency
argsdone := true
for _, a := range w.Args {
if a.Block != w.Block {
continue
}
if !sset.contains(a.ID) {
stack = append(stack, a)
sset.add(a.ID)
argsdone = false
continue
}
if storeNumber[a.ID]/3 > max {
max = storeNumber[a.ID] / 3
}
}
if !argsdone {
continue
}
n := 3*max + 2
if w.Op == OpNilCheck {
n = 3*max + 1
}
storeNumber[w.ID] = n
count[n]++
stack = stack[:len(stack)-1]
}
}
// convert count to prefix sum of counts: count'[i] = sum_{j<=i} count[j]
for i := range count {
if i == 0 {
continue
}
count[i] += count[i-1]
}
if count[len(count)-1] != int32(len(values)) {
f.Fatalf("storeOrder: value is missing, total count = %d, values = %v", count[len(count)-1], values)
}
// place values in count-indexed bins, which are in the desired store order
order := make([]*Value, len(values))
for _, v := range values {
s := storeNumber[v.ID]
order[count[s-1]] = v
count[s-1]++
}
return order
}
@@ -277,3 +277,167 @@ func schedule(f *Func) {
f.scheduled = true
}
// storeOrder orders values with respect to stores. That is,
// if v transitively depends on store s, v is ordered after s,
// otherwise v is ordered before s.
// Specifically, values are ordered like
// store1
// NilCheck that depends on store1
// other values that depend on store1
// store2
// NilCheck that depends on store2
// other values that depend on store2
// ...
// The order of non-store and non-NilCheck values is undefined
// (not necessarily dependency order). This should be cheaper
// than a full scheduling as done above.
// Note that simple dependency order won't work: there is no
// dependency between NilChecks and values like IsNonNil.
// Auxiliary data structures are passed in as arguments, so
// that they can be allocated in the caller and be reused.
// This function takes care of resetting them.
func storeOrder(values []*Value, sset *sparseSet, storeNumber []int32) []*Value {
if len(values) == 0 {
return values
}
f := values[0].Block.Func
// find all stores
var stores []*Value // members of values that are store values
hasNilCheck := false
sset.clear() // sset is the set of stores that are used in other values
for _, v := range values {
if v.Type.IsMemory() {
stores = append(stores, v)
if v.Op == OpInitMem || v.Op == OpPhi {
continue
}
a := v.Args[len(v.Args)-1]
if v.Op == OpSelect1 {
a = a.Args[len(a.Args)-1]
}
sset.add(a.ID) // record that a is used
}
if v.Op == OpNilCheck {
hasNilCheck = true
}
}
if len(stores) == 0 || !hasNilCheck && f.pass.name == "nilcheckelim" {
// there is no store, or (when called from nilcheckelim) no nil check -- the order does not matter
return values
}
// find last store, which is the one that is not used by other stores
var last *Value
for _, v := range stores {
if !sset.contains(v.ID) {
if last != nil {
f.Fatalf("two stores live simutaneously: %v and %v", v, last)
}
last = v
}
}
// We assign a store number to each value. Store number is the
// index of the latest store that this value transitively depends on.
// The i-th store in the current block gets store number 3*i. A nil
// check that depends on the i-th store gets store number 3*i+1.
// Other values that depend on the i-th store get store number 3*i+2.
// Special case: 0 -- unassigned, 1 or 2 -- the latest store it depends
// on is in the previous block (or no store at all, e.g. value is Const).
// First we assign the number to all stores by walking back the store chain,
// then assign the number to other values in DFS order.
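// For example, with two stores in the block, a possible assignment is:
//   Const (no store dependency in this block) -> 2
//   store1                                    -> 3
//   NilCheck depending on store1              -> 4
//   Load depending on store1                  -> 5
//   store2                                    -> 6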
count := make([]int32, 3*(len(stores)+1))
sset.clear() // reuse sparse set to ensure that a value is pushed to stack only once
for n, w := len(stores), last; n > 0; n-- {
storeNumber[w.ID] = int32(3 * n)
count[3*n]++
sset.add(w.ID)
if w.Op == OpInitMem || w.Op == OpPhi {
if n != 1 {
f.Fatalf("store order is wrong: there are stores before %v", w)
}
break
}
if w.Op == OpSelect1 {
w = w.Args[0]
}
w = w.Args[len(w.Args)-1]
}
var stack []*Value
for _, v := range values {
if sset.contains(v.ID) {
// in sset means v is a store, or already pushed to stack, or already assigned a store number
continue
}
stack = append(stack, v)
sset.add(v.ID)
for len(stack) > 0 {
w := stack[len(stack)-1]
if storeNumber[w.ID] != 0 {
stack = stack[:len(stack)-1]
continue
}
if w.Op == OpPhi {
// Phi value doesn't depend on a store in the current block.
// Do this early to avoid dependency cycle.
storeNumber[w.ID] = 2
count[2]++
stack = stack[:len(stack)-1]
continue
}
max := int32(0) // latest store dependency
argsdone := true
for _, a := range w.Args {
if a.Block != w.Block {
continue
}
if !sset.contains(a.ID) {
stack = append(stack, a)
sset.add(a.ID)
argsdone = false
continue
}
if storeNumber[a.ID]/3 > max {
max = storeNumber[a.ID] / 3
}
}
if !argsdone {
continue
}
n := 3*max + 2
if w.Op == OpNilCheck {
n = 3*max + 1
}
storeNumber[w.ID] = n
count[n]++
stack = stack[:len(stack)-1]
}
}
// convert count to prefix sum of counts: count'[i] = sum_{j<=i} count[j]
for i := range count {
if i == 0 {
continue
}
count[i] += count[i-1]
}
if count[len(count)-1] != int32(len(values)) {
f.Fatalf("storeOrder: value is missing, total count = %d, values = %v", count[len(count)-1], values)
}
// place values in count-indexed bins, which are in the desired store order
order := make([]*Value, len(values))
for _, v := range values {
s := storeNumber[v.ID]
order[count[s-1]] = v
count[s-1]++
}
return order
}
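
Aside (editor's sketch, not part of the commit): the final placement in
storeOrder is a counting sort keyed by store number. Below is a minimal
self-contained toy of just that placement step, with hand-assigned store
numbers and a toy type standing in for the compiler's *Value:

    package main

    import "fmt"

    // value is a toy stand-in for *ssa.Value: a name plus the store
    // number the DFS in storeOrder would have assigned to it.
    type value struct {
        name string
        num  int32
    }

    // orderByStoreNumber mirrors storeOrder's count / prefix-sum /
    // placement logic for a block containing nstores stores.
    func orderByStoreNumber(values []value, nstores int) []value {
        count := make([]int32, 3*(nstores+1))
        for _, v := range values {
            count[v.num]++
        }
        for i := 1; i < len(count); i++ {
            count[i] += count[i-1] // count[i] is now one past the end of bin i
        }
        order := make([]value, len(values))
        for _, v := range values {
            order[count[v.num-1]] = v // count[v.num-1] is the start of bin v.num
            count[v.num-1]++
        }
        return order
    }

    func main() {
        // store1 -> 3, its nil check -> 4, a dependent load -> 5,
        // store2 -> 6, a constant with no store dependency -> 2.
        vs := []value{{"load", 5}, {"const", 2}, {"store2", 6}, {"nilcheck", 4}, {"store1", 3}}
        for _, v := range orderByStoreNumber(vs, 2) {
            fmt.Print(v.name, " ")
        }
        fmt.Println() // prints: const store1 nilcheck load store2
    }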
@@ -22,21 +22,18 @@ import (
// and a normal store will be used.
// A sequence of WB stores for many pointer fields of a single type will
// be emitted together, with a single branch.
//
// Expanding WB ops introduces new control flows, and we would need to
// split a block into two if there were values after WB ops, which would
// require scheduling the values. To avoid this complexity, when building
// SSA, we make sure that WB ops are always at the end of a block. We do
// this before fuse as it may merge blocks. It also helps to reduce
// number of blocks as fuse merges blocks introduced in this phase.
func writebarrier(f *Func) {
var sb, sp, wbaddr *Value
var sb, sp, wbaddr, const0 *Value
var writebarrierptr, typedmemmove, typedmemclr *obj.LSym
var storeWBs, others []*Value
var wbs *sparseSet
for _, b := range f.Blocks { // range loop is safe since the blocks we added contain no WB stores
valueLoop:
for i, v := range b.Values {
var stores, after []*Value
var sset *sparseSet
var storeNumber []int32
for _, b := range f.Blocks { // range loop is safe since the blocks we added contain no stores to expand
// rewrite write barrier for stack writes to ordinary Store/Move/Zero,
// record presence of non-stack WB ops.
hasStore := false
for _, v := range b.Values {
switch v.Op {
case OpStoreWB, OpMoveWB, OpMoveWBVolatile, OpZeroWB:
if IsStackAddr(v.Args[0]) {
@@ -52,11 +49,18 @@ func writebarrier(f *Func) {
}
continue
}
hasStore = true
break
}
}
if !hasStore {
continue
}
if wbaddr == nil {
// initalize global values for write barrier test and calls
// lazily initialize global values for write barrier test and calls
// find SB and SP values in entry block
initln := f.Entry.Pos
initpos := f.Entry.Pos
for _, v := range f.Entry.Values {
if v.Op == OpSB {
sb = v
@@ -64,72 +68,67 @@ func writebarrier(f *Func) {
if v.Op == OpSP {
sp = v
}
if sb != nil && sp != nil {
break
}
}
if sb == nil {
sb = f.Entry.NewValue0(initln, OpSB, f.Config.fe.TypeUintptr())
sb = f.Entry.NewValue0(initpos, OpSB, f.Config.fe.TypeUintptr())
}
if sp == nil {
sp = f.Entry.NewValue0(initln, OpSP, f.Config.fe.TypeUintptr())
sp = f.Entry.NewValue0(initpos, OpSP, f.Config.fe.TypeUintptr())
}
wbsym := &ExternSymbol{Typ: f.Config.fe.TypeBool(), Sym: f.Config.fe.Syslook("writeBarrier")}
wbaddr = f.Entry.NewValue1A(initln, OpAddr, f.Config.fe.TypeUInt32().PtrTo(), wbsym, sb)
wbaddr = f.Entry.NewValue1A(initpos, OpAddr, f.Config.fe.TypeUInt32().PtrTo(), wbsym, sb)
writebarrierptr = f.Config.fe.Syslook("writebarrierptr")
typedmemmove = f.Config.fe.Syslook("typedmemmove")
typedmemclr = f.Config.fe.Syslook("typedmemclr")
const0 = f.ConstInt32(initpos, f.Config.fe.TypeUInt32(), 0)
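// const0 is created once per function and shared by every write
// barrier test below (it used to be recreated for each sequence)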
wbs = f.newSparseSet(f.NumValues())
defer f.retSparseSet(wbs)
// allocate auxiliary data structures for computing store order
sset = f.newSparseSet(f.NumValues())
defer f.retSparseSet(sset)
storeNumber = make([]int32, f.NumValues())
}
pos := v.Pos
// order values in store order
b.Values = storeOrder(b.Values, sset, storeNumber)
// there may be a sequence of WB stores in the current block. find them.
storeWBs = storeWBs[:0]
others = others[:0]
wbs.clear()
for _, w := range b.Values[i:] {
again:
// find the start and end of the last contiguous WB store sequence.
// a branch will be inserted there. values after it will be moved
// to a new block.
var last *Value
var start, end int
values := b.Values
for i := len(values) - 1; i >= 0; i-- {
w := values[i]
if w.Op == OpStoreWB || w.Op == OpMoveWB || w.Op == OpMoveWBVolatile || w.Op == OpZeroWB {
storeWBs = append(storeWBs, w)
wbs.add(w.ID)
} else {
others = append(others, w)
}
}
// make sure that no value in this block depends on WB stores
for _, w := range b.Values {
if w.Op == OpStoreWB || w.Op == OpMoveWB || w.Op == OpMoveWBVolatile || w.Op == OpZeroWB {
continue
if last == nil {
last = w
end = i + 1
}
for _, a := range w.Args {
if wbs.contains(a.ID) {
f.Fatalf("value %v depends on WB store %v in the same block %v", w, a, b)
} else {
if last != nil {
start = i + 1
break
}
}
}
stores = append(stores[:0], b.Values[start:end]...) // copy to avoid aliasing
after = append(after[:0], b.Values[end:]...)
b.Values = b.Values[:start]
// find the memory before the WB stores
// this memory is not a WB store but it is used in a WB store.
var mem *Value
for _, w := range storeWBs {
a := w.Args[len(w.Args)-1]
if wbs.contains(a.ID) {
continue
}
if mem != nil {
b.Fatalf("two stores live simultaneously: %s, %s", mem, a)
}
mem = a
}
b.Values = append(b.Values[:i], others...) // move WB ops out of this block
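// the memory state entering the sequence is the memory argument of its first store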
mem := stores[0].Args[len(stores[0].Args)-1]
pos := stores[0].Pos
bThen := f.NewBlock(BlockPlain)
bElse := f.NewBlock(BlockPlain)
bEnd := f.NewBlock(b.Kind)
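// resulting control flow: b (flag test) branches to bThen (barrier
// stores) and bElse (plain stores), which both rejoin at bEnd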
bThen.Pos = pos
bElse.Pos = pos
bEnd.Pos = pos
bEnd.Pos = b.Pos
b.Pos = pos
// set up control flow for end block
bEnd.SetControl(b.Control)
@@ -142,7 +141,6 @@ func writebarrier(f *Func) {
// set up control flow for write barrier test
// load word, test word, avoiding partial register write from load byte.
flag := b.NewValue2(pos, OpLoad, f.Config.fe.TypeUInt32(), wbaddr, mem)
const0 := f.ConstInt32(pos, f.Config.fe.TypeUInt32(), 0)
flag = b.NewValue2(pos, OpNeq32, f.Config.fe.TypeBool(), flag, const0)
b.Kind = BlockIf
b.SetControl(flag)
@@ -153,13 +151,16 @@ func writebarrier(f *Func) {
bThen.AddEdgeTo(bEnd)
bElse.AddEdgeTo(bEnd)
// for each write barrier store, append write barrier version to bThen
// and simple store version to bElse
memThen := mem
memElse := mem
for _, w := range storeWBs {
for _, w := range stores {
var val *Value
ptr := w.Args[0]
siz := w.AuxInt
typ := w.Aux // only non-nil for MoveWB, MoveWBVolatile, ZeroWB
pos = w.Pos
var op Op
var fn *obj.LSym
@@ -186,6 +187,10 @@ func writebarrier(f *Func) {
} else {
memElse = bElse.NewValue3I(pos, op, TypeMem, siz, ptr, val, memElse)
}
if f.Config.fe.Debug_wb() {
f.Config.Warnl(pos, "write barrier")
}
}
// merge memory
@@ -193,46 +198,33 @@ func writebarrier(f *Func) {
// which may be used in subsequent blocks. Other memories in the
// sequence must be dead after this block since there can be only
// one memory live.
last := storeWBs[0]
if len(storeWBs) > 1 {
// find the last store
last = nil
wbs.clear() // we reuse wbs to record WB stores that are used in another WB store
for _, w := range storeWBs {
wbs.add(w.Args[len(w.Args)-1].ID)
}
for _, w := range storeWBs {
if wbs.contains(w.ID) {
continue
}
if last != nil {
b.Fatalf("two stores live simultaneously: %s, %s", last, w)
}
last = w
}
}
bEnd.Values = append(bEnd.Values, last)
last.Block = bEnd
last.reset(OpPhi)
last.Type = TypeMem
last.AddArg(memThen)
last.AddArg(memElse)
for _, w := range storeWBs {
for _, w := range stores {
if w != last {
w.resetArgs()
}
}
for _, w := range storeWBs {
for _, w := range stores {
if w != last {
f.freeValue(w)
}
}
if f.Config.fe.Debug_wb() {
f.Config.Warnl(pos, "write barrier")
// put values after the store sequence into the end block
bEnd.Values = append(bEnd.Values, after...)
for _, w := range after {
w.Block = bEnd
}
break valueLoop
// if we have more stores in this block, do this block again
for _, w := range b.Values {
if w.Op == OpStoreWB || w.Op == OpMoveWB || w.Op == OpMoveWBVolatile || w.Op == OpZeroWB {
goto again
}
}
}
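
Aside (editor's sketch, not part of the commit): the expanded form of one
contiguous WB store sequence behaves like the Go below -- a single load
and test of the runtime's write barrier flag word guards the whole run
of stores. The names writeBarrierFlag and barrierStore are invented for
the toy; the real flag and writebarrierptr live in the runtime and are
reached via Syslook, not via Go source like this:

    package main

    import "fmt"

    var writeBarrierFlag uint32 // toy stand-in for the runtime's flag word

    // barrierStore stands in for the writebarrierptr call on the bThen path.
    func barrierStore(dst **int, src *int) {
        fmt.Println("writebarrierptr")
        *dst = src
    }

    func main() {
        var x, y int
        var p, q *int
        if writeBarrierFlag != 0 { // one branch for the whole sequence (bThen)
            barrierStore(&p, &x)
            barrierStore(&q, &y)
        } else { // plain stores (bElse)
            p = &x
            q = &y
        }
        fmt.Println(p != nil, q != nil) // rest of the block (bEnd)
    }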