Commit 25e0a367 authored by Keith Randall

[dev.ssa] cmd/compile: clean up tuple types and selects

Make tuple types and their SelectX ops fully generic.
These ops no longer need to be lowered.
Regalloc understands them and their tuple-generating arguments.
We can now have opcodes returning arbitrary pairs of results.
(And it would be easy to move to >2 results if needed.)
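
An illustrative sketch of the new shape: a two-result op is a single
tuple-typed SSA value, and the generic Select0/Select1 ops project out
its elements, e.g.

	v = ADDLcarry <(flags, uint32)> x y   // tuple-generating op
	c = Select0 <flags>  v                // element 0: the carry flags
	s = Select1 <uint32> v                // element 1: the 32-bit sum

Regalloc records one register per element for v; see the LocPair type
added below.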

Update arm implementation to the new standard.
Implement just enough in the 386 port to do 64-bit add.

Change-Id: I370ed5aacce219c82e1954c61d1f63af76c16f79
Reviewed-on: https://go-review.googlesource.com/24976
Reviewed-by: Cherry Zhang <cherryyz@google.com>
Run-TryBot: Keith Randall <khr@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
parent 6b6de15d
......@@ -278,7 +278,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
p.To.Reg = r
case ssa.OpARMADDS,
ssa.OpARMSUBS:
r := gc.SSARegNum(v)
r := gc.SSARegNum1(v)
r1 := gc.SSARegNum(v.Args[0])
r2 := gc.SSARegNum(v.Args[1])
p := gc.Prog(v.Op.Asm())
......@@ -351,7 +351,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
p.From.Offset = v.AuxInt
p.Reg = gc.SSARegNum(v.Args[0])
p.To.Type = obj.TYPE_REG
p.To.Reg = gc.SSARegNum(v)
p.To.Reg = gc.SSARegNum1(v)
case ssa.OpARMSRRconst:
genshift(arm.AMOVW, 0, gc.SSARegNum(v.Args[0]), gc.SSARegNum(v), arm.SHIFT_RR, v.AuxInt)
case ssa.OpARMADDshiftLL,
......@@ -368,7 +368,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
case ssa.OpARMADDSshiftLL,
ssa.OpARMSUBSshiftLL,
ssa.OpARMRSBSshiftLL:
p := genshift(v.Op.Asm(), gc.SSARegNum(v.Args[0]), gc.SSARegNum(v.Args[1]), gc.SSARegNum(v), arm.SHIFT_LL, v.AuxInt)
p := genshift(v.Op.Asm(), gc.SSARegNum(v.Args[0]), gc.SSARegNum(v.Args[1]), gc.SSARegNum1(v), arm.SHIFT_LL, v.AuxInt)
p.Scond = arm.C_SBIT
case ssa.OpARMADDshiftRL,
ssa.OpARMADCshiftRL,
......@@ -384,7 +384,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
case ssa.OpARMADDSshiftRL,
ssa.OpARMSUBSshiftRL,
ssa.OpARMRSBSshiftRL:
p := genshift(v.Op.Asm(), gc.SSARegNum(v.Args[0]), gc.SSARegNum(v.Args[1]), gc.SSARegNum(v), arm.SHIFT_LR, v.AuxInt)
p := genshift(v.Op.Asm(), gc.SSARegNum(v.Args[0]), gc.SSARegNum(v.Args[1]), gc.SSARegNum1(v), arm.SHIFT_LR, v.AuxInt)
p.Scond = arm.C_SBIT
case ssa.OpARMADDshiftRA,
ssa.OpARMADCshiftRA,
......@@ -400,7 +400,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
case ssa.OpARMADDSshiftRA,
ssa.OpARMSUBSshiftRA,
ssa.OpARMRSBSshiftRA:
p := genshift(v.Op.Asm(), gc.SSARegNum(v.Args[0]), gc.SSARegNum(v.Args[1]), gc.SSARegNum(v), arm.SHIFT_AR, v.AuxInt)
p := genshift(v.Op.Asm(), gc.SSARegNum(v.Args[0]), gc.SSARegNum(v.Args[1]), gc.SSARegNum1(v), arm.SHIFT_AR, v.AuxInt)
p.Scond = arm.C_SBIT
case ssa.OpARMMVNshiftLL:
genshift(v.Op.Asm(), 0, gc.SSARegNum(v.Args[0]), gc.SSARegNum(v), arm.SHIFT_LL, v.AuxInt)
......@@ -428,7 +428,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
case ssa.OpARMADDSshiftLLreg,
ssa.OpARMSUBSshiftLLreg,
ssa.OpARMRSBSshiftLLreg:
p := genregshift(v.Op.Asm(), gc.SSARegNum(v.Args[0]), gc.SSARegNum(v.Args[1]), gc.SSARegNum(v.Args[2]), gc.SSARegNum(v), arm.SHIFT_LL)
p := genregshift(v.Op.Asm(), gc.SSARegNum(v.Args[0]), gc.SSARegNum(v.Args[1]), gc.SSARegNum(v.Args[2]), gc.SSARegNum1(v), arm.SHIFT_LL)
p.Scond = arm.C_SBIT
case ssa.OpARMADDshiftRLreg,
ssa.OpARMADCshiftRLreg,
......@@ -444,7 +444,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
case ssa.OpARMADDSshiftRLreg,
ssa.OpARMSUBSshiftRLreg,
ssa.OpARMRSBSshiftRLreg:
p := genregshift(v.Op.Asm(), gc.SSARegNum(v.Args[0]), gc.SSARegNum(v.Args[1]), gc.SSARegNum(v.Args[2]), gc.SSARegNum(v), arm.SHIFT_LR)
p := genregshift(v.Op.Asm(), gc.SSARegNum(v.Args[0]), gc.SSARegNum(v.Args[1]), gc.SSARegNum(v.Args[2]), gc.SSARegNum1(v), arm.SHIFT_LR)
p.Scond = arm.C_SBIT
case ssa.OpARMADDshiftRAreg,
ssa.OpARMADCshiftRAreg,
......@@ -460,7 +460,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
case ssa.OpARMADDSshiftRAreg,
ssa.OpARMSUBSshiftRAreg,
ssa.OpARMRSBSshiftRAreg:
p := genregshift(v.Op.Asm(), gc.SSARegNum(v.Args[0]), gc.SSARegNum(v.Args[1]), gc.SSARegNum(v.Args[2]), gc.SSARegNum(v), arm.SHIFT_AR)
p := genregshift(v.Op.Asm(), gc.SSARegNum(v.Args[0]), gc.SSARegNum(v.Args[1]), gc.SSARegNum(v.Args[2]), gc.SSARegNum1(v), arm.SHIFT_AR)
p.Scond = arm.C_SBIT
case ssa.OpARMHMUL,
ssa.OpARMHMULU:
......@@ -473,14 +473,14 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
p.To.Reg = gc.SSARegNum(v)
p.To.Offset = arm.REGTMP // throw away low 32-bit into tmp register
case ssa.OpARMMULLU:
// 32-bit multiplication, results 64-bit, low 32-bit in reg(v), high 32-bit in R0
// 32-bit multiplication, results 64-bit, high 32-bit in out0, low 32-bit in out1
p := gc.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = gc.SSARegNum(v.Args[0])
p.Reg = gc.SSARegNum(v.Args[1])
p.To.Type = obj.TYPE_REGREG
p.To.Reg = arm.REG_R0 // high 32-bit
p.To.Offset = int64(gc.SSARegNum(v)) // low 32-bit
p.To.Reg = gc.SSARegNum0(v) // high 32-bit
p.To.Offset = int64(gc.SSARegNum1(v)) // low 32-bit
case ssa.OpARMMULA:
p := gc.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
......@@ -928,9 +928,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
p.From.Offset = 1
p.To.Type = obj.TYPE_REG
p.To.Reg = gc.SSARegNum(v)
case ssa.OpARMCarry,
ssa.OpARMLoweredSelect0,
ssa.OpARMLoweredSelect1:
case ssa.OpSelect0, ssa.OpSelect1:
// nothing to do
case ssa.OpARMLoweredGetClosurePtr:
// Closure pointer is R7 (arm.REGCTXT).
......
......@@ -4266,11 +4266,39 @@ func SSAReg(v *ssa.Value) *ssa.Register {
return reg.(*ssa.Register)
}
// SSAReg0 returns the register to which the first output of v has been allocated.
func SSAReg0(v *ssa.Value) *ssa.Register {
reg := v.Block.Func.RegAlloc[v.ID].(ssa.LocPair)[0]
if reg == nil {
v.Fatalf("nil first register for value: %s\n%s\n", v.LongString(), v.Block.Func)
}
return reg.(*ssa.Register)
}
// SSAReg1 returns the register to which the second output of v has been allocated.
func SSAReg1(v *ssa.Value) *ssa.Register {
reg := v.Block.Func.RegAlloc[v.ID].(ssa.LocPair)[1]
if reg == nil {
v.Fatalf("nil second register for value: %s\n%s\n", v.LongString(), v.Block.Func)
}
return reg.(*ssa.Register)
}
// SSARegNum returns the register number (in cmd/internal/obj numbering) to which v has been allocated.
func SSARegNum(v *ssa.Value) int16 {
return Thearch.SSARegToReg[SSAReg(v).Num]
}
// SSARegNum0 returns the register number (in cmd/internal/obj numbering) to which the first output of v has been allocated.
func SSARegNum0(v *ssa.Value) int16 {
return Thearch.SSARegToReg[SSAReg0(v).Num]
}
// SSARegNum1 returns the register number (in cmd/internal/obj numbering) to which the second output of v has been allocated.
func SSARegNum1(v *ssa.Value) int16 {
return Thearch.SSARegToReg[SSAReg1(v).Num]
}
// CheckLoweredPhi checks that regalloc and stackalloc correctly handled phi values.
// Called during ssaGenValue.
func CheckLoweredPhi(v *ssa.Value) {
......
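A minimal usage sketch for the new helpers (using only the functions
added above): a backend emitting a two-output instruction reads one
register per tuple element, as the ARM MULLU case does for its
high/low halves:

	hi := gc.SSARegNum0(v) // register allocated to output 0
	lo := gc.SSARegNum1(v) // register allocated to output 1

Both index the ssa.LocPair that regalloc stored in RegAlloc[v.ID].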
......@@ -171,10 +171,10 @@ func cse(f *Func) {
if rewrite[v.ID] != nil {
continue
}
if !v.Op.isTupleSelector() {
if v.Op != OpSelect0 && v.Op != OpSelect1 {
continue
}
if !v.Args[0].Op.isTupleGenerator() {
if !v.Args[0].Type.IsTuple() {
f.Fatalf("arg of tuple selector %s is not a tuple: %s", v.String(), v.Args[0].LongString())
}
t := rewrite[v.Args[0].ID]
......
......@@ -10,6 +10,9 @@
(Add32F x y) -> (ADDSS x y)
(Add64F x y) -> (ADDSD x y)
(Add32carry x y) -> (ADDLcarry x y)
(Add32withcarry x y c) -> (ADCL x y c)
(SubPtr x y) -> (SUBL x y)
(Sub32 x y) -> (SUBL x y)
(Sub16 x y) -> (SUBL x y)
......
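A sketch of how these rules compose for the 64-bit add mentioned in the
commit message (approximate generic decomposition, types simplified):

	(Add64 x y) ->
		(Int64Make
			(Add32withcarry (Int64Hi x) (Int64Hi y)
				(Select0 <flags> (Add32carry (Int64Lo x) (Int64Lo y))))
			(Select1 <uint32> (Add32carry (Int64Lo x) (Int64Lo y))))

Select0 of the Add32carry tuple is the carry-out flags (output 0) and
Select1 is the low 32-bit sum (output 1); cse merges the two identical
Add32carry values, so a single ADDLcarry is emitted.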
......@@ -99,6 +99,8 @@ func init() {
gp11nf = regInfo{inputs: []regMask{gpsp}, outputs: gponly} // nf: no flags clobbered
gp11sb = regInfo{inputs: []regMask{gpspsb}, outputs: gponly}
gp21 = regInfo{inputs: []regMask{gp, gp}, outputs: gponly, clobbers: flags}
gp21carry = regInfo{inputs: []regMask{gp, gp}, outputs: []regMask{flags, gp}}
gp2carry1 = regInfo{inputs: []regMask{gp, gp, flags}, outputs: gponly}
gp21sp = regInfo{inputs: []regMask{gpsp, gp}, outputs: gponly, clobbers: flags}
gp21sb = regInfo{inputs: []regMask{gpspsb, gpsp}, outputs: gponly}
gp21shift = regInfo{inputs: []regMask{gp, cx}, outputs: []regMask{gp}, clobbers: flags}
......@@ -171,6 +173,9 @@ func init() {
{name: "ADDL", argLength: 2, reg: gp21sp, asm: "ADDL", commutative: true}, // arg0 + arg1
{name: "ADDLconst", argLength: 1, reg: gp11sp, asm: "ADDL", aux: "Int32", typ: "UInt32"}, // arg0 + auxint
{name: "ADDLcarry", argLength: 2, reg: gp21carry, asm: "ADDL", commutative: true, resultInArg0: true}, // arg0 + arg1, generates <carry,result> pair
{name: "ADCL", argLength: 3, reg: gp2carry1, asm: "ADCL", commutative: true, resultInArg0: true}, // arg0+arg1+carry(arg2), where arg2 is flags
{name: "SUBL", argLength: 2, reg: gp21, asm: "SUBL", resultInArg0: true}, // arg0 - arg1
{name: "SUBLconst", argLength: 1, reg: gp11, asm: "SUBL", aux: "Int32", resultInArg0: true}, // arg0 - auxint
......
......@@ -366,10 +366,6 @@
(IsSliceInBounds idx len) -> (LessEqualU (CMP idx len))
// pseudo-ops
(Select0 <t> x) && t.IsFlags() -> (Carry x)
(Select0 <t> x) && !t.IsFlags() -> (LoweredSelect0 x)
(Select1 x) -> (LoweredSelect1 x)
(GetClosurePtr) -> (LoweredGetClosurePtr)
(Convert x mem) -> (MOVWconvert x mem)
......
......@@ -42,8 +42,8 @@ type opData struct {
aux string
rematerializeable bool
argLength int32 // number of arguments, if -1, then this operation has a variable number of arguments
commutative bool // this operation is commutative (e.g. addition)
resultInArg0 bool // v and v.Args[0] must be allocated to the same register
commutative bool // this operation is commutative on its first 2 arguments (e.g. addition)
resultInArg0 bool // last output of v and v.Args[0] must be allocated to the same register
}
type blockData struct {
......@@ -160,11 +160,11 @@ func genOp() {
}
if v.resultInArg0 {
fmt.Fprintln(w, "resultInArg0: true,")
if v.reg.inputs[0] != v.reg.outputs[0] {
log.Fatalf("input[0] and output registers must be equal for %s", v.name)
if v.reg.inputs[0] != v.reg.outputs[len(v.reg.outputs)-1] {
log.Fatalf("input[0] and last output register must be equal for %s", v.name)
}
if v.commutative && v.reg.inputs[1] != v.reg.outputs[0] {
log.Fatalf("input[1] and output registers must be equal for %s", v.name)
if v.commutative && v.reg.inputs[1] != v.reg.outputs[len(v.reg.outputs)-1] {
log.Fatalf("input[1] and last output register must be equal for %s", v.name)
}
}
if a.name == "generic" {
......@@ -196,14 +196,24 @@ func genOp() {
}
fmt.Fprintln(w, "},")
}
if v.reg.clobbers > 0 {
fmt.Fprintf(w, "clobbers: %d,%s\n", v.reg.clobbers, a.regMaskComment(v.reg.clobbers))
}
// reg outputs
if len(v.reg.outputs) > 0 {
fmt.Fprintln(w, "outputs: []regMask{")
for _, r := range v.reg.outputs {
fmt.Fprintf(w, "%d,%s\n", r, a.regMaskComment(r))
s = s[:0]
for i, r := range v.reg.outputs {
if r != 0 {
s = append(s, intPair{countRegs(r), i})
}
}
if len(s) > 0 {
sort.Sort(byKey(s))
fmt.Fprintln(w, "outputs: []outputInfo{")
for _, p := range s {
r := v.reg.outputs[p.val]
fmt.Fprintf(w, "{%d,%d},%s\n", p.val, r, a.regMaskComment(r))
}
fmt.Fprintln(w, "},")
}
......
......@@ -359,7 +359,7 @@ func (v *Value) LongHTML() string {
}
r := v.Block.Func.RegAlloc
if int(v.ID) < len(r) && r[v.ID] != nil {
s += " : " + r[v.ID].Name()
s += " : " + html.EscapeString(r[v.ID].Name())
}
s += "</span>"
return s
......
......@@ -36,3 +36,16 @@ func (s LocalSlot) Name() string {
}
return fmt.Sprintf("%s+%d[%s]", s.N, s.Off, s.Type)
}
type LocPair [2]Location
func (t LocPair) Name() string {
n0, n1 := "nil", "nil"
if t[0] != nil {
n0 = t[0].Name()
}
if t[1] != nil {
n1 = t[1].Name()
}
return fmt.Sprintf("<%s,%s>", n0, n1)
}
......@@ -21,7 +21,7 @@ func checkLower(f *Func) {
continue // lowered
}
switch v.Op {
case OpSP, OpSB, OpInitMem, OpArg, OpPhi, OpVarDef, OpVarKill, OpVarLive, OpKeepAlive:
case OpSP, OpSB, OpInitMem, OpArg, OpPhi, OpVarDef, OpVarKill, OpVarLive, OpKeepAlive, OpSelect0, OpSelect1:
continue // ok not to lower
case OpGetG:
if f.Config.hasGReg {
......
......@@ -26,7 +26,7 @@ type opInfo struct {
generic bool // this is a generic (arch-independent) opcode
rematerializeable bool // this op is rematerializeable
commutative bool // this operation is commutative (e.g. addition)
resultInArg0 bool // v and v.Args[0] must be allocated to the same register
resultInArg0 bool // last output of v and v.Args[0] must be allocated to the same register
}
type inputInfo struct {
......@@ -34,10 +34,15 @@ type inputInfo struct {
regs regMask // allowed input registers
}
type outputInfo struct {
idx int // index in output tuple
regs regMask // allowed output registers
}
type regInfo struct {
inputs []inputInfo // ordered in register allocation order
clobbers regMask
outputs []regMask // NOTE: values can only have 1 output for now.
outputs []outputInfo // ordered in register allocation order
}
type auxType int8
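With outputInfo in place, a generated entry for a two-output op such as
ADDLcarry looks roughly like this (masks shown symbolically; the real
generator prints numeric regMask values):

	reg: regInfo{
		inputs: []inputInfo{...},
		outputs: []outputInfo{
			{0, flags}, // output 0: carry, pinned to the flags register
			{1, gp},    // output 1: sum, any general-purpose register
		},
	},

Outputs are sorted so masks with fewer registers come first, which is
the "register allocation order" the field comment refers to.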
......@@ -152,28 +157,3 @@ func MakeSizeAndAlign(size, align int64) SizeAndAlign {
}
return SizeAndAlign(size | align<<56)
}
func (op Op) isTupleGenerator() bool {
switch op {
case OpAdd32carry, OpSub32carry, OpMul32uhilo,
OpARMADDS, OpARMSUBS, OpARMMULLU,
OpARMADDSconst, OpARMSUBSconst, OpARMRSBSconst,
OpARMADDSshiftLL, OpARMSUBSshiftLL, OpARMRSBSshiftLL,
OpARMADDSshiftRL, OpARMSUBSshiftRL, OpARMRSBSshiftRL,
OpARMADDSshiftRA, OpARMSUBSshiftRA, OpARMRSBSshiftRA,
OpARMADDSshiftLLreg, OpARMSUBSshiftLLreg, OpARMRSBSshiftLLreg,
OpARMADDSshiftRLreg, OpARMSUBSshiftRLreg, OpARMRSBSshiftRLreg,
OpARMADDSshiftRAreg, OpARMSUBSshiftRAreg, OpARMRSBSshiftRAreg:
return true
}
return false
}
func (op Op) isTupleSelector() bool {
switch op {
case OpSelect0, OpSelect1,
OpARMLoweredSelect0, OpARMLoweredSelect1, OpARMCarry:
return true
}
return false
}
......@@ -333,10 +333,10 @@ func (s *regAllocState) assignReg(r register, v *Value, c *Value) {
s.f.setHome(c, &s.registers[r])
}
// allocReg chooses a register for v from the set of registers in mask.
// allocReg chooses a register from the set of registers in mask.
// If there is no unused register, a Value will be kicked out of
// a register to make room.
func (s *regAllocState) allocReg(v *Value, mask regMask) register {
func (s *regAllocState) allocReg(mask regMask) register {
mask &= s.allocatable
mask &^= s.nospill
if mask == 0 {
......@@ -401,7 +401,7 @@ func (s *regAllocState) allocValToReg(v *Value, mask regMask, nospill bool, line
}
// Allocate a register.
r := s.allocReg(v, mask)
r := s.allocReg(mask)
// Allocate v to the new register.
var c *Value
......@@ -471,7 +471,7 @@ func (s *regAllocState) init(f *Func) {
}
// Figure out which registers we're allowed to use.
s.allocatable = s.f.Config.gpRegMask | s.f.Config.fpRegMask | s.f.Config.flagRegMask
s.allocatable = s.f.Config.gpRegMask | s.f.Config.fpRegMask
s.allocatable &^= 1 << s.SPReg
s.allocatable &^= 1 << s.SBReg
if s.f.Config.hasGReg {
......@@ -499,11 +499,13 @@ func (s *regAllocState) init(f *Func) {
s.orig = make([]*Value, f.NumValues())
for _, b := range f.Blocks {
for _, v := range b.Values {
if !v.Type.IsMemory() && !v.Type.IsVoid() && !v.Type.IsFlags() {
if !v.Type.IsMemory() && !v.Type.IsVoid() && !v.Type.IsFlags() && !v.Type.IsTuple() {
s.values[v.ID].needReg = true
s.values[v.ID].rematerializeable = v.rematerializeable()
s.orig[v.ID] = v
}
// Note: needReg is false for values returning Tuple types.
// Instead, we mark the corresponding Selects as needReg.
}
}
s.computeLive()
......@@ -947,6 +949,7 @@ func (s *regAllocState) regalloc(f *Func) {
if s.f.pass.debug > regDebug {
fmt.Printf(" processing %s\n", v.LongString())
}
regspec := opcodeTable[v.Op].reg
if v.Op == OpPhi {
f.Fatalf("phi %s not at start of block", v)
}
......@@ -962,6 +965,18 @@ func (s *regAllocState) regalloc(f *Func) {
s.advanceUses(v)
continue
}
if v.Op == OpSelect0 || v.Op == OpSelect1 {
if s.values[v.ID].needReg {
var i = 0
if v.Op == OpSelect1 {
i = 1
}
s.assignReg(register(s.f.getHome(v.Args[0].ID).(LocPair)[i].(*Register).Num), v, v)
}
b.Values = append(b.Values, v)
s.advanceUses(v)
goto issueSpill
}
if v.Op == OpGetG && s.f.Config.hasGReg {
// use hardware g register
if s.regs[s.GReg].v != nil {
......@@ -970,17 +985,7 @@ func (s *regAllocState) regalloc(f *Func) {
s.assignReg(s.GReg, v, v)
b.Values = append(b.Values, v)
s.advanceUses(v)
// spill unconditionally, will be deleted if never used
spill := b.NewValue1(v.Line, OpStoreReg, v.Type, v)
s.setOrig(spill, v)
s.values[v.ID].spill = spill
s.values[v.ID].spillUsed = false
if loop != nil {
loop.spills = append(loop.spills, v)
nSpillsInner++
}
nSpills++
continue
goto issueSpill
}
if v.Op == OpArg {
// Args are "pre-spilled" values. We don't allocate
......@@ -1009,7 +1014,6 @@ func (s *regAllocState) regalloc(f *Func) {
b.Values = append(b.Values, v)
continue
}
regspec := opcodeTable[v.Op].reg
if len(regspec.inputs) == 0 && len(regspec.outputs) == 0 {
// No register allocation required (or none specified yet)
s.freeRegs(regspec.clobbers)
......@@ -1167,49 +1171,73 @@ func (s *regAllocState) regalloc(f *Func) {
// Dump any registers which will be clobbered
s.freeRegs(regspec.clobbers)
// Pick register for output.
if s.values[v.ID].needReg {
mask := regspec.outputs[0] & s.allocatable
if opcodeTable[v.Op].resultInArg0 {
if !opcodeTable[v.Op].commutative {
// Output must use the same register as input 0.
r := register(s.f.getHome(args[0].ID).(*Register).Num)
mask = regMask(1) << r
} else {
// Output must use the same register as input 0 or 1.
r0 := register(s.f.getHome(args[0].ID).(*Register).Num)
r1 := register(s.f.getHome(args[1].ID).(*Register).Num)
// Check r0 and r1 for desired output register.
found := false
for _, r := range dinfo[idx].out {
if (r == r0 || r == r1) && (mask&^s.used)>>r&1 != 0 {
mask = regMask(1) << r
found = true
if r == r1 {
args[0], args[1] = args[1], args[0]
// Pick registers for outputs.
{
outRegs := [2]register{noRegister, noRegister}
var used regMask
for _, out := range regspec.outputs {
mask := out.regs & s.allocatable &^ used
if mask == 0 {
continue
}
if opcodeTable[v.Op].resultInArg0 && out.idx == len(regspec.outputs)-1 {
if !opcodeTable[v.Op].commutative {
// Output must use the same register as input 0.
r := register(s.f.getHome(args[0].ID).(*Register).Num)
mask = regMask(1) << r
} else {
// Output must use the same register as input 0 or 1.
r0 := register(s.f.getHome(args[0].ID).(*Register).Num)
r1 := register(s.f.getHome(args[1].ID).(*Register).Num)
// Check r0 and r1 for desired output register.
found := false
for _, r := range dinfo[idx].out {
if (r == r0 || r == r1) && (mask&^s.used)>>r&1 != 0 {
mask = regMask(1) << r
found = true
if r == r1 {
args[0], args[1] = args[1], args[0]
}
break
}
break
}
if !found {
// Neither are desired, pick r0.
mask = regMask(1) << r0
}
}
if !found {
// Neither are desired, pick r0.
mask = regMask(1) << r0
}
for _, r := range dinfo[idx].out {
if r != noRegister && (mask&^s.used)>>r&1 != 0 {
// Desired register is allowed and unused.
mask = regMask(1) << r
break
}
}
}
for _, r := range dinfo[idx].out {
if r != noRegister && (mask&^s.used)>>r&1 != 0 {
// Desired register is allowed and unused.
mask = regMask(1) << r
break
// Avoid registers we're saving for other values.
if mask&^desired.avoid != 0 {
mask &^= desired.avoid
}
r := s.allocReg(mask)
outRegs[out.idx] = r
used |= regMask(1) << r
}
// Record register choices
if v.Type.IsTuple() {
var outLocs LocPair
if r := outRegs[0]; r != noRegister {
outLocs[0] = &s.registers[r]
}
if r := outRegs[1]; r != noRegister {
outLocs[1] = &s.registers[r]
}
s.f.setHome(v, outLocs)
// Note that subsequent SelectX instructions will do the assignReg calls.
} else {
if r := outRegs[0]; r != noRegister {
s.assignReg(r, v, v)
}
}
// Avoid registers we're saving for other values.
if mask&^desired.avoid != 0 {
mask &^= desired.avoid
}
r := s.allocReg(v, mask)
s.assignReg(r, v, v)
}
// Issue the Value itself.
......@@ -1228,6 +1256,7 @@ func (s *regAllocState) regalloc(f *Func) {
// f()
// }
// It would be good to have both spill and restore inside the IF.
issueSpill:
if s.values[v.ID].needReg {
spill := b.NewValue1(v.Line, OpStoreReg, v.Type, v)
s.setOrig(spill, v)
......@@ -1246,9 +1275,10 @@ func (s *regAllocState) regalloc(f *Func) {
if s.f.pass.debug > regDebug {
fmt.Printf(" processing control %s\n", v.LongString())
}
// TODO: regspec for block control values, instead of using
// register set from the control op's output.
s.allocValToReg(v, opcodeTable[v.Op].reg.outputs[0], false, b.Line)
// We assume that a control input can be passed in any
// type-compatible register. If this turns out not to be true,
// we'll need to introduce a regspec for a block's control value.
s.allocValToReg(v, s.compatRegs(v.Type), false, b.Line)
// Remove this use from the uses list.
vi := &s.values[v.ID]
u := vi.uses
......@@ -2065,6 +2095,8 @@ func (e *edgeState) findRegFor(typ Type) Location {
return nil
}
// rematerializeable reports whether the register allocator should recompute
// a value instead of spilling/restoring it.
func (v *Value) rematerializeable() bool {
if !opcodeTable[v.Op].rematerializeable {
return false
......
......@@ -22,6 +22,10 @@ func rewriteValue386(v *Value, config *Config) bool {
return rewriteValue386_OpAdd32(v, config)
case OpAdd32F:
return rewriteValue386_OpAdd32F(v, config)
case OpAdd32carry:
return rewriteValue386_OpAdd32carry(v, config)
case OpAdd32withcarry:
return rewriteValue386_OpAdd32withcarry(v, config)
case OpAdd64F:
return rewriteValue386_OpAdd64F(v, config)
case OpAdd8:
......@@ -1116,6 +1120,38 @@ func rewriteValue386_OpAdd32F(v *Value, config *Config) bool {
return true
}
}
func rewriteValue386_OpAdd32carry(v *Value, config *Config) bool {
b := v.Block
_ = b
// match: (Add32carry x y)
// cond:
// result: (ADDLcarry x y)
for {
x := v.Args[0]
y := v.Args[1]
v.reset(Op386ADDLcarry)
v.AddArg(x)
v.AddArg(y)
return true
}
}
func rewriteValue386_OpAdd32withcarry(v *Value, config *Config) bool {
b := v.Block
_ = b
// match: (Add32withcarry x y c)
// cond:
// result: (ADCL x y c)
for {
x := v.Args[0]
y := v.Args[1]
c := v.Args[2]
v.reset(Op386ADCL)
v.AddArg(x)
v.AddArg(y)
v.AddArg(c)
return true
}
}
func rewriteValue386_OpAdd64F(v *Value, config *Config) bool {
b := v.Block
_ = b
......
......@@ -654,10 +654,6 @@ func rewriteValueARM(v *Value, config *Config) bool {
return rewriteValueARM_OpARMSUBshiftRL(v, config)
case OpARMSUBshiftRLreg:
return rewriteValueARM_OpARMSUBshiftRLreg(v, config)
case OpSelect0:
return rewriteValueARM_OpSelect0(v, config)
case OpSelect1:
return rewriteValueARM_OpSelect1(v, config)
case OpSignExt16to32:
return rewriteValueARM_OpSignExt16to32(v, config)
case OpSignExt8to16:
......@@ -15554,50 +15550,6 @@ func rewriteValueARM_OpARMSUBshiftRLreg(v *Value, config *Config) bool {
}
return false
}
func rewriteValueARM_OpSelect0(v *Value, config *Config) bool {
b := v.Block
_ = b
// match: (Select0 <t> x)
// cond: t.IsFlags()
// result: (Carry x)
for {
t := v.Type
x := v.Args[0]
if !(t.IsFlags()) {
break
}
v.reset(OpARMCarry)
v.AddArg(x)
return true
}
// match: (Select0 <t> x)
// cond: !t.IsFlags()
// result: (LoweredSelect0 x)
for {
t := v.Type
x := v.Args[0]
if !(!t.IsFlags()) {
break
}
v.reset(OpARMLoweredSelect0)
v.AddArg(x)
return true
}
return false
}
func rewriteValueARM_OpSelect1(v *Value, config *Config) bool {
b := v.Block
_ = b
// match: (Select1 x)
// cond:
// result: (LoweredSelect1 x)
for {
x := v.Args[0]
v.reset(OpARMLoweredSelect1)
v.AddArg(x)
return true
}
}
func rewriteValueARM_OpSignExt16to32(v *Value, config *Config) bool {
b := v.Block
_ = b
......
......@@ -45,21 +45,6 @@ func (h ValHeap) Less(i, j int) bool {
if c := sx - sy; c != 0 {
return c > 0 // higher score comes later.
}
if sx == ScoreReadTuple {
// both are tuple-reading ops
// if they read same tuple, flag-reading op comes earlier
if x.Args[0] == y.Args[0] {
if x.Op == OpARMCarry || x.Op == OpARMLoweredSelect0 { //TODO: abstract this condition?
return false
} else {
return true
}
}
// if they read different tuples, order them as
// tuple-generating order to avoid interleaving
x = x.Args[0]
y = y.Args[0]
}
if x.Line != y.Line { // Favor in-order line stepping
return x.Line > y.Line
}
......@@ -119,7 +104,7 @@ func schedule(f *Func) {
// reduce register pressure. It also helps make sure
// VARDEF ops are scheduled before the corresponding LEA.
score[v.ID] = ScoreMemory
case v.Op == OpARMCarry || v.Op == OpARMLoweredSelect0 || v.Op == OpARMLoweredSelect1:
case v.Op == OpSelect0 || v.Op == OpSelect1:
// Schedule the pseudo-op of reading part of a tuple
// immediately after the tuple-generating op, since
// this value is already live. This also removes its
......@@ -226,12 +211,12 @@ func schedule(f *Func) {
// Do not emit tuple-reading ops until we're ready to emit the tuple-generating op.
//TODO: maybe remove ReadTuple score above, if it does not help on performance
switch {
case v.Op == OpARMCarry || v.Op == OpARMLoweredSelect0:
case v.Op == OpSelect0:
if tuples[v.Args[0].ID] == nil {
tuples[v.Args[0].ID] = make([]*Value, 2)
}
tuples[v.Args[0].ID][0] = v
case v.Op == OpARMLoweredSelect1:
case v.Op == OpSelect1:
if tuples[v.Args[0].ID] == nil {
tuples[v.Args[0].ID] = make([]*Value, 2)
}
......
......@@ -64,7 +64,7 @@ func tighten(f *Func) {
continue
default:
}
if v.Op.isTupleSelector() {
if v.Op == OpSelect0 || v.Op == OpSelect1 {
// tuple selector must stay with tuple generator
continue
}
......
......@@ -165,6 +165,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
p.To.Type = obj.TYPE_REG
p.To.Reg = r
}
// 2-address opcode arithmetic
case ssa.Op386SUBL,
ssa.Op386MULL,
......@@ -176,13 +177,22 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
ssa.Op386SARL, ssa.Op386SARW, ssa.Op386SARB,
ssa.Op386ADDSS, ssa.Op386ADDSD, ssa.Op386SUBSS, ssa.Op386SUBSD,
ssa.Op386MULSS, ssa.Op386MULSD, ssa.Op386DIVSS, ssa.Op386DIVSD,
ssa.Op386PXOR:
ssa.Op386PXOR,
ssa.Op386ADCL:
r := gc.SSARegNum(v)
if r != gc.SSARegNum(v.Args[0]) {
v.Fatalf("input[0] and output not in same register %s", v.LongString())
}
opregreg(v.Op.Asm(), r, gc.SSARegNum(v.Args[1]))
case ssa.Op386ADDLcarry:
// output 0 is carry, output 1 is the low 32 bits.
r := gc.SSARegNum1(v)
if r != gc.SSARegNum(v.Args[0]) {
v.Fatalf("input[0] and output[1] not in same register %s", v.LongString())
}
opregreg(v.Op.Asm(), r, gc.SSARegNum(v.Args[1]))
case ssa.Op386DIVL, ssa.Op386DIVW,
ssa.Op386DIVLU, ssa.Op386DIVWU,
ssa.Op386MODL, ssa.Op386MODW,
......@@ -716,7 +726,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
p.From.Reg = gc.SSARegNum(v.Args[0])
p.To.Type = obj.TYPE_REG
p.To.Reg = gc.SSARegNum(v)
case ssa.OpSP, ssa.OpSB:
case ssa.OpSP, ssa.OpSB, ssa.OpSelect0, ssa.OpSelect1:
// nothing to do
case ssa.Op386SETEQ, ssa.Op386SETNE,
ssa.Op386SETL, ssa.Op386SETLE,
......