cmd/compile: implement CMOV on amd64

This builds upon the branchelim pass, activating it for amd64 and lowering CondSelect. Special care is made to FPU instructions for NaN handling. Benchmark results on Xeon E5630 (Westmere EP): name old time/op new time/op delta BinaryTree17-16 4.99s ± 9% 4.66s ± 2% ~ (p=0.095 n=5+5) Fannkuch11-16 4.93s ± 3% 5.04s ± 2% ~ (p=0.548 n=5+5) FmtFprintfEmpty-16 58.8ns ± 7% 61.4ns ±14% ~ (p=0.579 n=5+5) FmtFprintfString-16 114ns ± 2% 114ns ± 4% ~ (p=0.603 n=5+5) FmtFprintfInt-16 181ns ± 4% 125ns ± 3% -30.90% (p=0.008 n=5+5) FmtFprintfIntInt-16 263ns ± 2% 217ns ± 2% -17.34% (p=0.008 n=5+5) FmtFprintfPrefixedInt-16 230ns ± 1% 212ns ± 1% -7.99% (p=0.008 n=5+5) FmtFprintfFloat-16 411ns ± 3% 344ns ± 5% -16.43% (p=0.008 n=5+5) FmtManyArgs-16 828ns ± 4% 790ns ± 2% -4.59% (p=0.032 n=5+5) GobDecode-16 10.9ms ± 4% 10.8ms ± 5% ~ (p=0.548 n=5+5) GobEncode-16 9.52ms ± 5% 9.46ms ± 2% ~ (p=1.000 n=5+5) Gzip-16 334ms ± 2% 337ms ± 2% ~ (p=0.548 n=5+5) Gunzip-16 64.4ms ± 1% 65.0ms ± 1% +1.00% (p=0.008 n=5+5) HTTPClientServer-16 156µs ± 3% 155µs ± 3% ~ (p=0.690 n=5+5) JSONEncode-16 21.0ms ± 1% 21.8ms ± 0% +3.76% (p=0.016 n=5+4) JSONDecode-16 95.1ms ± 0% 95.7ms ± 1% ~ (p=0.151 n=5+5) Mandelbrot200-16 6.38ms ± 1% 6.42ms ± 1% ~ (p=0.095 n=5+5) GoParse-16 5.47ms ± 2% 5.36ms ± 1% -1.95% (p=0.016 n=5+5) RegexpMatchEasy0_32-16 111ns ± 1% 111ns ± 1% ~ (p=0.635 n=5+4) RegexpMatchEasy0_1K-16 408ns ± 1% 411ns ± 2% ~ (p=0.087 n=5+5) RegexpMatchEasy1_32-16 103ns ± 1% 104ns ± 1% ~ (p=0.484 n=5+5) RegexpMatchEasy1_1K-16 659ns ± 2% 652ns ± 1% ~ (p=0.571 n=5+5) RegexpMatchMedium_32-16 176ns ± 2% 174ns ± 1% ~ (p=0.476 n=5+5) RegexpMatchMedium_1K-16 58.6µs ± 4% 57.7µs ± 4% ~ (p=0.548 n=5+5) RegexpMatchHard_32-16 3.07µs ± 3% 3.04µs ± 4% ~ (p=0.421 n=5+5) RegexpMatchHard_1K-16 89.2µs ± 1% 87.9µs ± 2% -1.52% (p=0.032 n=5+5) Revcomp-16 575ms ± 0% 587ms ± 2% +2.12% (p=0.032 n=4+5) Template-16 110ms ± 1% 107ms ± 3% -3.00% (p=0.032 n=5+5) TimeParse-16 463ns ± 0% 462ns ± 0% ~ (p=0.810 n=5+4) TimeFormat-16 538ns ± 0% 535ns ± 0% -0.63% (p=0.024 n=5+5) name old speed new speed delta GobDecode-16 70.7MB/s ± 4% 71.4MB/s ± 5% ~ (p=0.452 n=5+5) GobEncode-16 80.7MB/s ± 5% 81.2MB/s ± 2% ~ (p=1.000 n=5+5) Gzip-16 58.2MB/s ± 2% 57.7MB/s ± 2% ~ (p=0.452 n=5+5) Gunzip-16 302MB/s ± 1% 299MB/s ± 1% -0.99% (p=0.008 n=5+5) JSONEncode-16 92.4MB/s ± 1% 89.1MB/s ± 0% -3.63% (p=0.016 n=5+4) JSONDecode-16 20.4MB/s ± 0% 20.3MB/s ± 1% ~ (p=0.135 n=5+5) GoParse-16 10.6MB/s ± 2% 10.8MB/s ± 1% +2.00% (p=0.016 n=5+5) RegexpMatchEasy0_32-16 286MB/s ± 1% 285MB/s ± 3% ~ (p=1.000 n=5+5) RegexpMatchEasy0_1K-16 2.51GB/s ± 1% 2.49GB/s ± 2% ~ (p=0.095 n=5+5) RegexpMatchEasy1_32-16 309MB/s ± 1% 307MB/s ± 1% ~ (p=0.548 n=5+5) RegexpMatchEasy1_1K-16 1.55GB/s ± 2% 1.57GB/s ± 1% ~ (p=0.690 n=5+5) RegexpMatchMedium_32-16 5.68MB/s ± 2% 5.73MB/s ± 1% ~ (p=0.579 n=5+5) RegexpMatchMedium_1K-16 17.5MB/s ± 4% 17.8MB/s ± 4% ~ (p=0.500 n=5+5) RegexpMatchHard_32-16 10.4MB/s ± 3% 10.5MB/s ± 4% ~ (p=0.460 n=5+5) RegexpMatchHard_1K-16 11.5MB/s ± 1% 11.7MB/s ± 2% +1.57% (p=0.032 n=5+5) Revcomp-16 442MB/s ± 0% 433MB/s ± 2% -2.05% (p=0.032 n=4+5) Template-16 17.7MB/s ± 1% 18.2MB/s ± 3% +3.12% (p=0.032 n=5+5) Change-Id: I6972e8f35f2b31f9a42ac473a6bf419a18022558 Reviewed-on: https://go-review.googlesource.com/100935 Run-TryBot: Giovanni Bajo <rasky@develer.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org>

cmd/compile: implement CMOV on amd64
This builds upon the branchelim pass, activating it for amd64 and lowering CondSelect. Special care is made to FPU instructions for NaN handling. Benchmark results on Xeon E5630 (Westmere EP): name old time/op new time/op delta BinaryTree17-16 4.99s ± 9% 4.66s ± 2% ~ (p=0.095 n=5+5) Fannkuch11-16 4.93s ± 3% 5.04s ± 2% ~ (p=0.548 n=5+5) FmtFprintfEmpty-16 58.8ns ± 7% 61.4ns ±14% ~ (p=0.579 n=5+5) FmtFprintfString-16 114ns ± 2% 114ns ± 4% ~ (p=0.603 n=5+5) FmtFprintfInt-16 181ns ± 4% 125ns ± 3% -30.90% (p=0.008 n=5+5) FmtFprintfIntInt-16 263ns ± 2% 217ns ± 2% -17.34% (p=0.008 n=5+5) FmtFprintfPrefixedInt-16 230ns ± 1% 212ns ± 1% -7.99% (p=0.008 n=5+5) FmtFprintfFloat-16 411ns ± 3% 344ns ± 5% -16.43% (p=0.008 n=5+5) FmtManyArgs-16 828ns ± 4% 790ns ± 2% -4.59% (p=0.032 n=5+5) GobDecode-16 10.9ms ± 4% 10.8ms ± 5% ~ (p=0.548 n=5+5) GobEncode-16 9.52ms ± 5% 9.46ms ± 2% ~ (p=1.000 n=5+5) Gzip-16 334ms ± 2% 337ms ± 2% ~ (p=0.548 n=5+5) Gunzip-16 64.4ms ± 1% 65.0ms ± 1% +1.00% (p=0.008 n=5+5) HTTPClientServer-16 156µs ± 3% 155µs ± 3% ~ (p=0.690 n=5+5) JSONEncode-16 21.0ms ± 1% 21.8ms ± 0% +3.76% (p=0.016 n=5+4) JSONDecode-16 95.1ms ± 0% 95.7ms ± 1% ~ (p=0.151 n=5+5) Mandelbrot200-16 6.38ms ± 1% 6.42ms ± 1% ~ (p=0.095 n=5+5) GoParse-16 5.47ms ± 2% 5.36ms ± 1% -1.95% (p=0.016 n=5+5) RegexpMatchEasy0_32-16 111ns ± 1% 111ns ± 1% ~ (p=0.635 n=5+4) RegexpMatchEasy0_1K-16 408ns ± 1% 411ns ± 2% ~ (p=0.087 n=5+5) RegexpMatchEasy1_32-16 103ns ± 1% 104ns ± 1% ~ (p=0.484 n=5+5) RegexpMatchEasy1_1K-16 659ns ± 2% 652ns ± 1% ~ (p=0.571 n=5+5) RegexpMatchMedium_32-16 176ns ± 2% 174ns ± 1% ~ (p=0.476 n=5+5) RegexpMatchMedium_1K-16 58.6µs ± 4% 57.7µs ± 4% ~ (p=0.548 n=5+5) RegexpMatchHard_32-16 3.07µs ± 3% 3.04µs ± 4% ~ (p=0.421 n=5+5) RegexpMatchHard_1K-16 89.2µs ± 1% 87.9µs ± 2% -1.52% (p=0.032 n=5+5) Revcomp-16 575ms ± 0% 587ms ± 2% +2.12% (p=0.032 n=4+5) Template-16 110ms ± 1% 107ms ± 3% -3.00% (p=0.032 n=5+5) TimeParse-16 463ns ± 0% 462ns ± 0% ~ (p=0.810 n=5+4) TimeFormat-16 538ns ± 0% 535ns ± 0% -0.63% (p=0.024 n=5+5) name old speed new speed delta GobDecode-16 70.7MB/s ± 4% 71.4MB/s ± 5% ~ (p=0.452 n=5+5) GobEncode-16 80.7MB/s ± 5% 81.2MB/s ± 2% ~ (p=1.000 n=5+5) Gzip-16 58.2MB/s ± 2% 57.7MB/s ± 2% ~ (p=0.452 n=5+5) Gunzip-16 302MB/s ± 1% 299MB/s ± 1% -0.99% (p=0.008 n=5+5) JSONEncode-16 92.4MB/s ± 1% 89.1MB/s ± 0% -3.63% (p=0.016 n=5+4) JSONDecode-16 20.4MB/s ± 0% 20.3MB/s ± 1% ~ (p=0.135 n=5+5) GoParse-16 10.6MB/s ± 2% 10.8MB/s ± 1% +2.00% (p=0.016 n=5+5) RegexpMatchEasy0_32-16 286MB/s ± 1% 285MB/s ± 3% ~ (p=1.000 n=5+5) RegexpMatchEasy0_1K-16 2.51GB/s ± 1% 2.49GB/s ± 2% ~ (p=0.095 n=5+5) RegexpMatchEasy1_32-16 309MB/s ± 1% 307MB/s ± 1% ~ (p=0.548 n=5+5) RegexpMatchEasy1_1K-16 1.55GB/s ± 2% 1.57GB/s ± 1% ~ (p=0.690 n=5+5) RegexpMatchMedium_32-16 5.68MB/s ± 2% 5.73MB/s ± 1% ~ (p=0.579 n=5+5) RegexpMatchMedium_1K-16 17.5MB/s ± 4% 17.8MB/s ± 4% ~ (p=0.500 n=5+5) RegexpMatchHard_32-16 10.4MB/s ± 3% 10.5MB/s ± 4% ~ (p=0.460 n=5+5) RegexpMatchHard_1K-16 11.5MB/s ± 1% 11.7MB/s ± 2% +1.57% (p=0.032 n=5+5) Revcomp-16 442MB/s ± 0% 433MB/s ± 2% -2.05% (p=0.032 n=4+5) Template-16 17.7MB/s ± 1% 18.2MB/s ± 3% +3.12% (p=0.032 n=5+5) Change-Id: I6972e8f35f2b31f9a42ac473a6bf419a18022558 Reviewed-on: https://go-review.googlesource.com/100935 Run-TryBot: Giovanni Bajo <rasky@develer.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org>
a35ec9a5 · Giovanni Bajo · 42311108 · a35ec9a5 · a35ec9a5 · a35ec9a5
Commit a35ec9a5 authored Mar 05, 2018 by Giovanni Bajo
9 changed files
--- a/src/cmd/compile/internal/amd64/ssa.go
+++ b/src/cmd/compile/internal/amd64/ssa.go
@@ -398,7 +398,18 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 		p.To.Type = obj.TYPE_REG
 		p.To.Reg = r

-	case ssa.OpAMD64CMOVQEQ, ssa.OpAMD64CMOVLEQ:
+	case ssa.OpAMD64CMOVQEQ, ssa.OpAMD64CMOVLEQ, ssa.OpAMD64CMOVWEQ,
+		ssa.OpAMD64CMOVQLT, ssa.OpAMD64CMOVLLT, ssa.OpAMD64CMOVWLT,
+		ssa.OpAMD64CMOVQNE, ssa.OpAMD64CMOVLNE, ssa.OpAMD64CMOVWNE,
+		ssa.OpAMD64CMOVQGT, ssa.OpAMD64CMOVLGT, ssa.OpAMD64CMOVWGT,
+		ssa.OpAMD64CMOVQLE, ssa.OpAMD64CMOVLLE, ssa.OpAMD64CMOVWLE,
+		ssa.OpAMD64CMOVQGE, ssa.OpAMD64CMOVLGE, ssa.OpAMD64CMOVWGE,
+		ssa.OpAMD64CMOVQHI, ssa.OpAMD64CMOVLHI, ssa.OpAMD64CMOVWHI,
+		ssa.OpAMD64CMOVQLS, ssa.OpAMD64CMOVLLS, ssa.OpAMD64CMOVWLS,
+		ssa.OpAMD64CMOVQCC, ssa.OpAMD64CMOVLCC, ssa.OpAMD64CMOVWCC,
+		ssa.OpAMD64CMOVQCS, ssa.OpAMD64CMOVLCS, ssa.OpAMD64CMOVWCS,
+		ssa.OpAMD64CMOVQGTF, ssa.OpAMD64CMOVLGTF, ssa.OpAMD64CMOVWGTF,
+		ssa.OpAMD64CMOVQGEF, ssa.OpAMD64CMOVLGEF, ssa.OpAMD64CMOVWGEF:
 		r := v.Reg()
 		if r != v.Args[0].Reg() {
 			v.Fatalf("input[0] and output not in same register %s", v.LongString())
@@ -409,6 +420,71 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 		p.To.Type = obj.TYPE_REG
 		p.To.Reg = r

+	case ssa.OpAMD64CMOVQNEF, ssa.OpAMD64CMOVLNEF, ssa.OpAMD64CMOVWNEF:
+		r := v.Reg()
+		if r != v.Args[0].Reg() {
+			v.Fatalf("input[0] and output not in same register %s", v.LongString())
+		}
+		// Flag condition: ^ZERO || PARITY
+		// Generate:
+		//   CMOV*NE  SRC,DST
+		//   CMOV*PS  SRC,DST
+		p := s.Prog(v.Op.Asm())
+		p.From.Type = obj.TYPE_REG
+		p.From.Reg = v.Args[1].Reg()
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = r
+		var q *obj.Prog
+		if v.Op == ssa.OpAMD64CMOVQNEF {
+			q = s.Prog(x86.ACMOVQPS)
+		} else if v.Op == ssa.OpAMD64CMOVLNEF {
+			q = s.Prog(x86.ACMOVLPS)
+		} else {
+			q = s.Prog(x86.ACMOVWPS)
+		}
+		q.From.Type = obj.TYPE_REG
+		q.From.Reg = v.Args[1].Reg()
+		q.To.Type = obj.TYPE_REG
+		q.To.Reg = r
+
+	case ssa.OpAMD64CMOVQEQF, ssa.OpAMD64CMOVLEQF, ssa.OpAMD64CMOVWEQF:
+		r := v.Reg()
+		if r != v.Args[0].Reg() {
+			v.Fatalf("input[0] and output not in same register %s", v.LongString())
+		}
+
+		// Flag condition: ZERO && !PARITY
+		// Generate:
+		//   MOV      SRC,AX
+		//   CMOV*NE  DST,AX
+		//   CMOV*PC  AX,DST
+		//
+		// TODO(rasky): we could generate:
+		//   CMOV*NE  DST,SRC
+		//   CMOV*PC  SRC,DST
+		// But this requires a way for regalloc to know that SRC might be
+		// clobbered by this instruction.
+		if v.Args[1].Reg() != x86.REG_AX {
+			opregreg(s, moveByType(v.Type), x86.REG_AX, v.Args[1].Reg())
+		}
+		p := s.Prog(v.Op.Asm())
+		p.From.Type = obj.TYPE_REG
+		p.From.Reg = r
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = x86.REG_AX
+		var q *obj.Prog
+		if v.Op == ssa.OpAMD64CMOVQEQF {
+			q = s.Prog(x86.ACMOVQPC)
+		} else if v.Op == ssa.OpAMD64CMOVLEQF {
+			q = s.Prog(x86.ACMOVLPC)
+		} else {
+			q = s.Prog(x86.ACMOVWPC)
+		}
+		q.From.Type = obj.TYPE_REG
+		q.From.Reg = x86.REG_AX
+		q.To.Type = obj.TYPE_REG
+		q.To.Reg = r
+
 	case ssa.OpAMD64MULQconst, ssa.OpAMD64MULLconst:
 		r := v.Reg()
 		p := s.Prog(v.Op.Asm())

--- a/src/cmd/compile/internal/ssa/branchelim.go
+++ b/src/cmd/compile/internal/ssa/branchelim.go
@@ -19,7 +19,10 @@ package ssa
 // rewrite Phis in the postdominator as CondSelects.
 func branchelim(f *Func) {
 	// FIXME: add support for lowering CondSelects on more architectures
-	if f.Config.arch != "arm64" {
+	switch f.Config.arch {
+	case "arm64", "amd64":
+		// implemented
+	default:
 		return
 	}

@@ -32,10 +35,22 @@ func branchelim(f *Func) {
 	}
 }

-func canCondSelect(v *Value) bool {
+func canCondSelect(v *Value, arch string) bool {
 	// For now, stick to simple scalars that fit in registers
-	sz := v.Type.Size()
-	return sz <= v.Block.Func.Config.RegSize && (v.Type.IsInteger() || v.Type.IsPtrShaped())
+	switch {
+	case v.Type.Size() > v.Block.Func.Config.RegSize:
+		return false
+	case v.Type.IsPtrShaped():
+		return true
+	case v.Type.IsInteger():
+		if arch == "amd64" && v.Type.Size() < 2 {
+			// amd64 doesn't support CMOV with byte registers
+			return false
+		}
+		return true
+	default:
+		return false
+	}
 }

 func elimIf(f *Func, dom *Block) bool {
@@ -68,7 +83,7 @@ func elimIf(f *Func, dom *Block) bool {
 	for _, v := range post.Values {
 		if v.Op == OpPhi {
 			hasphis = true
-			if !canCondSelect(v) {
+			if !canCondSelect(v, f.Config.arch) {
 				return false
 			}
 		}
@@ -169,7 +184,7 @@ func elimIfElse(f *Func, b *Block) bool {
 	for _, v := range post.Values {
 		if v.Op == OpPhi {
 			hasphis = true
-			if !canCondSelect(v) {
+			if !canCondSelect(v, f.Config.arch) {
 				return false
 			}
 		}

--- a/src/cmd/compile/internal/ssa/branchelim_test.go
+++ b/src/cmd/compile/internal/ssa/branchelim_test.go
@@ -11,128 +11,162 @@ import (

 // Test that a trivial 'if' is eliminated
 func TestBranchElimIf(t *testing.T) {
-	c := testConfig(t)
-	c.config.arch = "arm64" // FIXME
-	boolType := types.New(types.TBOOL)
-	intType := types.New(types.TINT32)
-	fun := c.Fun("entry",
-		Bloc("entry",
-			Valu("start", OpInitMem, types.TypeMem, 0, nil),
-			Valu("sb", OpSB, types.TypeInvalid, 0, nil),
-			Valu("const1", OpConst32, intType, 1, nil),
-			Valu("const2", OpConst32, intType, 2, nil),
-			Valu("addr", OpAddr, boolType.PtrTo(), 0, nil, "sb"),
-			Valu("cond", OpLoad, boolType, 0, nil, "addr", "start"),
-			If("cond", "b2", "b3")),
-		Bloc("b2",
-			Goto("b3")),
-		Bloc("b3",
-			Valu("phi", OpPhi, intType, 0, nil, "const1", "const2"),
-			Valu("retstore", OpStore, types.TypeMem, 0, nil, "phi", "sb", "start"),
-			Exit("retstore")))
+	var testData = []struct {
+		arch    string
+		intType string
+		ok      bool
+	}{
+		{"arm64", "int32", true},
+		{"amd64", "int32", true},
+		{"amd64", "int8", false},
+	}

-	CheckFunc(fun.f)
-	branchelim(fun.f)
-	CheckFunc(fun.f)
-	Deadcode(fun.f)
-	CheckFunc(fun.f)
+	for _, data := range testData {
+		t.Run(data.arch+"/"+data.intType, func(t *testing.T) {
+			c := testConfigArch(t, data.arch)
+			boolType := c.config.Types.Bool
+			var intType *types.Type
+			switch data.intType {
+			case "int32":
+				intType = c.config.Types.Int32
+			case "int8":
+				intType = c.config.Types.Int8
+			default:
+				t.Fatal("invalid integer type:", data.intType)
+			}
+			fun := c.Fun("entry",
+				Bloc("entry",
+					Valu("start", OpInitMem, types.TypeMem, 0, nil),
+					Valu("sb", OpSB, types.TypeInvalid, 0, nil),
+					Valu("const1", OpConst32, intType, 1, nil),
+					Valu("const2", OpConst32, intType, 2, nil),
+					Valu("addr", OpAddr, boolType.PtrTo(), 0, nil, "sb"),
+					Valu("cond", OpLoad, boolType, 0, nil, "addr", "start"),
+					If("cond", "b2", "b3")),
+				Bloc("b2",
+					Goto("b3")),
+				Bloc("b3",
+					Valu("phi", OpPhi, intType, 0, nil, "const1", "const2"),
+					Valu("retstore", OpStore, types.TypeMem, 0, nil, "phi", "sb", "start"),
+					Exit("retstore")))

-	if len(fun.f.Blocks) != 1 {
-		t.Errorf("expected 1 block after branchelim and deadcode; found %d", len(fun.f.Blocks))
-	}
-	if fun.values["phi"].Op != OpCondSelect {
-		t.Errorf("expected phi op to be CondSelect; found op %s", fun.values["phi"].Op)
-	}
-	if fun.values["phi"].Args[2] != fun.values["cond"] {
-		t.Errorf("expected CondSelect condition to be %s; found %s", fun.values["cond"], fun.values["phi"].Args[2])
-	}
-	if fun.blocks["entry"].Kind != BlockExit {
-		t.Errorf("expected entry to be BlockExit; found kind %s", fun.blocks["entry"].Kind.String())
+			CheckFunc(fun.f)
+			branchelim(fun.f)
+			CheckFunc(fun.f)
+			Deadcode(fun.f)
+			CheckFunc(fun.f)
+
+			if data.ok {
+
+				if len(fun.f.Blocks) != 1 {
+					t.Fatalf("expected 1 block after branchelim and deadcode; found %d", len(fun.f.Blocks))
+				}
+				if fun.values["phi"].Op != OpCondSelect {
+					t.Fatalf("expected phi op to be CondSelect; found op %s", fun.values["phi"].Op)
+				}
+				if fun.values["phi"].Args[2] != fun.values["cond"] {
+					t.Errorf("expected CondSelect condition to be %s; found %s", fun.values["cond"], fun.values["phi"].Args[2])
+				}
+				if fun.blocks["entry"].Kind != BlockExit {
+					t.Errorf("expected entry to be BlockExit; found kind %s", fun.blocks["entry"].Kind.String())
+				}
+			} else {
+				if len(fun.f.Blocks) != 3 {
+					t.Fatalf("expected 3 block after branchelim and deadcode; found %d", len(fun.f.Blocks))
+				}
+			}
+		})
 	}
 }

 // Test that a trivial if/else is eliminated
 func TestBranchElimIfElse(t *testing.T) {
-	c := testConfig(t)
-	c.config.arch = "arm64" // FIXME
-	boolType := types.New(types.TBOOL)
-	intType := types.New(types.TINT32)
-	fun := c.Fun("entry",
-		Bloc("entry",
-			Valu("start", OpInitMem, types.TypeMem, 0, nil),
-			Valu("sb", OpSB, types.TypeInvalid, 0, nil),
-			Valu("const1", OpConst32, intType, 1, nil),
-			Valu("const2", OpConst32, intType, 2, nil),
-			Valu("addr", OpAddr, boolType.PtrTo(), 0, nil, "sb"),
-			Valu("cond", OpLoad, boolType, 0, nil, "addr", "start"),
-			If("cond", "b2", "b3")),
-		Bloc("b2",
-			Goto("b4")),
-		Bloc("b3",
-			Goto("b4")),
-		Bloc("b4",
-			Valu("phi", OpPhi, intType, 0, nil, "const1", "const2"),
-			Valu("retstore", OpStore, types.TypeMem, 0, nil, "phi", "sb", "start"),
-			Exit("retstore")))
+	for _, arch := range []string{"arm64", "amd64"} {
+		t.Run(arch, func(t *testing.T) {
+			c := testConfigArch(t, arch)
+			boolType := c.config.Types.Bool
+			intType := c.config.Types.Int32
+			fun := c.Fun("entry",
+				Bloc("entry",
+					Valu("start", OpInitMem, types.TypeMem, 0, nil),
+					Valu("sb", OpSB, types.TypeInvalid, 0, nil),
+					Valu("const1", OpConst32, intType, 1, nil),
+					Valu("const2", OpConst32, intType, 2, nil),
+					Valu("addr", OpAddr, boolType.PtrTo(), 0, nil, "sb"),
+					Valu("cond", OpLoad, boolType, 0, nil, "addr", "start"),
+					If("cond", "b2", "b3")),
+				Bloc("b2",
+					Goto("b4")),
+				Bloc("b3",
+					Goto("b4")),
+				Bloc("b4",
+					Valu("phi", OpPhi, intType, 0, nil, "const1", "const2"),
+					Valu("retstore", OpStore, types.TypeMem, 0, nil, "phi", "sb", "start"),
+					Exit("retstore")))

-	CheckFunc(fun.f)
-	branchelim(fun.f)
-	CheckFunc(fun.f)
-	Deadcode(fun.f)
-	CheckFunc(fun.f)
+			CheckFunc(fun.f)
+			branchelim(fun.f)
+			CheckFunc(fun.f)
+			Deadcode(fun.f)
+			CheckFunc(fun.f)

-	if len(fun.f.Blocks) != 1 {
-		t.Errorf("expected 1 block after branchelim; found %d", len(fun.f.Blocks))
-	}
-	if fun.values["phi"].Op != OpCondSelect {
-		t.Errorf("expected phi op to be CondSelect; found op %s", fun.values["phi"].Op)
-	}
-	if fun.values["phi"].Args[2] != fun.values["cond"] {
-		t.Errorf("expected CondSelect condition to be %s; found %s", fun.values["cond"], fun.values["phi"].Args[2])
-	}
-	if fun.blocks["entry"].Kind != BlockExit {
-		t.Errorf("expected entry to be BlockExit; found kind %s", fun.blocks["entry"].Kind.String())
+			if len(fun.f.Blocks) != 1 {
+				t.Fatalf("expected 1 block after branchelim; found %d", len(fun.f.Blocks))
+			}
+			if fun.values["phi"].Op != OpCondSelect {
+				t.Fatalf("expected phi op to be CondSelect; found op %s", fun.values["phi"].Op)
+			}
+			if fun.values["phi"].Args[2] != fun.values["cond"] {
+				t.Errorf("expected CondSelect condition to be %s; found %s", fun.values["cond"], fun.values["phi"].Args[2])
+			}
+			if fun.blocks["entry"].Kind != BlockExit {
+				t.Errorf("expected entry to be BlockExit; found kind %s", fun.blocks["entry"].Kind.String())
+			}
+		})
 	}
 }

 // Test that an if/else CFG that loops back
 // into itself does *not* get eliminated.
 func TestNoBranchElimLoop(t *testing.T) {
-	c := testConfig(t)
-	c.config.arch = "arm64" // FIXME
-	boolType := types.New(types.TBOOL)
-	intType := types.New(types.TINT32)
+	for _, arch := range []string{"arm64", "amd64"} {
+		t.Run(arch, func(t *testing.T) {
+			c := testConfigArch(t, arch)
+			boolType := c.config.Types.Bool
+			intType := c.config.Types.Int32

-	// The control flow here is totally bogus,
-	// but a dead cycle seems like the only plausible
-	// way to arrive at a diamond CFG that is also a loop.
-	fun := c.Fun("entry",
-		Bloc("entry",
-			Valu("start", OpInitMem, types.TypeMem, 0, nil),
-			Valu("sb", OpSB, types.TypeInvalid, 0, nil),
-			Valu("const2", OpConst32, intType, 2, nil),
-			Valu("const3", OpConst32, intType, 3, nil),
-			Goto("b5")),
-		Bloc("b2",
-			Valu("addr", OpAddr, boolType.PtrTo(), 0, nil, "sb"),
-			Valu("cond", OpLoad, boolType, 0, nil, "addr", "start"),
-			Valu("phi", OpPhi, intType, 0, nil, "const2", "const3"),
-			If("cond", "b3", "b4")),
-		Bloc("b3",
-			Goto("b2")),
-		Bloc("b4",
-			Goto("b2")),
-		Bloc("b5",
-			Exit("start")))
+			// The control flow here is totally bogus,
+			// but a dead cycle seems like the only plausible
+			// way to arrive at a diamond CFG that is also a loop.
+			fun := c.Fun("entry",
+				Bloc("entry",
+					Valu("start", OpInitMem, types.TypeMem, 0, nil),
+					Valu("sb", OpSB, types.TypeInvalid, 0, nil),
+					Valu("const2", OpConst32, intType, 2, nil),
+					Valu("const3", OpConst32, intType, 3, nil),
+					Goto("b5")),
+				Bloc("b2",
+					Valu("addr", OpAddr, boolType.PtrTo(), 0, nil, "sb"),
+					Valu("cond", OpLoad, boolType, 0, nil, "addr", "start"),
+					Valu("phi", OpPhi, intType, 0, nil, "const2", "const3"),
+					If("cond", "b3", "b4")),
+				Bloc("b3",
+					Goto("b2")),
+				Bloc("b4",
+					Goto("b2")),
+				Bloc("b5",
+					Exit("start")))

-	CheckFunc(fun.f)
-	branchelim(fun.f)
-	CheckFunc(fun.f)
+			CheckFunc(fun.f)
+			branchelim(fun.f)
+			CheckFunc(fun.f)

-	if len(fun.f.Blocks) != 5 {
-		t.Errorf("expected 5 block after branchelim; found %d", len(fun.f.Blocks))
-	}
-	if fun.values["phi"].Op != OpPhi {
-		t.Errorf("expected phi op to be CondSelect; found op %s", fun.values["phi"].Op)
+			if len(fun.f.Blocks) != 5 {
+				t.Errorf("expected 5 block after branchelim; found %d", len(fun.f.Blocks))
+			}
+			if fun.values["phi"].Op != OpPhi {
+				t.Errorf("expected phi op to be CondSelect; found op %s", fun.values["phi"].Op)
+			}
+		})
 	}
 }
--- a/src/cmd/compile/internal/ssa/export_test.go
+++ b/src/cmd/compile/internal/ssa/export_test.go
@@ -7,6 +7,7 @@ package ssa
 import (
 	"cmd/compile/internal/types"
 	"cmd/internal/obj"
+	"cmd/internal/obj/arm64"
 	"cmd/internal/obj/s390x"
 	"cmd/internal/obj/x86"
 	"cmd/internal/src"
@@ -22,6 +23,7 @@ var Copyelim = copyelim
 var testCtxts = map[string]*obj.Link{
 	"amd64": obj.Linknew(&x86.Linkamd64),
 	"s390x": obj.Linknew(&s390x.Links390x),
+	"arm64": obj.Linknew(&arm64.Linkarm64),
 }

 func testConfig(tb testing.TB) *Conf      { return testConfigArch(tb, "amd64") }

--- a/src/cmd/compile/internal/ssa/gen/AMD64.rules
+++ b/src/cmd/compile/internal/ssa/gen/AMD64.rules
@@ -475,6 +475,52 @@
 (ClosureCall [argwid] entry closure mem) -> (CALLclosure [argwid] entry closure mem)
 (InterCall [argwid] entry mem) -> (CALLinter [argwid] entry mem)

+// Lowering conditional moves
+// If the condition is a SETxx, we can just run a CMOV from the comparison that was
+// setting the flags.
+// Legend: HI=unsigned ABOVE, CS=unsigned BELOW, CC=unsigned ABOVE EQUAL, LS=unsigned BELOW EQUAL
+(CondSelect <t> x y (SET(EQ|NE|L|G|LE|GE|A|B|AE|BE|EQF|NEF|GF|GEF) cond)) && (is64BitInt(t) || isPtr(t))
+    -> (CMOVQ(EQ|NE|LT|GT|LE|GE|HI|CS|CC|LS|EQF|NEF|GTF|GEF) y x cond)
+(CondSelect <t> x y (SET(EQ|NE|L|G|LE|GE|A|B|AE|BE|EQF|NEF|GF|GEF) cond)) && is32BitInt(t)
+    -> (CMOVL(EQ|NE|LT|GT|LE|GE|HI|CS|CC|LS|EQF|NEF|GTF|GEF) y x cond)
+(CondSelect <t> x y (SET(EQ|NE|L|G|LE|GE|A|B|AE|BE|EQF|NEF|GF|GEF) cond)) && is16BitInt(t)
+    -> (CMOVW(EQ|NE|LT|GT|LE|GE|HI|CS|CC|LS|EQF|NEF|GTF|GEF) y x cond)
+
+// If the condition does not set the flags, we need to generate a comparison.
+(CondSelect <t> x y check) && !check.Type.IsFlags() && check.Type.Size() == 1
+    -> (CondSelect <t> x y (MOVBQZX <typ.UInt64> check))
+(CondSelect <t> x y check) && !check.Type.IsFlags() && check.Type.Size() == 2
+    -> (CondSelect <t> x y (MOVWQZX <typ.UInt64> check))
+(CondSelect <t> x y check) && !check.Type.IsFlags() && check.Type.Size() == 4
+    -> (CondSelect <t> x y (MOVLQZX <typ.UInt64> check))
+
+(CondSelect <t> x y check) && !check.Type.IsFlags() && check.Type.Size() == 8 && (is64BitInt(t) || isPtr(t))
+    -> (CMOVQNE y x (CMPQconst [0] check))
+(CondSelect <t> x y check) && !check.Type.IsFlags() && check.Type.Size() == 8 && is32BitInt(t)
+    -> (CMOVLNE y x (CMPQconst [0] check))
+(CondSelect <t> x y check) && !check.Type.IsFlags() && check.Type.Size() == 8 && is16BitInt(t)
+    -> (CMOVWNE y x (CMPQconst [0] check))
+
+// Absorb InvertFlags
+(CMOVQ(EQ|NE|LT|GT|LE|GE|HI|CS|CC|LS) x y (InvertFlags cond))
+    -> (CMOVQ(EQ|NE|GT|LT|GE|LE|CS|HI|LS|CC) x y cond)
+(CMOVL(EQ|NE|LT|GT|LE|GE|HI|CS|CC|LS) x y (InvertFlags cond))
+    -> (CMOVL(EQ|NE|GT|LT|GE|LE|CS|HI|LS|CC) x y cond)
+(CMOVW(EQ|NE|LT|GT|LE|GE|HI|CS|CC|LS) x y (InvertFlags cond))
+    -> (CMOVW(EQ|NE|GT|LT|GE|LE|CS|HI|LS|CC) x y cond)
+
+// Absorb constants generated during lower
+(CMOV(QEQ|QLE|QGE|QCC|QLS|LEQ|LLE|LGE|LCC|LLS|WEQ|WLE|WGE|WCC|WLS) _ x (FlagEQ)) -> x
+(CMOV(QNE|QLT|QGT|QCS|QHI|LNE|LLT|LGT|LCS|LHI|WNE|WLT|WGT|WCS|WHI) y _ (FlagEQ)) -> y
+(CMOV(QNE|QGT|QGE|QHI|QCC|LNE|LGT|LGE|LHI|LCC|WNE|WGT|WGE|WHI|WCC) _ x (FlagGT_UGT)) -> x
+(CMOV(QEQ|QLE|QLT|QLS|QCS|LEQ|LLE|LLT|LLS|LCS|WEQ|WLE|WLT|WLS|WCS) y _ (FlagGT_UGT)) -> y
+(CMOV(QNE|QGT|QGE|QLS|QCS|LNE|LGT|LGE|LLS|LCS|WNE|WGT|WGE|WLS|WCS) _ x (FlagGT_ULT)) -> x
+(CMOV(QEQ|QLE|QLT|QHI|QCC|LEQ|LLE|LLT|LHI|LCC|WEQ|WLE|WLT|WHI|WCC) y _ (FlagGT_ULT)) -> y
+(CMOV(QNE|QLT|QLE|QCS|QLS|LNE|LLT|LLE|LCS|LLS|WNE|WLT|WLE|WCS|WLS) _ x (FlagLT_ULT)) -> x
+(CMOV(QEQ|QGT|QGE|QHI|QCC|LEQ|LGT|LGE|LHI|LCC|WEQ|WGT|WGE|WHI|WCC) y _ (FlagLT_ULT)) -> y
+(CMOV(QNE|QLT|QLE|QHI|QCC|LNE|LLT|LLE|LHI|LCC|WNE|WLT|WLE|WHI|WCC) _ x (FlagLT_UGT)) -> x
+(CMOV(QEQ|QGT|QGE|QCS|QLS|LEQ|LGT|LGE|LCS|LLS|WEQ|WGT|WGE|WCS|WLS) y _ (FlagLT_UGT)) -> y
+
 // Miscellaneous
 (Convert <t> x mem) && config.PtrSize == 8 -> (MOVQconvert <t> x mem)
 (Convert <t> x mem) && config.PtrSize == 4 -> (MOVLconvert <t> x mem)
@@ -1350,6 +1396,10 @@
 (CMPLconst x [0]) -> (TESTL x x)
 (CMPWconst x [0]) -> (TESTW x x)
 (CMPBconst x [0]) -> (TESTB x x)
+(TESTQconst [-1] x) -> (TESTQ x x)
+(TESTLconst [-1] x) -> (TESTL x x)
+(TESTWconst [-1] x) -> (TESTW x x)
+(TESTBconst [-1] x) -> (TESTB x x)

 // Combining byte loads into larger (unaligned) loads.
 // There are many ways these combinations could occur.  This is

--- a/src/cmd/compile/internal/ssa/gen/AMD64Ops.go
+++ b/src/cmd/compile/internal/ssa/gen/AMD64Ops.go
@@ -132,6 +132,7 @@ func init() {
 		gpload    = regInfo{inputs: []regMask{gpspsb, 0}, outputs: gponly}
 		gp21load  = regInfo{inputs: []regMask{gp, gpspsb, 0}, outputs: gponly}
 		gploadidx = regInfo{inputs: []regMask{gpspsb, gpsp, 0}, outputs: gponly}
+		gp21pax   = regInfo{inputs: []regMask{gp &^ ax, gp}, outputs: []regMask{gp &^ ax}, clobbers: ax}

 		gpstore         = regInfo{inputs: []regMask{gpspsb, gpsp, 0}}
 		gpstoreconst    = regInfo{inputs: []regMask{gpspsb, 0}}
@@ -340,10 +341,57 @@ func init() {
 		{name: "BSRQ", argLength: 1, reg: gp11flags, asm: "BSRQ", typ: "(UInt64,Flags)"}, // # of high-order zeroes in 64-bit arg
 		{name: "BSRL", argLength: 1, reg: gp11flags, asm: "BSRL", typ: "(UInt32,Flags)"}, // # of high-order zeroes in 32-bit arg

-		// Note ASM for ops moves whole register
-		//
-		{name: "CMOVQEQ", argLength: 3, reg: gp21, asm: "CMOVQEQ", resultInArg0: true}, // if arg2 encodes "equal" return arg1 else arg0
-		{name: "CMOVLEQ", argLength: 3, reg: gp21, asm: "CMOVLEQ", resultInArg0: true}, // if arg2 encodes "equal" return arg1 else arg0
+		// CMOV instructions: 64, 32 and 16-bit sizes.
+		// if arg2 encodes a true result, return arg1, else arg0
+		{name: "CMOVQEQ", argLength: 3, reg: gp21, asm: "CMOVQEQ", resultInArg0: true},
+		{name: "CMOVQNE", argLength: 3, reg: gp21, asm: "CMOVQNE", resultInArg0: true},
+		{name: "CMOVQLT", argLength: 3, reg: gp21, asm: "CMOVQLT", resultInArg0: true},
+		{name: "CMOVQGT", argLength: 3, reg: gp21, asm: "CMOVQGT", resultInArg0: true},
+		{name: "CMOVQLE", argLength: 3, reg: gp21, asm: "CMOVQLE", resultInArg0: true},
+		{name: "CMOVQGE", argLength: 3, reg: gp21, asm: "CMOVQGE", resultInArg0: true},
+		{name: "CMOVQLS", argLength: 3, reg: gp21, asm: "CMOVQLS", resultInArg0: true},
+		{name: "CMOVQHI", argLength: 3, reg: gp21, asm: "CMOVQHI", resultInArg0: true},
+		{name: "CMOVQCC", argLength: 3, reg: gp21, asm: "CMOVQCC", resultInArg0: true},
+		{name: "CMOVQCS", argLength: 3, reg: gp21, asm: "CMOVQCS", resultInArg0: true},
+
+		{name: "CMOVLEQ", argLength: 3, reg: gp21, asm: "CMOVLEQ", resultInArg0: true},
+		{name: "CMOVLNE", argLength: 3, reg: gp21, asm: "CMOVLNE", resultInArg0: true},
+		{name: "CMOVLLT", argLength: 3, reg: gp21, asm: "CMOVLLT", resultInArg0: true},
+		{name: "CMOVLGT", argLength: 3, reg: gp21, asm: "CMOVLGT", resultInArg0: true},
+		{name: "CMOVLLE", argLength: 3, reg: gp21, asm: "CMOVLLE", resultInArg0: true},
+		{name: "CMOVLGE", argLength: 3, reg: gp21, asm: "CMOVLGE", resultInArg0: true},
+		{name: "CMOVLLS", argLength: 3, reg: gp21, asm: "CMOVLLS", resultInArg0: true},
+		{name: "CMOVLHI", argLength: 3, reg: gp21, asm: "CMOVLHI", resultInArg0: true},
+		{name: "CMOVLCC", argLength: 3, reg: gp21, asm: "CMOVLCC", resultInArg0: true},
+		{name: "CMOVLCS", argLength: 3, reg: gp21, asm: "CMOVLCS", resultInArg0: true},
+
+		{name: "CMOVWEQ", argLength: 3, reg: gp21, asm: "CMOVWEQ", resultInArg0: true},
+		{name: "CMOVWNE", argLength: 3, reg: gp21, asm: "CMOVWNE", resultInArg0: true},
+		{name: "CMOVWLT", argLength: 3, reg: gp21, asm: "CMOVWLT", resultInArg0: true},
+		{name: "CMOVWGT", argLength: 3, reg: gp21, asm: "CMOVWGT", resultInArg0: true},
+		{name: "CMOVWLE", argLength: 3, reg: gp21, asm: "CMOVWLE", resultInArg0: true},
+		{name: "CMOVWGE", argLength: 3, reg: gp21, asm: "CMOVWGE", resultInArg0: true},
+		{name: "CMOVWLS", argLength: 3, reg: gp21, asm: "CMOVWLS", resultInArg0: true},
+		{name: "CMOVWHI", argLength: 3, reg: gp21, asm: "CMOVWHI", resultInArg0: true},
+		{name: "CMOVWCC", argLength: 3, reg: gp21, asm: "CMOVWCC", resultInArg0: true},
+		{name: "CMOVWCS", argLength: 3, reg: gp21, asm: "CMOVWCS", resultInArg0: true},
+
+		// CMOV with floating point instructions. We need separate pseudo-op to handle
+		// InvertFlags correctly, and to generate special code that handles NaN (unordered flag).
+		// NOTE: the fact that CMOV*EQF here is marked to generate CMOV*NE is not a bug. See
+		// code generation in amd64/ssa.go.
+		{name: "CMOVQEQF", argLength: 3, reg: gp21pax, asm: "CMOVQNE", resultInArg0: true},
+		{name: "CMOVQNEF", argLength: 3, reg: gp21, asm: "CMOVQNE", resultInArg0: true},
+		{name: "CMOVQGTF", argLength: 3, reg: gp21, asm: "CMOVQHI", resultInArg0: true},
+		{name: "CMOVQGEF", argLength: 3, reg: gp21, asm: "CMOVQCC", resultInArg0: true},
+		{name: "CMOVLEQF", argLength: 3, reg: gp21, asm: "CMOVLNE", resultInArg0: true},
+		{name: "CMOVLNEF", argLength: 3, reg: gp21, asm: "CMOVLNE", resultInArg0: true},
+		{name: "CMOVLGTF", argLength: 3, reg: gp21, asm: "CMOVLHI", resultInArg0: true},
+		{name: "CMOVLGEF", argLength: 3, reg: gp21, asm: "CMOVLCC", resultInArg0: true},
+		{name: "CMOVWEQF", argLength: 3, reg: gp21, asm: "CMOVWNE", resultInArg0: true},
+		{name: "CMOVWNEF", argLength: 3, reg: gp21, asm: "CMOVWNE", resultInArg0: true},
+		{name: "CMOVWGTF", argLength: 3, reg: gp21, asm: "CMOVWHI", resultInArg0: true},
+		{name: "CMOVWGEF", argLength: 3, reg: gp21, asm: "CMOVWCC", resultInArg0: true},

 		{name: "BSWAPQ", argLength: 1, reg: gp11, asm: "BSWAPQ", resultInArg0: true, clobberFlags: true}, // arg0 swap bytes
 		{name: "BSWAPL", argLength: 1, reg: gp11, asm: "BSWAPL", resultInArg0: true, clobberFlags: true}, // arg0 swap bytes
@@ -578,7 +626,6 @@ func init() {
 		{name: "LoweredGetCallerSP", reg: gp01, rematerializeable: true},
 		//arg0=ptr,arg1=mem, returns void.  Faults if ptr is nil.
 		{name: "LoweredNilCheck", argLength: 2, reg: regInfo{inputs: []regMask{gpsp}}, clobberFlags: true, nilCheck: true, faultOnNilArg0: true},
-
 		// LoweredWB invokes runtime.gcWriteBarrier. arg0=destptr, arg1=srcptr, arg2=mem, aux=runtime.gcWriteBarrier
 		// It saves all GP registers if necessary, but may clobber others.
 		{name: "LoweredWB", argLength: 3, reg: regInfo{inputs: []regMask{buildReg("DI"), ax}, clobbers: callerSave &^ gp}, clobberFlags: true, aux: "Sym", symEffect: "None"},

--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
--- a/src/cmd/compile/internal/ssa/rewriteAMD64.go
+++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go
--- a/test/codegen/condmove.go
+++ b/test/codegen/condmove.go
+// asmcheck
+
+package codegen
+
+func cmovint(c int) int {
+	x := c + 4
+	if x < 0 {
+		x = 182
+	}
+	// amd64:"CMOVQLT"
+	// arm64:"CSEL\tLT"
+	return x
+}
+
+func cmovchan(x, y chan int) chan int {
+	if x != y {
+		x = y
+	}
+	// amd64:"CMOVQNE"
+	// arm64:"CSEL\tNE"
+	return x
+}
+
+func cmovuintptr(x, y uintptr) uintptr {
+	if x < y {
+		x = -y
+	}
+	// amd64:"CMOVQCS"
+	// arm64:"CSEL\tLO"
+	return x
+}
+
+func cmov32bit(x, y uint32) uint32 {
+	if x < y {
+		x = -y
+	}
+	// amd64:"CMOVLCS"
+	// arm64:"CSEL\tLO"
+	return x
+}
+
+func cmov16bit(x, y uint16) uint16 {
+	if x < y {
+		x = -y
+	}
+	// amd64:"CMOVWCS"
+	// arm64:"CSEL\tLO"
+	return x
+}
+
+// Floating point comparison. For EQ/NE, we must
+// generate special code to handle NaNs.
+func cmovfloateq(x, y float64) int {
+	a := 128
+	if x == y {
+		a = 256
+	}
+	// amd64:"CMOVQNE","CMOVQPC"
+	// arm64:"CSEL\tEQ"
+	return a
+}
+
+func cmovfloatne(x, y float64) int {
+	a := 128
+	if x != y {
+		a = 256
+	}
+	// amd64:"CMOVQNE","CMOVQPS"
+	// arm64:"CSEL\tNE"
+	return a
+}
+
+//go:noinline
+func frexp(f float64) (frac float64, exp int) {
+	return 1.0, 4
+}
+
+//go:noinline
+func ldexp(frac float64, exp int) float64 {
+	return 1.0
+}
+
+// Generate a CMOV with a floating comparison and integer move.
+func cmovfloatint2(x, y float64) float64 {
+	yfr, yexp := 4.0, 5
+
+	r := x
+	for r >= y {
+		rfr, rexp := frexp(r)
+		if rfr < yfr {
+			rexp = rexp - 1
+		}
+		// amd64:"CMOVQHI"
+		// arm64:"CSEL\tGT"
+		r = r - ldexp(y, (rexp-yexp))
+	}
+	return r
+}
+
+func cmovloaded(x [4]int, y int) int {
+	if x[2] != 0 {
+		y = x[2]
+	} else {
+		y = y >> 2
+	}
+	// amd64:"CMOVQNE"
+	// arm64:"CSEL\tNE"
+	return y
+}
+
+func cmovuintptr2(x, y uintptr) uintptr {
+	a := x * 2
+	if a == 0 {
+		a = 256
+	}
+	// amd64:"CMOVQEQ"
+	// arm64:"CSEL\tEQ"
+	return a
+}
+
+// Floating point CMOVs are not supported by amd64/arm64
+func cmovfloatmove(x, y int) float64 {
+	a := 1.0
+	if x <= y {
+		a = 2.0
+	}
+	// amd64:-"CMOV"
+	// arm64:-"CSEL"
+	return a
+}
+
+// On amd64, the following patterns trigger comparison inversion.
+// Test that we correctly invert the CMOV condition
+var gsink int64
+var gusink uint64
+
+func cmovinvert1(x, y int64) int64 {
+	if x < gsink {
+		y = -y
+	}
+	// amd64:"CMOVQGT"
+	return y
+}
+func cmovinvert2(x, y int64) int64 {
+	if x <= gsink {
+		y = -y
+	}
+	// amd64:"CMOVQGE"
+	return y
+}
+func cmovinvert3(x, y int64) int64 {
+	if x == gsink {
+		y = -y
+	}
+	// amd64:"CMOVQEQ"
+	return y
+}
+func cmovinvert4(x, y int64) int64 {
+	if x != gsink {
+		y = -y
+	}
+	// amd64:"CMOVQNE"
+	return y
+}
+func cmovinvert5(x, y uint64) uint64 {
+	if x > gusink {
+		y = -y
+	}
+	// amd64:"CMOVQCS"
+	return y
+}
+func cmovinvert6(x, y uint64) uint64 {
+	if x >= gusink {
+		y = -y
+	}
+	// amd64:"CMOVQLS"
+	return y
+}