cmd/compile: add math/bits.Mul64 intrinsic on s390x

This change adds an intrinsic for Mul64 on s390x. To achieve that, a new assembly instruction, MLGR, is introduced in s390x/asmz.go. This assembly instruction directly uses an existing instruction on Z and supports multiplication of two 64 bit unsigned integer and stores the result in two separate registers. In this case, we require the multiplcand to be stored in register R3 and the output result (the high and low 64 bit of the product) to be stored in R2 and R3 respectively. A test case is also added. Benchmark: name old time/op new time/op delta Mul-18 11.1ns ± 0% 1.4ns ± 0% -87.39% (p=0.002 n=8+10) Mul32-18 2.07ns ± 0% 2.07ns ± 0% ~ (all equal) Mul64-18 11.1ns ± 1% 1.4ns ± 0% -87.42% (p=0.000 n=10+10) Change-Id: Ieca6ad1f61fff9a48a31d50bbd3f3c6d9e6675c1 Reviewed-on: https://go-review.googlesource.com/c/go/+/194572Reviewed-by: Michael Munday <mike.munday@ibm.com> Run-TryBot: Michael Munday <mike.munday@ibm.com> TryBot-Result: Gobot Gobot <gobot@golang.org>

cmd/compile: add math/bits.Mul64 intrinsic on s390x
This change adds an intrinsic for Mul64 on s390x. To achieve that, a new assembly instruction, MLGR, is introduced in s390x/asmz.go. This assembly instruction directly uses an existing instruction on Z and supports multiplication of two 64 bit unsigned integer and stores the result in two separate registers. In this case, we require the multiplcand to be stored in register R3 and the output result (the high and low 64 bit of the product) to be stored in R2 and R3 respectively. A test case is also added. Benchmark: name old time/op new time/op delta Mul-18 11.1ns ± 0% 1.4ns ± 0% -87.39% (p=0.002 n=8+10) Mul32-18 2.07ns ± 0% 2.07ns ± 0% ~ (all equal) Mul64-18 11.1ns ± 1% 1.4ns ± 0% -87.42% (p=0.000 n=10+10) Change-Id: Ieca6ad1f61fff9a48a31d50bbd3f3c6d9e6675c1 Reviewed-on: https://go-review.googlesource.com/c/go/+/194572Reviewed-by: Michael Munday <mike.munday@ibm.com> Run-TryBot: Michael Munday <mike.munday@ibm.com> TryBot-Result: Gobot Gobot <gobot@golang.org>
98aa9780 · Ruixin Bao · Michael Munday · 03f63654 · 98aa9780 · 98aa9780
Commit 98aa9780 authored Sep 08, 2019 by Ruixin Bao Committed by Michael Munday Sep 13, 2019
11 changed files
--- a/src/cmd/asm/internal/asm/testdata/s390x.s
+++ b/src/cmd/asm/internal/asm/testdata/s390x.s
@@ -109,6 +109,7 @@ TEXT main·foo(SB),DUPOK|NOSPLIT,$16-0 // TEXT main.foo(SB), DUPOK|NOSPLIT, $16-
 	MULHD	R7, R2, R1            // b90400b2b98600a7ebb7003f000ab98000b2b90900abebb2003f000ab98000b7b9e9b01a
 	MULHDU	R3, R4                // b90400b4b98600a3b904004a
 	MULHDU	R5, R6, R7            // b90400b6b98600a5b904007a
+	MLGR	R1, R2                // b9860021
 	DIVD	R1, R2                // b90400b2b90d00a1b904002b
 	DIVD	R1, R2, R3            // b90400b2b90d00a1b904003b
 	DIVW	R4, R5                // b90400b5b91d00a4b904005b

--- a/src/cmd/compile/internal/gc/ssa.go
+++ b/src/cmd/compile/internal/gc/ssa.go
@@ -3600,8 +3600,8 @@ func init() {
 		func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
 			return s.newValue2(ssa.OpMul64uhilo, types.NewTuple(types.Types[TUINT64], types.Types[TUINT64]), args[0], args[1])
 		},
-		sys.AMD64, sys.ARM64, sys.PPC64)
+		sys.AMD64, sys.ARM64, sys.PPC64, sys.S390X)
-	alias("math/bits", "Mul", "math/bits", "Mul64", sys.ArchAMD64, sys.ArchARM64, sys.ArchPPC64)
+	alias("math/bits", "Mul", "math/bits", "Mul64", sys.ArchAMD64, sys.ArchARM64, sys.ArchPPC64, sys.ArchS390X)
 	addF("math/bits", "Add64",
 		func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
 			return s.newValue3(ssa.OpAdd64carry, types.NewTuple(types.Types[TUINT64], types.Types[TUINT64]), args[0], args[1], args[2])

--- a/src/cmd/compile/internal/s390x/ssa.go
+++ b/src/cmd/compile/internal/s390x/ssa.go
@@ -225,6 +225,19 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 			v.Fatalf("input[0] and output not in same register %s", v.LongString())
 		}
 		opregreg(s, v.Op.Asm(), r, v.Args[1].Reg())
+	case ssa.OpS390XMLGR:
+		// MLGR Rx R3 -> R2:R3
+		r0 := v.Args[0].Reg()
+		r1 := v.Args[1].Reg()
+		if r1 != s390x.REG_R3 {
+			v.Fatalf("We require the multiplcand to be stored in R3 for MLGR %s", v.LongString())
+		}
+		p := s.Prog(s390x.AMLGR)
+		p.From.Type = obj.TYPE_REG
+		p.From.Reg = r0
+		p.To.Reg = s390x.REG_R2
+		p.To.Type = obj.TYPE_REG
 	case ssa.OpS390XFMADD, ssa.OpS390XFMADDS,
 		ssa.OpS390XFMSUB, ssa.OpS390XFMSUBS:
 		r := v.Reg()

--- a/src/cmd/compile/internal/ssa/gen/S390X.rules
+++ b/src/cmd/compile/internal/ssa/gen/S390X.rules
@@ -17,6 +17,7 @@
 (Mul(32|16|8)  x y) -> (MULLW  x y)
 (Mul32F x y) -> (FMULS x y)
 (Mul64F x y) -> (FMUL x y)
+(Mul64uhilo x y) -> (MLGR x y)
 (Div32F x y) -> (FDIVS x y)
 (Div64F x y) -> (FDIV x y)

--- a/src/cmd/compile/internal/ssa/gen/S390XOps.go
+++ b/src/cmd/compile/internal/ssa/gen/S390XOps.go
@@ -568,6 +568,19 @@ func init() {
 			clobberFlags: true,
 		},
+		// unsigned multiplication (64x64 → 128)
+		//
+		// Multiply the two 64-bit input operands together and place the 128-bit result into
+		// an even-odd register pair. The second register in the target pair also contains
+		// one of the input operands. Since we don't currently have a way to specify an
+		// even-odd register pair we hardcode this register pair as R2:R3.
+		{
+			name:      "MLGR",
+			argLength: 2,
+			reg:       regInfo{inputs: []regMask{gp, r3}, outputs: []regMask{r2, r3}},
+			asm:       "MLGR",
+		},
 		// pseudo operations to sum the output of the POPCNT instruction
 		{name: "SumBytes2", argLength: 1, typ: "UInt8"}, // sum the rightmost 2 bytes in arg0 ignoring overflow
 		{name: "SumBytes4", argLength: 1, typ: "UInt8"}, // sum the rightmost 4 bytes in arg0 ignoring overflow

--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
@@ -2071,6 +2071,7 @@ const (
 	OpS390XLoweredAtomicExchange64
 	OpS390XFLOGR
 	OpS390XPOPCNT
+	OpS390XMLGR
 	OpS390XSumBytes2
 	OpS390XSumBytes4
 	OpS390XSumBytes8
@@ -27878,6 +27879,21 @@ var opcodeTable = [...]opInfo{
 			},
 		},
 	},
+	{
+		name:   "MLGR",
+		argLen: 2,
+		asm:    s390x.AMLGR,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{1, 8},     // R3
+				{0, 23551}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R11 R12 R14
+			},
+			outputs: []outputInfo{
+				{0, 4}, // R2
+				{1, 8}, // R3
+			},
+		},
+	},
 	{
 		name:   "SumBytes2",
 		argLen: 1,

--- a/src/cmd/compile/internal/ssa/rewriteS390X.go
+++ b/src/cmd/compile/internal/ssa/rewriteS390X.go
@@ -335,6 +335,8 @@ func rewriteValueS390X(v *Value) bool {
 		return rewriteValueS390X_OpMul64_0(v)
 	case OpMul64F:
 		return rewriteValueS390X_OpMul64F_0(v)
+	case OpMul64uhilo:
+		return rewriteValueS390X_OpMul64uhilo_0(v)
 	case OpMul8:
 		return rewriteValueS390X_OpMul8_0(v)
 	case OpNeg16:
@@ -4609,6 +4611,19 @@ func rewriteValueS390X_OpMul64F_0(v *Value) bool {
 		return true
 	}
 }
+func rewriteValueS390X_OpMul64uhilo_0(v *Value) bool {
+	// match: (Mul64uhilo x y)
+	// cond:
+	// result: (MLGR x y)
+	for {
+		y := v.Args[1]
+		x := v.Args[0]
+		v.reset(OpS390XMLGR)
+		v.AddArg(x)
+		v.AddArg(y)
+		return true
+	}
+}
 func rewriteValueS390X_OpMul8_0(v *Value) bool {
 	// match: (Mul8 x y)
 	// cond:

--- a/src/cmd/internal/obj/s390x/a.out.go
+++ b/src/cmd/internal/obj/s390x/a.out.go
@@ -240,6 +240,7 @@ const (
 	AMULLD
 	AMULHD
 	AMULHDU
+	AMLGR
 	ASUB
 	ASUBC
 	ASUBV

--- a/src/cmd/internal/obj/s390x/anames.go
+++ b/src/cmd/internal/obj/s390x/anames.go
@@ -21,6 +21,7 @@ var Anames = []string{
 	"MULLD",
 	"MULHD",
 	"MULHDU",
+	"MLGR",
 	"SUB",
 	"SUBC",
 	"SUBV",

--- a/src/cmd/internal/obj/s390x/asmz.go
+++ b/src/cmd/internal/obj/s390x/asmz.go
@@ -174,6 +174,7 @@ var optab = []Optab{
 	{i: 12, as: ASUB, a1: C_LAUTO, a6: C_REG},
 	{i: 4, as: AMULHD, a1: C_REG, a6: C_REG},
 	{i: 4, as: AMULHD, a1: C_REG, a2: C_REG, a6: C_REG},
+	{i: 62, as: AMLGR, a1: C_REG, a6: C_REG},
 	{i: 2, as: ADIVW, a1: C_REG, a2: C_REG, a6: C_REG},
 	{i: 2, as: ADIVW, a1: C_REG, a6: C_REG},
 	{i: 10, as: ASUB, a1: C_REG, a2: C_REG, a6: C_REG},
@@ -3407,6 +3408,9 @@ func (c *ctxtz) asmout(p *obj.Prog, asm *[]byte) {
 		d2 := c.regoff(&p.To)
 		zRXE(opcode, uint32(p.From.Reg), 0, 0, uint32(d2), 0, asm)
+	case 62: // equivalent of Mul64 in math/bits
+		zRRE(op_MLGR, uint32(p.To.Reg), uint32(p.From.Reg), asm)
 	case 66:
 		zRR(op_BCR, 0, 0, asm)

--- a/test/codegen/mathbits.go
+++ b/test/codegen/mathbits.go
@@ -557,6 +557,7 @@ func Mul(x, y uint) (hi, lo uint) {
 	// arm64:"UMULH","MUL"
 	// ppc64:"MULHDU","MULLD"
 	// ppc64le:"MULHDU","MULLD"
+	// s390x:"MLGR"
 	return bits.Mul(x, y)
 }
@@ -565,6 +566,7 @@ func Mul64(x, y uint64) (hi, lo uint64) {
 	// arm64:"UMULH","MUL"
 	// ppc64:"MULHDU","MULLD"
 	// ppc64le:"MULHDU","MULLD"
+	// s390x:"MLGR"
 	return bits.Mul64(x, y)
 }