cmd/asm: add arm64 instructions for math optimization

Add arm64 HW instructions FMADDD, FMADDS, FMSUBD, FMSUBS, FNMADDD, FNMADDS, FNMSUBD, FNMSUBS, VFMLA, VFMLS, VMOV (element) for math optimization. Add check on register element index and test cases. Change-Id: Ice07c50b1a02d488ad2cde2a4e8aea93f3e3afff Reviewed-on: https://go-review.googlesource.com/90876 Reviewed-by: Cherry Zhang <cherryyz@google.com>

cmd/asm: add arm64 instructions for math optimization
Add arm64 HW instructions FMADDD, FMADDS, FMSUBD, FMSUBS, FNMADDD, FNMADDS, FNMSUBD, FNMSUBS, VFMLA, VFMLS, VMOV (element) for math optimization. Add check on register element index and test cases. Change-Id: Ice07c50b1a02d488ad2cde2a4e8aea93f3e3afff Reviewed-on: https://go-review.googlesource.com/90876 Reviewed-by: Cherry Zhang <cherryyz@google.com>
f5de4200 · erifan01 · Brad Fitzpatrick · c18ff184 · f5de4200 · f5de4200
Commit f5de4200 authored Jan 26, 2018 by erifan01 Committed by Brad Fitzpatrick Feb 22, 2018
7 changed files
--- a/src/cmd/asm/internal/arch/arm64.go
+++ b/src/cmd/asm/internal/arch/arm64.go
@@ -178,18 +178,39 @@ func ARM64RegisterExtension(a *obj.Addr, ext string, reg, num int16, isAmount, i
 		a.Reg = arm64.REG_SXTX + (reg & 31) + int16(num<<5)
 		a.Offset = int64(((rm & 31) << 16) | (7 << 13) | (uint32(num) << 10))
 	case "B8":
+		if isIndex {
+			return errors.New("invalid register extension")
+		}
 		a.Reg = arm64.REG_ARNG + (reg & 31) + ((arm64.ARNG_8B & 15) << 5)
 	case "B16":
+		if isIndex {
+			return errors.New("invalid register extension")
+		}
 		a.Reg = arm64.REG_ARNG + (reg & 31) + ((arm64.ARNG_16B & 15) << 5)
 	case "H4":
+		if isIndex {
+			return errors.New("invalid register extension")
+		}
 		a.Reg = arm64.REG_ARNG + (reg & 31) + ((arm64.ARNG_4H & 15) << 5)
 	case "H8":
+		if isIndex {
+			return errors.New("invalid register extension")
+		}
 		a.Reg = arm64.REG_ARNG + (reg & 31) + ((arm64.ARNG_8H & 15) << 5)
 	case "S2":
+		if isIndex {
+			return errors.New("invalid register extension")
+		}
 		a.Reg = arm64.REG_ARNG + (reg & 31) + ((arm64.ARNG_2S & 15) << 5)
 	case "S4":
+		if isIndex {
+			return errors.New("invalid register extension")
+		}
 		a.Reg = arm64.REG_ARNG + (reg & 31) + ((arm64.ARNG_4S & 15) << 5)
 	case "D2":
+		if isIndex {
+			return errors.New("invalid register extension")
+		}
 		a.Reg = arm64.REG_ARNG + (reg & 31) + ((arm64.ARNG_2D & 15) << 5)
 	case "B":
 		if !isIndex {

--- a/src/cmd/asm/internal/asm/testdata/arm64.s
+++ b/src/cmd/asm/internal/asm/testdata/arm64.s
@@ -68,6 +68,12 @@ TEXT	foo(SB), DUPOK|NOSPLIT, $-8
 	VADD	V1, V3, V3                      // 6384e15e
 	VSUB	V12, V30, V30                   // de87ec7e
 	VSUB	V12, V20, V30                   // 9e86ec7e
+	VFMLA	V1.D2, V12.D2, V1.D2            // 81cd614e
+	VFMLA	V1.S2, V12.S2, V1.S2            // 81cd210e
+	VFMLA	V1.S4, V12.S4, V1.S4            // 81cd214e
+	VFMLS	V1.D2, V12.D2, V1.D2            // 81cde14e
+	VFMLS	V1.S2, V12.S2, V1.S2            // 81cda10e
+	VFMLS	V1.S4, V12.S4, V1.S4            // 81cda14e

 //	LTYPE1 imsr ',' spreg ','
 //	{
@@ -212,6 +218,10 @@ TEXT	foo(SB), DUPOK|NOSPLIT, $-8
 	VMOV	R22, V11.D2           // cb0e084e
 	VMOV 	V2.B16, V4.B16        // 441ca24e
 	VMOV	V20.S[0], V20         // 9406045e
+	VMOV	V12.D[0], V12.D[1]    // 8c05186e
+	VMOV	V10.S[0], V12.S[1]    // 4c050c6e
+	VMOV	V9.H[0], V12.H[1]     // 2c05066e
+	VMOV	V8.B[0], V12.B[1]     // 0c05036e
 	VREV32	V5.B16, V5.B16        // a508206e
 	VDUP	V19.S[0], V17.S4      // 7106044e
 //
@@ -367,6 +377,15 @@ again:
 //	}
 //	MADD	R1, R2, R3, R4

+	FMADDS	F1, F3, F2, F4          // 440c011f
+	FMADDD	F4, F5, F4, F4          // 8414441f
+	FMSUBS	F13, F21, F13, F19      // b3d50d1f
+	FMSUBD	F11, F7, F15, F31       // ff9d4b1f
+	FNMADDS	F1, F3, F2, F4          // 440c211f
+	FNMADDD	F1, F3, F2, F4          // 440c611f
+	FNMSUBS	F1, F3, F2, F4          // 448c211f
+	FNMSUBD	F1, F3, F2, F4          // 448c611f
+
 // DMB, HINT
 //
 //		LDMB imm

--- a/src/cmd/asm/internal/asm/testdata/arm64error.s
+++ b/src/cmd/asm/internal/asm/testdata/arm64error.s
@@ -12,4 +12,42 @@ TEXT errors(SB),$0
 	VLD1	8(R8)(R13), [V2.B16]        // ERROR "illegal combination"
 	ADD	R1.UXTB<<5, R2, R3          // ERROR "shift amount out of range 0 to 4"
 	ADDS	R1.UXTX<<7, R2, R3          // ERROR "shift amount out of range 0 to 4"
+	VMOV	V8.D[2], V12.D[1]           // ERROR "register element index out of range 0 to 1"
+	VMOV	V8.S[4], V12.S[1]           // ERROR "register element index out of range 0 to 3"
+	VMOV	V8.H[8], V12.H[1]           // ERROR "register element index out of range 0 to 7"
+	VMOV	V8.B[16], V12.B[1]          // ERROR "register element index out of range 0 to 15"
+	VMOV	V8.D[0], V12.S[1]           // ERROR "operand mismatch"
+	VMOV	V8.D[0], V12.H[1]           // ERROR "operand mismatch"
+	VMOV	V8.D[0], V12.B[1]           // ERROR "operand mismatch"
+	VMOV	V8.S[0], V12.H[1]           // ERROR "operand mismatch"
+	VMOV	V8.S[0], V12.B[1]           // ERROR "operand mismatch"
+	VMOV	V8.H[0], V12.B[1]           // ERROR "operand mismatch"
+	VMOV	V8.B[16], R3                // ERROR "register element index out of range 0 to 15"
+	VMOV	V8.H[9], R3                 // ERROR "register element index out of range 0 to 7"
+	VMOV	V8.S[4], R3                 // ERROR "register element index out of range 0 to 3"
+	VMOV	V8.D[2], R3                 // ERROR "register element index out of range 0 to 1"
+	VDUP	V8.B[16], R3.B16            // ERROR "register element index out of range 0 to 15"
+	VDUP	V8.B[17], R3.B8             // ERROR "register element index out of range 0 to 15"
+	VDUP	V8.H[9], R3.H4              // ERROR "register element index out of range 0 to 7"
+	VDUP	V8.H[9], R3.H8              // ERROR "register element index out of range 0 to 7"
+	VDUP	V8.S[4], R3.S2              // ERROR "register element index out of range 0 to 3"
+	VDUP	V8.S[4], R3.S4              // ERROR "register element index out of range 0 to 3"
+	VDUP	V8.D[2], R3.D2              // ERROR "register element index out of range 0 to 1"
+	VFMLA	V1.D2, V12.D2, V3.S2        // ERROR "operand mismatch"
+	VFMLA	V1.S2, V12.S2, V3.D2        // ERROR "operand mismatch"
+	VFMLA	V1.S4, V12.S2, V3.D2        // ERROR "operand mismatch"
+	VFMLA	V1.H4, V12.H4, V3.D2        // ERROR "operand mismatch"
+	VFMLS	V1.S2, V12.S2, V3.S4        // ERROR "operand mismatch"
+	VFMLS	V1.S2, V12.D2, V3.S4        // ERROR "operand mismatch"
+	VFMLS	V1.S2, V12.S4, V3.D2        // ERROR "operand mismatch"
+	VFMLA	V1.B8, V12.B8, V3.B8        // ERROR "invalid arrangement"
+	VFMLA	V1.B16, V12.B16, V3.B16     // ERROR "invalid arrangement"
+	VFMLA	V1.H4, V12.H4, V3.H4        // ERROR "invalid arrangement"
+	VFMLA	V1.H8, V12.H8, V3.H8        // ERROR "invalid arrangement"
+	VFMLA	V1.H4, V12.H4, V3.H4        // ERROR "invalid arrangement"
+	VFMLS	V1.B8, V12.B8, V3.B8        // ERROR "invalid arrangement"
+	VFMLS	V1.B16, V12.B16, V3.B16     // ERROR "invalid arrangement"
+	VFMLS	V1.H4, V12.H4, V3.H4        // ERROR "invalid arrangement"
+	VFMLS	V1.H8, V12.H8, V3.H8        // ERROR "invalid arrangement"
+	VFMLS	V1.H4, V12.H4, V3.H4        // ERROR "invalid arrangement"
 	RET
--- a/src/cmd/internal/obj/arm64/a.out.go
+++ b/src/cmd/internal/obj/arm64/a.out.go
@@ -766,6 +766,8 @@ const (
 	AVMOVI
 	AVUADDLV
 	AVSUB
+	AVFMLA
+	AVFMLS
 	ALAST
 	AB  = obj.AJMP
 	ABL = obj.ACALL

--- a/src/cmd/internal/obj/arm64/anames.go
+++ b/src/cmd/internal/obj/arm64/anames.go
@@ -383,5 +383,7 @@ var Anames = []string{
 	"VMOVI",
 	"VUADDLV",
 	"VSUB",
+	"VFMLA",
+	"VFMLS",
 	"LAST",
 }
--- a/src/cmd/internal/obj/arm64/asm7.go
+++ b/src/cmd/internal/obj/arm64/asm7.go
--- a/src/cmd/internal/obj/arm64/doc.go
+++ b/src/cmd/internal/obj/arm64/doc.go
@@ -22,6 +22,46 @@ Go Assembly for ARM64 Reference Manual
 2. Alphabetical list of float-point instructions
    // TODO

+    FMADDD: 64-bit floating-point fused Multiply-Add
+      FMADDD	<Fm>, <Fa>, <Fn>, <Fd>
+        Multiplies the values of <Fm> and <Fn>,
+        adds the product to <Fa>, and writes the result to <Fd>.
+
+    FMADDS: 32-bit floating-point fused Multiply-Add
+      FMADDS	<Fm>, <Fa>, <Fn>, <Fd>
+        Multiplies the values of <Fm> and <Fn>,
+        adds the product to <Fa>, and writes the result to <Fd>.
+
+    FMSUBD: 64-bit floating-point fused Multiply-Subtract
+      FMSUBD	<Fm>, <Fa>, <Fn>, <Fd>
+        Multiplies the values of <Fm> and <Fn>, negates the product,
+        adds the product to <Fa>, and writes the result to <Fd>.
+
+    FMSUBS: 32-bit floating-point fused Multiply-Subtract
+      FMSUBS	<Fm>, <Fa>, <Fn>, <Fd>
+        Multiplies the values of <Fm> and <Fn>, negates the product,
+        adds the product to <Fa>, and writes the result to <Fd>.
+
+    FNMADDD: 64-bit floating-point negated fused Multiply-Add
+      FNMADDD	<Fm>, <Fa>, <Fn>, <Fd>
+        Multiplies the values of <Fm> and <Fn>, negates the product,
+        subtracts the value of <Fa>, and writes the result to <Fd>.
+
+    FNMADDS: 32-bit floating-point negated fused Multiply-Add
+      FNMADDS	<Fm>, <Fa>, <Fn>, <Fd>
+        Multiplies the values of <Fm> and <Fn>, negates the product,
+        subtracts the value of <Fa>, and writes the result to <Fd>.
+
+    FNMSUBD: 64-bit floating-point negated fused Multiply-Subtract
+      FNMSUBD	<Fm>, <Fa>, <Fn>, <Fd>
+        Multiplies the values of <Fm> and <Fn>,
+        subtracts the value of <Fa>, and writes the result to <Fd>.
+
+    FNMSUBS: 32-bit floating-point negated fused Multiply-Subtract
+      FNMSUBS	<Fm>, <Fa>, <Fn>, <Fd>
+        Multiplies the values of <Fm> and <Fn>,
+        subtracts the value of <Fa>, and writes the result to <Fd>.
+
 3. Alphabetical list of SIMD instructions
    VADD: Add (scalar)
      VADD	<Vm>, <Vn>, <Vd>
@@ -65,6 +105,16 @@ Go Assembly for ARM64 Reference Manual
        <T> Is an arrangement specifier and can have the following values:
        B8, B16

+    VFMLA: Floating-point fused Multiply-Add to accumulator (vector)
+      VFMLA	<Vm>.<T>, <Vn>.<T>, <Vd>.<T>
+        <T> Is an arrangement specifier and can have the following values:
+        S2, S4, D2
+
+    VFMLS: Floating-point fused Multiply-Subtract from accumulator (vector)
+      VFMLS	<Vm>.<T>, <Vn>.<T>, <Vd>.<T>
+        <T> Is an arrangement specifier and can have the following values:
+        S2, S4, D2
+
    VLD1: Load multiple single-element structures
      VLD1	(Rn), [<Vt>.<T>, <Vt2>.<T> ...]     // no offset
      VLD1.P	imm(Rn), [<Vt>.<T>, <Vt2>.<T> ...]  // immediate offset variant
@@ -96,6 +146,10 @@ Go Assembly for ARM64 Reference Manual
        <T> Is an element size specifier and can have the following values:
        B, H, S, D

+      VMOV	<Vn>.<T>[index], <Vd>.<T>[index] // Move vector element to another vector element.
+        <T> Is an element size specifier and can have the following values:
+        B, H, S, D
+
    VMOVI: Move Immediate (vector).
      VMOVI	$imm8, <Vd>.<T>
        <T> is an arrangement specifier and can have the following values: