cmd/compile: add an optimaztion rule for math/bits.ReverseBytes16 on arm64

On amd64 ReverseBytes16 is lowered to a rotate instruction. However arm64 doesn't have 16-bit rotate instruction, but has a REV16W instruction which can be used for ReverseBytes16. This CL adds a rule to turn the patterns like (x<<8) | (x>>8) (the type of x is uint16, and "|" can also be "^" or "+") to a REV16W instruction. Code: func reverseBytes16(i uint16) uint16 { return bits.ReverseBytes16(i) } Before: 0x0004 00004 (test.go:6) MOVHU "".i(FP), R0 0x0008 00008 ($GOROOT/src/math/bits/bits.go:262) UBFX $8, R0, $8, R1 0x000c 00012 ($GOROOT/src/math/bits/bits.go:262) ORR R0<<8, R1, R0 0x0010 00016 (test.go:6) MOVH R0, "".~r1+8(FP) 0x0014 00020 (test.go:6) RET (R30) After: 0x0000 00000 (test.go:6) MOVHU "".i(FP), R0 0x0004 00004 (test.go:6) REV16W R0, R0 0x0008 00008 (test.go:6) MOVH R0, "".~r1+8(FP) 0x000c 00012 (test.go:6) RET (R30) Benchmarks: name old time/op new time/op delta ReverseBytes-224 1.000000ns +- 0% 1.000000ns +- 0% ~ (all equal) ReverseBytes16-224 1.500000ns +- 0% 1.000000ns +- 0% -33.33% (p=0.000 n=9+10) ReverseBytes32-224 1.000000ns +- 0% 1.000000ns +- 0% ~ (all equal) ReverseBytes64-224 1.000000ns +- 0% 1.000000ns +- 0% ~ (all equal) Change-Id: I87cd41b2d8e549bf39c601f185d5775bd42d739c Reviewed-on: https://go-review.googlesource.com/c/157757Reviewed-by: Cherry Zhang <cherryyz@google.com> Run-TryBot: Cherry Zhang <cherryyz@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org>

cmd/compile: add an optimaztion rule for math/bits.ReverseBytes16 on arm64
On amd64 ReverseBytes16 is lowered to a rotate instruction. However arm64 doesn't have 16-bit rotate instruction, but has a REV16W instruction which can be used for ReverseBytes16. This CL adds a rule to turn the patterns like (x<<8) | (x>>8) (the type of x is uint16, and "|" can also be "^" or "+") to a REV16W instruction. Code: func reverseBytes16(i uint16) uint16 { return bits.ReverseBytes16(i) } Before: 0x0004 00004 (test.go:6) MOVHU "".i(FP), R0 0x0008 00008 ($GOROOT/src/math/bits/bits.go:262) UBFX $8, R0, $8, R1 0x000c 00012 ($GOROOT/src/math/bits/bits.go:262) ORR R0<<8, R1, R0 0x0010 00016 (test.go:6) MOVH R0, "".~r1+8(FP) 0x0014 00020 (test.go:6) RET (R30) After: 0x0000 00000 (test.go:6) MOVHU "".i(FP), R0 0x0004 00004 (test.go:6) REV16W R0, R0 0x0008 00008 (test.go:6) MOVH R0, "".~r1+8(FP) 0x000c 00012 (test.go:6) RET (R30) Benchmarks: name old time/op new time/op delta ReverseBytes-224 1.000000ns +- 0% 1.000000ns +- 0% ~ (all equal) ReverseBytes16-224 1.500000ns +- 0% 1.000000ns +- 0% -33.33% (p=0.000 n=9+10) ReverseBytes32-224 1.000000ns +- 0% 1.000000ns +- 0% ~ (all equal) ReverseBytes64-224 1.000000ns +- 0% 1.000000ns +- 0% ~ (all equal) Change-Id: I87cd41b2d8e549bf39c601f185d5775bd42d739c Reviewed-on: https://go-review.googlesource.com/c/157757Reviewed-by: Cherry Zhang <cherryyz@google.com> Run-TryBot: Cherry Zhang <cherryyz@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org>
192b675f · erifan01 · Cherry Zhang · 40df9cc6 · 192b675f · 192b675f
Commit 192b675f authored Feb 11, 2019 by erifan01 Committed by Cherry Zhang Mar 01, 2019
3 changed files
--- a/src/cmd/compile/internal/ssa/gen/ARM64.rules
+++ b/src/cmd/compile/internal/ssa/gen/ARM64.rules
@@ -1786,6 +1786,9 @@
 		(CMPconst [64]  (SUB <t> (MOVDconst [32]) (ANDconst <t> [31] y))))) && cc.(Op) == OpARM64LessThanU
 	-> (RORW x y)

+// ((x>>8) | (x<<8)) -> (REV16W x), the type of x is uint16, "|" can also be "^" or "+".
+((ADDshiftLL|ORshiftLL|XORshiftLL) <typ.UInt16> [8] (UBFX <typ.UInt16> [arm64BFAuxInt(8, 8)] x) x) -> (REV16W x)
+
 // Extract from reg pair
 (ADDshiftLL [c] (SRLconst x [64-c]) x2) -> (EXTRconst [64-c] x2 x)
 ( ORshiftLL [c] (SRLconst x [64-c]) x2) -> (EXTRconst [64-c] x2 x)

--- a/src/cmd/compile/internal/ssa/rewriteARM64.go
+++ b/src/cmd/compile/internal/ssa/rewriteARM64.go
@@ -2304,6 +2304,8 @@ func rewriteValueARM64_OpARM64ADDconst_0(v *Value) bool {
 func rewriteValueARM64_OpARM64ADDshiftLL_0(v *Value) bool {
 	b := v.Block
 	_ = b
+	typ := &b.Func.Config.Types
+	_ = typ
 	// match: (ADDshiftLL (MOVDconst [c]) x [d])
 	// cond:
 	// result: (ADDconst [c] (SLLconst <x.Type> x [d]))
@@ -2387,6 +2389,35 @@ func rewriteValueARM64_OpARM64ADDshiftLL_0(v *Value) bool {
 		v.AddArg(x)
 		return true
 	}
+	// match: (ADDshiftLL <typ.UInt16> [8] (UBFX <typ.UInt16> [arm64BFAuxInt(8, 8)] x) x)
+	// cond:
+	// result: (REV16W x)
+	for {
+		if v.Type != typ.UInt16 {
+			break
+		}
+		if v.AuxInt != 8 {
+			break
+		}
+		_ = v.Args[1]
+		v_0 := v.Args[0]
+		if v_0.Op != OpARM64UBFX {
+			break
+		}
+		if v_0.Type != typ.UInt16 {
+			break
+		}
+		if v_0.AuxInt != arm64BFAuxInt(8, 8) {
+			break
+		}
+		x := v_0.Args[0]
+		if x != v.Args[1] {
+			break
+		}
+		v.reset(OpARM64REV16W)
+		v.AddArg(x)
+		return true
+	}
 	// match: (ADDshiftLL [c] (SRLconst x [64-c]) x2)
 	// cond:
 	// result: (EXTRconst [64-c] x2 x)
@@ -26504,6 +26535,8 @@ func rewriteValueARM64_OpARM64ORconst_0(v *Value) bool {
 func rewriteValueARM64_OpARM64ORshiftLL_0(v *Value) bool {
 	b := v.Block
 	_ = b
+	typ := &b.Func.Config.Types
+	_ = typ
 	// match: (ORshiftLL (MOVDconst [c]) x [d])
 	// cond:
 	// result: (ORconst [c] (SLLconst <x.Type> x [d]))
@@ -26610,6 +26643,35 @@ func rewriteValueARM64_OpARM64ORshiftLL_0(v *Value) bool {
 		v.AddArg(x)
 		return true
 	}
+	// match: (ORshiftLL <typ.UInt16> [8] (UBFX <typ.UInt16> [arm64BFAuxInt(8, 8)] x) x)
+	// cond:
+	// result: (REV16W x)
+	for {
+		if v.Type != typ.UInt16 {
+			break
+		}
+		if v.AuxInt != 8 {
+			break
+		}
+		_ = v.Args[1]
+		v_0 := v.Args[0]
+		if v_0.Op != OpARM64UBFX {
+			break
+		}
+		if v_0.Type != typ.UInt16 {
+			break
+		}
+		if v_0.AuxInt != arm64BFAuxInt(8, 8) {
+			break
+		}
+		x := v_0.Args[0]
+		if x != v.Args[1] {
+			break
+		}
+		v.reset(OpARM64REV16W)
+		v.AddArg(x)
+		return true
+	}
 	// match: (ORshiftLL [c] (SRLconst x [64-c]) x2)
 	// cond:
 	// result: (EXTRconst [64-c] x2 x)
@@ -26739,6 +26801,11 @@ func rewriteValueARM64_OpARM64ORshiftLL_0(v *Value) bool {
 		v0.AddArg(mem)
 		return true
 	}
+	return false
+}
+func rewriteValueARM64_OpARM64ORshiftLL_10(v *Value) bool {
+	b := v.Block
+	_ = b
 	// match: (ORshiftLL <t> [8] y0:(MOVDnop x0:(MOVBUloadidx ptr0 idx0 mem)) y1:(MOVDnop x1:(MOVBUload [1] {s} p1:(ADD ptr1 idx1) mem)))
 	// cond: s == nil && x0.Uses == 1 && x1.Uses == 1 && y0.Uses == 1 && y1.Uses == 1 && mergePoint(b,x0,x1) != nil && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1)) && clobber(x0) && clobber(x1) && clobber(y0) && clobber(y1)
 	// result: @mergePoint(b,x0,x1) (MOVHUloadidx <t> ptr0 idx0 mem)
@@ -26795,11 +26862,6 @@ func rewriteValueARM64_OpARM64ORshiftLL_0(v *Value) bool {
 		v0.AddArg(mem)
 		return true
 	}
-	return false
-}
-func rewriteValueARM64_OpARM64ORshiftLL_10(v *Value) bool {
-	b := v.Block
-	_ = b
 	// match: (ORshiftLL <t> [8] y0:(MOVDnop x0:(MOVBUloadidx ptr idx mem)) y1:(MOVDnop x1:(MOVBUloadidx ptr (ADDconst [1] idx) mem)))
 	// cond: x0.Uses == 1 && x1.Uses == 1 && y0.Uses == 1 && y1.Uses == 1 && mergePoint(b,x0,x1) != nil && clobber(x0) && clobber(x1) && clobber(y0) && clobber(y1)
 	// result: @mergePoint(b,x0,x1) (MOVHUloadidx <t> ptr idx mem)
@@ -27754,6 +27816,11 @@ func rewriteValueARM64_OpARM64ORshiftLL_10(v *Value) bool {
 		v0.AddArg(mem)
 		return true
 	}
+	return false
+}
+func rewriteValueARM64_OpARM64ORshiftLL_20(v *Value) bool {
+	b := v.Block
+	_ = b
 	// match: (ORshiftLL <t> [8] y0:(MOVDnop x0:(MOVBUload [i1] {s} p mem)) y1:(MOVDnop x1:(MOVBUload [i0] {s} p mem)))
 	// cond: i1 == i0+1 && x0.Uses == 1 && x1.Uses == 1 && y0.Uses == 1 && y1.Uses == 1 && mergePoint(b,x0,x1) != nil && clobber(x0) && clobber(x1) && clobber(y0) && clobber(y1)
 	// result: @mergePoint(b,x0,x1) (REV16W <t> (MOVHUload <t> [i0] {s} p mem))
@@ -27810,11 +27877,6 @@ func rewriteValueARM64_OpARM64ORshiftLL_10(v *Value) bool {
 		v0.AddArg(v1)
 		return true
 	}
-	return false
-}
-func rewriteValueARM64_OpARM64ORshiftLL_20(v *Value) bool {
-	b := v.Block
-	_ = b
 	// match: (ORshiftLL <t> [8] y0:(MOVDnop x0:(MOVBUload [1] {s} p1:(ADD ptr1 idx1) mem)) y1:(MOVDnop x1:(MOVBUloadidx ptr0 idx0 mem)))
 	// cond: s == nil && x0.Uses == 1 && x1.Uses == 1 && y0.Uses == 1 && y1.Uses == 1 && mergePoint(b,x0,x1) != nil && (isSamePtr(ptr0, ptr1) && isSamePtr(idx0, idx1) || isSamePtr(ptr0, idx1) && isSamePtr(idx0, ptr1)) && clobber(x0) && clobber(x1) && clobber(y0) && clobber(y1)
 	// result: @mergePoint(b,x0,x1) (REV16W <t> (MOVHUloadidx <t> ptr0 idx0 mem))
@@ -31905,6 +31967,8 @@ func rewriteValueARM64_OpARM64XORconst_0(v *Value) bool {
 func rewriteValueARM64_OpARM64XORshiftLL_0(v *Value) bool {
 	b := v.Block
 	_ = b
+	typ := &b.Func.Config.Types
+	_ = typ
 	// match: (XORshiftLL (MOVDconst [c]) x [d])
 	// cond:
 	// result: (XORconst [c] (SLLconst <x.Type> x [d]))
@@ -32010,6 +32074,35 @@ func rewriteValueARM64_OpARM64XORshiftLL_0(v *Value) bool {
 		v.AddArg(x)
 		return true
 	}
+	// match: (XORshiftLL <typ.UInt16> [8] (UBFX <typ.UInt16> [arm64BFAuxInt(8, 8)] x) x)
+	// cond:
+	// result: (REV16W x)
+	for {
+		if v.Type != typ.UInt16 {
+			break
+		}
+		if v.AuxInt != 8 {
+			break
+		}
+		_ = v.Args[1]
+		v_0 := v.Args[0]
+		if v_0.Op != OpARM64UBFX {
+			break
+		}
+		if v_0.Type != typ.UInt16 {
+			break
+		}
+		if v_0.AuxInt != arm64BFAuxInt(8, 8) {
+			break
+		}
+		x := v_0.Args[0]
+		if x != v.Args[1] {
+			break
+		}
+		v.reset(OpARM64REV16W)
+		v.AddArg(x)
+		return true
+	}
 	// match: (XORshiftLL [c] (SRLconst x [64-c]) x2)
 	// cond:
 	// result: (EXTRconst [64-c] x2 x)

--- a/test/codegen/mathbits.go
+++ b/test/codegen/mathbits.go
@@ -170,6 +170,7 @@ func ReverseBytes32(n uint32) uint32 {

 func ReverseBytes16(n uint16) uint16 {
 	// amd64:"ROLW"
+	// arm64:"REV16W",-"UBFX",-"ORR"
 	return bits.ReverseBytes16(n)
 }