cmd/compile,test: combine byte loads and stores on ppc64le

CL 74410 added rules to combine consecutive byte loads and stores when the byte order was little endian for ppc64le. This is the corresponding change for bytes that are in big endian order. These rules are all intended for a little endian target arch. This adds new testcases in test/codegen/memcombine.go Fixes #22496 Updates #24242 Benchmark improvement for encoding/binary: name old time/op new time/op delta ReadSlice1000Int32s-16 11.0µs ± 0% 9.0µs ± 0% -17.47% (p=0.029 n=4+4) ReadStruct-16 2.47µs ± 1% 2.48µs ± 0% +0.67% (p=0.114 n=4+4) ReadInts-16 642ns ± 1% 630ns ± 1% -2.02% (p=0.029 n=4+4) WriteInts-16 654ns ± 0% 653ns ± 1% -0.08% (p=0.629 n=4+4) WriteSlice1000Int32s-16 8.75µs ± 0% 8.20µs ± 0% -6.19% (p=0.029 n=4+4) PutUint16-16 1.16ns ± 0% 0.93ns ± 0% -19.83% (p=0.029 n=4+4) PutUint32-16 1.16ns ± 0% 0.93ns ± 0% -19.83% (p=0.029 n=4+4) PutUint64-16 1.85ns ± 0% 0.93ns ± 0% -49.73% (p=0.029 n=4+4) LittleEndianPutUint16-16 1.03ns ± 0% 0.93ns ± 0% -9.71% (p=0.029 n=4+4) LittleEndianPutUint32-16 0.93ns ± 0% 0.93ns ± 0% ~ (all equal) LittleEndianPutUint64-16 0.93ns ± 0% 0.93ns ± 0% ~ (all equal) PutUvarint32-16 43.0ns ± 0% 43.1ns ± 0% +0.12% (p=0.429 n=4+4) PutUvarint64-16 174ns ± 0% 175ns ± 0% +0.29% (p=0.429 n=4+4) Updates made to functions in gcm.go to enable their matching. An existing testcase prevents these functions from being replaced by those in encoding/binary due to import dependencies. Change-Id: Idb3bd1e6e7b12d86cd828fb29cb095848a3e485a Reviewed-on: https://go-review.googlesource.com/98136 Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org>

cmd/compile,test: combine byte loads and stores on ppc64le
CL 74410 added rules to combine consecutive byte loads and stores when the byte order was little endian for ppc64le. This is the corresponding change for bytes that are in big endian order. These rules are all intended for a little endian target arch. This adds new testcases in test/codegen/memcombine.go Fixes #22496 Updates #24242 Benchmark improvement for encoding/binary: name old time/op new time/op delta ReadSlice1000Int32s-16 11.0µs ± 0% 9.0µs ± 0% -17.47% (p=0.029 n=4+4) ReadStruct-16 2.47µs ± 1% 2.48µs ± 0% +0.67% (p=0.114 n=4+4) ReadInts-16 642ns ± 1% 630ns ± 1% -2.02% (p=0.029 n=4+4) WriteInts-16 654ns ± 0% 653ns ± 1% -0.08% (p=0.629 n=4+4) WriteSlice1000Int32s-16 8.75µs ± 0% 8.20µs ± 0% -6.19% (p=0.029 n=4+4) PutUint16-16 1.16ns ± 0% 0.93ns ± 0% -19.83% (p=0.029 n=4+4) PutUint32-16 1.16ns ± 0% 0.93ns ± 0% -19.83% (p=0.029 n=4+4) PutUint64-16 1.85ns ± 0% 0.93ns ± 0% -49.73% (p=0.029 n=4+4) LittleEndianPutUint16-16 1.03ns ± 0% 0.93ns ± 0% -9.71% (p=0.029 n=4+4) LittleEndianPutUint32-16 0.93ns ± 0% 0.93ns ± 0% ~ (all equal) LittleEndianPutUint64-16 0.93ns ± 0% 0.93ns ± 0% ~ (all equal) PutUvarint32-16 43.0ns ± 0% 43.1ns ± 0% +0.12% (p=0.429 n=4+4) PutUvarint64-16 174ns ± 0% 175ns ± 0% +0.29% (p=0.429 n=4+4) Updates made to functions in gcm.go to enable their matching. An existing testcase prevents these functions from being replaced by those in encoding/binary due to import dependencies. Change-Id: Idb3bd1e6e7b12d86cd828fb29cb095848a3e485a Reviewed-on: https://go-review.googlesource.com/98136 Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org>
28edaf45 · Lynn Boger · f31a18de · 28edaf45 · 28edaf45 · 28edaf45
Commit 28edaf45 authored Mar 01, 2018 by Lynn Boger
9 changed files
--- a/src/cmd/compile/internal/ppc64/ssa.go
+++ b/src/cmd/compile/internal/ppc64/ssa.go
@@ -619,35 +619,31 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 		p.To.Reg = ppc64.REGTMP // discard result

 	case ssa.OpPPC64MOVDaddr:
+		switch v.Aux.(type) {
+		default:
+			v.Fatalf("aux in MOVDaddr is of unknown type %T", v.Aux)
+		case nil:
+			// If aux offset and aux int are both 0, and the same
+			// input and output regs are used, no instruction
+			// needs to be generated, since it would just be
+			// addi rx, rx, 0.
+			if v.AuxInt != 0 || v.Args[0].Reg() != v.Reg() {
 				p := s.Prog(ppc64.AMOVD)
 				p.From.Type = obj.TYPE_ADDR
 				p.From.Reg = v.Args[0].Reg()
+				p.From.Offset = v.AuxInt
 				p.To.Type = obj.TYPE_REG
 				p.To.Reg = v.Reg()
+			}

-		var wantreg string
-		// Suspect comment, copied from ARM code
-		// MOVD $sym+off(base), R
-		// the assembler expands it as the following:
-		// - base is SP: add constant offset to SP
-		//               when constant is large, tmp register (R11) may be used
-		// - base is SB: load external address from constant pool (use relocation)
-		switch v.Aux.(type) {
-		default:
-			v.Fatalf("aux is of unknown type %T", v.Aux)
-		case *obj.LSym:
-			wantreg = "SB"
-			gc.AddAux(&p.From, v)
-		case *gc.Node:
-			wantreg = "SP"
+		case *obj.LSym, *gc.Node:
+			p := s.Prog(ppc64.AMOVD)
+			p.From.Type = obj.TYPE_ADDR
+			p.From.Reg = v.Args[0].Reg()
+			p.To.Type = obj.TYPE_REG
+			p.To.Reg = v.Reg()
 			gc.AddAux(&p.From, v)
-		case nil:
-			// No sym, just MOVD $off(SP), R
-			wantreg = "SP"
-			p.From.Offset = v.AuxInt
-		}
-		if reg := v.Args[0].RegName(); reg != wantreg {
-			v.Fatalf("bad reg %s for symbol type %T, want %s", reg, v.Aux, wantreg)
+
 		}

 	case ssa.OpPPC64MOVDconst:
@@ -729,6 +725,20 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 		p.To.Type = obj.TYPE_REG
 		p.To.Reg = v.Reg()

+	case ssa.OpPPC64MOVDBRload, ssa.OpPPC64MOVWBRload, ssa.OpPPC64MOVHBRload:
+		p := s.Prog(v.Op.Asm())
+		p.From.Type = obj.TYPE_MEM
+		p.From.Reg = v.Args[0].Reg()
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = v.Reg()
+
+	case ssa.OpPPC64MOVDBRstore, ssa.OpPPC64MOVWBRstore, ssa.OpPPC64MOVHBRstore:
+		p := s.Prog(v.Op.Asm())
+		p.To.Type = obj.TYPE_MEM
+		p.To.Reg = v.Args[0].Reg()
+		p.From.Type = obj.TYPE_REG
+		p.From.Reg = v.Args[1].Reg()
+
 	case ssa.OpPPC64FMOVDload, ssa.OpPPC64FMOVSload:
 		p := s.Prog(v.Op.Asm())
 		p.From.Type = obj.TYPE_MEM

--- a/src/cmd/compile/internal/ssa/gen/PPC64.rules
+++ b/src/cmd/compile/internal/ssa/gen/PPC64.rules
@@ -678,6 +678,7 @@

 (MOVHZreg y:(MOVHZreg _)) -> y // repeat
 (MOVHZreg y:(MOVBZreg _)) -> y // wide of narrow
+(MOVHZreg y:(MOVHBRload _ _)) -> y

 (MOVHreg y:(MOVHreg _)) -> y // repeat
 (MOVHreg y:(MOVBreg _)) -> y // wide of narrow
@@ -690,6 +691,8 @@
 (MOVWZreg y:(MOVWZreg _)) -> y // repeat
 (MOVWZreg y:(MOVHZreg _)) -> y // wide of narrow
 (MOVWZreg y:(MOVBZreg _)) -> y // wide of narrow
+(MOVWZreg y:(MOVHBRload _ _)) -> y
+(MOVWZreg y:(MOVWBRload _ _)) -> y

 (MOVWreg y:(MOVWreg _)) -> y // repeat
 (MOVWreg y:(MOVHreg _)) -> y // wide of narrow
@@ -870,6 +873,8 @@
 (MOVWstore [off] {sym} ptr (MOV(W|WZ)reg x) mem) -> (MOVWstore [off] {sym} ptr x mem)
 (MOVBstore [off] {sym} ptr (SRWconst (MOV(H|HZ)reg x) [c]) mem) && c <= 8 -> (MOVBstore [off] {sym} ptr (SRWconst <typ.UInt32> x [c]) mem)
 (MOVBstore [off] {sym} ptr (SRWconst (MOV(W|WZ)reg x) [c]) mem) && c <= 24 -> (MOVBstore [off] {sym} ptr (SRWconst <typ.UInt32> x [c]) mem)
+(MOVHBRstore {sym} ptr (MOV(H|HZ|W|WZ)reg x) mem) -> (MOVHBRstore {sym} ptr x mem)
+(MOVWBRstore {sym} ptr (MOV(W|WZ)reg x) mem) -> (MOVWBRstore {sym} ptr x mem)

 // Lose W-widening ops fed to compare-W
 (CMPW x (MOVWreg y)) -> (CMPW x y)
@@ -902,13 +907,14 @@
 (FSUBS (FMULS x y) z) -> (FMSUBS x y z)


-// The following rules are intended to match statements as are found in encoding/binary
-// functions UintXX (load) and PutUintXX (store), combining multi-byte loads and stores
-// into wider loads and stores.
-// Initial implementation handles only little endian loads and stores on little endian
-// targets.
-// TODO implement big endian loads and stores for little endian machines (using byte reverse
-// loads and stores).
+// The following statements are found in encoding/binary functions UintXX (load) and PutUintXX (store)
+// and convert the statements in these functions from multiple single byte loads or stores to
+// the single largest possible load or store.
+// Some are marked big or little endian based on the order in which the bytes are loaded or stored,
+// not on the ordering of the machine. These are intended for little endian machines.
+// To implement for big endian machines, most rules would have to be duplicated but the
+// resulting rule would be reversed, i. e., MOVHZload on little endian would be MOVHBRload on big endian
+// and vice versa.
 // b[0] | b[1]<<8 -> load 16-bit Little endian
 (OR <t> x0:(MOVBZload [i0] {s} p mem)
 	o1:(SL(W|D)const x1:(MOVBZload [i1] {s} p mem) [8]))
@@ -920,9 +926,39 @@
 	&& clobber(x0) && clobber(x1) && clobber(o1)
 	 -> @mergePoint(b,x0,x1) (MOVHZload <t> {s} [i0] p mem)

+// b[0]<<8 | b[1] -> load 16-bit Big endian on Little endian arch.
+// Use byte-reverse indexed load for 2 bytes.
+(OR <t> x0:(MOVBZload [i1] {s} p mem)
+	o1:(SL(W|D)const x1:(MOVBZload [i0] {s} p mem) [8]))
+	&& !config.BigEndian
+	&& i1 == i0+1
+	&& x0.Uses ==1 && x1.Uses == 1
+	&& o1.Uses == 1
+	&& mergePoint(b, x0, x1) != nil
+	&& clobber(x0) && clobber(x1) && clobber(o1)
+	  -> @mergePoint(b,x0,x1) (MOVHBRload <t> (MOVDaddr <typ.Uintptr> [i0] {s} p) mem)
+
+// b[0]<<n+8 | b[1]<<n -> load 16-bit Big endian (where n%8== 0)
+// Use byte-reverse indexed load for 2 bytes,
+// then shift left to the correct position. Used to match subrules
+// from longer rules.
+(OR <t> s0:(SL(W|D)const x0:(MOVBZload [i1] {s} p mem) [n1])
+	s1:(SL(W|D)const x1:(MOVBZload [i0] {s} p mem) [n2]))
+	&& !config.BigEndian
+	&& i1 == i0+1
+	&& n1%8 == 0
+	&& n2 == n1+8
+	&& x0.Uses == 1 && x1.Uses == 1
+	&& s0.Uses == 1 && s1.Uses == 1
+	&& mergePoint(b, x0, x1) != nil
+	&& clobber(x0) && clobber(x1) && clobber(s0) && clobber(s1)
+	  -> @mergePoint(b,x0,x1) (SLDconst <t> (MOVHBRload <t> (MOVDaddr <typ.Uintptr> [i0] {s} p) mem) [n1])
+
 // b[0] | b[1]<<8 | b[2]<<16 | b[3]<<24 -> load 32-bit Little endian
+// Use byte-reverse indexed load for 4 bytes.
 (OR <t> s1:(SL(W|D)const x2:(MOVBZload [i3] {s} p mem) [24])
-	o0:(OR <t> s0:(SL(W|D)const x1:(MOVBZload [i2] {s} p mem) [16]) x0:(MOVHZload [i0] {s} p mem)))
+	o0:(OR <t> s0:(SL(W|D)const x1:(MOVBZload [i2] {s} p mem) [16])
+	x0:(MOVHZload [i0] {s} p mem)))
 	&& !config.BigEndian
 	&& i2 == i0+2
 	&& i3 == i0+3
@@ -935,9 +971,81 @@
 	&& clobber(o0)
 	 -> @mergePoint(b,x0,x1,x2) (MOVWZload <t> {s} [i0] p mem)

+// b[0]<<24 | b[1]<<16 | b[2]<<8 | b[3] -> load 32-bit Big endian order on Little endian arch
+// Use byte-reverse indexed load for 4 bytes with computed address.
+// Could be used to match subrules of a longer rule.
+(OR <t> s1:(SL(W|D)const x2:(MOVBZload [i0] {s} p mem) [24])
+	o0:(OR <t> s0:(SL(W|D)const x1:(MOVBZload [i1] {s} p mem) [16])
+	x0:(MOVHBRload <t> (MOVDaddr <typ.Uintptr> [i2] {s} p) mem)))
+	&& !config.BigEndian
+	&& i1 == i0+1
+	&& i2 == i0+2
+	&& x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1
+	&& o0.Uses == 1
+	&& s0.Uses == 1 && s1.Uses == 1
+	&& mergePoint(b, x0, x1, x2) != nil
+	&& clobber(x0) && clobber(x1) && clobber(x2)
+	&& clobber(s0) && clobber(s1)
+	&& clobber(o0)
+	  -> @mergePoint(b,x0,x1,x2) (MOVWBRload <t> (MOVDaddr <typ.Uintptr> [i0] {s} p) mem)
+
+// b[3] | b[2]<<8 | b[1]<<16 | b[0]<<24 -> load 32-bit Big endian order on Little endian arch
+// Use byte-reverse indexed load for 4 bytes with computed address.
+// Could be used to match subrules of a longer rule.
+(OR <t> x0:(MOVBZload [i3] {s} p mem)
+	o0:(OR <t> s0:(SL(W|D)const x1:(MOVBZload [i2] {s} p mem) [8])
+	s1:(SL(W|D)const x2:(MOVHBRload <t> (MOVDaddr <typ.Uintptr> [i0] {s} p) mem) [16])))
+	&& !config.BigEndian
+	&& i2 == i0+2
+	&& i3 == i0+3
+	&& x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1
+	&& o0.Uses == 1
+	&& s0.Uses == 1 && s1.Uses == 1
+	&& mergePoint(b, x0, x1, x2) != nil
+	&& clobber(x0) && clobber(x1) && clobber(x2)
+	&& clobber(s0) && clobber(s1)
+	&& clobber(o0)
+	  -> @mergePoint(b,x0,x1,x2) (MOVWBRload <t> (MOVDaddr <typ.Uintptr> [i0] {s} p) mem)
+
+// b[0]<<56 | b[1]<<48 | b[2]<<40 | b[3]<<32 -> load 32-bit Big endian order on Little endian arch
+// Use byte-reverse indexed load to for 4 bytes with computed address.
+// Used to match longer rules.
+(OR <t> s2:(SLDconst x2:(MOVBZload [i3] {s} p mem) [32])
+	o0:(OR <t> s1:(SLDconst x1:(MOVBZload [i2] {s} p mem) [40])
+	s0:(SLDconst x0:(MOVHBRload <t> (MOVDaddr <typ.Uintptr> [i0] {s} p) mem) [48])))
+	&& !config.BigEndian
+	&& i2 == i0+2
+	&& i3 == i0+3
+	&& x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1
+	&& o0.Uses == 1
+	&& s0.Uses == 1 && s1.Uses == 1 && s2.Uses == 1
+	&& mergePoint(b, x0, x1, x2) != nil
+	&& clobber(x0) && clobber(x1) && clobber(x2)
+	&& clobber(s0) && clobber(s1) && clobber(s2)
+	&& clobber(o0)
+	  -> @mergePoint(b,x0,x1,x2) (SLDconst <t> (MOVWBRload <t> (MOVDaddr <typ.Uintptr> [i0] {s} p) mem) [32])
+
+// b[3]<<32 | b[2]<<40 | b[1]<<48 | b[0]<<56 -> load 32-bit Big endian order on Little endian arch
+// Use byte-reverse indexed load for 4 bytes with constant address.
+// Used to match longer rules.
+(OR <t> s2:(SLDconst x2:(MOVBZload [i0] {s} p mem) [56])
+        o0:(OR <t> s1:(SLDconst x1:(MOVBZload [i1] {s} p mem) [48])
+        s0:(SLDconst x0:(MOVHBRload <t> (MOVDaddr <typ.Uintptr> [i2] {s} p) mem) [32])))
+        && !config.BigEndian
+        && i1 == i0+1
+        && i2 == i0+2
+        && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1
+        && o0.Uses == 1
+        && s0.Uses == 1 && s1.Uses == 1 && s2.Uses == 1
+        && mergePoint(b, x0, x1, x2) != nil
+        && clobber(x0) && clobber(x1) && clobber(x2)
+        && clobber(s0) && clobber(s1) && clobber(s2)
+        && clobber(o0)
+          -> @mergePoint(b,x0,x1,x2) (SLDconst <t> (MOVWBRload <t> (MOVDaddr <typ.Uintptr> [i0] {s} p) mem) [32])
+
 // b[0] | b[1]<<8 | b[2]<<16 | b[3]<<24 | b[4] <<32 | b[5]<<40 | b[6]<<48 | b[7]<<56 -> load 64-bit Little endian
-// Note: long rules with commutative ops will result in very large functions in rewritePPC64,
-// so shorter rules which make use of previously defined rules are preferred.
+// Rules with commutative ops and many operands will result in extremely large functions in rewritePPC64,
+// so matching shorter previously defined subrules is important.
 // Offset must be multiple of 4 for MOVD
 (OR <t> s6:(SLDconst x7:(MOVBZload [i7] {s} p mem) [56])
 	o5:(OR <t> s5:(SLDconst x6:(MOVBZload [i6] {s} p mem) [48])
@@ -959,10 +1067,56 @@
 	&& clobber(o3) && clobber(o4) && clobber(o5)
 	  -> @mergePoint(b,x0,x4,x5,x6,x7) (MOVDload <t> {s} [i0] p mem)

+// b[7] | b[6]<<8 | b[5]<<16 | b[4]<<24 | b[3]<<32 | b[2]<<40 | b[1]<<48 | b[0]<<56 load 64-bit Big endian ordered bytes on Little endian arch
+// Use byte-reverse indexed load of 8 bytes.
+// Rules with commutative ops and many operands can result in extremely large functions in rewritePPC64,
+// so matching shorter previously defined subrules is important.
+(OR <t> s0:(SLDconst x0:(MOVBZload [i0] {s} p mem) [56])
+	o0:(OR <t> s1:(SLDconst x1:(MOVBZload [i1] {s} p mem) [48])
+	o1:(OR <t> s2:(SLDconst x2:(MOVBZload [i2] {s} p mem) [40])
+	o2:(OR <t> s3:(SLDconst x3:(MOVBZload [i3] {s} p mem) [32])
+	x4:(MOVWBRload <t> (MOVDaddr <typ.Uintptr> [i4] p) mem)))))
+	&& !config.BigEndian
+	&& i1 == i0+1
+	&& i2 == i0+2
+	&& i3 == i0+3
+	&& i4 == i0+4
+	&& x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1 && x4.Uses == 1
+	&& o0.Uses == 1 && o1.Uses == 1 && o2.Uses == 1
+	&& s0.Uses == 1 && s1.Uses == 1 && s2.Uses == 1 && s3.Uses == 1
+	&& mergePoint(b, x0, x1, x2, x3, x4) != nil
+	&& clobber(x0) && clobber(x1) && clobber(x2) && clobber(x3) && clobber(x4)
+	&& clobber(o0) && clobber(o1) && clobber(o2)
+	&& clobber(s0) && clobber(s1) && clobber(s2) && clobber(s3)
+	  -> @mergePoint(b,x0,x1,x2,x3,x4) (MOVDBRload <t> (MOVDaddr <typ.Uintptr> [i0] {s} p) mem)
+
+// b[0]<<56 | b[1]<<48 | b[2]<<40 | b[3]<<32 | b[4]<<24 | b[5]<<16 | b[6]<<8 | b[7] -> load 64-bit Big endian ordered bytes on Little endian arch
+// Use byte-reverse indexed load of 8 bytes.
+// Rules with commutative ops and many operands can result in extremely large functions in rewritePPC64,
+// so matching shorter previously defined subrules is important.
+(OR <t> x7:(MOVBZload [i7] {s} p mem)
+	o5:(OR <t> s6:(SLDconst x6:(MOVBZload [i6] {s} p mem) [8])
+	o4:(OR <t> s5:(SLDconst x5:(MOVBZload [i5] {s} p mem) [16])
+	o3:(OR <t> s4:(SLDconst x4:(MOVBZload [i4] {s} p mem) [24])
+	s0:(SL(W|D)const x3:(MOVWBRload <t> (MOVDaddr <typ.Uintptr> [i0] {s} p) mem) [32])))))
+	&& !config.BigEndian
+	&& i4 == i0+4
+	&& i5 == i0+5
+	&& i6 == i0+6
+	&& i7 == i0+7
+	&& x3.Uses == 1 && x4.Uses == 1 && x5.Uses == 1 && x6.Uses == 1 && x7.Uses == 1
+	&& o3.Uses == 1 && o4.Uses == 1 && o5.Uses == 1
+	&& s0.Uses == 1 && s4.Uses == 1 && s5.Uses == 1 && s6.Uses == 1
+	&& mergePoint(b, x3, x4, x5, x6, x7) != nil
+	&& clobber(x3) && clobber(x4) && clobber(x5) && clobber(x6) && clobber(x7)
+	&& clobber(o3) && clobber(o4) && clobber(o5)
+	&& clobber(s0) && clobber(s4) && clobber(s5) && clobber(s6)
+	-> @mergePoint(b,x3,x4,x5,x6,x7) (MOVDBRload <t> (MOVDaddr <typ.Uintptr> [i0] {s} p) mem)
+
 // 2 byte store Little endian as in:
 //      b[0] = byte(v >> 16)
 //      b[1] = byte(v >> 24)
-// Added mainly to use when matching longer rules below
+// Added for use in matching longer rules.
 (MOVBstore [i1] {s} p (SR(W|D)const w [24])
        x0:(MOVBstore [i0] {s} p (SR(W|D)const w [16]) mem))
        && !config.BigEndian
@@ -993,6 +1147,38 @@
 	&& clobber(x0)
 	  -> (MOVWstore [i0] {s} p w mem)

+// 4 byte store Big endian as in:
+//     b[0] = byte(v >> 24)
+//     b[1] = byte(v >> 16)
+//     b[2] = byte(v >> 8)
+//     b[3] = byte(v)
+// Use byte-reverse indexed 4 byte store.
+(MOVBstore [i3] {s} p w
+	x0:(MOVBstore [i2] {s} p (SRWconst w [8])
+	x1:(MOVBstore [i1] {s} p (SRWconst w [16])
+	x2:(MOVBstore [i0] {s} p (SRWconst w [24]) mem))))
+	&& !config.BigEndian
+	&& x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1
+	&& i1 == i0+1 && i2 == i0+2 && i3 == i0+3
+	&& clobber(x0) && clobber(x1) && clobber(x2)
+	  -> (MOVWBRstore (MOVDaddr <typ.Uintptr> [i0] {s} p) w mem)
+
+// The 2 byte store appears after the 4 byte store so that the
+// match for the 2 byte store is not done first.
+// If the 4 byte store is based on the 2 byte store then there are 
+// variations on the MOVDaddr subrule that would require additional
+// rules to be written.
+
+// 2 byte store Big endian as in:
+//      b[0] = byte(v >> 8)
+//      b[1] = byte(v)
+(MOVBstore [i1] {s} p w x0:(MOVBstore [i0] {s} p (SRWconst w [8]) mem))
+	&& !config.BigEndian
+	&& x0.Uses == 1
+	&& i1 == i0+1
+	&& clobber(x0)
+	  -> (MOVHBRstore (MOVDaddr <typ.Uintptr> [i0] {s} p) w mem)
+
 // 8 byte store Little endian as in:
 //	b[0] = byte(v)
 //	b[1] = byte(v >> 8)
@@ -1015,3 +1201,27 @@
 	&& i4 == i0+4 && i5 == i0+5 && i6 == i0+6 && i7 == i0+7
 	&& clobber(x0) && clobber(x1) && clobber(x2) && clobber(x3)
 	  -> (MOVDstore [i0] {s} p w mem)
+
+// 8 byte store Big endian as in:
+//      b[0] = byte(v >> 56)
+//      b[1] = byte(v >> 48)
+//      b[2] = byte(v >> 40)
+//      b[3] = byte(v >> 32)
+//      b[4] = byte(v >> 24)
+//      b[5] = byte(v >> 16)
+//      b[6] = byte(v >> 8)
+//      b[7] = byte(v)
+// Use byte-reverse indexed 8 byte store.
+(MOVBstore [i7] {s} p w
+        x0:(MOVBstore [i6] {s} p (SRDconst w [8])
+        x1:(MOVBstore [i5] {s} p (SRDconst w [16])
+        x2:(MOVBstore [i4] {s} p (SRDconst w [24])
+        x3:(MOVBstore [i3] {s} p (SRDconst w [32])
+        x4:(MOVBstore [i2] {s} p (SRDconst w [40])
+        x5:(MOVBstore [i1] {s} p (SRDconst w [48])
+        x6:(MOVBstore [i0] {s} p (SRDconst w [56]) mem))))))))
+        && !config.BigEndian
+        && x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1 && x4.Uses == 1 && x5.Uses == 1 && x6.Uses == 1
+        && i1 == i0+1 && i2 == i0+2 && i3 == i0+3 && i4 == i0+4 && i5 == i0+5 && i6 == i0+6 && i7 == i0+7
+        && clobber(x0) && clobber(x1) && clobber(x2) && clobber(x3) && clobber(x4) && clobber(x5) && clobber(x6)
+          -> (MOVDBRstore (MOVDaddr <typ.Uintptr> [i0] {s} p) w mem)
--- a/src/cmd/compile/internal/ssa/gen/PPC64Ops.go
+++ b/src/cmd/compile/internal/ssa/gen/PPC64Ops.go
@@ -265,28 +265,49 @@ func init() {
 		{name: "MOVHZreg", argLength: 1, reg: gp11, asm: "MOVHZ", typ: "Int64"}, // zero extend uint16 to uint64
 		{name: "MOVWreg", argLength: 1, reg: gp11, asm: "MOVW", typ: "Int64"},   // sign extend int32 to int64
 		{name: "MOVWZreg", argLength: 1, reg: gp11, asm: "MOVWZ", typ: "Int64"}, // zero extend uint32 to uint64
-		{name: "MOVBZload", argLength: 2, reg: gpload, asm: "MOVBZ", aux: "SymOff", typ: "UInt8", faultOnNilArg0: true, symEffect: "Read"},  // zero extend uint8 to uint64
-		{name: "MOVHload", argLength: 2, reg: gpload, asm: "MOVH", aux: "SymOff", typ: "Int16", faultOnNilArg0: true, symEffect: "Read"},    // sign extend int16 to int64
-		{name: "MOVHZload", argLength: 2, reg: gpload, asm: "MOVHZ", aux: "SymOff", typ: "UInt16", faultOnNilArg0: true, symEffect: "Read"}, // zero extend uint16 to uint64
-		{name: "MOVWload", argLength: 2, reg: gpload, asm: "MOVW", aux: "SymOff", typ: "Int32", faultOnNilArg0: true, symEffect: "Read"},    // sign extend int32 to int64
-		{name: "MOVWZload", argLength: 2, reg: gpload, asm: "MOVWZ", aux: "SymOff", typ: "UInt32", faultOnNilArg0: true, symEffect: "Read"}, // zero extend uint32 to uint64
-		{name: "MOVDload", argLength: 2, reg: gpload, asm: "MOVD", aux: "SymOff", typ: "Int64", faultOnNilArg0: true, symEffect: "Read"},
-
-		{name: "FMOVDload", argLength: 2, reg: fpload, asm: "FMOVD", aux: "SymOff", typ: "Float64", faultOnNilArg0: true, symEffect: "Read"},
-		{name: "FMOVSload", argLength: 2, reg: fpload, asm: "FMOVS", aux: "SymOff", typ: "Float32", faultOnNilArg0: true, symEffect: "Read"},
-		{name: "MOVBstore", argLength: 3, reg: gpstore, asm: "MOVB", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"},
-		{name: "MOVHstore", argLength: 3, reg: gpstore, asm: "MOVH", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"},
-		{name: "MOVWstore", argLength: 3, reg: gpstore, asm: "MOVW", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"},
-		{name: "MOVDstore", argLength: 3, reg: gpstore, asm: "MOVD", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"},
-		{name: "FMOVDstore", argLength: 3, reg: fpstore, asm: "FMOVD", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"},
-		{name: "FMOVSstore", argLength: 3, reg: fpstore, asm: "FMOVS", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"},
-
-		{name: "MOVBstorezero", argLength: 2, reg: gpstorezero, asm: "MOVB", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store zero byte to arg0+aux.  arg1=mem
-		{name: "MOVHstorezero", argLength: 2, reg: gpstorezero, asm: "MOVH", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store zero 2 bytes to ...
-		{name: "MOVWstorezero", argLength: 2, reg: gpstorezero, asm: "MOVW", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store zero 4 bytes to ...
-		{name: "MOVDstorezero", argLength: 2, reg: gpstorezero, asm: "MOVD", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store zero 8 bytes to ...
-
-		{name: "MOVDaddr", argLength: 1, reg: regInfo{inputs: []regMask{sp | sb}, outputs: []regMask{gp}}, aux: "SymOff", asm: "MOVD", rematerializeable: true, symEffect: "Addr"}, // arg0 + auxInt + aux.(*gc.Sym), arg0=SP/SB
+
+		// Load bytes in the endian order of the arch from arg0+aux+auxint into a 64 bit register.
+		{name: "MOVBZload", argLength: 2, reg: gpload, asm: "MOVBZ", aux: "SymOff", typ: "UInt8", faultOnNilArg0: true, symEffect: "Read"},  // load byte zero extend
+		{name: "MOVHload", argLength: 2, reg: gpload, asm: "MOVH", aux: "SymOff", typ: "Int16", faultOnNilArg0: true, symEffect: "Read"},    // load 2 bytes sign extend
+		{name: "MOVHZload", argLength: 2, reg: gpload, asm: "MOVHZ", aux: "SymOff", typ: "UInt16", faultOnNilArg0: true, symEffect: "Read"}, // load 2 bytes zero extend
+		{name: "MOVWload", argLength: 2, reg: gpload, asm: "MOVW", aux: "SymOff", typ: "Int32", faultOnNilArg0: true, symEffect: "Read"},    // load 4 bytes sign extend
+		{name: "MOVWZload", argLength: 2, reg: gpload, asm: "MOVWZ", aux: "SymOff", typ: "UInt32", faultOnNilArg0: true, symEffect: "Read"}, // load 4 bytes zero extend
+		{name: "MOVDload", argLength: 2, reg: gpload, asm: "MOVD", aux: "SymOff", typ: "Int64", faultOnNilArg0: true, symEffect: "Read"},    // load 8 bytes
+
+		// Load bytes in reverse endian order of the arch from arg0 into a 64 bit register, all zero extend.
+		// The generated instructions are indexed loads with no offset field in the instruction so the aux fields are not used.
+		// In these cases the index register field is set to 0 and the full address is in the base register.
+		{name: "MOVDBRload", argLength: 2, reg: gpload, asm: "MOVDBR", aux: "SymOff", typ: "Int64", faultOnNilArg0: true, symEffect: "Read"}, // load 8 bytes reverse order
+		{name: "MOVWBRload", argLength: 2, reg: gpload, asm: "MOVWBR", aux: "SymOff", typ: "Int32", faultOnNilArg0: true, symEffect: "Read"}, // load 4 bytes zero extend reverse order
+		{name: "MOVHBRload", argLength: 2, reg: gpload, asm: "MOVHBR", aux: "SymOff", typ: "Int16", faultOnNilArg0: true, symEffect: "Read"}, // load 2 bytes zero extend reverse order
+
+		// Store bytes in the reverse endian order of the arch into arg0.
+		// These are indexes stores with no offset field in the instruction so the aux fields are not used.
+		{name: "MOVDBRstore", argLength: 3, reg: gpstore, asm: "MOVDBR", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 8 bytes reverse order
+		{name: "MOVWBRstore", argLength: 3, reg: gpstore, asm: "MOVWBR", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 4 bytes reverse order
+		{name: "MOVHBRstore", argLength: 3, reg: gpstore, asm: "MOVHBR", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 2 bytes reverse order
+
+		// Floating point loads from arg0+aux+auxint
+		{name: "FMOVDload", argLength: 2, reg: fpload, asm: "FMOVD", aux: "SymOff", typ: "Float64", faultOnNilArg0: true, symEffect: "Read"}, // load double float
+		{name: "FMOVSload", argLength: 2, reg: fpload, asm: "FMOVS", aux: "SymOff", typ: "Float32", faultOnNilArg0: true, symEffect: "Read"}, // load single float
+
+		// Store bytes in the endian order of the arch into arg0+aux+auxint
+		{name: "MOVBstore", argLength: 3, reg: gpstore, asm: "MOVB", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store byte
+		{name: "MOVHstore", argLength: 3, reg: gpstore, asm: "MOVH", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 2 bytes
+		{name: "MOVWstore", argLength: 3, reg: gpstore, asm: "MOVW", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 4 bytes
+		{name: "MOVDstore", argLength: 3, reg: gpstore, asm: "MOVD", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 8 bytes
+
+		// Store floating point value into arg0+aux+auxint
+		{name: "FMOVDstore", argLength: 3, reg: fpstore, asm: "FMOVD", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store double flot
+		{name: "FMOVSstore", argLength: 3, reg: fpstore, asm: "FMOVS", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store single float
+
+		// The following ops store 0 into arg0+aux+auxint arg1=mem
+		{name: "MOVBstorezero", argLength: 2, reg: gpstorezero, asm: "MOVB", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store zero 1 byte
+		{name: "MOVHstorezero", argLength: 2, reg: gpstorezero, asm: "MOVH", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store zero 2 bytes
+		{name: "MOVWstorezero", argLength: 2, reg: gpstorezero, asm: "MOVW", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store zero 4 bytes
+		{name: "MOVDstorezero", argLength: 2, reg: gpstorezero, asm: "MOVD", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store zero 8 bytes
+
+		{name: "MOVDaddr", argLength: 1, reg: regInfo{inputs: []regMask{sp | sb | gp}, outputs: []regMask{gp}}, aux: "SymOff", asm: "MOVD", rematerializeable: true, symEffect: "Addr"}, // arg0 + auxInt + aux.(*gc.Sym), arg0=SP/SB/GP

 		{name: "MOVDconst", argLength: 0, reg: gp01, aux: "Int64", asm: "MOVD", typ: "Int64", rematerializeable: true}, //
 		{name: "FMOVDconst", argLength: 0, reg: fp01, aux: "Float64", asm: "FMOVD", rematerializeable: true},           //

--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
@@ -1586,6 +1586,12 @@ const (
 	OpPPC64MOVWload
 	OpPPC64MOVWZload
 	OpPPC64MOVDload
+	OpPPC64MOVDBRload
+	OpPPC64MOVWBRload
+	OpPPC64MOVHBRload
+	OpPPC64MOVDBRstore
+	OpPPC64MOVWBRstore
+	OpPPC64MOVHBRstore
 	OpPPC64FMOVDload
 	OpPPC64FMOVSload
 	OpPPC64MOVBstore
@@ -20957,6 +20963,96 @@ var opcodeTable = [...]opInfo{
 			},
 		},
 	},
+	{
+		name:           "MOVDBRload",
+		auxType:        auxSymOff,
+		argLen:         2,
+		faultOnNilArg0: true,
+		symEffect:      SymRead,
+		asm:            ppc64.AMOVDBR,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 1073733630}, // SP SB R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29
+			},
+			outputs: []outputInfo{
+				{0, 1073733624}, // R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29
+			},
+		},
+	},
+	{
+		name:           "MOVWBRload",
+		auxType:        auxSymOff,
+		argLen:         2,
+		faultOnNilArg0: true,
+		symEffect:      SymRead,
+		asm:            ppc64.AMOVWBR,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 1073733630}, // SP SB R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29
+			},
+			outputs: []outputInfo{
+				{0, 1073733624}, // R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29
+			},
+		},
+	},
+	{
+		name:           "MOVHBRload",
+		auxType:        auxSymOff,
+		argLen:         2,
+		faultOnNilArg0: true,
+		symEffect:      SymRead,
+		asm:            ppc64.AMOVHBR,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 1073733630}, // SP SB R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29
+			},
+			outputs: []outputInfo{
+				{0, 1073733624}, // R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29
+			},
+		},
+	},
+	{
+		name:           "MOVDBRstore",
+		auxType:        auxSymOff,
+		argLen:         3,
+		faultOnNilArg0: true,
+		symEffect:      SymWrite,
+		asm:            ppc64.AMOVDBR,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 1073733630}, // SP SB R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29
+				{1, 1073733630}, // SP SB R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29
+			},
+		},
+	},
+	{
+		name:           "MOVWBRstore",
+		auxType:        auxSymOff,
+		argLen:         3,
+		faultOnNilArg0: true,
+		symEffect:      SymWrite,
+		asm:            ppc64.AMOVWBR,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 1073733630}, // SP SB R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29
+				{1, 1073733630}, // SP SB R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29
+			},
+		},
+	},
+	{
+		name:           "MOVHBRstore",
+		auxType:        auxSymOff,
+		argLen:         3,
+		faultOnNilArg0: true,
+		symEffect:      SymWrite,
+		asm:            ppc64.AMOVHBR,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 1073733630}, // SP SB R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29
+				{1, 1073733630}, // SP SB R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29
+			},
+		},
+	},
 	{
 		name:           "FMOVDload",
 		auxType:        auxSymOff,
@@ -21134,7 +21230,7 @@ var opcodeTable = [...]opInfo{
 		asm:               ppc64.AMOVD,
 		reg: regInfo{
 			inputs: []inputInfo{
-				{0, 6}, // SP SB
+				{0, 1073733630}, // SP SB R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29
 			},
 			outputs: []outputInfo{
 				{0, 1073733624}, // R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29

--- a/src/cmd/compile/internal/ssa/regalloc.go
+++ b/src/cmd/compile/internal/ssa/regalloc.go
@@ -375,7 +375,7 @@ func (s *regAllocState) allocReg(mask regMask, v *Value) register {
 	mask &= s.allocatable
 	mask &^= s.nospill
 	if mask == 0 {
-		s.f.Fatalf("no register available for %s", v)
+		s.f.Fatalf("no register available for %s", v.LongString())
 	}

 	// Pick an unused register if one is available.

--- a/src/cmd/compile/internal/ssa/rewritePPC64.go
+++ b/src/cmd/compile/internal/ssa/rewritePPC64.go
--- a/src/cmd/internal/obj/ppc64/asm9.go
+++ b/src/cmd/internal/obj/ppc64/asm9.go
@@ -4973,6 +4973,8 @@ func (c *ctxt9) opstorex(a obj.As) uint32 {
 		return OPVCC(31, 661, 0, 0) /* stswx */
 	case AMOVWBR:
 		return OPVCC(31, 662, 0, 0) /* stwbrx */
+	case AMOVDBR:
+		return OPVCC(31, 660, 0, 0) /* stdbrx */
 	case ASTBCCC:
 		return OPVCC(31, 694, 0, 1) /* stbcx. */
 	case ASTWCCC:

--- a/src/crypto/cipher/gcm.go
+++ b/src/crypto/cipher/gcm.go
@@ -413,6 +413,7 @@ func (g *gcm) auth(out, ciphertext, additionalData []byte, tagMask *[gcmTagSize]
 }

 func getUint64(data []byte) uint64 {
+	_ = data[7] // bounds check hint to compiler; see golang.org/issue/14808
 	r := uint64(data[0])<<56 |
 		uint64(data[1])<<48 |
 		uint64(data[2])<<40 |
@@ -425,6 +426,7 @@ func getUint64(data []byte) uint64 {
 }

 func putUint64(out []byte, v uint64) {
+	_ = out[7] // bounds check hint to compiler; see golang.org/issue/14808
 	out[0] = byte(v >> 56)
 	out[1] = byte(v >> 48)
 	out[2] = byte(v >> 40)

--- a/test/codegen/memcombine.go
+++ b/test/codegen/memcombine.go
@@ -69,6 +69,7 @@ func load_be64(b []byte) {
 	// amd64:`BSWAPQ`
 	// s390x:`MOVD\s\(.*\),`
 	// arm64:`REV`,`MOVD\s\(R[0-9]+\),`,-`MOV[BHW]`,-`REVW`,-`REV16W`
+	// ppc64le:`MOVDBR`
 	sink64 = binary.BigEndian.Uint64(b)
 }

@@ -76,6 +77,7 @@ func load_be64_idx(b []byte, idx int) {
 	// amd64:`BSWAPQ`
 	// s390x:`MOVD\s\(.*\)\(.*\*1\),`
 	// arm64:`REV`,`MOVD\s\(R[0-9]+\)\(R[0-9]+\),`,-`MOV[WHB]`,-`REVW`,-`REV16W`
+	// ppc64le:`MOVDBR`
 	sink64 = binary.BigEndian.Uint64(b[idx:])
 }

@@ -83,6 +85,7 @@ func load_be32(b []byte) {
 	// amd64:`BSWAPL`
 	// s390x:`MOVWZ\s\(.*\),`
 	// arm64:`REVW`,`MOVWU\s\(R[0-9]+\),`,-`MOV[BH]`,-`REV16W`
+	// ppc64le:`MOVWBR`
 	sink32 = binary.BigEndian.Uint32(b)
 }

@@ -90,18 +93,21 @@ func load_be32_idx(b []byte, idx int) {
 	// amd64:`BSWAPL`
 	// s390x:`MOVWZ\s\(.*\)\(.*\*1\),`
 	// arm64:`REVW`,`MOVWU\s\(R[0-9]+\)\(R[0-9]+\),`,-`MOV[HB]`,-`REV16W`
+	// ppc64le:`MOVWBR`
 	sink32 = binary.BigEndian.Uint32(b[idx:])
 }

 func load_be16(b []byte) {
 	// amd64:`ROLW\s\$8`
 	// arm64: `REV16W`,`MOVHU\s\(R[0-9]+\),`,-`MOVB`
+	// ppc64le:`MOVHBR`
 	sink16 = binary.BigEndian.Uint16(b)
 }

 func load_be16_idx(b []byte, idx int) {
 	// amd64:`ROLW\s\$8`
 	// arm64: `REV16W`,`MOVHU\s\(R[0-9]+\)\(R[0-9]+\),`,-`MOVB`
+	// ppc64le:`MOVHBR`
 	sink16 = binary.BigEndian.Uint16(b[idx:])
 }

@@ -203,50 +209,56 @@ func store_le32_idx(b []byte, idx int) {
 func store_le16(b []byte) {
 	// amd64:`MOVW\s`
 	// arm64:`MOVH`,-`MOVB`
-	// ppc64le(DISABLED):`MOVH\s`
+	// ppc64le:`MOVH\s`
 	binary.LittleEndian.PutUint16(b, sink16)
 }

 func store_le16_idx(b []byte, idx int) {
 	// amd64:`MOVW\s`
 	// arm64:`MOVH\sR[0-9]+,\s\(R[0-9]+\)\(R[0-9]+\)`,-`MOVB`
-	// ppc64le(DISABLED):`MOVH\s`
+	// ppc64le:`MOVH\s`
 	binary.LittleEndian.PutUint16(b[idx:], sink16)
 }

 func store_be64(b []byte) {
 	// amd64:`BSWAPQ`,-`SHR.`
 	// arm64:`MOVD`,`REV`,-`MOV[WBH]`,-`REVW`,-`REV16W`
+	// ppc64le:`MOVDBR`
 	binary.BigEndian.PutUint64(b, sink64)
 }

 func store_be64_idx(b []byte, idx int) {
 	// amd64:`BSWAPQ`,-`SHR.`
 	// arm64:`REV`,`MOVD\sR[0-9]+,\s\(R[0-9]+\)\(R[0-9]+\)`,-`MOV[BHW]`,-`REV16W`,-`REVW`
+	// ppc64le:`MOVDBR`
 	binary.BigEndian.PutUint64(b[idx:], sink64)
 }

 func store_be32(b []byte) {
 	// amd64:`BSWAPL`,-`SHR.`
 	// arm64:`MOVW`,`REVW`,-`MOV[BH]`,-`REV16W`
+	// ppc64le:`MOVWBR`
 	binary.BigEndian.PutUint32(b, sink32)
 }

 func store_be32_idx(b []byte, idx int) {
 	// amd64:`BSWAPL`,-`SHR.`
 	// arm64:`REVW`,`MOVW\sR[0-9]+,\s\(R[0-9]+\)\(R[0-9]+\)`,-`MOV[BH]`,-`REV16W`
+	// ppc64le:`MOVWBR`
 	binary.BigEndian.PutUint32(b[idx:], sink32)
 }

 func store_be16(b []byte) {
 	// amd64:`ROLW\s\$8`,-`SHR.`
 	// arm64:`MOVH`,`REV16W`,-`MOVB`
+	// ppc64le:`MOVHBR`
 	binary.BigEndian.PutUint16(b, sink16)
 }

 func store_be16_idx(b []byte, idx int) {
 	// amd64:`ROLW\s\$8`,-`SHR.`
 	// arm64:`MOVH\sR[0-9]+,\s\(R[0-9]+\)\(R[0-9]+\)`,`REV16W`,-`MOVB`
+	// ppc64le:`MOVHBR`
 	binary.BigEndian.PutUint16(b[idx:], sink16)
 }