cmd/compile: improve s390x sign/zero extension removal

This CL gets rid of the MOVDreg and MOVDnop SSA operations on s390x. They were originally inserted to help avoid situations where a sign/zero extension was elided but a spill invalidated the optimization. It's not really clear we need to do this though (amd64 doesn't have these ops for example) so long as we are careful when removing sign/zero extensions. Also, the MOVDreg technique doesn't work if the register is spilled before the MOVDreg op (I haven't seen that in practice). Removing these ops reduces the complexity of the rules and also allows us to unblock optimizations. For example, the compiler can now merge the loads in binary.{Big,Little}Endian.PutUint16 which it wasn't able to do before. This CL reduces the size of the .text section in the go tool by about 4.7KB (0.09%). Change-Id: Icaddae7f2e4f9b2debb6fabae845adb3f73b41db Reviewed-on: https://go-review.googlesource.com/c/go/+/173897 Run-TryBot: Michael Munday <mike.munday@ibm.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Cherry Zhang <cherryyz@google.com>

cmd/compile: improve s390x sign/zero extension removal
This CL gets rid of the MOVDreg and MOVDnop SSA operations on s390x. They were originally inserted to help avoid situations where a sign/zero extension was elided but a spill invalidated the optimization. It's not really clear we need to do this though (amd64 doesn't have these ops for example) so long as we are careful when removing sign/zero extensions. Also, the MOVDreg technique doesn't work if the register is spilled before the MOVDreg op (I haven't seen that in practice). Removing these ops reduces the complexity of the rules and also allows us to unblock optimizations. For example, the compiler can now merge the loads in binary.{Big,Little}Endian.PutUint16 which it wasn't able to do before. This CL reduces the size of the .text section in the go tool by about 4.7KB (0.09%). Change-Id: Icaddae7f2e4f9b2debb6fabae845adb3f73b41db Reviewed-on: https://go-review.googlesource.com/c/go/+/173897 Run-TryBot: Michael Munday <mike.munday@ibm.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Cherry Zhang <cherryyz@google.com>
5c5f217b · Michael Munday · b38be35e · 5c5f217b · 5c5f217b · 5c5f217b
Commit 5c5f217b authored Apr 25, 2019 by Michael Munday
6 changed files
--- a/src/cmd/compile/internal/s390x/ssa.go
+++ b/src/cmd/compile/internal/s390x/ssa.go
@@ -482,7 +482,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 		p.To.Type = obj.TYPE_MEM
 		p.To.Reg = v.Args[0].Reg()
 		gc.AddAux2(&p.To, v, sc.Off())
-	case ssa.OpCopy, ssa.OpS390XMOVDreg:
+	case ssa.OpCopy:
 		if v.Type.IsMemory() {
 			return
 		}
@@ -491,11 +491,6 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 		if x != y {
 			opregreg(s, moveByType(v.Type), y, x)
 		}
-	case ssa.OpS390XMOVDnop:
-		if v.Reg() != v.Args[0].Reg() {
-			v.Fatalf("input[0] and output not in same register %s", v.LongString())
-		}
-		// nothing to do
 	case ssa.OpLoadReg:
 		if v.Type.IsFlags() {
 			v.Fatalf("load flags not implemented: %v", v.LongString())

--- a/src/cmd/compile/internal/ssa/gen/S390X.rules
+++ b/src/cmd/compile/internal/ssa/gen/S390X.rules
--- a/src/cmd/compile/internal/ssa/gen/S390XOps.go
+++ b/src/cmd/compile/internal/ssa/gen/S390XOps.go
@@ -373,9 +373,6 @@ func init() {
 		{name: "MOVHZreg", argLength: 1, reg: gp11sp, asm: "MOVHZ", typ: "UInt64"}, // zero extend arg0 from int16 to int64
 		{name: "MOVWreg", argLength: 1, reg: gp11sp, asm: "MOVW", typ: "Int64"},    // sign extend arg0 from int32 to int64
 		{name: "MOVWZreg", argLength: 1, reg: gp11sp, asm: "MOVWZ", typ: "UInt64"}, // zero extend arg0 from int32 to int64
-		{name: "MOVDreg", argLength: 1, reg: gp11sp, asm: "MOVD"},                  // move from arg0
-
-		{name: "MOVDnop", argLength: 1, reg: gp11, resultInArg0: true}, // nop, return arg0 in same register

 		{name: "MOVDconst", reg: gp01, asm: "MOVD", typ: "UInt64", aux: "Int64", rematerializeable: true}, // auxint


--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
@@ -1979,8 +1979,6 @@ const (
 	OpS390XMOVHZreg
 	OpS390XMOVWreg
 	OpS390XMOVWZreg
-	OpS390XMOVDreg
-	OpS390XMOVDnop
 	OpS390XMOVDconst
 	OpS390XLDGR
 	OpS390XLGDR
@@ -26605,32 +26603,6 @@ var opcodeTable = [...]opInfo{
 			},
 		},
 	},
-	{
-		name:   "MOVDreg",
-		argLen: 1,
-		asm:    s390x.AMOVD,
-		reg: regInfo{
-			inputs: []inputInfo{
-				{0, 56319}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R11 R12 R14 SP
-			},
-			outputs: []outputInfo{
-				{0, 23551}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R11 R12 R14
-			},
-		},
-	},
-	{
-		name:         "MOVDnop",
-		argLen:       1,
-		resultInArg0: true,
-		reg: regInfo{
-			inputs: []inputInfo{
-				{0, 23551}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R11 R12 R14
-			},
-			outputs: []outputInfo{
-				{0, 23551}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R11 R12 R14
-			},
-		},
-	},
 	{
 		name:              "MOVDconst",
 		auxType:           auxInt64,

--- a/src/cmd/compile/internal/ssa/rewriteS390X.go
+++ b/src/cmd/compile/internal/ssa/rewriteS390X.go
--- a/test/codegen/memcombine.go
+++ b/test/codegen/memcombine.go
@@ -57,6 +57,7 @@ func load_le16(b []byte) {
 	// amd64:`MOVWLZX\s\(.*\),`,-`MOVB`,-`OR`
 	// ppc64le:`MOVHZ\s`,-`MOVBZ`
 	// arm64:`MOVHU\s\(R[0-9]+\),`,-`MOVB`
+	// s390x:`MOVHBR\s\(.*\),`
 	sink16 = binary.LittleEndian.Uint16(b)
 }

@@ -64,6 +65,7 @@ func load_le16_idx(b []byte, idx int) {
 	// amd64:`MOVWLZX\s\(.*\),`,-`MOVB`,-`OR`
 	// ppc64le:`MOVHZ\s`,-`MOVBZ`
 	// arm64:`MOVHU\s\(R[0-9]+\)\(R[0-9]+\),`,-`MOVB`
+	// s390x:`MOVHBR\s\(.*\)\(.*\*1\),`
 	sink16 = binary.LittleEndian.Uint16(b[idx:])
 }

@@ -103,6 +105,7 @@ func load_be16(b []byte) {
 	// amd64:`ROLW\s\$8`,-`MOVB`,-`OR`
 	// arm64:`REV16W`,`MOVHU\s\(R[0-9]+\),`,-`MOVB`
 	// ppc64le:`MOVHBR`
+	// s390x:`MOVHZ\s\(.*\),`,-`OR`,-`ORW`,-`SLD`,-`SLW`
 	sink16 = binary.BigEndian.Uint16(b)
 }

@@ -110,6 +113,7 @@ func load_be16_idx(b []byte, idx int) {
 	// amd64:`ROLW\s\$8`,-`MOVB`,-`OR`
 	// arm64:`REV16W`,`MOVHU\s\(R[0-9]+\)\(R[0-9]+\),`,-`MOVB`
 	// ppc64le:`MOVHBR`
+	// s390x:`MOVHZ\s\(.*\)\(.*\*1\),`,-`OR`,-`ORW`,-`SLD`,-`SLW`
 	sink16 = binary.BigEndian.Uint16(b[idx:])
 }

@@ -351,6 +355,7 @@ func store_le64(b []byte) {
 	// amd64:`MOVQ\s.*\(.*\)$`,-`SHR.`
 	// arm64:`MOVD`,-`MOV[WBH]`
 	// ppc64le:`MOVD\s`,-`MOV[BHW]\s`
+	// s390x:`MOVDBR\s.*\(.*\)$`
 	binary.LittleEndian.PutUint64(b, sink64)
 }

@@ -358,6 +363,7 @@ func store_le64_idx(b []byte, idx int) {
 	// amd64:`MOVQ\s.*\(.*\)\(.*\*1\)$`,-`SHR.`
 	// arm64:`MOVD\sR[0-9]+,\s\(R[0-9]+\)\(R[0-9]+\)`,-`MOV[BHW]`
 	// ppc64le:`MOVD\s`,-`MOV[BHW]\s`
+	// s390x:`MOVDBR\s.*\(.*\)\(.*\*1\)$`
 	binary.LittleEndian.PutUint64(b[idx:], sink64)
 }

@@ -365,6 +371,7 @@ func store_le32(b []byte) {
 	// amd64:`MOVL\s`
 	// arm64:`MOVW`,-`MOV[BH]`
 	// ppc64le:`MOVW\s`
+	// s390x:`MOVWBR\s.*\(.*\)$`
 	binary.LittleEndian.PutUint32(b, sink32)
 }

@@ -372,6 +379,7 @@ func store_le32_idx(b []byte, idx int) {
 	// amd64:`MOVL\s`
 	// arm64:`MOVW\sR[0-9]+,\s\(R[0-9]+\)\(R[0-9]+\)`,-`MOV[BH]`
 	// ppc64le:`MOVW\s`
+	// s390x:`MOVWBR\s.*\(.*\)\(.*\*1\)$`
 	binary.LittleEndian.PutUint32(b[idx:], sink32)
 }

@@ -379,6 +387,7 @@ func store_le16(b []byte) {
 	// amd64:`MOVW\s`
 	// arm64:`MOVH`,-`MOVB`
 	// ppc64le:`MOVH\s`
+	// s390x:`MOVHBR\s.*\(.*\)$`
 	binary.LittleEndian.PutUint16(b, sink16)
 }

@@ -386,6 +395,7 @@ func store_le16_idx(b []byte, idx int) {
 	// amd64:`MOVW\s`
 	// arm64:`MOVH\sR[0-9]+,\s\(R[0-9]+\)\(R[0-9]+\)`,-`MOVB`
 	// ppc64le:`MOVH\s`
+	// s390x:`MOVHBR\s.*\(.*\)\(.*\*1\)$`
 	binary.LittleEndian.PutUint16(b[idx:], sink16)
 }

@@ -393,6 +403,7 @@ func store_be64(b []byte) {
 	// amd64:`BSWAPQ`,-`SHR.`
 	// arm64:`MOVD`,`REV`,-`MOV[WBH]`,-`REVW`,-`REV16W`
 	// ppc64le:`MOVDBR`
+	// s390x:`MOVD\s.*\(.*\)$`,-`SRW\s`,-`SRD\s`
 	binary.BigEndian.PutUint64(b, sink64)
 }

@@ -400,6 +411,7 @@ func store_be64_idx(b []byte, idx int) {
 	// amd64:`BSWAPQ`,-`SHR.`
 	// arm64:`REV`,`MOVD\sR[0-9]+,\s\(R[0-9]+\)\(R[0-9]+\)`,-`MOV[BHW]`,-`REV16W`,-`REVW`
 	// ppc64le:`MOVDBR`
+	// s390x:`MOVD\s.*\(.*\)\(.*\*1\)$`,-`SRW\s`,-`SRD\s`
 	binary.BigEndian.PutUint64(b[idx:], sink64)
 }

@@ -407,6 +419,7 @@ func store_be32(b []byte) {
 	// amd64:`BSWAPL`,-`SHR.`
 	// arm64:`MOVW`,`REVW`,-`MOV[BH]`,-`REV16W`
 	// ppc64le:`MOVWBR`
+	// s390x:`MOVW\s.*\(.*\)$`,-`SRW\s`,-`SRD\s`
 	binary.BigEndian.PutUint32(b, sink32)
 }

@@ -414,6 +427,7 @@ func store_be32_idx(b []byte, idx int) {
 	// amd64:`BSWAPL`,-`SHR.`
 	// arm64:`REVW`,`MOVW\sR[0-9]+,\s\(R[0-9]+\)\(R[0-9]+\)`,-`MOV[BH]`,-`REV16W`
 	// ppc64le:`MOVWBR`
+	// s390x:`MOVW\s.*\(.*\)\(.*\*1\)$`,-`SRW\s`,-`SRD\s`
 	binary.BigEndian.PutUint32(b[idx:], sink32)
 }

@@ -421,6 +435,7 @@ func store_be16(b []byte) {
 	// amd64:`ROLW\s\$8`,-`SHR.`
 	// arm64:`MOVH`,`REV16W`,-`MOVB`
 	// ppc64le:`MOVHBR`
+	// s390x:`MOVH\s.*\(.*\)$`,-`SRW\s`,-`SRD\s`
 	binary.BigEndian.PutUint16(b, sink16)
 }

@@ -428,6 +443,7 @@ func store_be16_idx(b []byte, idx int) {
 	// amd64:`ROLW\s\$8`,-`SHR.`
 	// arm64:`MOVH\sR[0-9]+,\s\(R[0-9]+\)\(R[0-9]+\)`,`REV16W`,-`MOVB`
 	// ppc64le:`MOVHBR`
+	// s390x:`MOVH\s.*\(.*\)\(.*\*1\)$`,-`SRW\s`,-`SRD\s`
 	binary.BigEndian.PutUint16(b[idx:], sink16)
 }