Commit 14f3ca56 authored by Keith Randall's avatar Keith Randall

cmd/internal/obj: ARM, use immediates instead of constant pool entries

When a constant doesn't fit in a single instruction, use two
paired instructions instead of the constant pool.  For example

  ADD $0xaa00bb, R0, R1

Used to rewrite to:

  MOV ?(IP), R11
  ADD R11, R0, R1

Instead, do:

  ADD $0xaa0000, R0, R1
  ADD $0xbb, R1, R1

Same number of instructions.
Good:
  4 less bytes (no constant pool entry)
  One less load.
Bad:
  Critical path is one instruction longer.

It's probably worth it to avoid the loads, they are expensive.

Dave Cheney got us some performance numbers: https://perf.golang.org/search?q=upload:20170426.1
TL;DR mean 1.37% improvement.

Change-Id: Ib206836161fdc94a3962db6f9caa635c87d57cf1
Reviewed-on: https://go-review.googlesource.com/41612
Run-TryBot: Keith Randall <khr@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: default avatarCherry Zhang <cherryyz@google.com>
parent c120e449
...@@ -123,6 +123,7 @@ const ( ...@@ -123,6 +123,7 @@ const (
C_RCON /* 0xff rotated */ C_RCON /* 0xff rotated */
C_NCON /* ~RCON */ C_NCON /* ~RCON */
C_RCON2 /* OR of two disjoint C_RCON constants */
C_SCON /* 0xffff */ C_SCON /* 0xffff */
C_LCON C_LCON
C_LCONADDR C_LCONADDR
......
...@@ -16,6 +16,7 @@ var cnames5 = []string{ ...@@ -16,6 +16,7 @@ var cnames5 = []string{
"FCR", "FCR",
"RCON", "RCON",
"NCON", "NCON",
"RCON2",
"SCON", "SCON",
"LCON", "LCON",
"LCONADDR", "LCONADDR",
......
...@@ -86,16 +86,22 @@ var optab = []Optab{ ...@@ -86,16 +86,22 @@ var optab = []Optab{
{obj.ATEXT, C_ADDR, C_NONE, C_TEXTSIZE, 0, 0, 0, 0, 0}, {obj.ATEXT, C_ADDR, C_NONE, C_TEXTSIZE, 0, 0, 0, 0, 0},
{AADD, C_REG, C_REG, C_REG, 1, 4, 0, 0, 0}, {AADD, C_REG, C_REG, C_REG, 1, 4, 0, 0, 0},
{AADD, C_REG, C_NONE, C_REG, 1, 4, 0, 0, 0}, {AADD, C_REG, C_NONE, C_REG, 1, 4, 0, 0, 0},
{AAND, C_REG, C_REG, C_REG, 1, 4, 0, 0, 0},
{AAND, C_REG, C_NONE, C_REG, 1, 4, 0, 0, 0},
{AMOVW, C_REG, C_NONE, C_REG, 1, 4, 0, 0, 0}, {AMOVW, C_REG, C_NONE, C_REG, 1, 4, 0, 0, 0},
{AMVN, C_REG, C_NONE, C_REG, 1, 4, 0, 0, 0}, {AMVN, C_REG, C_NONE, C_REG, 1, 4, 0, 0, 0},
{ACMP, C_REG, C_REG, C_NONE, 1, 4, 0, 0, 0}, {ACMP, C_REG, C_REG, C_NONE, 1, 4, 0, 0, 0},
{AADD, C_RCON, C_REG, C_REG, 2, 4, 0, 0, 0}, {AADD, C_RCON, C_REG, C_REG, 2, 4, 0, 0, 0},
{AADD, C_RCON, C_NONE, C_REG, 2, 4, 0, 0, 0}, {AADD, C_RCON, C_NONE, C_REG, 2, 4, 0, 0, 0},
{AAND, C_RCON, C_REG, C_REG, 2, 4, 0, 0, 0},
{AAND, C_RCON, C_NONE, C_REG, 2, 4, 0, 0, 0},
{AMOVW, C_RCON, C_NONE, C_REG, 2, 4, 0, 0, 0}, {AMOVW, C_RCON, C_NONE, C_REG, 2, 4, 0, 0, 0},
{AMVN, C_RCON, C_NONE, C_REG, 2, 4, 0, 0, 0}, {AMVN, C_RCON, C_NONE, C_REG, 2, 4, 0, 0, 0},
{ACMP, C_RCON, C_REG, C_NONE, 2, 4, 0, 0, 0}, {ACMP, C_RCON, C_REG, C_NONE, 2, 4, 0, 0, 0},
{AADD, C_SHIFT, C_REG, C_REG, 3, 4, 0, 0, 0}, {AADD, C_SHIFT, C_REG, C_REG, 3, 4, 0, 0, 0},
{AADD, C_SHIFT, C_NONE, C_REG, 3, 4, 0, 0, 0}, {AADD, C_SHIFT, C_NONE, C_REG, 3, 4, 0, 0, 0},
{AAND, C_SHIFT, C_REG, C_REG, 3, 4, 0, 0, 0},
{AAND, C_SHIFT, C_NONE, C_REG, 3, 4, 0, 0, 0},
{AMVN, C_SHIFT, C_NONE, C_REG, 3, 4, 0, 0, 0}, {AMVN, C_SHIFT, C_NONE, C_REG, 3, 4, 0, 0, 0},
{ACMP, C_SHIFT, C_REG, C_NONE, 3, 4, 0, 0, 0}, {ACMP, C_SHIFT, C_REG, C_NONE, 3, 4, 0, 0, 0},
{AMOVW, C_RACON, C_NONE, C_REG, 4, 4, REGSP, 0, 0}, {AMOVW, C_RACON, C_NONE, C_REG, 4, 4, REGSP, 0, 0},
...@@ -128,14 +134,22 @@ var optab = []Optab{ ...@@ -128,14 +134,22 @@ var optab = []Optab{
{AMOVW, C_LCONADDR, C_NONE, C_REG, 12, 4, 0, LFROM | LPCREL, 4}, {AMOVW, C_LCONADDR, C_NONE, C_REG, 12, 4, 0, LFROM | LPCREL, 4},
{AADD, C_NCON, C_REG, C_REG, 13, 8, 0, 0, 0}, {AADD, C_NCON, C_REG, C_REG, 13, 8, 0, 0, 0},
{AADD, C_NCON, C_NONE, C_REG, 13, 8, 0, 0, 0}, {AADD, C_NCON, C_NONE, C_REG, 13, 8, 0, 0, 0},
{AAND, C_NCON, C_REG, C_REG, 13, 8, 0, 0, 0},
{AAND, C_NCON, C_NONE, C_REG, 13, 8, 0, 0, 0},
{AMVN, C_NCON, C_NONE, C_REG, 13, 8, 0, 0, 0}, {AMVN, C_NCON, C_NONE, C_REG, 13, 8, 0, 0, 0},
{ACMP, C_NCON, C_REG, C_NONE, 13, 8, 0, 0, 0}, {ACMP, C_NCON, C_REG, C_NONE, 13, 8, 0, 0, 0},
{AADD, C_SCON, C_REG, C_REG, 13, 8, 0, 0, 0}, {AADD, C_SCON, C_REG, C_REG, 13, 8, 0, 0, 0},
{AADD, C_SCON, C_NONE, C_REG, 13, 8, 0, 0, 0}, {AADD, C_SCON, C_NONE, C_REG, 13, 8, 0, 0, 0},
{AAND, C_SCON, C_REG, C_REG, 13, 8, 0, 0, 0},
{AAND, C_SCON, C_NONE, C_REG, 13, 8, 0, 0, 0},
{AMVN, C_SCON, C_NONE, C_REG, 13, 8, 0, 0, 0}, {AMVN, C_SCON, C_NONE, C_REG, 13, 8, 0, 0, 0},
{ACMP, C_SCON, C_REG, C_NONE, 13, 8, 0, 0, 0}, {ACMP, C_SCON, C_REG, C_NONE, 13, 8, 0, 0, 0},
{AADD, C_RCON2, C_REG, C_REG, 106, 8, 0, 0, 0},
// TODO: RCON2: how to do AND and BIC?
{AADD, C_LCON, C_REG, C_REG, 13, 8, 0, LFROM, 0}, {AADD, C_LCON, C_REG, C_REG, 13, 8, 0, LFROM, 0},
{AADD, C_LCON, C_NONE, C_REG, 13, 8, 0, LFROM, 0}, {AADD, C_LCON, C_NONE, C_REG, 13, 8, 0, LFROM, 0},
{AAND, C_LCON, C_REG, C_REG, 13, 8, 0, LFROM, 0},
{AAND, C_LCON, C_NONE, C_REG, 13, 8, 0, LFROM, 0},
{AMVN, C_LCON, C_NONE, C_REG, 13, 8, 0, LFROM, 0}, {AMVN, C_LCON, C_NONE, C_REG, 13, 8, 0, LFROM, 0},
{ACMP, C_LCON, C_REG, C_NONE, 13, 8, 0, LFROM, 0}, {ACMP, C_LCON, C_REG, C_NONE, 13, 8, 0, LFROM, 0},
{AMOVB, C_REG, C_NONE, C_REG, 1, 4, 0, 0, 0}, {AMOVB, C_REG, C_NONE, C_REG, 1, 4, 0, 0, 0},
...@@ -957,6 +971,21 @@ func immrot(v uint32) int32 { ...@@ -957,6 +971,21 @@ func immrot(v uint32) int32 {
return 0 return 0
} }
// immrot2 returns bits encoding the immediate constant fields of two instructions,
// such that the encoded constants x, y satisfy x|y==v, x&y==0.
// Returns 0,0 if no such decomposition of v exists.
func immrot2(v uint32) (uint32, uint32) {
for i := uint(1); i < 32; i++ {
m := uint32(1<<i - 1)
if x, y := immrot(v&m), immrot(v&^m); x != 0 && y != 0 {
return uint32(x), uint32(y)
}
}
// TODO: handle some more cases, like where
// the wraparound from the rotate could help.
return 0, 0
}
func immaddr(v int32) int32 { func immaddr(v int32) int32 {
if v >= 0 && v <= 0xfff { if v >= 0 && v <= 0xfff {
return v&0xfff | 1<<24 | 1<<23 /* pre indexing */ /* pre indexing, up */ return v&0xfff | 1<<24 | 1<<23 /* pre indexing */ /* pre indexing, up */
...@@ -1131,6 +1160,9 @@ func (c *ctxt5) aclass(a *obj.Addr) int { ...@@ -1131,6 +1160,9 @@ func (c *ctxt5) aclass(a *obj.Addr) int {
if uint32(c.instoffset) <= 0xffff && objabi.GOARM == 7 { if uint32(c.instoffset) <= 0xffff && objabi.GOARM == 7 {
return C_SCON return C_SCON
} }
if x, y := immrot2(uint32(c.instoffset)); x != 0 && y != 0 {
return C_RCON2
}
return C_LCON return C_LCON
case obj.NAME_EXTERN, case obj.NAME_EXTERN,
...@@ -1195,6 +1227,16 @@ func (c *ctxt5) oplook(p *obj.Prog) *Optab { ...@@ -1195,6 +1227,16 @@ func (c *ctxt5) oplook(p *obj.Prog) *Optab {
a2 = C_REG a2 = C_REG
} }
// If Scond != 0, we must use the constant pool instead of
// splitting the instruction in two. The most common reason is
// .S (flag updating) instructions. There may be others.
if a1 == C_RCON2 && p.Scond != 0 {
a1 = C_LCON
}
if a3 == C_RCON2 && p.Scond != 0 {
a3 = C_LCON
}
if false { /*debug['O']*/ if false { /*debug['O']*/
fmt.Printf("oplook %v %v %v %v\n", p.As, DRconv(a1), DRconv(a2), DRconv(a3)) fmt.Printf("oplook %v %v %v %v\n", p.As, DRconv(a1), DRconv(a2), DRconv(a3))
fmt.Printf("\t\t%d %d\n", p.From.Type, p.To.Type) fmt.Printf("\t\t%d %d\n", p.From.Type, p.To.Type)
...@@ -1225,7 +1267,7 @@ func cmp(a int, b int) bool { ...@@ -1225,7 +1267,7 @@ func cmp(a int, b int) bool {
} }
switch a { switch a {
case C_LCON: case C_LCON:
if b == C_RCON || b == C_NCON || b == C_SCON { if b == C_RCON || b == C_NCON || b == C_SCON || b == C_RCON2 {
return true return true
} }
...@@ -1365,7 +1407,6 @@ func buildop(ctxt *obj.Link) { ...@@ -1365,7 +1407,6 @@ func buildop(ctxt *obj.Link) {
log.Fatalf("bad code") log.Fatalf("bad code")
case AADD: case AADD:
opset(AAND, r0)
opset(AEOR, r0) opset(AEOR, r0)
opset(ASUB, r0) opset(ASUB, r0)
opset(ARSB, r0) opset(ARSB, r0)
...@@ -1373,6 +1414,9 @@ func buildop(ctxt *obj.Link) { ...@@ -1373,6 +1414,9 @@ func buildop(ctxt *obj.Link) {
opset(ASBC, r0) opset(ASBC, r0)
opset(ARSC, r0) opset(ARSC, r0)
opset(AORR, r0) opset(AORR, r0)
case AAND:
opset(AAND, r0)
opset(ABIC, r0) opset(ABIC, r0)
case ACMP: case ACMP:
...@@ -1563,6 +1607,33 @@ func (c *ctxt5) asmout(p *obj.Prog, o *Optab, out []uint32) { ...@@ -1563,6 +1607,33 @@ func (c *ctxt5) asmout(p *obj.Prog, o *Optab, out []uint32) {
} }
o1 |= (uint32(r)&15)<<16 | (uint32(rt)&15)<<12 o1 |= (uint32(r)&15)<<16 | (uint32(rt)&15)<<12
case 106: /* op $I,R,R where I can be decomposed into 2 immediates */
c.aclass(&p.From)
r := int(p.Reg)
rt := int(p.To.Reg)
x, y := immrot2(uint32(c.instoffset))
var as2 obj.As
switch p.As {
case AADD, ASUB, AORR, AEOR:
as2 = p.As // ADD, SUB, ORR, EOR
case ARSB:
as2 = AADD // RSB -> RSB/ADD pair
case AADC:
as2 = AADD // ADC -> ADC/ADD pair
case ASBC:
as2 = ASUB // SBC -> SBC/SUB pair
case ARSC:
as2 = AADD // RSC -> RSC/ADD pair
default:
c.ctxt.Diag("unknown second op for %v", p)
}
o1 = c.oprrr(p, p.As, int(p.Scond))
o2 = c.oprrr(p, as2, int(p.Scond))
o1 |= (uint32(r)&15)<<16 | (uint32(rt)&15)<<12
o2 |= (uint32(rt)&15)<<16 | (uint32(rt)&15)<<12
o1 |= x
o2 |= y
case 3: /* add R<<[IR],[R],R */ case 3: /* add R<<[IR],[R],R */
o1 = c.mov(p) o1 = c.mov(p)
......
// run
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// This file tests the splitting of constants into
// multiple immediates on arm.
package main
import "fmt"
const c32 = 0xaa00dd
const c64 = 0xaa00dd55000066
//go:noinline
func add32(x uint32) uint32 {
return x + c32
}
//go:noinline
func sub32(x uint32) uint32 {
return x - c32
}
//go:noinline
func or32(x uint32) uint32 {
return x | c32
}
//go:noinline
func xor32(x uint32) uint32 {
return x ^ c32
}
//go:noinline
func subr32(x uint32) uint32 {
return c32 - x
}
//go:noinline
func add64(x uint64) uint64 {
return x + c64
}
//go:noinline
func sub64(x uint64) uint64 {
return x - c64
}
//go:noinline
func or64(x uint64) uint64 {
return x | c64
}
//go:noinline
func xor64(x uint64) uint64 {
return x ^ c64
}
//go:noinline
func subr64(x uint64) uint64 {
return c64 - x
}
// Note: x-c gets rewritten to x+(-c), so SUB and SBC are not directly testable.
// I disabled that rewrite rule before running this test.
func main() {
test32()
test64()
}
func test32() {
var a uint32 = 0x11111111
var want, got uint32
if want, got = a+c32, add32(a); got != want {
panic(fmt.Sprintf("add32(%x) = %x, want %x", a, got, want))
}
if want, got = a-c32, sub32(a); got != want {
panic(fmt.Sprintf("sub32(%x) = %x, want %x", a, got, want))
}
if want, got = a|c32, or32(a); got != want {
panic(fmt.Sprintf("or32(%x) = %x, want %x", a, got, want))
}
if want, got = a^c32, xor32(a); got != want {
panic(fmt.Sprintf("xor32(%x) = %x, want %x", a, got, want))
}
if want, got = c32-a, subr32(a); got != want {
panic(fmt.Sprintf("subr32(%x) = %x, want %x", a, got, want))
}
}
func test64() {
var a uint64 = 0x1111111111111111
var want, got uint64
if want, got = a+c64, add64(a); got != want {
panic(fmt.Sprintf("add64(%x) = %x, want %x", a, got, want))
}
if want, got = a-c64, sub64(a); got != want {
panic(fmt.Sprintf("sub64(%x) = %x, want %x", a, got, want))
}
if want, got = a|c64, or64(a); got != want {
panic(fmt.Sprintf("or64(%x) = %x, want %x", a, got, want))
}
if want, got = a^c64, xor64(a); got != want {
panic(fmt.Sprintf("xor64(%x) = %x, want %x", a, got, want))
}
if want, got = c64-a, subr64(a); got != want {
panic(fmt.Sprintf("subr64(%x) = %x, want %x", a, got, want))
}
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment